From 4c74bb7b6f2398da81ce462cbdfd9c5a77ffa683 Mon Sep 17 00:00:00 2001
From: pini <pini@48e7efb5-ca39-0410-a469-dd3cf9ba447f>
Date: Thu, 11 Mar 2010 14:47:35 +0000
Subject: [PATCH] Added our tweaked version of NAS benchmarks.

git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/simgrid/simgrid/trunk@7221 48e7efb5-ca39-0410-a469-dd3cf9ba447f
---
 examples/smpi/NAS/BT/Makefile                 |  106 +
 examples/smpi/NAS/BT/add.f                    |   30 +
 examples/smpi/NAS/BT/adi.f                    |   21 +
 examples/smpi/NAS/BT/bt.f                     |  275 ++
 examples/smpi/NAS/BT/btio.f                   |   72 +
 examples/smpi/NAS/BT/btio_common.f            |   30 +
 examples/smpi/NAS/BT/copy_faces.f             |  316 +++
 examples/smpi/NAS/BT/define.f                 |   64 +
 examples/smpi/NAS/BT/epio.f                   |  165 ++
 examples/smpi/NAS/BT/error.f                  |  106 +
 examples/smpi/NAS/BT/exact_rhs.f              |  360 +++
 examples/smpi/NAS/BT/exact_solution.f         |   29 +
 examples/smpi/NAS/BT/fortran_io.f             |  174 ++
 examples/smpi/NAS/BT/full_mpiio.f             |  307 ++
 examples/smpi/NAS/BT/header.h                 |  137 +
 examples/smpi/NAS/BT/initialize.f             |  308 ++
 examples/smpi/NAS/BT/inputbt.data.sample      |    5 +
 examples/smpi/NAS/BT/make_set.f               |  124 +
 examples/smpi/NAS/BT/mpinpb.h                 |   12 +
 examples/smpi/NAS/BT/rhs.f                    |  425 +++
 examples/smpi/NAS/BT/set_constants.f          |  202 ++
 examples/smpi/NAS/BT/setup_mpi.f              |   64 +
 examples/smpi/NAS/BT/simple_mpiio.f           |  213 ++
 examples/smpi/NAS/BT/solve_subs.f             |  642 +++++
 examples/smpi/NAS/BT/verify.f                 |  435 +++
 examples/smpi/NAS/BT/work_lhs.h               |   14 +
 examples/smpi/NAS/BT/work_lhs_vec.h           |   14 +
 examples/smpi/NAS/BT/x_solve.f                |  761 +++++
 examples/smpi/NAS/BT/x_solve_vec.f            |  789 ++++++
 examples/smpi/NAS/BT/y_solve.f                |  771 +++++
 examples/smpi/NAS/BT/y_solve_vec.f            |  788 ++++++
 examples/smpi/NAS/BT/z_solve.f                |  776 ++++++
 examples/smpi/NAS/BT/z_solve_vec.f            |  793 ++++++
 examples/smpi/NAS/CG/Makefile                 |   23 +
 examples/smpi/NAS/CG/cg.f                     | 1787 ++++++++++++
 examples/smpi/NAS/CG/mpinpb.h                 |    9 +
 examples/smpi/NAS/DT/DGraph.c                 |  184 ++
 examples/smpi/NAS/DT/DGraph.h                 |   43 +
 examples/smpi/NAS/DT/Makefile                 |   26 +
 examples/smpi/NAS/DT/README                   |   22 +
 examples/smpi/NAS/DT/dt.c                     |  759 +++++
 examples/smpi/NAS/EP/Makefile                 |   28 +
 examples/smpi/NAS/EP/README                   |    6 +
 examples/smpi/NAS/EP/ep.c                     |  440 +++
 examples/smpi/NAS/EP/ep.f                     |  316 +++
 examples/smpi/NAS/EP/mpinpb.h                 |    9 +
 examples/smpi/NAS/EP/randlc.c                 |  107 +
 examples/smpi/NAS/EP/randlc.h                 |    3 +
 examples/smpi/NAS/FT/Makefile                 |   23 +
 examples/smpi/NAS/FT/README                   |    5 +
 examples/smpi/NAS/FT/ft.f                     | 1993 +++++++++++++
 examples/smpi/NAS/FT/global.h                 |  134 +
 examples/smpi/NAS/FT/inputft.data.sample      |    3 +
 examples/smpi/NAS/FT/mpinpb.h                 |    4 +
 examples/smpi/NAS/IS/Makefile                 |   23 +
 examples/smpi/NAS/IS/is.c                     | 1150 ++++++++
 examples/smpi/NAS/LU/Makefile                 |   74 +
 examples/smpi/NAS/LU/applu.incl               |  153 +
 examples/smpi/NAS/LU/bcast_inputs.f           |   41 +
 examples/smpi/NAS/LU/blts.f                   |  261 ++
 examples/smpi/NAS/LU/blts_vec.f               |  334 +++
 examples/smpi/NAS/LU/buts.f                   |  259 ++
 examples/smpi/NAS/LU/buts_vec.f               |  332 +++
 examples/smpi/NAS/LU/erhs.f                   |  536 ++++
 examples/smpi/NAS/LU/error.f                  |   81 +
 examples/smpi/NAS/LU/exact.f                  |   53 +
 examples/smpi/NAS/LU/exchange_1.f             |  180 ++
 examples/smpi/NAS/LU/exchange_3.f             |  312 +++
 examples/smpi/NAS/LU/exchange_4.f             |  133 +
 examples/smpi/NAS/LU/exchange_5.f             |   81 +
 examples/smpi/NAS/LU/exchange_6.f             |   81 +
 examples/smpi/NAS/LU/init_comm.f              |   57 +
 examples/smpi/NAS/LU/inputlu.data.sample      |   24 +
 examples/smpi/NAS/LU/jacld.f                  |  384 +++
 examples/smpi/NAS/LU/jacu.f                   |  384 +++
 examples/smpi/NAS/LU/l2norm.f                 |   68 +
 examples/smpi/NAS/LU/lu.f                     |  164 ++
 examples/smpi/NAS/LU/mpinpb.h                 |   11 +
 examples/smpi/NAS/LU/neighbors.f              |   48 +
 examples/smpi/NAS/LU/nodedim.f                |   36 +
 examples/smpi/NAS/LU/pintgr.f                 |  288 ++
 examples/smpi/NAS/LU/proc_grid.f              |   36 +
 examples/smpi/NAS/LU/read_input.f             |  127 +
 examples/smpi/NAS/LU/rhs.f                    |  504 ++++
 examples/smpi/NAS/LU/setbv.f                  |   79 +
 examples/smpi/NAS/LU/setcoeff.f               |  159 ++
 examples/smpi/NAS/LU/sethyper.f               |   94 +
 examples/smpi/NAS/LU/setiv.f                  |   67 +
 examples/smpi/NAS/LU/ssor.f                   |  241 ++
 examples/smpi/NAS/LU/subdomain.f              |  103 +
 examples/smpi/NAS/LU/verify.f                 |  403 +++
 examples/smpi/NAS/MG/Makefile                 |   23 +
 examples/smpi/NAS/MG/README                   |  138 +
 examples/smpi/NAS/MG/globals.h                |   55 +
 examples/smpi/NAS/MG/mg.f                     | 2479 +++++++++++++++++
 examples/smpi/NAS/MG/mg.input.sample          |    4 +
 examples/smpi/NAS/MG/mpinpb.h                 |    9 +
 examples/smpi/NAS/MPI_dummy/Makefile          |   38 +
 examples/smpi/NAS/MPI_dummy/README            |   52 +
 examples/smpi/NAS/MPI_dummy/mpi.h             |  112 +
 examples/smpi/NAS/MPI_dummy/mpi_dummy.c       |  267 ++
 examples/smpi/NAS/MPI_dummy/mpi_dummy.f       |  309 ++
 examples/smpi/NAS/MPI_dummy/mpif.h            |   27 +
 examples/smpi/NAS/MPI_dummy/test.f            |   10 +
 examples/smpi/NAS/MPI_dummy/wtime.c           |   13 +
 examples/smpi/NAS/MPI_dummy/wtime.f           |   12 +
 examples/smpi/NAS/MPI_dummy/wtime.h           |   12 +
 examples/smpi/NAS/MPI_dummy/wtime_sgi64.c     |   74 +
 examples/smpi/NAS/Makefile                    |   69 +
 examples/smpi/NAS/README                      |   52 +
 examples/smpi/NAS/README.install              |  156 ++
 examples/smpi/NAS/SP/Makefile                 |   60 +
 examples/smpi/NAS/SP/README                   |   17 +
 examples/smpi/NAS/SP/add.f                    |   31 +
 examples/smpi/NAS/SP/adi.f                    |   24 +
 examples/smpi/NAS/SP/copy_faces.f             |  306 ++
 examples/smpi/NAS/SP/define.f                 |   66 +
 examples/smpi/NAS/SP/error.f                  |  105 +
 examples/smpi/NAS/SP/exact_rhs.f              |  363 +++
 examples/smpi/NAS/SP/exact_solution.f         |   30 +
 examples/smpi/NAS/SP/header.h                 |  113 +
 examples/smpi/NAS/SP/initialize.f             |  286 ++
 examples/smpi/NAS/SP/inputsp.data.sample      |    3 +
 examples/smpi/NAS/SP/lhsx.f                   |  124 +
 examples/smpi/NAS/SP/lhsy.f                   |  125 +
 examples/smpi/NAS/SP/lhsz.f                   |  123 +
 examples/smpi/NAS/SP/make_set.f               |  120 +
 examples/smpi/NAS/SP/mpinpb.h                 |   13 +
 examples/smpi/NAS/SP/ninvr.f                  |   45 +
 examples/smpi/NAS/SP/pinvr.f                  |   48 +
 examples/smpi/NAS/SP/rhs.f                    |  446 +++
 examples/smpi/NAS/SP/set_constants.f          |  203 ++
 examples/smpi/NAS/SP/setup_mpi.f              |   65 +
 examples/smpi/NAS/SP/sp.f                     |  194 ++
 examples/smpi/NAS/SP/txinvr.f                 |   59 +
 examples/smpi/NAS/SP/tzetar.f                 |   60 +
 examples/smpi/NAS/SP/verify.f                 |  358 +++
 examples/smpi/NAS/SP/x_solve.f                |  545 ++++
 examples/smpi/NAS/SP/y_solve.f                |  538 ++++
 examples/smpi/NAS/SP/z_solve.f                |  532 ++++
 examples/smpi/NAS/common/c_print_results.c    |   94 +
 examples/smpi/NAS/common/c_timers.c           |   45 +
 examples/smpi/NAS/common/print_results.f      |  115 +
 examples/smpi/NAS/common/randdp.c             |   64 +
 examples/smpi/NAS/common/randdp.f             |  137 +
 examples/smpi/NAS/common/randdpvec.f          |  186 ++
 examples/smpi/NAS/common/randi8.f             |   79 +
 examples/smpi/NAS/common/randi8_safe.f        |   64 +
 examples/smpi/NAS/common/timers.f             |   78 +
 examples/smpi/NAS/config/NAS.samples/README   |    7 +
 .../NAS/config/NAS.samples/make.def.dec_alpha |   18 +
 .../NAS/config/NAS.samples/make.def.irix6.2   |   16 +
 .../NAS/config/NAS.samples/make.def.origin    |   20 +
 .../NAS.samples/make.def.sgi_powerchallenge   |   16 +
 .../config/NAS.samples/make.def.sp2_babbage   |   17 +
 .../NAS.samples/make.def.sun_ultra_sparc      |   30 +
 .../config/NAS.samples/make.def.t3d_cosmos    |   25 +
 .../NAS/config/NAS.samples/make.def_sun_mpich |  165 ++
 .../smpi/NAS/config/NAS.samples/suite.def.bt  |   37 +
 .../smpi/NAS/config/NAS.samples/suite.def.cg  |   29 +
 .../smpi/NAS/config/NAS.samples/suite.def.ep  |   29 +
 .../smpi/NAS/config/NAS.samples/suite.def.ft  |   29 +
 .../smpi/NAS/config/NAS.samples/suite.def.is  |   29 +
 .../smpi/NAS/config/NAS.samples/suite.def.lu  |   29 +
 .../smpi/NAS/config/NAS.samples/suite.def.mg  |   29 +
 .../NAS/config/NAS.samples/suite.def.small    |    8 +
 .../smpi/NAS/config/NAS.samples/suite.def.sp  |   37 +
 examples/smpi/NAS/config/make.def             |  162 ++
 examples/smpi/NAS/config/make.def.template    |  162 ++
 examples/smpi/NAS/config/make.dummy           |    7 +
 examples/smpi/NAS/config/suite.def.template   |   24 +
 examples/smpi/NAS/sys/Makefile                |   22 +
 examples/smpi/NAS/sys/README                  |   39 +
 examples/smpi/NAS/sys/make.common             |   54 +
 examples/smpi/NAS/sys/print_header            |    5 +
 examples/smpi/NAS/sys/print_instructions      |   26 +
 examples/smpi/NAS/sys/setparams.c             | 1210 ++++++++
 examples/smpi/NAS/sys/suite.awk               |   20 +
 178 files changed, 34689 insertions(+)
 create mode 100644 examples/smpi/NAS/BT/Makefile
 create mode 100644 examples/smpi/NAS/BT/add.f
 create mode 100644 examples/smpi/NAS/BT/adi.f
 create mode 100644 examples/smpi/NAS/BT/bt.f
 create mode 100644 examples/smpi/NAS/BT/btio.f
 create mode 100644 examples/smpi/NAS/BT/btio_common.f
 create mode 100644 examples/smpi/NAS/BT/copy_faces.f
 create mode 100644 examples/smpi/NAS/BT/define.f
 create mode 100644 examples/smpi/NAS/BT/epio.f
 create mode 100644 examples/smpi/NAS/BT/error.f
 create mode 100644 examples/smpi/NAS/BT/exact_rhs.f
 create mode 100644 examples/smpi/NAS/BT/exact_solution.f
 create mode 100644 examples/smpi/NAS/BT/fortran_io.f
 create mode 100644 examples/smpi/NAS/BT/full_mpiio.f
 create mode 100644 examples/smpi/NAS/BT/header.h
 create mode 100644 examples/smpi/NAS/BT/initialize.f
 create mode 100644 examples/smpi/NAS/BT/inputbt.data.sample
 create mode 100644 examples/smpi/NAS/BT/make_set.f
 create mode 100644 examples/smpi/NAS/BT/mpinpb.h
 create mode 100644 examples/smpi/NAS/BT/rhs.f
 create mode 100644 examples/smpi/NAS/BT/set_constants.f
 create mode 100644 examples/smpi/NAS/BT/setup_mpi.f
 create mode 100644 examples/smpi/NAS/BT/simple_mpiio.f
 create mode 100644 examples/smpi/NAS/BT/solve_subs.f
 create mode 100644 examples/smpi/NAS/BT/verify.f
 create mode 100644 examples/smpi/NAS/BT/work_lhs.h
 create mode 100644 examples/smpi/NAS/BT/work_lhs_vec.h
 create mode 100644 examples/smpi/NAS/BT/x_solve.f
 create mode 100644 examples/smpi/NAS/BT/x_solve_vec.f
 create mode 100644 examples/smpi/NAS/BT/y_solve.f
 create mode 100644 examples/smpi/NAS/BT/y_solve_vec.f
 create mode 100644 examples/smpi/NAS/BT/z_solve.f
 create mode 100644 examples/smpi/NAS/BT/z_solve_vec.f
 create mode 100644 examples/smpi/NAS/CG/Makefile
 create mode 100644 examples/smpi/NAS/CG/cg.f
 create mode 100644 examples/smpi/NAS/CG/mpinpb.h
 create mode 100644 examples/smpi/NAS/DT/DGraph.c
 create mode 100644 examples/smpi/NAS/DT/DGraph.h
 create mode 100644 examples/smpi/NAS/DT/Makefile
 create mode 100644 examples/smpi/NAS/DT/README
 create mode 100644 examples/smpi/NAS/DT/dt.c
 create mode 100644 examples/smpi/NAS/EP/Makefile
 create mode 100644 examples/smpi/NAS/EP/README
 create mode 100644 examples/smpi/NAS/EP/ep.c
 create mode 100644 examples/smpi/NAS/EP/ep.f
 create mode 100644 examples/smpi/NAS/EP/mpinpb.h
 create mode 100644 examples/smpi/NAS/EP/randlc.c
 create mode 100644 examples/smpi/NAS/EP/randlc.h
 create mode 100644 examples/smpi/NAS/FT/Makefile
 create mode 100644 examples/smpi/NAS/FT/README
 create mode 100644 examples/smpi/NAS/FT/ft.f
 create mode 100644 examples/smpi/NAS/FT/global.h
 create mode 100644 examples/smpi/NAS/FT/inputft.data.sample
 create mode 100644 examples/smpi/NAS/FT/mpinpb.h
 create mode 100644 examples/smpi/NAS/IS/Makefile
 create mode 100644 examples/smpi/NAS/IS/is.c
 create mode 100644 examples/smpi/NAS/LU/Makefile
 create mode 100644 examples/smpi/NAS/LU/applu.incl
 create mode 100644 examples/smpi/NAS/LU/bcast_inputs.f
 create mode 100644 examples/smpi/NAS/LU/blts.f
 create mode 100644 examples/smpi/NAS/LU/blts_vec.f
 create mode 100644 examples/smpi/NAS/LU/buts.f
 create mode 100644 examples/smpi/NAS/LU/buts_vec.f
 create mode 100644 examples/smpi/NAS/LU/erhs.f
 create mode 100644 examples/smpi/NAS/LU/error.f
 create mode 100644 examples/smpi/NAS/LU/exact.f
 create mode 100644 examples/smpi/NAS/LU/exchange_1.f
 create mode 100644 examples/smpi/NAS/LU/exchange_3.f
 create mode 100644 examples/smpi/NAS/LU/exchange_4.f
 create mode 100644 examples/smpi/NAS/LU/exchange_5.f
 create mode 100644 examples/smpi/NAS/LU/exchange_6.f
 create mode 100644 examples/smpi/NAS/LU/init_comm.f
 create mode 100644 examples/smpi/NAS/LU/inputlu.data.sample
 create mode 100644 examples/smpi/NAS/LU/jacld.f
 create mode 100644 examples/smpi/NAS/LU/jacu.f
 create mode 100644 examples/smpi/NAS/LU/l2norm.f
 create mode 100644 examples/smpi/NAS/LU/lu.f
 create mode 100644 examples/smpi/NAS/LU/mpinpb.h
 create mode 100644 examples/smpi/NAS/LU/neighbors.f
 create mode 100644 examples/smpi/NAS/LU/nodedim.f
 create mode 100644 examples/smpi/NAS/LU/pintgr.f
 create mode 100644 examples/smpi/NAS/LU/proc_grid.f
 create mode 100644 examples/smpi/NAS/LU/read_input.f
 create mode 100644 examples/smpi/NAS/LU/rhs.f
 create mode 100644 examples/smpi/NAS/LU/setbv.f
 create mode 100644 examples/smpi/NAS/LU/setcoeff.f
 create mode 100644 examples/smpi/NAS/LU/sethyper.f
 create mode 100644 examples/smpi/NAS/LU/setiv.f
 create mode 100644 examples/smpi/NAS/LU/ssor.f
 create mode 100644 examples/smpi/NAS/LU/subdomain.f
 create mode 100644 examples/smpi/NAS/LU/verify.f
 create mode 100644 examples/smpi/NAS/MG/Makefile
 create mode 100644 examples/smpi/NAS/MG/README
 create mode 100644 examples/smpi/NAS/MG/globals.h
 create mode 100644 examples/smpi/NAS/MG/mg.f
 create mode 100644 examples/smpi/NAS/MG/mg.input.sample
 create mode 100644 examples/smpi/NAS/MG/mpinpb.h
 create mode 100644 examples/smpi/NAS/MPI_dummy/Makefile
 create mode 100644 examples/smpi/NAS/MPI_dummy/README
 create mode 100644 examples/smpi/NAS/MPI_dummy/mpi.h
 create mode 100644 examples/smpi/NAS/MPI_dummy/mpi_dummy.c
 create mode 100644 examples/smpi/NAS/MPI_dummy/mpi_dummy.f
 create mode 100644 examples/smpi/NAS/MPI_dummy/mpif.h
 create mode 100644 examples/smpi/NAS/MPI_dummy/test.f
 create mode 100644 examples/smpi/NAS/MPI_dummy/wtime.c
 create mode 100644 examples/smpi/NAS/MPI_dummy/wtime.f
 create mode 100644 examples/smpi/NAS/MPI_dummy/wtime.h
 create mode 100644 examples/smpi/NAS/MPI_dummy/wtime_sgi64.c
 create mode 100644 examples/smpi/NAS/Makefile
 create mode 100644 examples/smpi/NAS/README
 create mode 100644 examples/smpi/NAS/README.install
 create mode 100644 examples/smpi/NAS/SP/Makefile
 create mode 100644 examples/smpi/NAS/SP/README
 create mode 100644 examples/smpi/NAS/SP/add.f
 create mode 100644 examples/smpi/NAS/SP/adi.f
 create mode 100644 examples/smpi/NAS/SP/copy_faces.f
 create mode 100644 examples/smpi/NAS/SP/define.f
 create mode 100644 examples/smpi/NAS/SP/error.f
 create mode 100644 examples/smpi/NAS/SP/exact_rhs.f
 create mode 100644 examples/smpi/NAS/SP/exact_solution.f
 create mode 100644 examples/smpi/NAS/SP/header.h
 create mode 100644 examples/smpi/NAS/SP/initialize.f
 create mode 100644 examples/smpi/NAS/SP/inputsp.data.sample
 create mode 100644 examples/smpi/NAS/SP/lhsx.f
 create mode 100644 examples/smpi/NAS/SP/lhsy.f
 create mode 100644 examples/smpi/NAS/SP/lhsz.f
 create mode 100644 examples/smpi/NAS/SP/make_set.f
 create mode 100644 examples/smpi/NAS/SP/mpinpb.h
 create mode 100644 examples/smpi/NAS/SP/ninvr.f
 create mode 100644 examples/smpi/NAS/SP/pinvr.f
 create mode 100644 examples/smpi/NAS/SP/rhs.f
 create mode 100644 examples/smpi/NAS/SP/set_constants.f
 create mode 100644 examples/smpi/NAS/SP/setup_mpi.f
 create mode 100644 examples/smpi/NAS/SP/sp.f
 create mode 100644 examples/smpi/NAS/SP/txinvr.f
 create mode 100644 examples/smpi/NAS/SP/tzetar.f
 create mode 100644 examples/smpi/NAS/SP/verify.f
 create mode 100644 examples/smpi/NAS/SP/x_solve.f
 create mode 100644 examples/smpi/NAS/SP/y_solve.f
 create mode 100644 examples/smpi/NAS/SP/z_solve.f
 create mode 100644 examples/smpi/NAS/common/c_print_results.c
 create mode 100644 examples/smpi/NAS/common/c_timers.c
 create mode 100644 examples/smpi/NAS/common/print_results.f
 create mode 100644 examples/smpi/NAS/common/randdp.c
 create mode 100644 examples/smpi/NAS/common/randdp.f
 create mode 100644 examples/smpi/NAS/common/randdpvec.f
 create mode 100644 examples/smpi/NAS/common/randi8.f
 create mode 100644 examples/smpi/NAS/common/randi8_safe.f
 create mode 100644 examples/smpi/NAS/common/timers.f
 create mode 100644 examples/smpi/NAS/config/NAS.samples/README
 create mode 100644 examples/smpi/NAS/config/NAS.samples/make.def.dec_alpha
 create mode 100644 examples/smpi/NAS/config/NAS.samples/make.def.irix6.2
 create mode 100644 examples/smpi/NAS/config/NAS.samples/make.def.origin
 create mode 100644 examples/smpi/NAS/config/NAS.samples/make.def.sgi_powerchallenge
 create mode 100644 examples/smpi/NAS/config/NAS.samples/make.def.sp2_babbage
 create mode 100644 examples/smpi/NAS/config/NAS.samples/make.def.sun_ultra_sparc
 create mode 100644 examples/smpi/NAS/config/NAS.samples/make.def.t3d_cosmos
 create mode 100644 examples/smpi/NAS/config/NAS.samples/make.def_sun_mpich
 create mode 100644 examples/smpi/NAS/config/NAS.samples/suite.def.bt
 create mode 100644 examples/smpi/NAS/config/NAS.samples/suite.def.cg
 create mode 100644 examples/smpi/NAS/config/NAS.samples/suite.def.ep
 create mode 100644 examples/smpi/NAS/config/NAS.samples/suite.def.ft
 create mode 100644 examples/smpi/NAS/config/NAS.samples/suite.def.is
 create mode 100644 examples/smpi/NAS/config/NAS.samples/suite.def.lu
 create mode 100644 examples/smpi/NAS/config/NAS.samples/suite.def.mg
 create mode 100644 examples/smpi/NAS/config/NAS.samples/suite.def.small
 create mode 100644 examples/smpi/NAS/config/NAS.samples/suite.def.sp
 create mode 100644 examples/smpi/NAS/config/make.def
 create mode 100644 examples/smpi/NAS/config/make.def.template
 create mode 100644 examples/smpi/NAS/config/make.dummy
 create mode 100644 examples/smpi/NAS/config/suite.def.template
 create mode 100644 examples/smpi/NAS/sys/Makefile
 create mode 100644 examples/smpi/NAS/sys/README
 create mode 100644 examples/smpi/NAS/sys/make.common
 create mode 100755 examples/smpi/NAS/sys/print_header
 create mode 100755 examples/smpi/NAS/sys/print_instructions
 create mode 100644 examples/smpi/NAS/sys/setparams.c
 create mode 100644 examples/smpi/NAS/sys/suite.awk

diff --git a/examples/smpi/NAS/BT/Makefile b/examples/smpi/NAS/BT/Makefile
new file mode 100644
index 0000000000..dd27503221
--- /dev/null
+++ b/examples/smpi/NAS/BT/Makefile
@@ -0,0 +1,106 @@
+SHELL=/bin/sh
+BENCHMARK=bt
+BENCHMARKU=BT
+VEC=
+
+include ../config/make.def
+
+
+OBJS = bt.o make_set.o initialize.o exact_solution.o exact_rhs.o \
+       set_constants.o adi.o define.o copy_faces.o rhs.o solve_subs.o \
+       x_solve$(VEC).o y_solve$(VEC).o z_solve$(VEC).o add.o error.o \
+       verify.o setup_mpi.o \
+       ${COMMON}/print_results.o ${COMMON}/timers.o
+
+include ../sys/make.common
+
+# npbparams.h is included by header.h
+# The following rule should do the trick but many make programs (not gmake)
+# will do the wrong thing and rebuild the world every time (because the
+# mod time on header.h is not changed. One solution would be to 
+# touch header.h but this might cause confusion if someone has
+# accidentally deleted it. Instead, make the dependency on npbparams.h
+# explicit in all the lines below (even though dependence is indirect). 
+
+# header.h: npbparams.h
+
+${PROGRAM}: config
+	@if [ x$(VERSION) = xvec ] ; then	\
+		${MAKE} VEC=_vec exec;		\
+	elif [ x$(VERSION) = xVEC ] ; then	\
+		${MAKE} VEC=_vec exec;		\
+	else					\
+		${MAKE} exec;			\
+	fi
+
+exec: $(OBJS)
+	@if [ x$(SUBTYPE) = xfull ] ; then	\
+		${MAKE} bt-full;		\
+	elif [ x$(SUBTYPE) = xFULL ] ; then	\
+		${MAKE} bt-full;		\
+	elif [ x$(SUBTYPE) = xsimple ] ; then	\
+		${MAKE} bt-simple;		\
+	elif [ x$(SUBTYPE) = xSIMPLE ] ; then	\
+		${MAKE} bt-simple;		\
+	elif [ x$(SUBTYPE) = xfortran ] ; then	\
+		${MAKE} bt-fortran;		\
+	elif [ x$(SUBTYPE) = xFORTRAN ] ; then	\
+		${MAKE} bt-fortran;		\
+	elif [ x$(SUBTYPE) = xepio ] ; then	\
+		${MAKE} bt-epio;		\
+	elif [ x$(SUBTYPE) = xEPIO ] ; then	\
+		${MAKE} bt-epio;		\
+	else					\
+		${MAKE} bt-bt;			\
+	fi
+
+bt-bt: ${OBJS} btio.o
+	${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} btio.o ${FMPI_LIB}
+
+bt-full: ${OBJS} full_mpiio.o btio_common.o
+	${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.mpi_io_full ${OBJS} btio_common.o full_mpiio.o ${FMPI_LIB}
+
+bt-simple: ${OBJS} simple_mpiio.o btio_common.o
+	${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.mpi_io_simple ${OBJS} btio_common.o simple_mpiio.o ${FMPI_LIB}
+
+bt-fortran: ${OBJS} fortran_io.o btio_common.o
+	${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.fortran_io ${OBJS} btio_common.o fortran_io.o ${FMPI_LIB}
+
+bt-epio: ${OBJS} epio.o btio_common.o
+	${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.ep_io ${OBJS} btio_common.o epio.o ${FMPI_LIB}
+
+.f.o:
+	${FCOMPILE} $<
+
+.c.o:
+	${CCOMPILE} $<
+
+
+bt.o:             bt.f  header.h npbparams.h  mpinpb.h
+make_set.o:       make_set.f  header.h npbparams.h  mpinpb.h
+initialize.o:     initialize.f  header.h npbparams.h
+exact_solution.o: exact_solution.f  header.h npbparams.h
+exact_rhs.o:      exact_rhs.f  header.h npbparams.h
+set_constants.o:  set_constants.f  header.h npbparams.h
+adi.o:            adi.f  header.h npbparams.h
+define.o:         define.f  header.h npbparams.h
+copy_faces.o:     copy_faces.f  header.h npbparams.h  mpinpb.h
+rhs.o:            rhs.f  header.h npbparams.h
+x_solve$(VEC).o:  x_solve$(VEC).f  header.h work_lhs$(VEC).h npbparams.h  mpinpb.h
+y_solve$(VEC).o:  y_solve$(VEC).f  header.h work_lhs$(VEC).h npbparams.h  mpinpb.h
+z_solve$(VEC).o:  z_solve$(VEC).f  header.h work_lhs$(VEC).h npbparams.h  mpinpb.h
+solve_subs.o:     solve_subs.f  npbparams.h
+add.o:            add.f  header.h npbparams.h
+error.o:          error.f  header.h npbparams.h  mpinpb.h
+verify.o:         verify.f  header.h npbparams.h  mpinpb.h
+setup_mpi.o:      setup_mpi.f mpinpb.h npbparams.h 
+btio.o:           btio.f  header.h npbparams.h
+btio_common.o:    btio_common.f mpinpb.h npbparams.h 
+fortran_io.o:     fortran_io.f mpinpb.h npbparams.h 
+simple_mpiio.o:   simple_mpiio.f mpinpb.h npbparams.h 
+full_mpiio.o:     full_mpiio.f mpinpb.h npbparams.h 
+epio.o:           epio.f mpinpb.h npbparams.h 
+
+clean:
+	- rm -f *.o *~ mputil*
+	- rm -f  npbparams.h core
diff --git a/examples/smpi/NAS/BT/add.f b/examples/smpi/NAS/BT/add.f
new file mode 100644
index 0000000000..e14cde46ef
--- /dev/null
+++ b/examples/smpi/NAS/BT/add.f
@@ -0,0 +1,30 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine  add
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     addition of update to the vector u
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer  c, i, j, k, m
+
+      do     c = 1, ncells
+         do     k = start(3,c), cell_size(3,c)-end(3,c)-1
+            do     j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do     i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do    m = 1, 5
+                     u(m,i,j,k,c) = u(m,i,j,k,c) + rhs(m,i,j,k,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+      enddo
+
+      return
+      end
diff --git a/examples/smpi/NAS/BT/adi.f b/examples/smpi/NAS/BT/adi.f
new file mode 100644
index 0000000000..58450c028e
--- /dev/null
+++ b/examples/smpi/NAS/BT/adi.f
@@ -0,0 +1,21 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine  adi
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      call copy_faces
+
+      call x_solve
+
+      call y_solve
+
+      call z_solve
+
+      call add
+
+      return
+      end
+
diff --git a/examples/smpi/NAS/BT/bt.f b/examples/smpi/NAS/BT/bt.f
new file mode 100644
index 0000000000..36e50781b3
--- /dev/null
+++ b/examples/smpi/NAS/BT/bt.f
@@ -0,0 +1,275 @@
+!-------------------------------------------------------------------------!
+!                                                                         !
+!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
+!                                                                         !
+!                                   B T                                   !
+!                                                                         !
+!-------------------------------------------------------------------------!
+!                                                                         !
+!    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
+!    It is described in NAS Technical Reports 95-020 and 02-007.          !
+!                                                                         !
+!    Permission to use, copy, distribute and modify this software         !
+!    for any purpose with or without fee is hereby granted.  We           !
+!    request, however, that all derived work reference the NAS            !
+!    Parallel Benchmarks 3.3. This software is provided "as is"           !
+!    without express or implied warranty.                                 !
+!                                                                         !
+!    Information on NPB 3.3, including the technical report, the          !
+!    original specifications, source code, results and information        !
+!    on how to submit new results, is available at:                       !
+!                                                                         !
+!           http://www.nas.nasa.gov/Software/NPB/                         !
+!                                                                         !
+!    Send comments or suggestions to  npb@nas.nasa.gov                    !
+!                                                                         !
+!          NAS Parallel Benchmarks Group                                  !
+!          NASA Ames Research Center                                      !
+!          Mail Stop: T27A-1                                              !
+!          Moffett Field, CA   94035-1000                                 !
+!                                                                         !
+!          E-mail:  npb@nas.nasa.gov                                      !
+!          Fax:     (650) 604-3957                                        !
+!                                                                         !
+!-------------------------------------------------------------------------!
+
+c---------------------------------------------------------------------
+c
+c Authors: R. F. Van der Wijngaart
+c          T. Harris
+c          M. Yarrow
+c
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+       program MPBT
+c---------------------------------------------------------------------
+
+       include  'header.h'
+       include  'mpinpb.h'
+      
+       integer i, niter, step, c, error, fstatus
+       double precision navg, mflops, mbytes, n3
+
+       external timer_read
+       double precision t, tmax, tiominv, tpc, timer_read
+       logical verified
+       character class, cbuff*40
+
+       integer wr_interval
+
+       call setup_mpi
+       if (.not. active) goto 999
+
+c---------------------------------------------------------------------
+c      Root node reads input file (if it exists) else takes
+c      defaults from parameters
+c---------------------------------------------------------------------
+       if (node .eq. root) then
+          
+          write(*, 1000)
+          open (unit=2,file='inputbt.data',status='old', iostat=fstatus)
+c
+          rd_interval = 0
+          if (fstatus .eq. 0) then
+            write(*,233) 
+ 233        format(' Reading from input file inputbt.data')
+            read (2,*) niter
+            read (2,*) dt
+            read (2,*) grid_points(1), grid_points(2), grid_points(3)
+            if (iotype .ne. 0) then
+                read (2,'(A)') cbuff
+                read (cbuff,*,iostat=i) wr_interval, rd_interval
+                if (i .ne. 0) rd_interval = 0
+                if (wr_interval .le. 0) wr_interval = wr_default
+            endif
+            if (iotype .eq. 1) then
+                read (2,*) collbuf_nodes, collbuf_size
+                write(*,*) 'collbuf_nodes ', collbuf_nodes
+                write(*,*) 'collbuf_size  ', collbuf_size
+            endif
+            close(2)
+          else
+            write(*,234) 
+            niter = niter_default
+            dt    = dt_default
+            grid_points(1) = problem_size
+            grid_points(2) = problem_size
+            grid_points(3) = problem_size
+            wr_interval = wr_default
+            if (iotype .eq. 1) then
+c             set number of nodes involved in collective buffering to 4,
+c             unless total number of nodes is smaller than that.
+c             set buffer size for collective buffering to 1MB per node
+c             collbuf_nodes = min(4,no_nodes)
+c             set default to No-File-Hints with a value of 0
+              collbuf_nodes = 0
+              collbuf_size = 1000000
+            endif
+          endif
+ 234      format(' No input file inputbt.data. Using compiled defaults')
+
+          write(*, 1001) grid_points(1), grid_points(2), grid_points(3)
+          write(*, 1002) niter, dt
+          if (no_nodes .ne. total_nodes) write(*, 1004) total_nodes
+          if (no_nodes .ne. maxcells*maxcells) 
+     >        write(*, 1005) maxcells*maxcells
+          write(*, 1003) no_nodes
+
+          if (iotype .eq. 1) write(*, 1006) 'FULL MPI-IO', wr_interval
+          if (iotype .eq. 2) write(*, 1006) 'SIMPLE MPI-IO', wr_interval
+          if (iotype .eq. 3) write(*, 1006) 'EPIO', wr_interval
+          if (iotype .eq. 4) write(*, 1006) 'FORTRAN IO', wr_interval
+
+ 1000 format(//, ' NAS Parallel Benchmarks 3.3 -- BT Benchmark ',/)
+ 1001     format(' Size: ', i4, 'x', i4, 'x', i4)
+ 1002     format(' Iterations: ', i4, '    dt: ', F11.7)
+ 1004     format(' Total number of processes: ', i5)
+ 1005     format(' WARNING: compiled for ', i5, ' processes ')
+ 1003     format(' Number of active processes: ', i5, /)
+ 1006     format(' BTIO -- ', A, ' write interval: ', i3 /)
+
+       endif
+
+       call mpi_bcast(niter, 1, MPI_INTEGER,
+     >                root, comm_setup, error)
+
+       call mpi_bcast(dt, 1, dp_type, 
+     >                root, comm_setup, error)
+
+       call mpi_bcast(grid_points(1), 3, MPI_INTEGER, 
+     >                root, comm_setup, error)
+
+       call mpi_bcast(wr_interval, 1, MPI_INTEGER,
+     >                root, comm_setup, error)
+
+       call mpi_bcast(rd_interval, 1, MPI_INTEGER,
+     >                root, comm_setup, error)
+
+       call make_set
+
+       do  c = 1, maxcells
+          if ( (cell_size(1,c) .gt. IMAX) .or.
+     >         (cell_size(2,c) .gt. JMAX) .or.
+     >         (cell_size(3,c) .gt. KMAX) ) then
+             print *,node, c, (cell_size(i,c),i=1,3)
+             print *,' Problem size too big for compiled array sizes'
+             goto 999
+          endif
+       end do
+
+       call set_constants
+
+       call initialize
+
+       call setup_btio
+       idump = 0
+
+       call lhsinit
+
+       call exact_rhs
+
+       call compute_buffer_size(5)
+
+c---------------------------------------------------------------------
+c      do one time step to touch all code, and reinitialize
+c---------------------------------------------------------------------
+       call adi
+       call initialize
+
+       call timer_clear(2)
+
+c---------------------------------------------------------------------
+c      Synchronize before placing time stamp
+c---------------------------------------------------------------------
+       call mpi_barrier(comm_setup, error)
+
+       call timer_clear(1)
+       call timer_start(1)
+
+       do  step = 1, niter
+
+          if (node .eq. root) then
+             if (mod(step, 20) .eq. 0 .or. step .eq. niter .or.
+     >           step .eq. 1) then
+                write(*, 200) step
+ 200            format(' Time step ', i4)
+             endif
+          endif
+
+          call adi
+
+          if (iotype .ne. 0) then
+              call timer_start(2)
+              if (mod(step, wr_interval).eq.0 .or. step .eq. niter) then
+                  if (node .eq. root) then
+                      print *, 'Writing data set, time step', step
+                  endif
+                  if (step .eq. niter .and. rd_interval .gt. 1) then
+                      rd_interval = 1
+                  endif
+                  call output_timestep
+                  idump = idump + 1
+              endif
+              call timer_stop(2)
+          endif
+       end do
+
+       call btio_cleanup
+
+       call timer_stop(1)
+       t = timer_read(1)
+       
+       call verify(niter, class, verified)
+
+       call mpi_reduce(t, tmax, 1, 
+     >                 dp_type, MPI_MAX, 
+     >                 root, comm_setup, error)
+
+       if (iotype .ne. 0) then
+          t = timer_read(2)
+          if (t .ne. 0.d0) t = 1.0d0 / t
+          call mpi_reduce(t, tiominv, 1, 
+     >                    dp_type, MPI_SUM, 
+     >                    root, comm_setup, error)
+       endif
+
+       if( node .eq. root ) then
+          n3 = 1.0d0*grid_points(1)*grid_points(2)*grid_points(3)
+          navg = (grid_points(1)+grid_points(2)+grid_points(3))/3.0
+          if( tmax .ne. 0. ) then
+             mflops = 1.0e-6*float(niter)*
+     >     (3478.8*n3-17655.7*navg**2+28023.7*navg)
+     >     / tmax
+          else
+             mflops = 0.0
+          endif
+
+          if (iotype .ne. 0) then
+             mbytes = n3 * 40.0 * idump * 1.0d-6
+             tiominv = tiominv / no_nodes
+             t = 0.0
+             if (tiominv .ne. 0.) t = 1.d0 / tiominv
+             tpc = 0.0
+             if (tmax .ne. 0.) tpc = t * 100.0 / tmax
+             write(*,1100) t, tpc, mbytes, mbytes*tiominv
+ 1100        format(/' BTIO -- statistics:'/
+     >               '   I/O timing in seconds   : ', f14.2/
+     >               '   I/O timing percentage   : ', f14.2/
+     >               '   Total data written (MB) : ', f14.2/
+     >               '   I/O data rate  (MB/sec) : ', f14.2)
+          endif
+
+         call print_results('BT', class, grid_points(1), 
+     >     grid_points(2), grid_points(3), niter, maxcells*maxcells, 
+     >     total_nodes, tmax, mflops, '          floating point', 
+     >     verified, npbversion,compiletime, cs1, cs2, cs3, cs4, cs5, 
+     >     cs6, '(none)')
+       endif
+
+ 999   continue
+       call mpi_barrier(MPI_COMM_WORLD, error)
+       call mpi_finalize(error)
+
+       end
+
diff --git a/examples/smpi/NAS/BT/btio.f b/examples/smpi/NAS/BT/btio.f
new file mode 100644
index 0000000000..1fb730b1f6
--- /dev/null
+++ b/examples/smpi/NAS/BT/btio.f
@@ -0,0 +1,72 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine setup_btio
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine output_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine btio_cleanup
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine btio_verify(verified)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      logical verified
+
+      verified = .true.
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine accumulate_norms(xce_acc)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      double precision xce_acc(5)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine checksum_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      return
+      end
diff --git a/examples/smpi/NAS/BT/btio_common.f b/examples/smpi/NAS/BT/btio_common.f
new file mode 100644
index 0000000000..9227a12b70
--- /dev/null
+++ b/examples/smpi/NAS/BT/btio_common.f
@@ -0,0 +1,30 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine clear_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer cio, kio, jio, ix
+
+      do cio=1,ncells
+          do kio=0, cell_size(3,cio)-1
+              do jio=0, cell_size(2,cio)-1
+                  do ix=0,cell_size(1,cio)-1
+                            u(1,ix, jio,kio,cio) = 0
+                            u(2,ix, jio,kio,cio) = 0
+                            u(3,ix, jio,kio,cio) = 0
+                            u(4,ix, jio,kio,cio) = 0
+                            u(5,ix, jio,kio,cio) = 0
+                  enddo
+              enddo
+          enddo
+      enddo
+
+      return
+      end
+
diff --git a/examples/smpi/NAS/BT/copy_faces.f b/examples/smpi/NAS/BT/copy_faces.f
new file mode 100644
index 0000000000..14b82caf83
--- /dev/null
+++ b/examples/smpi/NAS/BT/copy_faces.f
@@ -0,0 +1,316 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine copy_faces
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     
+c This function copies the face values of a variable defined on a set 
+c of cells to the overlap locations of the adjacent sets of cells. 
+c Because a set of cells interfaces in each direction with exactly one 
+c other set, we only need to fill six different buffers. We could try to 
+c overlap communication with computation, by computing
+c some internal values while communicating boundary values, but this
+c adds so much overhead that it's not clearly useful. 
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer i, j, k, c, m, requests(0:11), p0, p1, 
+     >     p2, p3, p4, p5, b_size(0:5), ss(0:5), 
+     >     sr(0:5), error, statuses(MPI_STATUS_SIZE, 0:11)
+
+c---------------------------------------------------------------------
+c     exit immediately if there are no faces to be copied           
+c---------------------------------------------------------------------
+      if (no_nodes .eq. 1) then
+         call compute_rhs
+         return
+      endif
+
+      ss(0) = start_send_east
+      ss(1) = start_send_west
+      ss(2) = start_send_north
+      ss(3) = start_send_south
+      ss(4) = start_send_top
+      ss(5) = start_send_bottom
+
+      sr(0) = start_recv_east
+      sr(1) = start_recv_west
+      sr(2) = start_recv_north
+      sr(3) = start_recv_south
+      sr(4) = start_recv_top
+      sr(5) = start_recv_bottom
+
+      b_size(0) = east_size   
+      b_size(1) = west_size   
+      b_size(2) = north_size  
+      b_size(3) = south_size  
+      b_size(4) = top_size    
+      b_size(5) = bottom_size 
+
+c---------------------------------------------------------------------
+c     because the difference stencil for the diagonalized scheme is 
+c     orthogonal, we do not have to perform the staged copying of faces, 
+c     but can send all face information simultaneously to the neighboring 
+c     cells in all directions          
+c---------------------------------------------------------------------
+      p0 = 0
+      p1 = 0
+      p2 = 0
+      p3 = 0
+      p4 = 0
+      p5 = 0
+
+      do  c = 1, ncells
+
+c---------------------------------------------------------------------
+c     fill the buffer to be sent to eastern neighbors (i-dir)
+c---------------------------------------------------------------------
+         if (cell_coord(1,c) .ne. ncells) then
+            do   k = 0, cell_size(3,c)-1
+               do   j = 0, cell_size(2,c)-1
+                  do   i = cell_size(1,c)-2, cell_size(1,c)-1
+                     do   m = 1, 5
+                        out_buffer(ss(0)+p0) = u(m,i,j,k,c)
+                        p0 = p0 + 1
+                     end do
+                  end do
+               end do
+            end do
+         endif
+
+c---------------------------------------------------------------------
+c     fill the buffer to be sent to western neighbors 
+c---------------------------------------------------------------------
+         if (cell_coord(1,c) .ne. 1) then
+            do   k = 0, cell_size(3,c)-1
+               do   j = 0, cell_size(2,c)-1
+                  do   i = 0, 1
+                     do   m = 1, 5
+                        out_buffer(ss(1)+p1) = u(m,i,j,k,c)
+                        p1 = p1 + 1
+                     end do
+                  end do
+               end do
+            end do
+
+         endif
+
+c---------------------------------------------------------------------
+c     fill the buffer to be sent to northern neighbors (j_dir)
+c---------------------------------------------------------------------
+         if (cell_coord(2,c) .ne. ncells) then
+            do   k = 0, cell_size(3,c)-1
+               do   j = cell_size(2,c)-2, cell_size(2,c)-1
+                  do   i = 0, cell_size(1,c)-1
+                     do   m = 1, 5
+                        out_buffer(ss(2)+p2) = u(m,i,j,k,c)
+                        p2 = p2 + 1
+                     end do
+                  end do
+               end do
+            end do
+         endif
+
+c---------------------------------------------------------------------
+c     fill the buffer to be sent to southern neighbors 
+c---------------------------------------------------------------------
+         if (cell_coord(2,c).ne. 1) then
+            do   k = 0, cell_size(3,c)-1
+               do   j = 0, 1
+                  do   i = 0, cell_size(1,c)-1   
+                     do   m = 1, 5
+                        out_buffer(ss(3)+p3) = u(m,i,j,k,c)
+                        p3 = p3 + 1
+                     end do
+                  end do
+               end do
+            end do
+         endif
+
+c---------------------------------------------------------------------
+c     fill the buffer to be sent to top neighbors (k-dir)
+c---------------------------------------------------------------------
+         if (cell_coord(3,c) .ne. ncells) then
+            do   k = cell_size(3,c)-2, cell_size(3,c)-1
+               do   j = 0, cell_size(2,c)-1
+                  do   i = 0, cell_size(1,c)-1
+                     do   m = 1, 5
+                        out_buffer(ss(4)+p4) = u(m,i,j,k,c)
+                        p4 = p4 + 1
+                     end do
+                  end do
+               end do
+            end do
+         endif
+
+c---------------------------------------------------------------------
+c     fill the buffer to be sent to bottom neighbors
+c---------------------------------------------------------------------
+         if (cell_coord(3,c).ne. 1) then
+            do    k=0, 1
+               do   j = 0, cell_size(2,c)-1
+                  do   i = 0, cell_size(1,c)-1
+                     do   m = 1, 5
+                        out_buffer(ss(5)+p5) = u(m,i,j,k,c)
+                        p5 = p5 + 1
+                     end do
+                  end do
+               end do
+            end do
+         endif
+
+c---------------------------------------------------------------------
+c     cell loop
+c---------------------------------------------------------------------
+      end do
+
+      call mpi_irecv(in_buffer(sr(0)), b_size(0), 
+     >     dp_type, successor(1), WEST,  
+     >     comm_rhs, requests(0), error)
+      call mpi_irecv(in_buffer(sr(1)), b_size(1), 
+     >     dp_type, predecessor(1), EAST,  
+     >     comm_rhs, requests(1), error)
+      call mpi_irecv(in_buffer(sr(2)), b_size(2), 
+     >     dp_type, successor(2), SOUTH, 
+     >     comm_rhs, requests(2), error)
+      call mpi_irecv(in_buffer(sr(3)), b_size(3), 
+     >     dp_type, predecessor(2), NORTH, 
+     >     comm_rhs, requests(3), error)
+      call mpi_irecv(in_buffer(sr(4)), b_size(4), 
+     >     dp_type, successor(3), BOTTOM,
+     >     comm_rhs, requests(4), error)
+      call mpi_irecv(in_buffer(sr(5)), b_size(5), 
+     >     dp_type, predecessor(3), TOP,   
+     >     comm_rhs, requests(5), error)
+
+      call mpi_isend(out_buffer(ss(0)), b_size(0), 
+     >     dp_type, successor(1),   EAST, 
+     >     comm_rhs, requests(6), error)
+      call mpi_isend(out_buffer(ss(1)), b_size(1), 
+     >     dp_type, predecessor(1), WEST, 
+     >     comm_rhs, requests(7), error)
+      call mpi_isend(out_buffer(ss(2)), b_size(2), 
+     >     dp_type,successor(2),   NORTH, 
+     >     comm_rhs, requests(8), error)
+      call mpi_isend(out_buffer(ss(3)), b_size(3), 
+     >     dp_type,predecessor(2), SOUTH, 
+     >     comm_rhs, requests(9), error)
+      call mpi_isend(out_buffer(ss(4)), b_size(4), 
+     >     dp_type,successor(3),   TOP, 
+     >     comm_rhs,   requests(10), error)
+      call mpi_isend(out_buffer(ss(5)), b_size(5), 
+     >     dp_type,predecessor(3), BOTTOM, 
+     >     comm_rhs,requests(11), error)
+
+
+      call mpi_waitall(12, requests, statuses, error)
+
+c---------------------------------------------------------------------
+c     unpack the data that has just been received;             
+c---------------------------------------------------------------------
+      p0 = 0
+      p1 = 0
+      p2 = 0
+      p3 = 0
+      p4 = 0
+      p5 = 0
+
+      do   c = 1, ncells
+
+         if (cell_coord(1,c) .ne. 1) then
+            do   k = 0, cell_size(3,c)-1
+               do   j = 0, cell_size(2,c)-1
+                  do   i = -2, -1
+                     do   m = 1, 5
+                        u(m,i,j,k,c) = in_buffer(sr(1)+p0)
+                        p0 = p0 + 1
+                     end do
+                  end do
+               end do
+            end do
+         endif
+
+         if (cell_coord(1,c) .ne. ncells) then
+            do  k = 0, cell_size(3,c)-1
+               do  j = 0, cell_size(2,c)-1
+                  do  i = cell_size(1,c), cell_size(1,c)+1
+                     do   m = 1, 5
+                        u(m,i,j,k,c) = in_buffer(sr(0)+p1)
+                        p1 = p1 + 1
+                     end do
+                  end do
+               end do
+            end do
+         end if
+            
+         if (cell_coord(2,c) .ne. 1) then
+            do  k = 0, cell_size(3,c)-1
+               do   j = -2, -1
+                  do  i = 0, cell_size(1,c)-1
+                     do   m = 1, 5
+                        u(m,i,j,k,c) = in_buffer(sr(3)+p2)
+                        p2 = p2 + 1
+                     end do
+                  end do
+               end do
+            end do
+
+         endif
+            
+         if (cell_coord(2,c) .ne. ncells) then
+            do  k = 0, cell_size(3,c)-1
+               do   j = cell_size(2,c), cell_size(2,c)+1
+                  do  i = 0, cell_size(1,c)-1
+                     do   m = 1, 5
+                        u(m,i,j,k,c) = in_buffer(sr(2)+p3)
+                        p3 = p3 + 1
+                     end do
+                  end do
+               end do
+            end do
+         endif
+
+         if (cell_coord(3,c) .ne. 1) then
+            do  k = -2, -1
+               do  j = 0, cell_size(2,c)-1
+                  do  i = 0, cell_size(1,c)-1
+                     do   m = 1, 5
+                        u(m,i,j,k,c) = in_buffer(sr(5)+p4)
+                        p4 = p4 + 1
+                     end do
+                  end do
+               end do
+            end do
+         endif
+
+         if (cell_coord(3,c) .ne. ncells) then
+            do  k = cell_size(3,c), cell_size(3,c)+1
+               do  j = 0, cell_size(2,c)-1
+                  do  i = 0, cell_size(1,c)-1
+                     do   m = 1, 5
+                        u(m,i,j,k,c) = in_buffer(sr(4)+p5)
+                        p5 = p5 + 1
+                     end do
+                  end do
+               end do
+            end do
+         endif
+
+c---------------------------------------------------------------------
+c     cells loop
+c---------------------------------------------------------------------
+      end do
+
+c---------------------------------------------------------------------
+c     do the rest of the rhs that uses the copied face values          
+c---------------------------------------------------------------------
+      call compute_rhs
+
+      return
+      end
diff --git a/examples/smpi/NAS/BT/define.f b/examples/smpi/NAS/BT/define.f
new file mode 100644
index 0000000000..03c4c6edd7
--- /dev/null
+++ b/examples/smpi/NAS/BT/define.f
@@ -0,0 +1,64 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine compute_buffer_size(dim)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer  c, dim, face_size
+
+      if (ncells .eq. 1) return
+
+c---------------------------------------------------------------------
+c     compute the actual sizes of the buffers; note that there is 
+c     always one cell face that doesn't need buffer space, because it 
+c     is at the boundary of the grid
+c---------------------------------------------------------------------
+      west_size = 0
+      east_size = 0
+
+      do   c = 1, ncells
+         face_size = cell_size(2,c) * cell_size(3,c) * dim * 2
+         if (cell_coord(1,c).ne.1) west_size = west_size + face_size
+         if (cell_coord(1,c).ne.ncells) east_size = east_size + 
+     >        face_size 
+      end do
+
+      north_size = 0
+      south_size = 0
+      do   c = 1, ncells
+         face_size = cell_size(1,c)*cell_size(3,c) * dim * 2
+         if (cell_coord(2,c).ne.1) south_size = south_size + face_size
+         if (cell_coord(2,c).ne.ncells) north_size = north_size + 
+     >        face_size 
+      end do
+
+      top_size = 0
+      bottom_size = 0
+      do   c = 1, ncells
+         face_size = cell_size(1,c) * cell_size(2,c) * dim * 2
+         if (cell_coord(3,c).ne.1) bottom_size = bottom_size + 
+     >        face_size
+         if (cell_coord(3,c).ne.ncells) top_size = top_size +
+     >        face_size     
+      end do
+
+      start_send_west   = 1
+      start_send_east   = start_send_west   + west_size
+      start_send_south  = start_send_east   + east_size
+      start_send_north  = start_send_south  + south_size
+      start_send_bottom = start_send_north  + north_size
+      start_send_top    = start_send_bottom + bottom_size
+      start_recv_west   = 1
+      start_recv_east   = start_recv_west   + west_size
+      start_recv_south  = start_recv_east   + east_size
+      start_recv_north  = start_recv_south  + south_size
+      start_recv_bottom = start_recv_north  + north_size
+      start_recv_top    = start_recv_bottom + bottom_size
+
+      return
+      end
+
diff --git a/examples/smpi/NAS/BT/epio.f b/examples/smpi/NAS/BT/epio.f
new file mode 100644
index 0000000000..52b630999d
--- /dev/null
+++ b/examples/smpi/NAS/BT/epio.f
@@ -0,0 +1,165 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine setup_btio
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      character*(128) newfilenm
+      integer m
+
+      if (node .lt. 10000) then
+          write (newfilenm, 996) filenm,node
+      else
+          print *, 'error generating file names (> 10000 nodes)'
+          stop
+      endif
+
+996   format (a,'.',i4.4)
+
+      open (unit=99, file=newfilenm, form='unformatted',
+     $       status='unknown')
+
+      do m = 1, 5
+         xce_sub(m) = 0.d0
+      end do
+
+      idump_sub = 0
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine output_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer ix, iio, jio, kio, cio, aio
+
+      do cio=1,ncells
+          write(99)
+     $         ((((u(aio,ix, jio,kio,cio),aio=1,5),
+     $             ix=0, cell_size(1,cio)-1),
+     $             jio=0, cell_size(2,cio)-1),
+     $             kio=0, cell_size(3,cio)-1)
+      enddo
+
+      idump_sub = idump_sub + 1
+      if (rd_interval .gt. 0) then
+         if (idump_sub .ge. rd_interval) then
+
+            rewind(99)
+            call acc_sub_norms(idump+1)
+
+            rewind(99)
+            idump_sub = 0
+         endif
+      endif
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine acc_sub_norms(idump_cur)
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer idump_cur
+
+      integer ix, jio, kio, cio, ii, m, ichunk
+      double precision xce_single(5)
+
+      ichunk = idump_cur - idump_sub + 1
+      do ii=0, idump_sub-1
+        do cio=1,ncells
+          read(99)
+     $         ((((u(m,ix, jio,kio,cio),m=1,5),
+     $             ix=0, cell_size(1,cio)-1),
+     $             jio=0, cell_size(2,cio)-1),
+     $             kio=0, cell_size(3,cio)-1)
+        enddo
+
+        if (node .eq. root) print *, 'Reading data set ', ii+ichunk
+
+        call error_norm(xce_single)
+        do m = 1, 5
+           xce_sub(m) = xce_sub(m) + xce_single(m)
+        end do
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine btio_cleanup
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      close(unit=99)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine accumulate_norms(xce_acc)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      double precision xce_acc(5)
+
+      character*(128) newfilenm
+      integer m
+
+      if (rd_interval .gt. 0) goto 20
+
+      if (node .lt. 10000) then
+          write (newfilenm, 996) filenm,node
+      else
+          print *, 'error generating file names (> 10000 nodes)'
+          stop
+      endif
+
+996   format (a,'.',i4.4)
+
+      open (unit=99, file=newfilenm,
+     $      form='unformatted')
+
+c     clear the last time step
+
+      call clear_timestep
+
+c     read back the time steps and accumulate norms
+
+      call acc_sub_norms(idump)
+
+      close(unit=99)
+
+ 20   continue
+      do m = 1, 5
+         xce_acc(m) = xce_sub(m) / dble(idump)
+      end do
+
+      return
+      end
diff --git a/examples/smpi/NAS/BT/error.f b/examples/smpi/NAS/BT/error.f
new file mode 100644
index 0000000000..147a582b58
--- /dev/null
+++ b/examples/smpi/NAS/BT/error.f
@@ -0,0 +1,106 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine error_norm(rms)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     this function computes the norm of the difference between the
+c     computed solution and the exact solution
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer c, i, j, k, m, ii, jj, kk, d, error
+      double precision xi, eta, zeta, u_exact(5), rms(5), rms_work(5),
+     >     add
+
+      do m = 1, 5 
+         rms_work(m) = 0.0d0
+      enddo
+
+      do c = 1, ncells
+         kk = 0
+         do k = cell_low(3,c), cell_high(3,c)
+            zeta = dble(k) * dnzm1
+            jj = 0
+            do j = cell_low(2,c), cell_high(2,c)
+               eta = dble(j) * dnym1
+               ii = 0
+               do i = cell_low(1,c), cell_high(1,c)
+                  xi = dble(i) * dnxm1
+                  call exact_solution(xi, eta, zeta, u_exact)
+
+                  do m = 1, 5
+                     add = u(m,ii,jj,kk,c)-u_exact(m)
+                     rms_work(m) = rms_work(m) + add*add
+                  enddo
+                  ii = ii + 1
+               enddo
+               jj = jj + 1
+            enddo
+            kk = kk + 1
+         enddo
+      enddo
+
+      call mpi_allreduce(rms_work, rms, 5, dp_type, 
+     >     MPI_SUM, comm_setup, error)
+
+      do m = 1, 5
+         do d = 1, 3
+            rms(m) = rms(m) / dble(grid_points(d)-2)
+         enddo
+         rms(m) = dsqrt(rms(m))
+      enddo
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine rhs_norm(rms)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer c, i, j, k, d, m, error
+      double precision rms(5), rms_work(5), add
+
+      do m = 1, 5
+         rms_work(m) = 0.0d0
+      enddo 
+
+      do c = 1, ncells
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     add = rhs(m,i,j,k,c)
+                     rms_work(m) = rms_work(m) + add*add
+                  enddo 
+               enddo 
+            enddo 
+         enddo 
+      enddo 
+
+      call mpi_allreduce(rms_work, rms, 5, dp_type, 
+     >     MPI_SUM, comm_setup, error)
+
+      do m = 1, 5
+         do d = 1, 3
+            rms(m) = rms(m) / dble(grid_points(d)-2)
+         enddo 
+         rms(m) = dsqrt(rms(m))
+      enddo 
+
+      return
+      end
+
diff --git a/examples/smpi/NAS/BT/exact_rhs.f b/examples/smpi/NAS/BT/exact_rhs.f
new file mode 100644
index 0000000000..26a2871d20
--- /dev/null
+++ b/examples/smpi/NAS/BT/exact_rhs.f
@@ -0,0 +1,360 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine exact_rhs
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     compute the right hand side based on exact solution
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      double precision dtemp(5), xi, eta, zeta, dtpp
+      integer          c, m, i, j, k, ip1, im1, jp1, 
+     >     jm1, km1, kp1
+
+
+c---------------------------------------------------------------------
+c     loop over all cells owned by this node                   
+c---------------------------------------------------------------------
+      do c = 1, ncells
+
+c---------------------------------------------------------------------
+c     initialize                                  
+c---------------------------------------------------------------------
+         do k= 0, cell_size(3,c)-1
+            do j = 0, cell_size(2,c)-1
+               do i = 0, cell_size(1,c)-1
+                  do m = 1, 5
+                     forcing(m,i,j,k,c) = 0.0d0
+                  enddo
+               enddo
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     xi-direction flux differences                      
+c---------------------------------------------------------------------
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1
+            zeta = dble(k+cell_low(3,c)) * dnzm1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               eta = dble(j+cell_low(2,c)) * dnym1
+
+               do i=-2*(1-start(1,c)), cell_size(1,c)+1-2*end(1,c)
+                  xi = dble(i+cell_low(1,c)) * dnxm1
+
+                  call exact_solution(xi, eta, zeta, dtemp)
+                  do m = 1, 5
+                     ue(i,m) = dtemp(m)
+                  enddo
+
+                  dtpp = 1.0d0 / dtemp(1)
+
+                  do m = 2, 5
+                     buf(i,m) = dtpp * dtemp(m)
+                  enddo
+
+                  cuf(i)   = buf(i,2) * buf(i,2)
+                  buf(i,1) = cuf(i) + buf(i,3) * buf(i,3) + 
+     >                 buf(i,4) * buf(i,4) 
+                  q(i) = 0.5d0*(buf(i,2)*ue(i,2) + buf(i,3)*ue(i,3) +
+     >                 buf(i,4)*ue(i,4))
+
+               enddo
+               
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  im1 = i-1
+                  ip1 = i+1
+
+                  forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
+     >                 tx2*( ue(ip1,2)-ue(im1,2) )+
+     >                 dx1tx1*(ue(ip1,1)-2.0d0*ue(i,1)+ue(im1,1))
+
+                  forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - tx2 * (
+     >                 (ue(ip1,2)*buf(ip1,2)+c2*(ue(ip1,5)-q(ip1)))-
+     >                 (ue(im1,2)*buf(im1,2)+c2*(ue(im1,5)-q(im1))))+
+     >                 xxcon1*(buf(ip1,2)-2.0d0*buf(i,2)+buf(im1,2))+
+     >                 dx2tx1*( ue(ip1,2)-2.0d0* ue(i,2)+ue(im1,2))
+
+                  forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - tx2 * (
+     >                 ue(ip1,3)*buf(ip1,2)-ue(im1,3)*buf(im1,2))+
+     >                 xxcon2*(buf(ip1,3)-2.0d0*buf(i,3)+buf(im1,3))+
+     >                 dx3tx1*( ue(ip1,3)-2.0d0*ue(i,3) +ue(im1,3))
+                  
+                  forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - tx2*(
+     >                 ue(ip1,4)*buf(ip1,2)-ue(im1,4)*buf(im1,2))+
+     >                 xxcon2*(buf(ip1,4)-2.0d0*buf(i,4)+buf(im1,4))+
+     >                 dx4tx1*( ue(ip1,4)-2.0d0* ue(i,4)+ ue(im1,4))
+
+                  forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - tx2*(
+     >                 buf(ip1,2)*(c1*ue(ip1,5)-c2*q(ip1))-
+     >                 buf(im1,2)*(c1*ue(im1,5)-c2*q(im1)))+
+     >                 0.5d0*xxcon3*(buf(ip1,1)-2.0d0*buf(i,1)+
+     >                 buf(im1,1))+
+     >                 xxcon4*(cuf(ip1)-2.0d0*cuf(i)+cuf(im1))+
+     >                 xxcon5*(buf(ip1,5)-2.0d0*buf(i,5)+buf(im1,5))+
+     >                 dx5tx1*( ue(ip1,5)-2.0d0* ue(i,5)+ ue(im1,5))
+               enddo
+
+c---------------------------------------------------------------------
+c     Fourth-order dissipation                         
+c---------------------------------------------------------------------
+               if (start(1,c) .gt. 0) then
+                  do m = 1, 5
+                     i = 1
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (5.0d0*ue(i,m) - 4.0d0*ue(i+1,m) +ue(i+2,m))
+                     i = 2
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (-4.0d0*ue(i-1,m) + 6.0d0*ue(i,m) -
+     >                    4.0d0*ue(i+1,m) +       ue(i+2,m))
+                  enddo
+               endif
+
+               do i = start(1,c)*3, cell_size(1,c)-3*end(1,c)-1
+                  do m = 1, 5
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
+     >                    (ue(i-2,m) - 4.0d0*ue(i-1,m) +
+     >                    6.0d0*ue(i,m) - 4.0d0*ue(i+1,m) + ue(i+2,m))
+                  enddo
+               enddo
+
+               if (end(1,c) .gt. 0) then
+                  do m = 1, 5
+                     i = cell_size(1,c)-3
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (ue(i-2,m) - 4.0d0*ue(i-1,m) +
+     >                    6.0d0*ue(i,m) - 4.0d0*ue(i+1,m))
+                     i = cell_size(1,c)-2
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (ue(i-2,m) - 4.0d0*ue(i-1,m) + 5.0d0*ue(i,m))
+                  enddo
+               endif
+
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     eta-direction flux differences             
+c---------------------------------------------------------------------
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1          
+            zeta = dble(k+cell_low(3,c)) * dnzm1
+            do i=start(1,c), cell_size(1,c)-end(1,c)-1
+               xi = dble(i+cell_low(1,c)) * dnxm1
+
+               do j=-2*(1-start(2,c)), cell_size(2,c)+1-2*end(2,c)
+                  eta = dble(j+cell_low(2,c)) * dnym1
+
+                  call exact_solution(xi, eta, zeta, dtemp)
+                  do m = 1, 5 
+                     ue(j,m) = dtemp(m)
+                  enddo
+                  
+                  dtpp = 1.0d0/dtemp(1)
+
+                  do m = 2, 5
+                     buf(j,m) = dtpp * dtemp(m)
+                  enddo
+
+                  cuf(j)   = buf(j,3) * buf(j,3)
+                  buf(j,1) = cuf(j) + buf(j,2) * buf(j,2) + 
+     >                 buf(j,4) * buf(j,4)
+                  q(j) = 0.5d0*(buf(j,2)*ue(j,2) + buf(j,3)*ue(j,3) +
+     >                 buf(j,4)*ue(j,4))
+               enddo
+
+               do j = start(2,c), cell_size(2,c)-end(2,c)-1
+                  jm1 = j-1
+                  jp1 = j+1
+                  
+                  forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
+     >                 ty2*( ue(jp1,3)-ue(jm1,3) )+
+     >                 dy1ty1*(ue(jp1,1)-2.0d0*ue(j,1)+ue(jm1,1))
+
+                  forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - ty2*(
+     >                 ue(jp1,2)*buf(jp1,3)-ue(jm1,2)*buf(jm1,3))+
+     >                 yycon2*(buf(jp1,2)-2.0d0*buf(j,2)+buf(jm1,2))+
+     >                 dy2ty1*( ue(jp1,2)-2.0* ue(j,2)+ ue(jm1,2))
+
+                  forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - ty2*(
+     >                 (ue(jp1,3)*buf(jp1,3)+c2*(ue(jp1,5)-q(jp1)))-
+     >                 (ue(jm1,3)*buf(jm1,3)+c2*(ue(jm1,5)-q(jm1))))+
+     >                 yycon1*(buf(jp1,3)-2.0d0*buf(j,3)+buf(jm1,3))+
+     >                 dy3ty1*( ue(jp1,3)-2.0d0*ue(j,3) +ue(jm1,3))
+
+                  forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - ty2*(
+     >                 ue(jp1,4)*buf(jp1,3)-ue(jm1,4)*buf(jm1,3))+
+     >                 yycon2*(buf(jp1,4)-2.0d0*buf(j,4)+buf(jm1,4))+
+     >                 dy4ty1*( ue(jp1,4)-2.0d0*ue(j,4)+ ue(jm1,4))
+
+                  forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - ty2*(
+     >                 buf(jp1,3)*(c1*ue(jp1,5)-c2*q(jp1))-
+     >                 buf(jm1,3)*(c1*ue(jm1,5)-c2*q(jm1)))+
+     >                 0.5d0*yycon3*(buf(jp1,1)-2.0d0*buf(j,1)+
+     >                 buf(jm1,1))+
+     >                 yycon4*(cuf(jp1)-2.0d0*cuf(j)+cuf(jm1))+
+     >                 yycon5*(buf(jp1,5)-2.0d0*buf(j,5)+buf(jm1,5))+
+     >                 dy5ty1*(ue(jp1,5)-2.0d0*ue(j,5)+ue(jm1,5))
+               enddo
+
+c---------------------------------------------------------------------
+c     Fourth-order dissipation                      
+c---------------------------------------------------------------------
+               if (start(2,c) .gt. 0) then
+                  do m = 1, 5
+                     j = 1
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (5.0d0*ue(j,m) - 4.0d0*ue(j+1,m) +ue(j+2,m))
+                     j = 2
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (-4.0d0*ue(j-1,m) + 6.0d0*ue(j,m) -
+     >                    4.0d0*ue(j+1,m) +       ue(j+2,m))
+                  enddo
+               endif
+
+               do j = start(2,c)*3, cell_size(2,c)-3*end(2,c)-1
+                  do m = 1, 5
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
+     >                    (ue(j-2,m) - 4.0d0*ue(j-1,m) +
+     >                    6.0d0*ue(j,m) - 4.0d0*ue(j+1,m) + ue(j+2,m))
+                  enddo
+               enddo
+
+               if (end(2,c) .gt. 0) then
+                  do m = 1, 5
+                     j = cell_size(2,c)-3
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (ue(j-2,m) - 4.0d0*ue(j-1,m) +
+     >                    6.0d0*ue(j,m) - 4.0d0*ue(j+1,m))
+                     j = cell_size(2,c)-2
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (ue(j-2,m) - 4.0d0*ue(j-1,m) + 5.0d0*ue(j,m))
+
+                  enddo
+               endif
+
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     zeta-direction flux differences                      
+c---------------------------------------------------------------------
+         do j=start(2,c), cell_size(2,c)-end(2,c)-1
+            eta = dble(j+cell_low(2,c)) * dnym1
+            do i = start(1,c), cell_size(1,c)-end(1,c)-1
+               xi = dble(i+cell_low(1,c)) * dnxm1
+
+               do k=-2*(1-start(3,c)), cell_size(3,c)+1-2*end(3,c)
+                  zeta = dble(k+cell_low(3,c)) * dnzm1
+
+                  call exact_solution(xi, eta, zeta, dtemp)
+                  do m = 1, 5
+                     ue(k,m) = dtemp(m)
+                  enddo
+
+                  dtpp = 1.0d0/dtemp(1)
+
+                  do m = 2, 5
+                     buf(k,m) = dtpp * dtemp(m)
+                  enddo
+
+                  cuf(k)   = buf(k,4) * buf(k,4)
+                  buf(k,1) = cuf(k) + buf(k,2) * buf(k,2) + 
+     >                 buf(k,3) * buf(k,3)
+                  q(k) = 0.5d0*(buf(k,2)*ue(k,2) + buf(k,3)*ue(k,3) +
+     >                 buf(k,4)*ue(k,4))
+               enddo
+
+               do k=start(3,c), cell_size(3,c)-end(3,c)-1
+                  km1 = k-1
+                  kp1 = k+1
+                  
+                  forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
+     >                 tz2*( ue(kp1,4)-ue(km1,4) )+
+     >                 dz1tz1*(ue(kp1,1)-2.0d0*ue(k,1)+ue(km1,1))
+
+                  forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - tz2 * (
+     >                 ue(kp1,2)*buf(kp1,4)-ue(km1,2)*buf(km1,4))+
+     >                 zzcon2*(buf(kp1,2)-2.0d0*buf(k,2)+buf(km1,2))+
+     >                 dz2tz1*( ue(kp1,2)-2.0d0* ue(k,2)+ ue(km1,2))
+
+                  forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - tz2 * (
+     >                 ue(kp1,3)*buf(kp1,4)-ue(km1,3)*buf(km1,4))+
+     >                 zzcon2*(buf(kp1,3)-2.0d0*buf(k,3)+buf(km1,3))+
+     >                 dz3tz1*(ue(kp1,3)-2.0d0*ue(k,3)+ue(km1,3))
+
+                  forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - tz2 * (
+     >                 (ue(kp1,4)*buf(kp1,4)+c2*(ue(kp1,5)-q(kp1)))-
+     >                 (ue(km1,4)*buf(km1,4)+c2*(ue(km1,5)-q(km1))))+
+     >                 zzcon1*(buf(kp1,4)-2.0d0*buf(k,4)+buf(km1,4))+
+     >                 dz4tz1*( ue(kp1,4)-2.0d0*ue(k,4) +ue(km1,4))
+
+                  forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - tz2 * (
+     >                 buf(kp1,4)*(c1*ue(kp1,5)-c2*q(kp1))-
+     >                 buf(km1,4)*(c1*ue(km1,5)-c2*q(km1)))+
+     >                 0.5d0*zzcon3*(buf(kp1,1)-2.0d0*buf(k,1)
+     >                 +buf(km1,1))+
+     >                 zzcon4*(cuf(kp1)-2.0d0*cuf(k)+cuf(km1))+
+     >                 zzcon5*(buf(kp1,5)-2.0d0*buf(k,5)+buf(km1,5))+
+     >                 dz5tz1*( ue(kp1,5)-2.0d0*ue(k,5)+ ue(km1,5))
+               enddo
+
+c---------------------------------------------------------------------
+c     Fourth-order dissipation                        
+c---------------------------------------------------------------------
+               if (start(3,c) .gt. 0) then
+                  do m = 1, 5
+                     k = 1
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (5.0d0*ue(k,m) - 4.0d0*ue(k+1,m) +ue(k+2,m))
+                     k = 2
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (-4.0d0*ue(k-1,m) + 6.0d0*ue(k,m) -
+     >                    4.0d0*ue(k+1,m) +       ue(k+2,m))
+                  enddo
+               endif
+
+               do k = start(3,c)*3, cell_size(3,c)-3*end(3,c)-1
+                  do m = 1, 5
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
+     >                    (ue(k-2,m) - 4.0d0*ue(k-1,m) +
+     >                    6.0d0*ue(k,m) - 4.0d0*ue(k+1,m) + ue(k+2,m))
+                  enddo
+               enddo
+
+               if (end(3,c) .gt. 0) then
+                  do m = 1, 5
+                     k = cell_size(3,c)-3
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (ue(k-2,m) - 4.0d0*ue(k-1,m) +
+     >                    6.0d0*ue(k,m) - 4.0d0*ue(k+1,m))
+                     k = cell_size(3,c)-2
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (ue(k-2,m) - 4.0d0*ue(k-1,m) + 5.0d0*ue(k,m))
+                  enddo
+               endif
+
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     now change the sign of the forcing function, 
+c---------------------------------------------------------------------
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     forcing(m,i,j,k,c) = -1.d0 * forcing(m,i,j,k,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+
+      enddo
+
+      return
+      end
diff --git a/examples/smpi/NAS/BT/exact_solution.f b/examples/smpi/NAS/BT/exact_solution.f
new file mode 100644
index 0000000000..b093b46d16
--- /dev/null
+++ b/examples/smpi/NAS/BT/exact_solution.f
@@ -0,0 +1,29 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine exact_solution(xi,eta,zeta,dtemp)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     this function returns the exact solution at point xi, eta, zeta  
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      double precision  xi, eta, zeta, dtemp(5)
+      integer m
+
+      do m = 1, 5
+         dtemp(m) =  ce(m,1) +
+     >     xi*(ce(m,2) + xi*(ce(m,5) + xi*(ce(m,8) + xi*ce(m,11)))) +
+     >     eta*(ce(m,3) + eta*(ce(m,6) + eta*(ce(m,9) + eta*ce(m,12))))+
+     >     zeta*(ce(m,4) + zeta*(ce(m,7) + zeta*(ce(m,10) + 
+     >     zeta*ce(m,13))))
+      enddo
+
+      return
+      end
+
+
diff --git a/examples/smpi/NAS/BT/fortran_io.f b/examples/smpi/NAS/BT/fortran_io.f
new file mode 100644
index 0000000000..d3085a030a
--- /dev/null
+++ b/examples/smpi/NAS/BT/fortran_io.f
@@ -0,0 +1,174 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine setup_btio
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      character*(128) newfilenm
+      integer m, ierr
+
+      if (node.eq.root) record_length = 40/fortran_rec_sz
+      call mpi_bcast(record_length, 1, MPI_INTEGER,
+     >                root, comm_setup, ierr)
+
+      open (unit=99, file=filenm,
+     $      form='unformatted', access='direct',
+     $      recl=record_length)
+
+      do m = 1, 5
+         xce_sub(m) = 0.d0
+      end do
+
+      idump_sub = 0
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine output_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer ix, jio, kio, cio
+
+      do cio=1,ncells
+          do kio=0, cell_size(3,cio)-1
+              do jio=0, cell_size(2,cio)-1
+                  iseek=(cell_low(1,cio) +
+     $                   PROBLEM_SIZE*((cell_low(2,cio)+jio) +
+     $                   PROBLEM_SIZE*((cell_low(3,cio)+kio) +
+     $                   PROBLEM_SIZE*idump_sub)))
+
+                  do ix=0,cell_size(1,cio)-1
+                      write(99, rec=iseek+ix+1)
+     $                      u(1,ix, jio,kio,cio),
+     $                      u(2,ix, jio,kio,cio),
+     $                      u(3,ix, jio,kio,cio),
+     $                      u(4,ix, jio,kio,cio),
+     $                      u(5,ix, jio,kio,cio)
+                  enddo
+              enddo
+          enddo
+      enddo
+
+      idump_sub = idump_sub + 1
+      if (rd_interval .gt. 0) then
+         if (idump_sub .ge. rd_interval) then
+
+            call acc_sub_norms(idump+1)
+
+            idump_sub = 0
+         endif
+      endif
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine acc_sub_norms(idump_cur)
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer idump_cur
+
+      integer ix, jio, kio, cio, ii, m, ichunk
+      double precision xce_single(5)
+
+      ichunk = idump_cur - idump_sub + 1
+      do ii=0, idump_sub-1
+        do cio=1,ncells
+          do kio=0, cell_size(3,cio)-1
+              do jio=0, cell_size(2,cio)-1
+                  iseek=(cell_low(1,cio) +
+     $                   PROBLEM_SIZE*((cell_low(2,cio)+jio) +
+     $                   PROBLEM_SIZE*((cell_low(3,cio)+kio) +
+     $                   PROBLEM_SIZE*ii)))
+
+
+                  do ix=0,cell_size(1,cio)-1
+                      read(99, rec=iseek+ix+1)
+     $                      u(1,ix, jio,kio,cio),
+     $                      u(2,ix, jio,kio,cio),
+     $                      u(3,ix, jio,kio,cio),
+     $                      u(4,ix, jio,kio,cio),
+     $                      u(5,ix, jio,kio,cio)
+                  enddo
+              enddo
+          enddo
+        enddo
+
+        if (node .eq. root) print *, 'Reading data set ', ii+ichunk
+
+        call error_norm(xce_single)
+        do m = 1, 5
+           xce_sub(m) = xce_sub(m) + xce_single(m)
+        end do
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine btio_cleanup
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      close(unit=99)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine accumulate_norms(xce_acc)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      include 'header.h'
+      include 'mpinpb.h'
+
+      double precision xce_acc(5)
+      integer m
+
+      if (rd_interval .gt. 0) goto 20
+
+      open (unit=99, file=filenm,
+     $      form='unformatted', access='direct',
+     $      recl=record_length)
+
+c     clear the last time step
+
+      call clear_timestep
+
+c     read back the time steps and accumulate norms
+
+      call acc_sub_norms(idump)
+
+      close(unit=99)
+
+ 20   continue
+      do m = 1, 5
+         xce_acc(m) = xce_sub(m) / dble(idump)
+      end do
+
+      return
+      end
diff --git a/examples/smpi/NAS/BT/full_mpiio.f b/examples/smpi/NAS/BT/full_mpiio.f
new file mode 100644
index 0000000000..ecfd41ca73
--- /dev/null
+++ b/examples/smpi/NAS/BT/full_mpiio.f
@@ -0,0 +1,307 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine setup_btio
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer ierr
+      integer mstatus(MPI_STATUS_SIZE)
+      integer sizes(4), starts(4), subsizes(4)
+      integer cell_btype(maxcells), cell_ftype(maxcells)
+      integer cell_blength(maxcells)
+      integer info
+      character*20 cb_nodes, cb_size
+      integer c, m
+      integer cell_disp(maxcells)
+
+       call mpi_bcast(collbuf_nodes, 1, MPI_INTEGER,
+     >                root, comm_setup, ierr)
+
+       call mpi_bcast(collbuf_size, 1, MPI_INTEGER,
+     >                root, comm_setup, ierr)
+
+       if (collbuf_nodes .eq. 0) then
+          info = MPI_INFO_NULL
+       else
+          write (cb_nodes,*) collbuf_nodes
+          write (cb_size,*) collbuf_size
+          call MPI_Info_create(info, ierr)
+          call MPI_Info_set(info, 'cb_nodes', cb_nodes, ierr)
+          call MPI_Info_set(info, 'cb_buffer_size', cb_size, ierr)
+          call MPI_Info_set(info, 'collective_buffering', 'true', ierr)
+       endif
+
+       call MPI_Type_contiguous(5, MPI_DOUBLE_PRECISION,
+     $                          element, ierr)
+       call MPI_Type_commit(element, ierr)
+       call MPI_Type_extent(element, eltext, ierr)
+
+       do  c = 1, ncells
+c
+c Outer array dimensions ar same for every cell
+c
+           sizes(1) = IMAX+4
+           sizes(2) = JMAX+4
+           sizes(3) = KMAX+4
+c
+c 4th dimension is cell number, total of maxcells cells
+c
+           sizes(4) = maxcells
+c
+c Internal dimensions of cells can differ slightly between cells
+c
+           subsizes(1) = cell_size(1, c)
+           subsizes(2) = cell_size(2, c)
+           subsizes(3) = cell_size(3, c)
+c
+c Cell is 4th dimension, 1 cell per cell type to handle varying 
+c cell sub-array sizes
+c
+           subsizes(4) = 1
+
+c
+c type constructors use 0-based start addresses
+c
+           starts(1) = 2 
+           starts(2) = 2
+           starts(3) = 2
+           starts(4) = c-1
+
+c 
+c Create buftype for a cell
+c
+           call MPI_Type_create_subarray(4, sizes, subsizes, 
+     $          starts, MPI_ORDER_FORTRAN, element, 
+     $          cell_btype(c), ierr)
+c
+c block length and displacement for joining cells - 
+c 1 cell buftype per block, cell buftypes have own displacment
+c generated from cell number (4th array dimension)
+c
+           cell_blength(c) = 1
+           cell_disp(c) = 0
+
+       enddo
+c
+c Create combined buftype for all cells
+c
+       call MPI_Type_struct(ncells, cell_blength, cell_disp,
+     $            cell_btype, combined_btype, ierr)
+       call MPI_Type_commit(combined_btype, ierr)
+
+       do  c = 1, ncells
+c
+c Entire array size
+c
+           sizes(1) = PROBLEM_SIZE
+           sizes(2) = PROBLEM_SIZE
+           sizes(3) = PROBLEM_SIZE
+
+c
+c Size of c'th cell
+c
+           subsizes(1) = cell_size(1, c)
+           subsizes(2) = cell_size(2, c)
+           subsizes(3) = cell_size(3, c)
+
+c
+c Starting point in full array of c'th cell
+c
+           starts(1) = cell_low(1,c)
+           starts(2) = cell_low(2,c)
+           starts(3) = cell_low(3,c)
+
+           call MPI_Type_create_subarray(3, sizes, subsizes,
+     $          starts, MPI_ORDER_FORTRAN,
+     $          element, cell_ftype(c), ierr)
+           cell_blength(c) = 1
+           cell_disp(c) = 0
+       enddo
+
+       call MPI_Type_struct(ncells, cell_blength, cell_disp,
+     $            cell_ftype, combined_ftype, ierr)
+       call MPI_Type_commit(combined_ftype, ierr)
+
+       iseek=0
+       if (node .eq. root) then
+          call MPI_File_delete(filenm, MPI_INFO_NULL, ierr)
+       endif
+
+
+      call MPI_Barrier(comm_solve, ierr)
+
+       call MPI_File_open(comm_solve,
+     $          filenm,
+     $          MPI_MODE_RDWR+MPI_MODE_CREATE,
+     $          MPI_INFO_NULL, fp, ierr)
+
+       if (ierr .ne. MPI_SUCCESS) then
+                print *, 'Error opening file'
+                stop
+       endif
+
+        call MPI_File_set_view(fp, iseek, element, 
+     $          combined_ftype, 'native', info, ierr)
+
+       if (ierr .ne. MPI_SUCCESS) then
+                print *, 'Error setting file view'
+                stop
+       endif
+
+      do m = 1, 5
+         xce_sub(m) = 0.d0
+      end do
+
+      idump_sub = 0
+
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine output_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer mstatus(MPI_STATUS_SIZE)
+      integer ierr
+
+      call MPI_File_write_at_all(fp, iseek, u,
+     $                           1, combined_btype, mstatus, ierr)
+      if (ierr .ne. MPI_SUCCESS) then
+          print *, 'Error writing to file'
+          stop
+      endif
+
+      call MPI_Type_size(combined_btype, iosize, ierr)
+      iseek = iseek + iosize/eltext
+
+      idump_sub = idump_sub + 1
+      if (rd_interval .gt. 0) then
+         if (idump_sub .ge. rd_interval) then
+
+            iseek = 0
+            call acc_sub_norms(idump+1)
+
+            iseek = 0
+            idump_sub = 0
+         endif
+      endif
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine acc_sub_norms(idump_cur)
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer idump_cur
+
+      integer ii, m, ichunk
+      integer ierr
+      integer mstatus(MPI_STATUS_SIZE)
+      double precision xce_single(5)
+
+      ichunk = idump_cur - idump_sub + 1
+      do ii=0, idump_sub-1
+
+        call MPI_File_read_at_all(fp, iseek, u,
+     $                           1, combined_btype, mstatus, ierr)
+        if (ierr .ne. MPI_SUCCESS) then
+           print *, 'Error reading back file'
+           call MPI_File_close(fp, ierr)
+           stop
+        endif
+
+        call MPI_Type_size(combined_btype, iosize, ierr)
+        iseek = iseek + iosize/eltext
+
+        if (node .eq. root) print *, 'Reading data set ', ii+ichunk
+
+        call error_norm(xce_single)
+        do m = 1, 5
+           xce_sub(m) = xce_sub(m) + xce_single(m)
+        end do
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine btio_cleanup
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer ierr
+
+      call MPI_File_close(fp, ierr)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+
+      subroutine accumulate_norms(xce_acc)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      double precision xce_acc(5)
+      integer m, ierr
+
+      if (rd_interval .gt. 0) goto 20
+
+      call MPI_File_open(comm_solve,
+     $          filenm,
+     $          MPI_MODE_RDONLY,
+     $          MPI_INFO_NULL,
+     $          fp,
+     $          ierr)
+
+      iseek = 0
+      call MPI_File_set_view(fp, iseek, element, combined_ftype,
+     $          'native', MPI_INFO_NULL, ierr)
+
+c     clear the last time step
+
+      call clear_timestep
+
+c     read back the time steps and accumulate norms
+
+      call acc_sub_norms(idump)
+
+      call MPI_File_close(fp, ierr)
+
+ 20   continue
+      do m = 1, 5
+         xce_acc(m) = xce_sub(m) / dble(idump)
+      end do
+
+      return
+      end
+
diff --git a/examples/smpi/NAS/BT/header.h b/examples/smpi/NAS/BT/header.h
new file mode 100644
index 0000000000..47719da674
--- /dev/null
+++ b/examples/smpi/NAS/BT/header.h
@@ -0,0 +1,137 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+c
+c  header.h
+c
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+ 
+      implicit none
+
+c---------------------------------------------------------------------
+c The following include file is generated automatically by the
+c "setparams" utility. It defines 
+c      maxcells:      the square root of the maximum number of processors
+c      problem_size:  12, 64, 102, 162 (for class T, A, B, C)
+c      dt_default:    default time step for this problem size if no
+c                     config file
+c      niter_default: default number of iterations for this problem size
+c---------------------------------------------------------------------
+
+      include 'npbparams.h'
+
+      integer           aa, bb, cc, BLOCK_SIZE
+      parameter (aa=1, bb=2, cc=3, BLOCK_SIZE=5)
+
+      integer           ncells, grid_points(3)
+      double precision  elapsed_time
+      common /global/   elapsed_time, ncells, grid_points
+
+      double precision  tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, 
+     >                  dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, 
+     >                  dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, 
+     >                  ce(5,13), dxmax, dymax, dzmax, xxcon1, xxcon2, 
+     >                  xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
+     >                  dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
+     >                  yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
+     >                  zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, 
+     >                  dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, 
+     >                  dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, 
+     >                  c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
+     >                  dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, 
+     >                  c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, 
+     >                  c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16
+
+      common /constants/ tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3,
+     >                  dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, 
+     >                  dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, 
+     >                  ce, dxmax, dymax, dzmax, xxcon1, xxcon2, 
+     >                  xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
+     >                  dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
+     >                  yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
+     >                  zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, 
+     >                  dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, 
+     >                  dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, 
+     >                  c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
+     >                  dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, 
+     >                  c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, 
+     >                  c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16
+
+      integer           EAST, WEST, NORTH, SOUTH, 
+     >                  BOTTOM, TOP
+
+      parameter (EAST=2000, WEST=3000,      NORTH=4000, SOUTH=5000,
+     >           BOTTOM=6000, TOP=7000)
+
+      integer cell_coord (3,maxcells), cell_low (3,maxcells), 
+     >        cell_high  (3,maxcells), cell_size(3,maxcells),
+     >        predecessor(3),          slice    (3,maxcells),
+     >        grid_size  (3),          successor(3)         ,
+     >        start      (3,maxcells), end      (3,maxcells)
+      common /partition/ cell_coord, cell_low, cell_high, cell_size,
+     >                   grid_size, successor, predecessor, slice,
+     >                   start, end
+
+      integer IMAX, JMAX, KMAX, MAX_CELL_DIM, BUF_SIZE
+
+      parameter (MAX_CELL_DIM = (problem_size/maxcells)+1)
+
+      parameter (IMAX=MAX_CELL_DIM,JMAX=MAX_CELL_DIM,KMAX=MAX_CELL_DIM)
+
+      parameter (BUF_SIZE=MAX_CELL_DIM*MAX_CELL_DIM*(maxcells-1)*60+1)
+
+      double precision 
+     >   us      (    -1:IMAX,  -1:JMAX,  -1:KMAX,   maxcells),
+     >   vs      (    -1:IMAX,  -1:JMAX,  -1:KMAX,   maxcells),
+     >   ws      (    -1:IMAX,  -1:JMAX,  -1:KMAX,   maxcells),
+     >   qs      (    -1:IMAX,  -1:JMAX,  -1:KMAX,   maxcells),
+     >   rho_i   (    -1:IMAX,  -1:JMAX,  -1:KMAX,   maxcells),
+     >   square  (    -1:IMAX,  -1:JMAX,  -1:KMAX,   maxcells),
+     >   forcing (5,   0:IMAX-1, 0:JMAX-1, 0:KMAX-1, maxcells),
+     >   u       (5,  -2:IMAX+1,-2:JMAX+1,-2:KMAX+1, maxcells),
+     >   rhs     (5,  -1:IMAX-1,-1:JMAX-1,-1:KMAX-1, maxcells),
+     >   lhsc    (5,5,-1:IMAX-1,-1:JMAX-1,-1:KMAX-1, maxcells),
+     >   backsub_info (5, 0:MAX_CELL_DIM, 0:MAX_CELL_DIM, maxcells),
+     >   in_buffer(BUF_SIZE), out_buffer(BUF_SIZE)
+      common /fields/  u, us, vs, ws, qs, rho_i, square, 
+     >                 rhs, forcing, lhsc, in_buffer, out_buffer,
+     >                 backsub_info
+
+      double precision cv(-2:MAX_CELL_DIM+1),   rhon(-2:MAX_CELL_DIM+1),
+     >                 rhos(-2:MAX_CELL_DIM+1), rhoq(-2:MAX_CELL_DIM+1),
+     >                 cuf(-2:MAX_CELL_DIM+1),  q(-2:MAX_CELL_DIM+1),
+     >                 ue(-2:MAX_CELL_DIM+1,5), buf(-2:MAX_CELL_DIM+1,5)
+      common /work_1d/ cv, rhon, rhos, rhoq, cuf, q, ue, buf
+
+      integer  west_size, east_size, bottom_size, top_size,
+     >         north_size, south_size, start_send_west, 
+     >         start_send_east, start_send_south, start_send_north,
+     >         start_send_bottom, start_send_top, start_recv_west,
+     >         start_recv_east, start_recv_south, start_recv_north,
+     >         start_recv_bottom, start_recv_top
+      common /box/ west_size, east_size, bottom_size,
+     >             top_size, north_size, south_size, 
+     >             start_send_west, start_send_east, start_send_south,
+     >             start_send_north, start_send_bottom, start_send_top,
+     >             start_recv_west, start_recv_east, start_recv_south,
+     >             start_recv_north, start_recv_bottom, start_recv_top
+
+      double precision  tmp_block(5,5), b_inverse(5,5), tmp_vec(5)
+      common /work_solve/ tmp_block, b_inverse, tmp_vec
+
+c
+c     These are used by btio
+c
+      integer collbuf_nodes, collbuf_size, iosize, eltext,
+     $        combined_btype, fp, idump, record_length, element,
+     $        combined_ftype, idump_sub, rd_interval
+      common /btio/ collbuf_nodes, collbuf_size, iosize, eltext,
+     $              combined_btype, fp, idump, record_length,
+     $              idump_sub, rd_interval
+      double precision sum(niter_default), xce_sub(5)
+      common /btio/ sum, xce_sub
+      integer*8 iseek
+      common /btio/ iseek, element, combined_ftype
+
+
+
diff --git a/examples/smpi/NAS/BT/initialize.f b/examples/smpi/NAS/BT/initialize.f
new file mode 100644
index 0000000000..274cdb1899
--- /dev/null
+++ b/examples/smpi/NAS/BT/initialize.f
@@ -0,0 +1,308 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine  initialize
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     This subroutine initializes the field variable u using 
+c     tri-linear transfinite interpolation of the boundary values     
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      
+      integer c, i, j, k, m, ii, jj, kk, ix, iy, iz
+      double precision  xi, eta, zeta, Pface(5,3,2), Pxi, Peta, 
+     >     Pzeta, temp(5)
+
+c---------------------------------------------------------------------
+c  Later (in compute_rhs) we compute 1/u for every element. A few of 
+c  the corner elements are not used, but it convenient (and faster) 
+c  to compute the whole thing with a simple loop. Make sure those 
+c  values are nonzero by initializing the whole thing here. 
+c---------------------------------------------------------------------
+      do c = 1, ncells
+         do kk = -1, KMAX
+            do jj = -1, JMAX
+               do ii = -1, IMAX
+                  do m = 1, 5
+                     u(m, ii, jj, kk, c) = 1.0
+                  end do
+               end do
+            end do
+         end do
+      end do
+c---------------------------------------------------------------------
+
+
+
+c---------------------------------------------------------------------
+c     first store the "interpolated" values everywhere on the grid    
+c---------------------------------------------------------------------
+      do c=1, ncells
+         kk = 0
+         do k = cell_low(3,c), cell_high(3,c)
+            zeta = dble(k) * dnzm1
+            jj = 0
+            do j = cell_low(2,c), cell_high(2,c)
+               eta = dble(j) * dnym1
+               ii = 0
+               do i = cell_low(1,c), cell_high(1,c)
+                  xi = dble(i) * dnxm1
+                  
+                  do ix = 1, 2
+                     call exact_solution(dble(ix-1), eta, zeta, 
+     >                    Pface(1,1,ix))
+                  enddo
+
+                  do iy = 1, 2
+                     call exact_solution(xi, dble(iy-1) , zeta, 
+     >                    Pface(1,2,iy))
+                  enddo
+
+                  do iz = 1, 2
+                     call exact_solution(xi, eta, dble(iz-1),   
+     >                    Pface(1,3,iz))
+                  enddo
+
+                  do m = 1, 5
+                     Pxi   = xi   * Pface(m,1,2) + 
+     >                    (1.0d0-xi)   * Pface(m,1,1)
+                     Peta  = eta  * Pface(m,2,2) + 
+     >                    (1.0d0-eta)  * Pface(m,2,1)
+                     Pzeta = zeta * Pface(m,3,2) + 
+     >                    (1.0d0-zeta) * Pface(m,3,1)
+                     
+                     u(m,ii,jj,kk,c) = Pxi + Peta + Pzeta - 
+     >                    Pxi*Peta - Pxi*Pzeta - Peta*Pzeta + 
+     >                    Pxi*Peta*Pzeta
+
+                  enddo
+                  ii = ii + 1
+               enddo
+               jj = jj + 1
+            enddo
+            kk = kk+1
+         enddo
+      enddo
+
+c---------------------------------------------------------------------
+c     now store the exact values on the boundaries        
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     west face                                                  
+c---------------------------------------------------------------------
+      c = slice(1,1)
+      ii = 0
+      xi = 0.0d0
+      kk = 0
+      do k = cell_low(3,c), cell_high(3,c)
+         zeta = dble(k) * dnzm1
+         jj = 0
+         do j = cell_low(2,c), cell_high(2,c)
+            eta = dble(j) * dnym1
+            call exact_solution(xi, eta, zeta, temp)
+            do m = 1, 5
+               u(m,ii,jj,kk,c) = temp(m)
+            enddo
+            jj = jj + 1
+         enddo
+         kk = kk + 1
+      enddo
+
+c---------------------------------------------------------------------
+c     east face                                                      
+c---------------------------------------------------------------------
+      c  = slice(1,ncells)
+      ii = cell_size(1,c)-1
+      xi = 1.0d0
+      kk = 0
+      do k = cell_low(3,c), cell_high(3,c)
+         zeta = dble(k) * dnzm1
+         jj = 0
+         do j = cell_low(2,c), cell_high(2,c)
+            eta = dble(j) * dnym1
+            call exact_solution(xi, eta, zeta, temp)
+            do m = 1, 5
+               u(m,ii,jj,kk,c) = temp(m)
+            enddo
+            jj = jj + 1
+         enddo
+         kk = kk + 1
+      enddo
+
+c---------------------------------------------------------------------
+c     south face                                                 
+c---------------------------------------------------------------------
+      c = slice(2,1)
+      jj = 0
+      eta = 0.0d0
+      kk = 0
+      do k = cell_low(3,c), cell_high(3,c)
+         zeta = dble(k) * dnzm1
+         ii = 0
+         do i = cell_low(1,c), cell_high(1,c)
+            xi = dble(i) * dnxm1
+            call exact_solution(xi, eta, zeta, temp)
+            do m = 1, 5
+               u(m,ii,jj,kk,c) = temp(m)
+            enddo
+            ii = ii + 1
+         enddo
+         kk = kk + 1
+      enddo
+
+
+c---------------------------------------------------------------------
+c     north face                                    
+c---------------------------------------------------------------------
+      c = slice(2,ncells)
+      jj = cell_size(2,c)-1
+      eta = 1.0d0
+      kk = 0
+      do k = cell_low(3,c), cell_high(3,c)
+         zeta = dble(k) * dnzm1
+         ii = 0
+         do i = cell_low(1,c), cell_high(1,c)
+            xi = dble(i) * dnxm1
+            call exact_solution(xi, eta, zeta, temp)
+            do m = 1, 5
+               u(m,ii,jj,kk,c) = temp(m)
+            enddo
+            ii = ii + 1
+         enddo
+         kk = kk + 1
+      enddo
+
+c---------------------------------------------------------------------
+c     bottom face                                       
+c---------------------------------------------------------------------
+      c = slice(3,1)
+      kk = 0
+      zeta = 0.0d0
+      jj = 0
+      do j = cell_low(2,c), cell_high(2,c)
+         eta = dble(j) * dnym1
+         ii = 0
+         do i =cell_low(1,c), cell_high(1,c)
+            xi = dble(i) *dnxm1
+            call exact_solution(xi, eta, zeta, temp)
+            do m = 1, 5
+               u(m,ii,jj,kk,c) = temp(m)
+            enddo
+            ii = ii + 1
+         enddo
+         jj = jj + 1
+      enddo
+
+c---------------------------------------------------------------------
+c     top face     
+c---------------------------------------------------------------------
+      c = slice(3,ncells)
+      kk = cell_size(3,c)-1
+      zeta = 1.0d0
+      jj = 0
+      do j = cell_low(2,c), cell_high(2,c)
+         eta = dble(j) * dnym1
+         ii = 0
+         do i =cell_low(1,c), cell_high(1,c)
+            xi = dble(i) * dnxm1
+            call exact_solution(xi, eta, zeta, temp)
+            do m = 1, 5
+               u(m,ii,jj,kk,c) = temp(m)
+            enddo
+            ii = ii + 1
+         enddo
+         jj = jj + 1
+      enddo
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine lhsinit
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      
+      integer i, j, k, d, c, m, n
+
+c---------------------------------------------------------------------
+c     loop over all cells                                       
+c---------------------------------------------------------------------
+      do c = 1, ncells
+
+c---------------------------------------------------------------------
+c     first, initialize the start and end arrays
+c---------------------------------------------------------------------
+         do d = 1, 3
+            if (cell_coord(d,c) .eq. 1) then
+               start(d,c) = 1
+            else 
+               start(d,c) = 0
+            endif
+            if (cell_coord(d,c) .eq. ncells) then
+               end(d,c) = 1
+            else
+               end(d,c) = 0
+            endif
+         enddo
+
+c---------------------------------------------------------------------
+c     zero the whole left hand side for starters
+c---------------------------------------------------------------------
+         do k = 0, cell_size(3,c)-1
+            do j = 0, cell_size(2,c)-1
+               do i = 0, cell_size(1,c)-1
+                  do m = 1,5
+                     do n = 1, 5
+                        lhsc(m,n,i,j,k,c) = 0.0d0
+                     enddo
+                  enddo
+               enddo
+            enddo
+         enddo
+
+      enddo
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine lhsabinit(lhsa, lhsb, size)
+      implicit none
+
+      integer size
+      double precision lhsa(5, 5, -1:size), lhsb(5, 5, -1:size)
+
+      integer i, m, n
+
+c---------------------------------------------------------------------
+c     next, set all diagonal values to 1. This is overkill, but convenient
+c---------------------------------------------------------------------
+      do i = 0, size
+         do m = 1, 5
+            do n = 1, 5
+               lhsa(m,n,i) = 0.0d0
+               lhsb(m,n,i) = 0.0d0
+            enddo
+            lhsb(m,m,i) = 1.0d0
+         enddo
+      enddo
+
+      return
+      end
+
+
+
diff --git a/examples/smpi/NAS/BT/inputbt.data.sample b/examples/smpi/NAS/BT/inputbt.data.sample
new file mode 100644
index 0000000000..776654e8d0
--- /dev/null
+++ b/examples/smpi/NAS/BT/inputbt.data.sample
@@ -0,0 +1,5 @@
+200       number of time steps
+0.0008d0  dt for class A = 0.0008d0. class B = 0.0003d0  class C = 0.0001d0
+64 64 64
+5 0        write interval (optional read interval) for BTIO
+0 1000000  number of nodes in collective buffering and buffer size for BTIO
diff --git a/examples/smpi/NAS/BT/make_set.f b/examples/smpi/NAS/BT/make_set.f
new file mode 100644
index 0000000000..b8d90c65a4
--- /dev/null
+++ b/examples/smpi/NAS/BT/make_set.f
@@ -0,0 +1,124 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine make_set
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     This function allocates space for a set of cells and fills the set     
+c     such that communication between cells on different nodes is only
+c     nearest neighbor                                                   
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+
+      integer p, i, j, c, dir, size, excess, ierr,ierrcode
+
+c---------------------------------------------------------------------
+c     compute square root; add small number to allow for roundoff
+c     (note: this is computed in setup_mpi.f also, but prefer to do
+c     it twice because of some include file problems).
+c---------------------------------------------------------------------
+      ncells = dint(dsqrt(dble(no_nodes) + 0.00001d0))
+
+c---------------------------------------------------------------------
+c     this makes coding easier
+c---------------------------------------------------------------------
+      p = ncells
+      
+c---------------------------------------------------------------------
+c     determine the location of the cell at the bottom of the 3D 
+c     array of cells
+c---------------------------------------------------------------------
+      cell_coord(1,1) = mod(node,p) 
+      cell_coord(2,1) = node/p 
+      cell_coord(3,1) = 0
+
+c---------------------------------------------------------------------
+c     set the cell_coords for cells in the rest of the z-layers; 
+c     this comes down to a simple linear numbering in the z-direct-
+c     ion, and to the doubly-cyclic numbering in the other dirs     
+c---------------------------------------------------------------------
+      do c=2, p
+         cell_coord(1,c) = mod(cell_coord(1,c-1)+1,p) 
+         cell_coord(2,c) = mod(cell_coord(2,c-1)-1+p,p) 
+         cell_coord(3,c) = c-1
+      end do
+
+c---------------------------------------------------------------------
+c     offset all the coordinates by 1 to adjust for Fortran arrays
+c---------------------------------------------------------------------
+      do dir = 1, 3
+         do c = 1, p
+            cell_coord(dir,c) = cell_coord(dir,c) + 1
+         end do
+      end do
+      
+c---------------------------------------------------------------------
+c     slice(dir,n) contains the sequence number of the cell that is in
+c     coordinate plane n in the dir direction
+c---------------------------------------------------------------------
+      do dir = 1, 3
+         do c = 1, p
+            slice(dir,cell_coord(dir,c)) = c
+         end do
+      end do
+
+
+c---------------------------------------------------------------------
+c     fill the predecessor and successor entries, using the indices 
+c     of the bottom cells (they are the same at each level of k 
+c     anyway) acting as if full periodicity pertains; note that p is
+c     added to those arguments to the mod functions that might
+c     otherwise return wrong values when using the modulo function
+c---------------------------------------------------------------------
+      i = cell_coord(1,1)-1
+      j = cell_coord(2,1)-1
+
+      predecessor(1) = mod(i-1+p,p) + p*j
+      predecessor(2) = i + p*mod(j-1+p,p)
+      predecessor(3) = mod(i+1,p) + p*mod(j-1+p,p)
+      successor(1)   = mod(i+1,p) + p*j
+      successor(2)   = i + p*mod(j+1,p)
+      successor(3)   = mod(i-1+p,p) + p*mod(j+1,p)
+
+c---------------------------------------------------------------------
+c     now compute the sizes of the cells                                    
+c---------------------------------------------------------------------
+      do dir= 1, 3
+c---------------------------------------------------------------------
+c     set cell_coord range for each direction                            
+c---------------------------------------------------------------------
+         size   = grid_points(dir)/p
+         excess = mod(grid_points(dir),p)
+         do c=1, ncells
+            if (cell_coord(dir,c) .le. excess) then
+               cell_size(dir,c) = size+1
+               cell_low(dir,c) = (cell_coord(dir,c)-1)*(size+1)
+               cell_high(dir,c) = cell_low(dir,c)+size
+            else 
+               cell_size(dir,c) = size
+               cell_low(dir,c)  = excess*(size+1)+
+     >              (cell_coord(dir,c)-excess-1)*size
+               cell_high(dir,c) = cell_low(dir,c)+size-1
+            endif
+            if (cell_size(dir, c) .le. 2) then
+               write(*,50)
+ 50            format(' Error: Cell size too small. Min size is 3')
+               call MPI_Abort(mpi_comm_world,ierrcode,ierr)
+               stop
+            endif
+         end do
+      end do
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+
diff --git a/examples/smpi/NAS/BT/mpinpb.h b/examples/smpi/NAS/BT/mpinpb.h
new file mode 100644
index 0000000000..f621f08b64
--- /dev/null
+++ b/examples/smpi/NAS/BT/mpinpb.h
@@ -0,0 +1,12 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'mpif.h'
+
+      integer           node, no_nodes, total_nodes, root, comm_setup, 
+     >                  comm_solve, comm_rhs, dp_type
+      logical           active
+      common /mpistuff/ node, no_nodes, total_nodes, root, comm_setup, 
+     >                  comm_solve, comm_rhs, dp_type, active
+
diff --git a/examples/smpi/NAS/BT/rhs.f b/examples/smpi/NAS/BT/rhs.f
new file mode 100644
index 0000000000..89171a6741
--- /dev/null
+++ b/examples/smpi/NAS/BT/rhs.f
@@ -0,0 +1,425 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine compute_rhs
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer c, i, j, k, m
+      double precision rho_inv, uijk, up1, um1, vijk, vp1, vm1,
+     >     wijk, wp1, wm1
+
+
+c---------------------------------------------------------------------
+c     loop over all cells owned by this node                           
+c---------------------------------------------------------------------
+      do c = 1, ncells
+
+c---------------------------------------------------------------------
+c     compute the reciprocal of density, and the kinetic energy, 
+c     and the speed of sound.
+c---------------------------------------------------------------------
+         do k = -1, cell_size(3,c)
+            do j = -1, cell_size(2,c)
+               do i = -1, cell_size(1,c)
+                  rho_inv = 1.0d0/u(1,i,j,k,c)
+                  rho_i(i,j,k,c) = rho_inv
+                  us(i,j,k,c) = u(2,i,j,k,c) * rho_inv
+                  vs(i,j,k,c) = u(3,i,j,k,c) * rho_inv
+                  ws(i,j,k,c) = u(4,i,j,k,c) * rho_inv
+                  square(i,j,k,c)     = 0.5d0* (
+     >                 u(2,i,j,k,c)*u(2,i,j,k,c) + 
+     >                 u(3,i,j,k,c)*u(3,i,j,k,c) +
+     >                 u(4,i,j,k,c)*u(4,i,j,k,c) ) * rho_inv
+                  qs(i,j,k,c) = square(i,j,k,c) * rho_inv
+               enddo
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c copy the exact forcing term to the right hand side;  because 
+c this forcing term is known, we can store it on the whole of every 
+c cell,  including the boundary                   
+c---------------------------------------------------------------------
+
+         do k = 0, cell_size(3,c)-1
+            do j = 0, cell_size(2,c)-1
+               do i = 0, cell_size(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = forcing(m,i,j,k,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+
+
+c---------------------------------------------------------------------
+c     compute xi-direction fluxes 
+c---------------------------------------------------------------------
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  uijk = us(i,j,k,c)
+                  up1  = us(i+1,j,k,c)
+                  um1  = us(i-1,j,k,c)
+
+                  rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dx1tx1 * 
+     >                 (u(1,i+1,j,k,c) - 2.0d0*u(1,i,j,k,c) + 
+     >                 u(1,i-1,j,k,c)) -
+     >                 tx2 * (u(2,i+1,j,k,c) - u(2,i-1,j,k,c))
+
+                  rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dx2tx1 * 
+     >                 (u(2,i+1,j,k,c) - 2.0d0*u(2,i,j,k,c) + 
+     >                 u(2,i-1,j,k,c)) +
+     >                 xxcon2*con43 * (up1 - 2.0d0*uijk + um1) -
+     >                 tx2 * (u(2,i+1,j,k,c)*up1 - 
+     >                 u(2,i-1,j,k,c)*um1 +
+     >                 (u(5,i+1,j,k,c)- square(i+1,j,k,c)-
+     >                 u(5,i-1,j,k,c)+ square(i-1,j,k,c))*
+     >                 c2)
+
+                  rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dx3tx1 * 
+     >                 (u(3,i+1,j,k,c) - 2.0d0*u(3,i,j,k,c) +
+     >                 u(3,i-1,j,k,c)) +
+     >                 xxcon2 * (vs(i+1,j,k,c) - 2.0d0*vs(i,j,k,c) +
+     >                 vs(i-1,j,k,c)) -
+     >                 tx2 * (u(3,i+1,j,k,c)*up1 - 
+     >                 u(3,i-1,j,k,c)*um1)
+
+                  rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dx4tx1 * 
+     >                 (u(4,i+1,j,k,c) - 2.0d0*u(4,i,j,k,c) +
+     >                 u(4,i-1,j,k,c)) +
+     >                 xxcon2 * (ws(i+1,j,k,c) - 2.0d0*ws(i,j,k,c) +
+     >                 ws(i-1,j,k,c)) -
+     >                 tx2 * (u(4,i+1,j,k,c)*up1 - 
+     >                 u(4,i-1,j,k,c)*um1)
+
+                  rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dx5tx1 * 
+     >                 (u(5,i+1,j,k,c) - 2.0d0*u(5,i,j,k,c) +
+     >                 u(5,i-1,j,k,c)) +
+     >                 xxcon3 * (qs(i+1,j,k,c) - 2.0d0*qs(i,j,k,c) +
+     >                 qs(i-1,j,k,c)) +
+     >                 xxcon4 * (up1*up1 -       2.0d0*uijk*uijk + 
+     >                 um1*um1) +
+     >                 xxcon5 * (u(5,i+1,j,k,c)*rho_i(i+1,j,k,c) - 
+     >                 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) +
+     >                 u(5,i-1,j,k,c)*rho_i(i-1,j,k,c)) -
+     >                 tx2 * ( (c1*u(5,i+1,j,k,c) - 
+     >                 c2*square(i+1,j,k,c))*up1 -
+     >                 (c1*u(5,i-1,j,k,c) - 
+     >                 c2*square(i-1,j,k,c))*um1 )
+               enddo
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     add fourth order xi-direction dissipation               
+c---------------------------------------------------------------------
+         if (start(1,c) .gt. 0) then
+            do k = start(3,c), cell_size(3,c)-end(3,c)-1
+               do j = start(2,c), cell_size(2,c)-end(2,c)-1
+                  i = 1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp * 
+     >                    ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) +
+     >                    u(m,i+2,j,k,c))
+                  enddo
+
+                  i = 2
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
+     >                    (-4.0d0*u(m,i-1,j,k,c) + 6.0d0*u(m,i,j,k,c) -
+     >                    4.0d0*u(m,i+1,j,k,c) + u(m,i+2,j,k,c))
+                  enddo
+               enddo
+            enddo
+         endif
+
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = 3*start(1,c),cell_size(1,c)-3*end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
+     >                    (  u(m,i-2,j,k,c) - 4.0d0*u(m,i-1,j,k,c) + 
+     >                    6.0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) + 
+     >                    u(m,i+2,j,k,c) )
+                  enddo
+               enddo
+            enddo
+         enddo
+         
+
+         if (end(1,c) .gt. 0) then
+            do k = start(3,c), cell_size(3,c)-end(3,c)-1
+               do j = start(2,c), cell_size(2,c)-end(2,c)-1
+                  i = cell_size(1,c)-3
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+     >                    ( u(m,i-2,j,k,c) - 4.0d0*u(m,i-1,j,k,c) + 
+     >                    6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) )
+                  enddo
+
+                  i = cell_size(1,c)-2
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+     >                    ( u(m,i-2,j,k,c) - 4.d0*u(m,i-1,j,k,c) +
+     >                    5.d0*u(m,i,j,k,c) )
+                  enddo
+               enddo
+            enddo
+         endif
+
+c---------------------------------------------------------------------
+c     compute eta-direction fluxes 
+c---------------------------------------------------------------------
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  vijk = vs(i,j,k,c)
+                  vp1  = vs(i,j+1,k,c)
+                  vm1  = vs(i,j-1,k,c)
+                  rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dy1ty1 * 
+     >                 (u(1,i,j+1,k,c) - 2.0d0*u(1,i,j,k,c) + 
+     >                 u(1,i,j-1,k,c)) -
+     >                 ty2 * (u(3,i,j+1,k,c) - u(3,i,j-1,k,c))
+                  rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dy2ty1 * 
+     >                 (u(2,i,j+1,k,c) - 2.0d0*u(2,i,j,k,c) + 
+     >                 u(2,i,j-1,k,c)) +
+     >                 yycon2 * (us(i,j+1,k,c) - 2.0d0*us(i,j,k,c) + 
+     >                 us(i,j-1,k,c)) -
+     >                 ty2 * (u(2,i,j+1,k,c)*vp1 - 
+     >                 u(2,i,j-1,k,c)*vm1)
+                  rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dy3ty1 * 
+     >                 (u(3,i,j+1,k,c) - 2.0d0*u(3,i,j,k,c) + 
+     >                 u(3,i,j-1,k,c)) +
+     >                 yycon2*con43 * (vp1 - 2.0d0*vijk + vm1) -
+     >                 ty2 * (u(3,i,j+1,k,c)*vp1 - 
+     >                 u(3,i,j-1,k,c)*vm1 +
+     >                 (u(5,i,j+1,k,c) - square(i,j+1,k,c) - 
+     >                 u(5,i,j-1,k,c) + square(i,j-1,k,c))
+     >                 *c2)
+                  rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dy4ty1 * 
+     >                 (u(4,i,j+1,k,c) - 2.0d0*u(4,i,j,k,c) + 
+     >                 u(4,i,j-1,k,c)) +
+     >                 yycon2 * (ws(i,j+1,k,c) - 2.0d0*ws(i,j,k,c) + 
+     >                 ws(i,j-1,k,c)) -
+     >                 ty2 * (u(4,i,j+1,k,c)*vp1 - 
+     >                 u(4,i,j-1,k,c)*vm1)
+                  rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dy5ty1 * 
+     >                 (u(5,i,j+1,k,c) - 2.0d0*u(5,i,j,k,c) + 
+     >                 u(5,i,j-1,k,c)) +
+     >                 yycon3 * (qs(i,j+1,k,c) - 2.0d0*qs(i,j,k,c) + 
+     >                 qs(i,j-1,k,c)) +
+     >                 yycon4 * (vp1*vp1       - 2.0d0*vijk*vijk + 
+     >                 vm1*vm1) +
+     >                 yycon5 * (u(5,i,j+1,k,c)*rho_i(i,j+1,k,c) - 
+     >                 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) +
+     >                 u(5,i,j-1,k,c)*rho_i(i,j-1,k,c)) -
+     >                 ty2 * ((c1*u(5,i,j+1,k,c) - 
+     >                 c2*square(i,j+1,k,c)) * vp1 -
+     >                 (c1*u(5,i,j-1,k,c) - 
+     >                 c2*square(i,j-1,k,c)) * vm1)
+               enddo
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     add fourth order eta-direction dissipation         
+c---------------------------------------------------------------------
+         if (start(2,c) .gt. 0) then
+            do k = start(3,c), cell_size(3,c)-end(3,c)-1
+               j = 1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp * 
+     >                    ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) +
+     >                    u(m,i,j+2,k,c))
+                  enddo
+               enddo
+
+               j = 2
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
+     >                    (-4.0d0*u(m,i,j-1,k,c) + 6.0d0*u(m,i,j,k,c) -
+     >                    4.0d0*u(m,i,j+1,k,c) + u(m,i,j+2,k,c))
+                  enddo
+               enddo
+            enddo
+         endif
+
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1
+            do j = 3*start(2,c), cell_size(2,c)-3*end(2,c)-1
+               do i = start(1,c),cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
+     >                    (  u(m,i,j-2,k,c) - 4.0d0*u(m,i,j-1,k,c) + 
+     >                    6.0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) + 
+     >                    u(m,i,j+2,k,c) )
+                  enddo
+               enddo
+            enddo
+         enddo
+         
+         if (end(2,c) .gt. 0) then
+            do k = start(3,c), cell_size(3,c)-end(3,c)-1
+               j = cell_size(2,c)-3
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+     >                    ( u(m,i,j-2,k,c) - 4.0d0*u(m,i,j-1,k,c) + 
+     >                    6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) )
+                  enddo
+               enddo
+
+               j = cell_size(2,c)-2
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+     >                    ( u(m,i,j-2,k,c) - 4.d0*u(m,i,j-1,k,c) +
+     >                    5.d0*u(m,i,j,k,c) )
+                  enddo
+               enddo
+            enddo
+         endif
+
+c---------------------------------------------------------------------
+c     compute zeta-direction fluxes 
+c---------------------------------------------------------------------
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  wijk = ws(i,j,k,c)
+                  wp1  = ws(i,j,k+1,c)
+                  wm1  = ws(i,j,k-1,c)
+
+                  rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dz1tz1 * 
+     >                 (u(1,i,j,k+1,c) - 2.0d0*u(1,i,j,k,c) + 
+     >                 u(1,i,j,k-1,c)) -
+     >                 tz2 * (u(4,i,j,k+1,c) - u(4,i,j,k-1,c))
+                  rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dz2tz1 * 
+     >                 (u(2,i,j,k+1,c) - 2.0d0*u(2,i,j,k,c) + 
+     >                 u(2,i,j,k-1,c)) +
+     >                 zzcon2 * (us(i,j,k+1,c) - 2.0d0*us(i,j,k,c) + 
+     >                 us(i,j,k-1,c)) -
+     >                 tz2 * (u(2,i,j,k+1,c)*wp1 - 
+     >                 u(2,i,j,k-1,c)*wm1)
+                  rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dz3tz1 * 
+     >                 (u(3,i,j,k+1,c) - 2.0d0*u(3,i,j,k,c) + 
+     >                 u(3,i,j,k-1,c)) +
+     >                 zzcon2 * (vs(i,j,k+1,c) - 2.0d0*vs(i,j,k,c) + 
+     >                 vs(i,j,k-1,c)) -
+     >                 tz2 * (u(3,i,j,k+1,c)*wp1 - 
+     >                 u(3,i,j,k-1,c)*wm1)
+                  rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dz4tz1 * 
+     >                 (u(4,i,j,k+1,c) - 2.0d0*u(4,i,j,k,c) + 
+     >                 u(4,i,j,k-1,c)) +
+     >                 zzcon2*con43 * (wp1 - 2.0d0*wijk + wm1) -
+     >                 tz2 * (u(4,i,j,k+1,c)*wp1 - 
+     >                 u(4,i,j,k-1,c)*wm1 +
+     >                 (u(5,i,j,k+1,c) - square(i,j,k+1,c) - 
+     >                 u(5,i,j,k-1,c) + square(i,j,k-1,c))
+     >                 *c2)
+                  rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dz5tz1 * 
+     >                 (u(5,i,j,k+1,c) - 2.0d0*u(5,i,j,k,c) + 
+     >                 u(5,i,j,k-1,c)) +
+     >                 zzcon3 * (qs(i,j,k+1,c) - 2.0d0*qs(i,j,k,c) + 
+     >                 qs(i,j,k-1,c)) +
+     >                 zzcon4 * (wp1*wp1 - 2.0d0*wijk*wijk + 
+     >                 wm1*wm1) +
+     >                 zzcon5 * (u(5,i,j,k+1,c)*rho_i(i,j,k+1,c) - 
+     >                 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) +
+     >                 u(5,i,j,k-1,c)*rho_i(i,j,k-1,c)) -
+     >                 tz2 * ( (c1*u(5,i,j,k+1,c) - 
+     >                 c2*square(i,j,k+1,c))*wp1 -
+     >                 (c1*u(5,i,j,k-1,c) - 
+     >                 c2*square(i,j,k-1,c))*wm1)
+               enddo
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     add fourth order zeta-direction dissipation                
+c---------------------------------------------------------------------
+         if (start(3,c) .gt. 0) then
+            k = 1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp * 
+     >                    ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) +
+     >                    u(m,i,j,k+2,c))
+                  enddo
+               enddo
+            enddo
+
+            k = 2
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
+     >                    (-4.0d0*u(m,i,j,k-1,c) + 6.0d0*u(m,i,j,k,c) -
+     >                    4.0d0*u(m,i,j,k+1,c) + u(m,i,j,k+2,c))
+                  enddo
+               enddo
+            enddo
+         endif
+
+         do k = 3*start(3,c), cell_size(3,c)-3*end(3,c)-1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c),cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
+     >                    (  u(m,i,j,k-2,c) - 4.0d0*u(m,i,j,k-1,c) + 
+     >                    6.0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) + 
+     >                    u(m,i,j,k+2,c) )
+                  enddo
+               enddo
+            enddo
+         enddo
+         
+         if (end(3,c) .gt. 0) then
+            k = cell_size(3,c)-3
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+     >                    ( u(m,i,j,k-2,c) - 4.0d0*u(m,i,j,k-1,c) + 
+     >                    6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) )
+                  enddo
+               enddo
+            enddo
+
+            k = cell_size(3,c)-2
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+     >                    ( u(m,i,j,k-2,c) - 4.d0*u(m,i,j,k-1,c) +
+     >                    5.d0*u(m,i,j,k,c) )
+                  enddo
+               enddo
+            enddo
+         endif
+
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) * dt
+                  enddo
+               enddo
+            enddo
+         enddo
+
+      enddo
+      
+      return
+      end
+
+
+
+
diff --git a/examples/smpi/NAS/BT/set_constants.f b/examples/smpi/NAS/BT/set_constants.f
new file mode 100644
index 0000000000..81397d4bcf
--- /dev/null
+++ b/examples/smpi/NAS/BT/set_constants.f
@@ -0,0 +1,202 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine  set_constants
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      
+      ce(1,1)  = 2.0d0
+      ce(1,2)  = 0.0d0
+      ce(1,3)  = 0.0d0
+      ce(1,4)  = 4.0d0
+      ce(1,5)  = 5.0d0
+      ce(1,6)  = 3.0d0
+      ce(1,7)  = 0.5d0
+      ce(1,8)  = 0.02d0
+      ce(1,9)  = 0.01d0
+      ce(1,10) = 0.03d0
+      ce(1,11) = 0.5d0
+      ce(1,12) = 0.4d0
+      ce(1,13) = 0.3d0
+      
+      ce(2,1)  = 1.0d0
+      ce(2,2)  = 0.0d0
+      ce(2,3)  = 0.0d0
+      ce(2,4)  = 0.0d0
+      ce(2,5)  = 1.0d0
+      ce(2,6)  = 2.0d0
+      ce(2,7)  = 3.0d0
+      ce(2,8)  = 0.01d0
+      ce(2,9)  = 0.03d0
+      ce(2,10) = 0.02d0
+      ce(2,11) = 0.4d0
+      ce(2,12) = 0.3d0
+      ce(2,13) = 0.5d0
+
+      ce(3,1)  = 2.0d0
+      ce(3,2)  = 2.0d0
+      ce(3,3)  = 0.0d0
+      ce(3,4)  = 0.0d0
+      ce(3,5)  = 0.0d0
+      ce(3,6)  = 2.0d0
+      ce(3,7)  = 3.0d0
+      ce(3,8)  = 0.04d0
+      ce(3,9)  = 0.03d0
+      ce(3,10) = 0.05d0
+      ce(3,11) = 0.3d0
+      ce(3,12) = 0.5d0
+      ce(3,13) = 0.4d0
+
+      ce(4,1)  = 2.0d0
+      ce(4,2)  = 2.0d0
+      ce(4,3)  = 0.0d0
+      ce(4,4)  = 0.0d0
+      ce(4,5)  = 0.0d0
+      ce(4,6)  = 2.0d0
+      ce(4,7)  = 3.0d0
+      ce(4,8)  = 0.03d0
+      ce(4,9)  = 0.05d0
+      ce(4,10) = 0.04d0
+      ce(4,11) = 0.2d0
+      ce(4,12) = 0.1d0
+      ce(4,13) = 0.3d0
+
+      ce(5,1)  = 5.0d0
+      ce(5,2)  = 4.0d0
+      ce(5,3)  = 3.0d0
+      ce(5,4)  = 2.0d0
+      ce(5,5)  = 0.1d0
+      ce(5,6)  = 0.4d0
+      ce(5,7)  = 0.3d0
+      ce(5,8)  = 0.05d0
+      ce(5,9)  = 0.04d0
+      ce(5,10) = 0.03d0
+      ce(5,11) = 0.1d0
+      ce(5,12) = 0.3d0
+      ce(5,13) = 0.2d0
+
+      c1 = 1.4d0
+      c2 = 0.4d0
+      c3 = 0.1d0
+      c4 = 1.0d0
+      c5 = 1.4d0
+
+      bt = dsqrt(0.5d0)
+
+      dnxm1 = 1.0d0 / dble(grid_points(1)-1)
+      dnym1 = 1.0d0 / dble(grid_points(2)-1)
+      dnzm1 = 1.0d0 / dble(grid_points(3)-1)
+
+      c1c2 = c1 * c2
+      c1c5 = c1 * c5
+      c3c4 = c3 * c4
+      c1345 = c1c5 * c3c4
+
+      conz1 = (1.0d0-c1c5)
+
+      tx1 = 1.0d0 / (dnxm1 * dnxm1)
+      tx2 = 1.0d0 / (2.0d0 * dnxm1)
+      tx3 = 1.0d0 / dnxm1
+
+      ty1 = 1.0d0 / (dnym1 * dnym1)
+      ty2 = 1.0d0 / (2.0d0 * dnym1)
+      ty3 = 1.0d0 / dnym1
+      
+      tz1 = 1.0d0 / (dnzm1 * dnzm1)
+      tz2 = 1.0d0 / (2.0d0 * dnzm1)
+      tz3 = 1.0d0 / dnzm1
+
+      dx1 = 0.75d0
+      dx2 = 0.75d0
+      dx3 = 0.75d0
+      dx4 = 0.75d0
+      dx5 = 0.75d0
+
+      dy1 = 0.75d0
+      dy2 = 0.75d0
+      dy3 = 0.75d0
+      dy4 = 0.75d0
+      dy5 = 0.75d0
+
+      dz1 = 1.0d0
+      dz2 = 1.0d0
+      dz3 = 1.0d0
+      dz4 = 1.0d0
+      dz5 = 1.0d0
+
+      dxmax = dmax1(dx3, dx4)
+      dymax = dmax1(dy2, dy4)
+      dzmax = dmax1(dz2, dz3)
+
+      dssp = 0.25d0 * dmax1(dx1, dmax1(dy1, dz1) )
+
+      c4dssp = 4.0d0 * dssp
+      c5dssp = 5.0d0 * dssp
+
+      dttx1 = dt*tx1
+      dttx2 = dt*tx2
+      dtty1 = dt*ty1
+      dtty2 = dt*ty2
+      dttz1 = dt*tz1
+      dttz2 = dt*tz2
+
+      c2dttx1 = 2.0d0*dttx1
+      c2dtty1 = 2.0d0*dtty1
+      c2dttz1 = 2.0d0*dttz1
+
+      dtdssp = dt*dssp
+
+      comz1  = dtdssp
+      comz4  = 4.0d0*dtdssp
+      comz5  = 5.0d0*dtdssp
+      comz6  = 6.0d0*dtdssp
+
+      c3c4tx3 = c3c4*tx3
+      c3c4ty3 = c3c4*ty3
+      c3c4tz3 = c3c4*tz3
+
+      dx1tx1 = dx1*tx1
+      dx2tx1 = dx2*tx1
+      dx3tx1 = dx3*tx1
+      dx4tx1 = dx4*tx1
+      dx5tx1 = dx5*tx1
+      
+      dy1ty1 = dy1*ty1
+      dy2ty1 = dy2*ty1
+      dy3ty1 = dy3*ty1
+      dy4ty1 = dy4*ty1
+      dy5ty1 = dy5*ty1
+      
+      dz1tz1 = dz1*tz1
+      dz2tz1 = dz2*tz1
+      dz3tz1 = dz3*tz1
+      dz4tz1 = dz4*tz1
+      dz5tz1 = dz5*tz1
+
+      c2iv  = 2.5d0
+      con43 = 4.0d0/3.0d0
+      con16 = 1.0d0/6.0d0
+      
+      xxcon1 = c3c4tx3*con43*tx3
+      xxcon2 = c3c4tx3*tx3
+      xxcon3 = c3c4tx3*conz1*tx3
+      xxcon4 = c3c4tx3*con16*tx3
+      xxcon5 = c3c4tx3*c1c5*tx3
+
+      yycon1 = c3c4ty3*con43*ty3
+      yycon2 = c3c4ty3*ty3
+      yycon3 = c3c4ty3*conz1*ty3
+      yycon4 = c3c4ty3*con16*ty3
+      yycon5 = c3c4ty3*c1c5*ty3
+
+      zzcon1 = c3c4tz3*con43*tz3
+      zzcon2 = c3c4tz3*tz3
+      zzcon3 = c3c4tz3*conz1*tz3
+      zzcon4 = c3c4tz3*con16*tz3
+      zzcon5 = c3c4tz3*c1c5*tz3
+
+      return
+      end
diff --git a/examples/smpi/NAS/BT/setup_mpi.f b/examples/smpi/NAS/BT/setup_mpi.f
new file mode 100644
index 0000000000..987c6bfba4
--- /dev/null
+++ b/examples/smpi/NAS/BT/setup_mpi.f
@@ -0,0 +1,64 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine setup_mpi
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c set up MPI stuff
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'mpinpb.h'
+      include 'npbparams.h'
+      integer error, color, nc
+
+      call mpi_init(error)
+      
+      call mpi_comm_size(MPI_COMM_WORLD, total_nodes, error)
+      call mpi_comm_rank(MPI_COMM_WORLD, node, error)
+
+      if (.not. convertdouble) then
+         dp_type = MPI_DOUBLE_PRECISION
+      else
+         dp_type = MPI_REAL
+      endif
+
+c---------------------------------------------------------------------
+c     compute square root; add small number to allow for roundoff
+c---------------------------------------------------------------------
+      nc = dint(dsqrt(dble(total_nodes) + 0.00001d0))
+
+c---------------------------------------------------------------------
+c We handle a non-square number of nodes by making the excess nodes
+c inactive. However, we can never handle more cells than were compiled
+c in. 
+c---------------------------------------------------------------------
+
+      if (nc .gt. maxcells) nc = maxcells
+      if (node .ge. nc*nc) then
+         active = .false.
+         color = 1
+      else
+         active = .true.
+         color = 0
+      end if
+      
+      call mpi_comm_split(MPI_COMM_WORLD,color,node,comm_setup,error)
+      if (.not. active) return
+
+      call mpi_comm_size(comm_setup, no_nodes, error)
+      call mpi_comm_dup(comm_setup, comm_solve, error)
+      call mpi_comm_dup(comm_setup, comm_rhs, error)
+      
+c---------------------------------------------------------------------
+c     let node 0 be the root for the group (there is only one)
+c---------------------------------------------------------------------
+      root = 0
+
+      return
+      end
+
diff --git a/examples/smpi/NAS/BT/simple_mpiio.f b/examples/smpi/NAS/BT/simple_mpiio.f
new file mode 100644
index 0000000000..02e2700177
--- /dev/null
+++ b/examples/smpi/NAS/BT/simple_mpiio.f
@@ -0,0 +1,213 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine setup_btio
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer m, ierr
+
+      iseek=0
+
+      if (node .eq. root) then
+          call MPI_File_delete(filenm, MPI_INFO_NULL, ierr)
+      endif
+
+      call MPI_Barrier(comm_solve, ierr)
+
+      call MPI_File_open(comm_solve,
+     $          filenm,
+     $          MPI_MODE_RDWR + MPI_MODE_CREATE,
+     $          MPI_INFO_NULL,
+     $          fp,
+     $          ierr)
+
+      call MPI_File_set_view(fp,
+     $          iseek, MPI_DOUBLE_PRECISION, MPI_DOUBLE_PRECISION,
+     $          'native', MPI_INFO_NULL, ierr)
+
+      if (ierr .ne. MPI_SUCCESS) then
+          print *, 'Error opening file'
+          stop
+      endif
+
+      do m = 1, 5
+         xce_sub(m) = 0.d0
+      end do
+
+      idump_sub = 0
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine output_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer count, jio, kio, cio, aio
+      integer ierr
+      integer mstatus(MPI_STATUS_SIZE)
+
+      do cio=1,ncells
+          do kio=0, cell_size(3,cio)-1
+              do jio=0, cell_size(2,cio)-1
+                  iseek=5*(cell_low(1,cio) +
+     $                   PROBLEM_SIZE*((cell_low(2,cio)+jio) +
+     $                   PROBLEM_SIZE*((cell_low(3,cio)+kio) +
+     $                   PROBLEM_SIZE*idump_sub)))
+
+                  count=5*cell_size(1,cio)
+
+                  call MPI_File_write_at(fp, iseek,
+     $                  u(1,0,jio,kio,cio),
+     $                  count, MPI_DOUBLE_PRECISION,
+     $                  mstatus, ierr)
+
+                  if (ierr .ne. MPI_SUCCESS) then
+                      print *, 'Error writing to file'
+                      stop
+                  endif
+              enddo
+          enddo
+      enddo
+
+      idump_sub = idump_sub + 1
+      if (rd_interval .gt. 0) then
+         if (idump_sub .ge. rd_interval) then
+
+            call acc_sub_norms(idump+1)
+
+            idump_sub = 0
+         endif
+      endif
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine acc_sub_norms(idump_cur)
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer idump_cur
+
+      integer count, jio, kio, cio, ii, m, ichunk
+      integer ierr
+      integer mstatus(MPI_STATUS_SIZE)
+      double precision xce_single(5)
+
+      ichunk = idump_cur - idump_sub + 1
+      do ii=0, idump_sub-1
+        do cio=1,ncells
+          do kio=0, cell_size(3,cio)-1
+              do jio=0, cell_size(2,cio)-1
+                  iseek=5*(cell_low(1,cio) +
+     $                   PROBLEM_SIZE*((cell_low(2,cio)+jio) +
+     $                   PROBLEM_SIZE*((cell_low(3,cio)+kio) +
+     $                   PROBLEM_SIZE*ii)))
+
+                  count=5*cell_size(1,cio)
+
+                  call MPI_File_read_at(fp, iseek,
+     $                  u(1,0,jio,kio,cio),
+     $                  count, MPI_DOUBLE_PRECISION,
+     $                  mstatus, ierr)
+
+                  if (ierr .ne. MPI_SUCCESS) then
+                      print *, 'Error reading back file'
+                      call MPI_File_close(fp, ierr)
+                      stop
+                  endif
+              enddo
+          enddo
+        enddo
+
+        if (node .eq. root) print *, 'Reading data set ', ii+ichunk
+
+        call error_norm(xce_single)
+        do m = 1, 5
+           xce_sub(m) = xce_sub(m) + xce_single(m)
+        end do
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine btio_cleanup
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer ierr
+
+      call MPI_File_close(fp, ierr)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine accumulate_norms(xce_acc)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      double precision xce_acc(5)
+      integer m, ierr
+
+      if (rd_interval .gt. 0) goto 20
+
+      call MPI_File_open(comm_solve,
+     $          filenm,
+     $          MPI_MODE_RDONLY,
+     $          MPI_INFO_NULL,
+     $          fp,
+     $          ierr)
+
+      iseek = 0
+      call MPI_File_set_view(fp,
+     $          iseek, MPI_DOUBLE_PRECISION, MPI_DOUBLE_PRECISION,
+     $          'native', MPI_INFO_NULL, ierr)
+
+c     clear the last time step
+
+      call clear_timestep
+
+c     read back the time steps and accumulate norms
+
+      call acc_sub_norms(idump)
+
+      call MPI_File_close(fp, ierr)
+
+ 20   continue
+      do m = 1, 5
+         xce_acc(m) = xce_sub(m) / dble(idump)
+      end do
+
+      return
+      end
+
diff --git a/examples/smpi/NAS/BT/solve_subs.f b/examples/smpi/NAS/BT/solve_subs.f
new file mode 100644
index 0000000000..351489a313
--- /dev/null
+++ b/examples/smpi/NAS/BT/solve_subs.f
@@ -0,0 +1,642 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine matvec_sub(ablock,avec,bvec)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     subtracts bvec=bvec - ablock*avec
+c---------------------------------------------------------------------
+
+      implicit none
+
+      double precision ablock,avec,bvec
+      dimension ablock(5,5),avec(5),bvec(5)
+
+c---------------------------------------------------------------------
+c            rhs(i,ic,jc,kc,ccell) = rhs(i,ic,jc,kc,ccell) 
+c     $           - lhs(i,1,ablock,ia,ja,ka,acell)*
+c---------------------------------------------------------------------
+         bvec(1) = bvec(1) - ablock(1,1)*avec(1)
+     >                     - ablock(1,2)*avec(2)
+     >                     - ablock(1,3)*avec(3)
+     >                     - ablock(1,4)*avec(4)
+     >                     - ablock(1,5)*avec(5)
+         bvec(2) = bvec(2) - ablock(2,1)*avec(1)
+     >                     - ablock(2,2)*avec(2)
+     >                     - ablock(2,3)*avec(3)
+     >                     - ablock(2,4)*avec(4)
+     >                     - ablock(2,5)*avec(5)
+         bvec(3) = bvec(3) - ablock(3,1)*avec(1)
+     >                     - ablock(3,2)*avec(2)
+     >                     - ablock(3,3)*avec(3)
+     >                     - ablock(3,4)*avec(4)
+     >                     - ablock(3,5)*avec(5)
+         bvec(4) = bvec(4) - ablock(4,1)*avec(1)
+     >                     - ablock(4,2)*avec(2)
+     >                     - ablock(4,3)*avec(3)
+     >                     - ablock(4,4)*avec(4)
+     >                     - ablock(4,5)*avec(5)
+         bvec(5) = bvec(5) - ablock(5,1)*avec(1)
+     >                     - ablock(5,2)*avec(2)
+     >                     - ablock(5,3)*avec(3)
+     >                     - ablock(5,4)*avec(4)
+     >                     - ablock(5,5)*avec(5)
+
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine matmul_sub(ablock, bblock, cblock)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     subtracts a(i,j,k) X b(i,j,k) from c(i,j,k)
+c---------------------------------------------------------------------
+
+      implicit none
+
+      double precision ablock, bblock, cblock
+      dimension ablock(5,5), bblock(5,5), cblock(5,5)
+
+
+         cblock(1,1) = cblock(1,1) - ablock(1,1)*bblock(1,1)
+     >                             - ablock(1,2)*bblock(2,1)
+     >                             - ablock(1,3)*bblock(3,1)
+     >                             - ablock(1,4)*bblock(4,1)
+     >                             - ablock(1,5)*bblock(5,1)
+         cblock(2,1) = cblock(2,1) - ablock(2,1)*bblock(1,1)
+     >                             - ablock(2,2)*bblock(2,1)
+     >                             - ablock(2,3)*bblock(3,1)
+     >                             - ablock(2,4)*bblock(4,1)
+     >                             - ablock(2,5)*bblock(5,1)
+         cblock(3,1) = cblock(3,1) - ablock(3,1)*bblock(1,1)
+     >                             - ablock(3,2)*bblock(2,1)
+     >                             - ablock(3,3)*bblock(3,1)
+     >                             - ablock(3,4)*bblock(4,1)
+     >                             - ablock(3,5)*bblock(5,1)
+         cblock(4,1) = cblock(4,1) - ablock(4,1)*bblock(1,1)
+     >                             - ablock(4,2)*bblock(2,1)
+     >                             - ablock(4,3)*bblock(3,1)
+     >                             - ablock(4,4)*bblock(4,1)
+     >                             - ablock(4,5)*bblock(5,1)
+         cblock(5,1) = cblock(5,1) - ablock(5,1)*bblock(1,1)
+     >                             - ablock(5,2)*bblock(2,1)
+     >                             - ablock(5,3)*bblock(3,1)
+     >                             - ablock(5,4)*bblock(4,1)
+     >                             - ablock(5,5)*bblock(5,1)
+         cblock(1,2) = cblock(1,2) - ablock(1,1)*bblock(1,2)
+     >                             - ablock(1,2)*bblock(2,2)
+     >                             - ablock(1,3)*bblock(3,2)
+     >                             - ablock(1,4)*bblock(4,2)
+     >                             - ablock(1,5)*bblock(5,2)
+         cblock(2,2) = cblock(2,2) - ablock(2,1)*bblock(1,2)
+     >                             - ablock(2,2)*bblock(2,2)
+     >                             - ablock(2,3)*bblock(3,2)
+     >                             - ablock(2,4)*bblock(4,2)
+     >                             - ablock(2,5)*bblock(5,2)
+         cblock(3,2) = cblock(3,2) - ablock(3,1)*bblock(1,2)
+     >                             - ablock(3,2)*bblock(2,2)
+     >                             - ablock(3,3)*bblock(3,2)
+     >                             - ablock(3,4)*bblock(4,2)
+     >                             - ablock(3,5)*bblock(5,2)
+         cblock(4,2) = cblock(4,2) - ablock(4,1)*bblock(1,2)
+     >                             - ablock(4,2)*bblock(2,2)
+     >                             - ablock(4,3)*bblock(3,2)
+     >                             - ablock(4,4)*bblock(4,2)
+     >                             - ablock(4,5)*bblock(5,2)
+         cblock(5,2) = cblock(5,2) - ablock(5,1)*bblock(1,2)
+     >                             - ablock(5,2)*bblock(2,2)
+     >                             - ablock(5,3)*bblock(3,2)
+     >                             - ablock(5,4)*bblock(4,2)
+     >                             - ablock(5,5)*bblock(5,2)
+         cblock(1,3) = cblock(1,3) - ablock(1,1)*bblock(1,3)
+     >                             - ablock(1,2)*bblock(2,3)
+     >                             - ablock(1,3)*bblock(3,3)
+     >                             - ablock(1,4)*bblock(4,3)
+     >                             - ablock(1,5)*bblock(5,3)
+         cblock(2,3) = cblock(2,3) - ablock(2,1)*bblock(1,3)
+     >                             - ablock(2,2)*bblock(2,3)
+     >                             - ablock(2,3)*bblock(3,3)
+     >                             - ablock(2,4)*bblock(4,3)
+     >                             - ablock(2,5)*bblock(5,3)
+         cblock(3,3) = cblock(3,3) - ablock(3,1)*bblock(1,3)
+     >                             - ablock(3,2)*bblock(2,3)
+     >                             - ablock(3,3)*bblock(3,3)
+     >                             - ablock(3,4)*bblock(4,3)
+     >                             - ablock(3,5)*bblock(5,3)
+         cblock(4,3) = cblock(4,3) - ablock(4,1)*bblock(1,3)
+     >                             - ablock(4,2)*bblock(2,3)
+     >                             - ablock(4,3)*bblock(3,3)
+     >                             - ablock(4,4)*bblock(4,3)
+     >                             - ablock(4,5)*bblock(5,3)
+         cblock(5,3) = cblock(5,3) - ablock(5,1)*bblock(1,3)
+     >                             - ablock(5,2)*bblock(2,3)
+     >                             - ablock(5,3)*bblock(3,3)
+     >                             - ablock(5,4)*bblock(4,3)
+     >                             - ablock(5,5)*bblock(5,3)
+         cblock(1,4) = cblock(1,4) - ablock(1,1)*bblock(1,4)
+     >                             - ablock(1,2)*bblock(2,4)
+     >                             - ablock(1,3)*bblock(3,4)
+     >                             - ablock(1,4)*bblock(4,4)
+     >                             - ablock(1,5)*bblock(5,4)
+         cblock(2,4) = cblock(2,4) - ablock(2,1)*bblock(1,4)
+     >                             - ablock(2,2)*bblock(2,4)
+     >                             - ablock(2,3)*bblock(3,4)
+     >                             - ablock(2,4)*bblock(4,4)
+     >                             - ablock(2,5)*bblock(5,4)
+         cblock(3,4) = cblock(3,4) - ablock(3,1)*bblock(1,4)
+     >                             - ablock(3,2)*bblock(2,4)
+     >                             - ablock(3,3)*bblock(3,4)
+     >                             - ablock(3,4)*bblock(4,4)
+     >                             - ablock(3,5)*bblock(5,4)
+         cblock(4,4) = cblock(4,4) - ablock(4,1)*bblock(1,4)
+     >                             - ablock(4,2)*bblock(2,4)
+     >                             - ablock(4,3)*bblock(3,4)
+     >                             - ablock(4,4)*bblock(4,4)
+     >                             - ablock(4,5)*bblock(5,4)
+         cblock(5,4) = cblock(5,4) - ablock(5,1)*bblock(1,4)
+     >                             - ablock(5,2)*bblock(2,4)
+     >                             - ablock(5,3)*bblock(3,4)
+     >                             - ablock(5,4)*bblock(4,4)
+     >                             - ablock(5,5)*bblock(5,4)
+         cblock(1,5) = cblock(1,5) - ablock(1,1)*bblock(1,5)
+     >                             - ablock(1,2)*bblock(2,5)
+     >                             - ablock(1,3)*bblock(3,5)
+     >                             - ablock(1,4)*bblock(4,5)
+     >                             - ablock(1,5)*bblock(5,5)
+         cblock(2,5) = cblock(2,5) - ablock(2,1)*bblock(1,5)
+     >                             - ablock(2,2)*bblock(2,5)
+     >                             - ablock(2,3)*bblock(3,5)
+     >                             - ablock(2,4)*bblock(4,5)
+     >                             - ablock(2,5)*bblock(5,5)
+         cblock(3,5) = cblock(3,5) - ablock(3,1)*bblock(1,5)
+     >                             - ablock(3,2)*bblock(2,5)
+     >                             - ablock(3,3)*bblock(3,5)
+     >                             - ablock(3,4)*bblock(4,5)
+     >                             - ablock(3,5)*bblock(5,5)
+         cblock(4,5) = cblock(4,5) - ablock(4,1)*bblock(1,5)
+     >                             - ablock(4,2)*bblock(2,5)
+     >                             - ablock(4,3)*bblock(3,5)
+     >                             - ablock(4,4)*bblock(4,5)
+     >                             - ablock(4,5)*bblock(5,5)
+         cblock(5,5) = cblock(5,5) - ablock(5,1)*bblock(1,5)
+     >                             - ablock(5,2)*bblock(2,5)
+     >                             - ablock(5,3)*bblock(3,5)
+     >                             - ablock(5,4)*bblock(4,5)
+     >                             - ablock(5,5)*bblock(5,5)
+
+              
+      return
+      end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine binvcrhs( lhs,c,r )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     
+c---------------------------------------------------------------------
+
+      implicit none
+
+      double precision pivot, coeff, lhs
+      dimension lhs(5,5)
+      double precision c(5,5), r(5)
+
+c---------------------------------------------------------------------
+c     
+c---------------------------------------------------------------------
+
+      pivot = 1.00d0/lhs(1,1)
+      lhs(1,2) = lhs(1,2)*pivot
+      lhs(1,3) = lhs(1,3)*pivot
+      lhs(1,4) = lhs(1,4)*pivot
+      lhs(1,5) = lhs(1,5)*pivot
+      c(1,1) = c(1,1)*pivot
+      c(1,2) = c(1,2)*pivot
+      c(1,3) = c(1,3)*pivot
+      c(1,4) = c(1,4)*pivot
+      c(1,5) = c(1,5)*pivot
+      r(1)   = r(1)  *pivot
+
+      coeff = lhs(2,1)
+      lhs(2,2)= lhs(2,2) - coeff*lhs(1,2)
+      lhs(2,3)= lhs(2,3) - coeff*lhs(1,3)
+      lhs(2,4)= lhs(2,4) - coeff*lhs(1,4)
+      lhs(2,5)= lhs(2,5) - coeff*lhs(1,5)
+      c(2,1) = c(2,1) - coeff*c(1,1)
+      c(2,2) = c(2,2) - coeff*c(1,2)
+      c(2,3) = c(2,3) - coeff*c(1,3)
+      c(2,4) = c(2,4) - coeff*c(1,4)
+      c(2,5) = c(2,5) - coeff*c(1,5)
+      r(2)   = r(2)   - coeff*r(1)
+
+      coeff = lhs(3,1)
+      lhs(3,2)= lhs(3,2) - coeff*lhs(1,2)
+      lhs(3,3)= lhs(3,3) - coeff*lhs(1,3)
+      lhs(3,4)= lhs(3,4) - coeff*lhs(1,4)
+      lhs(3,5)= lhs(3,5) - coeff*lhs(1,5)
+      c(3,1) = c(3,1) - coeff*c(1,1)
+      c(3,2) = c(3,2) - coeff*c(1,2)
+      c(3,3) = c(3,3) - coeff*c(1,3)
+      c(3,4) = c(3,4) - coeff*c(1,4)
+      c(3,5) = c(3,5) - coeff*c(1,5)
+      r(3)   = r(3)   - coeff*r(1)
+
+      coeff = lhs(4,1)
+      lhs(4,2)= lhs(4,2) - coeff*lhs(1,2)
+      lhs(4,3)= lhs(4,3) - coeff*lhs(1,3)
+      lhs(4,4)= lhs(4,4) - coeff*lhs(1,4)
+      lhs(4,5)= lhs(4,5) - coeff*lhs(1,5)
+      c(4,1) = c(4,1) - coeff*c(1,1)
+      c(4,2) = c(4,2) - coeff*c(1,2)
+      c(4,3) = c(4,3) - coeff*c(1,3)
+      c(4,4) = c(4,4) - coeff*c(1,4)
+      c(4,5) = c(4,5) - coeff*c(1,5)
+      r(4)   = r(4)   - coeff*r(1)
+
+      coeff = lhs(5,1)
+      lhs(5,2)= lhs(5,2) - coeff*lhs(1,2)
+      lhs(5,3)= lhs(5,3) - coeff*lhs(1,3)
+      lhs(5,4)= lhs(5,4) - coeff*lhs(1,4)
+      lhs(5,5)= lhs(5,5) - coeff*lhs(1,5)
+      c(5,1) = c(5,1) - coeff*c(1,1)
+      c(5,2) = c(5,2) - coeff*c(1,2)
+      c(5,3) = c(5,3) - coeff*c(1,3)
+      c(5,4) = c(5,4) - coeff*c(1,4)
+      c(5,5) = c(5,5) - coeff*c(1,5)
+      r(5)   = r(5)   - coeff*r(1)
+
+
+      pivot = 1.00d0/lhs(2,2)
+      lhs(2,3) = lhs(2,3)*pivot
+      lhs(2,4) = lhs(2,4)*pivot
+      lhs(2,5) = lhs(2,5)*pivot
+      c(2,1) = c(2,1)*pivot
+      c(2,2) = c(2,2)*pivot
+      c(2,3) = c(2,3)*pivot
+      c(2,4) = c(2,4)*pivot
+      c(2,5) = c(2,5)*pivot
+      r(2)   = r(2)  *pivot
+
+      coeff = lhs(1,2)
+      lhs(1,3)= lhs(1,3) - coeff*lhs(2,3)
+      lhs(1,4)= lhs(1,4) - coeff*lhs(2,4)
+      lhs(1,5)= lhs(1,5) - coeff*lhs(2,5)
+      c(1,1) = c(1,1) - coeff*c(2,1)
+      c(1,2) = c(1,2) - coeff*c(2,2)
+      c(1,3) = c(1,3) - coeff*c(2,3)
+      c(1,4) = c(1,4) - coeff*c(2,4)
+      c(1,5) = c(1,5) - coeff*c(2,5)
+      r(1)   = r(1)   - coeff*r(2)
+
+      coeff = lhs(3,2)
+      lhs(3,3)= lhs(3,3) - coeff*lhs(2,3)
+      lhs(3,4)= lhs(3,4) - coeff*lhs(2,4)
+      lhs(3,5)= lhs(3,5) - coeff*lhs(2,5)
+      c(3,1) = c(3,1) - coeff*c(2,1)
+      c(3,2) = c(3,2) - coeff*c(2,2)
+      c(3,3) = c(3,3) - coeff*c(2,3)
+      c(3,4) = c(3,4) - coeff*c(2,4)
+      c(3,5) = c(3,5) - coeff*c(2,5)
+      r(3)   = r(3)   - coeff*r(2)
+
+      coeff = lhs(4,2)
+      lhs(4,3)= lhs(4,3) - coeff*lhs(2,3)
+      lhs(4,4)= lhs(4,4) - coeff*lhs(2,4)
+      lhs(4,5)= lhs(4,5) - coeff*lhs(2,5)
+      c(4,1) = c(4,1) - coeff*c(2,1)
+      c(4,2) = c(4,2) - coeff*c(2,2)
+      c(4,3) = c(4,3) - coeff*c(2,3)
+      c(4,4) = c(4,4) - coeff*c(2,4)
+      c(4,5) = c(4,5) - coeff*c(2,5)
+      r(4)   = r(4)   - coeff*r(2)
+
+      coeff = lhs(5,2)
+      lhs(5,3)= lhs(5,3) - coeff*lhs(2,3)
+      lhs(5,4)= lhs(5,4) - coeff*lhs(2,4)
+      lhs(5,5)= lhs(5,5) - coeff*lhs(2,5)
+      c(5,1) = c(5,1) - coeff*c(2,1)
+      c(5,2) = c(5,2) - coeff*c(2,2)
+      c(5,3) = c(5,3) - coeff*c(2,3)
+      c(5,4) = c(5,4) - coeff*c(2,4)
+      c(5,5) = c(5,5) - coeff*c(2,5)
+      r(5)   = r(5)   - coeff*r(2)
+
+
+      pivot = 1.00d0/lhs(3,3)
+      lhs(3,4) = lhs(3,4)*pivot
+      lhs(3,5) = lhs(3,5)*pivot
+      c(3,1) = c(3,1)*pivot
+      c(3,2) = c(3,2)*pivot
+      c(3,3) = c(3,3)*pivot
+      c(3,4) = c(3,4)*pivot
+      c(3,5) = c(3,5)*pivot
+      r(3)   = r(3)  *pivot
+
+      coeff = lhs(1,3)
+      lhs(1,4)= lhs(1,4) - coeff*lhs(3,4)
+      lhs(1,5)= lhs(1,5) - coeff*lhs(3,5)
+      c(1,1) = c(1,1) - coeff*c(3,1)
+      c(1,2) = c(1,2) - coeff*c(3,2)
+      c(1,3) = c(1,3) - coeff*c(3,3)
+      c(1,4) = c(1,4) - coeff*c(3,4)
+      c(1,5) = c(1,5) - coeff*c(3,5)
+      r(1)   = r(1)   - coeff*r(3)
+
+      coeff = lhs(2,3)
+      lhs(2,4)= lhs(2,4) - coeff*lhs(3,4)
+      lhs(2,5)= lhs(2,5) - coeff*lhs(3,5)
+      c(2,1) = c(2,1) - coeff*c(3,1)
+      c(2,2) = c(2,2) - coeff*c(3,2)
+      c(2,3) = c(2,3) - coeff*c(3,3)
+      c(2,4) = c(2,4) - coeff*c(3,4)
+      c(2,5) = c(2,5) - coeff*c(3,5)
+      r(2)   = r(2)   - coeff*r(3)
+
+      coeff = lhs(4,3)
+      lhs(4,4)= lhs(4,4) - coeff*lhs(3,4)
+      lhs(4,5)= lhs(4,5) - coeff*lhs(3,5)
+      c(4,1) = c(4,1) - coeff*c(3,1)
+      c(4,2) = c(4,2) - coeff*c(3,2)
+      c(4,3) = c(4,3) - coeff*c(3,3)
+      c(4,4) = c(4,4) - coeff*c(3,4)
+      c(4,5) = c(4,5) - coeff*c(3,5)
+      r(4)   = r(4)   - coeff*r(3)
+
+      coeff = lhs(5,3)
+      lhs(5,4)= lhs(5,4) - coeff*lhs(3,4)
+      lhs(5,5)= lhs(5,5) - coeff*lhs(3,5)
+      c(5,1) = c(5,1) - coeff*c(3,1)
+      c(5,2) = c(5,2) - coeff*c(3,2)
+      c(5,3) = c(5,3) - coeff*c(3,3)
+      c(5,4) = c(5,4) - coeff*c(3,4)
+      c(5,5) = c(5,5) - coeff*c(3,5)
+      r(5)   = r(5)   - coeff*r(3)
+
+
+      pivot = 1.00d0/lhs(4,4)
+      lhs(4,5) = lhs(4,5)*pivot
+      c(4,1) = c(4,1)*pivot
+      c(4,2) = c(4,2)*pivot
+      c(4,3) = c(4,3)*pivot
+      c(4,4) = c(4,4)*pivot
+      c(4,5) = c(4,5)*pivot
+      r(4)   = r(4)  *pivot
+
+      coeff = lhs(1,4)
+      lhs(1,5)= lhs(1,5) - coeff*lhs(4,5)
+      c(1,1) = c(1,1) - coeff*c(4,1)
+      c(1,2) = c(1,2) - coeff*c(4,2)
+      c(1,3) = c(1,3) - coeff*c(4,3)
+      c(1,4) = c(1,4) - coeff*c(4,4)
+      c(1,5) = c(1,5) - coeff*c(4,5)
+      r(1)   = r(1)   - coeff*r(4)
+
+      coeff = lhs(2,4)
+      lhs(2,5)= lhs(2,5) - coeff*lhs(4,5)
+      c(2,1) = c(2,1) - coeff*c(4,1)
+      c(2,2) = c(2,2) - coeff*c(4,2)
+      c(2,3) = c(2,3) - coeff*c(4,3)
+      c(2,4) = c(2,4) - coeff*c(4,4)
+      c(2,5) = c(2,5) - coeff*c(4,5)
+      r(2)   = r(2)   - coeff*r(4)
+
+      coeff = lhs(3,4)
+      lhs(3,5)= lhs(3,5) - coeff*lhs(4,5)
+      c(3,1) = c(3,1) - coeff*c(4,1)
+      c(3,2) = c(3,2) - coeff*c(4,2)
+      c(3,3) = c(3,3) - coeff*c(4,3)
+      c(3,4) = c(3,4) - coeff*c(4,4)
+      c(3,5) = c(3,5) - coeff*c(4,5)
+      r(3)   = r(3)   - coeff*r(4)
+
+      coeff = lhs(5,4)
+      lhs(5,5)= lhs(5,5) - coeff*lhs(4,5)
+      c(5,1) = c(5,1) - coeff*c(4,1)
+      c(5,2) = c(5,2) - coeff*c(4,2)
+      c(5,3) = c(5,3) - coeff*c(4,3)
+      c(5,4) = c(5,4) - coeff*c(4,4)
+      c(5,5) = c(5,5) - coeff*c(4,5)
+      r(5)   = r(5)   - coeff*r(4)
+
+
+      pivot = 1.00d0/lhs(5,5)
+      c(5,1) = c(5,1)*pivot
+      c(5,2) = c(5,2)*pivot
+      c(5,3) = c(5,3)*pivot
+      c(5,4) = c(5,4)*pivot
+      c(5,5) = c(5,5)*pivot
+      r(5)   = r(5)  *pivot
+
+      coeff = lhs(1,5)
+      c(1,1) = c(1,1) - coeff*c(5,1)
+      c(1,2) = c(1,2) - coeff*c(5,2)
+      c(1,3) = c(1,3) - coeff*c(5,3)
+      c(1,4) = c(1,4) - coeff*c(5,4)
+      c(1,5) = c(1,5) - coeff*c(5,5)
+      r(1)   = r(1)   - coeff*r(5)
+
+      coeff = lhs(2,5)
+      c(2,1) = c(2,1) - coeff*c(5,1)
+      c(2,2) = c(2,2) - coeff*c(5,2)
+      c(2,3) = c(2,3) - coeff*c(5,3)
+      c(2,4) = c(2,4) - coeff*c(5,4)
+      c(2,5) = c(2,5) - coeff*c(5,5)
+      r(2)   = r(2)   - coeff*r(5)
+
+      coeff = lhs(3,5)
+      c(3,1) = c(3,1) - coeff*c(5,1)
+      c(3,2) = c(3,2) - coeff*c(5,2)
+      c(3,3) = c(3,3) - coeff*c(5,3)
+      c(3,4) = c(3,4) - coeff*c(5,4)
+      c(3,5) = c(3,5) - coeff*c(5,5)
+      r(3)   = r(3)   - coeff*r(5)
+
+      coeff = lhs(4,5)
+      c(4,1) = c(4,1) - coeff*c(5,1)
+      c(4,2) = c(4,2) - coeff*c(5,2)
+      c(4,3) = c(4,3) - coeff*c(5,3)
+      c(4,4) = c(4,4) - coeff*c(5,4)
+      c(4,5) = c(4,5) - coeff*c(5,5)
+      r(4)   = r(4)   - coeff*r(5)
+
+
+      return
+      end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine binvrhs( lhs,r )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     
+c---------------------------------------------------------------------
+
+      implicit none
+
+      double precision pivot, coeff, lhs
+      dimension lhs(5,5)
+      double precision r(5)
+
+c---------------------------------------------------------------------
+c     
+c---------------------------------------------------------------------
+
+
+      pivot = 1.00d0/lhs(1,1)
+      lhs(1,2) = lhs(1,2)*pivot
+      lhs(1,3) = lhs(1,3)*pivot
+      lhs(1,4) = lhs(1,4)*pivot
+      lhs(1,5) = lhs(1,5)*pivot
+      r(1)   = r(1)  *pivot
+
+      coeff = lhs(2,1)
+      lhs(2,2)= lhs(2,2) - coeff*lhs(1,2)
+      lhs(2,3)= lhs(2,3) - coeff*lhs(1,3)
+      lhs(2,4)= lhs(2,4) - coeff*lhs(1,4)
+      lhs(2,5)= lhs(2,5) - coeff*lhs(1,5)
+      r(2)   = r(2)   - coeff*r(1)
+
+      coeff = lhs(3,1)
+      lhs(3,2)= lhs(3,2) - coeff*lhs(1,2)
+      lhs(3,3)= lhs(3,3) - coeff*lhs(1,3)
+      lhs(3,4)= lhs(3,4) - coeff*lhs(1,4)
+      lhs(3,5)= lhs(3,5) - coeff*lhs(1,5)
+      r(3)   = r(3)   - coeff*r(1)
+
+      coeff = lhs(4,1)
+      lhs(4,2)= lhs(4,2) - coeff*lhs(1,2)
+      lhs(4,3)= lhs(4,3) - coeff*lhs(1,3)
+      lhs(4,4)= lhs(4,4) - coeff*lhs(1,4)
+      lhs(4,5)= lhs(4,5) - coeff*lhs(1,5)
+      r(4)   = r(4)   - coeff*r(1)
+
+      coeff = lhs(5,1)
+      lhs(5,2)= lhs(5,2) - coeff*lhs(1,2)
+      lhs(5,3)= lhs(5,3) - coeff*lhs(1,3)
+      lhs(5,4)= lhs(5,4) - coeff*lhs(1,4)
+      lhs(5,5)= lhs(5,5) - coeff*lhs(1,5)
+      r(5)   = r(5)   - coeff*r(1)
+
+
+      pivot = 1.00d0/lhs(2,2)
+      lhs(2,3) = lhs(2,3)*pivot
+      lhs(2,4) = lhs(2,4)*pivot
+      lhs(2,5) = lhs(2,5)*pivot
+      r(2)   = r(2)  *pivot
+
+      coeff = lhs(1,2)
+      lhs(1,3)= lhs(1,3) - coeff*lhs(2,3)
+      lhs(1,4)= lhs(1,4) - coeff*lhs(2,4)
+      lhs(1,5)= lhs(1,5) - coeff*lhs(2,5)
+      r(1)   = r(1)   - coeff*r(2)
+
+      coeff = lhs(3,2)
+      lhs(3,3)= lhs(3,3) - coeff*lhs(2,3)
+      lhs(3,4)= lhs(3,4) - coeff*lhs(2,4)
+      lhs(3,5)= lhs(3,5) - coeff*lhs(2,5)
+      r(3)   = r(3)   - coeff*r(2)
+
+      coeff = lhs(4,2)
+      lhs(4,3)= lhs(4,3) - coeff*lhs(2,3)
+      lhs(4,4)= lhs(4,4) - coeff*lhs(2,4)
+      lhs(4,5)= lhs(4,5) - coeff*lhs(2,5)
+      r(4)   = r(4)   - coeff*r(2)
+
+      coeff = lhs(5,2)
+      lhs(5,3)= lhs(5,3) - coeff*lhs(2,3)
+      lhs(5,4)= lhs(5,4) - coeff*lhs(2,4)
+      lhs(5,5)= lhs(5,5) - coeff*lhs(2,5)
+      r(5)   = r(5)   - coeff*r(2)
+
+
+      pivot = 1.00d0/lhs(3,3)
+      lhs(3,4) = lhs(3,4)*pivot
+      lhs(3,5) = lhs(3,5)*pivot
+      r(3)   = r(3)  *pivot
+
+      coeff = lhs(1,3)
+      lhs(1,4)= lhs(1,4) - coeff*lhs(3,4)
+      lhs(1,5)= lhs(1,5) - coeff*lhs(3,5)
+      r(1)   = r(1)   - coeff*r(3)
+
+      coeff = lhs(2,3)
+      lhs(2,4)= lhs(2,4) - coeff*lhs(3,4)
+      lhs(2,5)= lhs(2,5) - coeff*lhs(3,5)
+      r(2)   = r(2)   - coeff*r(3)
+
+      coeff = lhs(4,3)
+      lhs(4,4)= lhs(4,4) - coeff*lhs(3,4)
+      lhs(4,5)= lhs(4,5) - coeff*lhs(3,5)
+      r(4)   = r(4)   - coeff*r(3)
+
+      coeff = lhs(5,3)
+      lhs(5,4)= lhs(5,4) - coeff*lhs(3,4)
+      lhs(5,5)= lhs(5,5) - coeff*lhs(3,5)
+      r(5)   = r(5)   - coeff*r(3)
+
+
+      pivot = 1.00d0/lhs(4,4)
+      lhs(4,5) = lhs(4,5)*pivot
+      r(4)   = r(4)  *pivot
+
+      coeff = lhs(1,4)
+      lhs(1,5)= lhs(1,5) - coeff*lhs(4,5)
+      r(1)   = r(1)   - coeff*r(4)
+
+      coeff = lhs(2,4)
+      lhs(2,5)= lhs(2,5) - coeff*lhs(4,5)
+      r(2)   = r(2)   - coeff*r(4)
+
+      coeff = lhs(3,4)
+      lhs(3,5)= lhs(3,5) - coeff*lhs(4,5)
+      r(3)   = r(3)   - coeff*r(4)
+
+      coeff = lhs(5,4)
+      lhs(5,5)= lhs(5,5) - coeff*lhs(4,5)
+      r(5)   = r(5)   - coeff*r(4)
+
+
+      pivot = 1.00d0/lhs(5,5)
+      r(5)   = r(5)  *pivot
+
+      coeff = lhs(1,5)
+      r(1)   = r(1)   - coeff*r(5)
+
+      coeff = lhs(2,5)
+      r(2)   = r(2)   - coeff*r(5)
+
+      coeff = lhs(3,5)
+      r(3)   = r(3)   - coeff*r(5)
+
+      coeff = lhs(4,5)
+      r(4)   = r(4)   - coeff*r(5)
+
+
+      return
+      end
+
+
+
diff --git a/examples/smpi/NAS/BT/verify.f b/examples/smpi/NAS/BT/verify.f
new file mode 100644
index 0000000000..7dbc8a96a2
--- /dev/null
+++ b/examples/smpi/NAS/BT/verify.f
@@ -0,0 +1,435 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+        subroutine verify(no_time_steps, class, verified)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c  verification routine                         
+c---------------------------------------------------------------------
+
+        include 'header.h'
+        include 'mpinpb.h'
+
+        double precision xcrref(5),xceref(5),xcrdif(5),xcedif(5), 
+     >                   epsilon, xce(5), xcr(5), dtref
+        integer m, no_time_steps
+        character class
+        logical verified
+
+c---------------------------------------------------------------------
+c   tolerance level
+c---------------------------------------------------------------------
+        epsilon = 1.0d-08
+        verified = .true.
+
+c---------------------------------------------------------------------
+c   compute the error norm and the residual norm, and exit if not printing
+c---------------------------------------------------------------------
+
+        if (iotype .ne. 0) then
+           call accumulate_norms(xce)
+        else
+           call error_norm(xce)
+        endif
+
+        call copy_faces
+
+        call rhs_norm(xcr)
+
+        do m = 1, 5
+           xcr(m) = xcr(m) / dt
+        enddo
+
+        if (node .ne. 0) return
+
+        class = 'U'
+
+        do m = 1,5
+           xcrref(m) = 1.0
+           xceref(m) = 1.0
+        end do
+
+c---------------------------------------------------------------------
+c    reference data for 12X12X12 grids after 60 time steps, with DT = 1.0d-02
+c---------------------------------------------------------------------
+        if ( (grid_points(1)  .eq. 12     ) .and. 
+     >       (grid_points(2)  .eq. 12     ) .and.
+     >       (grid_points(3)  .eq. 12     ) .and.
+     >       (no_time_steps   .eq. 60    ))  then
+
+           class = 'S'
+           dtref = 1.0d-2
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+         xcrref(1) = 1.7034283709541311d-01
+         xcrref(2) = 1.2975252070034097d-02
+         xcrref(3) = 3.2527926989486055d-02
+         xcrref(4) = 2.6436421275166801d-02
+         xcrref(5) = 1.9211784131744430d-01
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+         if (iotype .eq. 0) then
+           xceref(1) = 4.9976913345811579d-04
+           xceref(2) = 4.5195666782961927d-05
+           xceref(3) = 7.3973765172921357d-05
+           xceref(4) = 7.3821238632439731d-05
+           xceref(5) = 8.9269630987491446d-04
+         else
+           xceref(1) = 0.1149036328945d+02
+           xceref(2) = 0.9156788904727d+00
+           xceref(3) = 0.2857899428614d+01
+           xceref(4) = 0.2598273346734d+01
+           xceref(5) = 0.2652795397547d+02
+         endif
+
+c---------------------------------------------------------------------
+c    reference data for 24X24X24 grids after 200 time steps, with DT = 0.8d-3
+c---------------------------------------------------------------------
+        elseif ( (grid_points(1) .eq. 24) .and. 
+     >           (grid_points(2) .eq. 24) .and.
+     >           (grid_points(3) .eq. 24) .and.
+     >           (no_time_steps . eq. 200) ) then
+
+           class = 'W'
+           dtref = 0.8d-3
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+           xcrref(1) = 0.1125590409344d+03
+           xcrref(2) = 0.1180007595731d+02
+           xcrref(3) = 0.2710329767846d+02
+           xcrref(4) = 0.2469174937669d+02
+           xcrref(5) = 0.2638427874317d+03
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+         if (iotype .eq. 0) then
+           xceref(1) = 0.4419655736008d+01
+           xceref(2) = 0.4638531260002d+00
+           xceref(3) = 0.1011551749967d+01
+           xceref(4) = 0.9235878729944d+00
+           xceref(5) = 0.1018045837718d+02
+         else
+           xceref(1) = 0.6729594398612d+02
+           xceref(2) = 0.5264523081690d+01
+           xceref(3) = 0.1677107142637d+02
+           xceref(4) = 0.1508721463436d+02
+           xceref(5) = 0.1477018363393d+03
+         endif
+
+
+c---------------------------------------------------------------------
+c    reference data for 64X64X64 grids after 200 time steps, with DT = 0.8d-3
+c---------------------------------------------------------------------
+        elseif ( (grid_points(1) .eq. 64) .and. 
+     >           (grid_points(2) .eq. 64) .and.
+     >           (grid_points(3) .eq. 64) .and.
+     >           (no_time_steps . eq. 200) ) then
+
+           class = 'A'
+           dtref = 0.8d-3
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+         xcrref(1) = 1.0806346714637264d+02
+         xcrref(2) = 1.1319730901220813d+01
+         xcrref(3) = 2.5974354511582465d+01
+         xcrref(4) = 2.3665622544678910d+01
+         xcrref(5) = 2.5278963211748344d+02
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+         if (iotype .eq. 0) then
+           xceref(1) = 4.2348416040525025d+00
+           xceref(2) = 4.4390282496995698d-01
+           xceref(3) = 9.6692480136345650d-01
+           xceref(4) = 8.8302063039765474d-01
+           xceref(5) = 9.7379901770829278d+00
+         else
+           xceref(1) = 0.6482218724961d+02
+           xceref(2) = 0.5066461714527d+01
+           xceref(3) = 0.1613931961359d+02
+           xceref(4) = 0.1452010201481d+02
+           xceref(5) = 0.1420099377681d+03
+         endif
+
+c---------------------------------------------------------------------
+c    reference data for 102X102X102 grids after 200 time steps,
+c    with DT = 3.0d-04
+c---------------------------------------------------------------------
+        elseif ( (grid_points(1) .eq. 102) .and. 
+     >           (grid_points(2) .eq. 102) .and.
+     >           (grid_points(3) .eq. 102) .and.
+     >           (no_time_steps . eq. 200) ) then
+
+           class = 'B'
+           dtref = 3.0d-4
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+         xcrref(1) = 1.4233597229287254d+03
+         xcrref(2) = 9.9330522590150238d+01
+         xcrref(3) = 3.5646025644535285d+02
+         xcrref(4) = 3.2485447959084092d+02
+         xcrref(5) = 3.2707541254659363d+03
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+         if (iotype .eq. 0) then
+           xceref(1) = 5.2969847140936856d+01
+           xceref(2) = 4.4632896115670668d+00
+           xceref(3) = 1.3122573342210174d+01
+           xceref(4) = 1.2006925323559144d+01
+           xceref(5) = 1.2459576151035986d+02
+         else
+           xceref(1) = 0.1477545106464d+03
+           xceref(2) = 0.1108895555053d+02
+           xceref(3) = 0.3698065590331d+02
+           xceref(4) = 0.3310505581440d+02
+           xceref(5) = 0.3157928282563d+03
+         endif
+
+c---------------------------------------------------------------------
+c    reference data for 162X162X162 grids after 200 time steps,
+c    with DT = 1.0d-04
+c---------------------------------------------------------------------
+        elseif ( (grid_points(1) .eq. 162) .and. 
+     >           (grid_points(2) .eq. 162) .and.
+     >           (grid_points(3) .eq. 162) .and.
+     >           (no_time_steps . eq. 200) ) then
+
+           class = 'C'
+           dtref = 1.0d-4
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+         xcrref(1) = 0.62398116551764615d+04
+         xcrref(2) = 0.50793239190423964d+03
+         xcrref(3) = 0.15423530093013596d+04
+         xcrref(4) = 0.13302387929291190d+04
+         xcrref(5) = 0.11604087428436455d+05
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+         if (iotype .eq. 0) then
+           xceref(1) = 0.16462008369091265d+03
+           xceref(2) = 0.11497107903824313d+02
+           xceref(3) = 0.41207446207461508d+02
+           xceref(4) = 0.37087651059694167d+02
+           xceref(5) = 0.36211053051841265d+03
+         else
+           xceref(1) = 0.2597156483475d+03
+           xceref(2) = 0.1985384289495d+02
+           xceref(3) = 0.6517950485788d+02
+           xceref(4) = 0.5757235541520d+02
+           xceref(5) = 0.5215668188726d+03
+         endif 
+
+
+c---------------------------------------------------------------------
+c    reference data for 408x408x408 grids after 250 time steps,
+c    with DT = 0.2d-04
+c---------------------------------------------------------------------
+        elseif ( (grid_points(1) .eq. 408) .and. 
+     >           (grid_points(2) .eq. 408) .and.
+     >           (grid_points(3) .eq. 408) .and.
+     >           (no_time_steps . eq. 250) ) then
+
+           class = 'D'
+           dtref = 0.2d-4
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+         xcrref(1) = 0.2533188551738d+05
+         xcrref(2) = 0.2346393716980d+04
+         xcrref(3) = 0.6294554366904d+04
+         xcrref(4) = 0.5352565376030d+04
+         xcrref(5) = 0.3905864038618d+05
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+         if (iotype .eq. 0) then
+           xceref(1) = 0.3100009377557d+03
+           xceref(2) = 0.2424086324913d+02
+           xceref(3) = 0.7782212022645d+02
+           xceref(4) = 0.6835623860116d+02
+           xceref(5) = 0.6065737200368d+03
+         else
+           xceref(1) = 0.3813781566713d+03
+           xceref(2) = 0.3160872966198d+02
+           xceref(3) = 0.9593576357290d+02
+           xceref(4) = 0.8363391989815d+02
+           xceref(5) = 0.7063466087423d+03
+         endif
+
+
+c---------------------------------------------------------------------
+c    reference data for 1020x1020x1020 grids after 250 time steps,
+c    with DT = 0.4d-05
+c---------------------------------------------------------------------
+        elseif ( (grid_points(1) .eq. 1020) .and. 
+     >           (grid_points(2) .eq. 1020) .and.
+     >           (grid_points(3) .eq. 1020) .and.
+     >           (no_time_steps . eq. 250) ) then
+
+           class = 'E'
+           dtref = 0.4d-5
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+         xcrref(1) = 0.9795372484517d+05
+         xcrref(2) = 0.9739814511521d+04
+         xcrref(3) = 0.2467606342965d+05
+         xcrref(4) = 0.2092419572860d+05
+         xcrref(5) = 0.1392138856939d+06
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+         if (iotype .eq. 0) then
+           xceref(1) = 0.4327562208414d+03
+           xceref(2) = 0.3699051964887d+02
+           xceref(3) = 0.1089845040954d+03
+           xceref(4) = 0.9462517622043d+02
+           xceref(5) = 0.7765512765309d+03
+         else
+c  wr_interval = 5
+           xceref(1) = 0.4729898413058d+03
+           xceref(2) = 0.4145899331704d+02
+           xceref(3) = 0.1192850917138d+03
+           xceref(4) = 0.1032746026932d+03
+           xceref(5) = 0.8270322177634d+03
+c  wr_interval = 10
+c          xceref(1) = 0.4718135916251d+03
+c          xceref(2) = 0.4132620259096d+02
+c          xceref(3) = 0.1189831133503d+03
+c          xceref(4) = 0.1030212798803d+03
+c          xceref(5) = 0.8255924078458d+03
+        endif
+
+        else
+           verified = .false.
+        endif
+
+c---------------------------------------------------------------------
+c    verification test for residuals if gridsize is one of 
+c    the defined grid sizes above (class .ne. 'U')
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c    Compute the difference of solution values and the known reference 
+c    values.
+c---------------------------------------------------------------------
+        do m = 1, 5
+           
+           xcrdif(m) = dabs((xcr(m)-xcrref(m))/xcrref(m)) 
+           xcedif(m) = dabs((xce(m)-xceref(m))/xceref(m))
+           
+        enddo
+
+c---------------------------------------------------------------------
+c    Output the comparison of computed results to known cases.
+c---------------------------------------------------------------------
+
+        if (class .ne. 'U') then
+           write(*, 1990) class
+ 1990      format(' Verification being performed for class ', a)
+           write (*,2000) epsilon
+ 2000      format(' accuracy setting for epsilon = ', E20.13)
+           verified = (dabs(dt-dtref) .le. epsilon)
+           if (.not.verified) then  
+              verified = .false.
+              class = 'U'
+              write (*,1000) dtref
+ 1000         format(' DT does not match the reference value of ', 
+     >                 E15.8)
+           endif
+        else 
+           write(*, 1995)
+ 1995      format(' Unknown class')
+        endif
+
+
+        if (class .ne. 'U') then
+           write (*,2001) 
+        else
+           write (*, 2005)
+        endif
+
+ 2001   format(' Comparison of RMS-norms of residual')
+ 2005   format(' RMS-norms of residual')
+        do m = 1, 5
+           if (class .eq. 'U') then
+              write(*, 2015) m, xcr(m)
+           else if (xcrdif(m) .le. epsilon) then
+              write (*,2011) m,xcr(m),xcrref(m),xcrdif(m)
+           else 
+              verified = .false.
+              write (*,2010) m,xcr(m),xcrref(m),xcrdif(m)
+           endif
+        enddo
+
+        if (class .ne. 'U') then
+           write (*,2002)
+        else
+           write (*,2006)
+        endif
+ 2002   format(' Comparison of RMS-norms of solution error')
+ 2006   format(' RMS-norms of solution error')
+        
+        do m = 1, 5
+           if (class .eq. 'U') then
+              write(*, 2015) m, xce(m)
+           else if (xcedif(m) .le. epsilon) then
+              write (*,2011) m,xce(m),xceref(m),xcedif(m)
+           else
+              verified = .false.
+              write (*,2010) m,xce(m),xceref(m),xcedif(m)
+           endif
+        enddo
+        
+ 2010   format(' FAILURE: ', i2, E20.13, E20.13, E20.13)
+ 2011   format('          ', i2, E20.13, E20.13, E20.13)
+ 2015   format('          ', i2, E20.13)
+        
+        if (class .eq. 'U') then
+           write(*, 2022)
+           write(*, 2023)
+ 2022      format(' No reference values provided')
+ 2023      format(' No verification performed')
+        else if (verified) then
+           write(*, 2020)
+ 2020      format(' Verification Successful')
+        else
+           write(*, 2021)
+ 2021      format(' Verification failed')
+        endif
+
+        return
+
+
+        end
diff --git a/examples/smpi/NAS/BT/work_lhs.h b/examples/smpi/NAS/BT/work_lhs.h
new file mode 100644
index 0000000000..d9bc9e4d66
--- /dev/null
+++ b/examples/smpi/NAS/BT/work_lhs.h
@@ -0,0 +1,14 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+c
+c  work_lhs.h
+c
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      double precision fjac(5, 5, -2:MAX_CELL_DIM+1),
+     >                 njac(5, 5, -2:MAX_CELL_DIM+1),
+     >                 lhsa(5, 5, -1:MAX_CELL_DIM),
+     >                 lhsb(5, 5, -1:MAX_CELL_DIM),
+     >                 tmp1, tmp2, tmp3
+      common /work_lhs/ fjac, njac, lhsa, lhsb, tmp1, tmp2, tmp3
diff --git a/examples/smpi/NAS/BT/work_lhs_vec.h b/examples/smpi/NAS/BT/work_lhs_vec.h
new file mode 100644
index 0000000000..a97054f419
--- /dev/null
+++ b/examples/smpi/NAS/BT/work_lhs_vec.h
@@ -0,0 +1,14 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+c
+c  work_lhs_vec.h
+c
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      double precision fjac(5, 5, -2:MAX_CELL_DIM+1, -2:MAX_CELL_DIM+1),
+     >                 njac(5, 5, -2:MAX_CELL_DIM+1, -2:MAX_CELL_DIM+1),
+     >                 lhsa(5, 5, -1:MAX_CELL_DIM,   -1:MAX_CELL_DIM),
+     >                 lhsb(5, 5, -1:MAX_CELL_DIM,   -1:MAX_CELL_DIM),
+     >                 tmp1, tmp2, tmp3
+      common /work_lhs/ fjac, njac, lhsa, lhsb, tmp1, tmp2, tmp3
diff --git a/examples/smpi/NAS/BT/x_solve.f b/examples/smpi/NAS/BT/x_solve.f
new file mode 100644
index 0000000000..5386732616
--- /dev/null
+++ b/examples/smpi/NAS/BT/x_solve.f
@@ -0,0 +1,761 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     
+c     Performs line solves in X direction by first factoring
+c     the block-tridiagonal matrix into an upper triangular matrix, 
+c     and then performing back substitution to solve for the unknow
+c     vectors of each line.  
+c     
+c     Make sure we treat elements zero to cell_size in the direction
+c     of the sweep.
+c     
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+      integer  c, istart, stage,
+     >     first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
+     >     isize,jsize,ksize,send_id
+
+      istart = 0
+
+c---------------------------------------------------------------------
+c     in our terminology stage is the number of the cell in the x-direction
+c     i.e. stage = 1 means the start of the line stage=ncells means end
+c---------------------------------------------------------------------
+      do stage = 1,ncells
+         c = slice(1,stage)
+         isize = cell_size(1,c) - 1
+         jsize = cell_size(2,c) - 1
+         ksize = cell_size(3,c) - 1
+         
+c---------------------------------------------------------------------
+c     set last-cell flag
+c---------------------------------------------------------------------
+         if (stage .eq. ncells) then
+            last = 1
+         else
+            last = 0
+         endif
+
+         if (stage .eq. 1) then
+c---------------------------------------------------------------------
+c     This is the first cell, so solve without receiving data
+c---------------------------------------------------------------------
+            first = 1
+c            call lhsx(c)
+            call x_solve_cell(first,last,c)
+         else
+c---------------------------------------------------------------------
+c     Not the first cell of this line, so receive info from
+c     processor working on preceeding cell
+c---------------------------------------------------------------------
+            first = 0
+            call x_receive_solve_info(recv_id,c)
+c---------------------------------------------------------------------
+c     overlap computations and communications
+c---------------------------------------------------------------------
+c            call lhsx(c)
+c---------------------------------------------------------------------
+c     wait for completion
+c---------------------------------------------------------------------
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+c---------------------------------------------------------------------
+c     install C'(istart) and rhs'(istart) to be used in this cell
+c---------------------------------------------------------------------
+            call x_unpack_solve_info(c)
+            call x_solve_cell(first,last,c)
+         endif
+
+         if (last .eq. 0) call x_send_solve_info(send_id,c)
+      enddo
+
+c---------------------------------------------------------------------
+c     now perform backsubstitution in reverse direction
+c---------------------------------------------------------------------
+      do stage = ncells, 1, -1
+         c = slice(1,stage)
+         first = 0
+         last = 0
+         if (stage .eq. 1) first = 1
+         if (stage .eq. ncells) then
+            last = 1
+c---------------------------------------------------------------------
+c     last cell, so perform back substitute without waiting
+c---------------------------------------------------------------------
+            call x_backsubstitute(first, last,c)
+         else
+            call x_receive_backsub_info(recv_id,c)
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+            call x_unpack_backsub_info(c)
+            call x_backsubstitute(first,last,c)
+         endif
+         if (first .eq. 0) call x_send_backsub_info(send_id,c)
+      enddo
+
+
+      return
+      end
+      
+      
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_unpack_solve_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack C'(-1) and rhs'(-1) for
+c     all j and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      integer j,k,m,n,ptr,c,istart 
+
+      istart = 0
+      ptr = 0
+      do k=0,KMAX-1
+         do j=0,JMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  lhsc(m,n,istart-1,j,k,c) = out_buffer(ptr+n)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               rhs(n,istart-1,j,k,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine x_send_solve_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send C'(iend) and rhs'(iend) for
+c     all j and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer j,k,m,n,isize,ptr,c,jp,kp
+      integer error,send_id,buffer_size 
+
+      isize = cell_size(1,c)-1
+      jp = cell_coord(2,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+
+c---------------------------------------------------------------------
+c     pack up buffer
+c---------------------------------------------------------------------
+      ptr = 0
+      do k=0,KMAX-1
+         do j=0,JMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  in_buffer(ptr+n) = lhsc(m,n,isize,j,k,c)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,isize,j,k,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+c---------------------------------------------------------------------
+c     send buffer 
+c---------------------------------------------------------------------
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, successor(1),
+     >     WEST+jp+kp*NCELLS, comm_solve,
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_send_backsub_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send U(istart) for all j and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer j,k,n,ptr,c,istart,jp,kp
+      integer error,send_id,buffer_size
+
+c---------------------------------------------------------------------
+c     Send element 0 to previous processor
+c---------------------------------------------------------------------
+      istart = 0
+      jp = cell_coord(2,c)-1
+      kp = cell_coord(3,c)-1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      ptr = 0
+      do k=0,KMAX-1
+         do j=0,JMAX-1
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,istart,j,k,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, predecessor(1), 
+     >     EAST+jp+kp*NCELLS, comm_solve, 
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_unpack_backsub_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack U(isize) for all j and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      integer j,k,n,ptr,c
+
+      ptr = 0
+      do k=0,KMAX-1
+         do j=0,JMAX-1
+            do n=1,BLOCK_SIZE
+               backsub_info(n,j,k,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_receive_backsub_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer error,recv_id,jp,kp,c,buffer_size
+      jp = cell_coord(2,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      call mpi_irecv(out_buffer, buffer_size,
+     >     dp_type, successor(1), 
+     >     EAST+jp+kp*NCELLS, comm_solve, 
+     >     recv_id, error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_receive_solve_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives 
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer jp,kp,recv_id,error,c,buffer_size
+      jp = cell_coord(2,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+      call mpi_irecv(out_buffer, buffer_size,
+     >     dp_type, predecessor(1), 
+     >     WEST+jp+kp*NCELLS,  comm_solve, 
+     >     recv_id, error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine x_backsubstitute(first, last, c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     back solve: if last cell, then generate U(isize)=rhs(isize)
+c     else assume U(isize) is loaded in un pack backsub_info
+c     so just use it
+c     after call u(istart) will be sent to next cell
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer first, last, c, i, j, k
+      integer m,n,isize,jsize,ksize,istart
+      
+      istart = 0
+      isize = cell_size(1,c)-1
+      jsize = cell_size(2,c)-end(2,c)-1      
+      ksize = cell_size(3,c)-end(3,c)-1
+      if (last .eq. 0) then
+         do k=start(3,c),ksize
+            do j=start(2,c),jsize
+c---------------------------------------------------------------------
+c     U(isize) uses info from previous cell if not last cell
+c---------------------------------------------------------------------
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) 
+     >                    - lhsc(m,n,isize,j,k,c)*
+     >                    backsub_info(n,j,k,c)
+c---------------------------------------------------------------------
+c     rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) 
+c     $                    - lhsc(m,n,isize,j,k,c)*rhs(n,isize+1,j,k,c)
+c---------------------------------------------------------------------
+                  enddo
+               enddo
+            enddo
+         enddo
+      endif
+      do k=start(3,c),ksize
+         do j=start(2,c),jsize
+            do i=isize-1,istart,-1
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) 
+     >                    - lhsc(m,n,i,j,k,c)*rhs(n,i+1,j,k,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+      enddo
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_solve_cell(first,last,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     performs guaussian elimination on this cell.
+c     
+c     assumes that unpacking routines for non-first cells 
+c     preload C' and rhs' from previous cell.
+c     
+c     assumed send happens outside this routine, but that
+c     c'(IMAX) and rhs'(IMAX) will be sent to next cell
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'work_lhs.h'
+
+      integer first,last,c
+      integer i,j,k,isize,ksize,jsize,istart
+
+      istart = 0
+      isize = cell_size(1,c)-1
+      jsize = cell_size(2,c)-end(2,c)-1
+      ksize = cell_size(3,c)-end(3,c)-1
+
+      call lhsabinit(lhsa, lhsb, isize)
+
+      do k=start(3,c),ksize 
+         do j=start(2,c),jsize
+
+c---------------------------------------------------------------------
+c     This function computes the left hand side in the xi-direction
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     determine a (labeled f) and n jacobians for cell c
+c---------------------------------------------------------------------
+            do i = start(1,c)-1, cell_size(1,c) - end(1,c)
+
+               tmp1 = rho_i(i,j,k,c)
+               tmp2 = tmp1 * tmp1
+               tmp3 = tmp1 * tmp2
+c---------------------------------------------------------------------
+c     
+c---------------------------------------------------------------------
+               fjac(1,1,i) = 0.0d+00
+               fjac(1,2,i) = 1.0d+00
+               fjac(1,3,i) = 0.0d+00
+               fjac(1,4,i) = 0.0d+00
+               fjac(1,5,i) = 0.0d+00
+
+               fjac(2,1,i) = -(u(2,i,j,k,c) * tmp2 * 
+     >              u(2,i,j,k,c))
+     >              + c2 * qs(i,j,k,c)
+               fjac(2,2,i) = ( 2.0d+00 - c2 )
+     >              * ( u(2,i,j,k,c) * tmp1 )
+               fjac(2,3,i) = - c2 * ( u(3,i,j,k,c) * tmp1 )
+               fjac(2,4,i) = - c2 * ( u(4,i,j,k,c) * tmp1 )
+               fjac(2,5,i) = c2
+
+               fjac(3,1,i) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) ) * tmp2
+               fjac(3,2,i) = u(3,i,j,k,c) * tmp1
+               fjac(3,3,i) = u(2,i,j,k,c) * tmp1
+               fjac(3,4,i) = 0.0d+00
+               fjac(3,5,i) = 0.0d+00
+
+               fjac(4,1,i) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) * tmp2
+               fjac(4,2,i) = u(4,i,j,k,c) * tmp1
+               fjac(4,3,i) = 0.0d+00
+               fjac(4,4,i) = u(2,i,j,k,c) * tmp1
+               fjac(4,5,i) = 0.0d+00
+
+               fjac(5,1,i) = ( c2 * 2.0d0 * qs(i,j,k,c)
+     >              - c1 * ( u(5,i,j,k,c) * tmp1 ) )
+     >              * ( u(2,i,j,k,c) * tmp1 )
+               fjac(5,2,i) = c1 *  u(5,i,j,k,c) * tmp1 
+     >              - c2
+     >              * ( u(2,i,j,k,c)*u(2,i,j,k,c) * tmp2
+     >              + qs(i,j,k,c) )
+               fjac(5,3,i) = - c2 * ( u(3,i,j,k,c)*u(2,i,j,k,c) )
+     >              * tmp2
+               fjac(5,4,i) = - c2 * ( u(4,i,j,k,c)*u(2,i,j,k,c) )
+     >              * tmp2
+               fjac(5,5,i) = c1 * ( u(2,i,j,k,c) * tmp1 )
+
+               njac(1,1,i) = 0.0d+00
+               njac(1,2,i) = 0.0d+00
+               njac(1,3,i) = 0.0d+00
+               njac(1,4,i) = 0.0d+00
+               njac(1,5,i) = 0.0d+00
+
+               njac(2,1,i) = - con43 * c3c4 * tmp2 * u(2,i,j,k,c)
+               njac(2,2,i) =   con43 * c3c4 * tmp1
+               njac(2,3,i) =   0.0d+00
+               njac(2,4,i) =   0.0d+00
+               njac(2,5,i) =   0.0d+00
+
+               njac(3,1,i) = - c3c4 * tmp2 * u(3,i,j,k,c)
+               njac(3,2,i) =   0.0d+00
+               njac(3,3,i) =   c3c4 * tmp1
+               njac(3,4,i) =   0.0d+00
+               njac(3,5,i) =   0.0d+00
+
+               njac(4,1,i) = - c3c4 * tmp2 * u(4,i,j,k,c)
+               njac(4,2,i) =   0.0d+00 
+               njac(4,3,i) =   0.0d+00
+               njac(4,4,i) =   c3c4 * tmp1
+               njac(4,5,i) =   0.0d+00
+
+               njac(5,1,i) = - ( con43 * c3c4
+     >              - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
+     >              - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
+     >              - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
+     >              - c1345 * tmp2 * u(5,i,j,k,c)
+
+               njac(5,2,i) = ( con43 * c3c4
+     >              - c1345 ) * tmp2 * u(2,i,j,k,c)
+               njac(5,3,i) = ( c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c)
+               njac(5,4,i) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c)
+               njac(5,5,i) = ( c1345 ) * tmp1
+
+            enddo
+c---------------------------------------------------------------------
+c     now jacobians set, so form left hand side in x direction
+c---------------------------------------------------------------------
+            do i = start(1,c), isize - end(1,c)
+
+               tmp1 = dt * tx1
+               tmp2 = dt * tx2
+
+               lhsa(1,1,i) = - tmp2 * fjac(1,1,i-1)
+     >              - tmp1 * njac(1,1,i-1)
+     >              - tmp1 * dx1 
+               lhsa(1,2,i) = - tmp2 * fjac(1,2,i-1)
+     >              - tmp1 * njac(1,2,i-1)
+               lhsa(1,3,i) = - tmp2 * fjac(1,3,i-1)
+     >              - tmp1 * njac(1,3,i-1)
+               lhsa(1,4,i) = - tmp2 * fjac(1,4,i-1)
+     >              - tmp1 * njac(1,4,i-1)
+               lhsa(1,5,i) = - tmp2 * fjac(1,5,i-1)
+     >              - tmp1 * njac(1,5,i-1)
+
+               lhsa(2,1,i) = - tmp2 * fjac(2,1,i-1)
+     >              - tmp1 * njac(2,1,i-1)
+               lhsa(2,2,i) = - tmp2 * fjac(2,2,i-1)
+     >              - tmp1 * njac(2,2,i-1)
+     >              - tmp1 * dx2
+               lhsa(2,3,i) = - tmp2 * fjac(2,3,i-1)
+     >              - tmp1 * njac(2,3,i-1)
+               lhsa(2,4,i) = - tmp2 * fjac(2,4,i-1)
+     >              - tmp1 * njac(2,4,i-1)
+               lhsa(2,5,i) = - tmp2 * fjac(2,5,i-1)
+     >              - tmp1 * njac(2,5,i-1)
+
+               lhsa(3,1,i) = - tmp2 * fjac(3,1,i-1)
+     >              - tmp1 * njac(3,1,i-1)
+               lhsa(3,2,i) = - tmp2 * fjac(3,2,i-1)
+     >              - tmp1 * njac(3,2,i-1)
+               lhsa(3,3,i) = - tmp2 * fjac(3,3,i-1)
+     >              - tmp1 * njac(3,3,i-1)
+     >              - tmp1 * dx3 
+               lhsa(3,4,i) = - tmp2 * fjac(3,4,i-1)
+     >              - tmp1 * njac(3,4,i-1)
+               lhsa(3,5,i) = - tmp2 * fjac(3,5,i-1)
+     >              - tmp1 * njac(3,5,i-1)
+
+               lhsa(4,1,i) = - tmp2 * fjac(4,1,i-1)
+     >              - tmp1 * njac(4,1,i-1)
+               lhsa(4,2,i) = - tmp2 * fjac(4,2,i-1)
+     >              - tmp1 * njac(4,2,i-1)
+               lhsa(4,3,i) = - tmp2 * fjac(4,3,i-1)
+     >              - tmp1 * njac(4,3,i-1)
+               lhsa(4,4,i) = - tmp2 * fjac(4,4,i-1)
+     >              - tmp1 * njac(4,4,i-1)
+     >              - tmp1 * dx4
+               lhsa(4,5,i) = - tmp2 * fjac(4,5,i-1)
+     >              - tmp1 * njac(4,5,i-1)
+
+               lhsa(5,1,i) = - tmp2 * fjac(5,1,i-1)
+     >              - tmp1 * njac(5,1,i-1)
+               lhsa(5,2,i) = - tmp2 * fjac(5,2,i-1)
+     >              - tmp1 * njac(5,2,i-1)
+               lhsa(5,3,i) = - tmp2 * fjac(5,3,i-1)
+     >              - tmp1 * njac(5,3,i-1)
+               lhsa(5,4,i) = - tmp2 * fjac(5,4,i-1)
+     >              - tmp1 * njac(5,4,i-1)
+               lhsa(5,5,i) = - tmp2 * fjac(5,5,i-1)
+     >              - tmp1 * njac(5,5,i-1)
+     >              - tmp1 * dx5
+
+               lhsb(1,1,i) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(1,1,i)
+     >              + tmp1 * 2.0d+00 * dx1
+               lhsb(1,2,i) = tmp1 * 2.0d+00 * njac(1,2,i)
+               lhsb(1,3,i) = tmp1 * 2.0d+00 * njac(1,3,i)
+               lhsb(1,4,i) = tmp1 * 2.0d+00 * njac(1,4,i)
+               lhsb(1,5,i) = tmp1 * 2.0d+00 * njac(1,5,i)
+
+               lhsb(2,1,i) = tmp1 * 2.0d+00 * njac(2,1,i)
+               lhsb(2,2,i) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(2,2,i)
+     >              + tmp1 * 2.0d+00 * dx2
+               lhsb(2,3,i) = tmp1 * 2.0d+00 * njac(2,3,i)
+               lhsb(2,4,i) = tmp1 * 2.0d+00 * njac(2,4,i)
+               lhsb(2,5,i) = tmp1 * 2.0d+00 * njac(2,5,i)
+
+               lhsb(3,1,i) = tmp1 * 2.0d+00 * njac(3,1,i)
+               lhsb(3,2,i) = tmp1 * 2.0d+00 * njac(3,2,i)
+               lhsb(3,3,i) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(3,3,i)
+     >              + tmp1 * 2.0d+00 * dx3
+               lhsb(3,4,i) = tmp1 * 2.0d+00 * njac(3,4,i)
+               lhsb(3,5,i) = tmp1 * 2.0d+00 * njac(3,5,i)
+
+               lhsb(4,1,i) = tmp1 * 2.0d+00 * njac(4,1,i)
+               lhsb(4,2,i) = tmp1 * 2.0d+00 * njac(4,2,i)
+               lhsb(4,3,i) = tmp1 * 2.0d+00 * njac(4,3,i)
+               lhsb(4,4,i) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(4,4,i)
+     >              + tmp1 * 2.0d+00 * dx4
+               lhsb(4,5,i) = tmp1 * 2.0d+00 * njac(4,5,i)
+
+               lhsb(5,1,i) = tmp1 * 2.0d+00 * njac(5,1,i)
+               lhsb(5,2,i) = tmp1 * 2.0d+00 * njac(5,2,i)
+               lhsb(5,3,i) = tmp1 * 2.0d+00 * njac(5,3,i)
+               lhsb(5,4,i) = tmp1 * 2.0d+00 * njac(5,4,i)
+               lhsb(5,5,i) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(5,5,i)
+     >              + tmp1 * 2.0d+00 * dx5
+
+               lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,i+1)
+     >              - tmp1 * njac(1,1,i+1)
+     >              - tmp1 * dx1
+               lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,i+1)
+     >              - tmp1 * njac(1,2,i+1)
+               lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,i+1)
+     >              - tmp1 * njac(1,3,i+1)
+               lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,i+1)
+     >              - tmp1 * njac(1,4,i+1)
+               lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,i+1)
+     >              - tmp1 * njac(1,5,i+1)
+
+               lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,i+1)
+     >              - tmp1 * njac(2,1,i+1)
+               lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,i+1)
+     >              - tmp1 * njac(2,2,i+1)
+     >              - tmp1 * dx2
+               lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,i+1)
+     >              - tmp1 * njac(2,3,i+1)
+               lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,i+1)
+     >              - tmp1 * njac(2,4,i+1)
+               lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,i+1)
+     >              - tmp1 * njac(2,5,i+1)
+
+               lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,i+1)
+     >              - tmp1 * njac(3,1,i+1)
+               lhsc(3,2,i,j,k,c) =  tmp2 * fjac(3,2,i+1)
+     >              - tmp1 * njac(3,2,i+1)
+               lhsc(3,3,i,j,k,c) =  tmp2 * fjac(3,3,i+1)
+     >              - tmp1 * njac(3,3,i+1)
+     >              - tmp1 * dx3
+               lhsc(3,4,i,j,k,c) =  tmp2 * fjac(3,4,i+1)
+     >              - tmp1 * njac(3,4,i+1)
+               lhsc(3,5,i,j,k,c) =  tmp2 * fjac(3,5,i+1)
+     >              - tmp1 * njac(3,5,i+1)
+
+               lhsc(4,1,i,j,k,c) =  tmp2 * fjac(4,1,i+1)
+     >              - tmp1 * njac(4,1,i+1)
+               lhsc(4,2,i,j,k,c) =  tmp2 * fjac(4,2,i+1)
+     >              - tmp1 * njac(4,2,i+1)
+               lhsc(4,3,i,j,k,c) =  tmp2 * fjac(4,3,i+1)
+     >              - tmp1 * njac(4,3,i+1)
+               lhsc(4,4,i,j,k,c) =  tmp2 * fjac(4,4,i+1)
+     >              - tmp1 * njac(4,4,i+1)
+     >              - tmp1 * dx4
+               lhsc(4,5,i,j,k,c) =  tmp2 * fjac(4,5,i+1)
+     >              - tmp1 * njac(4,5,i+1)
+
+               lhsc(5,1,i,j,k,c) =  tmp2 * fjac(5,1,i+1)
+     >              - tmp1 * njac(5,1,i+1)
+               lhsc(5,2,i,j,k,c) =  tmp2 * fjac(5,2,i+1)
+     >              - tmp1 * njac(5,2,i+1)
+               lhsc(5,3,i,j,k,c) =  tmp2 * fjac(5,3,i+1)
+     >              - tmp1 * njac(5,3,i+1)
+               lhsc(5,4,i,j,k,c) =  tmp2 * fjac(5,4,i+1)
+     >              - tmp1 * njac(5,4,i+1)
+               lhsc(5,5,i,j,k,c) =  tmp2 * fjac(5,5,i+1)
+     >              - tmp1 * njac(5,5,i+1)
+     >              - tmp1 * dx5
+
+            enddo
+
+
+c---------------------------------------------------------------------
+c     outer most do loops - sweeping in i direction
+c---------------------------------------------------------------------
+            if (first .eq. 1) then 
+
+c---------------------------------------------------------------------
+c     multiply c(istart,j,k) by b_inverse and copy back to c
+c     multiply rhs(istart) by b_inverse(istart) and copy to rhs
+c---------------------------------------------------------------------
+               call binvcrhs( lhsb(1,1,istart),
+     >                        lhsc(1,1,istart,j,k,c),
+     >                        rhs(1,istart,j,k,c) )
+
+            endif
+
+c---------------------------------------------------------------------
+c     begin inner most do loop
+c     do all the elements of the cell unless last 
+c---------------------------------------------------------------------
+            do i=istart+first,isize-last
+
+c---------------------------------------------------------------------
+c     rhs(i) = rhs(i) - A*rhs(i-1)
+c---------------------------------------------------------------------
+               call matvec_sub(lhsa(1,1,i),
+     >                         rhs(1,i-1,j,k,c),rhs(1,i,j,k,c))
+
+c---------------------------------------------------------------------
+c     B(i) = B(i) - C(i-1)*A(i)
+c---------------------------------------------------------------------
+               call matmul_sub(lhsa(1,1,i),
+     >                         lhsc(1,1,i-1,j,k,c),
+     >                         lhsb(1,1,i))
+
+
+c---------------------------------------------------------------------
+c     multiply c(i,j,k) by b_inverse and copy back to c
+c     multiply rhs(1,j,k) by b_inverse(1,j,k) and copy to rhs
+c---------------------------------------------------------------------
+               call binvcrhs( lhsb(1,1,i),
+     >                        lhsc(1,1,i,j,k,c),
+     >                        rhs(1,i,j,k,c) )
+
+            enddo
+
+c---------------------------------------------------------------------
+c     Now finish up special cases for last cell
+c---------------------------------------------------------------------
+            if (last .eq. 1) then
+
+c---------------------------------------------------------------------
+c     rhs(isize) = rhs(isize) - A*rhs(isize-1)
+c---------------------------------------------------------------------
+               call matvec_sub(lhsa(1,1,isize),
+     >                         rhs(1,isize-1,j,k,c),rhs(1,isize,j,k,c))
+
+c---------------------------------------------------------------------
+c     B(isize) = B(isize) - C(isize-1)*A(isize)
+c---------------------------------------------------------------------
+               call matmul_sub(lhsa(1,1,isize),
+     >                         lhsc(1,1,isize-1,j,k,c),
+     >                         lhsb(1,1,isize))
+
+c---------------------------------------------------------------------
+c     multiply rhs() by b_inverse() and copy to rhs
+c---------------------------------------------------------------------
+               call binvrhs( lhsb(1,1,isize),
+     >                       rhs(1,isize,j,k,c) )
+
+            endif
+         enddo
+      enddo
+
+
+      return
+      end
+      
diff --git a/examples/smpi/NAS/BT/x_solve_vec.f b/examples/smpi/NAS/BT/x_solve_vec.f
new file mode 100644
index 0000000000..8f1c1371db
--- /dev/null
+++ b/examples/smpi/NAS/BT/x_solve_vec.f
@@ -0,0 +1,789 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     
+c     Performs line solves in X direction by first factoring
+c     the block-tridiagonal matrix into an upper triangular matrix, 
+c     and then performing back substitution to solve for the unknow
+c     vectors of each line.  
+c     
+c     Make sure we treat elements zero to cell_size in the direction
+c     of the sweep.
+c     
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+      integer  c, istart, stage,
+     >     first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
+     >     isize,jsize,ksize,send_id
+
+      istart = 0
+
+c---------------------------------------------------------------------
+c     in our terminology stage is the number of the cell in the x-direct
+c     i.e. stage = 1 means the start of the line stage=ncells means end
+c---------------------------------------------------------------------
+      do stage = 1,ncells
+         c = slice(1,stage)
+         isize = cell_size(1,c) - 1
+         jsize = cell_size(2,c) - 1
+         ksize = cell_size(3,c) - 1
+         
+c---------------------------------------------------------------------
+c     set last-cell flag
+c---------------------------------------------------------------------
+         if (stage .eq. ncells) then
+            last = 1
+         else
+            last = 0
+         endif
+
+         if (stage .eq. 1) then
+c---------------------------------------------------------------------
+c     This is the first cell, so solve without receiving data
+c---------------------------------------------------------------------
+            first = 1
+c            call lhsx(c)
+            call x_solve_cell(first,last,c)
+         else
+c---------------------------------------------------------------------
+c     Not the first cell of this line, so receive info from
+c     processor working on preceeding cell
+c---------------------------------------------------------------------
+            first = 0
+            call x_receive_solve_info(recv_id,c)
+c---------------------------------------------------------------------
+c     overlap computations and communications
+c---------------------------------------------------------------------
+c            call lhsx(c)
+c---------------------------------------------------------------------
+c     wait for completion
+c---------------------------------------------------------------------
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+c---------------------------------------------------------------------
+c     install C'(istart) and rhs'(istart) to be used in this cell
+c---------------------------------------------------------------------
+            call x_unpack_solve_info(c)
+            call x_solve_cell(first,last,c)
+         endif
+
+         if (last .eq. 0) call x_send_solve_info(send_id,c)
+      enddo
+
+c---------------------------------------------------------------------
+c     now perform backsubstitution in reverse direction
+c---------------------------------------------------------------------
+      do stage = ncells, 1, -1
+         c = slice(1,stage)
+         first = 0
+         last = 0
+         if (stage .eq. 1) first = 1
+         if (stage .eq. ncells) then
+            last = 1
+c---------------------------------------------------------------------
+c     last cell, so perform back substitute without waiting
+c---------------------------------------------------------------------
+            call x_backsubstitute(first, last,c)
+         else
+            call x_receive_backsub_info(recv_id,c)
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+            call x_unpack_backsub_info(c)
+            call x_backsubstitute(first,last,c)
+         endif
+         if (first .eq. 0) call x_send_backsub_info(send_id,c)
+      enddo
+
+
+      return
+      end
+      
+      
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_unpack_solve_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack C'(-1) and rhs'(-1) for
+c     all j and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      integer j,k,m,n,ptr,c,istart 
+
+      istart = 0
+      ptr = 0
+      do k=0,KMAX-1
+         do j=0,JMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  lhsc(m,n,istart-1,j,k,c) = out_buffer(ptr+n)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               rhs(n,istart-1,j,k,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine x_send_solve_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send C'(iend) and rhs'(iend) for
+c     all j and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer j,k,m,n,isize,ptr,c,jp,kp
+      integer error,send_id,buffer_size 
+
+      isize = cell_size(1,c)-1
+      jp = cell_coord(2,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+
+c---------------------------------------------------------------------
+c     pack up buffer
+c---------------------------------------------------------------------
+      ptr = 0
+      do k=0,KMAX-1
+         do j=0,JMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  in_buffer(ptr+n) = lhsc(m,n,isize,j,k,c)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,isize,j,k,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+c---------------------------------------------------------------------
+c     send buffer 
+c---------------------------------------------------------------------
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, successor(1),
+     >     WEST+jp+kp*NCELLS, comm_solve,
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_send_backsub_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send U(istart) for all j and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer j,k,n,ptr,c,istart,jp,kp
+      integer error,send_id,buffer_size
+
+c---------------------------------------------------------------------
+c     Send element 0 to previous processor
+c---------------------------------------------------------------------
+      istart = 0
+      jp = cell_coord(2,c)-1
+      kp = cell_coord(3,c)-1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      ptr = 0
+      do k=0,KMAX-1
+         do j=0,JMAX-1
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,istart,j,k,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, predecessor(1), 
+     >     EAST+jp+kp*NCELLS, comm_solve, 
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_unpack_backsub_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack U(isize) for all j and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      integer j,k,n,ptr,c
+
+      ptr = 0
+      do k=0,KMAX-1
+         do j=0,JMAX-1
+            do n=1,BLOCK_SIZE
+               backsub_info(n,j,k,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_receive_backsub_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer error,recv_id,jp,kp,c,buffer_size
+      jp = cell_coord(2,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      call mpi_irecv(out_buffer, buffer_size,
+     >     dp_type, successor(1), 
+     >     EAST+jp+kp*NCELLS, comm_solve, 
+     >     recv_id, error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_receive_solve_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives 
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer jp,kp,recv_id,error,c,buffer_size
+      jp = cell_coord(2,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+      call mpi_irecv(out_buffer, buffer_size,
+     >     dp_type, predecessor(1), 
+     >     WEST+jp+kp*NCELLS,  comm_solve, 
+     >     recv_id, error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine x_backsubstitute(first, last, c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     back solve: if last cell, then generate U(isize)=rhs(isize)
+c     else assume U(isize) is loaded in un pack backsub_info
+c     so just use it
+c     after call u(istart) will be sent to next cell
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer first, last, c, i, j, k
+      integer m,n,isize,jsize,ksize,istart
+      
+      istart = 0
+      isize = cell_size(1,c)-1
+      jsize = cell_size(2,c)-end(2,c)-1      
+      ksize = cell_size(3,c)-end(3,c)-1
+      if (last .eq. 0) then
+         do k=start(3,c),ksize
+            do j=start(2,c),jsize
+c---------------------------------------------------------------------
+c     U(isize) uses info from previous cell if not last cell
+c---------------------------------------------------------------------
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) 
+     >                    - lhsc(m,n,isize,j,k,c)*
+     >                    backsub_info(n,j,k,c)
+c---------------------------------------------------------------------
+c     rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) 
+c     $                    - lhsc(m,n,isize,j,k,c)*rhs(n,isize+1,j,k,c)
+c---------------------------------------------------------------------
+                  enddo
+               enddo
+            enddo
+         enddo
+      endif
+      do k=start(3,c),ksize
+         do j=start(2,c),jsize
+            do i=isize-1,istart,-1
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) 
+     >                    - lhsc(m,n,i,j,k,c)*rhs(n,i+1,j,k,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+      enddo
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_solve_cell(first,last,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     performs guaussian elimination on this cell.
+c     
+c     assumes that unpacking routines for non-first cells 
+c     preload C' and rhs' from previous cell.
+c     
+c     assumed send happens outside this routine, but that
+c     c'(IMAX) and rhs'(IMAX) will be sent to next cell
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'work_lhs_vec.h'
+
+      integer first,last,c
+      integer i,j,k,m,n,isize,ksize,jsize,istart
+
+      istart = 0
+      isize = cell_size(1,c)-1
+      jsize = cell_size(2,c)-end(2,c)-1
+      ksize = cell_size(3,c)-end(3,c)-1
+
+c---------------------------------------------------------------------
+c     zero the left hand side for starters
+c     set diagonal values to 1. This is overkill, but convenient
+c---------------------------------------------------------------------
+      do j = 0, jsize
+         do m = 1, 5
+            do n = 1, 5
+               lhsa(m,n,0,j) = 0.0d0
+               lhsb(m,n,0,j) = 0.0d0
+               lhsa(m,n,isize,j) = 0.0d0
+               lhsb(m,n,isize,j) = 0.0d0
+            enddo
+            lhsb(m,m,0,j) = 1.0d0
+            lhsb(m,m,isize,j) = 1.0d0
+         enddo
+      enddo
+
+      do k=start(3,c),ksize 
+
+c---------------------------------------------------------------------
+c     This function computes the left hand side in the xi-direction
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     determine a (labeled f) and n jacobians for cell c
+c---------------------------------------------------------------------
+         do j=start(2,c),jsize
+            do i = start(1,c)-1, cell_size(1,c) - end(1,c)
+
+               tmp1 = rho_i(i,j,k,c)
+               tmp2 = tmp1 * tmp1
+               tmp3 = tmp1 * tmp2
+c---------------------------------------------------------------------
+c     
+c---------------------------------------------------------------------
+               fjac(1,1,i,j) = 0.0d+00
+               fjac(1,2,i,j) = 1.0d+00
+               fjac(1,3,i,j) = 0.0d+00
+               fjac(1,4,i,j) = 0.0d+00
+               fjac(1,5,i,j) = 0.0d+00
+
+               fjac(2,1,i,j) = -(u(2,i,j,k,c) * tmp2 * 
+     >              u(2,i,j,k,c))
+     >              + c2 * qs(i,j,k,c)
+               fjac(2,2,i,j) = ( 2.0d+00 - c2 )
+     >              * ( u(2,i,j,k,c) * tmp1 )
+               fjac(2,3,i,j) = - c2 * ( u(3,i,j,k,c) * tmp1 )
+               fjac(2,4,i,j) = - c2 * ( u(4,i,j,k,c) * tmp1 )
+               fjac(2,5,i,j) = c2
+
+               fjac(3,1,i,j) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) ) * tmp2
+               fjac(3,2,i,j) = u(3,i,j,k,c) * tmp1
+               fjac(3,3,i,j) = u(2,i,j,k,c) * tmp1
+               fjac(3,4,i,j) = 0.0d+00
+               fjac(3,5,i,j) = 0.0d+00
+
+               fjac(4,1,i,j) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) * tmp2
+               fjac(4,2,i,j) = u(4,i,j,k,c) * tmp1
+               fjac(4,3,i,j) = 0.0d+00
+               fjac(4,4,i,j) = u(2,i,j,k,c) * tmp1
+               fjac(4,5,i,j) = 0.0d+00
+
+               fjac(5,1,i,j) = ( c2 * 2.0d0 * qs(i,j,k,c)
+     >              - c1 * ( u(5,i,j,k,c) * tmp1 ) )
+     >              * ( u(2,i,j,k,c) * tmp1 )
+               fjac(5,2,i,j) = c1 *  u(5,i,j,k,c) * tmp1 
+     >              - c2
+     >              * ( u(2,i,j,k,c)*u(2,i,j,k,c) * tmp2
+     >              + qs(i,j,k,c) )
+               fjac(5,3,i,j) = - c2 * ( u(3,i,j,k,c)*u(2,i,j,k,c) )
+     >              * tmp2
+               fjac(5,4,i,j) = - c2 * ( u(4,i,j,k,c)*u(2,i,j,k,c) )
+     >              * tmp2
+               fjac(5,5,i,j) = c1 * ( u(2,i,j,k,c) * tmp1 )
+
+               njac(1,1,i,j) = 0.0d+00
+               njac(1,2,i,j) = 0.0d+00
+               njac(1,3,i,j) = 0.0d+00
+               njac(1,4,i,j) = 0.0d+00
+               njac(1,5,i,j) = 0.0d+00
+
+               njac(2,1,i,j) = - con43 * c3c4 * tmp2 * u(2,i,j,k,c)
+               njac(2,2,i,j) =   con43 * c3c4 * tmp1
+               njac(2,3,i,j) =   0.0d+00
+               njac(2,4,i,j) =   0.0d+00
+               njac(2,5,i,j) =   0.0d+00
+
+               njac(3,1,i,j) = - c3c4 * tmp2 * u(3,i,j,k,c)
+               njac(3,2,i,j) =   0.0d+00
+               njac(3,3,i,j) =   c3c4 * tmp1
+               njac(3,4,i,j) =   0.0d+00
+               njac(3,5,i,j) =   0.0d+00
+
+               njac(4,1,i,j) = - c3c4 * tmp2 * u(4,i,j,k,c)
+               njac(4,2,i,j) =   0.0d+00 
+               njac(4,3,i,j) =   0.0d+00
+               njac(4,4,i,j) =   c3c4 * tmp1
+               njac(4,5,i,j) =   0.0d+00
+
+               njac(5,1,i,j) = - ( con43 * c3c4
+     >              - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
+     >              - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
+     >              - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
+     >              - c1345 * tmp2 * u(5,i,j,k,c)
+
+               njac(5,2,i,j) = ( con43 * c3c4
+     >              - c1345 ) * tmp2 * u(2,i,j,k,c)
+               njac(5,3,i,j) = ( c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c)
+               njac(5,4,i,j) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c)
+               njac(5,5,i,j) = ( c1345 ) * tmp1
+
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     now jacobians set, so form left hand side in x direction
+c---------------------------------------------------------------------
+         do j=start(2,c),jsize
+            do i = start(1,c), isize - end(1,c)
+
+               tmp1 = dt * tx1
+               tmp2 = dt * tx2
+
+               lhsa(1,1,i,j) = - tmp2 * fjac(1,1,i-1,j)
+     >              - tmp1 * njac(1,1,i-1,j)
+     >              - tmp1 * dx1 
+               lhsa(1,2,i,j) = - tmp2 * fjac(1,2,i-1,j)
+     >              - tmp1 * njac(1,2,i-1,j)
+               lhsa(1,3,i,j) = - tmp2 * fjac(1,3,i-1,j)
+     >              - tmp1 * njac(1,3,i-1,j)
+               lhsa(1,4,i,j) = - tmp2 * fjac(1,4,i-1,j)
+     >              - tmp1 * njac(1,4,i-1,j)
+               lhsa(1,5,i,j) = - tmp2 * fjac(1,5,i-1,j)
+     >              - tmp1 * njac(1,5,i-1,j)
+
+               lhsa(2,1,i,j) = - tmp2 * fjac(2,1,i-1,j)
+     >              - tmp1 * njac(2,1,i-1,j)
+               lhsa(2,2,i,j) = - tmp2 * fjac(2,2,i-1,j)
+     >              - tmp1 * njac(2,2,i-1,j)
+     >              - tmp1 * dx2
+               lhsa(2,3,i,j) = - tmp2 * fjac(2,3,i-1,j)
+     >              - tmp1 * njac(2,3,i-1,j)
+               lhsa(2,4,i,j) = - tmp2 * fjac(2,4,i-1,j)
+     >              - tmp1 * njac(2,4,i-1,j)
+               lhsa(2,5,i,j) = - tmp2 * fjac(2,5,i-1,j)
+     >              - tmp1 * njac(2,5,i-1,j)
+
+               lhsa(3,1,i,j) = - tmp2 * fjac(3,1,i-1,j)
+     >              - tmp1 * njac(3,1,i-1,j)
+               lhsa(3,2,i,j) = - tmp2 * fjac(3,2,i-1,j)
+     >              - tmp1 * njac(3,2,i-1,j)
+               lhsa(3,3,i,j) = - tmp2 * fjac(3,3,i-1,j)
+     >              - tmp1 * njac(3,3,i-1,j)
+     >              - tmp1 * dx3 
+               lhsa(3,4,i,j) = - tmp2 * fjac(3,4,i-1,j)
+     >              - tmp1 * njac(3,4,i-1,j)
+               lhsa(3,5,i,j) = - tmp2 * fjac(3,5,i-1,j)
+     >              - tmp1 * njac(3,5,i-1,j)
+
+               lhsa(4,1,i,j) = - tmp2 * fjac(4,1,i-1,j)
+     >              - tmp1 * njac(4,1,i-1,j)
+               lhsa(4,2,i,j) = - tmp2 * fjac(4,2,i-1,j)
+     >              - tmp1 * njac(4,2,i-1,j)
+               lhsa(4,3,i,j) = - tmp2 * fjac(4,3,i-1,j)
+     >              - tmp1 * njac(4,3,i-1,j)
+               lhsa(4,4,i,j) = - tmp2 * fjac(4,4,i-1,j)
+     >              - tmp1 * njac(4,4,i-1,j)
+     >              - tmp1 * dx4
+               lhsa(4,5,i,j) = - tmp2 * fjac(4,5,i-1,j)
+     >              - tmp1 * njac(4,5,i-1,j)
+
+               lhsa(5,1,i,j) = - tmp2 * fjac(5,1,i-1,j)
+     >              - tmp1 * njac(5,1,i-1,j)
+               lhsa(5,2,i,j) = - tmp2 * fjac(5,2,i-1,j)
+     >              - tmp1 * njac(5,2,i-1,j)
+               lhsa(5,3,i,j) = - tmp2 * fjac(5,3,i-1,j)
+     >              - tmp1 * njac(5,3,i-1,j)
+               lhsa(5,4,i,j) = - tmp2 * fjac(5,4,i-1,j)
+     >              - tmp1 * njac(5,4,i-1,j)
+               lhsa(5,5,i,j) = - tmp2 * fjac(5,5,i-1,j)
+     >              - tmp1 * njac(5,5,i-1,j)
+     >              - tmp1 * dx5
+
+               lhsb(1,1,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(1,1,i,j)
+     >              + tmp1 * 2.0d+00 * dx1
+               lhsb(1,2,i,j) = tmp1 * 2.0d+00 * njac(1,2,i,j)
+               lhsb(1,3,i,j) = tmp1 * 2.0d+00 * njac(1,3,i,j)
+               lhsb(1,4,i,j) = tmp1 * 2.0d+00 * njac(1,4,i,j)
+               lhsb(1,5,i,j) = tmp1 * 2.0d+00 * njac(1,5,i,j)
+
+               lhsb(2,1,i,j) = tmp1 * 2.0d+00 * njac(2,1,i,j)
+               lhsb(2,2,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(2,2,i,j)
+     >              + tmp1 * 2.0d+00 * dx2
+               lhsb(2,3,i,j) = tmp1 * 2.0d+00 * njac(2,3,i,j)
+               lhsb(2,4,i,j) = tmp1 * 2.0d+00 * njac(2,4,i,j)
+               lhsb(2,5,i,j) = tmp1 * 2.0d+00 * njac(2,5,i,j)
+
+               lhsb(3,1,i,j) = tmp1 * 2.0d+00 * njac(3,1,i,j)
+               lhsb(3,2,i,j) = tmp1 * 2.0d+00 * njac(3,2,i,j)
+               lhsb(3,3,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(3,3,i,j)
+     >              + tmp1 * 2.0d+00 * dx3
+               lhsb(3,4,i,j) = tmp1 * 2.0d+00 * njac(3,4,i,j)
+               lhsb(3,5,i,j) = tmp1 * 2.0d+00 * njac(3,5,i,j)
+
+               lhsb(4,1,i,j) = tmp1 * 2.0d+00 * njac(4,1,i,j)
+               lhsb(4,2,i,j) = tmp1 * 2.0d+00 * njac(4,2,i,j)
+               lhsb(4,3,i,j) = tmp1 * 2.0d+00 * njac(4,3,i,j)
+               lhsb(4,4,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(4,4,i,j)
+     >              + tmp1 * 2.0d+00 * dx4
+               lhsb(4,5,i,j) = tmp1 * 2.0d+00 * njac(4,5,i,j)
+
+               lhsb(5,1,i,j) = tmp1 * 2.0d+00 * njac(5,1,i,j)
+               lhsb(5,2,i,j) = tmp1 * 2.0d+00 * njac(5,2,i,j)
+               lhsb(5,3,i,j) = tmp1 * 2.0d+00 * njac(5,3,i,j)
+               lhsb(5,4,i,j) = tmp1 * 2.0d+00 * njac(5,4,i,j)
+               lhsb(5,5,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(5,5,i,j)
+     >              + tmp1 * 2.0d+00 * dx5
+
+               lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,i+1,j)
+     >              - tmp1 * njac(1,1,i+1,j)
+     >              - tmp1 * dx1
+               lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,i+1,j)
+     >              - tmp1 * njac(1,2,i+1,j)
+               lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,i+1,j)
+     >              - tmp1 * njac(1,3,i+1,j)
+               lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,i+1,j)
+     >              - tmp1 * njac(1,4,i+1,j)
+               lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,i+1,j)
+     >              - tmp1 * njac(1,5,i+1,j)
+
+               lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,i+1,j)
+     >              - tmp1 * njac(2,1,i+1,j)
+               lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,i+1,j)
+     >              - tmp1 * njac(2,2,i+1,j)
+     >              - tmp1 * dx2
+               lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,i+1,j)
+     >              - tmp1 * njac(2,3,i+1,j)
+               lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,i+1,j)
+     >              - tmp1 * njac(2,4,i+1,j)
+               lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,i+1,j)
+     >              - tmp1 * njac(2,5,i+1,j)
+
+               lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,i+1,j)
+     >              - tmp1 * njac(3,1,i+1,j)
+               lhsc(3,2,i,j,k,c) =  tmp2 * fjac(3,2,i+1,j)
+     >              - tmp1 * njac(3,2,i+1,j)
+               lhsc(3,3,i,j,k,c) =  tmp2 * fjac(3,3,i+1,j)
+     >              - tmp1 * njac(3,3,i+1,j)
+     >              - tmp1 * dx3
+               lhsc(3,4,i,j,k,c) =  tmp2 * fjac(3,4,i+1,j)
+     >              - tmp1 * njac(3,4,i+1,j)
+               lhsc(3,5,i,j,k,c) =  tmp2 * fjac(3,5,i+1,j)
+     >              - tmp1 * njac(3,5,i+1,j)
+
+               lhsc(4,1,i,j,k,c) =  tmp2 * fjac(4,1,i+1,j)
+     >              - tmp1 * njac(4,1,i+1,j)
+               lhsc(4,2,i,j,k,c) =  tmp2 * fjac(4,2,i+1,j)
+     >              - tmp1 * njac(4,2,i+1,j)
+               lhsc(4,3,i,j,k,c) =  tmp2 * fjac(4,3,i+1,j)
+     >              - tmp1 * njac(4,3,i+1,j)
+               lhsc(4,4,i,j,k,c) =  tmp2 * fjac(4,4,i+1,j)
+     >              - tmp1 * njac(4,4,i+1,j)
+     >              - tmp1 * dx4
+               lhsc(4,5,i,j,k,c) =  tmp2 * fjac(4,5,i+1,j)
+     >              - tmp1 * njac(4,5,i+1,j)
+
+               lhsc(5,1,i,j,k,c) =  tmp2 * fjac(5,1,i+1,j)
+     >              - tmp1 * njac(5,1,i+1,j)
+               lhsc(5,2,i,j,k,c) =  tmp2 * fjac(5,2,i+1,j)
+     >              - tmp1 * njac(5,2,i+1,j)
+               lhsc(5,3,i,j,k,c) =  tmp2 * fjac(5,3,i+1,j)
+     >              - tmp1 * njac(5,3,i+1,j)
+               lhsc(5,4,i,j,k,c) =  tmp2 * fjac(5,4,i+1,j)
+     >              - tmp1 * njac(5,4,i+1,j)
+               lhsc(5,5,i,j,k,c) =  tmp2 * fjac(5,5,i+1,j)
+     >              - tmp1 * njac(5,5,i+1,j)
+     >              - tmp1 * dx5
+
+            enddo
+         enddo
+
+
+c---------------------------------------------------------------------
+c     outer most do loops - sweeping in i direction
+c---------------------------------------------------------------------
+         if (first .eq. 1) then 
+
+c---------------------------------------------------------------------
+c     multiply c(istart,j,k) by b_inverse and copy back to c
+c     multiply rhs(istart) by b_inverse(istart) and copy to rhs
+c---------------------------------------------------------------------
+!dir$ ivdep
+            do j=start(2,c),jsize
+               call binvcrhs( lhsb(1,1,istart,j),
+     >                        lhsc(1,1,istart,j,k,c),
+     >                        rhs(1,istart,j,k,c) )
+            enddo
+
+         endif
+
+c---------------------------------------------------------------------
+c     begin inner most do loop
+c     do all the elements of the cell unless last 
+c---------------------------------------------------------------------
+!dir$ ivdep
+!dir$ interchange(i,j)
+         do j=start(2,c),jsize
+            do i=istart+first,isize-last
+
+c---------------------------------------------------------------------
+c     rhs(i) = rhs(i) - A*rhs(i-1)
+c---------------------------------------------------------------------
+               call matvec_sub(lhsa(1,1,i,j),
+     >                         rhs(1,i-1,j,k,c),rhs(1,i,j,k,c))
+
+c---------------------------------------------------------------------
+c     B(i) = B(i) - C(i-1)*A(i)
+c---------------------------------------------------------------------
+               call matmul_sub(lhsa(1,1,i,j),
+     >                         lhsc(1,1,i-1,j,k,c),
+     >                         lhsb(1,1,i,j))
+
+
+c---------------------------------------------------------------------
+c     multiply c(i,j,k) by b_inverse and copy back to c
+c     multiply rhs(1,j,k) by b_inverse(1,j,k) and copy to rhs
+c---------------------------------------------------------------------
+               call binvcrhs( lhsb(1,1,i,j),
+     >                        lhsc(1,1,i,j,k,c),
+     >                        rhs(1,i,j,k,c) )
+
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     Now finish up special cases for last cell
+c---------------------------------------------------------------------
+         if (last .eq. 1) then
+
+!dir$ ivdep
+            do j=start(2,c),jsize
+c---------------------------------------------------------------------
+c     rhs(isize) = rhs(isize) - A*rhs(isize-1)
+c---------------------------------------------------------------------
+               call matvec_sub(lhsa(1,1,isize,j),
+     >                         rhs(1,isize-1,j,k,c),rhs(1,isize,j,k,c))
+
+c---------------------------------------------------------------------
+c     B(isize) = B(isize) - C(isize-1)*A(isize)
+c---------------------------------------------------------------------
+               call matmul_sub(lhsa(1,1,isize,j),
+     >                         lhsc(1,1,isize-1,j,k,c),
+     >                         lhsb(1,1,isize,j))
+
+c---------------------------------------------------------------------
+c     multiply rhs() by b_inverse() and copy to rhs
+c---------------------------------------------------------------------
+               call binvrhs( lhsb(1,1,isize,j),
+     >                       rhs(1,isize,j,k,c) )
+            enddo
+
+         endif
+      enddo
+
+
+      return
+      end
+      
diff --git a/examples/smpi/NAS/BT/y_solve.f b/examples/smpi/NAS/BT/y_solve.f
new file mode 100644
index 0000000000..33e2ebc018
--- /dev/null
+++ b/examples/smpi/NAS/BT/y_solve.f
@@ -0,0 +1,771 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     Performs line solves in Y direction by first factoring
+c     the block-tridiagonal matrix into an upper triangular matrix, 
+c     and then performing back substitution to solve for the unknow
+c     vectors of each line.  
+c     
+c     Make sure we treat elements zero to cell_size in the direction
+c     of the sweep.
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer 
+     >     c, jstart, stage,
+     >     first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
+     >     isize,jsize,ksize,send_id
+
+      jstart = 0
+
+c---------------------------------------------------------------------
+c     in our terminology stage is the number of the cell in the y-direction
+c     i.e. stage = 1 means the start of the line stage=ncells means end
+c---------------------------------------------------------------------
+      do stage = 1,ncells
+         c = slice(2,stage)
+         isize = cell_size(1,c) - 1
+         jsize = cell_size(2,c) - 1
+         ksize = cell_size(3,c) - 1
+
+c---------------------------------------------------------------------
+c     set last-cell flag
+c---------------------------------------------------------------------
+         if (stage .eq. ncells) then
+            last = 1
+         else
+            last = 0
+         endif
+
+         if (stage .eq. 1) then
+c---------------------------------------------------------------------
+c     This is the first cell, so solve without receiving data
+c---------------------------------------------------------------------
+            first = 1
+c            call lhsy(c)
+            call y_solve_cell(first,last,c)
+         else
+c---------------------------------------------------------------------
+c     Not the first cell of this line, so receive info from
+c     processor working on preceeding cell
+c---------------------------------------------------------------------
+            first = 0
+            call y_receive_solve_info(recv_id,c)
+c---------------------------------------------------------------------
+c     overlap computations and communications
+c---------------------------------------------------------------------
+c            call lhsy(c)
+c---------------------------------------------------------------------
+c     wait for completion
+c---------------------------------------------------------------------
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+c---------------------------------------------------------------------
+c     install C'(jstart+1) and rhs'(jstart+1) to be used in this cell
+c---------------------------------------------------------------------
+            call y_unpack_solve_info(c)
+            call y_solve_cell(first,last,c)
+         endif
+
+         if (last .eq. 0) call y_send_solve_info(send_id,c)
+      enddo
+
+c---------------------------------------------------------------------
+c     now perform backsubstitution in reverse direction
+c---------------------------------------------------------------------
+      do stage = ncells, 1, -1
+         c = slice(2,stage)
+         first = 0
+         last = 0
+         if (stage .eq. 1) first = 1
+         if (stage .eq. ncells) then
+            last = 1
+c---------------------------------------------------------------------
+c     last cell, so perform back substitute without waiting
+c---------------------------------------------------------------------
+            call y_backsubstitute(first, last,c)
+         else
+            call y_receive_backsub_info(recv_id,c)
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+            call y_unpack_backsub_info(c)
+            call y_backsubstitute(first,last,c)
+         endif
+         if (first .eq. 0) call y_send_backsub_info(send_id,c)
+      enddo
+
+
+      return
+      end
+      
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine y_unpack_solve_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack C'(-1) and rhs'(-1) for
+c     all i and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer i,k,m,n,ptr,c,jstart 
+
+      jstart = 0
+      ptr = 0
+      do k=0,KMAX-1
+         do i=0,IMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  lhsc(m,n,i,jstart-1,k,c) = out_buffer(ptr+n)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               rhs(n,i,jstart-1,k,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine y_send_solve_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send C'(jend) and rhs'(jend) for
+c     all i and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer i,k,m,n,jsize,ptr,c,ip,kp
+      integer error,send_id,buffer_size 
+
+      jsize = cell_size(2,c)-1
+      ip = cell_coord(1,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+
+c---------------------------------------------------------------------
+c     pack up buffer
+c---------------------------------------------------------------------
+      ptr = 0
+      do k=0,KMAX-1
+         do i=0,IMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  in_buffer(ptr+n) = lhsc(m,n,i,jsize,k,c)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,i,jsize,k,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+c---------------------------------------------------------------------
+c     send buffer 
+c---------------------------------------------------------------------
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, successor(2),
+     >     SOUTH+ip+kp*NCELLS, comm_solve,
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_send_backsub_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send U(jstart) for all i and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer i,k,n,ptr,c,jstart,ip,kp
+      integer error,send_id,buffer_size
+
+c---------------------------------------------------------------------
+c     Send element 0 to previous processor
+c---------------------------------------------------------------------
+      jstart = 0
+      ip = cell_coord(1,c)-1
+      kp = cell_coord(3,c)-1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      ptr = 0
+      do k=0,KMAX-1
+         do i=0,IMAX-1
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,i,jstart,k,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, predecessor(2), 
+     >     NORTH+ip+kp*NCELLS, comm_solve, 
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_unpack_backsub_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack U(jsize) for all i and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer i,k,n,ptr,c 
+
+      ptr = 0
+      do k=0,KMAX-1
+         do i=0,IMAX-1
+            do n=1,BLOCK_SIZE
+               backsub_info(n,i,k,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_receive_backsub_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer error,recv_id,ip,kp,c,buffer_size
+      ip = cell_coord(1,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      call mpi_irecv(out_buffer, buffer_size,
+     >     dp_type, successor(2), 
+     >     NORTH+ip+kp*NCELLS, comm_solve, 
+     >     recv_id, error)
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_receive_solve_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives 
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer ip,kp,recv_id,error,c,buffer_size
+      ip = cell_coord(1,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+      call mpi_irecv(out_buffer, buffer_size, 
+     >     dp_type, predecessor(2), 
+     >     SOUTH+ip+kp*NCELLS,  comm_solve, 
+     >     recv_id, error)
+
+      return
+      end
+      
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_backsubstitute(first, last, c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     back solve: if last cell, then generate U(jsize)=rhs(jsize)
+c     else assume U(jsize) is loaded in un pack backsub_info
+c     so just use it
+c     after call u(jstart) will be sent to next cell
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer first, last, c, i, k
+      integer m,n,j,jsize,isize,ksize,jstart
+      
+      jstart = 0
+      isize = cell_size(1,c)-end(1,c)-1      
+      jsize = cell_size(2,c)-1
+      ksize = cell_size(3,c)-end(3,c)-1
+      if (last .eq. 0) then
+         do k=start(3,c),ksize
+            do i=start(1,c),isize
+c---------------------------------------------------------------------
+c     U(jsize) uses info from previous cell if not last cell
+c---------------------------------------------------------------------
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,i,jsize,k,c) = rhs(m,i,jsize,k,c) 
+     >                    - lhsc(m,n,i,jsize,k,c)*
+     >                    backsub_info(n,i,k,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+      endif
+      do k=start(3,c),ksize
+         do j=jsize-1,jstart,-1
+            do i=start(1,c),isize
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) 
+     >                    - lhsc(m,n,i,j,k,c)*rhs(n,i,j+1,k,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_solve_cell(first,last,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     performs guaussian elimination on this cell.
+c     
+c     assumes that unpacking routines for non-first cells 
+c     preload C' and rhs' from previous cell.
+c     
+c     assumed send happens outside this routine, but that
+c     c'(JMAX) and rhs'(JMAX) will be sent to next cell
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'work_lhs.h'
+
+      integer first,last,c
+      integer i,j,k,isize,ksize,jsize,jstart
+      double precision utmp(6,-2:JMAX+1)
+
+      jstart = 0
+      isize = cell_size(1,c)-end(1,c)-1
+      jsize = cell_size(2,c)-1
+      ksize = cell_size(3,c)-end(3,c)-1
+
+      call lhsabinit(lhsa, lhsb, jsize)
+
+      do k=start(3,c),ksize 
+         do i=start(1,c),isize
+
+c---------------------------------------------------------------------
+c     This function computes the left hand side for the three y-factors   
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     Compute the indices for storing the tri-diagonal matrix;
+c     determine a (labeled f) and n jacobians for cell c
+c---------------------------------------------------------------------
+            do j = start(2,c)-1, cell_size(2,c)-end(2,c)
+               utmp(1,j) = 1.0d0 / u(1,i,j,k,c)
+               utmp(2,j) = u(2,i,j,k,c)
+               utmp(3,j) = u(3,i,j,k,c)
+               utmp(4,j) = u(4,i,j,k,c)
+               utmp(5,j) = u(5,i,j,k,c)
+               utmp(6,j) = qs(i,j,k,c)
+            end do
+
+            do j = start(2,c)-1, cell_size(2,c)-end(2,c)
+
+               tmp1 = utmp(1,j)
+               tmp2 = tmp1 * tmp1
+               tmp3 = tmp1 * tmp2
+
+               fjac(1,1,j) = 0.0d+00
+               fjac(1,2,j) = 0.0d+00
+               fjac(1,3,j) = 1.0d+00
+               fjac(1,4,j) = 0.0d+00
+               fjac(1,5,j) = 0.0d+00
+
+               fjac(2,1,j) = - ( utmp(2,j)*utmp(3,j) )
+     >              * tmp2
+               fjac(2,2,j) = utmp(3,j) * tmp1
+               fjac(2,3,j) = utmp(2,j) * tmp1
+               fjac(2,4,j) = 0.0d+00
+               fjac(2,5,j) = 0.0d+00
+
+               fjac(3,1,j) = - ( utmp(3,j)*utmp(3,j)*tmp2)
+     >              + c2 * utmp(6,j)
+               fjac(3,2,j) = - c2 *  utmp(2,j) * tmp1
+               fjac(3,3,j) = ( 2.0d+00 - c2 )
+     >              *  utmp(3,j) * tmp1 
+               fjac(3,4,j) = - c2 * utmp(4,j) * tmp1 
+               fjac(3,5,j) = c2
+
+               fjac(4,1,j) = - ( utmp(3,j)*utmp(4,j) )
+     >              * tmp2
+               fjac(4,2,j) = 0.0d+00
+               fjac(4,3,j) = utmp(4,j) * tmp1
+               fjac(4,4,j) = utmp(3,j) * tmp1
+               fjac(4,5,j) = 0.0d+00
+
+               fjac(5,1,j) = ( c2 * 2.0d0 * utmp(6,j)
+     >              - c1 * utmp(5,j) * tmp1 ) 
+     >              * utmp(3,j) * tmp1 
+               fjac(5,2,j) = - c2 * utmp(2,j)*utmp(3,j) 
+     >              * tmp2
+               fjac(5,3,j) = c1 * utmp(5,j) * tmp1 
+     >              - c2 * ( utmp(6,j)
+     >              + utmp(3,j)*utmp(3,j) * tmp2 )
+               fjac(5,4,j) = - c2 * ( utmp(3,j)*utmp(4,j) )
+     >              * tmp2
+               fjac(5,5,j) = c1 * utmp(3,j) * tmp1 
+
+               njac(1,1,j) = 0.0d+00
+               njac(1,2,j) = 0.0d+00
+               njac(1,3,j) = 0.0d+00
+               njac(1,4,j) = 0.0d+00
+               njac(1,5,j) = 0.0d+00
+
+               njac(2,1,j) = - c3c4 * tmp2 * utmp(2,j)
+               njac(2,2,j) =   c3c4 * tmp1
+               njac(2,3,j) =   0.0d+00
+               njac(2,4,j) =   0.0d+00
+               njac(2,5,j) =   0.0d+00
+
+               njac(3,1,j) = - con43 * c3c4 * tmp2 * utmp(3,j)
+               njac(3,2,j) =   0.0d+00
+               njac(3,3,j) =   con43 * c3c4 * tmp1
+               njac(3,4,j) =   0.0d+00
+               njac(3,5,j) =   0.0d+00
+
+               njac(4,1,j) = - c3c4 * tmp2 * utmp(4,j)
+               njac(4,2,j) =   0.0d+00
+               njac(4,3,j) =   0.0d+00
+               njac(4,4,j) =   c3c4 * tmp1
+               njac(4,5,j) =   0.0d+00
+
+               njac(5,1,j) = - (  c3c4
+     >              - c1345 ) * tmp3 * (utmp(2,j)**2)
+     >              - ( con43 * c3c4
+     >              - c1345 ) * tmp3 * (utmp(3,j)**2)
+     >              - ( c3c4 - c1345 ) * tmp3 * (utmp(4,j)**2)
+     >              - c1345 * tmp2 * utmp(5,j)
+
+               njac(5,2,j) = (  c3c4 - c1345 ) * tmp2 * utmp(2,j)
+               njac(5,3,j) = ( con43 * c3c4
+     >              - c1345 ) * tmp2 * utmp(3,j)
+               njac(5,4,j) = ( c3c4 - c1345 ) * tmp2 * utmp(4,j)
+               njac(5,5,j) = ( c1345 ) * tmp1
+
+            enddo
+
+c---------------------------------------------------------------------
+c     now joacobians set, so form left hand side in y direction
+c---------------------------------------------------------------------
+            do j = start(2,c), jsize-end(2,c)
+
+               tmp1 = dt * ty1
+               tmp2 = dt * ty2
+
+               lhsa(1,1,j) = - tmp2 * fjac(1,1,j-1)
+     >              - tmp1 * njac(1,1,j-1)
+     >              - tmp1 * dy1 
+               lhsa(1,2,j) = - tmp2 * fjac(1,2,j-1)
+     >              - tmp1 * njac(1,2,j-1)
+               lhsa(1,3,j) = - tmp2 * fjac(1,3,j-1)
+     >              - tmp1 * njac(1,3,j-1)
+               lhsa(1,4,j) = - tmp2 * fjac(1,4,j-1)
+     >              - tmp1 * njac(1,4,j-1)
+               lhsa(1,5,j) = - tmp2 * fjac(1,5,j-1)
+     >              - tmp1 * njac(1,5,j-1)
+
+               lhsa(2,1,j) = - tmp2 * fjac(2,1,j-1)
+     >              - tmp1 * njac(2,1,j-1)
+               lhsa(2,2,j) = - tmp2 * fjac(2,2,j-1)
+     >              - tmp1 * njac(2,2,j-1)
+     >              - tmp1 * dy2
+               lhsa(2,3,j) = - tmp2 * fjac(2,3,j-1)
+     >              - tmp1 * njac(2,3,j-1)
+               lhsa(2,4,j) = - tmp2 * fjac(2,4,j-1)
+     >              - tmp1 * njac(2,4,j-1)
+               lhsa(2,5,j) = - tmp2 * fjac(2,5,j-1)
+     >              - tmp1 * njac(2,5,j-1)
+
+               lhsa(3,1,j) = - tmp2 * fjac(3,1,j-1)
+     >              - tmp1 * njac(3,1,j-1)
+               lhsa(3,2,j) = - tmp2 * fjac(3,2,j-1)
+     >              - tmp1 * njac(3,2,j-1)
+               lhsa(3,3,j) = - tmp2 * fjac(3,3,j-1)
+     >              - tmp1 * njac(3,3,j-1)
+     >              - tmp1 * dy3 
+               lhsa(3,4,j) = - tmp2 * fjac(3,4,j-1)
+     >              - tmp1 * njac(3,4,j-1)
+               lhsa(3,5,j) = - tmp2 * fjac(3,5,j-1)
+     >              - tmp1 * njac(3,5,j-1)
+
+               lhsa(4,1,j) = - tmp2 * fjac(4,1,j-1)
+     >              - tmp1 * njac(4,1,j-1)
+               lhsa(4,2,j) = - tmp2 * fjac(4,2,j-1)
+     >              - tmp1 * njac(4,2,j-1)
+               lhsa(4,3,j) = - tmp2 * fjac(4,3,j-1)
+     >              - tmp1 * njac(4,3,j-1)
+               lhsa(4,4,j) = - tmp2 * fjac(4,4,j-1)
+     >              - tmp1 * njac(4,4,j-1)
+     >              - tmp1 * dy4
+               lhsa(4,5,j) = - tmp2 * fjac(4,5,j-1)
+     >              - tmp1 * njac(4,5,j-1)
+
+               lhsa(5,1,j) = - tmp2 * fjac(5,1,j-1)
+     >              - tmp1 * njac(5,1,j-1)
+               lhsa(5,2,j) = - tmp2 * fjac(5,2,j-1)
+     >              - tmp1 * njac(5,2,j-1)
+               lhsa(5,3,j) = - tmp2 * fjac(5,3,j-1)
+     >              - tmp1 * njac(5,3,j-1)
+               lhsa(5,4,j) = - tmp2 * fjac(5,4,j-1)
+     >              - tmp1 * njac(5,4,j-1)
+               lhsa(5,5,j) = - tmp2 * fjac(5,5,j-1)
+     >              - tmp1 * njac(5,5,j-1)
+     >              - tmp1 * dy5
+
+               lhsb(1,1,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(1,1,j)
+     >              + tmp1 * 2.0d+00 * dy1
+               lhsb(1,2,j) = tmp1 * 2.0d+00 * njac(1,2,j)
+               lhsb(1,3,j) = tmp1 * 2.0d+00 * njac(1,3,j)
+               lhsb(1,4,j) = tmp1 * 2.0d+00 * njac(1,4,j)
+               lhsb(1,5,j) = tmp1 * 2.0d+00 * njac(1,5,j)
+
+               lhsb(2,1,j) = tmp1 * 2.0d+00 * njac(2,1,j)
+               lhsb(2,2,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(2,2,j)
+     >              + tmp1 * 2.0d+00 * dy2
+               lhsb(2,3,j) = tmp1 * 2.0d+00 * njac(2,3,j)
+               lhsb(2,4,j) = tmp1 * 2.0d+00 * njac(2,4,j)
+               lhsb(2,5,j) = tmp1 * 2.0d+00 * njac(2,5,j)
+
+               lhsb(3,1,j) = tmp1 * 2.0d+00 * njac(3,1,j)
+               lhsb(3,2,j) = tmp1 * 2.0d+00 * njac(3,2,j)
+               lhsb(3,3,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(3,3,j)
+     >              + tmp1 * 2.0d+00 * dy3
+               lhsb(3,4,j) = tmp1 * 2.0d+00 * njac(3,4,j)
+               lhsb(3,5,j) = tmp1 * 2.0d+00 * njac(3,5,j)
+
+               lhsb(4,1,j) = tmp1 * 2.0d+00 * njac(4,1,j)
+               lhsb(4,2,j) = tmp1 * 2.0d+00 * njac(4,2,j)
+               lhsb(4,3,j) = tmp1 * 2.0d+00 * njac(4,3,j)
+               lhsb(4,4,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(4,4,j)
+     >              + tmp1 * 2.0d+00 * dy4
+               lhsb(4,5,j) = tmp1 * 2.0d+00 * njac(4,5,j)
+
+               lhsb(5,1,j) = tmp1 * 2.0d+00 * njac(5,1,j)
+               lhsb(5,2,j) = tmp1 * 2.0d+00 * njac(5,2,j)
+               lhsb(5,3,j) = tmp1 * 2.0d+00 * njac(5,3,j)
+               lhsb(5,4,j) = tmp1 * 2.0d+00 * njac(5,4,j)
+               lhsb(5,5,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(5,5,j) 
+     >              + tmp1 * 2.0d+00 * dy5
+
+               lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,j+1)
+     >              - tmp1 * njac(1,1,j+1)
+     >              - tmp1 * dy1
+               lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,j+1)
+     >              - tmp1 * njac(1,2,j+1)
+               lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,j+1)
+     >              - tmp1 * njac(1,3,j+1)
+               lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,j+1)
+     >              - tmp1 * njac(1,4,j+1)
+               lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,j+1)
+     >              - tmp1 * njac(1,5,j+1)
+
+               lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,j+1)
+     >              - tmp1 * njac(2,1,j+1)
+               lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,j+1)
+     >              - tmp1 * njac(2,2,j+1)
+     >              - tmp1 * dy2
+               lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,j+1)
+     >              - tmp1 * njac(2,3,j+1)
+               lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,j+1)
+     >              - tmp1 * njac(2,4,j+1)
+               lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,j+1)
+     >              - tmp1 * njac(2,5,j+1)
+
+               lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,j+1)
+     >              - tmp1 * njac(3,1,j+1)
+               lhsc(3,2,i,j,k,c) =  tmp2 * fjac(3,2,j+1)
+     >              - tmp1 * njac(3,2,j+1)
+               lhsc(3,3,i,j,k,c) =  tmp2 * fjac(3,3,j+1)
+     >              - tmp1 * njac(3,3,j+1)
+     >              - tmp1 * dy3
+               lhsc(3,4,i,j,k,c) =  tmp2 * fjac(3,4,j+1)
+     >              - tmp1 * njac(3,4,j+1)
+               lhsc(3,5,i,j,k,c) =  tmp2 * fjac(3,5,j+1)
+     >              - tmp1 * njac(3,5,j+1)
+
+               lhsc(4,1,i,j,k,c) =  tmp2 * fjac(4,1,j+1)
+     >              - tmp1 * njac(4,1,j+1)
+               lhsc(4,2,i,j,k,c) =  tmp2 * fjac(4,2,j+1)
+     >              - tmp1 * njac(4,2,j+1)
+               lhsc(4,3,i,j,k,c) =  tmp2 * fjac(4,3,j+1)
+     >              - tmp1 * njac(4,3,j+1)
+               lhsc(4,4,i,j,k,c) =  tmp2 * fjac(4,4,j+1)
+     >              - tmp1 * njac(4,4,j+1)
+     >              - tmp1 * dy4
+               lhsc(4,5,i,j,k,c) =  tmp2 * fjac(4,5,j+1)
+     >              - tmp1 * njac(4,5,j+1)
+
+               lhsc(5,1,i,j,k,c) =  tmp2 * fjac(5,1,j+1)
+     >              - tmp1 * njac(5,1,j+1)
+               lhsc(5,2,i,j,k,c) =  tmp2 * fjac(5,2,j+1)
+     >              - tmp1 * njac(5,2,j+1)
+               lhsc(5,3,i,j,k,c) =  tmp2 * fjac(5,3,j+1)
+     >              - tmp1 * njac(5,3,j+1)
+               lhsc(5,4,i,j,k,c) =  tmp2 * fjac(5,4,j+1)
+     >              - tmp1 * njac(5,4,j+1)
+               lhsc(5,5,i,j,k,c) =  tmp2 * fjac(5,5,j+1)
+     >              - tmp1 * njac(5,5,j+1)
+     >              - tmp1 * dy5
+
+            enddo
+
+
+c---------------------------------------------------------------------
+c     outer most do loops - sweeping in i direction
+c---------------------------------------------------------------------
+            if (first .eq. 1) then 
+
+c---------------------------------------------------------------------
+c     multiply c(i,jstart,k) by b_inverse and copy back to c
+c     multiply rhs(jstart) by b_inverse(jstart) and copy to rhs
+c---------------------------------------------------------------------
+               call binvcrhs( lhsb(1,1,jstart),
+     >                        lhsc(1,1,i,jstart,k,c),
+     >                        rhs(1,i,jstart,k,c) )
+
+            endif
+
+c---------------------------------------------------------------------
+c     begin inner most do loop
+c     do all the elements of the cell unless last 
+c---------------------------------------------------------------------
+            do j=jstart+first,jsize-last
+
+c---------------------------------------------------------------------
+c     subtract A*lhs_vector(j-1) from lhs_vector(j)
+c     
+c     rhs(j) = rhs(j) - A*rhs(j-1)
+c---------------------------------------------------------------------
+               call matvec_sub(lhsa(1,1,j),
+     >                         rhs(1,i,j-1,k,c),rhs(1,i,j,k,c))
+
+c---------------------------------------------------------------------
+c     B(j) = B(j) - C(j-1)*A(j)
+c---------------------------------------------------------------------
+               call matmul_sub(lhsa(1,1,j),
+     >                         lhsc(1,1,i,j-1,k,c),
+     >                         lhsb(1,1,j))
+
+c---------------------------------------------------------------------
+c     multiply c(i,j,k) by b_inverse and copy back to c
+c     multiply rhs(i,1,k) by b_inverse(i,1,k) and copy to rhs
+c---------------------------------------------------------------------
+               call binvcrhs( lhsb(1,1,j),
+     >                        lhsc(1,1,i,j,k,c),
+     >                        rhs(1,i,j,k,c) )
+
+            enddo
+
+c---------------------------------------------------------------------
+c     Now finish up special cases for last cell
+c---------------------------------------------------------------------
+            if (last .eq. 1) then
+
+c---------------------------------------------------------------------
+c     rhs(jsize) = rhs(jsize) - A*rhs(jsize-1)
+c---------------------------------------------------------------------
+               call matvec_sub(lhsa(1,1,jsize),
+     >                         rhs(1,i,jsize-1,k,c),rhs(1,i,jsize,k,c))
+
+c---------------------------------------------------------------------
+c     B(jsize) = B(jsize) - C(jsize-1)*A(jsize)
+c     call matmul_sub(aa,i,jsize,k,c,
+c     $              cc,i,jsize-1,k,c,bb,i,jsize,k,c)
+c---------------------------------------------------------------------
+               call matmul_sub(lhsa(1,1,jsize),
+     >                         lhsc(1,1,i,jsize-1,k,c),
+     >                         lhsb(1,1,jsize))
+
+c---------------------------------------------------------------------
+c     multiply rhs(jsize) by b_inverse(jsize) and copy to rhs
+c---------------------------------------------------------------------
+               call binvrhs( lhsb(1,1,jsize),
+     >                       rhs(1,i,jsize,k,c) )
+
+            endif
+         enddo
+      enddo
+
+
+      return
+      end
+      
+
+
diff --git a/examples/smpi/NAS/BT/y_solve_vec.f b/examples/smpi/NAS/BT/y_solve_vec.f
new file mode 100644
index 0000000000..e21cfa36ce
--- /dev/null
+++ b/examples/smpi/NAS/BT/y_solve_vec.f
@@ -0,0 +1,788 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     Performs line solves in Y direction by first factoring
+c     the block-tridiagonal matrix into an upper triangular matrix, 
+c     and then performing back substitution to solve for the unknow
+c     vectors of each line.  
+c     
+c     Make sure we treat elements zero to cell_size in the direction
+c     of the sweep.
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer 
+     >     c, jstart, stage,
+     >     first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
+     >     isize,jsize,ksize,send_id
+
+      jstart = 0
+
+c---------------------------------------------------------------------
+c     in our terminology stage is the number of the cell in the y-direct
+c     i.e. stage = 1 means the start of the line stage=ncells means end
+c---------------------------------------------------------------------
+      do stage = 1,ncells
+         c = slice(2,stage)
+         isize = cell_size(1,c) - 1
+         jsize = cell_size(2,c) - 1
+         ksize = cell_size(3,c) - 1
+
+c---------------------------------------------------------------------
+c     set last-cell flag
+c---------------------------------------------------------------------
+         if (stage .eq. ncells) then
+            last = 1
+         else
+            last = 0
+         endif
+
+         if (stage .eq. 1) then
+c---------------------------------------------------------------------
+c     This is the first cell, so solve without receiving data
+c---------------------------------------------------------------------
+            first = 1
+c            call lhsy(c)
+            call y_solve_cell(first,last,c)
+         else
+c---------------------------------------------------------------------
+c     Not the first cell of this line, so receive info from
+c     processor working on preceeding cell
+c---------------------------------------------------------------------
+            first = 0
+            call y_receive_solve_info(recv_id,c)
+c---------------------------------------------------------------------
+c     overlap computations and communications
+c---------------------------------------------------------------------
+c            call lhsy(c)
+c---------------------------------------------------------------------
+c     wait for completion
+c---------------------------------------------------------------------
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+c---------------------------------------------------------------------
+c     install C'(jstart+1) and rhs'(jstart+1) to be used in this cell
+c---------------------------------------------------------------------
+            call y_unpack_solve_info(c)
+            call y_solve_cell(first,last,c)
+         endif
+
+         if (last .eq. 0) call y_send_solve_info(send_id,c)
+      enddo
+
+c---------------------------------------------------------------------
+c     now perform backsubstitution in reverse direction
+c---------------------------------------------------------------------
+      do stage = ncells, 1, -1
+         c = slice(2,stage)
+         first = 0
+         last = 0
+         if (stage .eq. 1) first = 1
+         if (stage .eq. ncells) then
+            last = 1
+c---------------------------------------------------------------------
+c     last cell, so perform back substitute without waiting
+c---------------------------------------------------------------------
+            call y_backsubstitute(first, last,c)
+         else
+            call y_receive_backsub_info(recv_id,c)
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+            call y_unpack_backsub_info(c)
+            call y_backsubstitute(first,last,c)
+         endif
+         if (first .eq. 0) call y_send_backsub_info(send_id,c)
+      enddo
+
+
+      return
+      end
+      
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine y_unpack_solve_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack C'(-1) and rhs'(-1) for
+c     all i and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer i,k,m,n,ptr,c,jstart 
+
+      jstart = 0
+      ptr = 0
+      do k=0,KMAX-1
+         do i=0,IMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  lhsc(m,n,i,jstart-1,k,c) = out_buffer(ptr+n)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               rhs(n,i,jstart-1,k,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine y_send_solve_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send C'(jend) and rhs'(jend) for
+c     all i and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer i,k,m,n,jsize,ptr,c,ip,kp
+      integer error,send_id,buffer_size 
+
+      jsize = cell_size(2,c)-1
+      ip = cell_coord(1,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+
+c---------------------------------------------------------------------
+c     pack up buffer
+c---------------------------------------------------------------------
+      ptr = 0
+      do k=0,KMAX-1
+         do i=0,IMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  in_buffer(ptr+n) = lhsc(m,n,i,jsize,k,c)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,i,jsize,k,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+c---------------------------------------------------------------------
+c     send buffer 
+c---------------------------------------------------------------------
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, successor(2),
+     >     SOUTH+ip+kp*NCELLS, comm_solve,
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_send_backsub_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send U(jstart) for all i and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer i,k,n,ptr,c,jstart,ip,kp
+      integer error,send_id,buffer_size
+
+c---------------------------------------------------------------------
+c     Send element 0 to previous processor
+c---------------------------------------------------------------------
+      jstart = 0
+      ip = cell_coord(1,c)-1
+      kp = cell_coord(3,c)-1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      ptr = 0
+      do k=0,KMAX-1
+         do i=0,IMAX-1
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,i,jstart,k,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, predecessor(2), 
+     >     NORTH+ip+kp*NCELLS, comm_solve, 
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_unpack_backsub_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack U(jsize) for all i and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer i,k,n,ptr,c 
+
+      ptr = 0
+      do k=0,KMAX-1
+         do i=0,IMAX-1
+            do n=1,BLOCK_SIZE
+               backsub_info(n,i,k,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_receive_backsub_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer error,recv_id,ip,kp,c,buffer_size
+      ip = cell_coord(1,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      call mpi_irecv(out_buffer, buffer_size,
+     >     dp_type, successor(2), 
+     >     NORTH+ip+kp*NCELLS, comm_solve, 
+     >     recv_id, error)
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_receive_solve_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives 
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer ip,kp,recv_id,error,c,buffer_size
+      ip = cell_coord(1,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+      call mpi_irecv(out_buffer, buffer_size, 
+     >     dp_type, predecessor(2), 
+     >     SOUTH+ip+kp*NCELLS,  comm_solve, 
+     >     recv_id, error)
+
+      return
+      end
+      
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_backsubstitute(first, last, c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     back solve: if last cell, then generate U(jsize)=rhs(jsize)
+c     else assume U(jsize) is loaded in un pack backsub_info
+c     so just use it
+c     after call u(jstart) will be sent to next cell
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer first, last, c, i, k
+      integer m,n,j,jsize,isize,ksize,jstart
+      
+      jstart = 0
+      isize = cell_size(1,c)-end(1,c)-1      
+      jsize = cell_size(2,c)-1
+      ksize = cell_size(3,c)-end(3,c)-1
+      if (last .eq. 0) then
+         do k=start(3,c),ksize
+            do i=start(1,c),isize
+c---------------------------------------------------------------------
+c     U(jsize) uses info from previous cell if not last cell
+c---------------------------------------------------------------------
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,i,jsize,k,c) = rhs(m,i,jsize,k,c) 
+     >                    - lhsc(m,n,i,jsize,k,c)*
+     >                    backsub_info(n,i,k,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+      endif
+      do k=start(3,c),ksize
+         do j=jsize-1,jstart,-1
+            do i=start(1,c),isize
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) 
+     >                    - lhsc(m,n,i,j,k,c)*rhs(n,i,j+1,k,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_solve_cell(first,last,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     performs guaussian elimination on this cell.
+c     
+c     assumes that unpacking routines for non-first cells 
+c     preload C' and rhs' from previous cell.
+c     
+c     assumed send happens outside this routine, but that
+c     c'(JMAX) and rhs'(JMAX) will be sent to next cell
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'work_lhs_vec.h'
+
+      integer first,last,c
+      integer i,j,k,m,n,isize,ksize,jsize,jstart
+
+      jstart = 0
+      isize = cell_size(1,c)-end(1,c)-1
+      jsize = cell_size(2,c)-1
+      ksize = cell_size(3,c)-end(3,c)-1
+
+c---------------------------------------------------------------------
+c     zero the left hand side for starters
+c     set diagonal values to 1. This is overkill, but convenient
+c---------------------------------------------------------------------
+      do i = 0, isize
+         do m = 1, 5
+            do n = 1, 5
+               lhsa(m,n,i,0) = 0.0d0
+               lhsb(m,n,i,0) = 0.0d0
+               lhsa(m,n,i,jsize) = 0.0d0
+               lhsb(m,n,i,jsize) = 0.0d0
+            enddo
+            lhsb(m,m,i,0) = 1.0d0
+            lhsb(m,m,i,jsize) = 1.0d0
+         enddo
+      enddo
+
+      do k=start(3,c),ksize 
+
+c---------------------------------------------------------------------
+c     This function computes the left hand side for the three y-factors 
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     Compute the indices for storing the tri-diagonal matrix;
+c     determine a (labeled f) and n jacobians for cell c
+c---------------------------------------------------------------------
+
+         do j = start(2,c)-1, cell_size(2,c)-end(2,c)
+            do i=start(1,c),isize
+
+               tmp1 = 1.0d0 / u(1,i,j,k,c)
+               tmp2 = tmp1 * tmp1
+               tmp3 = tmp1 * tmp2
+
+               fjac(1,1,i,j) = 0.0d+00
+               fjac(1,2,i,j) = 0.0d+00
+               fjac(1,3,i,j) = 1.0d+00
+               fjac(1,4,i,j) = 0.0d+00
+               fjac(1,5,i,j) = 0.0d+00
+
+               fjac(2,1,i,j) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) )
+     >              * tmp2
+               fjac(2,2,i,j) = u(3,i,j,k,c) * tmp1
+               fjac(2,3,i,j) = u(2,i,j,k,c) * tmp1
+               fjac(2,4,i,j) = 0.0d+00
+               fjac(2,5,i,j) = 0.0d+00
+
+               fjac(3,1,i,j) = - ( u(3,i,j,k,c)*u(3,i,j,k,c)*tmp2)
+     >              + c2 * qs(i,j,k,c)
+               fjac(3,2,i,j) = - c2 *  u(2,i,j,k,c) * tmp1
+               fjac(3,3,i,j) = ( 2.0d+00 - c2 )
+     >              *  u(3,i,j,k,c) * tmp1 
+               fjac(3,4,i,j) = - c2 * u(4,i,j,k,c) * tmp1 
+               fjac(3,5,i,j) = c2
+
+               fjac(4,1,i,j) = - ( u(3,i,j,k,c)*u(4,i,j,k,c) )
+     >              * tmp2
+               fjac(4,2,i,j) = 0.0d+00
+               fjac(4,3,i,j) = u(4,i,j,k,c) * tmp1
+               fjac(4,4,i,j) = u(3,i,j,k,c) * tmp1
+               fjac(4,5,i,j) = 0.0d+00
+
+               fjac(5,1,i,j) = ( c2 * 2.0d0 * qs(i,j,k,c)
+     >              - c1 * u(5,i,j,k,c) * tmp1 ) 
+     >              * u(3,i,j,k,c) * tmp1 
+               fjac(5,2,i,j) = - c2 * u(2,i,j,k,c)*u(3,i,j,k,c) 
+     >              * tmp2
+               fjac(5,3,i,j) = c1 * u(5,i,j,k,c) * tmp1 
+     >              - c2 * ( qs(i,j,k,c)
+     >              + u(3,i,j,k,c)*u(3,i,j,k,c) * tmp2 )
+               fjac(5,4,i,j) = - c2 * ( u(3,i,j,k,c)*u(4,i,j,k,c) )
+     >              * tmp2
+               fjac(5,5,i,j) = c1 * u(3,i,j,k,c) * tmp1 
+
+               njac(1,1,i,j) = 0.0d+00
+               njac(1,2,i,j) = 0.0d+00
+               njac(1,3,i,j) = 0.0d+00
+               njac(1,4,i,j) = 0.0d+00
+               njac(1,5,i,j) = 0.0d+00
+
+               njac(2,1,i,j) = - c3c4 * tmp2 * u(2,i,j,k,c)
+               njac(2,2,i,j) =   c3c4 * tmp1
+               njac(2,3,i,j) =   0.0d+00
+               njac(2,4,i,j) =   0.0d+00
+               njac(2,5,i,j) =   0.0d+00
+
+               njac(3,1,i,j) = - con43 * c3c4 * tmp2 * u(3,i,j,k,c)
+               njac(3,2,i,j) =   0.0d+00
+               njac(3,3,i,j) =   con43 * c3c4 * tmp1
+               njac(3,4,i,j) =   0.0d+00
+               njac(3,5,i,j) =   0.0d+00
+
+               njac(4,1,i,j) = - c3c4 * tmp2 * u(4,i,j,k,c)
+               njac(4,2,i,j) =   0.0d+00
+               njac(4,3,i,j) =   0.0d+00
+               njac(4,4,i,j) =   c3c4 * tmp1
+               njac(4,5,i,j) =   0.0d+00
+
+               njac(5,1,i,j) = - (  c3c4
+     >              - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
+     >              - ( con43 * c3c4
+     >              - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
+     >              - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
+     >              - c1345 * tmp2 * u(5,i,j,k,c)
+
+               njac(5,2,i,j) = (  c3c4 - c1345 ) * tmp2 * u(2,i,j,k,c)
+               njac(5,3,i,j) = ( con43 * c3c4
+     >              - c1345 ) * tmp2 * u(3,i,j,k,c)
+               njac(5,4,i,j) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c)
+               njac(5,5,i,j) = ( c1345 ) * tmp1
+
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     now joacobians set, so form left hand side in y direction
+c---------------------------------------------------------------------
+         do j = start(2,c), jsize-end(2,c)
+            do i=start(1,c),isize
+
+               tmp1 = dt * ty1
+               tmp2 = dt * ty2
+
+               lhsa(1,1,i,j) = - tmp2 * fjac(1,1,i,j-1)
+     >              - tmp1 * njac(1,1,i,j-1)
+     >              - tmp1 * dy1 
+               lhsa(1,2,i,j) = - tmp2 * fjac(1,2,i,j-1)
+     >              - tmp1 * njac(1,2,i,j-1)
+               lhsa(1,3,i,j) = - tmp2 * fjac(1,3,i,j-1)
+     >              - tmp1 * njac(1,3,i,j-1)
+               lhsa(1,4,i,j) = - tmp2 * fjac(1,4,i,j-1)
+     >              - tmp1 * njac(1,4,i,j-1)
+               lhsa(1,5,i,j) = - tmp2 * fjac(1,5,i,j-1)
+     >              - tmp1 * njac(1,5,i,j-1)
+
+               lhsa(2,1,i,j) = - tmp2 * fjac(2,1,i,j-1)
+     >              - tmp1 * njac(2,1,i,j-1)
+               lhsa(2,2,i,j) = - tmp2 * fjac(2,2,i,j-1)
+     >              - tmp1 * njac(2,2,i,j-1)
+     >              - tmp1 * dy2
+               lhsa(2,3,i,j) = - tmp2 * fjac(2,3,i,j-1)
+     >              - tmp1 * njac(2,3,i,j-1)
+               lhsa(2,4,i,j) = - tmp2 * fjac(2,4,i,j-1)
+     >              - tmp1 * njac(2,4,i,j-1)
+               lhsa(2,5,i,j) = - tmp2 * fjac(2,5,i,j-1)
+     >              - tmp1 * njac(2,5,i,j-1)
+
+               lhsa(3,1,i,j) = - tmp2 * fjac(3,1,i,j-1)
+     >              - tmp1 * njac(3,1,i,j-1)
+               lhsa(3,2,i,j) = - tmp2 * fjac(3,2,i,j-1)
+     >              - tmp1 * njac(3,2,i,j-1)
+               lhsa(3,3,i,j) = - tmp2 * fjac(3,3,i,j-1)
+     >              - tmp1 * njac(3,3,i,j-1)
+     >              - tmp1 * dy3 
+               lhsa(3,4,i,j) = - tmp2 * fjac(3,4,i,j-1)
+     >              - tmp1 * njac(3,4,i,j-1)
+               lhsa(3,5,i,j) = - tmp2 * fjac(3,5,i,j-1)
+     >              - tmp1 * njac(3,5,i,j-1)
+
+               lhsa(4,1,i,j) = - tmp2 * fjac(4,1,i,j-1)
+     >              - tmp1 * njac(4,1,i,j-1)
+               lhsa(4,2,i,j) = - tmp2 * fjac(4,2,i,j-1)
+     >              - tmp1 * njac(4,2,i,j-1)
+               lhsa(4,3,i,j) = - tmp2 * fjac(4,3,i,j-1)
+     >              - tmp1 * njac(4,3,i,j-1)
+               lhsa(4,4,i,j) = - tmp2 * fjac(4,4,i,j-1)
+     >              - tmp1 * njac(4,4,i,j-1)
+     >              - tmp1 * dy4
+               lhsa(4,5,i,j) = - tmp2 * fjac(4,5,i,j-1)
+     >              - tmp1 * njac(4,5,i,j-1)
+
+               lhsa(5,1,i,j) = - tmp2 * fjac(5,1,i,j-1)
+     >              - tmp1 * njac(5,1,i,j-1)
+               lhsa(5,2,i,j) = - tmp2 * fjac(5,2,i,j-1)
+     >              - tmp1 * njac(5,2,i,j-1)
+               lhsa(5,3,i,j) = - tmp2 * fjac(5,3,i,j-1)
+     >              - tmp1 * njac(5,3,i,j-1)
+               lhsa(5,4,i,j) = - tmp2 * fjac(5,4,i,j-1)
+     >              - tmp1 * njac(5,4,i,j-1)
+               lhsa(5,5,i,j) = - tmp2 * fjac(5,5,i,j-1)
+     >              - tmp1 * njac(5,5,i,j-1)
+     >              - tmp1 * dy5
+
+               lhsb(1,1,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(1,1,i,j)
+     >              + tmp1 * 2.0d+00 * dy1
+               lhsb(1,2,i,j) = tmp1 * 2.0d+00 * njac(1,2,i,j)
+               lhsb(1,3,i,j) = tmp1 * 2.0d+00 * njac(1,3,i,j)
+               lhsb(1,4,i,j) = tmp1 * 2.0d+00 * njac(1,4,i,j)
+               lhsb(1,5,i,j) = tmp1 * 2.0d+00 * njac(1,5,i,j)
+
+               lhsb(2,1,i,j) = tmp1 * 2.0d+00 * njac(2,1,i,j)
+               lhsb(2,2,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(2,2,i,j)
+     >              + tmp1 * 2.0d+00 * dy2
+               lhsb(2,3,i,j) = tmp1 * 2.0d+00 * njac(2,3,i,j)
+               lhsb(2,4,i,j) = tmp1 * 2.0d+00 * njac(2,4,i,j)
+               lhsb(2,5,i,j) = tmp1 * 2.0d+00 * njac(2,5,i,j)
+
+               lhsb(3,1,i,j) = tmp1 * 2.0d+00 * njac(3,1,i,j)
+               lhsb(3,2,i,j) = tmp1 * 2.0d+00 * njac(3,2,i,j)
+               lhsb(3,3,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(3,3,i,j)
+     >              + tmp1 * 2.0d+00 * dy3
+               lhsb(3,4,i,j) = tmp1 * 2.0d+00 * njac(3,4,i,j)
+               lhsb(3,5,i,j) = tmp1 * 2.0d+00 * njac(3,5,i,j)
+
+               lhsb(4,1,i,j) = tmp1 * 2.0d+00 * njac(4,1,i,j)
+               lhsb(4,2,i,j) = tmp1 * 2.0d+00 * njac(4,2,i,j)
+               lhsb(4,3,i,j) = tmp1 * 2.0d+00 * njac(4,3,i,j)
+               lhsb(4,4,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(4,4,i,j)
+     >              + tmp1 * 2.0d+00 * dy4
+               lhsb(4,5,i,j) = tmp1 * 2.0d+00 * njac(4,5,i,j)
+
+               lhsb(5,1,i,j) = tmp1 * 2.0d+00 * njac(5,1,i,j)
+               lhsb(5,2,i,j) = tmp1 * 2.0d+00 * njac(5,2,i,j)
+               lhsb(5,3,i,j) = tmp1 * 2.0d+00 * njac(5,3,i,j)
+               lhsb(5,4,i,j) = tmp1 * 2.0d+00 * njac(5,4,i,j)
+               lhsb(5,5,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(5,5,i,j) 
+     >              + tmp1 * 2.0d+00 * dy5
+
+               lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,i,j+1)
+     >              - tmp1 * njac(1,1,i,j+1)
+     >              - tmp1 * dy1
+               lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,i,j+1)
+     >              - tmp1 * njac(1,2,i,j+1)
+               lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,i,j+1)
+     >              - tmp1 * njac(1,3,i,j+1)
+               lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,i,j+1)
+     >              - tmp1 * njac(1,4,i,j+1)
+               lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,i,j+1)
+     >              - tmp1 * njac(1,5,i,j+1)
+
+               lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,i,j+1)
+     >              - tmp1 * njac(2,1,i,j+1)
+               lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,i,j+1)
+     >              - tmp1 * njac(2,2,i,j+1)
+     >              - tmp1 * dy2
+               lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,i,j+1)
+     >              - tmp1 * njac(2,3,i,j+1)
+               lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,i,j+1)
+     >              - tmp1 * njac(2,4,i,j+1)
+               lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,i,j+1)
+     >              - tmp1 * njac(2,5,i,j+1)
+
+               lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,i,j+1)
+     >              - tmp1 * njac(3,1,i,j+1)
+               lhsc(3,2,i,j,k,c) =  tmp2 * fjac(3,2,i,j+1)
+     >              - tmp1 * njac(3,2,i,j+1)
+               lhsc(3,3,i,j,k,c) =  tmp2 * fjac(3,3,i,j+1)
+     >              - tmp1 * njac(3,3,i,j+1)
+     >              - tmp1 * dy3
+               lhsc(3,4,i,j,k,c) =  tmp2 * fjac(3,4,i,j+1)
+     >              - tmp1 * njac(3,4,i,j+1)
+               lhsc(3,5,i,j,k,c) =  tmp2 * fjac(3,5,i,j+1)
+     >              - tmp1 * njac(3,5,i,j+1)
+
+               lhsc(4,1,i,j,k,c) =  tmp2 * fjac(4,1,i,j+1)
+     >              - tmp1 * njac(4,1,i,j+1)
+               lhsc(4,2,i,j,k,c) =  tmp2 * fjac(4,2,i,j+1)
+     >              - tmp1 * njac(4,2,i,j+1)
+               lhsc(4,3,i,j,k,c) =  tmp2 * fjac(4,3,i,j+1)
+     >              - tmp1 * njac(4,3,i,j+1)
+               lhsc(4,4,i,j,k,c) =  tmp2 * fjac(4,4,i,j+1)
+     >              - tmp1 * njac(4,4,i,j+1)
+     >              - tmp1 * dy4
+               lhsc(4,5,i,j,k,c) =  tmp2 * fjac(4,5,i,j+1)
+     >              - tmp1 * njac(4,5,i,j+1)
+
+               lhsc(5,1,i,j,k,c) =  tmp2 * fjac(5,1,i,j+1)
+     >              - tmp1 * njac(5,1,i,j+1)
+               lhsc(5,2,i,j,k,c) =  tmp2 * fjac(5,2,i,j+1)
+     >              - tmp1 * njac(5,2,i,j+1)
+               lhsc(5,3,i,j,k,c) =  tmp2 * fjac(5,3,i,j+1)
+     >              - tmp1 * njac(5,3,i,j+1)
+               lhsc(5,4,i,j,k,c) =  tmp2 * fjac(5,4,i,j+1)
+     >              - tmp1 * njac(5,4,i,j+1)
+               lhsc(5,5,i,j,k,c) =  tmp2 * fjac(5,5,i,j+1)
+     >              - tmp1 * njac(5,5,i,j+1)
+     >              - tmp1 * dy5
+
+            enddo
+         enddo
+
+
+c---------------------------------------------------------------------
+c     outer most do loops - sweeping in i direction
+c---------------------------------------------------------------------
+         if (first .eq. 1) then 
+
+c---------------------------------------------------------------------
+c     multiply c(i,jstart,k) by b_inverse and copy back to c
+c     multiply rhs(jstart) by b_inverse(jstart) and copy to rhs
+c---------------------------------------------------------------------
+!dir$ ivdep
+            do i=start(1,c),isize
+               call binvcrhs( lhsb(1,1,i,jstart),
+     >                        lhsc(1,1,i,jstart,k,c),
+     >                        rhs(1,i,jstart,k,c) )
+            enddo
+
+         endif
+
+c---------------------------------------------------------------------
+c     begin inner most do loop
+c     do all the elements of the cell unless last 
+c---------------------------------------------------------------------
+         do j=jstart+first,jsize-last
+!dir$ ivdep
+            do i=start(1,c),isize
+
+c---------------------------------------------------------------------
+c     subtract A*lhs_vector(j-1) from lhs_vector(j)
+c     
+c     rhs(j) = rhs(j) - A*rhs(j-1)
+c---------------------------------------------------------------------
+               call matvec_sub(lhsa(1,1,i,j),
+     >                         rhs(1,i,j-1,k,c),rhs(1,i,j,k,c))
+
+c---------------------------------------------------------------------
+c     B(j) = B(j) - C(j-1)*A(j)
+c---------------------------------------------------------------------
+               call matmul_sub(lhsa(1,1,i,j),
+     >                         lhsc(1,1,i,j-1,k,c),
+     >                         lhsb(1,1,i,j))
+
+c---------------------------------------------------------------------
+c     multiply c(i,j,k) by b_inverse and copy back to c
+c     multiply rhs(i,1,k) by b_inverse(i,1,k) and copy to rhs
+c---------------------------------------------------------------------
+               call binvcrhs( lhsb(1,1,i,j),
+     >                        lhsc(1,1,i,j,k,c),
+     >                        rhs(1,i,j,k,c) )
+
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     Now finish up special cases for last cell
+c---------------------------------------------------------------------
+         if (last .eq. 1) then
+
+!dir$ ivdep
+            do i=start(1,c),isize
+c---------------------------------------------------------------------
+c     rhs(jsize) = rhs(jsize) - A*rhs(jsize-1)
+c---------------------------------------------------------------------
+               call matvec_sub(lhsa(1,1,i,jsize),
+     >                         rhs(1,i,jsize-1,k,c),rhs(1,i,jsize,k,c))
+
+c---------------------------------------------------------------------
+c     B(jsize) = B(jsize) - C(jsize-1)*A(jsize)
+c     call matmul_sub(aa,i,jsize,k,c,
+c     $              cc,i,jsize-1,k,c,bb,i,jsize,k,c)
+c---------------------------------------------------------------------
+               call matmul_sub(lhsa(1,1,i,jsize),
+     >                         lhsc(1,1,i,jsize-1,k,c),
+     >                         lhsb(1,1,i,jsize))
+
+c---------------------------------------------------------------------
+c     multiply rhs(jsize) by b_inverse(jsize) and copy to rhs
+c---------------------------------------------------------------------
+               call binvrhs( lhsb(1,1,i,jsize),
+     >                       rhs(1,i,jsize,k,c) )
+            enddo
+
+         endif
+      enddo
+
+
+      return
+      end
+      
+
+
diff --git a/examples/smpi/NAS/BT/z_solve.f b/examples/smpi/NAS/BT/z_solve.f
new file mode 100644
index 0000000000..d7a5a2f1ec
--- /dev/null
+++ b/examples/smpi/NAS/BT/z_solve.f
@@ -0,0 +1,776 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine z_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     Performs line solves in Z direction by first factoring
+c     the block-tridiagonal matrix into an upper triangular matrix, 
+c     and then performing back substitution to solve for the unknow
+c     vectors of each line.  
+c     
+c     Make sure we treat elements zero to cell_size in the direction
+c     of the sweep.
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer c, kstart, stage,
+     >     first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
+     >     isize,jsize,ksize,send_id
+
+      kstart = 0
+
+c---------------------------------------------------------------------
+c     in our terminology stage is the number of the cell in the y-direction
+c     i.e. stage = 1 means the start of the line stage=ncells means end
+c---------------------------------------------------------------------
+      do stage = 1,ncells
+         c = slice(3,stage)
+         isize = cell_size(1,c) - 1
+         jsize = cell_size(2,c) - 1
+         ksize = cell_size(3,c) - 1
+c---------------------------------------------------------------------
+c     set last-cell flag
+c---------------------------------------------------------------------
+         if (stage .eq. ncells) then
+            last = 1
+         else
+            last = 0
+         endif
+
+         if (stage .eq. 1) then
+c---------------------------------------------------------------------
+c     This is the first cell, so solve without receiving data
+c---------------------------------------------------------------------
+            first = 1
+c            call lhsz(c)
+            call z_solve_cell(first,last,c)
+         else
+c---------------------------------------------------------------------
+c     Not the first cell of this line, so receive info from
+c     processor working on preceeding cell
+c---------------------------------------------------------------------
+            first = 0
+            call z_receive_solve_info(recv_id,c)
+c---------------------------------------------------------------------
+c     overlap computations and communications
+c---------------------------------------------------------------------
+c            call lhsz(c)
+c---------------------------------------------------------------------
+c     wait for completion
+c---------------------------------------------------------------------
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+c---------------------------------------------------------------------
+c     install C'(kstart+1) and rhs'(kstart+1) to be used in this cell
+c---------------------------------------------------------------------
+            call z_unpack_solve_info(c)
+            call z_solve_cell(first,last,c)
+         endif
+
+         if (last .eq. 0) call z_send_solve_info(send_id,c)
+      enddo
+
+c---------------------------------------------------------------------
+c     now perform backsubstitution in reverse direction
+c---------------------------------------------------------------------
+      do stage = ncells, 1, -1
+         c = slice(3,stage)
+         first = 0
+         last = 0
+         if (stage .eq. 1) first = 1
+         if (stage .eq. ncells) then
+            last = 1
+c---------------------------------------------------------------------
+c     last cell, so perform back substitute without waiting
+c---------------------------------------------------------------------
+            call z_backsubstitute(first, last,c)
+         else
+            call z_receive_backsub_info(recv_id,c)
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+            call z_unpack_backsub_info(c)
+            call z_backsubstitute(first,last,c)
+         endif
+         if (first .eq. 0) call z_send_backsub_info(send_id,c)
+      enddo
+
+
+      return
+      end
+      
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine z_unpack_solve_info(c)
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack C'(-1) and rhs'(-1) for
+c     all i and j
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer i,j,m,n,ptr,c,kstart 
+
+      kstart = 0
+      ptr = 0
+      do j=0,JMAX-1
+         do i=0,IMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  lhsc(m,n,i,j,kstart-1,c) = out_buffer(ptr+n)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               rhs(n,i,j,kstart-1,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine z_send_solve_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send C'(kend) and rhs'(kend) for
+c     all i and j
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer i,j,m,n,ksize,ptr,c,ip,jp
+      integer error,send_id,buffer_size
+
+      ksize = cell_size(3,c)-1
+      ip = cell_coord(1,c) - 1
+      jp = cell_coord(2,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+
+c---------------------------------------------------------------------
+c     pack up buffer
+c---------------------------------------------------------------------
+      ptr = 0
+      do j=0,JMAX-1
+         do i=0,IMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  in_buffer(ptr+n) = lhsc(m,n,i,j,ksize,c)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,i,j,ksize,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+c---------------------------------------------------------------------
+c     send buffer 
+c---------------------------------------------------------------------
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, successor(3),
+     >     BOTTOM+ip+jp*NCELLS, comm_solve,
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine z_send_backsub_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send U(jstart) for all i and j
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer i,j,n,ptr,c,kstart,ip,jp
+      integer error,send_id,buffer_size
+
+c---------------------------------------------------------------------
+c     Send element 0 to previous processor
+c---------------------------------------------------------------------
+      kstart = 0
+      ip = cell_coord(1,c)-1
+      jp = cell_coord(2,c)-1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      ptr = 0
+      do j=0,JMAX-1
+         do i=0,IMAX-1
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,i,j,kstart,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, predecessor(3), 
+     >     TOP+ip+jp*NCELLS, comm_solve, 
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine z_unpack_backsub_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack U(ksize) for all i and j
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer i,j,n,ptr,c
+
+      ptr = 0
+      do j=0,JMAX-1
+         do i=0,IMAX-1
+            do n=1,BLOCK_SIZE
+               backsub_info(n,i,j,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine z_receive_backsub_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer error,recv_id,ip,jp,c,buffer_size
+      ip = cell_coord(1,c) - 1
+      jp = cell_coord(2,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      call mpi_irecv(out_buffer, buffer_size,
+     >     dp_type, successor(3), 
+     >     TOP+ip+jp*NCELLS, comm_solve, 
+     >     recv_id, error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine z_receive_solve_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives 
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer ip,jp,recv_id,error,c,buffer_size
+      ip = cell_coord(1,c) - 1
+      jp = cell_coord(2,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+      call mpi_irecv(out_buffer, buffer_size,
+     >     dp_type, predecessor(3), 
+     >     BOTTOM+ip+jp*NCELLS, comm_solve,
+     >     recv_id, error)
+
+      return
+      end
+      
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine z_backsubstitute(first, last, c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     back solve: if last cell, then generate U(ksize)=rhs(ksize)
+c     else assume U(ksize) is loaded in un pack backsub_info
+c     so just use it
+c     after call u(kstart) will be sent to next cell
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer first, last, c, i, k
+      integer m,n,j,jsize,isize,ksize,kstart
+      
+      kstart = 0
+      isize = cell_size(1,c)-end(1,c)-1      
+      jsize = cell_size(2,c)-end(2,c)-1
+      ksize = cell_size(3,c)-1
+      if (last .eq. 0) then
+         do j=start(2,c),jsize
+            do i=start(1,c),isize
+c---------------------------------------------------------------------
+c     U(jsize) uses info from previous cell if not last cell
+c---------------------------------------------------------------------
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,i,j,ksize,c) = rhs(m,i,j,ksize,c) 
+     >                    - lhsc(m,n,i,j,ksize,c)*
+     >                    backsub_info(n,i,j,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+      endif
+      do k=ksize-1,kstart,-1
+         do j=start(2,c),jsize
+            do i=start(1,c),isize
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) 
+     >                    - lhsc(m,n,i,j,k,c)*rhs(n,i,j,k+1,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine z_solve_cell(first,last,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     performs guaussian elimination on this cell.
+c     
+c     assumes that unpacking routines for non-first cells 
+c     preload C' and rhs' from previous cell.
+c     
+c     assumed send happens outside this routine, but that
+c     c'(KMAX) and rhs'(KMAX) will be sent to next cell.
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'work_lhs.h'
+
+      integer first,last,c
+      integer i,j,k,isize,ksize,jsize,kstart
+      double precision utmp(6,-2:KMAX+1)
+
+      kstart = 0
+      isize = cell_size(1,c)-end(1,c)-1
+      jsize = cell_size(2,c)-end(2,c)-1
+      ksize = cell_size(3,c)-1
+
+      call lhsabinit(lhsa, lhsb, ksize)
+
+      do j=start(2,c),jsize 
+         do i=start(1,c),isize
+
+c---------------------------------------------------------------------
+c     This function computes the left hand side for the three z-factors   
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     Compute the indices for storing the block-diagonal matrix;
+c     determine c (labeled f) and s jacobians for cell c
+c---------------------------------------------------------------------
+            do k = start(3,c)-1, cell_size(3,c)-end(3,c)
+               utmp(1,k) = 1.0d0 / u(1,i,j,k,c)
+               utmp(2,k) = u(2,i,j,k,c)
+               utmp(3,k) = u(3,i,j,k,c)
+               utmp(4,k) = u(4,i,j,k,c)
+               utmp(5,k) = u(5,i,j,k,c)
+               utmp(6,k) = qs(i,j,k,c)
+            end do
+
+            do k = start(3,c)-1, cell_size(3,c)-end(3,c)
+
+               tmp1 = utmp(1,k)
+               tmp2 = tmp1 * tmp1
+               tmp3 = tmp1 * tmp2
+
+               fjac(1,1,k) = 0.0d+00
+               fjac(1,2,k) = 0.0d+00
+               fjac(1,3,k) = 0.0d+00
+               fjac(1,4,k) = 1.0d+00
+               fjac(1,5,k) = 0.0d+00
+
+               fjac(2,1,k) = - ( utmp(2,k)*utmp(4,k) ) 
+     >              * tmp2 
+               fjac(2,2,k) = utmp(4,k) * tmp1
+               fjac(2,3,k) = 0.0d+00
+               fjac(2,4,k) = utmp(2,k) * tmp1
+               fjac(2,5,k) = 0.0d+00
+
+               fjac(3,1,k) = - ( utmp(3,k)*utmp(4,k) )
+     >              * tmp2 
+               fjac(3,2,k) = 0.0d+00
+               fjac(3,3,k) = utmp(4,k) * tmp1
+               fjac(3,4,k) = utmp(3,k) * tmp1
+               fjac(3,5,k) = 0.0d+00
+
+               fjac(4,1,k) = - (utmp(4,k)*utmp(4,k) * tmp2 ) 
+     >              + c2 * utmp(6,k)
+               fjac(4,2,k) = - c2 *  utmp(2,k) * tmp1 
+               fjac(4,3,k) = - c2 *  utmp(3,k) * tmp1
+               fjac(4,4,k) = ( 2.0d+00 - c2 )
+     >              *  utmp(4,k) * tmp1 
+               fjac(4,5,k) = c2
+
+               fjac(5,1,k) = ( c2 * 2.0d0 * utmp(6,k)
+     >              - c1 * ( utmp(5,k) * tmp1 ) )
+     >              * ( utmp(4,k) * tmp1 )
+               fjac(5,2,k) = - c2 * ( utmp(2,k)*utmp(4,k) )
+     >              * tmp2 
+               fjac(5,3,k) = - c2 * ( utmp(3,k)*utmp(4,k) )
+     >              * tmp2
+               fjac(5,4,k) = c1 * ( utmp(5,k) * tmp1 )
+     >              - c2 * ( utmp(6,k)
+     >              + utmp(4,k)*utmp(4,k) * tmp2 )
+               fjac(5,5,k) = c1 * utmp(4,k) * tmp1
+
+               njac(1,1,k) = 0.0d+00
+               njac(1,2,k) = 0.0d+00
+               njac(1,3,k) = 0.0d+00
+               njac(1,4,k) = 0.0d+00
+               njac(1,5,k) = 0.0d+00
+
+               njac(2,1,k) = - c3c4 * tmp2 * utmp(2,k)
+               njac(2,2,k) =   c3c4 * tmp1
+               njac(2,3,k) =   0.0d+00
+               njac(2,4,k) =   0.0d+00
+               njac(2,5,k) =   0.0d+00
+
+               njac(3,1,k) = - c3c4 * tmp2 * utmp(3,k)
+               njac(3,2,k) =   0.0d+00
+               njac(3,3,k) =   c3c4 * tmp1
+               njac(3,4,k) =   0.0d+00
+               njac(3,5,k) =   0.0d+00
+
+               njac(4,1,k) = - con43 * c3c4 * tmp2 * utmp(4,k)
+               njac(4,2,k) =   0.0d+00
+               njac(4,3,k) =   0.0d+00
+               njac(4,4,k) =   con43 * c3 * c4 * tmp1
+               njac(4,5,k) =   0.0d+00
+
+               njac(5,1,k) = - (  c3c4
+     >              - c1345 ) * tmp3 * (utmp(2,k)**2)
+     >              - ( c3c4 - c1345 ) * tmp3 * (utmp(3,k)**2)
+     >              - ( con43 * c3c4
+     >              - c1345 ) * tmp3 * (utmp(4,k)**2)
+     >              - c1345 * tmp2 * utmp(5,k)
+
+               njac(5,2,k) = (  c3c4 - c1345 ) * tmp2 * utmp(2,k)
+               njac(5,3,k) = (  c3c4 - c1345 ) * tmp2 * utmp(3,k)
+               njac(5,4,k) = ( con43 * c3c4
+     >              - c1345 ) * tmp2 * utmp(4,k)
+               njac(5,5,k) = ( c1345 )* tmp1
+
+
+            enddo
+
+c---------------------------------------------------------------------
+c     now joacobians set, so form left hand side in z direction
+c---------------------------------------------------------------------
+            do k = start(3,c), ksize-end(3,c)
+
+               tmp1 = dt * tz1
+               tmp2 = dt * tz2
+
+               lhsa(1,1,k) = - tmp2 * fjac(1,1,k-1)
+     >              - tmp1 * njac(1,1,k-1)
+     >              - tmp1 * dz1 
+               lhsa(1,2,k) = - tmp2 * fjac(1,2,k-1)
+     >              - tmp1 * njac(1,2,k-1)
+               lhsa(1,3,k) = - tmp2 * fjac(1,3,k-1)
+     >              - tmp1 * njac(1,3,k-1)
+               lhsa(1,4,k) = - tmp2 * fjac(1,4,k-1)
+     >              - tmp1 * njac(1,4,k-1)
+               lhsa(1,5,k) = - tmp2 * fjac(1,5,k-1)
+     >              - tmp1 * njac(1,5,k-1)
+
+               lhsa(2,1,k) = - tmp2 * fjac(2,1,k-1)
+     >              - tmp1 * njac(2,1,k-1)
+               lhsa(2,2,k) = - tmp2 * fjac(2,2,k-1)
+     >              - tmp1 * njac(2,2,k-1)
+     >              - tmp1 * dz2
+               lhsa(2,3,k) = - tmp2 * fjac(2,3,k-1)
+     >              - tmp1 * njac(2,3,k-1)
+               lhsa(2,4,k) = - tmp2 * fjac(2,4,k-1)
+     >              - tmp1 * njac(2,4,k-1)
+               lhsa(2,5,k) = - tmp2 * fjac(2,5,k-1)
+     >              - tmp1 * njac(2,5,k-1)
+
+               lhsa(3,1,k) = - tmp2 * fjac(3,1,k-1)
+     >              - tmp1 * njac(3,1,k-1)
+               lhsa(3,2,k) = - tmp2 * fjac(3,2,k-1)
+     >              - tmp1 * njac(3,2,k-1)
+               lhsa(3,3,k) = - tmp2 * fjac(3,3,k-1)
+     >              - tmp1 * njac(3,3,k-1)
+     >              - tmp1 * dz3 
+               lhsa(3,4,k) = - tmp2 * fjac(3,4,k-1)
+     >              - tmp1 * njac(3,4,k-1)
+               lhsa(3,5,k) = - tmp2 * fjac(3,5,k-1)
+     >              - tmp1 * njac(3,5,k-1)
+
+               lhsa(4,1,k) = - tmp2 * fjac(4,1,k-1)
+     >              - tmp1 * njac(4,1,k-1)
+               lhsa(4,2,k) = - tmp2 * fjac(4,2,k-1)
+     >              - tmp1 * njac(4,2,k-1)
+               lhsa(4,3,k) = - tmp2 * fjac(4,3,k-1)
+     >              - tmp1 * njac(4,3,k-1)
+               lhsa(4,4,k) = - tmp2 * fjac(4,4,k-1)
+     >              - tmp1 * njac(4,4,k-1)
+     >              - tmp1 * dz4
+               lhsa(4,5,k) = - tmp2 * fjac(4,5,k-1)
+     >              - tmp1 * njac(4,5,k-1)
+
+               lhsa(5,1,k) = - tmp2 * fjac(5,1,k-1)
+     >              - tmp1 * njac(5,1,k-1)
+               lhsa(5,2,k) = - tmp2 * fjac(5,2,k-1)
+     >              - tmp1 * njac(5,2,k-1)
+               lhsa(5,3,k) = - tmp2 * fjac(5,3,k-1)
+     >              - tmp1 * njac(5,3,k-1)
+               lhsa(5,4,k) = - tmp2 * fjac(5,4,k-1)
+     >              - tmp1 * njac(5,4,k-1)
+               lhsa(5,5,k) = - tmp2 * fjac(5,5,k-1)
+     >              - tmp1 * njac(5,5,k-1)
+     >              - tmp1 * dz5
+
+               lhsb(1,1,k) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(1,1,k)
+     >              + tmp1 * 2.0d+00 * dz1
+               lhsb(1,2,k) = tmp1 * 2.0d+00 * njac(1,2,k)
+               lhsb(1,3,k) = tmp1 * 2.0d+00 * njac(1,3,k)
+               lhsb(1,4,k) = tmp1 * 2.0d+00 * njac(1,4,k)
+               lhsb(1,5,k) = tmp1 * 2.0d+00 * njac(1,5,k)
+
+               lhsb(2,1,k) = tmp1 * 2.0d+00 * njac(2,1,k)
+               lhsb(2,2,k) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(2,2,k)
+     >              + tmp1 * 2.0d+00 * dz2
+               lhsb(2,3,k) = tmp1 * 2.0d+00 * njac(2,3,k)
+               lhsb(2,4,k) = tmp1 * 2.0d+00 * njac(2,4,k)
+               lhsb(2,5,k) = tmp1 * 2.0d+00 * njac(2,5,k)
+
+               lhsb(3,1,k) = tmp1 * 2.0d+00 * njac(3,1,k)
+               lhsb(3,2,k) = tmp1 * 2.0d+00 * njac(3,2,k)
+               lhsb(3,3,k) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(3,3,k)
+     >              + tmp1 * 2.0d+00 * dz3
+               lhsb(3,4,k) = tmp1 * 2.0d+00 * njac(3,4,k)
+               lhsb(3,5,k) = tmp1 * 2.0d+00 * njac(3,5,k)
+
+               lhsb(4,1,k) = tmp1 * 2.0d+00 * njac(4,1,k)
+               lhsb(4,2,k) = tmp1 * 2.0d+00 * njac(4,2,k)
+               lhsb(4,3,k) = tmp1 * 2.0d+00 * njac(4,3,k)
+               lhsb(4,4,k) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(4,4,k)
+     >              + tmp1 * 2.0d+00 * dz4
+               lhsb(4,5,k) = tmp1 * 2.0d+00 * njac(4,5,k)
+
+               lhsb(5,1,k) = tmp1 * 2.0d+00 * njac(5,1,k)
+               lhsb(5,2,k) = tmp1 * 2.0d+00 * njac(5,2,k)
+               lhsb(5,3,k) = tmp1 * 2.0d+00 * njac(5,3,k)
+               lhsb(5,4,k) = tmp1 * 2.0d+00 * njac(5,4,k)
+               lhsb(5,5,k) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(5,5,k) 
+     >              + tmp1 * 2.0d+00 * dz5
+
+               lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,k+1)
+     >              - tmp1 * njac(1,1,k+1)
+     >              - tmp1 * dz1
+               lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,k+1)
+     >              - tmp1 * njac(1,2,k+1)
+               lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,k+1)
+     >              - tmp1 * njac(1,3,k+1)
+               lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,k+1)
+     >              - tmp1 * njac(1,4,k+1)
+               lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,k+1)
+     >              - tmp1 * njac(1,5,k+1)
+
+               lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,k+1)
+     >              - tmp1 * njac(2,1,k+1)
+               lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,k+1)
+     >              - tmp1 * njac(2,2,k+1)
+     >              - tmp1 * dz2
+               lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,k+1)
+     >              - tmp1 * njac(2,3,k+1)
+               lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,k+1)
+     >              - tmp1 * njac(2,4,k+1)
+               lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,k+1)
+     >              - tmp1 * njac(2,5,k+1)
+
+               lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,k+1)
+     >              - tmp1 * njac(3,1,k+1)
+               lhsc(3,2,i,j,k,c) =  tmp2 * fjac(3,2,k+1)
+     >              - tmp1 * njac(3,2,k+1)
+               lhsc(3,3,i,j,k,c) =  tmp2 * fjac(3,3,k+1)
+     >              - tmp1 * njac(3,3,k+1)
+     >              - tmp1 * dz3
+               lhsc(3,4,i,j,k,c) =  tmp2 * fjac(3,4,k+1)
+     >              - tmp1 * njac(3,4,k+1)
+               lhsc(3,5,i,j,k,c) =  tmp2 * fjac(3,5,k+1)
+     >              - tmp1 * njac(3,5,k+1)
+
+               lhsc(4,1,i,j,k,c) =  tmp2 * fjac(4,1,k+1)
+     >              - tmp1 * njac(4,1,k+1)
+               lhsc(4,2,i,j,k,c) =  tmp2 * fjac(4,2,k+1)
+     >              - tmp1 * njac(4,2,k+1)
+               lhsc(4,3,i,j,k,c) =  tmp2 * fjac(4,3,k+1)
+     >              - tmp1 * njac(4,3,k+1)
+               lhsc(4,4,i,j,k,c) =  tmp2 * fjac(4,4,k+1)
+     >              - tmp1 * njac(4,4,k+1)
+     >              - tmp1 * dz4
+               lhsc(4,5,i,j,k,c) =  tmp2 * fjac(4,5,k+1)
+     >              - tmp1 * njac(4,5,k+1)
+
+               lhsc(5,1,i,j,k,c) =  tmp2 * fjac(5,1,k+1)
+     >              - tmp1 * njac(5,1,k+1)
+               lhsc(5,2,i,j,k,c) =  tmp2 * fjac(5,2,k+1)
+     >              - tmp1 * njac(5,2,k+1)
+               lhsc(5,3,i,j,k,c) =  tmp2 * fjac(5,3,k+1)
+     >              - tmp1 * njac(5,3,k+1)
+               lhsc(5,4,i,j,k,c) =  tmp2 * fjac(5,4,k+1)
+     >              - tmp1 * njac(5,4,k+1)
+               lhsc(5,5,i,j,k,c) =  tmp2 * fjac(5,5,k+1)
+     >              - tmp1 * njac(5,5,k+1)
+     >              - tmp1 * dz5
+
+            enddo
+
+
+c---------------------------------------------------------------------
+c     outer most do loops - sweeping in i direction
+c---------------------------------------------------------------------
+            if (first .eq. 1) then 
+
+c---------------------------------------------------------------------
+c     multiply c(i,j,kstart) by b_inverse and copy back to c
+c     multiply rhs(kstart) by b_inverse(kstart) and copy to rhs
+c---------------------------------------------------------------------
+               call binvcrhs( lhsb(1,1,kstart),
+     >                        lhsc(1,1,i,j,kstart,c),
+     >                        rhs(1,i,j,kstart,c) )
+
+            endif
+
+c---------------------------------------------------------------------
+c     begin inner most do loop
+c     do all the elements of the cell unless last 
+c---------------------------------------------------------------------
+            do k=kstart+first,ksize-last
+
+c---------------------------------------------------------------------
+c     subtract A*lhs_vector(k-1) from lhs_vector(k)
+c     
+c     rhs(k) = rhs(k) - A*rhs(k-1)
+c---------------------------------------------------------------------
+               call matvec_sub(lhsa(1,1,k),
+     >                         rhs(1,i,j,k-1,c),rhs(1,i,j,k,c))
+
+c---------------------------------------------------------------------
+c     B(k) = B(k) - C(k-1)*A(k)
+c     call matmul_sub(aa,i,j,k,c,cc,i,j,k-1,c,bb,i,j,k,c)
+c---------------------------------------------------------------------
+               call matmul_sub(lhsa(1,1,k),
+     >                         lhsc(1,1,i,j,k-1,c),
+     >                         lhsb(1,1,k))
+
+c---------------------------------------------------------------------
+c     multiply c(i,j,k) by b_inverse and copy back to c
+c     multiply rhs(i,j,1) by b_inverse(i,j,1) and copy to rhs
+c---------------------------------------------------------------------
+               call binvcrhs( lhsb(1,1,k),
+     >                        lhsc(1,1,i,j,k,c),
+     >                        rhs(1,i,j,k,c) )
+
+            enddo
+
+c---------------------------------------------------------------------
+c     Now finish up special cases for last cell
+c---------------------------------------------------------------------
+            if (last .eq. 1) then
+
+c---------------------------------------------------------------------
+c     rhs(ksize) = rhs(ksize) - A*rhs(ksize-1)
+c---------------------------------------------------------------------
+               call matvec_sub(lhsa(1,1,ksize),
+     >                         rhs(1,i,j,ksize-1,c),rhs(1,i,j,ksize,c))
+
+c---------------------------------------------------------------------
+c     B(ksize) = B(ksize) - C(ksize-1)*A(ksize)
+c     call matmul_sub(aa,i,j,ksize,c,
+c     $              cc,i,j,ksize-1,c,bb,i,j,ksize,c)
+c---------------------------------------------------------------------
+               call matmul_sub(lhsa(1,1,ksize),
+     >                         lhsc(1,1,i,j,ksize-1,c),
+     >                         lhsb(1,1,ksize))
+
+c---------------------------------------------------------------------
+c     multiply rhs(ksize) by b_inverse(ksize) and copy to rhs
+c---------------------------------------------------------------------
+               call binvrhs( lhsb(1,1,ksize),
+     >                       rhs(1,i,j,ksize,c) )
+
+            endif
+         enddo
+      enddo
+
+
+      return
+      end
+      
+
+
+
+
+
diff --git a/examples/smpi/NAS/BT/z_solve_vec.f b/examples/smpi/NAS/BT/z_solve_vec.f
new file mode 100644
index 0000000000..2c27fb00d1
--- /dev/null
+++ b/examples/smpi/NAS/BT/z_solve_vec.f
@@ -0,0 +1,793 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine z_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     Performs line solves in Z direction by first factoring
+c     the block-tridiagonal matrix into an upper triangular matrix, 
+c     and then performing back substitution to solve for the unknow
+c     vectors of each line.  
+c     
+c     Make sure we treat elements zero to cell_size in the direction
+c     of the sweep.
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer c, kstart, stage,
+     >     first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
+     >     isize,jsize,ksize,send_id
+
+      kstart = 0
+
+c---------------------------------------------------------------------
+c     in our terminology stage is the number of the cell in the y-direct
+c     i.e. stage = 1 means the start of the line stage=ncells means end
+c---------------------------------------------------------------------
+      do stage = 1,ncells
+         c = slice(3,stage)
+         isize = cell_size(1,c) - 1
+         jsize = cell_size(2,c) - 1
+         ksize = cell_size(3,c) - 1
+c---------------------------------------------------------------------
+c     set last-cell flag
+c---------------------------------------------------------------------
+         if (stage .eq. ncells) then
+            last = 1
+         else
+            last = 0
+         endif
+
+         if (stage .eq. 1) then
+c---------------------------------------------------------------------
+c     This is the first cell, so solve without receiving data
+c---------------------------------------------------------------------
+            first = 1
+c            call lhsz(c)
+            call z_solve_cell(first,last,c)
+         else
+c---------------------------------------------------------------------
+c     Not the first cell of this line, so receive info from
+c     processor working on preceeding cell
+c---------------------------------------------------------------------
+            first = 0
+            call z_receive_solve_info(recv_id,c)
+c---------------------------------------------------------------------
+c     overlap computations and communications
+c---------------------------------------------------------------------
+c            call lhsz(c)
+c---------------------------------------------------------------------
+c     wait for completion
+c---------------------------------------------------------------------
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+c---------------------------------------------------------------------
+c     install C'(kstart+1) and rhs'(kstart+1) to be used in this cell
+c---------------------------------------------------------------------
+            call z_unpack_solve_info(c)
+            call z_solve_cell(first,last,c)
+         endif
+
+         if (last .eq. 0) call z_send_solve_info(send_id,c)
+      enddo
+
+c---------------------------------------------------------------------
+c     now perform backsubstitution in reverse direction
+c---------------------------------------------------------------------
+      do stage = ncells, 1, -1
+         c = slice(3,stage)
+         first = 0
+         last = 0
+         if (stage .eq. 1) first = 1
+         if (stage .eq. ncells) then
+            last = 1
+c---------------------------------------------------------------------
+c     last cell, so perform back substitute without waiting
+c---------------------------------------------------------------------
+            call z_backsubstitute(first, last,c)
+         else
+            call z_receive_backsub_info(recv_id,c)
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+            call z_unpack_backsub_info(c)
+            call z_backsubstitute(first,last,c)
+         endif
+         if (first .eq. 0) call z_send_backsub_info(send_id,c)
+      enddo
+
+
+      return
+      end
+      
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine z_unpack_solve_info(c)
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack C'(-1) and rhs'(-1) for
+c     all i and j
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer i,j,m,n,ptr,c,kstart 
+
+      kstart = 0
+      ptr = 0
+      do j=0,JMAX-1
+         do i=0,IMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  lhsc(m,n,i,j,kstart-1,c) = out_buffer(ptr+n)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               rhs(n,i,j,kstart-1,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine z_send_solve_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send C'(kend) and rhs'(kend) for
+c     all i and j
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer i,j,m,n,ksize,ptr,c,ip,jp
+      integer error,send_id,buffer_size
+
+      ksize = cell_size(3,c)-1
+      ip = cell_coord(1,c) - 1
+      jp = cell_coord(2,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+
+c---------------------------------------------------------------------
+c     pack up buffer
+c---------------------------------------------------------------------
+      ptr = 0
+      do j=0,JMAX-1
+         do i=0,IMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  in_buffer(ptr+n) = lhsc(m,n,i,j,ksize,c)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,i,j,ksize,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+c---------------------------------------------------------------------
+c     send buffer 
+c---------------------------------------------------------------------
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, successor(3),
+     >     BOTTOM+ip+jp*NCELLS, comm_solve,
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine z_send_backsub_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send U(jstart) for all i and j
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer i,j,n,ptr,c,kstart,ip,jp
+      integer error,send_id,buffer_size
+
+c---------------------------------------------------------------------
+c     Send element 0 to previous processor
+c---------------------------------------------------------------------
+      kstart = 0
+      ip = cell_coord(1,c)-1
+      jp = cell_coord(2,c)-1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      ptr = 0
+      do j=0,JMAX-1
+         do i=0,IMAX-1
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,i,j,kstart,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, predecessor(3), 
+     >     TOP+ip+jp*NCELLS, comm_solve, 
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine z_unpack_backsub_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack U(ksize) for all i and j
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer i,j,n,ptr,c
+
+      ptr = 0
+      do j=0,JMAX-1
+         do i=0,IMAX-1
+            do n=1,BLOCK_SIZE
+               backsub_info(n,i,j,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine z_receive_backsub_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer error,recv_id,ip,jp,c,buffer_size
+      ip = cell_coord(1,c) - 1
+      jp = cell_coord(2,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      call mpi_irecv(out_buffer, buffer_size,
+     >     dp_type, successor(3), 
+     >     TOP+ip+jp*NCELLS, comm_solve, 
+     >     recv_id, error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine z_receive_solve_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives 
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer ip,jp,recv_id,error,c,buffer_size
+      ip = cell_coord(1,c) - 1
+      jp = cell_coord(2,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+      call mpi_irecv(out_buffer, buffer_size,
+     >     dp_type, predecessor(3), 
+     >     BOTTOM+ip+jp*NCELLS, comm_solve,
+     >     recv_id, error)
+
+      return
+      end
+      
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine z_backsubstitute(first, last, c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     back solve: if last cell, then generate U(ksize)=rhs(ksize)
+c     else assume U(ksize) is loaded in un pack backsub_info
+c     so just use it
+c     after call u(kstart) will be sent to next cell
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer first, last, c, i, k
+      integer m,n,j,jsize,isize,ksize,kstart
+      
+      kstart = 0
+      isize = cell_size(1,c)-end(1,c)-1      
+      jsize = cell_size(2,c)-end(2,c)-1
+      ksize = cell_size(3,c)-1
+      if (last .eq. 0) then
+         do j=start(2,c),jsize
+            do i=start(1,c),isize
+c---------------------------------------------------------------------
+c     U(jsize) uses info from previous cell if not last cell
+c---------------------------------------------------------------------
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,i,j,ksize,c) = rhs(m,i,j,ksize,c) 
+     >                    - lhsc(m,n,i,j,ksize,c)*
+     >                    backsub_info(n,i,j,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+      endif
+      do k=ksize-1,kstart,-1
+         do j=start(2,c),jsize
+            do i=start(1,c),isize
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) 
+     >                    - lhsc(m,n,i,j,k,c)*rhs(n,i,j,k+1,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine z_solve_cell(first,last,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     performs guaussian elimination on this cell.
+c     
+c     assumes that unpacking routines for non-first cells 
+c     preload C' and rhs' from previous cell.
+c     
+c     assumed send happens outside this routine, but that
+c     c'(KMAX) and rhs'(KMAX) will be sent to next cell.
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'work_lhs_vec.h'
+
+      integer first,last,c
+      integer i,j,k,m,n,isize,ksize,jsize,kstart
+
+      kstart = 0
+      isize = cell_size(1,c)-end(1,c)-1
+      jsize = cell_size(2,c)-end(2,c)-1
+      ksize = cell_size(3,c)-1
+
+c---------------------------------------------------------------------
+c     zero the left hand side for starters
+c     set diagonal values to 1. This is overkill, but convenient
+c---------------------------------------------------------------------
+      do i = 0, isize
+         do m = 1, 5
+            do n = 1, 5
+               lhsa(m,n,i,0) = 0.0d0
+               lhsb(m,n,i,0) = 0.0d0
+               lhsa(m,n,i,ksize) = 0.0d0
+               lhsb(m,n,i,ksize) = 0.0d0
+            enddo
+            lhsb(m,m,i,0) = 1.0d0
+            lhsb(m,m,i,ksize) = 1.0d0
+         enddo
+      enddo
+
+      do j=start(2,c),jsize 
+
+c---------------------------------------------------------------------
+c     This function computes the left hand side for the three z-factors 
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     Compute the indices for storing the block-diagonal matrix;
+c     determine c (labeled f) and s jacobians for cell c
+c---------------------------------------------------------------------
+
+         do k = start(3,c)-1, cell_size(3,c)-end(3,c)
+            do i=start(1,c),isize
+
+               tmp1 = 1.0d0 / u(1,i,j,k,c)
+               tmp2 = tmp1 * tmp1
+               tmp3 = tmp1 * tmp2
+
+               fjac(1,1,i,k) = 0.0d+00
+               fjac(1,2,i,k) = 0.0d+00
+               fjac(1,3,i,k) = 0.0d+00
+               fjac(1,4,i,k) = 1.0d+00
+               fjac(1,5,i,k) = 0.0d+00
+
+               fjac(2,1,i,k) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) 
+     >              * tmp2 
+               fjac(2,2,i,k) = u(4,i,j,k,c) * tmp1
+               fjac(2,3,i,k) = 0.0d+00
+               fjac(2,4,i,k) = u(2,i,j,k,c) * tmp1
+               fjac(2,5,i,k) = 0.0d+00
+
+               fjac(3,1,i,k) = - ( u(3,i,j,k,c)*u(4,i,j,k,c) )
+     >              * tmp2 
+               fjac(3,2,i,k) = 0.0d+00
+               fjac(3,3,i,k) = u(4,i,j,k,c) * tmp1
+               fjac(3,4,i,k) = u(3,i,j,k,c) * tmp1
+               fjac(3,5,i,k) = 0.0d+00
+
+               fjac(4,1,i,k) = - (u(4,i,j,k,c)*u(4,i,j,k,c) * tmp2 ) 
+     >              + c2 * qs(i,j,k,c)
+               fjac(4,2,i,k) = - c2 *  u(2,i,j,k,c) * tmp1 
+               fjac(4,3,i,k) = - c2 *  u(3,i,j,k,c) * tmp1
+               fjac(4,4,i,k) = ( 2.0d+00 - c2 )
+     >              *  u(4,i,j,k,c) * tmp1 
+               fjac(4,5,i,k) = c2
+
+               fjac(5,1,i,k) = ( c2 * 2.0d0 * qs(i,j,k,c)
+     >              - c1 * ( u(5,i,j,k,c) * tmp1 ) )
+     >              * ( u(4,i,j,k,c) * tmp1 )
+               fjac(5,2,i,k) = - c2 * ( u(2,i,j,k,c)*u(4,i,j,k,c) )
+     >              * tmp2 
+               fjac(5,3,i,k) = - c2 * ( u(3,i,j,k,c)*u(4,i,j,k,c) )
+     >              * tmp2
+               fjac(5,4,i,k) = c1 * ( u(5,i,j,k,c) * tmp1 )
+     >              - c2 * ( qs(i,j,k,c)
+     >              + u(4,i,j,k,c)*u(4,i,j,k,c) * tmp2 )
+               fjac(5,5,i,k) = c1 * u(4,i,j,k,c) * tmp1
+
+               njac(1,1,i,k) = 0.0d+00
+               njac(1,2,i,k) = 0.0d+00
+               njac(1,3,i,k) = 0.0d+00
+               njac(1,4,i,k) = 0.0d+00
+               njac(1,5,i,k) = 0.0d+00
+
+               njac(2,1,i,k) = - c3c4 * tmp2 * u(2,i,j,k,c)
+               njac(2,2,i,k) =   c3c4 * tmp1
+               njac(2,3,i,k) =   0.0d+00
+               njac(2,4,i,k) =   0.0d+00
+               njac(2,5,i,k) =   0.0d+00
+
+               njac(3,1,i,k) = - c3c4 * tmp2 * u(3,i,j,k,c)
+               njac(3,2,i,k) =   0.0d+00
+               njac(3,3,i,k) =   c3c4 * tmp1
+               njac(3,4,i,k) =   0.0d+00
+               njac(3,5,i,k) =   0.0d+00
+
+               njac(4,1,i,k) = - con43 * c3c4 * tmp2 * u(4,i,j,k,c)
+               njac(4,2,i,k) =   0.0d+00
+               njac(4,3,i,k) =   0.0d+00
+               njac(4,4,i,k) =   con43 * c3 * c4 * tmp1
+               njac(4,5,i,k) =   0.0d+00
+
+               njac(5,1,i,k) = - (  c3c4
+     >              - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
+     >              - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
+     >              - ( con43 * c3c4
+     >              - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
+     >              - c1345 * tmp2 * u(5,i,j,k,c)
+
+               njac(5,2,i,k) = (  c3c4 - c1345 ) * tmp2 * u(2,i,j,k,c)
+               njac(5,3,i,k) = (  c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c)
+               njac(5,4,i,k) = ( con43 * c3c4
+     >              - c1345 ) * tmp2 * u(4,i,j,k,c)
+               njac(5,5,i,k) = ( c1345 )* tmp1
+
+
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     now joacobians set, so form left hand side in z direction
+c---------------------------------------------------------------------
+         do k = start(3,c), ksize-end(3,c)
+            do i=start(1,c),isize
+
+               tmp1 = dt * tz1
+               tmp2 = dt * tz2
+
+               lhsa(1,1,i,k) = - tmp2 * fjac(1,1,i,k-1)
+     >              - tmp1 * njac(1,1,i,k-1)
+     >              - tmp1 * dz1 
+               lhsa(1,2,i,k) = - tmp2 * fjac(1,2,i,k-1)
+     >              - tmp1 * njac(1,2,i,k-1)
+               lhsa(1,3,i,k) = - tmp2 * fjac(1,3,i,k-1)
+     >              - tmp1 * njac(1,3,i,k-1)
+               lhsa(1,4,i,k) = - tmp2 * fjac(1,4,i,k-1)
+     >              - tmp1 * njac(1,4,i,k-1)
+               lhsa(1,5,i,k) = - tmp2 * fjac(1,5,i,k-1)
+     >              - tmp1 * njac(1,5,i,k-1)
+
+               lhsa(2,1,i,k) = - tmp2 * fjac(2,1,i,k-1)
+     >              - tmp1 * njac(2,1,i,k-1)
+               lhsa(2,2,i,k) = - tmp2 * fjac(2,2,i,k-1)
+     >              - tmp1 * njac(2,2,i,k-1)
+     >              - tmp1 * dz2
+               lhsa(2,3,i,k) = - tmp2 * fjac(2,3,i,k-1)
+     >              - tmp1 * njac(2,3,i,k-1)
+               lhsa(2,4,i,k) = - tmp2 * fjac(2,4,i,k-1)
+     >              - tmp1 * njac(2,4,i,k-1)
+               lhsa(2,5,i,k) = - tmp2 * fjac(2,5,i,k-1)
+     >              - tmp1 * njac(2,5,i,k-1)
+
+               lhsa(3,1,i,k) = - tmp2 * fjac(3,1,i,k-1)
+     >              - tmp1 * njac(3,1,i,k-1)
+               lhsa(3,2,i,k) = - tmp2 * fjac(3,2,i,k-1)
+     >              - tmp1 * njac(3,2,i,k-1)
+               lhsa(3,3,i,k) = - tmp2 * fjac(3,3,i,k-1)
+     >              - tmp1 * njac(3,3,i,k-1)
+     >              - tmp1 * dz3 
+               lhsa(3,4,i,k) = - tmp2 * fjac(3,4,i,k-1)
+     >              - tmp1 * njac(3,4,i,k-1)
+               lhsa(3,5,i,k) = - tmp2 * fjac(3,5,i,k-1)
+     >              - tmp1 * njac(3,5,i,k-1)
+
+               lhsa(4,1,i,k) = - tmp2 * fjac(4,1,i,k-1)
+     >              - tmp1 * njac(4,1,i,k-1)
+               lhsa(4,2,i,k) = - tmp2 * fjac(4,2,i,k-1)
+     >              - tmp1 * njac(4,2,i,k-1)
+               lhsa(4,3,i,k) = - tmp2 * fjac(4,3,i,k-1)
+     >              - tmp1 * njac(4,3,i,k-1)
+               lhsa(4,4,i,k) = - tmp2 * fjac(4,4,i,k-1)
+     >              - tmp1 * njac(4,4,i,k-1)
+     >              - tmp1 * dz4
+               lhsa(4,5,i,k) = - tmp2 * fjac(4,5,i,k-1)
+     >              - tmp1 * njac(4,5,i,k-1)
+
+               lhsa(5,1,i,k) = - tmp2 * fjac(5,1,i,k-1)
+     >              - tmp1 * njac(5,1,i,k-1)
+               lhsa(5,2,i,k) = - tmp2 * fjac(5,2,i,k-1)
+     >              - tmp1 * njac(5,2,i,k-1)
+               lhsa(5,3,i,k) = - tmp2 * fjac(5,3,i,k-1)
+     >              - tmp1 * njac(5,3,i,k-1)
+               lhsa(5,4,i,k) = - tmp2 * fjac(5,4,i,k-1)
+     >              - tmp1 * njac(5,4,i,k-1)
+               lhsa(5,5,i,k) = - tmp2 * fjac(5,5,i,k-1)
+     >              - tmp1 * njac(5,5,i,k-1)
+     >              - tmp1 * dz5
+
+               lhsb(1,1,i,k) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(1,1,i,k)
+     >              + tmp1 * 2.0d+00 * dz1
+               lhsb(1,2,i,k) = tmp1 * 2.0d+00 * njac(1,2,i,k)
+               lhsb(1,3,i,k) = tmp1 * 2.0d+00 * njac(1,3,i,k)
+               lhsb(1,4,i,k) = tmp1 * 2.0d+00 * njac(1,4,i,k)
+               lhsb(1,5,i,k) = tmp1 * 2.0d+00 * njac(1,5,i,k)
+
+               lhsb(2,1,i,k) = tmp1 * 2.0d+00 * njac(2,1,i,k)
+               lhsb(2,2,i,k) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(2,2,i,k)
+     >              + tmp1 * 2.0d+00 * dz2
+               lhsb(2,3,i,k) = tmp1 * 2.0d+00 * njac(2,3,i,k)
+               lhsb(2,4,i,k) = tmp1 * 2.0d+00 * njac(2,4,i,k)
+               lhsb(2,5,i,k) = tmp1 * 2.0d+00 * njac(2,5,i,k)
+
+               lhsb(3,1,i,k) = tmp1 * 2.0d+00 * njac(3,1,i,k)
+               lhsb(3,2,i,k) = tmp1 * 2.0d+00 * njac(3,2,i,k)
+               lhsb(3,3,i,k) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(3,3,i,k)
+     >              + tmp1 * 2.0d+00 * dz3
+               lhsb(3,4,i,k) = tmp1 * 2.0d+00 * njac(3,4,i,k)
+               lhsb(3,5,i,k) = tmp1 * 2.0d+00 * njac(3,5,i,k)
+
+               lhsb(4,1,i,k) = tmp1 * 2.0d+00 * njac(4,1,i,k)
+               lhsb(4,2,i,k) = tmp1 * 2.0d+00 * njac(4,2,i,k)
+               lhsb(4,3,i,k) = tmp1 * 2.0d+00 * njac(4,3,i,k)
+               lhsb(4,4,i,k) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(4,4,i,k)
+     >              + tmp1 * 2.0d+00 * dz4
+               lhsb(4,5,i,k) = tmp1 * 2.0d+00 * njac(4,5,i,k)
+
+               lhsb(5,1,i,k) = tmp1 * 2.0d+00 * njac(5,1,i,k)
+               lhsb(5,2,i,k) = tmp1 * 2.0d+00 * njac(5,2,i,k)
+               lhsb(5,3,i,k) = tmp1 * 2.0d+00 * njac(5,3,i,k)
+               lhsb(5,4,i,k) = tmp1 * 2.0d+00 * njac(5,4,i,k)
+               lhsb(5,5,i,k) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(5,5,i,k) 
+     >              + tmp1 * 2.0d+00 * dz5
+
+               lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,i,k+1)
+     >              - tmp1 * njac(1,1,i,k+1)
+     >              - tmp1 * dz1
+               lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,i,k+1)
+     >              - tmp1 * njac(1,2,i,k+1)
+               lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,i,k+1)
+     >              - tmp1 * njac(1,3,i,k+1)
+               lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,i,k+1)
+     >              - tmp1 * njac(1,4,i,k+1)
+               lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,i,k+1)
+     >              - tmp1 * njac(1,5,i,k+1)
+
+               lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,i,k+1)
+     >              - tmp1 * njac(2,1,i,k+1)
+               lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,i,k+1)
+     >              - tmp1 * njac(2,2,i,k+1)
+     >              - tmp1 * dz2
+               lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,i,k+1)
+     >              - tmp1 * njac(2,3,i,k+1)
+               lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,i,k+1)
+     >              - tmp1 * njac(2,4,i,k+1)
+               lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,i,k+1)
+     >              - tmp1 * njac(2,5,i,k+1)
+
+               lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,i,k+1)
+     >              - tmp1 * njac(3,1,i,k+1)
+               lhsc(3,2,i,j,k,c) =  tmp2 * fjac(3,2,i,k+1)
+     >              - tmp1 * njac(3,2,i,k+1)
+               lhsc(3,3,i,j,k,c) =  tmp2 * fjac(3,3,i,k+1)
+     >              - tmp1 * njac(3,3,i,k+1)
+     >              - tmp1 * dz3
+               lhsc(3,4,i,j,k,c) =  tmp2 * fjac(3,4,i,k+1)
+     >              - tmp1 * njac(3,4,i,k+1)
+               lhsc(3,5,i,j,k,c) =  tmp2 * fjac(3,5,i,k+1)
+     >              - tmp1 * njac(3,5,i,k+1)
+
+               lhsc(4,1,i,j,k,c) =  tmp2 * fjac(4,1,i,k+1)
+     >              - tmp1 * njac(4,1,i,k+1)
+               lhsc(4,2,i,j,k,c) =  tmp2 * fjac(4,2,i,k+1)
+     >              - tmp1 * njac(4,2,i,k+1)
+               lhsc(4,3,i,j,k,c) =  tmp2 * fjac(4,3,i,k+1)
+     >              - tmp1 * njac(4,3,i,k+1)
+               lhsc(4,4,i,j,k,c) =  tmp2 * fjac(4,4,i,k+1)
+     >              - tmp1 * njac(4,4,i,k+1)
+     >              - tmp1 * dz4
+               lhsc(4,5,i,j,k,c) =  tmp2 * fjac(4,5,i,k+1)
+     >              - tmp1 * njac(4,5,i,k+1)
+
+               lhsc(5,1,i,j,k,c) =  tmp2 * fjac(5,1,i,k+1)
+     >              - tmp1 * njac(5,1,i,k+1)
+               lhsc(5,2,i,j,k,c) =  tmp2 * fjac(5,2,i,k+1)
+     >              - tmp1 * njac(5,2,i,k+1)
+               lhsc(5,3,i,j,k,c) =  tmp2 * fjac(5,3,i,k+1)
+     >              - tmp1 * njac(5,3,i,k+1)
+               lhsc(5,4,i,j,k,c) =  tmp2 * fjac(5,4,i,k+1)
+     >              - tmp1 * njac(5,4,i,k+1)
+               lhsc(5,5,i,j,k,c) =  tmp2 * fjac(5,5,i,k+1)
+     >              - tmp1 * njac(5,5,i,k+1)
+     >              - tmp1 * dz5
+
+            enddo
+         enddo
+
+
+c---------------------------------------------------------------------
+c     outer most do loops - sweeping in i direction
+c---------------------------------------------------------------------
+         if (first .eq. 1) then 
+
+c---------------------------------------------------------------------
+c     multiply c(i,j,kstart) by b_inverse and copy back to c
+c     multiply rhs(kstart) by b_inverse(kstart) and copy to rhs
+c---------------------------------------------------------------------
+!dir$ ivdep
+            do i=start(1,c),isize
+               call binvcrhs( lhsb(1,1,i,kstart),
+     >                        lhsc(1,1,i,j,kstart,c),
+     >                        rhs(1,i,j,kstart,c) )
+            enddo
+
+         endif
+
+c---------------------------------------------------------------------
+c     begin inner most do loop
+c     do all the elements of the cell unless last 
+c---------------------------------------------------------------------
+         do k=kstart+first,ksize-last
+!dir$ ivdep
+            do i=start(1,c),isize
+
+c---------------------------------------------------------------------
+c     subtract A*lhs_vector(k-1) from lhs_vector(k)
+c     
+c     rhs(k) = rhs(k) - A*rhs(k-1)
+c---------------------------------------------------------------------
+               call matvec_sub(lhsa(1,1,i,k),
+     >                         rhs(1,i,j,k-1,c),rhs(1,i,j,k,c))
+
+c---------------------------------------------------------------------
+c     B(k) = B(k) - C(k-1)*A(k)
+c     call matmul_sub(aa,i,j,k,c,cc,i,j,k-1,c,bb,i,j,k,c)
+c---------------------------------------------------------------------
+               call matmul_sub(lhsa(1,1,i,k),
+     >                         lhsc(1,1,i,j,k-1,c),
+     >                         lhsb(1,1,i,k))
+
+c---------------------------------------------------------------------
+c     multiply c(i,j,k) by b_inverse and copy back to c
+c     multiply rhs(i,j,1) by b_inverse(i,j,1) and copy to rhs
+c---------------------------------------------------------------------
+               call binvcrhs( lhsb(1,1,i,k),
+     >                        lhsc(1,1,i,j,k,c),
+     >                        rhs(1,i,j,k,c) )
+
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     Now finish up special cases for last cell
+c---------------------------------------------------------------------
+         if (last .eq. 1) then
+
+!dir$ ivdep
+            do i=start(1,c),isize
+c---------------------------------------------------------------------
+c     rhs(ksize) = rhs(ksize) - A*rhs(ksize-1)
+c---------------------------------------------------------------------
+               call matvec_sub(lhsa(1,1,i,ksize),
+     >                         rhs(1,i,j,ksize-1,c),rhs(1,i,j,ksize,c))
+
+c---------------------------------------------------------------------
+c     B(ksize) = B(ksize) - C(ksize-1)*A(ksize)
+c     call matmul_sub(aa,i,j,ksize,c,
+c     $              cc,i,j,ksize-1,c,bb,i,j,ksize,c)
+c---------------------------------------------------------------------
+               call matmul_sub(lhsa(1,1,i,ksize),
+     >                         lhsc(1,1,i,j,ksize-1,c),
+     >                         lhsb(1,1,i,ksize))
+
+c---------------------------------------------------------------------
+c     multiply rhs(ksize) by b_inverse(ksize) and copy to rhs
+c---------------------------------------------------------------------
+               call binvrhs( lhsb(1,1,i,ksize),
+     >                       rhs(1,i,j,ksize,c) )
+            enddo
+
+         endif
+      enddo
+
+
+      return
+      end
+      
+
+
+
+
+
diff --git a/examples/smpi/NAS/CG/Makefile b/examples/smpi/NAS/CG/Makefile
new file mode 100644
index 0000000000..33e52c697b
--- /dev/null
+++ b/examples/smpi/NAS/CG/Makefile
@@ -0,0 +1,23 @@
+SHELL=/bin/sh
+BENCHMARK=cg
+BENCHMARKU=CG
+
+include ../config/make.def
+
+OBJS = cg.o ${COMMON}/print_results.o  \
+       ${COMMON}/${RAND}.o ${COMMON}/timers.o
+
+include ../sys/make.common
+
+${PROGRAM}: config ${OBJS}
+	${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
+
+cg.o:		cg.f  mpinpb.h npbparams.h
+	${FCOMPILE} cg.f
+
+clean:
+	- rm -f *.o *~ 
+	- rm -f npbparams.h core
+
+
+
diff --git a/examples/smpi/NAS/CG/cg.f b/examples/smpi/NAS/CG/cg.f
new file mode 100644
index 0000000000..0d425d78a4
--- /dev/null
+++ b/examples/smpi/NAS/CG/cg.f
@@ -0,0 +1,1787 @@
+!-------------------------------------------------------------------------!
+!                                                                         !
+!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
+!                                                                         !
+!                                   C G                                   !
+!                                                                         !
+!-------------------------------------------------------------------------!
+!                                                                         !
+!    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
+!    It is described in NAS Technical Reports 95-020 and 02-007           !
+!                                                                         !
+!    Permission to use, copy, distribute and modify this software         !
+!    for any purpose with or without fee is hereby granted.  We           !
+!    request, however, that all derived work reference the NAS            !
+!    Parallel Benchmarks 3.3. This software is provided "as is"           !
+!    without express or implied warranty.                                 !
+!                                                                         !
+!    Information on NPB 3.3, including the technical report, the          !
+!    original specifications, source code, results and information        !
+!    on how to submit new results, is available at:                       !
+!                                                                         !
+!           http://www.nas.nasa.gov/Software/NPB/                         !
+!                                                                         !
+!    Send comments or suggestions to  npb@nas.nasa.gov                    !
+!                                                                         !
+!          NAS Parallel Benchmarks Group                                  !
+!          NASA Ames Research Center                                      !
+!          Mail Stop: T27A-1                                              !
+!          Moffett Field, CA   94035-1000                                 !
+!                                                                         !
+!          E-mail:  npb@nas.nasa.gov                                      !
+!          Fax:     (650) 604-3957                                        !
+!                                                                         !
+!-------------------------------------------------------------------------!
+
+
+c---------------------------------------------------------------------
+c
+c Authors: M. Yarrow
+c          C. Kuszmaul
+c          R. F. Van der Wijngaart
+c          H. Jin
+c
+c---------------------------------------------------------------------
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      program cg
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+
+      implicit none
+
+      include 'mpinpb.h'
+      integer status(MPI_STATUS_SIZE), request, ierr
+
+      include 'npbparams.h'
+
+c---------------------------------------------------------------------
+c  num_procs must be a power of 2, and num_procs=num_proc_cols*num_proc_rows.
+c  num_proc_cols and num_proc_cols are to be found in npbparams.h.
+c  When num_procs is not square, then num_proc_cols must be = 2*num_proc_rows.
+c---------------------------------------------------------------------
+      integer    num_procs 
+      parameter( num_procs = num_proc_cols * num_proc_rows )
+
+
+
+c---------------------------------------------------------------------
+c  Class specific parameters: 
+c  It appears here for reference only.
+c  These are their values, however, this info is imported in the npbparams.h
+c  include file, which is written by the sys/setparams.c program.
+c---------------------------------------------------------------------
+
+C----------
+C  Class S:
+C----------
+CC       parameter( na=1400, 
+CC      >           nonzer=7, 
+CC      >           shift=10., 
+CC      >           niter=15,
+CC      >           rcond=1.0d-1 )
+C----------
+C  Class W:
+C----------
+CC       parameter( na=7000,
+CC      >           nonzer=8, 
+CC      >           shift=12., 
+CC      >           niter=15,
+CC      >           rcond=1.0d-1 )
+C----------
+C  Class A:
+C----------
+CC       parameter( na=14000,
+CC      >           nonzer=11, 
+CC      >           shift=20., 
+CC      >           niter=15,
+CC      >           rcond=1.0d-1 )
+C----------
+C  Class B:
+C----------
+CC       parameter( na=75000, 
+CC      >           nonzer=13, 
+CC      >           shift=60., 
+CC      >           niter=75,
+CC      >           rcond=1.0d-1 )
+C----------
+C  Class C:
+C----------
+CC       parameter( na=150000, 
+CC      >           nonzer=15, 
+CC      >           shift=110., 
+CC      >           niter=75,
+CC      >           rcond=1.0d-1 )
+C----------
+C  Class D:
+C----------
+CC       parameter( na=1500000, 
+CC      >           nonzer=21, 
+CC      >           shift=500., 
+CC      >           niter=100,
+CC      >           rcond=1.0d-1 )
+C----------
+C  Class E:
+C----------
+CC       parameter( na=9000000, 
+CC      >           nonzer=26, 
+CC      >           shift=1500., 
+CC      >           niter=100,
+CC      >           rcond=1.0d-1 )
+
+
+
+      integer    nz
+      parameter( nz = na*(nonzer+1)/num_procs*(nonzer+1)+nonzer
+     >              + na*(nonzer+2+num_procs/256)/num_proc_cols )
+
+
+
+      common / partit_size  /  naa, nzz, 
+     >                         npcols, nprows,
+     >                         proc_col, proc_row,
+     >                         firstrow, 
+     >                         lastrow, 
+     >                         firstcol, 
+     >                         lastcol,
+     >                         exch_proc,
+     >                         exch_recv_length,
+     >                         send_start,
+     >                         send_len
+      integer                  naa, nzz, 
+     >                         npcols, nprows,
+     >                         proc_col, proc_row,
+     >                         firstrow, 
+     >                         lastrow, 
+     >                         firstcol, 
+     >                         lastcol,
+     >                         exch_proc,
+     >                         exch_recv_length,
+     >                         send_start,
+     >                         send_len
+
+
+      common / main_int_mem /  colidx,     rowstr,
+     >                         iv,         arow,     acol
+      integer                  colidx(nz), rowstr(na+1),
+     >                         iv(2*na+1), arow(nz), acol(nz)
+
+
+      common / main_flt_mem /  v,       aelt,     a,
+     >                         x,
+     >                         z,
+     >                         p,
+     >                         q,
+     >                         r,
+     >                         w
+      double precision         v(na+1), aelt(nz), a(nz),
+     >                         x(na/num_proc_rows+2),
+     >                         z(na/num_proc_rows+2),
+     >                         p(na/num_proc_rows+2),
+     >                         q(na/num_proc_rows+2),
+     >                         r(na/num_proc_rows+2),
+     >                         w(na/num_proc_rows+2)
+
+
+      common /urando/          amult, tran
+      double precision         amult, tran
+
+
+
+      integer            l2npcols
+      integer            reduce_exch_proc(num_proc_cols)
+      integer            reduce_send_starts(num_proc_cols)
+      integer            reduce_send_lengths(num_proc_cols)
+      integer            reduce_recv_starts(num_proc_cols)
+      integer            reduce_recv_lengths(num_proc_cols)
+
+      integer            i, j, k, it
+
+      double precision   zeta, randlc
+      external           randlc
+      double precision   rnorm
+      double precision   norm_temp1(2), norm_temp2(2)
+
+      double precision   t, tmax, mflops
+      external           timer_read
+      double precision   timer_read
+      character          class
+      logical            verified
+      double precision   zeta_verify_value, epsilon, err
+
+
+c---------------------------------------------------------------------
+c  Set up mpi initialization and number of proc testing
+c---------------------------------------------------------------------
+      call initialize_mpi
+
+
+      if( na .eq. 1400 .and. 
+     &    nonzer .eq. 7 .and. 
+     &    niter .eq. 15 .and.
+     &    shift .eq. 10.d0 ) then
+         class = 'S'
+         zeta_verify_value = 8.5971775078648d0
+      else if( na .eq. 7000 .and. 
+     &         nonzer .eq. 8 .and. 
+     &         niter .eq. 15 .and.
+     &         shift .eq. 12.d0 ) then
+         class = 'W'
+         zeta_verify_value = 10.362595087124d0
+      else if( na .eq. 14000 .and. 
+     &         nonzer .eq. 11 .and. 
+     &         niter .eq. 15 .and.
+     &         shift .eq. 20.d0 ) then
+         class = 'A'
+         zeta_verify_value = 17.130235054029d0
+      else if( na .eq. 75000 .and. 
+     &         nonzer .eq. 13 .and. 
+     &         niter .eq. 75 .and.
+     &         shift .eq. 60.d0 ) then
+         class = 'B'
+         zeta_verify_value = 22.712745482631d0
+      else if( na .eq. 150000 .and. 
+     &         nonzer .eq. 15 .and. 
+     &         niter .eq. 75 .and.
+     &         shift .eq. 110.d0 ) then
+         class = 'C'
+         zeta_verify_value = 28.973605592845d0
+      else if( na .eq. 1500000 .and. 
+     &         nonzer .eq. 21 .and. 
+     &         niter .eq. 100 .and.
+     &         shift .eq. 500.d0 ) then
+         class = 'D'
+         zeta_verify_value = 52.514532105794d0
+      else if( na .eq. 9000000 .and. 
+     &         nonzer .eq. 26 .and. 
+     &         niter .eq. 100 .and.
+     &         shift .eq. 1.5d3 ) then
+         class = 'E'
+         zeta_verify_value = 77.522164599383d0
+      else
+         class = 'U'
+      endif
+
+      if( me .eq. root )then
+         write( *,1000 ) 
+         write( *,1001 ) na
+         write( *,1002 ) niter
+         write( *,1003 ) nprocs
+         write( *,1004 ) nonzer
+         write( *,1005 ) shift
+ 1000 format(//,' NAS Parallel Benchmarks 3.3 -- CG Benchmark', /)
+ 1001 format(' Size: ', i10 )
+ 1002 format(' Iterations: ', i5 )
+ 1003 format(' Number of active processes: ', i5 )
+ 1004 format(' Number of nonzeroes per row: ', i8)
+ 1005 format(' Eigenvalue shift: ', e8.3)
+      endif
+
+      if (.not. convertdouble) then
+         dp_type = MPI_DOUBLE_PRECISION
+      else
+         dp_type = MPI_REAL
+      endif
+
+
+      naa = na
+      nzz = nz
+
+
+c---------------------------------------------------------------------
+c  Set up processor info, such as whether sq num of procs, etc
+c---------------------------------------------------------------------
+      call setup_proc_info( num_procs, 
+     >                      num_proc_rows, 
+     >                      num_proc_cols )
+
+
+c---------------------------------------------------------------------
+c  Set up partition's submatrix info: firstcol, lastcol, firstrow, lastrow
+c---------------------------------------------------------------------
+      call setup_submatrix_info( l2npcols,
+     >                           reduce_exch_proc,
+     >                           reduce_send_starts,
+     >                           reduce_send_lengths,
+     >                           reduce_recv_starts,
+     >                           reduce_recv_lengths )
+
+
+
+c---------------------------------------------------------------------
+c  Inialize random number generator
+c---------------------------------------------------------------------
+      tran    = 314159265.0D0
+      amult   = 1220703125.0D0
+      zeta    = randlc( tran, amult )
+
+c---------------------------------------------------------------------
+c  Set up partition's sparse random matrix for given class size
+c---------------------------------------------------------------------
+      call makea(naa, nzz, a, colidx, rowstr, nonzer,
+     >           firstrow, lastrow, firstcol, lastcol, 
+     >           rcond, arow, acol, aelt, v, iv, shift)
+
+
+
+c---------------------------------------------------------------------
+c  Note: as a result of the above call to makea:
+c        values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1
+c        values of colidx which are col indexes go from firstcol --> lastcol
+c        So:
+c        Shift the col index vals from actual (firstcol --> lastcol ) 
+c        to local, i.e., (1 --> lastcol-firstcol+1)
+c---------------------------------------------------------------------
+      do j=1,lastrow-firstrow+1
+         do k=rowstr(j),rowstr(j+1)-1
+            colidx(k) = colidx(k) - firstcol + 1
+         enddo
+      enddo
+
+c---------------------------------------------------------------------
+c  set starting vector to (1, 1, .... 1)
+c---------------------------------------------------------------------
+      do i = 1, na/num_proc_rows+1
+         x(i) = 1.0D0
+      enddo
+
+      zeta  = 0.0d0
+
+c---------------------------------------------------------------------
+c---->
+c  Do one iteration untimed to init all code and data page tables
+c---->                    (then reinit, start timing, to niter its)
+c---------------------------------------------------------------------
+      do it = 1, 1
+
+c---------------------------------------------------------------------
+c  The call to the conjugate gradient routine:
+c---------------------------------------------------------------------
+         call conj_grad ( colidx,
+     >                    rowstr,
+     >                    x,
+     >                    z,
+     >                    a,
+     >                    p,
+     >                    q,
+     >                    r,
+     >                    w,
+     >                    rnorm, 
+     >                    l2npcols,
+     >                    reduce_exch_proc,
+     >                    reduce_send_starts,
+     >                    reduce_send_lengths,
+     >                    reduce_recv_starts,
+     >                    reduce_recv_lengths )
+
+c---------------------------------------------------------------------
+c  zeta = shift + 1/(x.z)
+c  So, first: (x.z)
+c  Also, find norm of z
+c  So, first: (z.z)
+c---------------------------------------------------------------------
+         norm_temp1(1) = 0.0d0
+         norm_temp1(2) = 0.0d0
+         do j=1, lastcol-firstcol+1
+            norm_temp1(1) = norm_temp1(1) + x(j)*z(j)
+            norm_temp1(2) = norm_temp1(2) + z(j)*z(j)
+         enddo
+
+         do i = 1, l2npcols
+            call mpi_irecv( norm_temp2,
+     >                      2, 
+     >                      dp_type,
+     >                      reduce_exch_proc(i),
+     >                      i,
+     >                      mpi_comm_world,
+     >                      request,
+     >                      ierr )
+            call mpi_send(  norm_temp1,
+     >                      2, 
+     >                      dp_type,
+     >                      reduce_exch_proc(i),
+     >                      i,
+     >                      mpi_comm_world,
+     >                      ierr )
+            call mpi_wait( request, status, ierr )
+
+            norm_temp1(1) = norm_temp1(1) + norm_temp2(1)
+            norm_temp1(2) = norm_temp1(2) + norm_temp2(2)
+         enddo
+
+         norm_temp1(2) = 1.0d0 / sqrt( norm_temp1(2) )
+
+
+c---------------------------------------------------------------------
+c  Normalize z to obtain x
+c---------------------------------------------------------------------
+         do j=1, lastcol-firstcol+1      
+            x(j) = norm_temp1(2)*z(j)    
+         enddo                           
+
+
+      enddo                              ! end of do one iteration untimed
+
+
+c---------------------------------------------------------------------
+c  set starting vector to (1, 1, .... 1)
+c---------------------------------------------------------------------
+c
+c  NOTE: a questionable limit on size:  should this be na/num_proc_cols+1 ?
+c
+      do i = 1, na/num_proc_rows+1
+         x(i) = 1.0D0
+      enddo
+
+      zeta  = 0.0d0
+
+c---------------------------------------------------------------------
+c  Synchronize and start timing
+c---------------------------------------------------------------------
+      call mpi_barrier( mpi_comm_world,
+     >                  ierr )
+
+      call timer_clear( 1 )
+      call timer_start( 1 )
+
+c---------------------------------------------------------------------
+c---->
+c  Main Iteration for inverse power method
+c---->
+c---------------------------------------------------------------------
+      do it = 1, niter
+
+c---------------------------------------------------------------------
+c  The call to the conjugate gradient routine:
+c---------------------------------------------------------------------
+         call conj_grad ( colidx,
+     >                    rowstr,
+     >                    x,
+     >                    z,
+     >                    a,
+     >                    p,
+     >                    q,
+     >                    r,
+     >                    w,
+     >                    rnorm, 
+     >                    l2npcols,
+     >                    reduce_exch_proc,
+     >                    reduce_send_starts,
+     >                    reduce_send_lengths,
+     >                    reduce_recv_starts,
+     >                    reduce_recv_lengths )
+
+
+c---------------------------------------------------------------------
+c  zeta = shift + 1/(x.z)
+c  So, first: (x.z)
+c  Also, find norm of z
+c  So, first: (z.z)
+c---------------------------------------------------------------------
+         norm_temp1(1) = 0.0d0
+         norm_temp1(2) = 0.0d0
+         do j=1, lastcol-firstcol+1
+            norm_temp1(1) = norm_temp1(1) + x(j)*z(j)
+            norm_temp1(2) = norm_temp1(2) + z(j)*z(j)
+         enddo
+
+         do i = 1, l2npcols
+            call mpi_irecv( norm_temp2,
+     >                      2, 
+     >                      dp_type,
+     >                      reduce_exch_proc(i),
+     >                      i,
+     >                      mpi_comm_world,
+     >                      request,
+     >                      ierr )
+            call mpi_send(  norm_temp1,
+     >                      2, 
+     >                      dp_type,
+     >                      reduce_exch_proc(i),
+     >                      i,
+     >                      mpi_comm_world,
+     >                      ierr )
+            call mpi_wait( request, status, ierr )
+
+            norm_temp1(1) = norm_temp1(1) + norm_temp2(1)
+            norm_temp1(2) = norm_temp1(2) + norm_temp2(2)
+         enddo
+
+         norm_temp1(2) = 1.0d0 / sqrt( norm_temp1(2) )
+
+
+         if( me .eq. root )then
+            zeta = shift + 1.0d0 / norm_temp1(1)
+            if( it .eq. 1 ) write( *,9000 )
+            write( *,9001 ) it, rnorm, zeta
+         endif
+ 9000 format( /,'   iteration           ||r||                 zeta' )
+ 9001 format( 4x, i5, 7x, e20.14, f20.13 )
+
+c---------------------------------------------------------------------
+c  Normalize z to obtain x
+c---------------------------------------------------------------------
+         do j=1, lastcol-firstcol+1      
+            x(j) = norm_temp1(2)*z(j)    
+         enddo                           
+
+
+      enddo                              ! end of main iter inv pow meth
+
+      call timer_stop( 1 )
+
+c---------------------------------------------------------------------
+c  End of timed section
+c---------------------------------------------------------------------
+
+      t = timer_read( 1 )
+
+      call mpi_reduce( t,
+     >                 tmax,
+     >                 1, 
+     >                 dp_type,
+     >                 MPI_MAX,
+     >                 root,
+     >                 mpi_comm_world,
+     >                 ierr )
+
+      if( me .eq. root )then
+         write(*,100)
+ 100     format(' Benchmark completed ')
+
+         epsilon = 1.d-10
+         if (class .ne. 'U') then
+
+            err = abs( zeta - zeta_verify_value )/zeta_verify_value
+            if( err .le. epsilon ) then
+               verified = .TRUE.
+               write(*, 200)
+               write(*, 201) zeta
+               write(*, 202) err
+ 200           format(' VERIFICATION SUCCESSFUL ')
+ 201           format(' Zeta is    ', E20.13)
+ 202           format(' Error is   ', E20.13)
+            else
+               verified = .FALSE.
+               write(*, 300) 
+               write(*, 301) zeta
+               write(*, 302) zeta_verify_value
+ 300           format(' VERIFICATION FAILED')
+ 301           format(' Zeta                ', E20.13)
+ 302           format(' The correct zeta is ', E20.13)
+            endif
+         else
+            verified = .FALSE.
+            write (*, 400)
+            write (*, 401)
+            write (*, 201) zeta
+ 400        format(' Problem size unknown')
+ 401        format(' NO VERIFICATION PERFORMED')
+         endif
+
+
+         if( tmax .ne. 0. ) then
+            mflops = float( 2*niter*na )
+     &                  * ( 3.+float( nonzer*(nonzer+1) )
+     &                    + 25.*(5.+float( nonzer*(nonzer+1) ))
+     &                    + 3. ) / tmax / 1000000.0
+         else
+            mflops = 0.0
+         endif
+
+         call print_results('CG', class, na, 0, 0,
+     >                      niter, nnodes_compiled, nprocs, tmax,
+     >                      mflops, '          floating point', 
+     >                      verified, npbversion, compiletime,
+     >                      cs1, cs2, cs3, cs4, cs5, cs6, cs7)
+
+
+      endif
+
+
+      call mpi_finalize(ierr)
+
+
+
+      end                              ! end main
+
+
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      subroutine initialize_mpi
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+
+      integer   ierr
+
+
+      call mpi_init( ierr )
+      call mpi_comm_rank( mpi_comm_world, me, ierr )
+      call mpi_comm_size( mpi_comm_world, nprocs, ierr )
+      root = 0
+
+
+      return
+      end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      subroutine setup_proc_info( num_procs, 
+     >                            num_proc_rows, 
+     >                            num_proc_cols )
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+
+      common / partit_size  /  naa, nzz, 
+     >                         npcols, nprows,
+     >                         proc_col, proc_row,
+     >                         firstrow, 
+     >                         lastrow, 
+     >                         firstcol, 
+     >                         lastcol,
+     >                         exch_proc,
+     >                         exch_recv_length,
+     >                         send_start,
+     >                         send_len
+      integer                  naa, nzz, 
+     >                         npcols, nprows,
+     >                         proc_col, proc_row,
+     >                         firstrow, 
+     >                         lastrow, 
+     >                         firstcol, 
+     >                         lastcol,
+     >                         exch_proc,
+     >                         exch_recv_length,
+     >                         send_start,
+     >                         send_len
+
+      integer   num_procs, num_proc_cols, num_proc_rows
+      integer   i, ierr
+      integer   log2nprocs
+
+c---------------------------------------------------------------------
+c  num_procs must be a power of 2, and num_procs=num_proc_cols*num_proc_rows
+c  When num_procs is not square, then num_proc_cols = 2*num_proc_rows
+c---------------------------------------------------------------------
+c  First, number of procs must be power of two. 
+c---------------------------------------------------------------------
+      if( nprocs .ne. num_procs )then
+          if( me .eq. root ) write( *,9000 ) nprocs, num_procs
+ 9000     format(      /,'Error: ',/,'num of procs allocated   (', 
+     >                 i4, ' )',
+     >                 /,'is not equal to',/,
+     >                 'compiled number of procs (',
+     >                 i4, ' )',/   )
+          call mpi_finalize(ierr)
+          stop
+      endif
+
+
+      i = num_proc_cols
+ 100  continue
+          if( i .ne. 1 .and. i/2*2 .ne. i )then
+              if ( me .eq. root ) then  
+                 write( *,* ) 'Error: num_proc_cols is ',
+     >                         num_proc_cols,
+     >                        ' which is not a power of two'
+              endif
+              call mpi_finalize(ierr)
+              stop
+          endif
+          i = i / 2
+          if( i .ne. 0 )then
+              goto 100
+          endif
+      
+      i = num_proc_rows
+ 200  continue
+          if( i .ne. 1 .and. i/2*2 .ne. i )then
+              if ( me .eq. root ) then 
+                 write( *,* ) 'Error: num_proc_rows is ',
+     >                         num_proc_rows,
+     >                        ' which is not a power of two'
+              endif
+              call mpi_finalize(ierr)
+              stop
+          endif
+          i = i / 2
+          if( i .ne. 0 )then
+              goto 200
+          endif
+      
+      log2nprocs = 0
+      i = nprocs
+ 300  continue
+          if( i .ne. 1 .and. i/2*2 .ne. i )then
+              write( *,* ) 'Error: nprocs is ',
+     >                      nprocs,
+     >                      ' which is not a power of two'
+              call mpi_finalize(ierr)
+              stop
+          endif
+          i = i / 2
+          if( i .ne. 0 )then
+              log2nprocs = log2nprocs + 1
+              goto 300
+          endif
+
+CC       write( *,* ) 'nprocs, log2nprocs: ',nprocs,log2nprocs
+
+      
+      npcols = num_proc_cols
+      nprows = num_proc_rows
+
+
+      return
+      end
+
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      subroutine setup_submatrix_info( l2npcols,
+     >                                 reduce_exch_proc,
+     >                                 reduce_send_starts,
+     >                                 reduce_send_lengths,
+     >                                 reduce_recv_starts,
+     >                                 reduce_recv_lengths )
+     >                                 
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+
+      integer      col_size, row_size
+
+      common / partit_size  /  naa, nzz, 
+     >                         npcols, nprows,
+     >                         proc_col, proc_row,
+     >                         firstrow, 
+     >                         lastrow, 
+     >                         firstcol, 
+     >                         lastcol,
+     >                         exch_proc,
+     >                         exch_recv_length,
+     >                         send_start,
+     >                         send_len
+      integer                  naa, nzz, 
+     >                         npcols, nprows,
+     >                         proc_col, proc_row,
+     >                         firstrow, 
+     >                         lastrow, 
+     >                         firstcol, 
+     >                         lastcol,
+     >                         exch_proc,
+     >                         exch_recv_length,
+     >                         send_start,
+     >                         send_len
+
+      integer   reduce_exch_proc(*)
+      integer   reduce_send_starts(*)
+      integer   reduce_send_lengths(*)
+      integer   reduce_recv_starts(*)
+      integer   reduce_recv_lengths(*)
+
+      integer   i, j
+      integer   div_factor
+      integer   l2npcols
+
+
+      proc_row = me / npcols
+      proc_col = me - proc_row*npcols
+
+
+
+c---------------------------------------------------------------------
+c  If naa evenly divisible by npcols, then it is evenly divisible 
+c  by nprows 
+c---------------------------------------------------------------------
+
+      if( naa/npcols*npcols .eq. naa )then
+          col_size = naa/npcols
+          firstcol = proc_col*col_size + 1
+          lastcol  = firstcol - 1 + col_size
+          row_size = naa/nprows
+          firstrow = proc_row*row_size + 1
+          lastrow  = firstrow - 1 + row_size
+c---------------------------------------------------------------------
+c  If naa not evenly divisible by npcols, then first subdivide for nprows
+c  and then, if npcols not equal to nprows (i.e., not a sq number of procs), 
+c  get col subdivisions by dividing by 2 each row subdivision.
+c---------------------------------------------------------------------
+      else
+          if( proc_row .lt. naa - naa/nprows*nprows)then
+              row_size = naa/nprows+ 1
+              firstrow = proc_row*row_size + 1
+              lastrow  = firstrow - 1 + row_size
+          else
+              row_size = naa/nprows
+              firstrow = (naa - naa/nprows*nprows)*(row_size+1)
+     >                 + (proc_row-(naa-naa/nprows*nprows))
+     >                     *row_size + 1
+              lastrow  = firstrow - 1 + row_size
+          endif
+          if( npcols .eq. nprows )then
+              if( proc_col .lt. naa - naa/npcols*npcols )then
+                  col_size = naa/npcols+ 1
+                  firstcol = proc_col*col_size + 1
+                  lastcol  = firstcol - 1 + col_size
+              else
+                  col_size = naa/npcols
+                  firstcol = (naa - naa/npcols*npcols)*(col_size+1)
+     >                     + (proc_col-(naa-naa/npcols*npcols))
+     >                         *col_size + 1
+                  lastcol  = firstcol - 1 + col_size
+              endif
+          else
+              if( (proc_col/2) .lt. 
+     >                           naa - naa/(npcols/2)*(npcols/2) )then
+                  col_size = naa/(npcols/2) + 1
+                  firstcol = (proc_col/2)*col_size + 1
+                  lastcol  = firstcol - 1 + col_size
+              else
+                  col_size = naa/(npcols/2)
+                  firstcol = (naa - naa/(npcols/2)*(npcols/2))
+     >                                                 *(col_size+1)
+     >               + ((proc_col/2)-(naa-naa/(npcols/2)*(npcols/2)))
+     >                         *col_size + 1
+                  lastcol  = firstcol - 1 + col_size
+              endif
+CC               write( *,* ) col_size,firstcol,lastcol
+              if( mod( me,2 ) .eq. 0 )then
+                  lastcol  = firstcol - 1 + (col_size-1)/2 + 1
+              else
+                  firstcol = firstcol + (col_size-1)/2 + 1
+                  lastcol  = firstcol - 1 + col_size/2
+CC                   write( *,* ) firstcol,lastcol
+              endif
+          endif
+      endif
+
+
+
+      if( npcols .eq. nprows )then
+          send_start = 1
+          send_len   = lastrow - firstrow + 1
+      else
+          if( mod( me,2 ) .eq. 0 )then
+              send_start = 1
+              send_len   = (1 + lastrow-firstrow+1)/2
+          else
+              send_start = (1 + lastrow-firstrow+1)/2 + 1
+              send_len   = (lastrow-firstrow+1)/2
+          endif
+      endif
+          
+
+
+
+c---------------------------------------------------------------------
+c  Transpose exchange processor
+c---------------------------------------------------------------------
+
+      if( npcols .eq. nprows )then
+          exch_proc = mod( me,nprows )*nprows + me/nprows
+      else
+          exch_proc = 2*(mod( me/2,nprows )*nprows + me/2/nprows)
+     >                 + mod( me,2 )
+      endif
+
+
+
+      i = npcols / 2
+      l2npcols = 0
+      do while( i .gt. 0 )
+         l2npcols = l2npcols + 1
+         i = i / 2
+      enddo
+
+
+c---------------------------------------------------------------------
+c  Set up the reduce phase schedules...
+c---------------------------------------------------------------------
+
+      div_factor = npcols
+      do i = 1, l2npcols
+
+         j = mod( proc_col+div_factor/2, div_factor )
+     >     + proc_col / div_factor * div_factor
+         reduce_exch_proc(i) = proc_row*npcols + j
+
+         div_factor = div_factor / 2
+
+      enddo
+
+
+      do i = l2npcols, 1, -1
+
+            if( nprows .eq. npcols )then
+               reduce_send_starts(i)  = send_start
+               reduce_send_lengths(i) = send_len
+               reduce_recv_lengths(i) = lastrow - firstrow + 1
+            else
+               reduce_recv_lengths(i) = send_len
+               if( i .eq. l2npcols )then
+                  reduce_send_lengths(i) = lastrow-firstrow+1 - send_len
+                  if( me/2*2 .eq. me )then
+                     reduce_send_starts(i) = send_start + send_len
+                  else
+                     reduce_send_starts(i) = 1
+                  endif
+               else
+                  reduce_send_lengths(i) = send_len
+                  reduce_send_starts(i)  = send_start
+               endif
+            endif
+            reduce_recv_starts(i) = send_start
+
+      enddo
+
+
+      exch_recv_length = lastcol - firstcol + 1
+
+
+      return
+      end
+
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      subroutine conj_grad ( colidx,
+     >                       rowstr,
+     >                       x,
+     >                       z,
+     >                       a,
+     >                       p,
+     >                       q,
+     >                       r,
+     >                       w,
+     >                       rnorm, 
+     >                       l2npcols,
+     >                       reduce_exch_proc,
+     >                       reduce_send_starts,
+     >                       reduce_send_lengths,
+     >                       reduce_recv_starts,
+     >                       reduce_recv_lengths )
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c  Floaging point arrays here are named as in NPB1 spec discussion of 
+c  CG algorithm
+c---------------------------------------------------------------------
+ 
+      implicit none
+
+      include 'mpinpb.h'
+
+      integer status(MPI_STATUS_SIZE ), request
+
+
+      common / partit_size  /  naa, nzz, 
+     >                         npcols, nprows,
+     >                         proc_col, proc_row,
+     >                         firstrow, 
+     >                         lastrow, 
+     >                         firstcol, 
+     >                         lastcol,
+     >                         exch_proc,
+     >                         exch_recv_length,
+     >                         send_start,
+     >                         send_len
+      integer                  naa, nzz, 
+     >                         npcols, nprows,
+     >                         proc_col, proc_row,
+     >                         firstrow, 
+     >                         lastrow, 
+     >                         firstcol, 
+     >                         lastcol,
+     >                         exch_proc,
+     >                         exch_recv_length,
+     >                         send_start,
+     >                         send_len
+
+
+
+      double precision   x(*),
+     >                   z(*),
+     >                   a(nzz)
+      integer            colidx(nzz), rowstr(naa+1)
+
+      double precision   p(*),
+     >                   q(*),
+     >                   r(*),               
+     >                   w(*)                ! used as work temporary
+
+      integer   l2npcols
+      integer   reduce_exch_proc(l2npcols)
+      integer   reduce_send_starts(l2npcols)
+      integer   reduce_send_lengths(l2npcols)
+      integer   reduce_recv_starts(l2npcols)
+      integer   reduce_recv_lengths(l2npcols)
+
+      integer   i, j, k, ierr
+      integer   cgit, cgitmax
+
+      double precision   d, sum, rho, rho0, alpha, beta, rnorm
+
+      external         timer_read
+      double precision timer_read
+
+      data      cgitmax / 25 /
+
+
+c---------------------------------------------------------------------
+c  Initialize the CG algorithm:
+c---------------------------------------------------------------------
+      do j=1,naa/nprows+1
+         q(j) = 0.0d0
+         z(j) = 0.0d0
+         r(j) = x(j)
+         p(j) = r(j)
+         w(j) = 0.0d0                 
+      enddo
+
+
+c---------------------------------------------------------------------
+c  rho = r.r
+c  Now, obtain the norm of r: First, sum squares of r elements locally...
+c---------------------------------------------------------------------
+      sum = 0.0d0
+      do j=1, lastcol-firstcol+1
+         sum = sum + r(j)*r(j)
+      enddo
+
+c---------------------------------------------------------------------
+c  Exchange and sum with procs identified in reduce_exch_proc
+c  (This is equivalent to mpi_allreduce.)
+c  Sum the partial sums of rho, leaving rho on all processors
+c---------------------------------------------------------------------
+      do i = 1, l2npcols
+         call mpi_irecv( rho,
+     >                   1,
+     >                   dp_type,
+     >                   reduce_exch_proc(i),
+     >                   i,
+     >                   mpi_comm_world,
+     >                   request,
+     >                   ierr )
+         call mpi_send(  sum,
+     >                   1,
+     >                   dp_type,
+     >                   reduce_exch_proc(i),
+     >                   i,
+     >                   mpi_comm_world,
+     >                   ierr )
+         call mpi_wait( request, status, ierr )
+
+         sum = sum + rho
+      enddo
+      rho = sum
+
+
+
+c---------------------------------------------------------------------
+c---->
+c  The conj grad iteration loop
+c---->
+c---------------------------------------------------------------------
+      do cgit = 1, cgitmax
+
+
+c---------------------------------------------------------------------
+c  q = A.p
+c  The partition submatrix-vector multiply: use workspace w
+c---------------------------------------------------------------------
+         do j=1,lastrow-firstrow+1
+            sum = 0.d0
+            do k=rowstr(j),rowstr(j+1)-1
+               sum = sum + a(k)*p(colidx(k))
+            enddo
+            w(j) = sum
+         enddo
+
+c---------------------------------------------------------------------
+c  Sum the partition submatrix-vec A.p's across rows
+c  Exchange and sum piece of w with procs identified in reduce_exch_proc
+c---------------------------------------------------------------------
+         do i = l2npcols, 1, -1
+            call mpi_irecv( q(reduce_recv_starts(i)),
+     >                      reduce_recv_lengths(i),
+     >                      dp_type,
+     >                      reduce_exch_proc(i),
+     >                      i,
+     >                      mpi_comm_world,
+     >                      request,
+     >                      ierr )
+            call mpi_send(  w(reduce_send_starts(i)),
+     >                      reduce_send_lengths(i),
+     >                      dp_type,
+     >                      reduce_exch_proc(i),
+     >                      i,
+     >                      mpi_comm_world,
+     >                      ierr )
+            call mpi_wait( request, status, ierr )
+            do j=send_start,send_start + reduce_recv_lengths(i) - 1
+               w(j) = w(j) + q(j)
+            enddo
+         enddo
+      
+
+c---------------------------------------------------------------------
+c  Exchange piece of q with transpose processor:
+c---------------------------------------------------------------------
+         if( l2npcols .ne. 0 )then
+            call mpi_irecv( q,               
+     >                      exch_recv_length,
+     >                      dp_type,
+     >                      exch_proc,
+     >                      1,
+     >                      mpi_comm_world,
+     >                      request,
+     >                      ierr )
+
+            call mpi_send(  w(send_start),   
+     >                      send_len,
+     >                      dp_type,
+     >                      exch_proc,
+     >                      1,
+     >                      mpi_comm_world,
+     >                      ierr )
+            call mpi_wait( request, status, ierr )
+         else
+            do j=1,exch_recv_length
+               q(j) = w(j)
+            enddo
+         endif
+
+
+c---------------------------------------------------------------------
+c  Clear w for reuse...
+c---------------------------------------------------------------------
+         do j=1, max( lastrow-firstrow+1, lastcol-firstcol+1 )
+            w(j) = 0.0d0
+         enddo
+         
+
+c---------------------------------------------------------------------
+c  Obtain p.q
+c---------------------------------------------------------------------
+         sum = 0.0d0
+         do j=1, lastcol-firstcol+1
+            sum = sum + p(j)*q(j)
+         enddo
+
+c---------------------------------------------------------------------
+c  Obtain d with a sum-reduce
+c---------------------------------------------------------------------
+         do i = 1, l2npcols
+            call mpi_irecv( d,
+     >                      1,
+     >                      dp_type,
+     >                      reduce_exch_proc(i),
+     >                      i,
+     >                      mpi_comm_world,
+     >                      request,
+     >                      ierr )
+            call mpi_send(  sum,
+     >                      1,
+     >                      dp_type,
+     >                      reduce_exch_proc(i),
+     >                      i,
+     >                      mpi_comm_world,
+     >                      ierr )
+
+            call mpi_wait( request, status, ierr )
+
+            sum = sum + d
+         enddo
+         d = sum
+
+
+c---------------------------------------------------------------------
+c  Obtain alpha = rho / (p.q)
+c---------------------------------------------------------------------
+         alpha = rho / d
+
+c---------------------------------------------------------------------
+c  Save a temporary of rho
+c---------------------------------------------------------------------
+         rho0 = rho
+
+c---------------------------------------------------------------------
+c  Obtain z = z + alpha*p
+c  and    r = r - alpha*q
+c---------------------------------------------------------------------
+         do j=1, lastcol-firstcol+1
+            z(j) = z(j) + alpha*p(j)
+            r(j) = r(j) - alpha*q(j)
+         enddo
+            
+c---------------------------------------------------------------------
+c  rho = r.r
+c  Now, obtain the norm of r: First, sum squares of r elements locally...
+c---------------------------------------------------------------------
+         sum = 0.0d0
+         do j=1, lastcol-firstcol+1
+            sum = sum + r(j)*r(j)
+         enddo
+
+c---------------------------------------------------------------------
+c  Obtain rho with a sum-reduce
+c---------------------------------------------------------------------
+         do i = 1, l2npcols
+            call mpi_irecv( rho,
+     >                      1,
+     >                      dp_type,
+     >                      reduce_exch_proc(i),
+     >                      i,
+     >                      mpi_comm_world,
+     >                      request,
+     >                      ierr )
+            call mpi_send(  sum,
+     >                      1,
+     >                      dp_type,
+     >                      reduce_exch_proc(i),
+     >                      i,
+     >                      mpi_comm_world,
+     >                      ierr )
+            call mpi_wait( request, status, ierr )
+
+            sum = sum + rho
+         enddo
+         rho = sum
+
+c---------------------------------------------------------------------
+c  Obtain beta:
+c---------------------------------------------------------------------
+         beta = rho / rho0
+
+c---------------------------------------------------------------------
+c  p = r + beta*p
+c---------------------------------------------------------------------
+         do j=1, lastcol-firstcol+1
+            p(j) = r(j) + beta*p(j)
+         enddo
+
+
+
+      enddo                             ! end of do cgit=1,cgitmax
+
+
+
+c---------------------------------------------------------------------
+c  Compute residual norm explicitly:  ||r|| = ||x - A.z||
+c  First, form A.z
+c  The partition submatrix-vector multiply
+c---------------------------------------------------------------------
+      do j=1,lastrow-firstrow+1
+         sum = 0.d0
+         do k=rowstr(j),rowstr(j+1)-1
+            sum = sum + a(k)*z(colidx(k))
+         enddo
+         w(j) = sum
+      enddo
+
+
+
+c---------------------------------------------------------------------
+c  Sum the partition submatrix-vec A.z's across rows
+c---------------------------------------------------------------------
+      do i = l2npcols, 1, -1
+         call mpi_irecv( r(reduce_recv_starts(i)),
+     >                   reduce_recv_lengths(i),
+     >                   dp_type,
+     >                   reduce_exch_proc(i),
+     >                   i,
+     >                   mpi_comm_world,
+     >                   request,
+     >                   ierr )
+         call mpi_send(  w(reduce_send_starts(i)),
+     >                   reduce_send_lengths(i),
+     >                   dp_type,
+     >                   reduce_exch_proc(i),
+     >                   i,
+     >                   mpi_comm_world,
+     >                   ierr )
+         call mpi_wait( request, status, ierr )
+
+         do j=send_start,send_start + reduce_recv_lengths(i) - 1
+            w(j) = w(j) + r(j)
+         enddo
+      enddo
+      
+
+c---------------------------------------------------------------------
+c  Exchange piece of q with transpose processor:
+c---------------------------------------------------------------------
+      if( l2npcols .ne. 0 )then
+         call mpi_irecv( r,               
+     >                   exch_recv_length,
+     >                   dp_type,
+     >                   exch_proc,
+     >                   1,
+     >                   mpi_comm_world,
+     >                   request,
+     >                   ierr )
+   
+         call mpi_send(  w(send_start),   
+     >                   send_len,
+     >                   dp_type,
+     >                   exch_proc,
+     >                   1,
+     >                   mpi_comm_world,
+     >                   ierr )
+         call mpi_wait( request, status, ierr )
+      else
+         do j=1,exch_recv_length
+            r(j) = w(j)
+         enddo
+      endif
+
+
+c---------------------------------------------------------------------
+c  At this point, r contains A.z
+c---------------------------------------------------------------------
+         sum = 0.0d0
+         do j=1, lastcol-firstcol+1
+            d   = x(j) - r(j)         
+            sum = sum + d*d
+         enddo
+         
+c---------------------------------------------------------------------
+c  Obtain d with a sum-reduce
+c---------------------------------------------------------------------
+      do i = 1, l2npcols
+         call mpi_irecv( d,
+     >                   1,
+     >                   dp_type,
+     >                   reduce_exch_proc(i),
+     >                   i,
+     >                   mpi_comm_world,
+     >                   request,
+     >                   ierr )
+         call mpi_send(  sum,
+     >                   1,
+     >                   dp_type,
+     >                   reduce_exch_proc(i),
+     >                   i,
+     >                   mpi_comm_world,
+     >                   ierr )
+         call mpi_wait( request, status, ierr )
+
+         sum = sum + d
+      enddo
+      d = sum
+
+
+      if( me .eq. root ) rnorm = sqrt( d )
+
+
+
+      return
+      end                               ! end of routine conj_grad
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      subroutine makea( n, nz, a, colidx, rowstr, nonzer,
+     >                  firstrow, lastrow, firstcol, lastcol,
+     >                  rcond, arow, acol, aelt, v, iv, shift )
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      integer             n, nz
+      integer             firstrow, lastrow, firstcol, lastcol
+      integer             colidx(nz), rowstr(n+1)
+      integer             iv(2*n+1), arow(nz), acol(nz)
+      double precision    v(n+1), aelt(nz)
+      double precision    rcond, a(nz), shift
+
+c---------------------------------------------------------------------
+c       generate the test problem for benchmark 6
+c       makea generates a sparse matrix with a
+c       prescribed sparsity distribution
+c
+c       parameter    type        usage
+c
+c       input
+c
+c       n            i           number of cols/rows of matrix
+c       nz           i           nonzeros as declared array size
+c       rcond        r*8         condition number
+c       shift        r*8         main diagonal shift
+c
+c       output
+c
+c       a            r*8         array for nonzeros
+c       colidx       i           col indices
+c       rowstr       i           row pointers
+c
+c       workspace
+c
+c       iv, arow, acol i
+c       v, aelt        r*8
+c---------------------------------------------------------------------
+
+      integer i, nnza, iouter, ivelt, ivelt1, irow, nzv, NONZER
+
+c---------------------------------------------------------------------
+c      nonzer is approximately  (int(sqrt(nnza /n)));
+c---------------------------------------------------------------------
+
+      double precision  size, ratio, scale
+      external          sparse, sprnvc, vecset
+
+      size = 1.0D0
+      ratio = rcond ** (1.0D0 / dfloat(n))
+      nnza = 0
+
+c---------------------------------------------------------------------
+c  Initialize iv(n+1 .. 2n) to zero.
+c  Used by sprnvc to mark nonzero positions
+c---------------------------------------------------------------------
+
+      do i = 1, n
+           iv(n+i) = 0
+      enddo
+      do iouter = 1, n
+         nzv = nonzer
+         call sprnvc( n, nzv, v, colidx, iv(1), iv(n+1) )
+         call vecset( n, v, colidx, nzv, iouter, .5D0 )
+         do ivelt = 1, nzv
+              jcol = colidx(ivelt)
+              if (jcol.ge.firstcol .and. jcol.le.lastcol) then
+                 scale = size * v(ivelt)
+                 do ivelt1 = 1, nzv
+                    irow = colidx(ivelt1)
+                    if (irow.ge.firstrow .and. irow.le.lastrow) then
+                       nnza = nnza + 1
+                       if (nnza .gt. nz) goto 9999
+                       acol(nnza) = jcol
+                       arow(nnza) = irow
+                       aelt(nnza) = v(ivelt1) * scale
+                    endif
+                 enddo
+              endif
+         enddo
+         size = size * ratio
+      enddo
+
+
+c---------------------------------------------------------------------
+c       ... add the identity * rcond to the generated matrix to bound
+c           the smallest eigenvalue from below by rcond
+c---------------------------------------------------------------------
+        do i = firstrow, lastrow
+           if (i.ge.firstcol .and. i.le.lastcol) then
+              iouter = n + i
+              nnza = nnza + 1
+              if (nnza .gt. nz) goto 9999
+              acol(nnza) = i
+              arow(nnza) = i
+              aelt(nnza) = rcond - shift
+           endif
+        enddo
+
+
+c---------------------------------------------------------------------
+c       ... make the sparse matrix from list of elements with duplicates
+c           (v and iv are used as  workspace)
+c---------------------------------------------------------------------
+      call sparse( a, colidx, rowstr, n, arow, acol, aelt,
+     >             firstrow, lastrow,
+     >             v, iv(1), iv(n+1), nnza )
+      return
+
+ 9999 continue
+      write(*,*) 'Space for matrix elements exceeded in makea'
+      write(*,*) 'nnza, nzmax = ',nnza, nz
+      write(*,*) ' iouter = ',iouter
+
+      stop
+      end
+c-------end   of makea------------------------------
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      subroutine sparse( a, colidx, rowstr, n, arow, acol, aelt,
+     >                   firstrow, lastrow,
+     >                   x, mark, nzloc, nnza )
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit           logical (a-z)
+      integer            colidx(*), rowstr(*)
+      integer            firstrow, lastrow
+      integer            n, arow(*), acol(*), nnza
+      double precision   a(*), aelt(*)
+
+c---------------------------------------------------------------------
+c       rows range from firstrow to lastrow
+c       the rowstr pointers are defined for nrows = lastrow-firstrow+1 values
+c---------------------------------------------------------------------
+      integer            nzloc(n), nrows
+      double precision   x(n)
+      logical            mark(n)
+
+c---------------------------------------------------
+c       generate a sparse matrix from a list of
+c       [col, row, element] tri
+c---------------------------------------------------
+
+      integer            i, j, jajp1, nza, k, nzrow
+      double precision   xi
+
+c---------------------------------------------------------------------
+c    how many rows of result
+c---------------------------------------------------------------------
+      nrows = lastrow - firstrow + 1
+
+c---------------------------------------------------------------------
+c     ...count the number of triples in each row
+c---------------------------------------------------------------------
+      do j = 1, n
+         rowstr(j) = 0
+         mark(j) = .false.
+      enddo
+      rowstr(n+1) = 0
+
+      do nza = 1, nnza
+         j = (arow(nza) - firstrow + 1) + 1
+         rowstr(j) = rowstr(j) + 1
+      enddo
+
+      rowstr(1) = 1
+      do j = 2, nrows+1
+         rowstr(j) = rowstr(j) + rowstr(j-1)
+      enddo
+
+
+c---------------------------------------------------------------------
+c     ... rowstr(j) now is the location of the first nonzero
+c           of row j of a
+c---------------------------------------------------------------------
+
+
+c---------------------------------------------------------------------
+c     ... do a bucket sort of the triples on the row index
+c---------------------------------------------------------------------
+      do nza = 1, nnza
+         j = arow(nza) - firstrow + 1
+         k = rowstr(j)
+         a(k) = aelt(nza)
+         colidx(k) = acol(nza)
+         rowstr(j) = rowstr(j) + 1
+      enddo
+
+
+c---------------------------------------------------------------------
+c       ... rowstr(j) now points to the first element of row j+1
+c---------------------------------------------------------------------
+      do j = nrows, 1, -1
+          rowstr(j+1) = rowstr(j)
+      enddo
+      rowstr(1) = 1
+
+
+c---------------------------------------------------------------------
+c       ... generate the actual output rows by adding elements
+c---------------------------------------------------------------------
+      nza = 0
+      do i = 1, n
+          x(i)    = 0.0
+          mark(i) = .false.
+      enddo
+
+      jajp1 = rowstr(1)
+      do j = 1, nrows
+         nzrow = 0
+
+c---------------------------------------------------------------------
+c          ...loop over the jth row of a
+c---------------------------------------------------------------------
+         do k = jajp1 , rowstr(j+1)-1
+            i = colidx(k)
+            x(i) = x(i) + a(k)
+            if ( (.not. mark(i)) .and. (x(i) .ne. 0.D0)) then
+             mark(i) = .true.
+             nzrow = nzrow + 1
+             nzloc(nzrow) = i
+            endif
+         enddo
+
+c---------------------------------------------------------------------
+c          ... extract the nonzeros of this row
+c---------------------------------------------------------------------
+         do k = 1, nzrow
+            i = nzloc(k)
+            mark(i) = .false.
+            xi = x(i)
+            x(i) = 0.D0
+            if (xi .ne. 0.D0) then
+             nza = nza + 1
+             a(nza) = xi
+             colidx(nza) = i
+            endif
+         enddo
+         jajp1 = rowstr(j+1)
+         rowstr(j+1) = nza + rowstr(1)
+      enddo
+CC       write (*, 11000) nza
+      return
+11000   format ( //,'final nonzero count in sparse ',
+     1            /,'number of nonzeros       = ', i16 )
+      end
+c-------end   of sparse-----------------------------
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      subroutine sprnvc( n, nz, v, iv, nzloc, mark )
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit           logical (a-z)
+      double precision   v(*)
+      integer            n, nz, iv(*), nzloc(n), nn1
+      integer mark(n)
+      common /urando/    amult, tran
+      double precision   amult, tran
+
+
+c---------------------------------------------------------------------
+c       generate a sparse n-vector (v, iv)
+c       having nzv nonzeros
+c
+c       mark(i) is set to 1 if position i is nonzero.
+c       mark is all zero on entry and is reset to all zero before exit
+c       this corrects a performance bug found by John G. Lewis, caused by
+c       reinitialization of mark on every one of the n calls to sprnvc
+c---------------------------------------------------------------------
+
+        integer            nzrow, nzv, ii, i, icnvrt
+
+        external           randlc, icnvrt
+        double precision   randlc, vecelt, vecloc
+
+
+        nzv = 0
+        nzrow = 0
+        nn1 = 1
+ 50     continue
+          nn1 = 2 * nn1
+          if (nn1 .lt. n) goto 50
+
+c---------------------------------------------------------------------
+c    nn1 is the smallest power of two not less than n
+c---------------------------------------------------------------------
+
+100     continue
+        if (nzv .ge. nz) goto 110
+         vecelt = randlc( tran, amult )
+
+c---------------------------------------------------------------------
+c   generate an integer between 1 and n in a portable manner
+c---------------------------------------------------------------------
+         vecloc = randlc(tran, amult)
+         i = icnvrt(vecloc, nn1) + 1
+         if (i .gt. n) goto 100
+
+c---------------------------------------------------------------------
+c  was this integer generated already?
+c---------------------------------------------------------------------
+         if (mark(i) .eq. 0) then
+            mark(i) = 1
+            nzrow = nzrow + 1
+            nzloc(nzrow) = i
+            nzv = nzv + 1
+            v(nzv) = vecelt
+            iv(nzv) = i
+         endif
+         goto 100
+110      continue
+      do ii = 1, nzrow
+         i = nzloc(ii)
+         mark(i) = 0
+      enddo
+      return
+      end
+c-------end   of sprnvc-----------------------------
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      function icnvrt(x, ipwr2)
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit           logical (a-z)
+      double precision   x
+      integer            ipwr2, icnvrt
+
+c---------------------------------------------------------------------
+c    scale a double precision number x in (0,1) by a power of 2 and chop it
+c---------------------------------------------------------------------
+      icnvrt = int(ipwr2 * x)
+
+      return
+      end
+c-------end   of icnvrt-----------------------------
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      subroutine vecset(n, v, iv, nzv, i, val)
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit           logical (a-z)
+      integer            n, iv(*), nzv, i, k
+      double precision   v(*), val
+
+c---------------------------------------------------------------------
+c       set ith element of sparse vector (v, iv) with
+c       nzv nonzeros to val
+c---------------------------------------------------------------------
+
+      logical set
+
+      set = .false.
+      do k = 1, nzv
+         if (iv(k) .eq. i) then
+            v(k) = val
+            set  = .true.
+         endif
+      enddo
+      if (.not. set) then
+         nzv     = nzv + 1
+         v(nzv)  = val
+         iv(nzv) = i
+      endif
+      return
+      end
+c-------end   of vecset-----------------------------
+
diff --git a/examples/smpi/NAS/CG/mpinpb.h b/examples/smpi/NAS/CG/mpinpb.h
new file mode 100644
index 0000000000..1f0368c0b7
--- /dev/null
+++ b/examples/smpi/NAS/CG/mpinpb.h
@@ -0,0 +1,9 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include           'mpif.h'
+
+      integer           me, nprocs, root, dp_type
+      common /mpistuff/ me, nprocs, root, dp_type
+
diff --git a/examples/smpi/NAS/DT/DGraph.c b/examples/smpi/NAS/DT/DGraph.c
new file mode 100644
index 0000000000..5d5839df19
--- /dev/null
+++ b/examples/smpi/NAS/DT/DGraph.c
@@ -0,0 +1,184 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "DGraph.h"
+
+DGArc *newArc(DGNode *tl,DGNode *hd){
+  DGArc *ar=(DGArc *)malloc(sizeof(DGArc));
+  ar->tail=tl;
+  ar->head=hd;
+  return ar;
+}
+void arcShow(DGArc *ar){
+  DGNode *tl=(DGNode *)ar->tail,
+         *hd=(DGNode *)ar->head;
+  fprintf(stderr,"%d. |%s ->%s\n",ar->id,tl->name,hd->name);
+}
+
+DGNode *newNode(char *nm){
+  DGNode *nd=(DGNode *)malloc(sizeof(DGNode));
+  nd->attribute=0;
+  nd->color=0;
+  nd->inDegree=0;
+  nd->outDegree=0;
+  nd->maxInDegree=SMALL_BLOCK_SIZE;
+  nd->maxOutDegree=SMALL_BLOCK_SIZE;
+  nd->inArc=(DGArc **)malloc(nd->maxInDegree*sizeof(DGArc*));
+  nd->outArc=(DGArc **)malloc(nd->maxOutDegree*sizeof(DGArc*));
+  nd->name=strdup(nm);
+  nd->feat=NULL;
+  return nd;
+}
+void nodeShow(DGNode* nd){
+  fprintf( stderr,"%3d.%s: (%d,%d)\n",
+	           nd->id,nd->name,nd->inDegree,nd->outDegree);
+/*
+  if(nd->verified==1) fprintf(stderr,"%ld.%s\t: usable.",nd->id,nd->name);
+  else if(nd->verified==0)  fprintf(stderr,"%ld.%s\t: unusable.",nd->id,nd->name);
+  else  fprintf(stderr,"%ld.%s\t: notverified.",nd->id,nd->name);   
+*/
+}
+
+DGraph* newDGraph(char* nm){
+  DGraph *dg=(DGraph *)malloc(sizeof(DGraph));
+  dg->numNodes=0;
+  dg->numArcs=0;
+  dg->maxNodes=BLOCK_SIZE;
+  dg->maxArcs=BLOCK_SIZE;
+  dg->node=(DGNode **)malloc(dg->maxNodes*sizeof(DGNode*));
+  dg->arc=(DGArc **)malloc(dg->maxArcs*sizeof(DGArc*));
+  dg->name=strdup(nm);
+  return dg;
+}
+int AttachNode(DGraph* dg, DGNode* nd) {
+  int i=0,j,len=0;
+  DGNode **nds =NULL, *tmpnd=NULL;
+  DGArc **ar=NULL;
+
+	if (dg->numNodes == dg->maxNodes-1 ) {
+	  dg->maxNodes += BLOCK_SIZE;
+          nds =(DGNode **) calloc(dg->maxNodes,sizeof(DGNode*));
+	  memcpy(nds,dg->node,(dg->maxNodes-BLOCK_SIZE)*sizeof(DGNode*));
+	  free(dg->node);
+	  dg->node=nds;
+	}
+
+        len = strlen( nd->name);
+	for (i = 0; i < dg->numNodes; i++) {
+	  tmpnd =dg->node[ i];
+	  ar=NULL;
+	  if ( strlen( tmpnd->name) != len ) continue;
+	  if ( strncmp( nd->name, tmpnd->name, len) ) continue;
+	  if ( nd->inDegree > 0 ) {
+	    tmpnd->maxInDegree += nd->maxInDegree;
+            ar =(DGArc **) calloc(tmpnd->maxInDegree,sizeof(DGArc*));
+	    memcpy(ar,tmpnd->inArc,(tmpnd->inDegree)*sizeof(DGArc*));
+	    free(tmpnd->inArc);
+	    tmpnd->inArc=ar;
+	    for (j = 0; j < nd->inDegree; j++ ) {
+	      nd->inArc[ j]->head = tmpnd;
+	    }
+	    memcpy( &(tmpnd->inArc[ tmpnd->inDegree]), nd->inArc, nd->inDegree*sizeof( DGArc *));
+	    tmpnd->inDegree += nd->inDegree;
+	  } 	
+	  if ( nd->outDegree > 0 ) {
+	    tmpnd->maxOutDegree += nd->maxOutDegree;
+            ar =(DGArc **) calloc(tmpnd->maxOutDegree,sizeof(DGArc*));
+	    memcpy(ar,tmpnd->outArc,(tmpnd->outDegree)*sizeof(DGArc*));
+	    free(tmpnd->outArc);
+	    tmpnd->outArc=ar;
+	    for (j = 0; j < nd->outDegree; j++ ) {
+	      nd->outArc[ j]->tail = tmpnd;
+	    }			
+	    memcpy( &(tmpnd->outArc[tmpnd->outDegree]),nd->outArc,nd->outDegree*sizeof( DGArc *));
+	    tmpnd->outDegree += nd->outDegree;
+	  } 
+	  free(nd); 
+	  return i;
+	}
+	nd->id = dg->numNodes;
+	dg->node[dg->numNodes] = nd;
+	dg->numNodes++;
+return nd->id;
+}
+int AttachArc(DGraph *dg,DGArc* nar){
+int	arcId = -1;
+int i=0,newNumber=0;
+DGNode	*head = nar->head,
+	*tail = nar->tail; 
+DGArc **ars=NULL,*probe=NULL;
+/*fprintf(stderr,"AttachArc %ld\n",dg->numArcs); */
+	if ( !tail || !head ) return arcId;
+	if ( dg->numArcs == dg->maxArcs-1 ) {
+	  dg->maxArcs += BLOCK_SIZE;
+          ars =(DGArc **) calloc(dg->maxArcs,sizeof(DGArc*));
+	  memcpy(ars,dg->arc,(dg->maxArcs-BLOCK_SIZE)*sizeof(DGArc*));
+	  free(dg->arc);
+	  dg->arc=ars;
+	}
+	for(i = 0; i < tail->outDegree; i++ ) { /* parallel arc */
+	  probe = tail->outArc[ i];
+	  if(probe->head == head
+	     &&
+	     probe->length == nar->length
+            ){
+            free(nar);
+	    return probe->id;   
+	  }
+	}
+	
+	nar->id = dg->numArcs;
+	arcId=dg->numArcs;
+	dg->arc[dg->numArcs] = nar;
+	dg->numArcs++;
+	
+	head->inArc[ head->inDegree] = nar;
+	head->inDegree++;
+	if ( head->inDegree >= head->maxInDegree ) {
+	  newNumber = head->maxInDegree + SMALL_BLOCK_SIZE;
+          ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
+	  memcpy(ars,head->inArc,(head->inDegree)*sizeof(DGArc*));
+	  free(head->inArc);
+	  head->inArc=ars;
+	  head->maxInDegree = newNumber;
+	}
+	tail->outArc[ tail->outDegree] = nar;
+	tail->outDegree++;
+	if(tail->outDegree >= tail->maxOutDegree ) {
+	  newNumber = tail->maxOutDegree + SMALL_BLOCK_SIZE;
+          ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
+	  memcpy(ars,tail->outArc,(tail->outDegree)*sizeof(DGArc*));
+	  free(tail->outArc);
+	  tail->outArc=ars;
+	  tail->maxOutDegree = newNumber;
+	}
+/*fprintf(stderr,"AttachArc: head->in=%d tail->out=%ld\n",head->inDegree,tail->outDegree);*/
+return arcId;
+}
+void graphShow(DGraph *dg,int DetailsLevel){
+  int i=0,j=0;
+  fprintf(stderr,"%d.%s: (%d,%d)\n",dg->id,dg->name,dg->numNodes,dg->numArcs);
+  if ( DetailsLevel < 1) return;
+  for (i = 0; i < dg->numNodes; i++ ) {
+    DGNode *focusNode = dg->node[ i];
+    if(DetailsLevel >= 2) {
+      for (j = 0; j < focusNode->inDegree; j++ ) {
+	fprintf(stderr,"\t ");
+	nodeShow(focusNode->inArc[ j]->tail);
+      }
+    }
+    nodeShow(focusNode);
+    if ( DetailsLevel < 2) continue;
+    for (j = 0; j < focusNode->outDegree; j++ ) {
+      fprintf(stderr, "\t ");
+      nodeShow(focusNode->outArc[ j]->head);
+    }	
+    fprintf(stderr, "---\n");
+  }
+  fprintf(stderr,"----------------------------------------\n");
+  if ( DetailsLevel < 3) return;
+}
+
+
+
diff --git a/examples/smpi/NAS/DT/DGraph.h b/examples/smpi/NAS/DT/DGraph.h
new file mode 100644
index 0000000000..f38f898b24
--- /dev/null
+++ b/examples/smpi/NAS/DT/DGraph.h
@@ -0,0 +1,43 @@
+#ifndef _DGRAPH
+#define _DGRAPH
+
+#define BLOCK_SIZE  128
+#define SMALL_BLOCK_SIZE 32
+
+typedef struct{
+  int id;
+  void *tail,*head;
+  int length,width,attribute,maxWidth;
+}DGArc;
+
+typedef struct{
+  int maxInDegree,maxOutDegree;
+  int inDegree,outDegree;
+  int id;
+  char *name;
+  DGArc **inArc,**outArc;
+  int depth,height,width;
+  int color,attribute,address,verified;
+  void *feat;
+}DGNode;
+
+typedef struct{
+  int maxNodes,maxArcs;
+  int id;
+  char *name;
+  int numNodes,numArcs;
+  DGNode **node;
+  DGArc **arc;
+} DGraph;
+
+DGArc *newArc(DGNode *tl,DGNode *hd);
+void arcShow(DGArc *ar);
+DGNode *newNode(char *nm);
+void nodeShow(DGNode* nd);
+
+DGraph* newDGraph(char *nm);
+int AttachNode(DGraph *dg,DGNode *nd);
+int AttachArc(DGraph *dg,DGArc* nar);
+void graphShow(DGraph *dg,int DetailsLevel);
+
+#endif
diff --git a/examples/smpi/NAS/DT/Makefile b/examples/smpi/NAS/DT/Makefile
new file mode 100644
index 0000000000..28d9502fe2
--- /dev/null
+++ b/examples/smpi/NAS/DT/Makefile
@@ -0,0 +1,26 @@
+SHELL=/bin/sh
+BENCHMARK=dt
+BENCHMARKU=DT
+
+include ../config/make.def
+
+include ../sys/make.common
+#Override PROGRAM
+DTPROGRAM  = $(BINDIR)/$(BENCHMARK).$(CLASS)
+
+OBJS = dt.o DGraph.o \
+	${COMMON}/c_print_results.o ${COMMON}/c_timers.o ${COMMON}/c_randdp.o
+
+
+${PROGRAM}: config ${OBJS}
+	${CLINK} ${CLINKFLAGS} -o ${DTPROGRAM} ${OBJS} ${CMPI_LIB}
+
+.c.o:
+	${CCOMPILE} $<
+
+dt.o:             dt.c  npbparams.h
+DGraph.o:	DGraph.c DGraph.h
+
+clean:
+	- rm -f *.o *~ mputil*
+	- rm -f dt npbparams.h core
diff --git a/examples/smpi/NAS/DT/README b/examples/smpi/NAS/DT/README
new file mode 100644
index 0000000000..873e3ae6f2
--- /dev/null
+++ b/examples/smpi/NAS/DT/README
@@ -0,0 +1,22 @@
+Data Traffic benchmark DT is new in the NPB suite 
+(released as part of NPB3.x-MPI package).
+----------------------------------------------------
+
+DT is written in C and same executable can run on any number of processors,
+provided this number is not less than the number of nodes in the communication
+graph.  DT benchmark takes one argument: BH, WH, or SH. This argument 
+specifies the communication graph Black Hole, White Hole, or SHuffle 
+respectively. The current release contains verification numbers for 
+CLASSES S, W, A, and B only.  Classes C and D are defined, but verification 
+numbers are not provided in this release.
+
+The following table summarizes the number of nodes in the communication
+graph based on CLASS and graph TYPE.
+
+CLASS  N_Source N_Nodes(BH,WH) N_Nodes(SH)
+ S      4        5              12
+ W      8        11             32
+ A      16       21             80
+ B      32       43             192
+ C      64       85             448
+ D      128      171            1024
diff --git a/examples/smpi/NAS/DT/dt.c b/examples/smpi/NAS/DT/dt.c
new file mode 100644
index 0000000000..ed8cfeb55d
--- /dev/null
+++ b/examples/smpi/NAS/DT/dt.c
@@ -0,0 +1,759 @@
+/*************************************************************************
+ *                                                                       * 
+ *        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3       *
+ *                                                                       * 
+ *                                  D T					 * 
+ *                                                                       * 
+ ************************************************************************* 
+ *                                                                       * 
+ *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
+ *                                                                       * 
+ *   Permission to use, copy, distribute and modify this software        * 
+ *   for any purpose with or without fee is hereby granted.  We          * 
+ *   request, however, that all derived work reference the NAS           * 
+ *   Parallel Benchmarks 3.3. This software is provided "as is"          *
+ *   without express or implied warranty.                                * 
+ *                                                                       * 
+ *   Information on NPB 3.3, including the technical report, the         *
+ *   original specifications, source code, results and information       * 
+ *   on how to submit new results, is available at:                      * 
+ *                                                                       * 
+ *          http:  www.nas.nasa.gov/Software/NPB                         * 
+ *                                                                       * 
+ *   Send comments or suggestions to  npb@nas.nasa.gov                   * 
+ *   Send bug reports to              npb-bugs@nas.nasa.gov              * 
+ *                                                                       * 
+ *         NAS Parallel Benchmarks Group                                 * 
+ *         NASA Ames Research Center                                     * 
+ *         Mail Stop: T27A-1                                             * 
+ *         Moffett Field, CA   94035-1000                                * 
+ *                                                                       * 
+ *         E-mail:  npb@nas.nasa.gov                                     * 
+ *         Fax:     (650) 604-3957                                       * 
+ *                                                                       * 
+ ************************************************************************* 
+ *                                                                       * 
+ *   Author: M. Frumkin							 *						 * 
+ *                                                                       * 
+ *************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "mpi.h"
+#include "npbparams.h"
+
+#ifndef CLASS
+#define CLASS 'S'
+#define NUM_PROCS            1                 
+#endif
+
+//int      passed_verification;
+extern double randlc( double *X, double *A );
+extern
+void c_print_results( char   *name,
+                      char   class,
+                      int    n1, 
+                      int    n2,
+                      int    n3,
+                      int    niter,
+                      int    nprocs_compiled,
+                      int    nprocs_total,
+                      double t,
+                      double mops,
+		      char   *optype,
+                      int    passed_verification,
+                      char   *npbversion,
+                      char   *compiletime,
+                      char   *mpicc,
+                      char   *clink,
+                      char   *cmpi_lib,
+                      char   *cmpi_inc,
+                      char   *cflags,
+                      char   *clinkflags );
+		      
+void    timer_clear( int n );
+void    timer_start( int n );
+void    timer_stop( int n );
+double  timer_read( int n );
+int timer_on=0,timers_tot=64;
+
+int verify(char *bmname,double rnm2){
+    double verify_value=0.0;
+    double epsilon=1.0E-8;
+    char cls=CLASS;
+    int verified=-1;
+    if (cls != 'U') {
+       if(cls=='S') {
+         if(strstr(bmname,"BH")){
+           verify_value=30892725.0;
+         }else if(strstr(bmname,"WH")){
+           verify_value=67349758.0;
+         }else if(strstr(bmname,"SH")){
+           verify_value=58875767.0;
+         }else{
+           fprintf(stderr,"No such benchmark as %s.\n",bmname);
+         }
+         verified = 0;
+       }else if(cls=='W') {
+         if(strstr(bmname,"BH")){
+  	   verify_value = 4102461.0;
+         }else if(strstr(bmname,"WH")){
+  	   verify_value = 204280762.0;
+         }else if(strstr(bmname,"SH")){
+  	   verify_value = 186944764.0;
+         }else{
+           fprintf(stderr,"No such benchmark as %s.\n",bmname);
+         }
+         verified = 0;
+       }else if(cls=='A') {
+         if(strstr(bmname,"BH")){
+  	   verify_value = 17809491.0;
+         }else if(strstr(bmname,"WH")){
+  	   verify_value = 1289925229.0;
+         }else if(strstr(bmname,"SH")){
+  	   verify_value = 610856482.0;
+         }else{
+           fprintf(stderr,"No such benchmark as %s.\n",bmname);
+         }
+  	 verified = 0;
+       }else if(cls=='B') {
+         if(strstr(bmname,"BH")){
+  	   verify_value = 4317114.0;
+         }else if(strstr(bmname,"WH")){
+  	   verify_value = 7877279917.0;
+         }else if(strstr(bmname,"SH")){
+  	   verify_value = 1836863082.0;
+         }else{
+           fprintf(stderr,"No such benchmark as %s.\n",bmname);
+  	   verified = 0;
+         }
+       }else if(cls=='C') {
+         if(strstr(bmname,"BH")){
+  	   verify_value = 0.0;
+         }else if(strstr(bmname,"WH")){
+  	   verify_value = 0.0;
+         }else if(strstr(bmname,"SH")){
+  	   verify_value = 0.0;
+         }else{
+           fprintf(stderr,"No such benchmark as %s.\n",bmname);
+  	   verified = -1;
+         }
+       }else if(cls=='D') {
+         if(strstr(bmname,"BH")){
+  	   verify_value = 0.0;
+         }else if(strstr(bmname,"WH")){
+  	   verify_value = 0.0;
+         }else if(strstr(bmname,"SH")){
+  	   verify_value = 0.0;
+         }else{
+           fprintf(stderr,"No such benchmark as %s.\n",bmname);
+         }
+         verified = -1;
+       }else{
+         fprintf(stderr,"No such class as %c.\n",cls);
+       }
+       fprintf(stderr," %s L2 Norm = %f\n",bmname,rnm2);
+       if(verified==-1){
+  	 fprintf(stderr," No verification was performed.\n");
+       }else if( rnm2 - verify_value < epsilon &&
+                 rnm2 - verify_value > -epsilon) {  /* abs here does not work on ALTIX */
+  	  verified = 1;
+  	  fprintf(stderr," Deviation = %f\n",(rnm2 - verify_value));
+       }else{
+  	 verified = 0;
+  	 fprintf(stderr," The correct verification value = %f\n",verify_value);
+  	 fprintf(stderr," Got value = %f\n",rnm2);
+       }
+    }else{
+       verified = -1;
+    }
+    return  verified;  
+  }
+
+int ipowMod(int a,long long int n,int md){ 
+  int seed=1,q=a,r=1;
+  if(n<0){
+    fprintf(stderr,"ipowMod: exponent must be nonnegative exp=%lld\n",n);
+    n=-n; /* temp fix */
+/*    return 1; */
+  }
+  if(md<=0){
+    fprintf(stderr,"ipowMod: module must be positive mod=%d",md);
+    return 1;
+  }
+  if(n==0) return 1;
+  while(n>1){
+    int n2 = n/2;
+    if (n2*2==n){
+       seed = (q*q)%md;
+       q=seed;
+       n = n2;
+    }else{
+       seed = (r*q)%md;
+       r=seed;
+       n = n-1;
+    }
+  }
+  seed = (r*q)%md;
+  return seed;
+}
+
+#include "DGraph.h"
+DGraph *buildSH(char cls){
+/*
+  Nodes of the graph must be topologically sorted
+  to avoid MPI deadlock.
+*/
+  DGraph *dg;
+  int numSources=NUM_SOURCES; /* must be power of 2 */
+  int numOfLayers=0,tmpS=numSources>>1;
+  int firstLayerNode=0;
+  DGArc *ar=NULL;
+  DGNode *nd=NULL;
+  int mask=0x0,ndid=0,ndoff=0;
+  int i=0,j=0;
+  char nm[BLOCK_SIZE];
+  
+  sprintf(nm,"DT_SH.%c",cls);
+  dg=newDGraph(nm);
+
+  while(tmpS>1){
+    numOfLayers++;
+    tmpS>>=1;
+  }
+  for(i=0;i<numSources;i++){
+    sprintf(nm,"Source.%d",i);
+    nd=newNode(nm);
+    AttachNode(dg,nd);
+  }
+  for(j=0;j<numOfLayers;j++){
+    mask=0x00000001<<j;
+    for(i=0;i<numSources;i++){
+      sprintf(nm,"Comparator.%d",(i+j*firstLayerNode));
+      nd=newNode(nm);
+      AttachNode(dg,nd);
+      ndoff=i&(~mask);
+      ndid=firstLayerNode+ndoff;
+      ar=newArc(dg->node[ndid],nd);     
+      AttachArc(dg,ar);
+      ndoff+=mask;
+      ndid=firstLayerNode+ndoff;
+      ar=newArc(dg->node[ndid],nd);     
+      AttachArc(dg,ar);
+    }
+    firstLayerNode+=numSources;
+  }
+  mask=0x00000001<<numOfLayers;
+  for(i=0;i<numSources;i++){
+    sprintf(nm,"Sink.%d",i);
+    nd=newNode(nm);
+    AttachNode(dg,nd);
+    ndoff=i&(~mask);
+    ndid=firstLayerNode+ndoff;
+    ar=newArc(dg->node[ndid],nd);     
+    AttachArc(dg,ar);
+    ndoff+=mask;
+    ndid=firstLayerNode+ndoff;
+    ar=newArc(dg->node[ndid],nd);     
+    AttachArc(dg,ar);
+  }
+return dg;
+}
+DGraph *buildWH(char cls){
+/*
+  Nodes of the graph must be topologically sorted
+  to avoid MPI deadlock.
+*/
+  int i=0,j=0;
+  int numSources=NUM_SOURCES,maxInDeg=4;
+  int numLayerNodes=numSources,firstLayerNode=0;
+  int totComparators=0;
+  int numPrevLayerNodes=numLayerNodes;
+  int id=0,sid=0;
+  DGraph *dg;
+  DGNode *nd=NULL,*source=NULL,*tmp=NULL,*snd=NULL;
+  DGArc *ar=NULL;
+  char nm[BLOCK_SIZE];
+
+  sprintf(nm,"DT_WH.%c",cls);
+  dg=newDGraph(nm);
+
+  for(i=0;i<numSources;i++){
+    sprintf(nm,"Sink.%d",i);
+    nd=newNode(nm);
+    AttachNode(dg,nd);
+  }
+  totComparators=0;
+  numPrevLayerNodes=numLayerNodes;
+  while(numLayerNodes>maxInDeg){
+    numLayerNodes=numLayerNodes/maxInDeg;
+    if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
+    for(i=0;i<numLayerNodes;i++){
+      sprintf(nm,"Comparator.%d",totComparators);
+      totComparators++;
+      nd=newNode(nm);
+      id=AttachNode(dg,nd);
+      for(j=0;j<maxInDeg;j++){
+        sid=i*maxInDeg+j;
+	if(sid>=numPrevLayerNodes) break;
+        snd=dg->node[firstLayerNode+sid];
+        ar=newArc(dg->node[id],snd);
+        AttachArc(dg,ar);
+      }
+    }
+    firstLayerNode+=numPrevLayerNodes;
+    numPrevLayerNodes=numLayerNodes;
+  }
+  source=newNode("Source");
+  AttachNode(dg,source);   
+  for(i=0;i<numPrevLayerNodes;i++){
+    nd=dg->node[firstLayerNode+i];
+    ar=newArc(source,nd);
+    AttachArc(dg,ar);
+  }
+
+  for(i=0;i<dg->numNodes/2;i++){  /* Topological sorting */
+    tmp=dg->node[i];
+    dg->node[i]=dg->node[dg->numNodes-1-i];
+    dg->node[i]->id=i;
+    dg->node[dg->numNodes-1-i]=tmp;
+    dg->node[dg->numNodes-1-i]->id=dg->numNodes-1-i;
+  }
+return dg;
+}
+DGraph *buildBH(char cls){
+/*
+  Nodes of the graph must be topologically sorted
+  to avoid MPI deadlock.
+*/
+  int i=0,j=0;
+  int numSources=NUM_SOURCES,maxInDeg=4;
+  int numLayerNodes=numSources,firstLayerNode=0;
+  DGraph *dg;
+  DGNode *nd=NULL, *snd=NULL, *sink=NULL;
+  DGArc *ar=NULL;
+  int totComparators=0;
+  int numPrevLayerNodes=numLayerNodes;
+  int id=0, sid=0;
+  char nm[BLOCK_SIZE];
+
+  sprintf(nm,"DT_BH.%c",cls);
+  dg=newDGraph(nm);
+
+  for(i=0;i<numSources;i++){
+    sprintf(nm,"Source.%d",i);
+    nd=newNode(nm);
+    AttachNode(dg,nd);
+  }
+  while(numLayerNodes>maxInDeg){
+    numLayerNodes=numLayerNodes/maxInDeg;
+    if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
+    for(i=0;i<numLayerNodes;i++){
+      sprintf(nm,"Comparator.%d",totComparators);
+      totComparators++;
+      nd=newNode(nm);
+      id=AttachNode(dg,nd);
+      for(j=0;j<maxInDeg;j++){
+        sid=i*maxInDeg+j;
+	if(sid>=numPrevLayerNodes) break;
+        snd=dg->node[firstLayerNode+sid];
+        ar=newArc(snd,dg->node[id]);
+        AttachArc(dg,ar);
+      }
+    }
+    firstLayerNode+=numPrevLayerNodes;
+    numPrevLayerNodes=numLayerNodes;
+  }
+  sink=newNode("Sink");
+  AttachNode(dg,sink);   
+  for(i=0;i<numPrevLayerNodes;i++){
+    nd=dg->node[firstLayerNode+i];
+    ar=newArc(nd,sink);
+    AttachArc(dg,ar);
+  }
+return dg;
+}
+
+typedef struct{
+  int len;
+  double* val;
+} Arr;
+Arr *newArr(int len){
+  Arr *arr=(Arr *)malloc(sizeof(Arr));
+  arr->len=len;
+  arr->val=(double *)malloc(len*sizeof(double));
+  return arr;
+}
+void arrShow(Arr* a){
+  if(!a) fprintf(stderr,"-- NULL array\n");
+  else{
+    fprintf(stderr,"-- length=%d\n",a->len);
+  }
+}
+double CheckVal(Arr *feat){
+  double csum=0.0;
+  int i=0;
+  for(i=0;i<feat->len;i++){
+    csum+=feat->val[i]*feat->val[i]/feat->len; /* The truncation does not work since 
+                                                  result will be 0 for large len  */
+  }
+   return csum;
+}
+int GetFNumDPar(int* mean, int* stdev){
+  *mean=NUM_SAMPLES;
+  *stdev=STD_DEVIATION;
+  return 0;
+}
+int GetFeatureNum(char *mbname,int id){
+  double tran=314159265.0;
+  double A=2*id+1;
+  double denom=randlc(&tran,&A);
+  char cval='S';
+  int mean=NUM_SAMPLES,stdev=128;
+  int rtfs=0,len=0;
+  GetFNumDPar(&mean,&stdev);
+  rtfs=ipowMod((int)(1/denom)*(int)cval,(long long int) (2*id+1),2*stdev);
+  if(rtfs<0) rtfs=-rtfs;
+  len=mean-stdev+rtfs;
+  return len;
+}
+Arr* RandomFeatures(char *bmname,int fdim,int id){
+  int len=GetFeatureNum(bmname,id)*fdim;
+  Arr* feat=newArr(len);
+  int nxg=2,nyg=2,nzg=2,nfg=5;
+  int nx=421,ny=419,nz=1427,nf=3527;
+  long long int expon=(len*(id+1))%3141592;
+  int seedx=ipowMod(nxg,expon,nx),
+      seedy=ipowMod(nyg,expon,ny),
+      seedz=ipowMod(nzg,expon,nz),
+      seedf=ipowMod(nfg,expon,nf);
+  int i=0;
+  if(timer_on){
+    timer_clear(id+1);
+    timer_start(id+1);
+  }
+  for(i=0;i<len;i+=fdim){
+    seedx=(seedx*nxg)%nx;
+    seedy=(seedy*nyg)%ny;
+    seedz=(seedz*nzg)%nz;
+    seedf=(seedf*nfg)%nf;
+    feat->val[i]=seedx;
+    feat->val[i+1]=seedy;
+    feat->val[i+2]=seedz;
+    feat->val[i+3]=seedf;
+  }
+  if(timer_on){
+    timer_stop(id+1);
+    fprintf(stderr,"** RandomFeatures time in node %d = %f\n",id,timer_read(id+1));
+  }
+  return feat;   
+}
+void Resample(Arr *a,int blen){
+    long long int i=0,j=0,jlo=0,jhi=0;
+    double avval=0.0;
+    double *nval=(double *)malloc(blen*sizeof(double));
+    Arr *tmp=newArr(10);
+    for(i=0;i<blen;i++) nval[i]=0.0;
+    for(i=1;i<a->len-1;i++){
+      jlo=(int)(0.5*(2*i-1)*(blen/a->len)); 
+      jhi=(int)(0.5*(2*i+1)*(blen/a->len));
+
+      avval=a->val[i]/(jhi-jlo+1);    
+      for(j=jlo;j<=jhi;j++){
+        nval[j]+=avval;
+      }
+    }
+    nval[0]=a->val[0];
+    nval[blen-1]=a->val[a->len-1];
+    free(a->val);
+    a->val=nval;
+    a->len=blen;
+}
+#define fielddim 4
+Arr* WindowFilter(Arr *a, Arr* b,int w){
+  int i=0,j=0,k=0;
+  double rms0=0.0,rms1=0.0,rmsm1=0.0;
+  double weight=((double) (w+1))/(w+2);
+ 
+  w+=1;
+  if(timer_on){
+    timer_clear(w);
+    timer_start(w);
+  }
+  if(a->len<b->len) Resample(a,b->len);
+  if(a->len>b->len) Resample(b,a->len);
+  for(i=fielddim;i<a->len-fielddim;i+=fielddim){
+    rms0=(a->val[i]-b->val[i])*(a->val[i]-b->val[i])
+	+(a->val[i+1]-b->val[i+1])*(a->val[i+1]-b->val[i+1])
+	+(a->val[i+2]-b->val[i+2])*(a->val[i+2]-b->val[i+2])
+	+(a->val[i+3]-b->val[i+3])*(a->val[i+3]-b->val[i+3]);
+    j=i+fielddim;
+    rms1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
+    	+(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
+    	+(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
+    	+(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
+    j=i-fielddim;
+    rmsm1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
+	 +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
+	 +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
+	 +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
+    k=0;
+    if(rms1<rms0){
+      k=1;
+      rms0=rms1;
+    }
+    if(rmsm1<rms0) k=-1;
+    if(k==0){
+      j=i+fielddim;
+      a->val[i]=weight*b->val[i];
+      a->val[i+1]=weight*b->val[i+1];
+      a->val[i+2]=weight*b->val[i+2];
+      a->val[i+3]=weight*b->val[i+3];  
+    }else if(k==1){
+      j=i+fielddim;
+      a->val[i]=weight*b->val[j];
+      a->val[i+1]=weight*b->val[j+1];
+      a->val[i+2]=weight*b->val[j+2];
+      a->val[i+3]=weight*b->val[j+3];  
+    }else { /*if(k==-1)*/
+      j=i-fielddim;
+      a->val[i]=weight*b->val[j];
+      a->val[i+1]=weight*b->val[j+1];
+      a->val[i+2]=weight*b->val[j+2];
+      a->val[i+3]=weight*b->val[j+3];  
+    }	   
+  }
+  if(timer_on){
+    timer_stop(w);
+    fprintf(stderr,"** WindowFilter time in node %d = %f\n",(w-1),timer_read(w));
+  }
+  return a;
+}
+
+int SendResults(DGraph *dg,DGNode *nd,Arr *feat){
+  int i=0,tag=0;
+  DGArc *ar=NULL;
+  DGNode *head=NULL;
+  if(!feat) return 0;
+  for(i=0;i<nd->outDegree;i++){
+    ar=nd->outArc[i];
+    if(ar->tail!=nd) continue;
+    head=ar->head;
+    tag=ar->id;
+    if(head->address!=nd->address){
+      MPI_Send(&feat->len,1,MPI_INT,head->address,tag,MPI_COMM_WORLD);
+      MPI_Send(feat->val,feat->len,MPI_DOUBLE,head->address,tag,MPI_COMM_WORLD);
+    }
+  }
+  return 1;
+}
+Arr* CombineStreams(DGraph *dg,DGNode *nd){
+  Arr *resfeat=newArr(NUM_SAMPLES*fielddim);
+  int i=0,len=0,tag=0;
+  DGArc *ar=NULL;
+  DGNode *tail=NULL;
+  MPI_Status status;
+  Arr *feat=NULL,*featp=NULL;
+
+  if(nd->inDegree==0) return NULL;
+  for(i=0;i<nd->inDegree;i++){
+    ar=nd->inArc[i];
+    if(ar->head!=nd) continue;
+    tail=ar->tail;
+    if(tail->address!=nd->address){
+      len=0;
+      tag=ar->id;
+      MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
+      feat=newArr(len);
+      MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
+      resfeat=WindowFilter(resfeat,feat,nd->id);
+      free(feat);
+    }else{
+      featp=(Arr *)tail->feat;
+      feat=newArr(featp->len);
+      memcpy(feat->val,featp->val,featp->len*sizeof(double));
+      resfeat=WindowFilter(resfeat,feat,nd->id);  
+      free(feat);
+    }
+  }
+  for(i=0;i<resfeat->len;i++) resfeat->val[i]=((int)resfeat->val[i])/nd->inDegree;
+  nd->feat=resfeat;
+  return nd->feat;
+}
+double Reduce(Arr *a,int w){
+  double retv=0.0;
+  if(timer_on){
+    timer_clear(w);
+    timer_start(w);
+  }
+  retv=(int)(w*CheckVal(a));/* The casting needed for node  
+                               and array dependent verifcation */
+  if(timer_on){
+    timer_stop(w);
+    fprintf(stderr,"** Reduce time in node %d = %f\n",(w-1),timer_read(w));
+  }
+  return retv;
+}
+
+double ReduceStreams(DGraph *dg,DGNode *nd){
+  double csum=0.0;
+  int i=0,len=0,tag=0;
+  DGArc *ar=NULL;
+  DGNode *tail=NULL;
+  Arr *feat=NULL;
+  double retv=0.0;
+
+  for(i=0;i<nd->inDegree;i++){
+    ar=nd->inArc[i];
+    if(ar->head!=nd) continue;
+    tail=ar->tail;
+    if(tail->address!=nd->address){
+      MPI_Status status;
+      len=0;
+      tag=ar->id;
+      MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
+      feat=newArr(len);
+      MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
+      csum+=Reduce(feat,(nd->id+1));  
+      free(feat);
+    }else{
+      csum+=Reduce(tail->feat,(nd->id+1));  
+    }
+  }
+  if(nd->inDegree>0)csum=(((long long int)csum)/nd->inDegree);
+  retv=(nd->id+1)*csum;
+  return retv;
+}
+
+int ProcessNodes(DGraph *dg,int me){
+  double chksum=0.0;
+  Arr *feat=NULL;
+  int i=0,verified=0,tag;
+  DGNode *nd=NULL;
+  double rchksum=0.0;
+  MPI_Status status;
+
+  for(i=0;i<dg->numNodes;i++){
+    nd=dg->node[i];
+    if(nd->address!=me) continue;
+    if(strstr(nd->name,"Source")){
+      nd->feat=RandomFeatures(dg->name,fielddim,nd->id); 
+      SendResults(dg,nd,nd->feat);
+    }else if(strstr(nd->name,"Sink")){
+      chksum=ReduceStreams(dg,nd);
+      tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */
+      MPI_Send(&chksum,1,MPI_DOUBLE,0,tag,MPI_COMM_WORLD);
+    }else{
+      feat=CombineStreams(dg,nd);
+      SendResults(dg,nd,feat);
+    }
+  }
+  if(me==0){ /* Report node */
+    rchksum=0.0;
+    chksum=0.0;
+    for(i=0;i<dg->numNodes;i++){
+      nd=dg->node[i];
+      if(!strstr(nd->name,"Sink")) continue;
+       tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */
+      MPI_Recv(&rchksum,1,MPI_DOUBLE,nd->address,tag,MPI_COMM_WORLD,&status);
+      chksum+=rchksum;
+    }
+    verified=verify(dg->name,chksum);
+  }
+return verified;
+}
+
+int main(int argc,char **argv ){
+  int my_rank,comm_size;
+  int i;
+  DGraph *dg=NULL;
+  int verified=0, featnum=0;
+  double bytes_sent=2.0,tot_time=0.0;
+
+    MPI_Init( &argc, &argv );
+    MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
+    MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
+
+     if(argc!=2||
+                (  strncmp(argv[1],"BH",2)!=0
+                 &&strncmp(argv[1],"WH",2)!=0
+                 &&strncmp(argv[1],"SH",2)!=0
+                )
+      ){
+      if(my_rank==0){
+        fprintf(stderr,"** Usage: mpirun -np N ../bin/dt.S GraphName\n");
+        fprintf(stderr,"** Where \n   - N is integer number of MPI processes\n");
+        fprintf(stderr,"   - S is the class S, W, or A \n");
+        fprintf(stderr,"   - GraphName is the communication graph name BH, WH, or SH.\n");
+        fprintf(stderr,"   - the number of MPI processes N should not be be less than \n");
+        fprintf(stderr,"     the number of nodes in the graph\n");
+      }
+      MPI_Finalize();
+      exit(0);
+    } 
+   if(strncmp(argv[1],"BH",2)==0){
+      dg=buildBH(CLASS);
+    }else if(strncmp(argv[1],"WH",2)==0){
+      dg=buildWH(CLASS);
+    }else if(strncmp(argv[1],"SH",2)==0){
+      dg=buildSH(CLASS);
+    }
+
+    if(timer_on&&dg->numNodes+1>timers_tot){
+      timer_on=0;
+      if(my_rank==0)
+        fprintf(stderr,"Not enough timers. Node timeing is off. \n");
+    }
+    if(dg->numNodes>comm_size){
+      if(my_rank==0){
+        fprintf(stderr,"**  The number of MPI processes should not be less than \n");
+        fprintf(stderr,"**  the number of nodes in the graph\n");
+        fprintf(stderr,"**  Number of MPI processes = %d\n",comm_size);
+        fprintf(stderr,"**  Number nodes in the graph = %d\n",dg->numNodes);
+      }
+      MPI_Finalize();
+      exit(0);
+    }
+    for(i=0;i<dg->numNodes;i++){ 
+      dg->node[i]->address=i;
+    }
+    if( my_rank == 0 ){
+      printf( "\n\n NAS Parallel Benchmarks 3.3 -- DT Benchmark\n\n" );
+      graphShow(dg,0);
+      timer_clear(0);
+      timer_start(0);
+    }
+    verified=ProcessNodes(dg,my_rank);
+    
+    featnum=NUM_SAMPLES*fielddim;
+    bytes_sent=featnum*dg->numArcs;
+    bytes_sent/=1048576;
+    if(my_rank==0){
+      timer_stop(0);
+      tot_time=timer_read(0);
+      c_print_results( dg->name,
+        	       CLASS,
+        	       featnum,
+        	       0,
+        	       0,
+        	       dg->numNodes,
+        	       0,
+        	       comm_size,
+        	       tot_time,
+        	       bytes_sent/tot_time,
+        	       "bytes transmitted", 
+        	       verified,
+        	       NPBVERSION,
+        	       COMPILETIME,
+        	       MPICC,
+        	       CLINK,
+        	       CMPI_LIB,
+        	       CMPI_INC,
+        	       CFLAGS,
+        	       CLINKFLAGS );
+    }          
+    MPI_Finalize();
+  return 1;
+}
diff --git a/examples/smpi/NAS/EP/Makefile b/examples/smpi/NAS/EP/Makefile
new file mode 100644
index 0000000000..2014329288
--- /dev/null
+++ b/examples/smpi/NAS/EP/Makefile
@@ -0,0 +1,28 @@
+SHELL=/bin/sh
+BENCHMARK=ep
+BENCHMARKU=EP
+
+include ../config/make.def
+
+#OBJS = ep.o ${COMMON}/print_results.o ${COMMON}/${RAND}.o ${COMMON}/timers.o
+OBJS = ep.o randlc.o
+
+include ../sys/make.common
+
+${PROGRAM}: config ${OBJS}
+#	${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
+	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${CMPI_LIB}
+
+
+#ep.o:		ep.f  mpinpb.h npbparams.h
+#	${FCOMPILE} ep.f
+
+ep.o:	ep.c randlc.c mpinpb.h npbparams.h
+	${CCOMPILE} ep.c
+
+clean:
+	- rm -f *.o *~ 
+	- rm -f npbparams.h core
+
+
+
diff --git a/examples/smpi/NAS/EP/README b/examples/smpi/NAS/EP/README
new file mode 100644
index 0000000000..6eb36571af
--- /dev/null
+++ b/examples/smpi/NAS/EP/README
@@ -0,0 +1,6 @@
+This code implements the random-number generator described in the
+NAS Parallel Benchmark document RNR Technical Report RNR-94-007.
+The code is "embarrassingly" parallel in that no communication is
+required for the generation of the random numbers itself. There is
+no special requirement on the number of processors used for running
+the benchmark.
diff --git a/examples/smpi/NAS/EP/ep.c b/examples/smpi/NAS/EP/ep.c
new file mode 100644
index 0000000000..753fd431a3
--- /dev/null
+++ b/examples/smpi/NAS/EP/ep.c
@@ -0,0 +1,440 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#include "mpi.h"
+#include "npbparams.h"
+
+#include "randlc.h"
+
+#ifndef CLASS
+#define CLASS 'S'
+#define NUM_PROCS            1                 
+#endif
+#define true 1
+#define false 0
+
+
+//---NOTE : all the timers function have been modified to
+//          avoid global timers (privatize these). 
+      // ----------------------- timers ---------------------
+      void timer_clear(double *onetimer) {
+            //elapsed[n] = 0.0;
+            *onetimer = 0.0;
+      }
+
+      void timer_start(double *onetimer) {
+            *onetimer = MPI_Wtime();
+      }
+
+      void timer_stop(int n,double *elapsed,double *start) {
+            double t, now;
+
+            now = MPI_Wtime();
+            t = now - start[n];
+            elapsed[n] += t;
+      }
+
+      double timer_read(int n, double *elapsed) {  /* ok, useless, but jsut to keep function call */
+            return(elapsed[n]);
+      }
+      /********************************************************************
+       *****************            V R A N L C          ******************
+       *****************                                 *****************/           
+      double vranlc(int n, double x, double a, double *y)
+      {
+        int i;
+        long  i246m1=0x00003FFFFFFFFFFF;
+	  long  LLx, Lx, La;
+        double d2m46;
+
+// This doesn't work, because the compiler does the calculation in 32
+// bits and overflows. No standard way (without f90 stuff) to specify
+// that the rhs should be done in 64 bit arithmetic.
+//     parameter(i246m1=2**46-1)
+
+      d2m46=pow(0.5,46);
+
+// c Note that the v6 compiler on an R8000 does something stupid with
+// c the above. Using the following instead (or various other things)
+// c makes the calculation run almost 10 times as fast.
+//
+// c     save d2m46
+// c      data d2m46/0.0d0/
+// c      if (d2m46 .eq. 0.0d0) then
+// c         d2m46 = 0.5d0**46
+// c      endif
+
+      Lx = (long)x;
+      La = (long)a;
+      //fprintf(stdout,("================== Vranlc ================");
+      //fprintf(stdout,("Before Loop: Lx = " + Lx + ", La = " + La);
+	LLx = Lx;
+	for (i=0; i< n; i++) {
+		  Lx   = Lx*La & i246m1 ;
+		  LLx = Lx;
+		  y[i] = d2m46 * (double)LLx;
+		  /*
+		     if(i == 0) {
+		     fprintf(stdout,("After loop 0:");
+		     fprintf(stdout,("Lx = " + Lx + ", La = " + La);
+		     fprintf(stdout,("d2m46 = " + d2m46);
+		     fprintf(stdout,("LLX(Lx) = " + LLX.doubleValue());
+		     fprintf(stdout,("Y[0]" + y[0]);
+		     }
+		   */
+	}
+
+      x = (double)LLx;
+      /*
+      fprintf(stdout,("Change: Lx = " + Lx);
+      fprintf(stdout,("=============End   Vranlc ================");
+      */
+      return x;
+    }
+
+
+
+//-------------- the core (unique function) -----------
+      void doTest(int argc, char **argv) {
+		  double dum[3] = {1.,1.,1.};
+		  double x1, x2, sx, sy, tm, an, tt, gc;
+		  double Mops;
+		  double epsilon=1.0E-8, a = 1220703125., s=271828183.;
+		  double t1, t2, t3, t4; 
+		  double sx_verify_value, sy_verify_value, sx_err, sy_err;
+
+#include "npbparams.h"
+		  int    mk=16, 
+			   // --> set by make : in npbparams.h
+			   //m=28, // for CLASS=A
+			   //m=30, // for CLASS=B
+			   //npm=2, // NPROCS
+			   mm = m-mk, 
+			   nn = (int)(pow(2,mm)), 
+			   nk = (int)(pow(2,mk)), 
+			   nq=10, 
+			   np, 
+			   node, 
+			   no_nodes, 
+			   i, 
+			   ik, 
+			   kk, 
+			   l, 
+			   k, nit, no_large_nodes,
+			   np_add, k_offset, j;
+		  int    me, nprocs, root=0, dp_type;
+		  int verified, 
+			    timers_enabled=true;
+		  char  size[500]; // mind the size of the string to represent a big number
+
+		  //Use in randlc..
+		  int KS = 0;
+		  double R23, R46, T23, T46;
+
+		  double *qq = (double *) malloc (10000*sizeof(double));
+		  double *start = (double *) malloc (64*sizeof(double));
+		  double *elapsed = (double *) malloc (64*sizeof(double));
+
+		  double *x = (double *) malloc (2*nk*sizeof(double));
+		  double *q = (double *) malloc (nq*sizeof(double));
+
+		  MPI_Init( &argc, &argv );
+		  MPI_Comm_size( MPI_COMM_WORLD, &no_nodes);
+		  MPI_Comm_rank( MPI_COMM_WORLD, &node);
+
+#ifdef USE_MPE
+    MPE_Init_log();
+#endif
+		  root = 0;
+		  if (node == root ) {
+
+			    /*   Because the size of the problem is too large to store in a 32-bit
+			     *   integer for some classes, we put it into a string (for printing).
+			     *   Have to strip off the decimal point put in there by the floating
+			     *   point print statement (internal file)
+			     */
+			    fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark");
+			    sprintf(size,"%d",pow(2,m+1));
+			    //size = size.replace('.', ' ');
+			    fprintf(stdout," Number of random numbers generated: %s\n",size);
+			    fprintf(stdout," Number of active processes: %d\n",no_nodes);
+
+		  }
+		  verified = false;
+
+		  /* c   Compute the number of "batches" of random number pairs generated 
+		     c   per processor. Adjust if the number of processors does not evenly 
+		     c   divide the total number
+*/
+
+       np = nn / no_nodes;
+       no_large_nodes = nn % no_nodes;
+       if (node < no_large_nodes) np_add = 1;
+       else np_add = 0;
+       np = np + np_add;
+
+       if (np == 0) {
+             fprintf(stdout,"Too many nodes: %d  %d",no_nodes,nn);
+             MPI_Abort(MPI_COMM_WORLD,1);
+             exit(0); 
+       } 
+
+/* c   Call the random number generator functions and initialize
+   c   the x-array to reduce the effects of paging on the timings.
+   c   Also, call all mathematical functions that are used. Make
+   c   sure these initializations cannot be eliminated as dead code.
+*/
+
+	 //call vranlc(0, dum[1], dum[2], dum[3]);
+	 // Array indexes start at 1 in Fortran, 0 in Java
+	 vranlc(0, dum[0], dum[1], &(dum[2])); 
+
+	 dum[0] = randlc(&(dum[1]),&(dum[2]));
+	 /////////////////////////////////
+	 for (i=0;i<2*nk;i++) {
+		   x[i] = -1e99;
+	 }
+	 Mops = log(sqrt(abs(1))); 
+
+	 /*
+	    c---------------------------------------------------------------------
+	    c    Synchronize before placing time stamp
+	    c---------------------------------------------------------------------
+	  */
+        MPI_Barrier( MPI_COMM_WORLD );
+
+        timer_clear(&(elapsed[1]));
+        timer_clear(&(elapsed[2]));
+        timer_clear(&(elapsed[3]));
+        timer_start(&(start[1]));
+        
+        t1 = a;
+	//fprintf(stdout,("(ep.f:160) t1 = " + t1);
+        t1 = vranlc(0, t1, a, x);
+	//fprintf(stdout,("(ep.f:161) t1 = " + t1);
+	
+        
+/* c   Compute AN = A ^ (2 * NK) (mod 2^46). */
+        
+        t1 = a;
+	//fprintf(stdout,("(ep.f:165) t1 = " + t1);
+        for (i=1; i <= mk+1; i++) {
+               t2 = randlc(&t1, &t1);
+	       //fprintf(stdout,("(ep.f:168)[loop i=" + i +"] t1 = " + t1);
+        } 
+        an = t1;
+	//fprintf(stdout,("(ep.f:172) s = " + s);
+        tt = s;
+        gc = 0.;
+        sx = 0.;
+        sy = 0.;
+        for (i=0; i < nq ; i++) {
+               q[i] = 0.;
+        }
+
+/*
+    Each instance of this loop may be performed independently. We compute
+    the k offsets separately to take into account the fact that some nodes
+    have more numbers to generate than others
+*/
+
+      if (np_add == 1)
+         k_offset = node * np -1;
+      else
+         k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1;
+     
+      int stop = false;
+      for(k = 1; k <= np; k++) {
+         stop = false;
+         kk = k_offset + k ;
+         t1 = s;
+         //fprintf(stdout,("(ep.f:193) t1 = " + t1);
+         t2 = an;
+
+//       Find starting seed t1 for this kk.
+
+         for (i=1;i<=100 && !stop;i++) {
+            ik = kk / 2;
+	    //fprintf(stdout,("(ep.f:199) ik = " +ik+", kk = " + kk);
+            if (2 * ik != kk)  {
+                t3 = randlc(&t1, &t2);
+                //fprintf(stdout,("(ep.f:200) t1= " +t1 );
+            }
+            if (ik==0)
+                stop = true;
+            else {
+               t3 = randlc(&t2, &t2);
+               kk = ik;
+           }
+         }
+//       Compute uniform pseudorandom numbers.
+
+         //if (timers_enabled)  timer_start(3);
+	 timer_start(&(start[3]));
+         //call vranlc(2 * nk, t1, a, x)  --> t1 and y are modified
+
+	//fprintf(stdout,">>>>>>>>>>>Before vranlc(l.210)<<<<<<<<<<<<<");
+	//fprintf(stdout,"2*nk = " + (2*nk));
+	//fprintf(stdout,"t1 = " + t1);
+	//fprintf(stdout,"a  = " + a);
+	//fprintf(stdout,"x[0] = " + x[0]);
+	//fprintf(stdout,">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
+        
+	t1 = vranlc(2 * nk, t1, a, x);
+
+	//fprintf(stdout,(">>>>>>>>>>>After  Enter vranlc (l.210)<<<<<<");
+	//fprintf(stdout,("2*nk = " + (2*nk));
+	//fprintf(stdout,("t1 = " + t1);
+	//fprintf(stdout,("a  = " + a);
+	//fprintf(stdout,("x[0] = " + x[0]);
+	//fprintf(stdout,(">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
+        
+         //if (timers_enabled)  timer_stop(3);
+	 timer_stop(3,elapsed,start);
+
+/*       Compute Gaussian deviates by acceptance-rejection method and 
+ *       tally counts in concentric square annuli.  This loop is not 
+ *       vectorizable. 
+ */
+         //if (timers_enabled) timer_start(2);
+ 	 timer_start(&(start[2]));
+         for(i=1; i<=nk;i++) {
+            x1 = 2. * x[2*i-2] -1.0;
+            x2 = 2. * x[2*i-1] - 1.0;
+            t1 = x1*x1 + x2*x2;
+            if (t1 <= 1.) {
+               t2   = sqrt(-2. * log(t1) / t1);
+               t3   = (x1 * t2);
+               t4   = (x2 * t2);
+               l    = (int)(abs(t3) > abs(t4) ? abs(t3) : abs(t4));
+               q[l] = q[l] + 1.;
+               sx   = sx + t3;
+               sy   = sy + t4;
+             }
+		/*
+	     if(i == 1) {
+                fprintf(stdout,"x1 = " + x1);
+                fprintf(stdout,"x2 = " + x2);
+                fprintf(stdout,"t1 = " + t1);
+                fprintf(stdout,"t2 = " + t2);
+                fprintf(stdout,"t3 = " + t3);
+                fprintf(stdout,"t4 = " + t4);
+                fprintf(stdout,"l = " + l);
+                fprintf(stdout,"q[l] = " + q[l]);
+                fprintf(stdout,"sx = " + sx);
+                fprintf(stdout,"sy = " + sy);
+	     }
+		*/
+           }
+         //if (timers_enabled)  timer_stop(2);
+ 	 timer_stop(2,elapsed,start);
+      }
+
+      //int MPI_Allreduce(void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)   
+	MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+	sx = x[0]; //FIXME :  x[0] or x[1] => x[0] because fortran starts with 1
+      MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+      sy = x[0];
+      MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+      for(i = 0; i < nq; i++) {
+		q[i] = x[i];
+	}
+	for(i = 0; i < nq; i++) {
+		gc += q[i];
+	}
+
+	timer_stop(1,elapsed,start);
+      tm = timer_read(1,elapsed);
+	MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+	tm = x[0];
+
+	if(node == root) {
+		nit = 0;
+		verified = true;
+
+		if(m == 24) {
+         		sx_verify_value = -3.247834652034740E3;
+	                sy_verify_value = -6.958407078382297E3;
+		} else if(m == 25) {
+	            	sx_verify_value = -2.863319731645753E3;
+			sy_verify_value = -6.320053679109499E3;
+		} else if(m == 28) {
+		        sx_verify_value = -4.295875165629892E3;
+			sy_verify_value = -1.580732573678431E4;
+		} else if(m == 30) {
+	        	sx_verify_value =  4.033815542441498E4;
+	                sy_verify_value = -2.660669192809235E4;
+		} else if(m == 32) {
+	                sx_verify_value =  4.764367927995374E4;
+                   	sy_verify_value = -8.084072988043731E4;
+		} else if(m == 36) {
+		        sx_verify_value =  1.982481200946593E5;
+		        sy_verify_value = -1.020596636361769E5;
+		} else {
+			verified = false;
+		}
+
+		/*
+		fprintf(stdout,("sx        = " + sx);
+		fprintf(stdout,("sx_verify = " + sx_verify_value);
+		fprintf(stdout,("sy        = " + sy);
+		fprintf(stdout,("sy_verify = " + sy_verify_value);
+		*/
+		if(verified) {
+			sx_err = abs((sx - sx_verify_value)/sx_verify_value);
+			sy_err = abs((sy - sy_verify_value)/sy_verify_value);
+			/*
+			fprintf(stdout,("sx_err = " + sx_err);
+			fprintf(stdout,("sy_err = " + sx_err);
+			fprintf(stdout,("epsilon= " + epsilon);
+			*/
+			verified = ((sx_err < epsilon) && (sy_err < epsilon));
+		}
+
+		Mops = (pow(2.0, m+1))/tm/1000;
+
+		fprintf(stdout,"EP Benchmark Results:\n");
+		fprintf(stdout,"CPU Time=%d\n",tm);
+		fprintf(stdout,"N = 2^%d\n",m);
+		fprintf(stdout,"No. Gaussain Pairs =%d\n",gc);
+		fprintf(stdout,"Sum = %lf %ld\n",sx,sy);
+		fprintf(stdout,"Count:");
+		for(i = 0; i < nq; i++) {
+			fprintf(stdout,"%d\t %ld\n",i,q[i]);
+		}
+
+		/*
+		print_results("EP", _class, m+1, 0, 0, nit, npm, no_nodes, tm, Mops,
+				"Random numbers generated", verified, npbversion,
+				compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) */
+		fprintf(stdout,"\nEP Benchmark Completed\n");
+            fprintf(stdout,"Class           = %s\n", _class);
+		fprintf(stdout,"Size            = %s\n", size);
+		fprintf(stdout,"Iteration       = %d\n", nit);
+		fprintf(stdout,"Time in seconds = %lf\n",(tm/1000));
+		fprintf(stdout,"Total processes = %d\n",no_nodes);
+		fprintf(stdout,"Mops/s total    = %lf\n",Mops);
+		fprintf(stdout,"Mops/s/process  = %lf\n", Mops/no_nodes);
+		fprintf(stdout,"Operation type  = Random number generated\n");
+		if(verified) {
+			fprintf(stdout,"Verification    = SUCCESSFUL\n");
+		} else {
+			fprintf(stdout,"Verification    = UNSUCCESSFUL\n");
+		}
+           	fprintf(stdout,"Total time:     %lf\n",(timer_read(1,elapsed)/1000));
+           	fprintf(stdout,"Gaussian pairs: %lf\n",(timer_read(2,elapsed)/1000));
+           	fprintf(stdout,"Random numbers: %lf\n",(timer_read(3,elapsed)/1000));
+       	}
+#ifdef USE_MPE
+    MPE_Finish_log(argv[0]);
+#endif
+ 
+       MPI_Finalize();
+      }
+
+    int main(int argc, char **argv) {
+       doTest(argc,argv);
+    }
diff --git a/examples/smpi/NAS/EP/ep.f b/examples/smpi/NAS/EP/ep.f
new file mode 100644
index 0000000000..ca7cc24620
--- /dev/null
+++ b/examples/smpi/NAS/EP/ep.f
@@ -0,0 +1,316 @@
+!-------------------------------------------------------------------------!
+!                                                                         !
+!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
+!                                                                         !
+!                                   E P                                   !
+!                                                                         !
+!-------------------------------------------------------------------------!
+!                                                                         !
+!    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
+!    It is described in NAS Technical Reports 95-020 and 02-007           !
+!                                                                         !
+!    Permission to use, copy, distribute and modify this software         !
+!    for any purpose with or without fee is hereby granted.  We           !
+!    request, however, that all derived work reference the NAS            !
+!    Parallel Benchmarks 3.3. This software is provided "as is"           !
+!    without express or implied warranty.                                 !
+!                                                                         !
+!    Information on NPB 3.3, including the technical report, the          !
+!    original specifications, source code, results and information        !
+!    on how to submit new results, is available at:                       !
+!                                                                         !
+!           http://www.nas.nasa.gov/Software/NPB/                         !
+!                                                                         !
+!    Send comments or suggestions to  npb@nas.nasa.gov                    !
+!                                                                         !
+!          NAS Parallel Benchmarks Group                                  !
+!          NASA Ames Research Center                                      !
+!          Mail Stop: T27A-1                                              !
+!          Moffett Field, CA   94035-1000                                 !
+!                                                                         !
+!          E-mail:  npb@nas.nasa.gov                                      !
+!          Fax:     (650) 604-3957                                        !
+!                                                                         !
+!-------------------------------------------------------------------------!
+
+
+c---------------------------------------------------------------------
+c
+c Authors: P. O. Frederickson 
+c          D. H. Bailey
+c          A. C. Woo
+c          R. F. Van der Wijngaart
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+      program EMBAR
+c---------------------------------------------------------------------
+C
+c   This is the MPI version of the APP Benchmark 1,
+c   the "embarassingly parallel" benchmark.
+c
+c
+c   M is the Log_2 of the number of complex pairs of uniform (0, 1) random
+c   numbers.  MK is the Log_2 of the size of each batch of uniform random
+c   numbers.  MK can be set for convenience on a given system, since it does
+c   not affect the results.
+
+      implicit none
+
+      include 'npbparams.h'
+      include 'mpinpb.h'
+
+      double precision Mops, epsilon, a, s, t1, t2, t3, t4, x, x1, 
+     >                 x2, q, sx, sy, tm, an, tt, gc, dum(3),
+     >                 timer_read
+      double precision sx_verify_value, sy_verify_value, sx_err, sy_err
+      integer          mk, mm, nn, nk, nq, np, ierr, node, no_nodes, 
+     >                 i, ik, kk, l, k, nit, ierrcode, no_large_nodes,
+     >                 np_add, k_offset, j
+      logical          verified, timers_enabled
+      parameter       (timers_enabled = .false.)
+      external         randlc, timer_read
+      double precision randlc, qq
+      character*15     size
+
+      parameter (mk = 16, mm = m - mk, nn = 2 ** mm,
+     >           nk = 2 ** mk, nq = 10, epsilon=1.d-8,
+     >           a = 1220703125.d0, s = 271828183.d0)
+
+      common/storage/ x(2*nk), q(0:nq-1), qq(10000)
+      data             dum /1.d0, 1.d0, 1.d0/
+
+      call mpi_init(ierr)
+      call mpi_comm_rank(MPI_COMM_WORLD,node,ierr)
+      call mpi_comm_size(MPI_COMM_WORLD,no_nodes,ierr)
+
+      root = 0
+
+      if (.not. convertdouble) then
+         dp_type = MPI_DOUBLE_PRECISION
+      else
+         dp_type = MPI_REAL
+      endif
+
+      if (node.eq.root)  then
+
+c   Because the size of the problem is too large to store in a 32-bit
+c   integer for some classes, we put it into a string (for printing).
+c   Have to strip off the decimal point put in there by the floating
+c   point print statement (internal file)
+
+          write(*, 1000)
+          write(size, '(f15.0)' ) 2.d0**(m+1)
+          j = 15
+          if (size(j:j) .eq. '.') j = j - 1
+          write (*,1001) size(1:j)
+          write(*, 1003) no_nodes
+
+ 1000 format(/,' NAS Parallel Benchmarks 3.3 -- EP Benchmark',/)
+ 1001     format(' Number of random numbers generated: ', a15)
+ 1003     format(' Number of active processes:         ', 2x, i13, /)
+
+      endif
+
+      verified = .false.
+
+c   Compute the number of "batches" of random number pairs generated 
+c   per processor. Adjust if the number of processors does not evenly 
+c   divide the total number
+
+      np = nn / no_nodes
+      no_large_nodes = mod(nn, no_nodes)
+      if (node .lt. no_large_nodes) then
+         np_add = 1
+      else
+         np_add = 0
+      endif
+      np = np + np_add
+
+      if (np .eq. 0) then
+         write (6, 1) no_nodes, nn
+ 1       format ('Too many nodes:',2i6)
+         call mpi_abort(MPI_COMM_WORLD,ierrcode,ierr)
+         stop
+      endif
+
+c   Call the random number generator functions and initialize
+c   the x-array to reduce the effects of paging on the timings.
+c   Also, call all mathematical functions that are used. Make
+c   sure these initializations cannot be eliminated as dead code.
+
+      call vranlc(0, dum(1), dum(2), dum(3))
+      dum(1) = randlc(dum(2), dum(3))
+      do 5    i = 1, 2*nk
+         x(i) = -1.d99
+ 5    continue
+      Mops = log(sqrt(abs(max(1.d0,1.d0))))
+
+c---------------------------------------------------------------------
+c      Synchronize before placing time stamp
+c---------------------------------------------------------------------
+      call mpi_barrier(MPI_COMM_WORLD, ierr)
+      
+      call timer_clear(1)
+      call timer_clear(2)
+      call timer_clear(3)
+      call timer_start(1)
+
+      t1 = a
+      call vranlc(0, t1, a, x)
+
+c   Compute AN = A ^ (2 * NK) (mod 2^46).
+
+      t1 = a
+
+      do 100 i = 1, mk + 1
+         t2 = randlc(t1, t1)
+ 100  continue
+
+      an = t1
+      tt = s
+      gc = 0.d0
+      sx = 0.d0
+      sy = 0.d0
+
+      do 110 i = 0, nq - 1
+         q(i) = 0.d0
+ 110  continue
+
+c   Each instance of this loop may be performed independently. We compute
+c   the k offsets separately to take into account the fact that some nodes
+c   have more numbers to generate than others
+
+      if (np_add .eq. 1) then
+         k_offset = node * np -1
+      else
+         k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1
+      endif
+
+      do 150 k = 1, np
+         kk = k_offset + k 
+         t1 = s
+         t2 = an
+
+c        Find starting seed t1 for this kk.
+
+         do 120 i = 1, 100
+            ik = kk / 2
+            if (2 * ik .ne. kk) t3 = randlc(t1, t2)
+            if (ik .eq. 0) goto 130
+            t3 = randlc(t2, t2)
+            kk = ik
+ 120     continue
+
+c        Compute uniform pseudorandom numbers.
+ 130     continue
+
+         if (timers_enabled) call timer_start(3)
+         call vranlc(2 * nk, t1, a, x)
+         if (timers_enabled) call timer_stop(3)
+
+c        Compute Gaussian deviates by acceptance-rejection method and 
+c        tally counts in concentric square annuli.  This loop is not 
+c        vectorizable. 
+
+         if (timers_enabled) call timer_start(2)
+
+         do 140 i = 1, nk
+            x1 = 2.d0 * x(2*i-1) - 1.d0
+            x2 = 2.d0 * x(2*i) - 1.d0
+            t1 = x1 ** 2 + x2 ** 2
+            if (t1 .le. 1.d0) then
+               t2   = sqrt(-2.d0 * log(t1) / t1)
+               t3   = (x1 * t2)
+               t4   = (x2 * t2)
+               l    = max(abs(t3), abs(t4))
+               q(l) = q(l) + 1.d0
+               sx   = sx + t3
+               sy   = sy + t4
+            endif
+ 140     continue
+
+         if (timers_enabled) call timer_stop(2)
+
+ 150  continue
+
+      call mpi_allreduce(sx, x, 1, dp_type,
+     >                   MPI_SUM, MPI_COMM_WORLD, ierr)
+      sx = x(1)
+      call mpi_allreduce(sy, x, 1, dp_type,
+     >                   MPI_SUM, MPI_COMM_WORLD, ierr)
+      sy = x(1)
+      call mpi_allreduce(q, x, nq, dp_type,
+     >                   MPI_SUM, MPI_COMM_WORLD, ierr)
+
+      do i = 1, nq
+         q(i-1) = x(i)
+      enddo
+
+      do 160 i = 0, nq - 1
+        gc = gc + q(i)
+ 160  continue
+
+      call timer_stop(1)
+      tm  = timer_read(1)
+
+      call mpi_allreduce(tm, x, 1, dp_type,
+     >                   MPI_MAX, MPI_COMM_WORLD, ierr)
+      tm = x(1)
+
+      if (node.eq.root) then
+         nit=0
+         verified = .true.
+         if (m.eq.24) then
+            sx_verify_value = -3.247834652034740D+3
+            sy_verify_value = -6.958407078382297D+3
+         elseif (m.eq.25) then
+            sx_verify_value = -2.863319731645753D+3
+            sy_verify_value = -6.320053679109499D+3
+         elseif (m.eq.28) then
+            sx_verify_value = -4.295875165629892D+3
+            sy_verify_value = -1.580732573678431D+4
+         elseif (m.eq.30) then
+            sx_verify_value =  4.033815542441498D+4
+            sy_verify_value = -2.660669192809235D+4
+         elseif (m.eq.32) then
+            sx_verify_value =  4.764367927995374D+4
+            sy_verify_value = -8.084072988043731D+4
+         elseif (m.eq.36) then
+            sx_verify_value =  1.982481200946593D+5
+            sy_verify_value = -1.020596636361769D+5
+         elseif (m.eq.40) then
+            sx_verify_value = -5.319717441530D+05
+            sy_verify_value = -3.688834557731D+05
+         else
+            verified = .false.
+         endif
+         if (verified) then
+            sx_err = abs((sx - sx_verify_value)/sx_verify_value)
+            sy_err = abs((sy - sy_verify_value)/sy_verify_value)
+            verified = ((sx_err.le.epsilon) .and. (sy_err.le.epsilon))
+         endif
+         Mops = 2.d0**(m+1)/tm/1000000.d0
+
+         write (6,11) tm, m, gc, sx, sy, (i, q(i), i = 0, nq - 1)
+ 11      format ('EP Benchmark Results:'//'CPU Time =',f10.4/'N = 2^',
+     >           i5/'No. Gaussian Pairs =',f15.0/'Sums = ',1p,2d25.15/
+     >           'Counts:'/(i3,0p,f15.0))
+
+         call print_results('EP', class, m+1, 0, 0, nit, npm, 
+     >                      no_nodes, tm, Mops, 
+     >                      'Random numbers generated', 
+     >                      verified, npbversion, compiletime, cs1,
+     >                      cs2, cs3, cs4, cs5, cs6, cs7)
+
+      endif
+
+      if (timers_enabled .and. (node .eq. root)) then
+          print *, 'Total time:     ', timer_read(1)
+          print *, 'Gaussian pairs: ', timer_read(2)
+          print *, 'Random numbers: ', timer_read(3)
+      endif
+
+      call mpi_finalize(ierr)
+
+      end
diff --git a/examples/smpi/NAS/EP/mpinpb.h b/examples/smpi/NAS/EP/mpinpb.h
new file mode 100644
index 0000000000..1f13637ae4
--- /dev/null
+++ b/examples/smpi/NAS/EP/mpinpb.h
@@ -0,0 +1,9 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'mpif.h'
+
+      integer           me, nprocs, root, dp_type
+      common /mpistuff/ me, nprocs, root, dp_type
+
diff --git a/examples/smpi/NAS/EP/randlc.c b/examples/smpi/NAS/EP/randlc.c
new file mode 100644
index 0000000000..624b800df8
--- /dev/null
+++ b/examples/smpi/NAS/EP/randlc.c
@@ -0,0 +1,107 @@
+
+/*
+ *    FUNCTION RANDLC (X, A)
+ *
+ *  This routine returns a uniform pseudorandom double precision number in the
+ *  range (0, 1) by using the linear congruential generator
+ *
+ *  x_{k+1} = a x_k  (mod 2^46)
+ *
+ *  where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
+ *  before repeating.  The argument A is the same as 'a' in the above formula,
+ *  and X is the same as x_0.  A and X must be odd double precision integers
+ *  in the range (1, 2^46).  The returned value RANDLC is normalized to be
+ *  between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
+ *  the new seed x_1, so that subsequent calls to RANDLC using the same
+ *  arguments will generate a continuous sequence.
+ *
+ *  This routine should produce the same results on any computer with at least
+ *  48 mantissa bits in double precision floating point data.  On Cray systems,
+ *  double precision should be disabled.
+ *
+ *  David H. Bailey     October 26, 1990
+ *
+ *     IMPLICIT DOUBLE PRECISION (A-H, O-Z)
+ *     SAVE KS, R23, R46, T23, T46
+ *     DATA KS/0/
+ *
+ *  If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
+ *  T23 = 2 ^ 23, and T46 = 2 ^ 46.  These are computed in loops, rather than
+ *  by merely using the ** operator, in order to insure that the results are
+ *  exact on all systems.  This code assumes that 0.5D0 is represented exactly.
+ */
+
+
+/*****************************************************************/
+/*************           R  A  N  D  L  C             ************/
+/*************                                        ************/
+/*************    portable random number generator    ************/
+/*****************************************************************/
+
+double	randlc( double *X, double *A )
+{
+      static int        KS=0;
+      static double	R23, R46, T23, T46;
+      double		T1, T2, T3, T4;
+      double		A1;
+      double		A2;
+      double		X1;
+      double		X2;
+      double		Z;
+      int     		i, j;
+
+      if (KS == 0) 
+      {
+        R23 = 1.0;
+        R46 = 1.0;
+        T23 = 1.0;
+        T46 = 1.0;
+    
+        for (i=1; i<=23; i++)
+        {
+          R23 = 0.50 * R23;
+          T23 = 2.0 * T23;
+        }
+        for (i=1; i<=46; i++)
+        {
+          R46 = 0.50 * R46;
+          T46 = 2.0 * T46;
+        }
+        KS = 1;
+      }
+
+/*  Break A into two parts such that A = 2^23 * A1 + A2 and set X = N.  */
+
+      T1 = R23 * *A;
+      j  = T1;
+      A1 = j;
+      A2 = *A - T23 * A1;
+
+/*  Break X into two parts such that X = 2^23 * X1 + X2, compute
+    Z = A1 * X2 + A2 * X1  (mod 2^23), and then
+    X = 2^23 * Z + A2 * X2  (mod 2^46).                            */
+
+      T1 = R23 * *X;
+      j  = T1;
+      X1 = j;
+      X2 = *X - T23 * X1;
+      T1 = A1 * X2 + A2 * X1;
+      
+      j  = R23 * T1;
+      T2 = j;
+      Z = T1 - T23 * T2;
+      T3 = T23 * Z + A2 * X2;
+      j  = R46 * T3;
+      T4 = j;
+      *X = T3 - T46 * T4;
+      return(R46 * *X);
+} 
+
+
+
+/*****************************************************************/
+/************   F  I  N  D  _  M  Y  _  S  E  E  D    ************/
+/************                                         ************/
+/************ returns parallel random number seq seed ************/
+/*****************************************************************/
+
diff --git a/examples/smpi/NAS/EP/randlc.h b/examples/smpi/NAS/EP/randlc.h
new file mode 100644
index 0000000000..aff84d341a
--- /dev/null
+++ b/examples/smpi/NAS/EP/randlc.h
@@ -0,0 +1,3 @@
+
+double      randlc( double *X, double *A );
+
diff --git a/examples/smpi/NAS/FT/Makefile b/examples/smpi/NAS/FT/Makefile
new file mode 100644
index 0000000000..1cc6e1416b
--- /dev/null
+++ b/examples/smpi/NAS/FT/Makefile
@@ -0,0 +1,23 @@
+SHELL=/bin/sh
+BENCHMARK=ft
+BENCHMARKU=FT
+
+include ../config/make.def
+
+include ../sys/make.common
+
+OBJS = ft.o ${COMMON}/${RAND}.o ${COMMON}/print_results.o ${COMMON}/timers.o
+
+${PROGRAM}: config ${OBJS}
+	${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
+
+
+
+.f.o:
+	${FCOMPILE} $<
+
+ft.o:             ft.f  global.h mpinpb.h npbparams.h
+
+clean:
+	- rm -f *.o *~ mputil*
+	- rm -f ft npbparams.h core
diff --git a/examples/smpi/NAS/FT/README b/examples/smpi/NAS/FT/README
new file mode 100644
index 0000000000..ab08b363b2
--- /dev/null
+++ b/examples/smpi/NAS/FT/README
@@ -0,0 +1,5 @@
+This code implements the time integration of a three-dimensional
+partial differential equation using the Fast Fourier Transform.
+Some of the dimension statements are not F77 conforming and will
+not work using the g77 compiler. All dimension statements,
+however, are legal F90.
\ No newline at end of file
diff --git a/examples/smpi/NAS/FT/ft.f b/examples/smpi/NAS/FT/ft.f
new file mode 100644
index 0000000000..8ab25b9780
--- /dev/null
+++ b/examples/smpi/NAS/FT/ft.f
@@ -0,0 +1,1993 @@
+!-------------------------------------------------------------------------!
+!                                                                         !
+!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
+!                                                                         !
+!                                   F T                                   !
+!                                                                         !
+!-------------------------------------------------------------------------!
+!                                                                         !
+!    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
+!    It is described in NAS Technical Reports 95-020 and 02-007           !
+!                                                                         !
+!    Permission to use, copy, distribute and modify this software         !
+!    for any purpose with or without fee is hereby granted.  We           !
+!    request, however, that all derived work reference the NAS            !
+!    Parallel Benchmarks 3.3. This software is provided "as is"           !
+!    without express or implied warranty.                                 !
+!                                                                         !
+!    Information on NPB 3.3, including the technical report, the          !
+!    original specifications, source code, results and information        !
+!    on how to submit new results, is available at:                       !
+!                                                                         !
+!           http://www.nas.nasa.gov/Software/NPB/                         !
+!                                                                         !
+!    Send comments or suggestions to  npb@nas.nasa.gov                    !
+!                                                                         !
+!          NAS Parallel Benchmarks Group                                  !
+!          NASA Ames Research Center                                      !
+!          Mail Stop: T27A-1                                              !
+!          Moffett Field, CA   94035-1000                                 !
+!                                                                         !
+!          E-mail:  npb@nas.nasa.gov                                      !
+!          Fax:     (650) 604-3957                                        !
+!                                                                         !
+!-------------------------------------------------------------------------!
+
+!TO REDUCE THE AMOUNT OF MEMORY REQUIRED BY THE BENCHMARK WE NO LONGER
+!STORE THE ENTIRE TIME EVOLUTION ARRAY "EX" FOR ALL TIME STEPS, BUT
+!JUST FOR THE FIRST. ALSO, IT IS STORED ONLY FOR THE PART OF THE GRID
+!FOR WHICH THE CALLING PROCESSOR IS RESPONSIBLE, SO THAT THE MEMORY 
+!USAGE BECOMES SCALABLE. THIS NEW ARRAY IS CALLED "TWIDDLE" (SEE
+!NPB3.0-SER)
+
+!TO AVOID PROBLEMS WITH VERY LARGE ARRAY SIZES THAT ARE COMPUTED BY
+!MULTIPLYING GRID DIMENSIONS (CAUSING INTEGER OVERFLOW IN THE VARIABLE
+!NTOTAL) AND SUBSEQUENTLY DIVIDING BY THE NUMBER OF PROCESSORS, WE
+!COMPUTE THE SIZE OF ARRAY PARTITIONS MORE CONSERVATIVELY AS
+!((NX*NY)/NP)*NZ, WHERE NX, NY, AND NZ ARE GRID DIMENSIONS AND NP IS
+!THE NUMBER OF PROCESSORS, THE RESULT IS STORED IN "NTDIVNP". FOR THE 
+!PERFORMANCE CALCULATION WE STORE THE TOTAL NUMBER OF GRID POINTS IN A 
+!FLOATING POINT NUMBER "NTOTAL_F" INSTEAD OF AN INTEGER.
+!THIS FIX WILL FAIL IF THE NUMBER OF PROCESSORS IS SMALL.
+
+!UGLY HACK OF SUBROUTINE IPOW46: FOR VERY LARGE GRIDS THE SINGLE EXPONENT
+!FROM NPB2.3 MAY NOT FIT IN A 32-BIT INTEGER. HOWEVER, WE KNOW THAT THE
+!"EXPONENT" ARGUMENT OF THIS ROUTINE CAN ALWAYS BE FACTORED INTO A TERM 
+!DIVISIBLE BY NX (EXP_1) AND ANOTHER TERM (EXP_2). NX IS USUALLY A POWER
+!OF TWO, SO WE CAN KEEP HALVING IT UNTIL THE PRODUCT OF EXP_1
+!AND EXP_2 IS SMALL ENOUGH (NAMELY EXP_2 ITSELF). THIS UPDATED VERSION
+!OF IPWO46, WHICH NOW TAKES THE TWO FACTORS OF "EXPONENT" AS SEPARATE
+!ARGUMENTS, MAY BREAK DOWN IF EXP_1 DOES NOT CONTAIN A LARGE POWER OF TWO.
+
+c---------------------------------------------------------------------
+c
+c Authors: D. Bailey
+c          W. Saphir
+c          R. F. Van der Wijngaart
+c
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c FT benchmark
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      program ft
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpif.h'
+      include 'global.h'
+      integer i, ierr
+      
+c---------------------------------------------------------------------
+c u0, u1, u2 are the main arrays in the problem. 
+c Depending on the decomposition, these arrays will have different 
+c dimensions. To accomodate all possibilities, we allocate them as 
+c one-dimensional arrays and pass them to subroutines for different 
+c views
+c  - u0 contains the initial (transformed) initial condition
+c  - u1 and u2 are working arrays
+c---------------------------------------------------------------------
+
+      double complex   u0(ntdivnp), 
+     >                 u1(ntdivnp), 
+     >                 u2(ntdivnp)
+      double precision twiddle(ntdivnp)
+c---------------------------------------------------------------------
+c Large arrays are in common so that they are allocated on the
+c heap rather than the stack. This common block is not
+c referenced directly anywhere else. Padding is to avoid accidental 
+c cache problems, since all array sizes are powers of two.
+c---------------------------------------------------------------------
+
+      double complex pad1(3), pad2(3), pad3(3)
+      common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle
+
+      integer iter
+      double precision total_time, mflops
+      logical verified
+      character class
+
+      call MPI_Init(ierr)
+
+c---------------------------------------------------------------------
+c Run the entire problem once to make sure all data is touched. 
+c This reduces variable startup costs, which is important for such a 
+c short benchmark. The other NPB 2 implementations are similar. 
+c---------------------------------------------------------------------
+      do i = 1, t_max
+         call timer_clear(i)
+      end do
+
+      call setup()
+      call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3))
+      call compute_initial_conditions(u1, dims(1,1), dims(2,1), 
+     >                                dims(3,1))
+      call fft_init (dims(1,1))
+      call fft(1, u1, u0)
+
+c---------------------------------------------------------------------
+c Start over from the beginning. Note that all operations must
+c be timed, in contrast to other benchmarks. 
+c---------------------------------------------------------------------
+      do i = 1, t_max
+         call timer_clear(i)
+      end do
+      call MPI_Barrier(MPI_COMM_WORLD, ierr)
+
+      call timer_start(T_total)
+      if (timers_enabled) call timer_start(T_setup)
+
+      call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3))
+      call compute_initial_conditions(u1, dims(1,1), dims(2,1), 
+     >                                dims(3,1))
+      call fft_init (dims(1,1))
+
+      if (timers_enabled) call synchup()
+      if (timers_enabled) call timer_stop(T_setup)
+
+      if (timers_enabled) call timer_start(T_fft)
+      call fft(1, u1, u0)
+      if (timers_enabled) call timer_stop(T_fft)
+
+      do iter = 1, niter
+         if (timers_enabled) call timer_start(T_evolve)
+         call evolve(u0, u1, twiddle, dims(1,1), dims(2,1), dims(3,1))
+         if (timers_enabled) call timer_stop(T_evolve)
+         if (timers_enabled) call timer_start(T_fft)
+         call fft(-1, u1, u2)
+         if (timers_enabled) call timer_stop(T_fft)
+         if (timers_enabled) call synchup()
+         if (timers_enabled) call timer_start(T_checksum)
+         call checksum(iter, u2, dims(1,1), dims(2,1), dims(3,1))
+         if (timers_enabled) call timer_stop(T_checksum)
+      end do
+
+      call verify(nx, ny, nz, niter, verified, class)
+      call timer_stop(t_total)
+      if (np .ne. np_min) verified = .false.
+      total_time = timer_read(t_total)
+
+      if( total_time .ne. 0. ) then
+         mflops = 1.0d-6*ntotal_f *
+     >             (14.8157+7.19641*log(ntotal_f)
+     >          +  (5.23518+7.21113*log(ntotal_f))*niter)
+     >                 /total_time
+      else
+         mflops = 0.0
+      endif
+      if (me .eq. 0) then
+         call print_results('FT', class, nx, ny, nz, niter, np_min, np,
+     >     total_time, mflops, '          floating point', verified, 
+     >     npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7)
+      endif
+      if (timers_enabled) call print_timers()
+      call MPI_Finalize(ierr)
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine evolve(u0, u1, twiddle, d1, d2, d3)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c evolve u0 -> u1 (t time steps) in fourier space
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'global.h'
+      integer d1, d2, d3
+      double precision exi
+      double complex u0(d1,d2,d3)
+      double complex u1(d1,d2,d3)
+      double precision twiddle(d1,d2,d3)
+      integer i, j, k
+
+      do k = 1, d3
+         do j = 1, d2
+            do i = 1, d1
+               u0(i,j,k) = u0(i,j,k)*(twiddle(i,j,k))
+               u1(i,j,k) = u0(i,j,k)
+            end do
+         end do
+      end do
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine compute_initial_conditions(u0, d1, d2, d3)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c Fill in array u0 with initial conditions from 
+c random number generator 
+c---------------------------------------------------------------------
+      implicit none
+      include 'global.h'
+      integer d1, d2, d3
+      double complex u0(d1, d2, d3)
+      integer k
+      double precision x0, start, an, dummy
+      
+c---------------------------------------------------------------------
+c 0-D and 1-D layouts are easy because each processor gets a contiguous
+c chunk of the array, in the Fortran ordering sense. 
+c For a 2-D layout, it's a bit more complicated. We always
+c have entire x-lines (contiguous) in processor. 
+c We can do ny/np1 of them at a time since we have
+c ny/np1 contiguous in y-direction. But then we jump
+c by z-planes (nz/np2 of them, total). 
+c For the 0-D and 1-D layouts we could do larger chunks, but
+c this turns out to have no measurable impact on performance. 
+c---------------------------------------------------------------------
+
+
+      start = seed                                    
+c---------------------------------------------------------------------
+c Jump to the starting element for our first plane.
+c---------------------------------------------------------------------
+      call ipow46(a, 2*nx, (zstart(1)-1)*ny + (ystart(1)-1), an)
+      dummy = randlc(start, an)
+      call ipow46(a, 2*nx, ny, an)
+      
+c---------------------------------------------------------------------
+c Go through by z planes filling in one square at a time.
+c---------------------------------------------------------------------
+      do k = 1, dims(3, 1) ! nz/np2
+         x0 = start
+         call vranlc(2*nx*dims(2, 1), x0, a, u0(1, 1, k))
+         if (k .ne. dims(3, 1)) dummy = randlc(start, an)
+      end do
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine ipow46(a, exp_1, exp_2, result)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c compute a^exponent mod 2^46
+c---------------------------------------------------------------------
+
+      implicit none
+      double precision a, result, dummy, q, r
+      integer exp_1, exp_2, n, n2, ierr
+      external randlc
+      double precision randlc
+      logical  two_pow
+c---------------------------------------------------------------------
+c Use
+c   a^n = a^(n/2)*a^(n/2) if n even else
+c   a^n = a*a^(n-1)       if n odd
+c---------------------------------------------------------------------
+      result = 1
+      if (exp_2 .eq. 0 .or. exp_1 .eq. 0) return
+      q = a
+      r = 1
+      n = exp_1
+      two_pow = .true.
+
+      do while (two_pow)
+         n2 = n/2
+         if (n2 * 2 .eq. n) then
+            dummy = randlc(q, q)
+            n = n2
+         else
+            n = n * exp_2
+            two_pow = .false.
+         endif
+      end do
+
+      do while (n .gt. 1)
+         n2 = n/2
+         if (n2 * 2 .eq. n) then
+            dummy = randlc(q, q) 
+            n = n2
+         else
+            dummy = randlc(r, q)
+            n = n-1
+         endif
+      end do
+      dummy = randlc(r, q)
+      result = r
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine setup
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'mpinpb.h'
+      include 'global.h'
+
+      integer ierr, i, j, fstatus
+      debug = .FALSE.
+      
+      call MPI_Comm_size(MPI_COMM_WORLD, np, ierr)
+      call MPI_Comm_rank(MPI_COMM_WORLD, me, ierr)
+
+      if (.not. convertdouble) then
+         dc_type = MPI_DOUBLE_COMPLEX
+      else
+         dc_type = MPI_COMPLEX
+      endif
+
+
+      if (me .eq. 0) then
+         write(*, 1000)
+         open (unit=2,file='inputft.data',status='old', iostat=fstatus)
+
+         if (fstatus .eq. 0) then
+            write(*,233) 
+ 233        format(' Reading from input file inputft.data')
+            read (2,*) niter
+            read (2,*) layout_type
+            read (2,*) np1, np2
+            close(2)
+
+c---------------------------------------------------------------------
+c check to make sure input data is consistent
+c---------------------------------------------------------------------
+
+    
+c---------------------------------------------------------------------
+c 1. product of processor grid dims must equal number of processors
+c---------------------------------------------------------------------
+
+            if (np1 * np2 .ne. np) then
+               write(*, 238)
+ 238           format(' np1 and np2 given in input file are not valid.')
+               write(*, 239) np1*np2, np
+ 239           format(' Product is ', i5, ' and should be ', i5)
+               call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
+            endif
+
+c---------------------------------------------------------------------
+c 2. layout type must be valid
+c---------------------------------------------------------------------
+
+            if (layout_type .ne. layout_0D .and.
+     >          layout_type .ne. layout_1D .and.
+     >          layout_type .ne. layout_2D) then
+               write(*, 240)
+ 240           format(' Layout type specified in inputft.data is 
+     >                  invalid ')
+               call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
+            endif
+
+c---------------------------------------------------------------------
+c 3. 0D layout must be 1x1 grid
+c---------------------------------------------------------------------
+
+            if (layout_type .eq. layout_0D .and.
+     >            (np1 .ne.1 .or. np2 .ne. 1)) then
+               write(*, 241)
+ 241           format(' For 0D layout, both np1 and np2 must be 1 ')
+               call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
+            endif
+c---------------------------------------------------------------------
+c 4. 1D layout must be 1xN grid
+c---------------------------------------------------------------------
+
+            if (layout_type .eq. layout_1D .and. np1 .ne. 1) then
+               write(*, 242)
+ 242           format(' For 1D layout, np1 must be 1 ')
+               call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
+            endif
+
+         else
+            write(*,234) 
+            niter = niter_default
+            if (np .eq. 1) then
+               np1 = 1
+               np2 = 1
+               layout_type = layout_0D
+            else if (np .le. nz) then
+               np1 = 1
+               np2 = np
+               layout_type = layout_1D
+            else
+               np1 = nz
+               np2 = np/nz
+               layout_type = layout_2D
+            endif
+         endif
+
+         if (np .lt. np_min) then
+            write(*, 10) np_min
+ 10         format(' Error: Compiled for ', I5, ' processors. ')
+            write(*, 11) np
+ 11         format(' Only ',  i5, ' processors found ')
+            call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
+         endif
+
+ 234     format(' No input file inputft.data. Using compiled defaults')
+         write(*, 1001) nx, ny, nz
+         write(*, 1002) niter
+         write(*, 1004) np
+         write(*, 1005) np1, np2
+         if (np .ne. np_min) write(*, 1006) np_min
+
+         if (layout_type .eq. layout_0D) then
+            write(*, 1010) '0D'
+         else if (layout_type .eq. layout_1D) then
+            write(*, 1010) '1D'
+         else
+            write(*, 1010) '2D'
+         endif
+
+ 1000 format(//,' NAS Parallel Benchmarks 3.3 -- FT Benchmark',/)
+ 1001    format(' Size                : ', i4, 'x', i4, 'x', i4)
+ 1002    format(' Iterations          : ', 7x, i7)
+ 1004    format(' Number of processes : ', 7x, i7)
+ 1005    format(' Processor array     : ', 5x, i4, 'x', i4)
+ 1006    format(' WARNING: compiled for ', i5, ' processes. ',
+     >          ' Will not verify. ')
+ 1010    format(' Layout type         : ', 9x, A5)
+      endif
+
+
+c---------------------------------------------------------------------
+c Since np1, np2 and layout_type are in a common block, 
+c this sends all three. 
+c---------------------------------------------------------------------
+      call MPI_BCAST(np1, 3, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
+      call MPI_BCAST(niter, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
+
+      if (np1 .eq. 1 .and. np2 .eq. 1) then
+        layout_type = layout_0D
+      else if (np1 .eq. 1) then
+         layout_type = layout_1D
+      else
+         layout_type = layout_2D
+      endif
+
+      if (layout_type .eq. layout_0D) then
+         do i = 1, 3
+            dims(1, i) = nx
+            dims(2, i) = ny
+            dims(3, i) = nz
+         end do
+      else if (layout_type .eq. layout_1D) then
+         dims(1, 1) = nx
+         dims(2, 1) = ny
+         dims(3, 1) = nz
+
+         dims(1, 2) = nx
+         dims(2, 2) = ny
+         dims(3, 2) = nz
+
+         dims(1, 3) = nz
+         dims(2, 3) = nx
+         dims(3, 3) = ny
+      else if (layout_type .eq. layout_2D) then
+         dims(1, 1) = nx
+         dims(2, 1) = ny
+         dims(3, 1) = nz
+
+         dims(1, 2) = ny
+         dims(2, 2) = nx
+         dims(3, 2) = nz
+
+         dims(1, 3) = nz
+         dims(2, 3) = nx
+         dims(3, 3) = ny
+
+      endif
+      do i = 1, 3
+         dims(2, i) = dims(2, i) / np1
+         dims(3, i) = dims(3, i) / np2
+      end do
+
+
+c---------------------------------------------------------------------
+c Determine processor coordinates of this processor
+c Processor grid is np1xnp2. 
+c Arrays are always (n1, n2/np1, n3/np2)
+c Processor coords are zero-based. 
+c---------------------------------------------------------------------
+      me2 = mod(me, np2)  ! goes from 0...np2-1
+      me1 = me/np2        ! goes from 0...np1-1
+c---------------------------------------------------------------------
+c Communicators for rows/columns of processor grid. 
+c commslice1 is communicator of all procs with same me1, ranked as me2
+c commslice2 is communicator of all procs with same me2, ranked as me1
+c mpi_comm_split(comm, color, key, ...)
+c---------------------------------------------------------------------
+      call MPI_Comm_split(MPI_COMM_WORLD, me1, me2, commslice1, ierr)
+      call MPI_Comm_split(MPI_COMM_WORLD, me2, me1, commslice2, ierr)
+      if (timers_enabled) call synchup()
+
+      if (debug) print *, 'proc coords: ', me, me1, me2
+
+c---------------------------------------------------------------------
+c Determine which section of the grid is owned by this
+c processor. 
+c---------------------------------------------------------------------
+      if (layout_type .eq. layout_0d) then
+
+         do i = 1, 3
+            xstart(i) = 1
+            xend(i)   = nx
+            ystart(i) = 1
+            yend(i)   = ny
+            zstart(i) = 1
+            zend(i)   = nz
+         end do
+
+      else if (layout_type .eq. layout_1d) then
+
+         xstart(1) = 1
+         xend(1)   = nx
+         ystart(1) = 1
+         yend(1)   = ny
+         zstart(1) = 1 + me2 * nz/np2
+         zend(1)   = (me2+1) * nz/np2
+
+         xstart(2) = 1
+         xend(2)   = nx
+         ystart(2) = 1
+         yend(2)   = ny
+         zstart(2) = 1 + me2 * nz/np2
+         zend(2)   = (me2+1) * nz/np2
+
+         xstart(3) = 1
+         xend(3)   = nx
+         ystart(3) = 1 + me2 * ny/np2
+         yend(3)   = (me2+1) * ny/np2
+         zstart(3) = 1
+         zend(3)   = nz
+
+      else if (layout_type .eq. layout_2d) then
+
+         xstart(1) = 1
+         xend(1)   = nx
+         ystart(1) = 1 + me1 * ny/np1
+         yend(1)   = (me1+1) * ny/np1
+         zstart(1) = 1 + me2 * nz/np2
+         zend(1)   = (me2+1) * nz/np2
+
+         xstart(2) = 1 + me1 * nx/np1
+         xend(2)   = (me1+1)*nx/np1
+         ystart(2) = 1
+         yend(2)   = ny
+         zstart(2) = zstart(1)
+         zend(2)   = zend(1)
+
+         xstart(3) = xstart(2)
+         xend(3)   = xend(2)
+         ystart(3) = 1 + me2 *ny/np2
+         yend(3)   = (me2+1)*ny/np2
+         zstart(3) = 1
+         zend(3)   = nz
+      endif
+
+c---------------------------------------------------------------------
+c Set up info for blocking of ffts and transposes.  This improves
+c performance on cache-based systems. Blocking involves
+c working on a chunk of the problem at a time, taking chunks
+c along the first, second, or third dimension. 
+c
+c - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
+c - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
+
+c Since 1st dim is always in processor, we'll assume it's long enough 
+c (default blocking factor is 16 so min size for 1st dim is 16)
+c The only case we have to worry about is cffts1 in a 2d decomposition. 
+c so the blocking factor should not be larger than the 2nd dimension. 
+c---------------------------------------------------------------------
+
+      fftblock = fftblock_default
+      fftblockpad = fftblockpad_default
+
+      if (layout_type .eq. layout_2d) then
+         if (dims(2, 1) .lt. fftblock) fftblock = dims(2, 1)
+         if (dims(2, 2) .lt. fftblock) fftblock = dims(2, 2)
+         if (dims(2, 3) .lt. fftblock) fftblock = dims(2, 3)
+      endif
+      
+      if (fftblock .ne. fftblock_default) fftblockpad = fftblock+3
+
+      return
+      end
+
+      
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine compute_indexmap(twiddle, d1, d2, d3)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 
+c for time evolution exponent. 
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'mpinpb.h'
+      include 'global.h'
+      integer d1, d2, d3
+      integer i, j, k, ii, ii2, jj, ij2, kk
+      double precision ap, twiddle(d1, d2, d3)
+
+c---------------------------------------------------------------------
+c this function is very different depending on whether 
+c we are in the 0d, 1d or 2d layout. Compute separately. 
+c basically we want to convert the fortran indices 
+c   1 2 3 4 5 6 7 8 
+c to 
+c   0 1 2 3 -4 -3 -2 -1
+c The following magic formula does the trick:
+c mod(i-1+n/2, n) - n/2
+c---------------------------------------------------------------------
+
+      ap = - 4.d0 * alpha * pi *pi
+
+      if (layout_type .eq. layout_0d) then ! xyz layout
+         do i = 1, dims(1,3)
+            ii =  mod(i+xstart(3)-2+nx/2, nx) - nx/2
+            ii2 = ii*ii
+            do j = 1, dims(2,3)
+               jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
+               ij2 = jj*jj+ii2
+               do k = 1, dims(3,3)
+                  kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
+                  twiddle(i,j,k) = dexp(ap*dfloat(kk*kk+ij2))
+               end do
+            end do
+         end do
+      else if (layout_type .eq. layout_1d) then ! zxy layout 
+         do i = 1,dims(2,3)
+            ii =  mod(i+xstart(3)-2+nx/2, nx) - nx/2
+            ii2 = ii*ii
+            do j = 1,dims(3,3)
+               jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
+               ij2 = jj*jj+ii2
+               do k = 1,dims(1,3)
+                  kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
+                  twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2))
+               end do
+            end do
+         end do
+      else if (layout_type .eq. layout_2d) then ! zxy layout
+         do i = 1,dims(2,3)
+            ii =  mod(i+xstart(3)-2+nx/2, nx) - nx/2
+            ii2 = ii*ii
+            do j = 1, dims(3,3)
+               jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
+               ij2 = jj*jj+ii2
+               do k =1,dims(1,3)
+                  kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
+                  twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2))
+               end do
+            end do
+         end do
+      else
+         print *, ' Unknown layout type ', layout_type
+         stop
+      endif
+
+      return
+      end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine print_timers()
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      integer i
+      include 'global.h'
+      character*25 tstrings(T_max)
+      data tstrings / '          total ', 
+     >                '          setup ', 
+     >                '            fft ', 
+     >                '         evolve ', 
+     >                '       checksum ', 
+     >                '         fftlow ', 
+     >                '        fftcopy ', 
+     >                '      transpose ', 
+     >                ' transpose1_loc ', 
+     >                ' transpose1_glo ', 
+     >                ' transpose1_fin ', 
+     >                ' transpose2_loc ', 
+     >                ' transpose2_glo ', 
+     >                ' transpose2_fin ', 
+     >                '           sync ' /
+
+      if (me .ne. 0) return
+      do i = 1, t_max
+         if (timer_read(i) .ne. 0.0d0) then
+            write(*, 100) i, tstrings(i), timer_read(i)
+         endif
+      end do
+ 100  format(' timer ', i2, '(', A16,  ') :', F10.6)
+      return
+      end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine fft(dir, x1, x2)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'global.h'
+      integer dir
+      double complex x1(ntdivnp), x2(ntdivnp)
+
+      double complex scratch(fftblockpad_default*maxdim*2)
+
+c---------------------------------------------------------------------
+c note: args x1, x2 must be different arrays
+c note: args for cfftsx are (direction, layout, xin, xout, scratch)
+c       xin/xout may be the same and it can be somewhat faster
+c       if they are
+c note: args for transpose are (layout1, layout2, xin, xout)
+c       xin/xout must be different
+c---------------------------------------------------------------------
+
+      if (dir .eq. 1) then
+         if (layout_type .eq. layout_0d) then
+            call cffts1(1, dims(1,1), dims(2,1), dims(3,1), 
+     >                  x1, x1, scratch)
+            call cffts2(1, dims(1,2), dims(2,2), dims(3,2), 
+     >                  x1, x1, scratch)
+            call cffts3(1, dims(1,3), dims(2,3), dims(3,3), 
+     >                  x1, x2, scratch)
+         else if (layout_type .eq. layout_1d) then
+            call cffts1(1, dims(1,1), dims(2,1), dims(3,1), 
+     >                  x1, x1, scratch)
+            call cffts2(1, dims(1,2), dims(2,2), dims(3,2), 
+     >                  x1, x1, scratch)
+            if (timers_enabled) call timer_start(T_transpose)
+            call transpose_xy_z(2, 3, x1, x2)
+            if (timers_enabled) call timer_stop(T_transpose)
+            call cffts1(1, dims(1,3), dims(2,3), dims(3,3), 
+     >                  x2, x2, scratch)
+         else if (layout_type .eq. layout_2d) then
+            call cffts1(1, dims(1,1), dims(2,1), dims(3,1), 
+     >                  x1, x1, scratch)
+            if (timers_enabled) call timer_start(T_transpose)
+            call transpose_x_y(1, 2, x1, x2)
+            if (timers_enabled) call timer_stop(T_transpose)
+            call cffts1(1, dims(1,2), dims(2,2), dims(3,2), 
+     >                  x2, x2, scratch)
+            if (timers_enabled) call timer_start(T_transpose)
+            call transpose_x_z(2, 3, x2, x1)
+            if (timers_enabled) call timer_stop(T_transpose)
+            call cffts1(1, dims(1,3), dims(2,3), dims(3,3), 
+     >                  x1, x2, scratch)
+         endif
+      else
+         if (layout_type .eq. layout_0d) then
+            call cffts3(-1, dims(1,3), dims(2,3), dims(3,3), 
+     >                  x1, x1, scratch)
+            call cffts2(-1, dims(1,2), dims(2,2), dims(3,2), 
+     >                  x1, x1, scratch)
+            call cffts1(-1, dims(1,1), dims(2,1), dims(3,1), 
+     >                  x1, x2, scratch)
+         else if (layout_type .eq. layout_1d) then
+            call cffts1(-1, dims(1,3), dims(2,3), dims(3,3), 
+     >                  x1, x1, scratch)
+            if (timers_enabled) call timer_start(T_transpose)
+            call transpose_x_yz(3, 2, x1, x2)
+            if (timers_enabled) call timer_stop(T_transpose)
+            call cffts2(-1, dims(1,2), dims(2,2), dims(3,2), 
+     >                  x2, x2, scratch)
+            call cffts1(-1, dims(1,1), dims(2,1), dims(3,1), 
+     >                  x2, x2, scratch)
+         else if (layout_type .eq. layout_2d) then
+            call cffts1(-1, dims(1,3), dims(2,3), dims(3,3), 
+     >                  x1, x1, scratch)
+            if (timers_enabled) call timer_start(T_transpose)
+            call transpose_x_z(3, 2, x1, x2)
+            if (timers_enabled) call timer_stop(T_transpose)
+            call cffts1(-1, dims(1,2), dims(2,2), dims(3,2), 
+     >                  x2, x2, scratch)
+            if (timers_enabled) call timer_start(T_transpose)
+            call transpose_x_y(2, 1, x2, x1)
+            if (timers_enabled) call timer_stop(T_transpose)
+            call cffts1(-1, dims(1,1), dims(2,1), dims(3,1), 
+     >                  x1, x2, scratch)
+         endif
+      endif
+      return
+      end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine cffts1(is, d1, d2, d3, x, xout, y)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'global.h'
+      integer is, d1, d2, d3, logd1
+      double complex x(d1,d2,d3)
+      double complex xout(d1,d2,d3)
+      double complex y(fftblockpad, d1, 2) 
+      integer i, j, k, jj
+
+      logd1 = ilog2(d1)
+
+      do k = 1, d3
+         do jj = 0, d2 - fftblock, fftblock
+            if (timers_enabled) call timer_start(T_fftcopy)
+            do j = 1, fftblock
+               do i = 1, d1
+                  y(j,i,1) = x(i,j+jj,k)
+               enddo
+            enddo
+            if (timers_enabled) call timer_stop(T_fftcopy)
+            
+            if (timers_enabled) call timer_start(T_fftlow)
+            call cfftz (is, logd1, d1, y, y(1,1,2))
+            if (timers_enabled) call timer_stop(T_fftlow)
+
+            if (timers_enabled) call timer_start(T_fftcopy)
+            do j = 1, fftblock
+               do i = 1, d1
+                  xout(i,j+jj,k) = y(j,i,1)
+               enddo
+            enddo
+            if (timers_enabled) call timer_stop(T_fftcopy)
+         enddo
+      enddo
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine cffts2(is, d1, d2, d3, x, xout, y)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'global.h'
+      integer is, d1, d2, d3, logd2
+      double complex x(d1,d2,d3)
+      double complex xout(d1,d2,d3)
+      double complex y(fftblockpad, d2, 2) 
+      integer i, j, k, ii
+
+      logd2 = ilog2(d2)
+
+      do k = 1, d3
+        do ii = 0, d1 - fftblock, fftblock
+           if (timers_enabled) call timer_start(T_fftcopy)
+           do j = 1, d2
+              do i = 1, fftblock
+                 y(i,j,1) = x(i+ii,j,k)
+              enddo
+           enddo
+           if (timers_enabled) call timer_stop(T_fftcopy)
+
+           if (timers_enabled) call timer_start(T_fftlow)
+           call cfftz (is, logd2, d2, y, y(1, 1, 2))
+           if (timers_enabled) call timer_stop(T_fftlow)
+
+           if (timers_enabled) call timer_start(T_fftcopy)
+           do j = 1, d2
+              do i = 1, fftblock
+                 xout(i+ii,j,k) = y(i,j,1)
+              enddo
+           enddo
+           if (timers_enabled) call timer_stop(T_fftcopy)
+        enddo
+      enddo
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine cffts3(is, d1, d2, d3, x, xout, y)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'global.h'
+      integer is, d1, d2, d3, logd3
+      double complex x(d1,d2,d3)
+      double complex xout(d1,d2,d3)
+      double complex y(fftblockpad, d3, 2) 
+      integer i, j, k, ii
+
+      logd3 = ilog2(d3)
+
+      do j = 1, d2
+        do ii = 0, d1 - fftblock, fftblock
+           if (timers_enabled) call timer_start(T_fftcopy)
+           do k = 1, d3
+              do i = 1, fftblock
+                 y(i,k,1) = x(i+ii,j,k)
+              enddo
+           enddo
+           if (timers_enabled) call timer_stop(T_fftcopy)
+
+           if (timers_enabled) call timer_start(T_fftlow)
+           call cfftz (is, logd3, d3, y, y(1, 1, 2))
+           if (timers_enabled) call timer_stop(T_fftlow)
+
+           if (timers_enabled) call timer_start(T_fftcopy)
+           do k = 1, d3
+              do i = 1, fftblock
+                 xout(i+ii,j,k) = y(i,k,1)
+              enddo
+           enddo
+           if (timers_enabled) call timer_stop(T_fftcopy)
+        enddo
+      enddo
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine fft_init (n)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c compute the roots-of-unity array that will be used for subsequent FFTs. 
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'global.h'
+
+      integer m,n,nu,ku,i,j,ln
+      double precision t, ti
+
+
+c---------------------------------------------------------------------
+c   Initialize the U array with sines and cosines in a manner that permits
+c   stride one access at each FFT iteration.
+c---------------------------------------------------------------------
+      nu = n
+      m = ilog2(n)
+      u(1) = m
+      ku = 2
+      ln = 1
+
+      do j = 1, m
+         t = pi / ln
+         
+         do i = 0, ln - 1
+            ti = i * t
+            u(i+ku) = dcmplx (cos (ti), sin(ti))
+         enddo
+         
+         ku = ku + ln
+         ln = 2 * ln
+      enddo
+      
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine cfftz (is, m, n, x, y)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   Computes NY N-point complex-to-complex FFTs of X using an algorithm due
+c   to Swarztrauber.  X is both the input and the output array, while Y is a 
+c   scratch array.  It is assumed that N = 2^M.  Before calling CFFTZ to 
+c   perform FFTs, the array U must be initialized by calling CFFTZ with IS 
+c   set to 0 and M set to MX, where MX is the maximum value of M for any 
+c   subsequent call.
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'global.h'
+
+      integer is,m,n,i,j,l,mx
+      double complex x, y
+
+      dimension x(fftblockpad,n), y(fftblockpad,n)
+
+c---------------------------------------------------------------------
+c   Check if input parameters are invalid.
+c---------------------------------------------------------------------
+      mx = u(1)
+      if ((is .ne. 1 .and. is .ne. -1) .or. m .lt. 1 .or. m .gt. mx)    
+     >  then
+        write (*, 1)  is, m, mx
+ 1      format ('CFFTZ: Either U has not been initialized, or else'/    
+     >    'one of the input parameters is invalid', 3I5)
+        stop
+      endif
+
+c---------------------------------------------------------------------
+c   Perform one variant of the Stockham FFT.
+c---------------------------------------------------------------------
+      do l = 1, m, 2
+        call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y)
+        if (l .eq. m) goto 160
+        call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x)
+      enddo
+
+      goto 180
+
+c---------------------------------------------------------------------
+c   Copy Y to X.
+c---------------------------------------------------------------------
+ 160  do j = 1, n
+        do i = 1, fftblock
+          x(i,j) = y(i,j)
+        enddo
+      enddo
+
+ 180  continue
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   Performs the L-th iteration of the second variant of the Stockham FFT.
+c---------------------------------------------------------------------
+
+      implicit none
+
+      integer is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22
+      double complex u,x,y,u1,x11,x21
+      dimension u(n), x(ny1,n), y(ny1,n)
+
+
+c---------------------------------------------------------------------
+c   Set initial parameters.
+c---------------------------------------------------------------------
+
+      n1 = n / 2
+      lk = 2 ** (l - 1)
+      li = 2 ** (m - l)
+      lj = 2 * lk
+      ku = li + 1
+
+      do i = 0, li - 1
+        i11 = i * lk + 1
+        i12 = i11 + n1
+        i21 = i * lj + 1
+        i22 = i21 + lk
+        if (is .ge. 1) then
+          u1 = u(ku+i)
+        else
+          u1 = dconjg (u(ku+i))
+        endif
+
+c---------------------------------------------------------------------
+c   This loop is vectorizable.
+c---------------------------------------------------------------------
+        do k = 0, lk - 1
+          do j = 1, ny
+            x11 = x(j,i11+k)
+            x21 = x(j,i12+k)
+            y(j,i21+k) = x11 + x21
+            y(j,i22+k) = u1 * (x11 - x21)
+          enddo
+        enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      integer function ilog2(n)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      integer n, nn, lg
+      if (n .eq. 1) then
+         ilog2=0
+         return
+      endif
+      lg = 1
+      nn = 2
+      do while (nn .lt. n)
+         nn = nn*2
+         lg = lg+1
+      end do
+      ilog2 = lg
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine transpose_x_yz(l1, l2, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'global.h'
+      integer l1, l2
+      double complex xin(ntdivnp), xout(ntdivnp)
+
+      call transpose2_local(dims(1,l1),dims(2, l1)*dims(3, l1),
+     >                          xin, xout)
+
+      call transpose2_global(xout, xin)
+
+      call transpose2_finish(dims(1,l1),dims(2, l1)*dims(3, l1),
+     >                          xin, xout)
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine transpose_xy_z(l1, l2, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'global.h'
+      integer l1, l2
+      double complex xin(ntdivnp), xout(ntdivnp)
+
+      call transpose2_local(dims(1,l1)*dims(2, l1),dims(3, l1),
+     >                          xin, xout)
+      call transpose2_global(xout, xin)
+      call transpose2_finish(dims(1,l1)*dims(2, l1),dims(3, l1),
+     >                          xin, xout)
+
+      return
+      end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine transpose2_local(n1, n2, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'mpinpb.h'
+      include 'global.h'
+      integer n1, n2
+      double complex xin(n1, n2), xout(n2, n1)
+      
+      double complex z(transblockpad, transblock)
+
+      integer i, j, ii, jj
+
+      if (timers_enabled) call timer_start(T_transxzloc)
+
+c---------------------------------------------------------------------
+c If possible, block the transpose for cache memory systems. 
+c How much does this help? Example: R8000 Power Challenge (90 MHz)
+c Blocked version decreases time spend in this routine 
+c from 14 seconds to 5.2 seconds on 8 nodes class A.
+c---------------------------------------------------------------------
+
+      if (n1 .lt. transblock .or. n2 .lt. transblock) then
+         if (n1 .ge. n2) then 
+            do j = 1, n2
+               do i = 1, n1
+                  xout(j, i) = xin(i, j)
+               end do
+            end do
+         else
+            do i = 1, n1
+               do j = 1, n2
+                  xout(j, i) = xin(i, j)
+               end do
+            end do
+         endif
+      else
+         do j = 0, n2-1, transblock
+            do i = 0, n1-1, transblock
+               
+c---------------------------------------------------------------------
+c Note: compiler should be able to take j+jj out of inner loop
+c---------------------------------------------------------------------
+               do jj = 1, transblock
+                  do ii = 1, transblock
+                     z(jj,ii) = xin(i+ii, j+jj)
+                  end do
+               end do
+               
+               do ii = 1, transblock
+                  do jj = 1, transblock
+                     xout(j+jj, i+ii) = z(jj,ii)
+                  end do
+               end do
+               
+            end do
+         end do
+      endif
+      if (timers_enabled) call timer_stop(T_transxzloc)
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine transpose2_global(xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'global.h'
+      include 'mpinpb.h'
+      double complex xin(ntdivnp)
+      double complex xout(ntdivnp) 
+      integer ierr
+
+      if (timers_enabled) call synchup()
+
+      if (timers_enabled) call timer_start(T_transxzglo)
+      call mpi_alltoall(xin, ntdivnp/np, dc_type,
+     >                  xout, ntdivnp/np, dc_type,
+     >                  commslice1, ierr)
+      if (timers_enabled) call timer_stop(T_transxzglo)
+
+      return
+      end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine transpose2_finish(n1, n2, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'global.h'
+      integer n1, n2, ioff
+      double complex xin(n2, n1/np2, 0:np2-1), xout(n2*np2, n1/np2)
+      
+      integer i, j, p
+
+      if (timers_enabled) call timer_start(T_transxzfin)
+      do p = 0, np2-1
+         ioff = p*n2
+         do j = 1, n1/np2
+            do i = 1, n2
+               xout(i+ioff, j) = xin(i, j, p)
+            end do
+         end do
+      end do
+      if (timers_enabled) call timer_stop(T_transxzfin)
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine transpose_x_z(l1, l2, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'global.h'
+      integer l1, l2
+      double complex xin(ntdivnp), xout(ntdivnp)
+
+      call transpose_x_z_local(dims(1,l1),dims(2,l1),dims(3,l1),
+     >                         xin, xout)
+      call transpose_x_z_global(dims(1,l1),dims(2,l1),dims(3,l1), 
+     >                          xout, xin)
+      call transpose_x_z_finish(dims(1,l2),dims(2,l2),dims(3,l2), 
+     >                          xin, xout)
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine transpose_x_z_local(d1, d2, d3, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'global.h'
+      integer d1, d2, d3
+      double complex xin(d1,d2,d3)
+      double complex xout(d3,d2,d1)
+      integer block1, block3
+      integer i, j, k, kk, ii, i1, k1
+
+      double complex buf(transblockpad, maxdim)
+      if (timers_enabled) call timer_start(T_transxzloc)
+      if (d1 .lt. 32) goto 100
+      block3 = d3
+      if (block3 .eq. 1)  goto 100
+      if (block3 .gt. transblock) block3 = transblock
+      block1 = d1
+      if (block1*block3 .gt. transblock*transblock) 
+     >          block1 = transblock*transblock/block3
+c---------------------------------------------------------------------
+c blocked transpose
+c---------------------------------------------------------------------
+      do j = 1, d2
+         do kk = 0, d3-block3, block3
+            do ii = 0, d1-block1, block1
+               
+               do k = 1, block3
+                  k1 = k + kk
+                  do i = 1, block1
+                     buf(k, i) = xin(i+ii, j, k1)
+                  end do
+               end do
+
+               do i = 1, block1
+                  i1 = i + ii
+                  do k = 1, block3
+                     xout(k+kk, j, i1) = buf(k, i)
+                  end do
+               end do
+
+            end do
+         end do
+      end do
+      goto 200
+      
+
+c---------------------------------------------------------------------
+c basic transpose
+c---------------------------------------------------------------------
+ 100  continue
+      
+      do j = 1, d2
+         do k = 1, d3
+            do i = 1, d1
+               xout(k, j, i) = xin(i, j, k)
+            end do
+         end do
+      end do
+
+c---------------------------------------------------------------------
+c all done
+c---------------------------------------------------------------------
+ 200  continue
+
+      if (timers_enabled) call timer_stop(T_transxzloc)
+      return 
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine transpose_x_z_global(d1, d2, d3, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'global.h'
+      include 'mpinpb.h'
+      integer d1, d2, d3
+      double complex xin(d3,d2,d1)
+      double complex xout(d3,d2,d1) ! not real layout, but right size
+      integer ierr
+
+      if (timers_enabled) call synchup()
+
+c---------------------------------------------------------------------
+c do transpose among all  processes with same 1-coord (me1)
+c---------------------------------------------------------------------
+      if (timers_enabled)call timer_start(T_transxzglo)
+      call mpi_alltoall(xin, d1*d2*d3/np2, dc_type,
+     >                  xout, d1*d2*d3/np2, dc_type,
+     >                  commslice1, ierr)
+      if (timers_enabled) call timer_stop(T_transxzglo)
+      return
+      end
+      
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine transpose_x_z_finish(d1, d2, d3, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'global.h'
+      integer d1, d2, d3
+      double complex xin(d1/np2, d2, d3, 0:np2-1)
+      double complex xout(d1,d2,d3)
+      integer i, j, k, p, ioff
+      if (timers_enabled) call timer_start(T_transxzfin)
+c---------------------------------------------------------------------
+c this is the most straightforward way of doing it. the
+c calculation in the inner loop doesn't help. 
+c      do i = 1, d1/np2
+c         do j = 1, d2
+c            do k = 1, d3
+c               do p = 0, np2-1
+c                  ii = i + p*d1/np2
+c                  xout(ii, j, k) = xin(i, j, k, p)
+c               end do
+c            end do
+c         end do
+c      end do
+c---------------------------------------------------------------------
+
+      do p = 0, np2-1
+         ioff = p*d1/np2
+         do k = 1, d3
+            do j = 1, d2
+               do i = 1, d1/np2
+                  xout(i+ioff, j, k) = xin(i, j, k, p)
+               end do
+            end do
+         end do
+      end do
+      if (timers_enabled) call timer_stop(T_transxzfin)
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine transpose_x_y(l1, l2, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'global.h'
+      integer l1, l2
+      double complex xin(ntdivnp), xout(ntdivnp)
+
+c---------------------------------------------------------------------
+c xy transpose is a little tricky, since we don't want
+c to touch 3rd axis. But alltoall must involve 3rd axis (most 
+c slowly varying) to be efficient. So we do
+c (nx, ny/np1, nz/np2) -> (ny/np1, nz/np2, nx) (local)
+c (ny/np1, nz/np2, nx) -> ((ny/np1*nz/np2)*np1, nx/np1) (global)
+c then local finish. 
+c---------------------------------------------------------------------
+
+
+      call transpose_x_y_local(dims(1,l1),dims(2,l1),dims(3,l1),
+     >                         xin, xout)
+      call transpose_x_y_global(dims(1,l1),dims(2,l1),dims(3,l1), 
+     >                          xout, xin)
+      call transpose_x_y_finish(dims(1,l2),dims(2,l2),dims(3,l2), 
+     >                          xin, xout)
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine transpose_x_y_local(d1, d2, d3, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'global.h'
+      integer d1, d2, d3
+      double complex xin(d1, d2, d3)
+      double complex xout(d2, d3, d1)
+      integer i, j, k
+      if (timers_enabled) call timer_start(T_transxyloc)
+
+      do k = 1, d3
+         do i = 1, d1
+            do j = 1, d2
+               xout(j,k,i)=xin(i,j,k)
+            end do
+         end do
+      end do
+      if (timers_enabled) call timer_stop(T_transxyloc)
+      return 
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine transpose_x_y_global(d1, d2, d3, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'global.h'
+      include 'mpinpb.h'
+      integer d1, d2, d3
+c---------------------------------------------------------------------
+c array is in form (ny/np1, nz/np2, nx)
+c---------------------------------------------------------------------
+      double complex xin(d2,d3,d1)
+      double complex xout(d2,d3,d1) ! not real layout but right size
+      integer ierr
+
+      if (timers_enabled) call synchup()
+
+c---------------------------------------------------------------------
+c do transpose among all processes with same 1-coord (me1)
+c---------------------------------------------------------------------
+      if (timers_enabled) call timer_start(T_transxyglo)
+      call mpi_alltoall(xin, d1*d2*d3/np1, dc_type,
+     >                  xout, d1*d2*d3/np1, dc_type,
+     >                  commslice2, ierr)
+      if (timers_enabled) call timer_stop(T_transxyglo)
+
+      return
+      end
+      
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine transpose_x_y_finish(d1, d2, d3, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'global.h'
+      integer d1, d2, d3
+      double complex xin(d1/np1, d3, d2, 0:np1-1)
+      double complex xout(d1,d2,d3)
+      integer i, j, k, p, ioff
+      if (timers_enabled) call timer_start(T_transxyfin)
+c---------------------------------------------------------------------
+c this is the most straightforward way of doing it. the
+c calculation in the inner loop doesn't help. 
+c      do i = 1, d1/np1
+c         do j = 1, d2
+c            do k = 1, d3
+c               do p = 0, np1-1
+c                  ii = i + p*d1/np1
+c note order is screwy bcz we have (ny/np1, nz/np2, nx) -> (ny, nx/np1, nz/np2)
+c                  xout(ii, j, k) = xin(i, k, j, p)
+c               end do
+c            end do
+c         end do
+c      end do
+c---------------------------------------------------------------------
+
+      do p = 0, np1-1
+         ioff = p*d1/np1
+         do k = 1, d3
+            do j = 1, d2
+               do i = 1, d1/np1
+                  xout(i+ioff, j, k) = xin(i, k, j, p)
+               end do
+            end do
+         end do
+      end do
+      if (timers_enabled) call timer_stop(T_transxyfin)
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine checksum(i, u1, d1, d2, d3)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'global.h'
+      include 'mpinpb.h'
+      integer i, d1, d2, d3
+      double complex u1(d1, d2, d3)
+      integer j, q,r,s, ierr
+      double complex chk,allchk
+      chk = (0.0,0.0)
+
+      do j=1,1024
+         q = mod(j, nx)+1
+         if (q .ge. xstart(1) .and. q .le. xend(1)) then
+            r = mod(3*j,ny)+1
+            if (r .ge. ystart(1) .and. r .le. yend(1)) then
+               s = mod(5*j,nz)+1
+               if (s .ge. zstart(1) .and. s .le. zend(1)) then
+                  chk=chk+u1(q-xstart(1)+1,r-ystart(1)+1,s-zstart(1)+1)
+               end if
+            end if
+         end if
+      end do
+      chk = chk/ntotal_f
+      
+      call MPI_Reduce(chk, allchk, 1, dc_type, MPI_SUM, 
+     >                0, MPI_COMM_WORLD, ierr)      
+      if (me .eq. 0) then
+            write (*, 30) i, allchk
+ 30         format (' T =',I5,5X,'Checksum =',1P2D22.12)
+      endif
+
+c      sums(i) = allchk
+c     If we compute the checksum for diagnostic purposes, we let i be
+c     negative, so the result will not be stored in an array
+      if (i .gt. 0) sums(i) = allchk
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine synchup
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'global.h'
+      include 'mpinpb.h'
+      integer ierr
+      call timer_start(T_synch)
+      call mpi_barrier(MPI_COMM_WORLD, ierr)
+      call timer_stop(T_synch)
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine verify (d1, d2, d3, nt, verified, class)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'global.h'
+      include 'mpinpb.h'
+      integer d1, d2, d3, nt
+      character class
+      logical verified
+      integer ierr, size, i
+      double precision err, epsilon
+
+c---------------------------------------------------------------------
+c   Reference checksums
+c---------------------------------------------------------------------
+      double complex csum_ref(25)
+
+
+      class = 'U'
+
+      if (me .ne. 0) return
+
+      epsilon = 1.0d-12
+      verified = .FALSE.
+
+      if (d1 .eq. 64 .and.
+     >    d2 .eq. 64 .and.
+     >    d3 .eq. 64 .and.
+     >    nt .eq. 6) then
+c---------------------------------------------------------------------
+c   Sample size reference checksums
+c---------------------------------------------------------------------
+         class = 'S'
+         csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02)
+         csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02)
+         csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02)
+         csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02)
+         csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02)
+         csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02)
+
+      else if (d1 .eq. 128 .and.
+     >    d2 .eq. 128 .and.
+     >    d3 .eq. 32 .and.
+     >    nt .eq. 6) then
+c---------------------------------------------------------------------
+c   Class W size reference checksums
+c---------------------------------------------------------------------
+         class = 'W'
+         csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02)
+         csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02)
+         csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02)
+         csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02)
+         csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02)
+         csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02)
+
+      else if (d1 .eq. 256 .and.
+     >    d2 .eq. 256 .and.
+     >    d3 .eq. 128 .and.
+     >    nt .eq. 6) then
+c---------------------------------------------------------------------
+c   Class A size reference checksums
+c---------------------------------------------------------------------
+         class = 'A'
+         csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02)
+         csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02)
+         csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02)
+         csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02)
+         csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02)
+         csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02)
+      
+      else if (d1 .eq. 512 .and.
+     >    d2 .eq. 256 .and.
+     >    d3 .eq. 256 .and.
+     >    nt .eq. 20) then
+c---------------------------------------------------------------------
+c   Class B size reference checksums
+c---------------------------------------------------------------------
+         class = 'B'
+         csum_ref(1)  = dcmplx(5.177643571579D+02, 5.077803458597D+02)
+         csum_ref(2)  = dcmplx(5.154521291263D+02, 5.088249431599D+02)
+         csum_ref(3)  = dcmplx(5.146409228649D+02, 5.096208912659D+02)
+         csum_ref(4)  = dcmplx(5.142378756213D+02, 5.101023387619D+02)
+         csum_ref(5)  = dcmplx(5.139626667737D+02, 5.103976610617D+02)
+         csum_ref(6)  = dcmplx(5.137423460082D+02, 5.105948019802D+02)
+         csum_ref(7)  = dcmplx(5.135547056878D+02, 5.107404165783D+02)
+         csum_ref(8)  = dcmplx(5.133910925466D+02, 5.108576573661D+02)
+         csum_ref(9)  = dcmplx(5.132470705390D+02, 5.109577278523D+02)
+         csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02)
+         csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02)
+         csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02)
+         csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02)
+         csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02)
+         csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02)
+         csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02)
+         csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02)
+         csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02)
+         csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02)
+         csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02)
+
+      else if (d1 .eq. 512 .and.
+     >    d2 .eq. 512 .and.
+     >    d3 .eq. 512 .and.
+     >    nt .eq. 20) then
+c---------------------------------------------------------------------
+c   Class C size reference checksums
+c---------------------------------------------------------------------
+         class = 'C'
+         csum_ref(1)  = dcmplx(5.195078707457D+02, 5.149019699238D+02)
+         csum_ref(2)  = dcmplx(5.155422171134D+02, 5.127578201997D+02)
+         csum_ref(3)  = dcmplx(5.144678022222D+02, 5.122251847514D+02)
+         csum_ref(4)  = dcmplx(5.140150594328D+02, 5.121090289018D+02)
+         csum_ref(5)  = dcmplx(5.137550426810D+02, 5.121143685824D+02)
+         csum_ref(6)  = dcmplx(5.135811056728D+02, 5.121496764568D+02)
+         csum_ref(7)  = dcmplx(5.134569343165D+02, 5.121870921893D+02)
+         csum_ref(8)  = dcmplx(5.133651975661D+02, 5.122193250322D+02)
+         csum_ref(9)  = dcmplx(5.132955192805D+02, 5.122454735794D+02)
+         csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02)
+         csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02)
+         csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02)
+         csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02)
+         csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02)
+         csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02)
+         csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02)
+         csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02)
+         csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02)
+         csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02)
+         csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02)
+
+      else if (d1 .eq. 2048 .and.
+     >    d2 .eq. 1024 .and.
+     >    d3 .eq. 1024 .and.
+     >    nt .eq. 25) then
+c---------------------------------------------------------------------
+c   Class D size reference checksums
+c---------------------------------------------------------------------
+         class = 'D'
+         csum_ref(1)  = dcmplx(5.122230065252D+02, 5.118534037109D+02)
+         csum_ref(2)  = dcmplx(5.120463975765D+02, 5.117061181082D+02)
+         csum_ref(3)  = dcmplx(5.119865766760D+02, 5.117096364601D+02)
+         csum_ref(4)  = dcmplx(5.119518799488D+02, 5.117373863950D+02)
+         csum_ref(5)  = dcmplx(5.119269088223D+02, 5.117680347632D+02)
+         csum_ref(6)  = dcmplx(5.119082416858D+02, 5.117967875532D+02)
+         csum_ref(7)  = dcmplx(5.118943814638D+02, 5.118225281841D+02)
+         csum_ref(8)  = dcmplx(5.118842385057D+02, 5.118451629348D+02)
+         csum_ref(9)  = dcmplx(5.118769435632D+02, 5.118649119387D+02)
+         csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02)
+         csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02)
+         csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02)
+         csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02)
+         csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02)
+         csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02)
+         csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02)
+         csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02)
+         csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02)
+         csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02)
+         csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02)
+         csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02)
+         csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02)
+         csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02)
+         csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02)
+         csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02)
+
+      else if (d1 .eq. 4096 .and.
+     >    d2 .eq. 2048 .and.
+     >    d3 .eq. 2048 .and.
+     >    nt .eq. 25) then
+c---------------------------------------------------------------------
+c   Class E size reference checksums
+c---------------------------------------------------------------------
+         class = 'E'
+         csum_ref(1)  = dcmplx(5.121601045346D+02, 5.117395998266D+02)
+         csum_ref(2)  = dcmplx(5.120905403678D+02, 5.118614716182D+02)
+         csum_ref(3)  = dcmplx(5.120623229306D+02, 5.119074203747D+02)
+         csum_ref(4)  = dcmplx(5.120438418997D+02, 5.119345900733D+02)
+         csum_ref(5)  = dcmplx(5.120311521872D+02, 5.119551325550D+02)
+         csum_ref(6)  = dcmplx(5.120226088809D+02, 5.119720179919D+02)
+         csum_ref(7)  = dcmplx(5.120169296534D+02, 5.119861371665D+02)
+         csum_ref(8)  = dcmplx(5.120131225172D+02, 5.119979364402D+02)
+         csum_ref(9)  = dcmplx(5.120104767108D+02, 5.120077674092D+02)
+         csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02)
+         csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02)
+         csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02)
+         csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02)
+         csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02)
+         csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02)
+         csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02)
+         csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02)
+         csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02)
+         csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02)
+         csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02)
+         csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02)
+         csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02)
+         csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02)
+         csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02)
+         csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02)
+
+      endif
+
+
+      if (class .ne. 'U') then
+
+         do i = 1, nt
+            err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) )
+            if (.not.(err .le. epsilon)) goto 100
+         end do
+         verified = .TRUE.
+ 100     continue
+
+      endif
+
+      call MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)
+      if (size .ne. np) then
+         write(*, 4010) np
+         write(*, 4011)
+         write(*, 4012)
+c---------------------------------------------------------------------
+c multiple statements because some Fortran compilers have
+c problems with long strings. 
+c---------------------------------------------------------------------
+ 4010    format( ' Warning: benchmark was compiled for ', i5, 
+     >           'processors')
+ 4011    format( ' Must be run on this many processors for official',
+     >           ' verification')
+ 4012    format( ' so memory access is repeatable')
+         verified = .false.
+      endif
+         
+      if (class .ne. 'U') then
+         if (verified) then
+            write(*,2000)
+ 2000       format(' Result verification successful')
+         else
+            write(*,2001)
+ 2001       format(' Result verification failed')
+         endif
+      endif
+      print *, 'class = ', class
+
+      return
+      end
+
+
diff --git a/examples/smpi/NAS/FT/global.h b/examples/smpi/NAS/FT/global.h
new file mode 100644
index 0000000000..3e534bb48d
--- /dev/null
+++ b/examples/smpi/NAS/FT/global.h
@@ -0,0 +1,134 @@
+      include 'npbparams.h'
+
+c 2D processor array -> 2D grid decomposition (by pencils)
+c If processor array is 1xN or -> 1D grid decomposition (by planes)
+c If processor array is 1x1 -> 0D grid decomposition
+c For simplicity, do not treat Nx1 (np2 = 1) specially
+      integer np1, np2, np
+
+c basic decomposition strategy
+      integer layout_type
+      integer layout_0D, layout_1D, layout_2D
+      parameter (layout_0D = 0, layout_1D = 1, layout_2D = 2)
+
+      common /procgrid/ np1, np2, layout_type, np
+
+
+c Cache blocking params. These values are good for most
+c RISC processors.  
+c FFT parameters:
+c  fftblock controls how many ffts are done at a time. 
+c  The default is appropriate for most cache-based machines
+c  On vector machines, the FFT can be vectorized with vector
+c  length equal to the block size, so the block size should
+c  be as large as possible. This is the size of the smallest
+c  dimension of the problem: 128 for class A, 256 for class B and
+c  512 for class C.
+c Transpose parameters:
+c  transblock is the blocking factor for the transposes when there
+c  is a 1-D layout. On vector machines it should probably be
+c  large (largest dimension of the problem).
+
+
+      integer fftblock_default, fftblockpad_default
+      parameter (fftblock_default=16, fftblockpad_default=18)
+      integer transblock, transblockpad
+      parameter(transblock=32, transblockpad=34)
+      
+      integer fftblock, fftblockpad
+      common /blockinfo/ fftblock, fftblockpad
+
+c we need a bunch of logic to keep track of how
+c arrays are laid out. 
+c coords of this processor
+      integer me, me1, me2
+      common /coords/ me, me1, me2
+c need a communicator for row/col in processor grid
+      integer commslice1, commslice2
+      common /comms/ commslice1, commslice2
+
+
+
+c There are basically three stages
+c 1: x-y-z layout
+c 2: after x-transform (before y)
+c 3: after y-transform (before z)
+c The computation proceeds logically as
+
+c set up initial conditions
+c fftx(1)
+c transpose (1->2)
+c ffty(2)
+c transpose (2->3)
+c fftz(3)
+c time evolution
+c fftz(3)
+c transpose (3->2)
+c ffty(2)
+c transpose (2->1)
+c fftx(1)
+c compute residual(1)
+
+c for the 0D, 1D, 2D strategies, the layouts look like xxx
+c        
+c            0D        1D        2D
+c 1:        xyz       xyz       xyz
+c 2:        xyz       xyz       yxz
+c 3:        xyz       zyx       zxy
+
+c the array dimensions are stored in dims(coord, phase)
+      integer dims(3, 3)
+      integer xstart(3), ystart(3), zstart(3)
+      integer xend(3), yend(3), zend(3)
+      common /layout/ dims,
+     >                xstart, ystart, zstart, 
+     >                xend, yend, zend
+
+      integer T_total, T_setup, T_fft, T_evolve, T_checksum, 
+     >        T_fftlow, T_fftcopy, T_transpose, 
+     >        T_transxzloc, T_transxzglo, T_transxzfin, 
+     >        T_transxyloc, T_transxyglo, T_transxyfin, 
+     >        T_synch, T_max
+      parameter (T_total = 1, T_setup = 2, T_fft = 3, 
+     >           T_evolve = 4, T_checksum = 5, 
+     >           T_fftlow = 6, T_fftcopy = 7, T_transpose = 8,
+     >           T_transxzloc = 9, T_transxzglo = 10, T_transxzfin = 11, 
+     >           T_transxyloc = 12, T_transxyglo = 13, 
+     >           T_transxyfin = 14,  T_synch = 15, T_max = 15)
+
+
+
+      logical timers_enabled
+      parameter (timers_enabled = .false.)
+
+
+      external timer_read
+      double precision timer_read
+      external ilog2
+      integer ilog2
+
+      external randlc
+      double precision randlc
+
+
+c other stuff
+      logical debug, debugsynch
+      common /dbg/ debug, debugsynch
+
+      double precision seed, a, pi, alpha
+      parameter (seed = 314159265.d0, a = 1220703125.d0, 
+     >  pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+c roots of unity array
+c relies on x being largest dimension?
+      double complex u(nx)
+      common /ucomm/ u
+
+
+c for checksum data
+      double complex sums(0:niter_default)
+      common /sumcomm/ sums
+
+c number of iterations
+      integer niter
+      common /iter/ niter
diff --git a/examples/smpi/NAS/FT/inputft.data.sample b/examples/smpi/NAS/FT/inputft.data.sample
new file mode 100644
index 0000000000..448ac42bc0
--- /dev/null
+++ b/examples/smpi/NAS/FT/inputft.data.sample
@@ -0,0 +1,3 @@
+6   ! number of iterations
+2   ! layout type. 0 = 0d, 1 = 1d, 2 = 2d
+2 4 ! processor layout. 0d must be "1 1"; 1d must be "1 N"
diff --git a/examples/smpi/NAS/FT/mpinpb.h b/examples/smpi/NAS/FT/mpinpb.h
new file mode 100644
index 0000000000..e43e552a84
--- /dev/null
+++ b/examples/smpi/NAS/FT/mpinpb.h
@@ -0,0 +1,4 @@
+      include 'mpif.h'
+c mpi data types
+      integer dc_type
+      common /mpistuff/ dc_type
diff --git a/examples/smpi/NAS/IS/Makefile b/examples/smpi/NAS/IS/Makefile
new file mode 100644
index 0000000000..26d35e8343
--- /dev/null
+++ b/examples/smpi/NAS/IS/Makefile
@@ -0,0 +1,23 @@
+SHELL=/bin/sh
+BENCHMARK=is
+BENCHMARKU=IS
+
+include ../config/make.def
+
+include ../sys/make.common
+
+OBJS = is.o ${COMMON}/c_print_results.o
+
+
+${PROGRAM}: config ${OBJS}
+	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${CMPI_LIB}
+
+.c.o:
+	${CCOMPILE} $<
+
+is.o:             is.c  npbparams.h
+
+
+clean:
+	- rm -f *.o *~ mputil*
+	- rm -f is npbparams.h core
diff --git a/examples/smpi/NAS/IS/is.c b/examples/smpi/NAS/IS/is.c
new file mode 100644
index 0000000000..01bd9ddceb
--- /dev/null
+++ b/examples/smpi/NAS/IS/is.c
@@ -0,0 +1,1150 @@
+/*************************************************************************
+ *                                                                       * 
+ *        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3       *
+ *                                                                       * 
+ *                                  I S                                  * 
+ *                                                                       * 
+ ************************************************************************* 
+ *                                                                       * 
+ *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
+ *   It is described in NAS Technical Report 95-020.                     * 
+ *                                                                       * 
+ *   Permission to use, copy, distribute and modify this software        * 
+ *   for any purpose with or without fee is hereby granted.  We          * 
+ *   request, however, that all derived work reference the NAS           * 
+ *   Parallel Benchmarks 3.3. This software is provided "as is"          *
+ *   without express or implied warranty.                                * 
+ *                                                                       * 
+ *   Information on NPB 3.3, including the technical report, the         *
+ *   original specifications, source code, results and information       * 
+ *   on how to submit new results, is available at:                      * 
+ *                                                                       * 
+ *          http://www.nas.nasa.gov/Software/NPB                         * 
+ *                                                                       * 
+ *   Send comments or suggestions to  npb@nas.nasa.gov                   * 
+ *   Send bug reports to              npb-bugs@nas.nasa.gov              * 
+ *                                                                       * 
+ *         NAS Parallel Benchmarks Group                                 * 
+ *         NASA Ames Research Center                                     * 
+ *         Mail Stop: T27A-1                                             * 
+ *         Moffett Field, CA   94035-1000                                * 
+ *                                                                       * 
+ *         E-mail:  npb@nas.nasa.gov                                     * 
+ *         Fax:     (650) 604-3957                                       * 
+ *                                                                       * 
+ ************************************************************************* 
+ *                                                                       * 
+ *   Author: M. Yarrow                                                   * 
+ *           H. Jin                                                      * 
+ *                                                                       * 
+ *************************************************************************/
+
+#include "mpi.h"
+#include "npbparams.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+/******************/
+/* default values */
+/******************/
+#ifndef CLASS
+#define CLASS 'S'
+#define NUM_PROCS            1                 
+#endif
+#define MIN_PROCS            1
+
+
+/*************/
+/*  CLASS S  */
+/*************/
+#if CLASS == 'S'
+#define  TOTAL_KEYS_LOG_2    16
+#define  MAX_KEY_LOG_2       11
+#define  NUM_BUCKETS_LOG_2   9
+#endif
+
+
+/*************/
+/*  CLASS W  */
+/*************/
+#if CLASS == 'W'
+#define  TOTAL_KEYS_LOG_2    20
+#define  MAX_KEY_LOG_2       16
+#define  NUM_BUCKETS_LOG_2   10
+#endif
+
+/*************/
+/*  CLASS A  */
+/*************/
+#if CLASS == 'A'
+#define  TOTAL_KEYS_LOG_2    23
+#define  MAX_KEY_LOG_2       19
+#define  NUM_BUCKETS_LOG_2   10
+#endif
+
+
+/*************/
+/*  CLASS B  */
+/*************/
+#if CLASS == 'B'
+#define  TOTAL_KEYS_LOG_2    25
+#define  MAX_KEY_LOG_2       21
+#define  NUM_BUCKETS_LOG_2   10
+#endif
+
+
+/*************/
+/*  CLASS C  */
+/*************/
+#if CLASS == 'C'
+#define  TOTAL_KEYS_LOG_2    27
+#define  MAX_KEY_LOG_2       23
+#define  NUM_BUCKETS_LOG_2   10
+#endif
+
+
+/*************/
+/*  CLASS D  */
+/*************/
+#if CLASS == 'D'
+#define  TOTAL_KEYS_LOG_2    29
+#define  MAX_KEY_LOG_2       27
+#define  NUM_BUCKETS_LOG_2   10
+#undef   MIN_PROCS
+#define  MIN_PROCS           4
+#endif
+
+
+#define  TOTAL_KEYS          (1 << TOTAL_KEYS_LOG_2)
+#define  MAX_KEY             (1 << MAX_KEY_LOG_2)
+#define  NUM_BUCKETS         (1 << NUM_BUCKETS_LOG_2)
+#define  NUM_KEYS            (TOTAL_KEYS/NUM_PROCS*MIN_PROCS)
+
+/*****************************************************************/
+/* On larger number of processors, since the keys are (roughly)  */ 
+/* gaussian distributed, the first and last processor sort keys  */ 
+/* in a large interval, requiring array sizes to be larger. Note */
+/* that for large NUM_PROCS, NUM_KEYS is, however, a small number*/
+/* The required array size also depends on the bucket size used. */
+/* The following values are validated for the 1024-bucket setup. */
+/*****************************************************************/
+#if   NUM_PROCS < 256
+#define  SIZE_OF_BUFFERS     3*NUM_KEYS/2
+#elif NUM_PROCS < 512
+#define  SIZE_OF_BUFFERS     5*NUM_KEYS/2
+#elif NUM_PROCS < 1024
+#define  SIZE_OF_BUFFERS     4*NUM_KEYS
+#else
+#define  SIZE_OF_BUFFERS     13*NUM_KEYS/2
+#endif
+
+/*****************************************************************/
+/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */
+/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE     */
+/* MAX_PROCS AT YOUR PERIL                                       */
+/*****************************************************************/
+#if CLASS == 'S'
+#define  MAX_PROCS           128
+#else
+#define  MAX_PROCS           1024
+#endif
+
+#define  MAX_ITERATIONS      10
+#define  TEST_ARRAY_SIZE     5
+
+
+/***********************************/
+/* Enable separate communication,  */
+/* computation timing and printout */
+/***********************************/
+/* #define  TIMING_ENABLED         */
+
+
+/*************************************/
+/* Typedef: if necessary, change the */
+/* size of int here by changing the  */
+/* int type to, say, long            */
+/*************************************/
+typedef  int  INT_TYPE;
+typedef  long INT_TYPE2;
+#define MP_KEY_TYPE MPI_INT
+
+
+typedef struct {
+
+/********************/
+/* MPI properties:  */
+/********************/
+int      my_rank,
+         comm_size;
+
+
+/********************/
+/* Some global info */
+/********************/
+INT_TYPE *key_buff_ptr_global,         /* used by full_verify to get */
+         total_local_keys,             /* copies of rank info        */
+         total_lesser_keys;
+
+
+int      passed_verification;
+                                 
+
+
+/************************************/
+/* These are the three main arrays. */
+/* See SIZE_OF_BUFFERS def above    */
+/************************************/
+INT_TYPE key_array[SIZE_OF_BUFFERS],    
+         key_buff1[SIZE_OF_BUFFERS],    
+         key_buff2[SIZE_OF_BUFFERS],
+         bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE],     /* Top 5 elements for */
+         bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals */
+         bucket_ptrs[NUM_BUCKETS],
+         process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE],   
+         process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE];   
+int      send_count[MAX_PROCS], recv_count[MAX_PROCS],
+         send_displ[MAX_PROCS], recv_displ[MAX_PROCS];
+
+
+/**********************/
+/* Partial verif info */
+/**********************/
+INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
+         test_rank_array[TEST_ARRAY_SIZE];
+
+/**********/
+/* Timers */
+/**********/
+double start[64], elapsed[64];
+
+} global_data;
+
+
+const INT_TYPE2
+         S_test_index_array[TEST_ARRAY_SIZE] = 
+                             {48427,17148,23627,62548,4431},
+         S_test_rank_array[TEST_ARRAY_SIZE] = 
+                             {0,18,346,64917,65463},
+
+         W_test_index_array[TEST_ARRAY_SIZE] = 
+                             {357773,934767,875723,898999,404505},
+         W_test_rank_array[TEST_ARRAY_SIZE] = 
+                             {1249,11698,1039987,1043896,1048018},
+
+         A_test_index_array[TEST_ARRAY_SIZE] = 
+                             {2112377,662041,5336171,3642833,4250760},
+         A_test_rank_array[TEST_ARRAY_SIZE] = 
+                             {104,17523,123928,8288932,8388264},
+
+         B_test_index_array[TEST_ARRAY_SIZE] = 
+                             {41869,812306,5102857,18232239,26860214},
+         B_test_rank_array[TEST_ARRAY_SIZE] = 
+                             {33422937,10244,59149,33135281,99}, 
+
+         C_test_index_array[TEST_ARRAY_SIZE] = 
+                             {44172927,72999161,74326391,129606274,21736814},
+         C_test_rank_array[TEST_ARRAY_SIZE] = 
+                             {61147,882988,266290,133997595,133525895},
+
+         D_test_index_array[TEST_ARRAY_SIZE] = 
+                             {1317351170,995930646,1157283250,1503301535,1453734525},
+         D_test_rank_array[TEST_ARRAY_SIZE] = 
+                             {1,36538729,1978098519,2145192618,2147425337};
+
+
+
+/***********************/
+/* function prototypes */
+/***********************/
+double	randlc( double *X, double *A );
+
+void full_verify( global_data* gd );
+
+void c_print_results( char   *name,
+                      char   class,
+                      int    n1, 
+                      int    n2,
+                      int    n3,
+                      int    niter,
+                      int    nprocs_compiled,
+                      int    nprocs_total,
+                      double t,
+                      double mops,
+		      char   *optype,
+                      int    passed_verification,
+                      char   *npbversion,
+                      char   *compiletime,
+                      char   *mpicc,
+                      char   *clink,
+                      char   *cmpi_lib,
+                      char   *cmpi_inc,
+                      char   *cflags,
+                      char   *clinkflags );
+
+void    timer_clear(global_data* gd, int n );
+void    timer_start(global_data* gd, int n );
+void    timer_stop(global_data* gd, int n );
+double  timer_read(global_data* gd, int n );
+
+void    timer_clear(global_data* gd, int n ) {
+   gd->elapsed[n] = 0.0;
+}
+
+void    timer_start(global_data* gd, int n ) {
+   gd->start[n] = MPI_Wtime();
+}
+
+void    timer_stop(global_data* gd, int n ) {
+   gd->elapsed[n] += MPI_Wtime() - gd->start[n];
+}
+
+double  timer_read(global_data* gd, int n ) {
+   return gd->elapsed[n];
+}
+
+
+/*
+ *    FUNCTION RANDLC (X, A)
+ *
+ *  This routine returns a uniform pseudorandom double precision number in the
+ *  range (0, 1) by using the linear congruential generator
+ *
+ *  x_{k+1} = a x_k  (mod 2^46)
+ *
+ *  where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
+ *  before repeating.  The argument A is the same as 'a' in the above formula,
+ *  and X is the same as x_0.  A and X must be odd double precision integers
+ *  in the range (1, 2^46).  The returned value RANDLC is normalized to be
+ *  between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
+ *  the new seed x_1, so that subsequent calls to RANDLC using the same
+ *  arguments will generate a continuous sequence.
+ *
+ *  This routine should produce the same results on any computer with at least
+ *  48 mantissa bits in double precision floating point data.  On Cray systems,
+ *  double precision should be disabled.
+ *
+ *  David H. Bailey     October 26, 1990
+ *
+ *     IMPLICIT DOUBLE PRECISION (A-H, O-Z)
+ *     SAVE KS, R23, R46, T23, T46
+ *     DATA KS/0/
+ *
+ *  If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
+ *  T23 = 2 ^ 23, and T46 = 2 ^ 46.  These are computed in loops, rather than
+ *  by merely using the ** operator, in order to insure that the results are
+ *  exact on all systems.  This code assumes that 0.5D0 is represented exactly.
+ */
+
+
+/*****************************************************************/
+/*************           R  A  N  D  L  C             ************/
+/*************                                        ************/
+/*************    portable random number generator    ************/
+/*****************************************************************/
+
+double	randlc( double *X, double *A )
+{
+      static int        KS=0;
+      static double	R23, R46, T23, T46;
+      double		T1, T2, T3, T4;
+      double		A1;
+      double		A2;
+      double		X1;
+      double		X2;
+      double		Z;
+      int     		i, j;
+
+      if (KS == 0) 
+      {
+        R23 = 1.0;
+        R46 = 1.0;
+        T23 = 1.0;
+        T46 = 1.0;
+    
+        for (i=1; i<=23; i++)
+        {
+          R23 = 0.50 * R23;
+          T23 = 2.0 * T23;
+        }
+        for (i=1; i<=46; i++)
+        {
+          R46 = 0.50 * R46;
+          T46 = 2.0 * T46;
+        }
+        KS = 1;
+      }
+
+/*  Break A into two parts such that A = 2^23 * A1 + A2 and set X = N.  */
+
+      T1 = R23 * *A;
+      j  = T1;
+      A1 = j;
+      A2 = *A - T23 * A1;
+
+/*  Break X into two parts such that X = 2^23 * X1 + X2, compute
+    Z = A1 * X2 + A2 * X1  (mod 2^23), and then
+    X = 2^23 * Z + A2 * X2  (mod 2^46).                            */
+
+      T1 = R23 * *X;
+      j  = T1;
+      X1 = j;
+      X2 = *X - T23 * X1;
+      T1 = A1 * X2 + A2 * X1;
+      
+      j  = R23 * T1;
+      T2 = j;
+      Z = T1 - T23 * T2;
+      T3 = T23 * Z + A2 * X2;
+      j  = R46 * T3;
+      T4 = j;
+      *X = T3 - T46 * T4;
+      return(R46 * *X);
+} 
+
+
+
+/*****************************************************************/
+/************   F  I  N  D  _  M  Y  _  S  E  E  D    ************/
+/************                                         ************/
+/************ returns parallel random number seq seed ************/
+/*****************************************************************/
+
+/*
+ * Create a random number sequence of total length nn residing
+ * on np number of processors.  Each processor will therefore have a 
+ * subsequence of length nn/np.  This routine returns that random 
+ * number which is the first random number for the subsequence belonging
+ * to processor rank kn, and which is used as seed for proc kn ran # gen.
+ */
+
+double   find_my_seed( int  kn,       /* my processor rank, 0<=kn<=num procs */
+                       int  np,       /* np = num procs                      */
+                       long nn,       /* total num of ran numbers, all procs */
+                       double s,      /* Ran num seed, for ex.: 314159265.00 */
+                       double a )     /* Ran num gen mult, try 1220703125.00 */
+{
+
+  long   i;
+
+  double t1,t2,t3,an;
+  long   mq,nq,kk,ik;
+
+
+
+      nq = nn / np;
+
+      for( mq=0; nq>1; mq++,nq/=2 )
+          ;
+
+      t1 = a;
+
+      for( i=1; i<=mq; i++ )
+        t2 = randlc( &t1, &t1 );
+
+      an = t1;
+
+      kk = kn;
+      t1 = s;
+      t2 = an;
+
+      for( i=1; i<=100; i++ )
+      {
+        ik = kk / 2;
+        if( 2 * ik !=  kk ) 
+            t3 = randlc( &t1, &t2 );
+        if( ik == 0 ) 
+            break;
+        t3 = randlc( &t2, &t2 );
+        kk = ik;
+      }
+
+      return( t1 );
+
+}
+
+
+
+
+/*****************************************************************/
+/*************      C  R  E  A  T  E  _  S  E  Q      ************/
+/*****************************************************************/
+
+void	create_seq( global_data* gd, double seed, double a )
+{
+	double x;
+	int    i, k;
+
+        k = MAX_KEY/4;
+
+	for (i=0; i<NUM_KEYS; i++)
+	{
+	    x = randlc(&seed, &a);
+	    x += randlc(&seed, &a);
+    	    x += randlc(&seed, &a);
+	    x += randlc(&seed, &a);  
+
+            gd->key_array[i] = k*x;
+	}
+}
+
+
+
+
+/*****************************************************************/
+/*************    F  U  L  L  _  V  E  R  I  F  Y     ************/
+/*****************************************************************/
+
+
+void full_verify( global_data* gd )
+{
+    MPI_Status  status;
+    MPI_Request request;
+    
+    INT_TYPE    i, j;
+    INT_TYPE    k, last_local_key;
+
+    
+/*  Now, finally, sort the keys:  */
+    for( i=0; i<gd->total_local_keys; i++ )
+        gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]-
+                                 gd->total_lesser_keys] = gd->key_buff2[i];
+    last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);
+
+/*  Send largest key value to next processor  */
+    if( gd->my_rank > 0 )
+        MPI_Irecv( &k,
+                   1,
+                   MP_KEY_TYPE,
+                   gd->my_rank-1,
+                   1000,
+                   MPI_COMM_WORLD,
+                   &request );                   
+    if( gd->my_rank < gd->comm_size-1 )
+        MPI_Send( &gd->key_array[last_local_key],
+                  1,
+                  MP_KEY_TYPE,
+                  gd->my_rank+1,
+                  1000,
+                  MPI_COMM_WORLD );
+    if( gd->my_rank > 0 )
+        MPI_Wait( &request, &status );
+
+/*  Confirm that neighbor's greatest key value 
+    is not greater than my least key value       */              
+    j = 0;
+    if( gd->my_rank > 0 && gd->total_local_keys > 0 )
+        if( k > gd->key_array[0] )
+            j++;
+
+
+/*  Confirm keys correctly sorted: count incorrectly sorted keys, if any */
+    for( i=1; i<gd->total_local_keys; i++ )
+        if( gd->key_array[i-1] > gd->key_array[i] )
+            j++;
+
+
+    if( j != 0 )
+    {
+        printf( "Processor %d:  Full_verify: number of keys out of sort: %d\n",
+                gd->my_rank, j );
+    }
+    else
+        gd->passed_verification++;
+           
+
+}
+
+
+
+
+/*****************************************************************/
+/*************             R  A  N  K             ****************/
+/*****************************************************************/
+
+
+void rank( global_data* gd, int iteration )
+{
+
+    INT_TYPE    i, k;
+
+    INT_TYPE    shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
+    INT_TYPE    key;
+    INT_TYPE2   bucket_sum_accumulator, j, m;
+    INT_TYPE    local_bucket_sum_accumulator;
+    INT_TYPE    min_key_val, max_key_val;
+    INT_TYPE    *key_buff_ptr;
+
+
+
+
+/*  Iteration alteration of keys */  
+    if(gd->my_rank == 0 )                    
+    {
+      gd->key_array[iteration] = iteration;
+      gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
+    }
+
+
+/*  Initialize */
+    for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )  
+    {
+        gd->bucket_size[i] = 0;
+        gd->bucket_size_totals[i] = 0;
+        gd->process_bucket_distrib_ptr1[i] = 0;
+        gd->process_bucket_distrib_ptr2[i] = 0;
+    }
+
+
+/*  Determine where the partial verify test keys are, load into  */
+/*  top of array bucket_size                                     */
+    for( i=0; i<TEST_ARRAY_SIZE; i++ )
+        if( (gd->test_index_array[i]/NUM_KEYS) == gd->my_rank )
+            gd->bucket_size[NUM_BUCKETS+i] = 
+                          gd->key_array[gd->test_index_array[i] % NUM_KEYS];
+
+
+/*  Determine the number of keys in each bucket */
+    for( i=0; i<NUM_KEYS; i++ )
+        gd->bucket_size[gd->key_array[i] >> shift]++;
+
+
+/*  Accumulative bucket sizes are the bucket pointers */
+    gd->bucket_ptrs[0] = 0;
+    for( i=1; i< NUM_BUCKETS; i++ )  
+        gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];
+
+
+/*  Sort into appropriate bucket */
+    for( i=0; i<NUM_KEYS; i++ )  
+    {
+        key = gd->key_array[i];
+        gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
+    }
+
+#ifdef  TIMING_ENABLED
+    timer_stop(gd, 2 );
+    timer_start(gd, 3 );
+#endif
+
+/*  Get the bucket size totals for the entire problem. These 
+    will be used to determine the redistribution of keys      */
+    MPI_Allreduce( gd->bucket_size, 
+                   gd->bucket_size_totals, 
+                   NUM_BUCKETS+TEST_ARRAY_SIZE, 
+                   MP_KEY_TYPE,
+                   MPI_SUM,
+                   MPI_COMM_WORLD );
+
+#ifdef  TIMING_ENABLED
+    timer_stop(gd, 3 );
+    timer_start(gd, 2 );
+#endif
+
+/*  Determine Redistibution of keys: accumulate the bucket size totals 
+    till this number surpasses NUM_KEYS (which the average number of keys
+    per processor).  Then all keys in these buckets go to processor 0.
+    Continue accumulating again until supassing 2*NUM_KEYS. All keys
+    in these buckets go to processor 1, etc.  This algorithm guarantees
+    that all processors have work ranking; no processors are left idle.
+    The optimum number of buckets, however, does not result in as high
+    a degree of load balancing (as even a distribution of keys as is
+    possible) as is obtained from increasing the number of buckets, but
+    more buckets results in more computation per processor so that the
+    optimum number of buckets turns out to be 1024 for machines tested.
+    Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket
+    number of first and last bucket which each processor will have after   
+    the redistribution is done.                                          */
+
+    bucket_sum_accumulator = 0;
+    local_bucket_sum_accumulator = 0;
+    gd->send_displ[0] = 0;
+    gd->process_bucket_distrib_ptr1[0] = 0;
+    for( i=0, j=0; i<NUM_BUCKETS; i++ )  
+    {
+        bucket_sum_accumulator       += gd->bucket_size_totals[i];
+        local_bucket_sum_accumulator += gd->bucket_size[i];
+        if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )  
+        {
+            gd->send_count[j] = local_bucket_sum_accumulator;
+            if( j != 0 )
+            {
+                gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
+                gd->process_bucket_distrib_ptr1[j] = 
+                                        gd->process_bucket_distrib_ptr2[j-1]+1;
+            }
+            gd->process_bucket_distrib_ptr2[j++] = i;
+            local_bucket_sum_accumulator = 0;
+        }
+    }
+
+/*  When NUM_PROCS approaching NUM_BUCKETS, it is highly possible
+    that the last few processors don't get any buckets.  So, we
+    need to set counts properly in this case to avoid any fallouts.    */
+    while( j < gd->comm_size )
+    {
+        gd->send_count[j] = 0;
+        gd->process_bucket_distrib_ptr1[j] = 1;
+        j++;
+    }
+
+#ifdef  TIMING_ENABLED
+    timer_stop(gd, 2 );
+    timer_start(gd, 3 ); 
+#endif
+
+/*  This is the redistribution section:  first find out how many keys
+    each processor will send to every other processor:                 */
+    MPI_Alltoall( gd->send_count,
+                  1,
+                  MPI_INT,
+                  gd->recv_count,
+                  1,
+                  MPI_INT,
+                  MPI_COMM_WORLD );
+    MPI_Wtime();
+
+/*  Determine the receive array displacements for the buckets */    
+    gd->recv_displ[0] = 0;
+    for( i=1; i<gd->comm_size; i++ )
+        gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];
+
+
+    MPI_Wtime();
+/*  Now send the keys to respective processors  */    
+    MPI_Alltoallv( gd->key_buff1,
+                   gd->send_count,
+                   gd->send_displ,
+                   MP_KEY_TYPE,
+                   gd->key_buff2,
+                   gd->recv_count,
+                   gd->recv_displ,
+                   MP_KEY_TYPE,
+                   MPI_COMM_WORLD );
+
+#ifdef  TIMING_ENABLED
+    timer_stop(gd, 3 ); 
+    timer_start(gd, 2 );
+#endif
+
+/*  The starting and ending bucket numbers on each processor are
+    multiplied by the interval size of the buckets to obtain the 
+    smallest possible min and greatest possible max value of any 
+    key on each processor                                          */
+    min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
+    max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;
+
+/*  Clear the work array */
+    for( i=0; i<max_key_val-min_key_val+1; i++ )
+        gd->key_buff1[i] = 0;
+
+/*  Determine the total number of keys on all other 
+    processors holding keys of lesser value         */
+    m = 0;
+    for( k=0; k<gd->my_rank; k++ )
+        for( i= gd->process_bucket_distrib_ptr1[k];
+             i<=gd->process_bucket_distrib_ptr2[k];
+             i++ )  
+            m += gd->bucket_size_totals[i]; /*  m has total # of lesser keys */
+
+/*  Determine total number of keys on this processor */
+    j = 0;                                 
+    for( i= gd->process_bucket_distrib_ptr1[gd->my_rank];
+         i<=gd->process_bucket_distrib_ptr2[gd->my_rank];
+         i++ )  
+        j += gd->bucket_size_totals[i];     /* j has total # of local keys   */
+
+
+/*  Ranking of all keys occurs in this section:                 */
+/*  shift it backwards so no subtractions are necessary in loop */
+    key_buff_ptr = gd->key_buff1 - min_key_val;
+
+/*  In this section, the keys themselves are used as their 
+    own indexes to determine how many of each there are: their
+    individual population                                       */
+    for( i=0; i<j; i++ )
+        key_buff_ptr[gd->key_buff2[i]]++;  /* Now they have individual key   */
+                                       /* population                     */
+
+/*  To obtain ranks of each key, successively add the individual key
+    population, not forgetting the total of lesser keys, m.
+    NOTE: Since the total of lesser keys would be subtracted later 
+    in verification, it is no longer added to the first key population 
+    here, but still needed during the partial verify test.  This is to 
+    ensure that 32-bit key_buff can still be used for class D.           */
+/*    key_buff_ptr[min_key_val] += m;    */
+    for( i=min_key_val; i<max_key_val; i++ )   
+        key_buff_ptr[i+1] += key_buff_ptr[i];  
+
+
+/* This is the partial verify test section */
+/* Observe that test_rank_array vals are   */
+/* shifted differently for different cases */
+    for( i=0; i<TEST_ARRAY_SIZE; i++ )
+    {                                             
+        k = gd->bucket_size_totals[i+NUM_BUCKETS];    /* Keys were hidden here */
+        if( min_key_val <= k  &&  k <= max_key_val )
+        {
+            /* Add the total of lesser keys, m, here */
+            INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
+            int failed = 0;
+
+            switch( CLASS )
+            {
+                case 'S':
+                    if( i <= 2 )
+                    {
+                        if( key_rank != gd->test_rank_array[i]+iteration )
+                            failed = 1;
+                        else
+                            gd->passed_verification++;
+                    }
+                    else
+                    {
+                        if( key_rank != gd->test_rank_array[i]-iteration )
+                            failed = 1;
+                        else
+                            gd->passed_verification++;
+                    }
+                    break;
+                case 'W':
+                    if( i < 2 )
+                    {
+                        if( key_rank != gd->test_rank_array[i]+(iteration-2) )
+                            failed = 1;
+                        else
+                            gd->passed_verification++;
+                    }
+                    else
+                    {
+                        if( key_rank != gd->test_rank_array[i]-iteration )
+                            failed = 1;
+                        else
+                            gd->passed_verification++;
+                    }
+                    break;
+                case 'A':
+                    if( i <= 2 )
+        	    {
+                        if( key_rank != gd->test_rank_array[i]+(iteration-1) )
+                            failed = 1;
+                        else
+                          gd->passed_verification++;
+        	    }
+                    else
+                    {
+                        if( key_rank !=  gd->test_rank_array[i]-(iteration-1) )
+                            failed = 1;
+                        else
+                            gd->passed_verification++;
+                    }
+                    break;
+                case 'B':
+                    if( i == 1 || i == 2 || i == 4 )
+        	    {
+                        if( key_rank != gd->test_rank_array[i]+iteration )
+                            failed = 1;
+                        else
+                            gd->passed_verification++;
+        	    }
+                    else
+                    {
+                        if( key_rank != gd->test_rank_array[i]-iteration )
+                            failed = 1;
+                        else
+                            gd->passed_verification++;
+                    }
+                    break;
+                case 'C':
+                    if( i <= 2 )
+        	    {
+                        if( key_rank != gd->test_rank_array[i]+iteration )
+                            failed = 1;
+                        else
+                            gd->passed_verification++;
+        	    }
+                    else
+                    {
+                        if( key_rank != gd->test_rank_array[i]-iteration )
+                            failed = 1;
+                        else
+                            gd->passed_verification++;
+                    }
+                    break;
+                case 'D':
+                    if( i < 2 )
+        	    {
+                        if( key_rank != gd->test_rank_array[i]+iteration )
+                            failed = 1;
+                        else
+                            gd->passed_verification++;
+        	    }
+                    else
+                    {
+                        if( key_rank != gd->test_rank_array[i]-iteration )
+                            failed = 1;
+                        else
+                            gd->passed_verification++;
+                    }
+                    break;
+            }
+            if( failed == 1 )
+                printf( "Failed partial verification: "
+                        "iteration %d, processor %d, test key %d\n", 
+                         iteration, gd->my_rank, (int)i );
+        }
+    }
+
+
+
+
+/*  Make copies of rank info for use by full_verify: these variables
+    in rank are local; making them global slows down the code, probably
+    since they cannot be made register by compiler                        */
+
+    if( iteration == MAX_ITERATIONS ) 
+    {
+        gd->key_buff_ptr_global = key_buff_ptr;
+        gd->total_local_keys    = j;
+        gd->total_lesser_keys   = 0;  /* no longer set to 'm', see note above */
+    }
+
+}      
+
+
+/*****************************************************************/
+/*************             M  A  I  N             ****************/
+/*****************************************************************/
+
+int main( int argc, char **argv )
+{
+
+    int             i, iteration, itemp;
+
+    double          timecounter, maxtime;
+
+    global_data* gd = malloc(sizeof(global_data));
+/*  Initialize MPI */
+    MPI_Init( &argc, &argv );
+    MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
+    MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );
+
+/*  Initialize the verification arrays if a valid class */
+    for( i=0; i<TEST_ARRAY_SIZE; i++ )
+        switch( CLASS )
+        {
+            case 'S':
+                gd->test_index_array[i] = S_test_index_array[i];
+                gd->test_rank_array[i]  = S_test_rank_array[i];
+                break;
+            case 'A':
+                gd->test_index_array[i] = A_test_index_array[i];
+                gd->test_rank_array[i]  = A_test_rank_array[i];
+                break;
+            case 'W':
+                gd->test_index_array[i] = W_test_index_array[i];
+                gd->test_rank_array[i]  = W_test_rank_array[i];
+                break;
+            case 'B':
+                gd->test_index_array[i] = B_test_index_array[i];
+                gd->test_rank_array[i]  = B_test_rank_array[i];
+                break;
+            case 'C':
+                gd->test_index_array[i] = C_test_index_array[i];
+                gd->test_rank_array[i]  = C_test_rank_array[i];
+                break;
+            case 'D':
+                gd->test_index_array[i] = D_test_index_array[i];
+                gd->test_rank_array[i]  = D_test_rank_array[i];
+                break;
+        };
+
+        
+
+/*  Printout initial NPB info */
+    if( gd->my_rank == 0 )
+    {
+        printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
+        printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS );
+        printf( " Iterations:   %d\n", MAX_ITERATIONS );
+        printf( " Number of processes:     %d\n",gd->comm_size );
+    }
+
+/*  Check that actual and compiled number of processors agree */
+    if( gd->comm_size != NUM_PROCS )
+    {
+        if( gd->my_rank == 0 )
+            printf( "\n ERROR: compiled for %d processes\n"
+                    " Number of active processes: %d\n"
+                    " Exiting program!\n\n", NUM_PROCS, gd->comm_size );
+        MPI_Finalize();
+        exit( 1 );
+    }
+
+/*  Check to see whether total number of processes is within bounds.
+    This could in principle be checked in setparams.c, but it is more
+    convenient to do it here                                               */
+    if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS)
+    {
+       if( gd->my_rank == 0 )
+           printf( "\n ERROR: number of processes %d not within range %d-%d"
+                   "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS);
+       MPI_Finalize();
+       exit( 1 );
+    }
+
+
+/*  Generate random number sequence and subsequent keys on all procs */
+    create_seq(gd,  find_my_seed( gd->my_rank, 
+                              gd->comm_size, 
+                              4*(long)TOTAL_KEYS*MIN_PROCS,
+                              314159265.00,      /* Random number gen seed */
+                              1220703125.00 ),   /* Random number gen mult */
+                1220703125.00 );                 /* Random number gen mult */
+
+/*  Do one interation for free (i.e., untimed) to guarantee initialization of  
+    all data and code pages and respective tables */
+    rank(gd, 1 );  
+
+/*  Start verification counter */
+    gd->passed_verification = 0;
+
+    if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n   iteration\n" );
+
+/*  Initialize timer  */             
+    timer_clear(gd, 0 );
+
+/*  Initialize separate communication, computation timing */
+#ifdef  TIMING_ENABLED 
+    for( i=1; i<=3; i++ ) timer_clear(gd, i );
+#endif
+
+/*  Start timer  */             
+    timer_start(gd, 0 );
+
+#ifdef  TIMING_ENABLED
+    timer_start(gd, 1 );
+    timer_start(gd, 2 );
+#endif
+
+/*  This is the main iteration */
+    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
+    {
+        if( gd->my_rank == 0 && CLASS != 'S' ) printf( "        %d\n", iteration );
+        rank(gd,  iteration );
+    }
+
+
+#ifdef  TIMING_ENABLED
+    timer_stop(gd, 2 );
+    timer_stop(gd, 1 );
+#endif
+
+/*  Stop timer, obtain time for processors */
+    timer_stop(gd, 0 );
+
+    timecounter = timer_read(gd, 0 );
+
+/*  End of timing, obtain maximum time of all processors */
+    MPI_Reduce( &timecounter,
+                &maxtime,
+                1,
+                MPI_DOUBLE,
+                MPI_MAX,
+                0,
+                MPI_COMM_WORLD );
+
+#ifdef  TIMING_ENABLED
+    {
+        double    tmin, tsum, tmax;
+    
+        if( my_rank == 0 )
+        {
+            printf( "\ntimer 1/2/3 = total/computation/communication time\n");
+            printf( "              min                avg                max\n" );
+        }
+        for( i=1; i<=3; i++ )
+        {
+            timecounter = timer_read(gd, i );
+            MPI_Reduce( &timecounter,
+                        &tmin,
+                        1,
+                        MPI_DOUBLE,
+                        MPI_MIN,
+                        0,
+                        MPI_COMM_WORLD );
+            MPI_Reduce( &timecounter,
+                        &tsum,
+                        1,
+                        MPI_DOUBLE,
+                        MPI_SUM,
+                        0,
+                        MPI_COMM_WORLD );
+            MPI_Reduce( &timecounter,
+                        &tmax,
+                        1,
+                        MPI_DOUBLE,
+                        MPI_MAX,
+                        0,
+                        MPI_COMM_WORLD );
+            if( my_rank == 0 )
+                printf( "timer %d:    %f           %f            %f\n",
+                        i, tmin, tsum/((double) comm_size), tmax );
+        }
+        if( my_rank == 0 )
+            printf( "\n" );
+    }
+#endif
+
+/*  This tests that keys are in sequence: sorting of last ranked key seq
+    occurs here, but is an untimed operation                             */
+    full_verify(gd);
+
+
+/*  Obtain verification counter sum */
+    itemp =gd->passed_verification;
+    MPI_Reduce( &itemp,
+                &gd->passed_verification,
+                1,
+                MPI_INT,
+                MPI_SUM,
+                0,
+                MPI_COMM_WORLD );
+
+
+
+/*  The final printout  */
+    if( gd->my_rank == 0 )
+    {
+        if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
+            gd->passed_verification = 0;
+        c_print_results( "IS",
+                         CLASS,
+                         (int)(TOTAL_KEYS),
+                         MIN_PROCS,
+                         0,
+                         MAX_ITERATIONS,
+                         NUM_PROCS,
+                         gd->comm_size,
+                         maxtime,
+                         ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS)
+                                                      /maxtime/1000000.,
+                         "keys ranked", 
+                         gd->passed_verification,
+                         NPBVERSION,
+                         COMPILETIME,
+                         MPICC,
+                         CLINK,
+                         CMPI_LIB,
+                         CMPI_INC,
+                         CFLAGS,
+                         CLINKFLAGS );
+    }
+                    
+    MPI_Finalize();
+    free(gd);
+
+    return 0;
+         /**************************/
+}        /*  E N D  P R O G R A M  */
+         /**************************/
diff --git a/examples/smpi/NAS/LU/Makefile b/examples/smpi/NAS/LU/Makefile
new file mode 100644
index 0000000000..a05c94dc4a
--- /dev/null
+++ b/examples/smpi/NAS/LU/Makefile
@@ -0,0 +1,74 @@
+SHELL=/bin/sh
+BENCHMARK=lu
+BENCHMARKU=LU
+VEC=
+
+include ../config/make.def
+
+OBJS = lu.o init_comm.o read_input.o bcast_inputs.o proc_grid.o neighbors.o \
+       nodedim.o subdomain.o setcoeff.o sethyper.o setbv.o exact.o setiv.o \
+       erhs.o ssor.o exchange_1.o exchange_3.o exchange_4.o exchange_5.o \
+       exchange_6.o rhs.o l2norm.o jacld.o blts$(VEC).o jacu.o buts$(VEC).o \
+       error.o pintgr.o verify.o ${COMMON}/print_results.o ${COMMON}/timers.o
+
+include ../sys/make.common
+
+
+# npbparams.h is included by applu.incl
+# The following rule should do the trick but many make programs (not gmake)
+# will do the wrong thing and rebuild the world every time (because the
+# mod time on header.h is not changed. One solution would be to 
+# touch header.h but this might cause confusion if someone has
+# accidentally deleted it. Instead, make the dependency on npbparams.h
+# explicit in all the lines below (even though dependence is indirect). 
+
+# applu.incl: npbparams.h
+
+${PROGRAM}: config
+	@if [ x$(VERSION) = xvec ] ; then	\
+		${MAKE} VEC=_vec exec;		\
+	elif [ x$(VERSION) = xVEC ] ; then	\
+		${MAKE} VEC=_vec exec;		\
+	else					\
+		${MAKE} exec;			\
+	fi
+
+exec: $(OBJS)
+	${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
+
+.f.o :
+	${FCOMPILE} $<
+
+lu.o:		lu.f applu.incl npbparams.h
+bcast_inputs.o:	bcast_inputs.f applu.incl npbparams.h mpinpb.h
+blts$(VEC).o:	blts$(VEC).f
+buts$(VEC).o:	buts$(VEC).f
+erhs.o:		erhs.f applu.incl npbparams.h
+error.o:	error.f applu.incl npbparams.h mpinpb.h
+exact.o:	exact.f applu.incl npbparams.h
+exchange_1.o:	exchange_1.f applu.incl npbparams.h mpinpb.h
+exchange_3.o:	exchange_3.f applu.incl npbparams.h mpinpb.h
+exchange_4.o:	exchange_4.f applu.incl npbparams.h mpinpb.h
+exchange_5.o:	exchange_5.f applu.incl npbparams.h mpinpb.h
+exchange_6.o:	exchange_6.f applu.incl npbparams.h mpinpb.h
+init_comm.o:	init_comm.f applu.incl npbparams.h mpinpb.h 
+jacld.o:	jacld.f applu.incl npbparams.h
+jacu.o:		jacu.f applu.incl npbparams.h
+l2norm.o:	l2norm.f mpinpb.h
+neighbors.o:	neighbors.f applu.incl npbparams.h
+nodedim.o:	nodedim.f
+pintgr.o:	pintgr.f applu.incl npbparams.h mpinpb.h
+proc_grid.o:	proc_grid.f applu.incl npbparams.h
+read_input.o:	read_input.f applu.incl npbparams.h mpinpb.h
+rhs.o:		rhs.f applu.incl npbparams.h
+setbv.o:	setbv.f applu.incl npbparams.h
+setiv.o:	setiv.f applu.incl npbparams.h
+setcoeff.o:	setcoeff.f applu.incl npbparams.h
+sethyper.o:	sethyper.f applu.incl npbparams.h
+ssor.o:		ssor.f applu.incl npbparams.h mpinpb.h
+subdomain.o:	subdomain.f applu.incl npbparams.h mpinpb.h
+verify.o:	verify.f applu.incl npbparams.h
+
+clean:
+	- /bin/rm -f npbparams.h
+	- /bin/rm -f *.o *~
diff --git a/examples/smpi/NAS/LU/applu.incl b/examples/smpi/NAS/LU/applu.incl
new file mode 100644
index 0000000000..413fc834e3
--- /dev/null
+++ b/examples/smpi/NAS/LU/applu.incl
@@ -0,0 +1,153 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+c---  applu.incl   
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   npbparams.h defines parameters that depend on the class and 
+c   number of nodes
+c---------------------------------------------------------------------
+
+      include 'npbparams.h'
+
+c---------------------------------------------------------------------
+c   parameters which can be overridden in runtime config file
+c   (in addition to size of problem - isiz01,02,03 give the maximum size)
+c   ipr = 1 to print out verbose information
+c   omega = 2.0 is correct for all classes
+c   tolrsd is tolerance levels for steady state residuals
+c---------------------------------------------------------------------
+      integer ipr_default
+      parameter (ipr_default = 1)
+      double precision omega_default
+      parameter (omega_default = 1.2d0)
+      double precision tolrsd1_def, tolrsd2_def, tolrsd3_def, 
+     >                 tolrsd4_def, tolrsd5_def
+      parameter (tolrsd1_def=1.0e-08, 
+     >          tolrsd2_def=1.0e-08, tolrsd3_def=1.0e-08, 
+     >          tolrsd4_def=1.0e-08, tolrsd5_def=1.0e-08)
+
+      double precision c1, c2, c3, c4, c5
+      parameter( c1 = 1.40d+00, c2 = 0.40d+00,
+     >           c3 = 1.00d-01, c4 = 1.00d+00,
+     >           c5 = 1.40d+00 )
+
+c---------------------------------------------------------------------
+c   grid
+c---------------------------------------------------------------------
+      integer nx, ny, nz
+      integer nx0, ny0, nz0
+      integer ipt, ist, iend
+      integer jpt, jst, jend
+      integer ii1, ii2
+      integer ji1, ji2
+      integer ki1, ki2
+      double precision  dxi, deta, dzeta
+      double precision  tx1, tx2, tx3
+      double precision  ty1, ty2, ty3
+      double precision  tz1, tz2, tz3
+
+      common/cgcon/ dxi, deta, dzeta,
+     >              tx1, tx2, tx3,
+     >              ty1, ty2, ty3,
+     >              tz1, tz2, tz3,
+     >              nx, ny, nz, 
+     >              nx0, ny0, nz0,
+     >              ipt, ist, iend,
+     >              jpt, jst, jend,
+     >              ii1, ii2, 
+     >              ji1, ji2, 
+     >              ki1, ki2
+
+c---------------------------------------------------------------------
+c   dissipation
+c---------------------------------------------------------------------
+      double precision dx1, dx2, dx3, dx4, dx5
+      double precision dy1, dy2, dy3, dy4, dy5
+      double precision dz1, dz2, dz3, dz4, dz5
+      double precision dssp
+
+      common/disp/ dx1,dx2,dx3,dx4,dx5,
+     >             dy1,dy2,dy3,dy4,dy5,
+     >             dz1,dz2,dz3,dz4,dz5,
+     >             dssp
+
+c---------------------------------------------------------------------
+c   field variables and residuals
+c---------------------------------------------------------------------
+      double precision u(5,-1:isiz1+2,-1:isiz2+2,isiz3),
+     >       rsd(5,-1:isiz1+2,-1:isiz2+2,isiz3),
+     >       frct(5,-1:isiz1+2,-1:isiz2+2,isiz3),
+     >       flux(5,0:isiz1+1,0:isiz2+1,isiz3)
+
+      common/cvar/ u,
+     >             rsd,
+     >             frct,
+     >             flux
+
+
+c---------------------------------------------------------------------
+c   output control parameters
+c---------------------------------------------------------------------
+      integer ipr, inorm
+
+      common/cprcon/ ipr, inorm
+
+c---------------------------------------------------------------------
+c   newton-raphson iteration control parameters
+c---------------------------------------------------------------------
+      integer itmax, invert
+      double precision  dt, omega, tolrsd(5),
+     >        rsdnm(5), errnm(5), frc, ttotal
+
+      common/ctscon/ dt, omega, tolrsd,
+     >               rsdnm, errnm, frc, ttotal,
+     >               itmax, invert
+
+      double precision a(5,5,isiz1,isiz2),
+     >       b(5,5,isiz1,isiz2),
+     >       c(5,5,isiz1,isiz2),
+     >       d(5,5,isiz1,isiz2)
+
+      common/cjac/ a, b, c, d
+
+c---------------------------------------------------------------------
+c   coefficients of the exact solution
+c---------------------------------------------------------------------
+      double precision ce(5,13)
+
+      common/cexact/ ce
+
+c---------------------------------------------------------------------
+c   multi-processor common blocks
+c---------------------------------------------------------------------
+      integer id, ndim, num, xdim, ydim, row, col
+      common/dim/ id,ndim,num,xdim,ydim,row,col
+
+      integer north,south,east,west
+      common/neigh/ north,south,east, west
+
+      integer from_s,from_n,from_e,from_w
+      parameter (from_s=1,from_n=2,from_e=3,from_w=4)
+
+      integer npmax
+      parameter (npmax=isiz01+isiz02)
+
+      logical icommn(npmax+1),icomms(npmax+1),
+     >        icomme(npmax+1),icommw(npmax+1)
+      double precision  buf(5,2*isiz2*isiz3),
+     >                  buf1(5,2*isiz2*isiz3)
+
+      common/comm/ buf, buf1,
+     >             icommn,icomms,
+     >             icomme,icommw
+
+      double precision maxtime
+      common/timer/maxtime
+
+
+c---------------------------------------------------------------------
+c   end of include file
+c---------------------------------------------------------------------
diff --git a/examples/smpi/NAS/LU/bcast_inputs.f b/examples/smpi/NAS/LU/bcast_inputs.f
new file mode 100644
index 0000000000..c606724bec
--- /dev/null
+++ b/examples/smpi/NAS/LU/bcast_inputs.f
@@ -0,0 +1,41 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine bcast_inputs
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer ierr
+
+c---------------------------------------------------------------------
+c   root broadcasts the data
+c   The data isn't contiguous or of the same type, so it's not
+c   clear how to send it in the "MPI" way. 
+c   We could pack the info into a buffer or we could create
+c   an obscene datatype to handle it all at once. Since we only
+c   broadcast the data once, just use a separate broadcast for
+c   each piece. 
+c---------------------------------------------------------------------
+      call MPI_BCAST(ipr, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
+      call MPI_BCAST(inorm, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
+      call MPI_BCAST(itmax, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
+      call MPI_BCAST(dt, 1, dp_type, root, MPI_COMM_WORLD, ierr)
+      call MPI_BCAST(omega, 1, dp_type, root, MPI_COMM_WORLD, ierr)
+      call MPI_BCAST(tolrsd, 5, dp_type, root, MPI_COMM_WORLD, ierr)
+      call MPI_BCAST(nx0, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
+      call MPI_BCAST(ny0, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
+      call MPI_BCAST(nz0, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
+
+      return
+      end
+
+
diff --git a/examples/smpi/NAS/LU/blts.f b/examples/smpi/NAS/LU/blts.f
new file mode 100644
index 0000000000..9861261b03
--- /dev/null
+++ b/examples/smpi/NAS/LU/blts.f
@@ -0,0 +1,261 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine blts ( ldmx, ldmy, ldmz,
+     >                  nx, ny, nz, k,
+     >                  omega,
+     >                  v,
+     >                  ldz, ldy, ldx, d,
+     >                  ist, iend, jst, jend,
+     >                  nx0, ny0, ipt, jpt)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c   compute the regular-sparse, block lower triangular solution:
+c
+c                     v <-- ( L-inv ) * v
+c
+c---------------------------------------------------------------------
+
+      implicit none
+
+c---------------------------------------------------------------------
+c  input parameters
+c---------------------------------------------------------------------
+      integer ldmx, ldmy, ldmz
+      integer nx, ny, nz
+      integer k
+      double precision  omega
+      double precision  v( 5, -1:ldmx+2, -1:ldmy+2, *),
+     >        ldz( 5, 5, ldmx, ldmy),
+     >        ldy( 5, 5, ldmx, ldmy),
+     >        ldx( 5, 5, ldmx, ldmy),
+     >        d( 5, 5, ldmx, ldmy)
+      integer ist, iend
+      integer jst, jend
+      integer nx0, ny0
+      integer ipt, jpt
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer i, j, m
+      integer iex
+      double precision  tmp, tmp1
+      double precision  tmat(5,5)
+
+
+c---------------------------------------------------------------------
+c   receive data from north and west
+c---------------------------------------------------------------------
+      iex = 0
+      call exchange_1( v,k,iex )
+
+
+      do j = jst, jend
+         do i = ist, iend
+            do m = 1, 5
+
+                  v( m, i, j, k ) =  v( m, i, j, k )
+     >    - omega * (  ldz( m, 1, i, j ) * v( 1, i, j, k-1 )
+     >               + ldz( m, 2, i, j ) * v( 2, i, j, k-1 )
+     >               + ldz( m, 3, i, j ) * v( 3, i, j, k-1 )
+     >               + ldz( m, 4, i, j ) * v( 4, i, j, k-1 )
+     >               + ldz( m, 5, i, j ) * v( 5, i, j, k-1 )  )
+
+            end do
+         end do
+      end do
+
+
+      do j=jst,jend
+        do i = ist, iend
+
+            do m = 1, 5
+
+                  v( m, i, j, k ) =  v( m, i, j, k )
+     > - omega * ( ldy( m, 1, i, j ) * v( 1, i, j-1, k )
+     >           + ldx( m, 1, i, j ) * v( 1, i-1, j, k )
+     >           + ldy( m, 2, i, j ) * v( 2, i, j-1, k )
+     >           + ldx( m, 2, i, j ) * v( 2, i-1, j, k )
+     >           + ldy( m, 3, i, j ) * v( 3, i, j-1, k )
+     >           + ldx( m, 3, i, j ) * v( 3, i-1, j, k )
+     >           + ldy( m, 4, i, j ) * v( 4, i, j-1, k )
+     >           + ldx( m, 4, i, j ) * v( 4, i-1, j, k )
+     >           + ldy( m, 5, i, j ) * v( 5, i, j-1, k )
+     >           + ldx( m, 5, i, j ) * v( 5, i-1, j, k ) )
+
+            end do
+       
+c---------------------------------------------------------------------
+c   diagonal block inversion
+c
+c   forward elimination
+c---------------------------------------------------------------------
+            do m = 1, 5
+               tmat( m, 1 ) = d( m, 1, i, j )
+               tmat( m, 2 ) = d( m, 2, i, j )
+               tmat( m, 3 ) = d( m, 3, i, j )
+               tmat( m, 4 ) = d( m, 4, i, j )
+               tmat( m, 5 ) = d( m, 5, i, j )
+            end do
+
+            tmp1 = 1.0d+00 / tmat( 1, 1 )
+            tmp = tmp1 * tmat( 2, 1 )
+            tmat( 2, 2 ) =  tmat( 2, 2 )
+     >           - tmp * tmat( 1, 2 )
+            tmat( 2, 3 ) =  tmat( 2, 3 )
+     >           - tmp * tmat( 1, 3 )
+            tmat( 2, 4 ) =  tmat( 2, 4 )
+     >           - tmp * tmat( 1, 4 )
+            tmat( 2, 5 ) =  tmat( 2, 5 )
+     >           - tmp * tmat( 1, 5 )
+            v( 2, i, j, k ) = v( 2, i, j, k )
+     >        - v( 1, i, j, k ) * tmp
+
+            tmp = tmp1 * tmat( 3, 1 )
+            tmat( 3, 2 ) =  tmat( 3, 2 )
+     >           - tmp * tmat( 1, 2 )
+            tmat( 3, 3 ) =  tmat( 3, 3 )
+     >           - tmp * tmat( 1, 3 )
+            tmat( 3, 4 ) =  tmat( 3, 4 )
+     >           - tmp * tmat( 1, 4 )
+            tmat( 3, 5 ) =  tmat( 3, 5 )
+     >           - tmp * tmat( 1, 5 )
+            v( 3, i, j, k ) = v( 3, i, j, k )
+     >        - v( 1, i, j, k ) * tmp
+
+            tmp = tmp1 * tmat( 4, 1 )
+            tmat( 4, 2 ) =  tmat( 4, 2 )
+     >           - tmp * tmat( 1, 2 )
+            tmat( 4, 3 ) =  tmat( 4, 3 )
+     >           - tmp * tmat( 1, 3 )
+            tmat( 4, 4 ) =  tmat( 4, 4 )
+     >           - tmp * tmat( 1, 4 )
+            tmat( 4, 5 ) =  tmat( 4, 5 )
+     >           - tmp * tmat( 1, 5 )
+            v( 4, i, j, k ) = v( 4, i, j, k )
+     >        - v( 1, i, j, k ) * tmp
+
+            tmp = tmp1 * tmat( 5, 1 )
+            tmat( 5, 2 ) =  tmat( 5, 2 )
+     >           - tmp * tmat( 1, 2 )
+            tmat( 5, 3 ) =  tmat( 5, 3 )
+     >           - tmp * tmat( 1, 3 )
+            tmat( 5, 4 ) =  tmat( 5, 4 )
+     >           - tmp * tmat( 1, 4 )
+            tmat( 5, 5 ) =  tmat( 5, 5 )
+     >           - tmp * tmat( 1, 5 )
+            v( 5, i, j, k ) = v( 5, i, j, k )
+     >        - v( 1, i, j, k ) * tmp
+
+
+
+            tmp1 = 1.0d+00 / tmat( 2, 2 )
+            tmp = tmp1 * tmat( 3, 2 )
+            tmat( 3, 3 ) =  tmat( 3, 3 )
+     >           - tmp * tmat( 2, 3 )
+            tmat( 3, 4 ) =  tmat( 3, 4 )
+     >           - tmp * tmat( 2, 4 )
+            tmat( 3, 5 ) =  tmat( 3, 5 )
+     >           - tmp * tmat( 2, 5 )
+            v( 3, i, j, k ) = v( 3, i, j, k )
+     >        - v( 2, i, j, k ) * tmp
+
+            tmp = tmp1 * tmat( 4, 2 )
+            tmat( 4, 3 ) =  tmat( 4, 3 )
+     >           - tmp * tmat( 2, 3 )
+            tmat( 4, 4 ) =  tmat( 4, 4 )
+     >           - tmp * tmat( 2, 4 )
+            tmat( 4, 5 ) =  tmat( 4, 5 )
+     >           - tmp * tmat( 2, 5 )
+            v( 4, i, j, k ) = v( 4, i, j, k )
+     >        - v( 2, i, j, k ) * tmp
+
+            tmp = tmp1 * tmat( 5, 2 )
+            tmat( 5, 3 ) =  tmat( 5, 3 )
+     >           - tmp * tmat( 2, 3 )
+            tmat( 5, 4 ) =  tmat( 5, 4 )
+     >           - tmp * tmat( 2, 4 )
+            tmat( 5, 5 ) =  tmat( 5, 5 )
+     >           - tmp * tmat( 2, 5 )
+            v( 5, i, j, k ) = v( 5, i, j, k )
+     >        - v( 2, i, j, k ) * tmp
+
+
+
+            tmp1 = 1.0d+00 / tmat( 3, 3 )
+            tmp = tmp1 * tmat( 4, 3 )
+            tmat( 4, 4 ) =  tmat( 4, 4 )
+     >           - tmp * tmat( 3, 4 )
+            tmat( 4, 5 ) =  tmat( 4, 5 )
+     >           - tmp * tmat( 3, 5 )
+            v( 4, i, j, k ) = v( 4, i, j, k )
+     >        - v( 3, i, j, k ) * tmp
+
+            tmp = tmp1 * tmat( 5, 3 )
+            tmat( 5, 4 ) =  tmat( 5, 4 )
+     >           - tmp * tmat( 3, 4 )
+            tmat( 5, 5 ) =  tmat( 5, 5 )
+     >           - tmp * tmat( 3, 5 )
+            v( 5, i, j, k ) = v( 5, i, j, k )
+     >        - v( 3, i, j, k ) * tmp
+
+
+
+            tmp1 = 1.0d+00 / tmat( 4, 4 )
+            tmp = tmp1 * tmat( 5, 4 )
+            tmat( 5, 5 ) =  tmat( 5, 5 )
+     >           - tmp * tmat( 4, 5 )
+            v( 5, i, j, k ) = v( 5, i, j, k )
+     >        - v( 4, i, j, k ) * tmp
+
+c---------------------------------------------------------------------
+c   back substitution
+c---------------------------------------------------------------------
+            v( 5, i, j, k ) = v( 5, i, j, k )
+     >                      / tmat( 5, 5 )
+
+            v( 4, i, j, k ) = v( 4, i, j, k )
+     >           - tmat( 4, 5 ) * v( 5, i, j, k )
+            v( 4, i, j, k ) = v( 4, i, j, k )
+     >                      / tmat( 4, 4 )
+
+            v( 3, i, j, k ) = v( 3, i, j, k )
+     >           - tmat( 3, 4 ) * v( 4, i, j, k )
+     >           - tmat( 3, 5 ) * v( 5, i, j, k )
+            v( 3, i, j, k ) = v( 3, i, j, k )
+     >                      / tmat( 3, 3 )
+
+            v( 2, i, j, k ) = v( 2, i, j, k )
+     >           - tmat( 2, 3 ) * v( 3, i, j, k )
+     >           - tmat( 2, 4 ) * v( 4, i, j, k )
+     >           - tmat( 2, 5 ) * v( 5, i, j, k )
+            v( 2, i, j, k ) = v( 2, i, j, k )
+     >                      / tmat( 2, 2 )
+
+            v( 1, i, j, k ) = v( 1, i, j, k )
+     >           - tmat( 1, 2 ) * v( 2, i, j, k )
+     >           - tmat( 1, 3 ) * v( 3, i, j, k )
+     >           - tmat( 1, 4 ) * v( 4, i, j, k )
+     >           - tmat( 1, 5 ) * v( 5, i, j, k )
+            v( 1, i, j, k ) = v( 1, i, j, k )
+     >                      / tmat( 1, 1 )
+
+
+        enddo
+      enddo
+
+c---------------------------------------------------------------------
+c   send data to east and south
+c---------------------------------------------------------------------
+      iex = 2
+      call exchange_1( v,k,iex )
+
+      return
+      end
+
+
diff --git a/examples/smpi/NAS/LU/blts_vec.f b/examples/smpi/NAS/LU/blts_vec.f
new file mode 100644
index 0000000000..f90ea84560
--- /dev/null
+++ b/examples/smpi/NAS/LU/blts_vec.f
@@ -0,0 +1,334 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine blts ( ldmx, ldmy, ldmz,
+     >                  nx, ny, nz, k,
+     >                  omega,
+     >                  v,
+     >                  ldz, ldy, ldx, d,
+     >                  ist, iend, jst, jend,
+     >                  nx0, ny0, ipt, jpt)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c   compute the regular-sparse, block lower triangular solution:
+c
+c                     v <-- ( L-inv ) * v
+c
+c---------------------------------------------------------------------
+
+      implicit none
+
+c---------------------------------------------------------------------
+c  input parameters
+c---------------------------------------------------------------------
+      integer ldmx, ldmy, ldmz
+      integer nx, ny, nz
+      integer k
+      double precision  omega
+      double precision  v( 5, -1:ldmx+2, -1:ldmy+2, *),
+     >        ldz( 5, 5, ldmx, ldmy),
+     >        ldy( 5, 5, ldmx, ldmy),
+     >        ldx( 5, 5, ldmx, ldmy),
+     >        d( 5, 5, ldmx, ldmy)
+      integer ist, iend
+      integer jst, jend
+      integer nx0, ny0
+      integer ipt, jpt
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer i, j, m, l, istp, iendp
+      integer iex
+      double precision  tmp, tmp1
+      double precision  tmat(5,5)
+
+
+c---------------------------------------------------------------------
+c   receive data from north and west
+c---------------------------------------------------------------------
+      iex = 0
+      call exchange_1( v,k,iex )
+
+
+      do j = jst, jend
+         do i = ist, iend
+            do m = 1, 5
+
+                  v( m, i, j, k ) =  v( m, i, j, k )
+     >    - omega * (  ldz( m, 1, i, j ) * v( 1, i, j, k-1 )
+     >               + ldz( m, 2, i, j ) * v( 2, i, j, k-1 )
+     >               + ldz( m, 3, i, j ) * v( 3, i, j, k-1 )
+     >               + ldz( m, 4, i, j ) * v( 4, i, j, k-1 )
+     >               + ldz( m, 5, i, j ) * v( 5, i, j, k-1 )  )
+
+            end do
+         end do
+      end do
+
+
+      do l = ist+jst, iend+jend
+         istp  = max(l - jend, ist)
+         iendp = min(l - jst, iend)
+
+!dir$ ivdep
+         do i = istp, iendp
+            j = l - i
+
+!!dir$ unroll 5
+!   manually unroll the loop
+!            do m = 1, 5
+
+                  v( 1, i, j, k ) =  v( 1, i, j, k )
+     > - omega * ( ldy( 1, 1, i, j ) * v( 1, i, j-1, k )
+     >           + ldx( 1, 1, i, j ) * v( 1, i-1, j, k )
+     >           + ldy( 1, 2, i, j ) * v( 2, i, j-1, k )
+     >           + ldx( 1, 2, i, j ) * v( 2, i-1, j, k )
+     >           + ldy( 1, 3, i, j ) * v( 3, i, j-1, k )
+     >           + ldx( 1, 3, i, j ) * v( 3, i-1, j, k )
+     >           + ldy( 1, 4, i, j ) * v( 4, i, j-1, k )
+     >           + ldx( 1, 4, i, j ) * v( 4, i-1, j, k )
+     >           + ldy( 1, 5, i, j ) * v( 5, i, j-1, k )
+     >           + ldx( 1, 5, i, j ) * v( 5, i-1, j, k ) )
+                  v( 2, i, j, k ) =  v( 2, i, j, k )
+     > - omega * ( ldy( 2, 1, i, j ) * v( 1, i, j-1, k )
+     >           + ldx( 2, 1, i, j ) * v( 1, i-1, j, k )
+     >           + ldy( 2, 2, i, j ) * v( 2, i, j-1, k )
+     >           + ldx( 2, 2, i, j ) * v( 2, i-1, j, k )
+     >           + ldy( 2, 3, i, j ) * v( 3, i, j-1, k )
+     >           + ldx( 2, 3, i, j ) * v( 3, i-1, j, k )
+     >           + ldy( 2, 4, i, j ) * v( 4, i, j-1, k )
+     >           + ldx( 2, 4, i, j ) * v( 4, i-1, j, k )
+     >           + ldy( 2, 5, i, j ) * v( 5, i, j-1, k )
+     >           + ldx( 2, 5, i, j ) * v( 5, i-1, j, k ) )
+                  v( 3, i, j, k ) =  v( 3, i, j, k )
+     > - omega * ( ldy( 3, 1, i, j ) * v( 1, i, j-1, k )
+     >           + ldx( 3, 1, i, j ) * v( 1, i-1, j, k )
+     >           + ldy( 3, 2, i, j ) * v( 2, i, j-1, k )
+     >           + ldx( 3, 2, i, j ) * v( 2, i-1, j, k )
+     >           + ldy( 3, 3, i, j ) * v( 3, i, j-1, k )
+     >           + ldx( 3, 3, i, j ) * v( 3, i-1, j, k )
+     >           + ldy( 3, 4, i, j ) * v( 4, i, j-1, k )
+     >           + ldx( 3, 4, i, j ) * v( 4, i-1, j, k )
+     >           + ldy( 3, 5, i, j ) * v( 5, i, j-1, k )
+     >           + ldx( 3, 5, i, j ) * v( 5, i-1, j, k ) )
+                  v( 4, i, j, k ) =  v( 4, i, j, k )
+     > - omega * ( ldy( 4, 1, i, j ) * v( 1, i, j-1, k )
+     >           + ldx( 4, 1, i, j ) * v( 1, i-1, j, k )
+     >           + ldy( 4, 2, i, j ) * v( 2, i, j-1, k )
+     >           + ldx( 4, 2, i, j ) * v( 2, i-1, j, k )
+     >           + ldy( 4, 3, i, j ) * v( 3, i, j-1, k )
+     >           + ldx( 4, 3, i, j ) * v( 3, i-1, j, k )
+     >           + ldy( 4, 4, i, j ) * v( 4, i, j-1, k )
+     >           + ldx( 4, 4, i, j ) * v( 4, i-1, j, k )
+     >           + ldy( 4, 5, i, j ) * v( 5, i, j-1, k )
+     >           + ldx( 4, 5, i, j ) * v( 5, i-1, j, k ) )
+                  v( 5, i, j, k ) =  v( 5, i, j, k )
+     > - omega * ( ldy( 5, 1, i, j ) * v( 1, i, j-1, k )
+     >           + ldx( 5, 1, i, j ) * v( 1, i-1, j, k )
+     >           + ldy( 5, 2, i, j ) * v( 2, i, j-1, k )
+     >           + ldx( 5, 2, i, j ) * v( 2, i-1, j, k )
+     >           + ldy( 5, 3, i, j ) * v( 3, i, j-1, k )
+     >           + ldx( 5, 3, i, j ) * v( 3, i-1, j, k )
+     >           + ldy( 5, 4, i, j ) * v( 4, i, j-1, k )
+     >           + ldx( 5, 4, i, j ) * v( 4, i-1, j, k )
+     >           + ldy( 5, 5, i, j ) * v( 5, i, j-1, k )
+     >           + ldx( 5, 5, i, j ) * v( 5, i-1, j, k ) )
+
+!            end do
+       
+c---------------------------------------------------------------------
+c   diagonal block inversion
+c
+c   forward elimination
+c---------------------------------------------------------------------
+!!dir$ unroll 5
+!   manually unroll the loop
+!            do m = 1, 5
+               tmat( 1, 1 ) = d( 1, 1, i, j )
+               tmat( 1, 2 ) = d( 1, 2, i, j )
+               tmat( 1, 3 ) = d( 1, 3, i, j )
+               tmat( 1, 4 ) = d( 1, 4, i, j )
+               tmat( 1, 5 ) = d( 1, 5, i, j )
+               tmat( 2, 1 ) = d( 2, 1, i, j )
+               tmat( 2, 2 ) = d( 2, 2, i, j )
+               tmat( 2, 3 ) = d( 2, 3, i, j )
+               tmat( 2, 4 ) = d( 2, 4, i, j )
+               tmat( 2, 5 ) = d( 2, 5, i, j )
+               tmat( 3, 1 ) = d( 3, 1, i, j )
+               tmat( 3, 2 ) = d( 3, 2, i, j )
+               tmat( 3, 3 ) = d( 3, 3, i, j )
+               tmat( 3, 4 ) = d( 3, 4, i, j )
+               tmat( 3, 5 ) = d( 3, 5, i, j )
+               tmat( 4, 1 ) = d( 4, 1, i, j )
+               tmat( 4, 2 ) = d( 4, 2, i, j )
+               tmat( 4, 3 ) = d( 4, 3, i, j )
+               tmat( 4, 4 ) = d( 4, 4, i, j )
+               tmat( 4, 5 ) = d( 4, 5, i, j )
+               tmat( 5, 1 ) = d( 5, 1, i, j )
+               tmat( 5, 2 ) = d( 5, 2, i, j )
+               tmat( 5, 3 ) = d( 5, 3, i, j )
+               tmat( 5, 4 ) = d( 5, 4, i, j )
+               tmat( 5, 5 ) = d( 5, 5, i, j )
+!            end do
+
+            tmp1 = 1.0d+00 / tmat( 1, 1 )
+            tmp = tmp1 * tmat( 2, 1 )
+            tmat( 2, 2 ) =  tmat( 2, 2 )
+     >           - tmp * tmat( 1, 2 )
+            tmat( 2, 3 ) =  tmat( 2, 3 )
+     >           - tmp * tmat( 1, 3 )
+            tmat( 2, 4 ) =  tmat( 2, 4 )
+     >           - tmp * tmat( 1, 4 )
+            tmat( 2, 5 ) =  tmat( 2, 5 )
+     >           - tmp * tmat( 1, 5 )
+            v( 2, i, j, k ) = v( 2, i, j, k )
+     >        - v( 1, i, j, k ) * tmp
+
+            tmp = tmp1 * tmat( 3, 1 )
+            tmat( 3, 2 ) =  tmat( 3, 2 )
+     >           - tmp * tmat( 1, 2 )
+            tmat( 3, 3 ) =  tmat( 3, 3 )
+     >           - tmp * tmat( 1, 3 )
+            tmat( 3, 4 ) =  tmat( 3, 4 )
+     >           - tmp * tmat( 1, 4 )
+            tmat( 3, 5 ) =  tmat( 3, 5 )
+     >           - tmp * tmat( 1, 5 )
+            v( 3, i, j, k ) = v( 3, i, j, k )
+     >        - v( 1, i, j, k ) * tmp
+
+            tmp = tmp1 * tmat( 4, 1 )
+            tmat( 4, 2 ) =  tmat( 4, 2 )
+     >           - tmp * tmat( 1, 2 )
+            tmat( 4, 3 ) =  tmat( 4, 3 )
+     >           - tmp * tmat( 1, 3 )
+            tmat( 4, 4 ) =  tmat( 4, 4 )
+     >           - tmp * tmat( 1, 4 )
+            tmat( 4, 5 ) =  tmat( 4, 5 )
+     >           - tmp * tmat( 1, 5 )
+            v( 4, i, j, k ) = v( 4, i, j, k )
+     >        - v( 1, i, j, k ) * tmp
+
+            tmp = tmp1 * tmat( 5, 1 )
+            tmat( 5, 2 ) =  tmat( 5, 2 )
+     >           - tmp * tmat( 1, 2 )
+            tmat( 5, 3 ) =  tmat( 5, 3 )
+     >           - tmp * tmat( 1, 3 )
+            tmat( 5, 4 ) =  tmat( 5, 4 )
+     >           - tmp * tmat( 1, 4 )
+            tmat( 5, 5 ) =  tmat( 5, 5 )
+     >           - tmp * tmat( 1, 5 )
+            v( 5, i, j, k ) = v( 5, i, j, k )
+     >        - v( 1, i, j, k ) * tmp
+
+
+
+            tmp1 = 1.0d+00 / tmat( 2, 2 )
+            tmp = tmp1 * tmat( 3, 2 )
+            tmat( 3, 3 ) =  tmat( 3, 3 )
+     >           - tmp * tmat( 2, 3 )
+            tmat( 3, 4 ) =  tmat( 3, 4 )
+     >           - tmp * tmat( 2, 4 )
+            tmat( 3, 5 ) =  tmat( 3, 5 )
+     >           - tmp * tmat( 2, 5 )
+            v( 3, i, j, k ) = v( 3, i, j, k )
+     >        - v( 2, i, j, k ) * tmp
+
+            tmp = tmp1 * tmat( 4, 2 )
+            tmat( 4, 3 ) =  tmat( 4, 3 )
+     >           - tmp * tmat( 2, 3 )
+            tmat( 4, 4 ) =  tmat( 4, 4 )
+     >           - tmp * tmat( 2, 4 )
+            tmat( 4, 5 ) =  tmat( 4, 5 )
+     >           - tmp * tmat( 2, 5 )
+            v( 4, i, j, k ) = v( 4, i, j, k )
+     >        - v( 2, i, j, k ) * tmp
+
+            tmp = tmp1 * tmat( 5, 2 )
+            tmat( 5, 3 ) =  tmat( 5, 3 )
+     >           - tmp * tmat( 2, 3 )
+            tmat( 5, 4 ) =  tmat( 5, 4 )
+     >           - tmp * tmat( 2, 4 )
+            tmat( 5, 5 ) =  tmat( 5, 5 )
+     >           - tmp * tmat( 2, 5 )
+            v( 5, i, j, k ) = v( 5, i, j, k )
+     >        - v( 2, i, j, k ) * tmp
+
+
+
+            tmp1 = 1.0d+00 / tmat( 3, 3 )
+            tmp = tmp1 * tmat( 4, 3 )
+            tmat( 4, 4 ) =  tmat( 4, 4 )
+     >           - tmp * tmat( 3, 4 )
+            tmat( 4, 5 ) =  tmat( 4, 5 )
+     >           - tmp * tmat( 3, 5 )
+            v( 4, i, j, k ) = v( 4, i, j, k )
+     >        - v( 3, i, j, k ) * tmp
+
+            tmp = tmp1 * tmat( 5, 3 )
+            tmat( 5, 4 ) =  tmat( 5, 4 )
+     >           - tmp * tmat( 3, 4 )
+            tmat( 5, 5 ) =  tmat( 5, 5 )
+     >           - tmp * tmat( 3, 5 )
+            v( 5, i, j, k ) = v( 5, i, j, k )
+     >        - v( 3, i, j, k ) * tmp
+
+
+
+            tmp1 = 1.0d+00 / tmat( 4, 4 )
+            tmp = tmp1 * tmat( 5, 4 )
+            tmat( 5, 5 ) =  tmat( 5, 5 )
+     >           - tmp * tmat( 4, 5 )
+            v( 5, i, j, k ) = v( 5, i, j, k )
+     >        - v( 4, i, j, k ) * tmp
+
+c---------------------------------------------------------------------
+c   back substitution
+c---------------------------------------------------------------------
+            v( 5, i, j, k ) = v( 5, i, j, k )
+     >                      / tmat( 5, 5 )
+
+            v( 4, i, j, k ) = v( 4, i, j, k )
+     >           - tmat( 4, 5 ) * v( 5, i, j, k )
+            v( 4, i, j, k ) = v( 4, i, j, k )
+     >                      / tmat( 4, 4 )
+
+            v( 3, i, j, k ) = v( 3, i, j, k )
+     >           - tmat( 3, 4 ) * v( 4, i, j, k )
+     >           - tmat( 3, 5 ) * v( 5, i, j, k )
+            v( 3, i, j, k ) = v( 3, i, j, k )
+     >                      / tmat( 3, 3 )
+
+            v( 2, i, j, k ) = v( 2, i, j, k )
+     >           - tmat( 2, 3 ) * v( 3, i, j, k )
+     >           - tmat( 2, 4 ) * v( 4, i, j, k )
+     >           - tmat( 2, 5 ) * v( 5, i, j, k )
+            v( 2, i, j, k ) = v( 2, i, j, k )
+     >                      / tmat( 2, 2 )
+
+            v( 1, i, j, k ) = v( 1, i, j, k )
+     >           - tmat( 1, 2 ) * v( 2, i, j, k )
+     >           - tmat( 1, 3 ) * v( 3, i, j, k )
+     >           - tmat( 1, 4 ) * v( 4, i, j, k )
+     >           - tmat( 1, 5 ) * v( 5, i, j, k )
+            v( 1, i, j, k ) = v( 1, i, j, k )
+     >                      / tmat( 1, 1 )
+
+
+        enddo
+      enddo
+
+c---------------------------------------------------------------------
+c   send data to east and south
+c---------------------------------------------------------------------
+      iex = 2
+      call exchange_1( v,k,iex )
+
+      return
+      end
+
+
diff --git a/examples/smpi/NAS/LU/buts.f b/examples/smpi/NAS/LU/buts.f
new file mode 100644
index 0000000000..a6fc3d6217
--- /dev/null
+++ b/examples/smpi/NAS/LU/buts.f
@@ -0,0 +1,259 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine buts( ldmx, ldmy, ldmz,
+     >                 nx, ny, nz, k,
+     >                 omega,
+     >                 v, tv,
+     >                 d, udx, udy, udz,
+     >                 ist, iend, jst, jend,
+     >                 nx0, ny0, ipt, jpt )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c   compute the regular-sparse, block upper triangular solution:
+c
+c                     v <-- ( U-inv ) * v
+c
+c---------------------------------------------------------------------
+
+      implicit none
+
+c---------------------------------------------------------------------
+c  input parameters
+c---------------------------------------------------------------------
+      integer ldmx, ldmy, ldmz
+      integer nx, ny, nz
+      integer k
+      double precision  omega
+      double precision  v( 5, -1:ldmx+2, -1:ldmy+2, *), 
+     >        tv(5, ldmx, ldmy),
+     >        d( 5, 5, ldmx, ldmy),
+     >        udx( 5, 5, ldmx, ldmy),
+     >        udy( 5, 5, ldmx, ldmy),
+     >        udz( 5, 5, ldmx, ldmy )
+      integer ist, iend
+      integer jst, jend
+      integer nx0, ny0
+      integer ipt, jpt
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer i, j, m
+      integer iex
+      double precision  tmp, tmp1
+      double precision  tmat(5,5)
+
+
+c---------------------------------------------------------------------
+c   receive data from south and east
+c---------------------------------------------------------------------
+      iex = 1
+      call exchange_1( v,k,iex )
+
+      do j = jend, jst, -1
+         do i = iend, ist, -1
+            do m = 1, 5
+                  tv( m, i, j ) = 
+     >      omega * (  udz( m, 1, i, j ) * v( 1, i, j, k+1 )
+     >               + udz( m, 2, i, j ) * v( 2, i, j, k+1 )
+     >               + udz( m, 3, i, j ) * v( 3, i, j, k+1 )
+     >               + udz( m, 4, i, j ) * v( 4, i, j, k+1 )
+     >               + udz( m, 5, i, j ) * v( 5, i, j, k+1 ) )
+            end do
+         end do
+      end do
+
+
+      do j = jend,jst,-1
+        do i = iend,ist,-1
+
+            do m = 1, 5
+                  tv( m, i, j ) = tv( m, i, j )
+     > + omega * ( udy( m, 1, i, j ) * v( 1, i, j+1, k )
+     >           + udx( m, 1, i, j ) * v( 1, i+1, j, k )
+     >           + udy( m, 2, i, j ) * v( 2, i, j+1, k )
+     >           + udx( m, 2, i, j ) * v( 2, i+1, j, k )
+     >           + udy( m, 3, i, j ) * v( 3, i, j+1, k )
+     >           + udx( m, 3, i, j ) * v( 3, i+1, j, k )
+     >           + udy( m, 4, i, j ) * v( 4, i, j+1, k )
+     >           + udx( m, 4, i, j ) * v( 4, i+1, j, k )
+     >           + udy( m, 5, i, j ) * v( 5, i, j+1, k )
+     >           + udx( m, 5, i, j ) * v( 5, i+1, j, k ) )
+            end do
+
+c---------------------------------------------------------------------
+c   diagonal block inversion
+c---------------------------------------------------------------------
+            do m = 1, 5
+               tmat( m, 1 ) = d( m, 1, i, j )
+               tmat( m, 2 ) = d( m, 2, i, j )
+               tmat( m, 3 ) = d( m, 3, i, j )
+               tmat( m, 4 ) = d( m, 4, i, j )
+               tmat( m, 5 ) = d( m, 5, i, j )
+            end do
+
+            tmp1 = 1.0d+00 / tmat( 1, 1 )
+            tmp = tmp1 * tmat( 2, 1 )
+            tmat( 2, 2 ) =  tmat( 2, 2 )
+     >           - tmp * tmat( 1, 2 )
+            tmat( 2, 3 ) =  tmat( 2, 3 )
+     >           - tmp * tmat( 1, 3 )
+            tmat( 2, 4 ) =  tmat( 2, 4 )
+     >           - tmp * tmat( 1, 4 )
+            tmat( 2, 5 ) =  tmat( 2, 5 )
+     >           - tmp * tmat( 1, 5 )
+            tv( 2, i, j ) = tv( 2, i, j )
+     >        - tv( 1, i, j ) * tmp
+
+            tmp = tmp1 * tmat( 3, 1 )
+            tmat( 3, 2 ) =  tmat( 3, 2 )
+     >           - tmp * tmat( 1, 2 )
+            tmat( 3, 3 ) =  tmat( 3, 3 )
+     >           - tmp * tmat( 1, 3 )
+            tmat( 3, 4 ) =  tmat( 3, 4 )
+     >           - tmp * tmat( 1, 4 )
+            tmat( 3, 5 ) =  tmat( 3, 5 )
+     >           - tmp * tmat( 1, 5 )
+            tv( 3, i, j ) = tv( 3, i, j )
+     >        - tv( 1, i, j ) * tmp
+
+            tmp = tmp1 * tmat( 4, 1 )
+            tmat( 4, 2 ) =  tmat( 4, 2 )
+     >           - tmp * tmat( 1, 2 )
+            tmat( 4, 3 ) =  tmat( 4, 3 )
+     >           - tmp * tmat( 1, 3 )
+            tmat( 4, 4 ) =  tmat( 4, 4 )
+     >           - tmp * tmat( 1, 4 )
+            tmat( 4, 5 ) =  tmat( 4, 5 )
+     >           - tmp * tmat( 1, 5 )
+            tv( 4, i, j ) = tv( 4, i, j )
+     >        - tv( 1, i, j ) * tmp
+
+            tmp = tmp1 * tmat( 5, 1 )
+            tmat( 5, 2 ) =  tmat( 5, 2 )
+     >           - tmp * tmat( 1, 2 )
+            tmat( 5, 3 ) =  tmat( 5, 3 )
+     >           - tmp * tmat( 1, 3 )
+            tmat( 5, 4 ) =  tmat( 5, 4 )
+     >           - tmp * tmat( 1, 4 )
+            tmat( 5, 5 ) =  tmat( 5, 5 )
+     >           - tmp * tmat( 1, 5 )
+            tv( 5, i, j ) = tv( 5, i, j )
+     >        - tv( 1, i, j ) * tmp
+
+
+
+            tmp1 = 1.0d+00 / tmat( 2, 2 )
+            tmp = tmp1 * tmat( 3, 2 )
+            tmat( 3, 3 ) =  tmat( 3, 3 )
+     >           - tmp * tmat( 2, 3 )
+            tmat( 3, 4 ) =  tmat( 3, 4 )
+     >           - tmp * tmat( 2, 4 )
+            tmat( 3, 5 ) =  tmat( 3, 5 )
+     >           - tmp * tmat( 2, 5 )
+            tv( 3, i, j ) = tv( 3, i, j )
+     >        - tv( 2, i, j ) * tmp
+
+            tmp = tmp1 * tmat( 4, 2 )
+            tmat( 4, 3 ) =  tmat( 4, 3 )
+     >           - tmp * tmat( 2, 3 )
+            tmat( 4, 4 ) =  tmat( 4, 4 )
+     >           - tmp * tmat( 2, 4 )
+            tmat( 4, 5 ) =  tmat( 4, 5 )
+     >           - tmp * tmat( 2, 5 )
+            tv( 4, i, j ) = tv( 4, i, j )
+     >        - tv( 2, i, j ) * tmp
+
+            tmp = tmp1 * tmat( 5, 2 )
+            tmat( 5, 3 ) =  tmat( 5, 3 )
+     >           - tmp * tmat( 2, 3 )
+            tmat( 5, 4 ) =  tmat( 5, 4 )
+     >           - tmp * tmat( 2, 4 )
+            tmat( 5, 5 ) =  tmat( 5, 5 )
+     >           - tmp * tmat( 2, 5 )
+            tv( 5, i, j ) = tv( 5, i, j )
+     >        - tv( 2, i, j ) * tmp
+
+
+
+            tmp1 = 1.0d+00 / tmat( 3, 3 )
+            tmp = tmp1 * tmat( 4, 3 )
+            tmat( 4, 4 ) =  tmat( 4, 4 )
+     >           - tmp * tmat( 3, 4 )
+            tmat( 4, 5 ) =  tmat( 4, 5 )
+     >           - tmp * tmat( 3, 5 )
+            tv( 4, i, j ) = tv( 4, i, j )
+     >        - tv( 3, i, j ) * tmp
+
+            tmp = tmp1 * tmat( 5, 3 )
+            tmat( 5, 4 ) =  tmat( 5, 4 )
+     >           - tmp * tmat( 3, 4 )
+            tmat( 5, 5 ) =  tmat( 5, 5 )
+     >           - tmp * tmat( 3, 5 )
+            tv( 5, i, j ) = tv( 5, i, j )
+     >        - tv( 3, i, j ) * tmp
+
+
+
+            tmp1 = 1.0d+00 / tmat( 4, 4 )
+            tmp = tmp1 * tmat( 5, 4 )
+            tmat( 5, 5 ) =  tmat( 5, 5 )
+     >           - tmp * tmat( 4, 5 )
+            tv( 5, i, j ) = tv( 5, i, j )
+     >        - tv( 4, i, j ) * tmp
+
+c---------------------------------------------------------------------
+c   back substitution
+c---------------------------------------------------------------------
+            tv( 5, i, j ) = tv( 5, i, j )
+     >                      / tmat( 5, 5 )
+
+            tv( 4, i, j ) = tv( 4, i, j )
+     >           - tmat( 4, 5 ) * tv( 5, i, j )
+            tv( 4, i, j ) = tv( 4, i, j )
+     >                      / tmat( 4, 4 )
+
+            tv( 3, i, j ) = tv( 3, i, j )
+     >           - tmat( 3, 4 ) * tv( 4, i, j )
+     >           - tmat( 3, 5 ) * tv( 5, i, j )
+            tv( 3, i, j ) = tv( 3, i, j )
+     >                      / tmat( 3, 3 )
+
+            tv( 2, i, j ) = tv( 2, i, j )
+     >           - tmat( 2, 3 ) * tv( 3, i, j )
+     >           - tmat( 2, 4 ) * tv( 4, i, j )
+     >           - tmat( 2, 5 ) * tv( 5, i, j )
+            tv( 2, i, j ) = tv( 2, i, j )
+     >                      / tmat( 2, 2 )
+
+            tv( 1, i, j ) = tv( 1, i, j )
+     >           - tmat( 1, 2 ) * tv( 2, i, j )
+     >           - tmat( 1, 3 ) * tv( 3, i, j )
+     >           - tmat( 1, 4 ) * tv( 4, i, j )
+     >           - tmat( 1, 5 ) * tv( 5, i, j )
+            tv( 1, i, j ) = tv( 1, i, j )
+     >                      / tmat( 1, 1 )
+
+            v( 1, i, j, k ) = v( 1, i, j, k ) - tv( 1, i, j )
+            v( 2, i, j, k ) = v( 2, i, j, k ) - tv( 2, i, j )
+            v( 3, i, j, k ) = v( 3, i, j, k ) - tv( 3, i, j )
+            v( 4, i, j, k ) = v( 4, i, j, k ) - tv( 4, i, j )
+            v( 5, i, j, k ) = v( 5, i, j, k ) - tv( 5, i, j )
+
+
+        enddo
+      end do
+
+c---------------------------------------------------------------------
+c   send data to north and west
+c---------------------------------------------------------------------
+      iex = 3
+      call exchange_1( v,k,iex )
+ 
+      return
+      end
diff --git a/examples/smpi/NAS/LU/buts_vec.f b/examples/smpi/NAS/LU/buts_vec.f
new file mode 100644
index 0000000000..813105d270
--- /dev/null
+++ b/examples/smpi/NAS/LU/buts_vec.f
@@ -0,0 +1,332 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine buts( ldmx, ldmy, ldmz,
+     >                 nx, ny, nz, k,
+     >                 omega,
+     >                 v, tv,
+     >                 d, udx, udy, udz,
+     >                 ist, iend, jst, jend,
+     >                 nx0, ny0, ipt, jpt )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c   compute the regular-sparse, block upper triangular solution:
+c
+c                     v <-- ( U-inv ) * v
+c
+c---------------------------------------------------------------------
+
+      implicit none
+
+c---------------------------------------------------------------------
+c  input parameters
+c---------------------------------------------------------------------
+      integer ldmx, ldmy, ldmz
+      integer nx, ny, nz
+      integer k
+      double precision  omega
+      double precision  v( 5, -1:ldmx+2, -1:ldmy+2, *), 
+     >        tv(5, ldmx, ldmy),
+     >        d( 5, 5, ldmx, ldmy),
+     >        udx( 5, 5, ldmx, ldmy),
+     >        udy( 5, 5, ldmx, ldmy),
+     >        udz( 5, 5, ldmx, ldmy )
+      integer ist, iend
+      integer jst, jend
+      integer nx0, ny0
+      integer ipt, jpt
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer i, j, m, l, istp, iendp
+      integer iex
+      double precision  tmp, tmp1
+      double precision  tmat(5,5)
+
+
+c---------------------------------------------------------------------
+c   receive data from south and east
+c---------------------------------------------------------------------
+      iex = 1
+      call exchange_1( v,k,iex )
+
+      do j = jend, jst, -1
+         do i = iend, ist, -1
+            do m = 1, 5
+                  tv( m, i, j ) = 
+     >      omega * (  udz( m, 1, i, j ) * v( 1, i, j, k+1 )
+     >               + udz( m, 2, i, j ) * v( 2, i, j, k+1 )
+     >               + udz( m, 3, i, j ) * v( 3, i, j, k+1 )
+     >               + udz( m, 4, i, j ) * v( 4, i, j, k+1 )
+     >               + udz( m, 5, i, j ) * v( 5, i, j, k+1 ) )
+            end do
+         end do
+      end do
+
+
+      do l = iend+jend, ist+jst, -1
+         istp  = max(l - jend, ist)
+         iendp = min(l - jst, iend)
+
+!dir$ ivdep
+         do i = istp, iendp
+            j = l - i
+
+!!dir$ unroll 5
+!   manually unroll the loop
+!            do m = 1, 5
+                  tv( 1, i, j ) = tv( 1, i, j )
+     > + omega * ( udy( 1, 1, i, j ) * v( 1, i, j+1, k )
+     >           + udx( 1, 1, i, j ) * v( 1, i+1, j, k )
+     >           + udy( 1, 2, i, j ) * v( 2, i, j+1, k )
+     >           + udx( 1, 2, i, j ) * v( 2, i+1, j, k )
+     >           + udy( 1, 3, i, j ) * v( 3, i, j+1, k )
+     >           + udx( 1, 3, i, j ) * v( 3, i+1, j, k )
+     >           + udy( 1, 4, i, j ) * v( 4, i, j+1, k )
+     >           + udx( 1, 4, i, j ) * v( 4, i+1, j, k )
+     >           + udy( 1, 5, i, j ) * v( 5, i, j+1, k )
+     >           + udx( 1, 5, i, j ) * v( 5, i+1, j, k ) )
+                  tv( 2, i, j ) = tv( 2, i, j )
+     > + omega * ( udy( 2, 1, i, j ) * v( 1, i, j+1, k )
+     >           + udx( 2, 1, i, j ) * v( 1, i+1, j, k )
+     >           + udy( 2, 2, i, j ) * v( 2, i, j+1, k )
+     >           + udx( 2, 2, i, j ) * v( 2, i+1, j, k )
+     >           + udy( 2, 3, i, j ) * v( 3, i, j+1, k )
+     >           + udx( 2, 3, i, j ) * v( 3, i+1, j, k )
+     >           + udy( 2, 4, i, j ) * v( 4, i, j+1, k )
+     >           + udx( 2, 4, i, j ) * v( 4, i+1, j, k )
+     >           + udy( 2, 5, i, j ) * v( 5, i, j+1, k )
+     >           + udx( 2, 5, i, j ) * v( 5, i+1, j, k ) )
+                  tv( 3, i, j ) = tv( 3, i, j )
+     > + omega * ( udy( 3, 1, i, j ) * v( 1, i, j+1, k )
+     >           + udx( 3, 1, i, j ) * v( 1, i+1, j, k )
+     >           + udy( 3, 2, i, j ) * v( 2, i, j+1, k )
+     >           + udx( 3, 2, i, j ) * v( 2, i+1, j, k )
+     >           + udy( 3, 3, i, j ) * v( 3, i, j+1, k )
+     >           + udx( 3, 3, i, j ) * v( 3, i+1, j, k )
+     >           + udy( 3, 4, i, j ) * v( 4, i, j+1, k )
+     >           + udx( 3, 4, i, j ) * v( 4, i+1, j, k )
+     >           + udy( 3, 5, i, j ) * v( 5, i, j+1, k )
+     >           + udx( 3, 5, i, j ) * v( 5, i+1, j, k ) )
+                  tv( 4, i, j ) = tv( 4, i, j )
+     > + omega * ( udy( 4, 1, i, j ) * v( 1, i, j+1, k )
+     >           + udx( 4, 1, i, j ) * v( 1, i+1, j, k )
+     >           + udy( 4, 2, i, j ) * v( 2, i, j+1, k )
+     >           + udx( 4, 2, i, j ) * v( 2, i+1, j, k )
+     >           + udy( 4, 3, i, j ) * v( 3, i, j+1, k )
+     >           + udx( 4, 3, i, j ) * v( 3, i+1, j, k )
+     >           + udy( 4, 4, i, j ) * v( 4, i, j+1, k )
+     >           + udx( 4, 4, i, j ) * v( 4, i+1, j, k )
+     >           + udy( 4, 5, i, j ) * v( 5, i, j+1, k )
+     >           + udx( 4, 5, i, j ) * v( 5, i+1, j, k ) )
+                  tv( 5, i, j ) = tv( 5, i, j )
+     > + omega * ( udy( 5, 1, i, j ) * v( 1, i, j+1, k )
+     >           + udx( 5, 1, i, j ) * v( 1, i+1, j, k )
+     >           + udy( 5, 2, i, j ) * v( 2, i, j+1, k )
+     >           + udx( 5, 2, i, j ) * v( 2, i+1, j, k )
+     >           + udy( 5, 3, i, j ) * v( 3, i, j+1, k )
+     >           + udx( 5, 3, i, j ) * v( 3, i+1, j, k )
+     >           + udy( 5, 4, i, j ) * v( 4, i, j+1, k )
+     >           + udx( 5, 4, i, j ) * v( 4, i+1, j, k )
+     >           + udy( 5, 5, i, j ) * v( 5, i, j+1, k )
+     >           + udx( 5, 5, i, j ) * v( 5, i+1, j, k ) )
+!            end do
+
+c---------------------------------------------------------------------
+c   diagonal block inversion
+c---------------------------------------------------------------------
+!!dir$ unroll 5
+!   manually unroll the loop
+!            do m = 1, 5
+               tmat( 1, 1 ) = d( 1, 1, i, j )
+               tmat( 1, 2 ) = d( 1, 2, i, j )
+               tmat( 1, 3 ) = d( 1, 3, i, j )
+               tmat( 1, 4 ) = d( 1, 4, i, j )
+               tmat( 1, 5 ) = d( 1, 5, i, j )
+               tmat( 2, 1 ) = d( 2, 1, i, j )
+               tmat( 2, 2 ) = d( 2, 2, i, j )
+               tmat( 2, 3 ) = d( 2, 3, i, j )
+               tmat( 2, 4 ) = d( 2, 4, i, j )
+               tmat( 2, 5 ) = d( 2, 5, i, j )
+               tmat( 3, 1 ) = d( 3, 1, i, j )
+               tmat( 3, 2 ) = d( 3, 2, i, j )
+               tmat( 3, 3 ) = d( 3, 3, i, j )
+               tmat( 3, 4 ) = d( 3, 4, i, j )
+               tmat( 3, 5 ) = d( 3, 5, i, j )
+               tmat( 4, 1 ) = d( 4, 1, i, j )
+               tmat( 4, 2 ) = d( 4, 2, i, j )
+               tmat( 4, 3 ) = d( 4, 3, i, j )
+               tmat( 4, 4 ) = d( 4, 4, i, j )
+               tmat( 4, 5 ) = d( 4, 5, i, j )
+               tmat( 5, 1 ) = d( 5, 1, i, j )
+               tmat( 5, 2 ) = d( 5, 2, i, j )
+               tmat( 5, 3 ) = d( 5, 3, i, j )
+               tmat( 5, 4 ) = d( 5, 4, i, j )
+               tmat( 5, 5 ) = d( 5, 5, i, j )
+!            end do
+
+            tmp1 = 1.0d+00 / tmat( 1, 1 )
+            tmp = tmp1 * tmat( 2, 1 )
+            tmat( 2, 2 ) =  tmat( 2, 2 )
+     >           - tmp * tmat( 1, 2 )
+            tmat( 2, 3 ) =  tmat( 2, 3 )
+     >           - tmp * tmat( 1, 3 )
+            tmat( 2, 4 ) =  tmat( 2, 4 )
+     >           - tmp * tmat( 1, 4 )
+            tmat( 2, 5 ) =  tmat( 2, 5 )
+     >           - tmp * tmat( 1, 5 )
+            tv( 2, i, j ) = tv( 2, i, j )
+     >        - tv( 1, i, j ) * tmp
+
+            tmp = tmp1 * tmat( 3, 1 )
+            tmat( 3, 2 ) =  tmat( 3, 2 )
+     >           - tmp * tmat( 1, 2 )
+            tmat( 3, 3 ) =  tmat( 3, 3 )
+     >           - tmp * tmat( 1, 3 )
+            tmat( 3, 4 ) =  tmat( 3, 4 )
+     >           - tmp * tmat( 1, 4 )
+            tmat( 3, 5 ) =  tmat( 3, 5 )
+     >           - tmp * tmat( 1, 5 )
+            tv( 3, i, j ) = tv( 3, i, j )
+     >        - tv( 1, i, j ) * tmp
+
+            tmp = tmp1 * tmat( 4, 1 )
+            tmat( 4, 2 ) =  tmat( 4, 2 )
+     >           - tmp * tmat( 1, 2 )
+            tmat( 4, 3 ) =  tmat( 4, 3 )
+     >           - tmp * tmat( 1, 3 )
+            tmat( 4, 4 ) =  tmat( 4, 4 )
+     >           - tmp * tmat( 1, 4 )
+            tmat( 4, 5 ) =  tmat( 4, 5 )
+     >           - tmp * tmat( 1, 5 )
+            tv( 4, i, j ) = tv( 4, i, j )
+     >        - tv( 1, i, j ) * tmp
+
+            tmp = tmp1 * tmat( 5, 1 )
+            tmat( 5, 2 ) =  tmat( 5, 2 )
+     >           - tmp * tmat( 1, 2 )
+            tmat( 5, 3 ) =  tmat( 5, 3 )
+     >           - tmp * tmat( 1, 3 )
+            tmat( 5, 4 ) =  tmat( 5, 4 )
+     >           - tmp * tmat( 1, 4 )
+            tmat( 5, 5 ) =  tmat( 5, 5 )
+     >           - tmp * tmat( 1, 5 )
+            tv( 5, i, j ) = tv( 5, i, j )
+     >        - tv( 1, i, j ) * tmp
+
+
+
+            tmp1 = 1.0d+00 / tmat( 2, 2 )
+            tmp = tmp1 * tmat( 3, 2 )
+            tmat( 3, 3 ) =  tmat( 3, 3 )
+     >           - tmp * tmat( 2, 3 )
+            tmat( 3, 4 ) =  tmat( 3, 4 )
+     >           - tmp * tmat( 2, 4 )
+            tmat( 3, 5 ) =  tmat( 3, 5 )
+     >           - tmp * tmat( 2, 5 )
+            tv( 3, i, j ) = tv( 3, i, j )
+     >        - tv( 2, i, j ) * tmp
+
+            tmp = tmp1 * tmat( 4, 2 )
+            tmat( 4, 3 ) =  tmat( 4, 3 )
+     >           - tmp * tmat( 2, 3 )
+            tmat( 4, 4 ) =  tmat( 4, 4 )
+     >           - tmp * tmat( 2, 4 )
+            tmat( 4, 5 ) =  tmat( 4, 5 )
+     >           - tmp * tmat( 2, 5 )
+            tv( 4, i, j ) = tv( 4, i, j )
+     >        - tv( 2, i, j ) * tmp
+
+            tmp = tmp1 * tmat( 5, 2 )
+            tmat( 5, 3 ) =  tmat( 5, 3 )
+     >           - tmp * tmat( 2, 3 )
+            tmat( 5, 4 ) =  tmat( 5, 4 )
+     >           - tmp * tmat( 2, 4 )
+            tmat( 5, 5 ) =  tmat( 5, 5 )
+     >           - tmp * tmat( 2, 5 )
+            tv( 5, i, j ) = tv( 5, i, j )
+     >        - tv( 2, i, j ) * tmp
+
+
+
+            tmp1 = 1.0d+00 / tmat( 3, 3 )
+            tmp = tmp1 * tmat( 4, 3 )
+            tmat( 4, 4 ) =  tmat( 4, 4 )
+     >           - tmp * tmat( 3, 4 )
+            tmat( 4, 5 ) =  tmat( 4, 5 )
+     >           - tmp * tmat( 3, 5 )
+            tv( 4, i, j ) = tv( 4, i, j )
+     >        - tv( 3, i, j ) * tmp
+
+            tmp = tmp1 * tmat( 5, 3 )
+            tmat( 5, 4 ) =  tmat( 5, 4 )
+     >           - tmp * tmat( 3, 4 )
+            tmat( 5, 5 ) =  tmat( 5, 5 )
+     >           - tmp * tmat( 3, 5 )
+            tv( 5, i, j ) = tv( 5, i, j )
+     >        - tv( 3, i, j ) * tmp
+
+
+
+            tmp1 = 1.0d+00 / tmat( 4, 4 )
+            tmp = tmp1 * tmat( 5, 4 )
+            tmat( 5, 5 ) =  tmat( 5, 5 )
+     >           - tmp * tmat( 4, 5 )
+            tv( 5, i, j ) = tv( 5, i, j )
+     >        - tv( 4, i, j ) * tmp
+
+c---------------------------------------------------------------------
+c   back substitution
+c---------------------------------------------------------------------
+            tv( 5, i, j ) = tv( 5, i, j )
+     >                      / tmat( 5, 5 )
+
+            tv( 4, i, j ) = tv( 4, i, j )
+     >           - tmat( 4, 5 ) * tv( 5, i, j )
+            tv( 4, i, j ) = tv( 4, i, j )
+     >                      / tmat( 4, 4 )
+
+            tv( 3, i, j ) = tv( 3, i, j )
+     >           - tmat( 3, 4 ) * tv( 4, i, j )
+     >           - tmat( 3, 5 ) * tv( 5, i, j )
+            tv( 3, i, j ) = tv( 3, i, j )
+     >                      / tmat( 3, 3 )
+
+            tv( 2, i, j ) = tv( 2, i, j )
+     >           - tmat( 2, 3 ) * tv( 3, i, j )
+     >           - tmat( 2, 4 ) * tv( 4, i, j )
+     >           - tmat( 2, 5 ) * tv( 5, i, j )
+            tv( 2, i, j ) = tv( 2, i, j )
+     >                      / tmat( 2, 2 )
+
+            tv( 1, i, j ) = tv( 1, i, j )
+     >           - tmat( 1, 2 ) * tv( 2, i, j )
+     >           - tmat( 1, 3 ) * tv( 3, i, j )
+     >           - tmat( 1, 4 ) * tv( 4, i, j )
+     >           - tmat( 1, 5 ) * tv( 5, i, j )
+            tv( 1, i, j ) = tv( 1, i, j )
+     >                      / tmat( 1, 1 )
+
+            v( 1, i, j, k ) = v( 1, i, j, k ) - tv( 1, i, j )
+            v( 2, i, j, k ) = v( 2, i, j, k ) - tv( 2, i, j )
+            v( 3, i, j, k ) = v( 3, i, j, k ) - tv( 3, i, j )
+            v( 4, i, j, k ) = v( 4, i, j, k ) - tv( 4, i, j )
+            v( 5, i, j, k ) = v( 5, i, j, k ) - tv( 5, i, j )
+
+
+        enddo
+      end do
+
+c---------------------------------------------------------------------
+c   send data to north and west
+c---------------------------------------------------------------------
+      iex = 3
+      call exchange_1( v,k,iex )
+ 
+      return
+      end
diff --git a/examples/smpi/NAS/LU/erhs.f b/examples/smpi/NAS/LU/erhs.f
new file mode 100644
index 0000000000..928e2a9f50
--- /dev/null
+++ b/examples/smpi/NAS/LU/erhs.f
@@ -0,0 +1,536 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine erhs
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c   compute the right hand side based on exact solution
+c
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer i, j, k, m
+      integer iglob, jglob
+      integer iex
+      integer L1, L2
+      integer ist1, iend1
+      integer jst1, jend1
+      double precision  dsspm
+      double precision  xi, eta, zeta
+      double precision  q
+      double precision  u21, u31, u41
+      double precision  tmp
+      double precision  u21i, u31i, u41i, u51i
+      double precision  u21j, u31j, u41j, u51j
+      double precision  u21k, u31k, u41k, u51k
+      double precision  u21im1, u31im1, u41im1, u51im1
+      double precision  u21jm1, u31jm1, u41jm1, u51jm1
+      double precision  u21km1, u31km1, u41km1, u51km1
+
+      dsspm = dssp
+
+
+      do k = 1, nz
+         do j = 1, ny
+            do i = 1, nx
+               do m = 1, 5
+                  frct( m, i, j, k ) = 0.0d+00
+               end do
+            end do
+         end do
+      end do
+
+      do k = 1, nz
+         zeta = ( dble(k-1) ) / ( nz - 1 )
+         do j = 1, ny
+            jglob = jpt + j
+            eta = ( dble(jglob-1) ) / ( ny0 - 1 )
+            do i = 1, nx
+               iglob = ipt + i
+               xi = ( dble(iglob-1) ) / ( nx0 - 1 )
+               do m = 1, 5
+                  rsd(m,i,j,k) =  ce(m,1)
+     >                 + ce(m,2) * xi
+     >                 + ce(m,3) * eta
+     >                 + ce(m,4) * zeta
+     >                 + ce(m,5) * xi * xi
+     >                 + ce(m,6) * eta * eta
+     >                 + ce(m,7) * zeta * zeta
+     >                 + ce(m,8) * xi * xi * xi
+     >                 + ce(m,9) * eta * eta * eta
+     >                 + ce(m,10) * zeta * zeta * zeta
+     >                 + ce(m,11) * xi * xi * xi * xi
+     >                 + ce(m,12) * eta * eta * eta * eta
+     >                 + ce(m,13) * zeta * zeta * zeta * zeta
+               end do
+            end do
+         end do
+      end do
+
+c---------------------------------------------------------------------
+c   xi-direction flux differences
+c---------------------------------------------------------------------
+c
+c   iex = flag : iex = 0  north/south communication
+c              : iex = 1  east/west communication
+c
+c---------------------------------------------------------------------
+      iex   = 0
+
+c---------------------------------------------------------------------
+c   communicate and receive/send two rows of data
+c---------------------------------------------------------------------
+      call exchange_3 (rsd,iex)
+
+      L1 = 0
+      if (north.eq.-1) L1 = 1
+      L2 = nx + 1
+      if (south.eq.-1) L2 = nx
+
+      ist1 = 1
+      iend1 = nx
+      if (north.eq.-1) ist1 = 4
+      if (south.eq.-1) iend1 = nx - 3
+
+      do k = 2, nz - 1
+         do j = jst, jend
+            do i = L1, L2
+               flux(1,i,j,k) = rsd(2,i,j,k)
+               u21 = rsd(2,i,j,k) / rsd(1,i,j,k)
+               q = 0.50d+00 * (  rsd(2,i,j,k) * rsd(2,i,j,k)
+     >                         + rsd(3,i,j,k) * rsd(3,i,j,k)
+     >                         + rsd(4,i,j,k) * rsd(4,i,j,k) )
+     >                      / rsd(1,i,j,k)
+               flux(2,i,j,k) = rsd(2,i,j,k) * u21 + c2 * 
+     >                         ( rsd(5,i,j,k) - q )
+               flux(3,i,j,k) = rsd(3,i,j,k) * u21
+               flux(4,i,j,k) = rsd(4,i,j,k) * u21
+               flux(5,i,j,k) = ( c1 * rsd(5,i,j,k) - c2 * q ) * u21
+            end do
+         end do
+      end do 
+
+      do k = 2, nz - 1
+         do j = jst, jend
+            do i = ist, iend
+               do m = 1, 5
+                  frct(m,i,j,k) =  frct(m,i,j,k)
+     >                   - tx2 * ( flux(m,i+1,j,k) - flux(m,i-1,j,k) )
+               end do
+            end do
+            do i = ist, L2
+               tmp = 1.0d+00 / rsd(1,i,j,k)
+
+               u21i = tmp * rsd(2,i,j,k)
+               u31i = tmp * rsd(3,i,j,k)
+               u41i = tmp * rsd(4,i,j,k)
+               u51i = tmp * rsd(5,i,j,k)
+
+               tmp = 1.0d+00 / rsd(1,i-1,j,k)
+
+               u21im1 = tmp * rsd(2,i-1,j,k)
+               u31im1 = tmp * rsd(3,i-1,j,k)
+               u41im1 = tmp * rsd(4,i-1,j,k)
+               u51im1 = tmp * rsd(5,i-1,j,k)
+
+               flux(2,i,j,k) = (4.0d+00/3.0d+00) * tx3 * 
+     >                        ( u21i - u21im1 )
+               flux(3,i,j,k) = tx3 * ( u31i - u31im1 )
+               flux(4,i,j,k) = tx3 * ( u41i - u41im1 )
+               flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
+     >              * tx3 * ( ( u21i  **2 + u31i  **2 + u41i  **2 )
+     >                      - ( u21im1**2 + u31im1**2 + u41im1**2 ) )
+     >              + (1.0d+00/6.0d+00)
+     >              * tx3 * ( u21i**2 - u21im1**2 )
+     >              + c1 * c5 * tx3 * ( u51i - u51im1 )
+            end do
+
+            do i = ist, iend
+               frct(1,i,j,k) = frct(1,i,j,k)
+     >              + dx1 * tx1 * (            rsd(1,i-1,j,k)
+     >                             - 2.0d+00 * rsd(1,i,j,k)
+     >                             +           rsd(1,i+1,j,k) )
+               frct(2,i,j,k) = frct(2,i,j,k)
+     >           + tx3 * c3 * c4 * ( flux(2,i+1,j,k) - flux(2,i,j,k) )
+     >              + dx2 * tx1 * (            rsd(2,i-1,j,k)
+     >                             - 2.0d+00 * rsd(2,i,j,k)
+     >                             +           rsd(2,i+1,j,k) )
+               frct(3,i,j,k) = frct(3,i,j,k)
+     >           + tx3 * c3 * c4 * ( flux(3,i+1,j,k) - flux(3,i,j,k) )
+     >              + dx3 * tx1 * (            rsd(3,i-1,j,k)
+     >                             - 2.0d+00 * rsd(3,i,j,k)
+     >                             +           rsd(3,i+1,j,k) )
+               frct(4,i,j,k) = frct(4,i,j,k)
+     >            + tx3 * c3 * c4 * ( flux(4,i+1,j,k) - flux(4,i,j,k) )
+     >              + dx4 * tx1 * (            rsd(4,i-1,j,k)
+     >                             - 2.0d+00 * rsd(4,i,j,k)
+     >                             +           rsd(4,i+1,j,k) )
+               frct(5,i,j,k) = frct(5,i,j,k)
+     >           + tx3 * c3 * c4 * ( flux(5,i+1,j,k) - flux(5,i,j,k) )
+     >              + dx5 * tx1 * (            rsd(5,i-1,j,k)
+     >                             - 2.0d+00 * rsd(5,i,j,k)
+     >                             +           rsd(5,i+1,j,k) )
+            end do
+
+c---------------------------------------------------------------------
+c   Fourth-order dissipation
+c---------------------------------------------------------------------
+            IF (north.eq.-1) then
+             do m = 1, 5
+               frct(m,2,j,k) = frct(m,2,j,k)
+     >           - dsspm * ( + 5.0d+00 * rsd(m,2,j,k)
+     >                       - 4.0d+00 * rsd(m,3,j,k)
+     >                       +           rsd(m,4,j,k) )
+               frct(m,3,j,k) = frct(m,3,j,k)
+     >           - dsspm * ( - 4.0d+00 * rsd(m,2,j,k)
+     >                       + 6.0d+00 * rsd(m,3,j,k)
+     >                       - 4.0d+00 * rsd(m,4,j,k)
+     >                       +           rsd(m,5,j,k) )
+             end do
+            END IF
+
+            do i = ist1,iend1
+               do m = 1, 5
+                  frct(m,i,j,k) = frct(m,i,j,k)
+     >              - dsspm * (            rsd(m,i-2,j,k)
+     >                         - 4.0d+00 * rsd(m,i-1,j,k)
+     >                         + 6.0d+00 * rsd(m,i,j,k)
+     >                         - 4.0d+00 * rsd(m,i+1,j,k)
+     >                         +           rsd(m,i+2,j,k) )
+               end do
+            end do
+
+            IF (south.eq.-1) then
+             do m = 1, 5
+               frct(m,nx-2,j,k) = frct(m,nx-2,j,k)
+     >           - dsspm * (             rsd(m,nx-4,j,k)
+     >                       - 4.0d+00 * rsd(m,nx-3,j,k)
+     >                       + 6.0d+00 * rsd(m,nx-2,j,k)
+     >                       - 4.0d+00 * rsd(m,nx-1,j,k)  )
+               frct(m,nx-1,j,k) = frct(m,nx-1,j,k)
+     >           - dsspm * (             rsd(m,nx-3,j,k)
+     >                       - 4.0d+00 * rsd(m,nx-2,j,k)
+     >                       + 5.0d+00 * rsd(m,nx-1,j,k) )
+             end do
+            END IF
+
+         end do
+      end do
+
+c---------------------------------------------------------------------
+c   eta-direction flux differences
+c---------------------------------------------------------------------
+c
+c   iex = flag : iex = 0  north/south communication
+c              : iex = 1  east/west communication
+c
+c---------------------------------------------------------------------
+      iex   = 1
+
+c---------------------------------------------------------------------
+c   communicate and receive/send two rows of data
+c---------------------------------------------------------------------
+      call exchange_3 (rsd,iex)
+
+      L1 = 0
+      if (west.eq.-1) L1 = 1
+      L2 = ny + 1
+      if (east.eq.-1) L2 = ny
+
+      jst1 = 1
+      jend1 = ny
+      if (west.eq.-1) jst1 = 4
+      if (east.eq.-1) jend1 = ny - 3
+
+      do k = 2, nz - 1
+         do j = L1, L2
+            do i = ist, iend
+               flux(1,i,j,k) = rsd(3,i,j,k)
+               u31 = rsd(3,i,j,k) / rsd(1,i,j,k)
+               q = 0.50d+00 * (  rsd(2,i,j,k) * rsd(2,i,j,k)
+     >                         + rsd(3,i,j,k) * rsd(3,i,j,k)
+     >                         + rsd(4,i,j,k) * rsd(4,i,j,k) )
+     >                      / rsd(1,i,j,k)
+               flux(2,i,j,k) = rsd(2,i,j,k) * u31 
+               flux(3,i,j,k) = rsd(3,i,j,k) * u31 + c2 * 
+     >                       ( rsd(5,i,j,k) - q )
+               flux(4,i,j,k) = rsd(4,i,j,k) * u31
+               flux(5,i,j,k) = ( c1 * rsd(5,i,j,k) - c2 * q ) * u31
+            end do
+         end do
+      end do
+
+      do k = 2, nz - 1
+         do i = ist, iend
+            do j = jst, jend
+               do m = 1, 5
+                  frct(m,i,j,k) =  frct(m,i,j,k)
+     >                 - ty2 * ( flux(m,i,j+1,k) - flux(m,i,j-1,k) )
+               end do
+            end do
+         end do
+
+         do j = jst, L2
+            do i = ist, iend
+               tmp = 1.0d+00 / rsd(1,i,j,k)
+
+               u21j = tmp * rsd(2,i,j,k)
+               u31j = tmp * rsd(3,i,j,k)
+               u41j = tmp * rsd(4,i,j,k)
+               u51j = tmp * rsd(5,i,j,k)
+
+               tmp = 1.0d+00 / rsd(1,i,j-1,k)
+
+               u21jm1 = tmp * rsd(2,i,j-1,k)
+               u31jm1 = tmp * rsd(3,i,j-1,k)
+               u41jm1 = tmp * rsd(4,i,j-1,k)
+               u51jm1 = tmp * rsd(5,i,j-1,k)
+
+               flux(2,i,j,k) = ty3 * ( u21j - u21jm1 )
+               flux(3,i,j,k) = (4.0d+00/3.0d+00) * ty3 * 
+     >                       ( u31j - u31jm1 )
+               flux(4,i,j,k) = ty3 * ( u41j - u41jm1 )
+               flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
+     >              * ty3 * ( ( u21j  **2 + u31j  **2 + u41j  **2 )
+     >                      - ( u21jm1**2 + u31jm1**2 + u41jm1**2 ) )
+     >              + (1.0d+00/6.0d+00)
+     >              * ty3 * ( u31j**2 - u31jm1**2 )
+     >              + c1 * c5 * ty3 * ( u51j - u51jm1 )
+            end do
+         end do
+
+         do j = jst, jend
+            do i = ist, iend
+               frct(1,i,j,k) = frct(1,i,j,k)
+     >              + dy1 * ty1 * (            rsd(1,i,j-1,k)
+     >                             - 2.0d+00 * rsd(1,i,j,k)
+     >                             +           rsd(1,i,j+1,k) )
+               frct(2,i,j,k) = frct(2,i,j,k)
+     >          + ty3 * c3 * c4 * ( flux(2,i,j+1,k) - flux(2,i,j,k) )
+     >              + dy2 * ty1 * (            rsd(2,i,j-1,k)
+     >                             - 2.0d+00 * rsd(2,i,j,k)
+     >                             +           rsd(2,i,j+1,k) )
+               frct(3,i,j,k) = frct(3,i,j,k)
+     >          + ty3 * c3 * c4 * ( flux(3,i,j+1,k) - flux(3,i,j,k) )
+     >              + dy3 * ty1 * (            rsd(3,i,j-1,k)
+     >                             - 2.0d+00 * rsd(3,i,j,k)
+     >                             +           rsd(3,i,j+1,k) )
+               frct(4,i,j,k) = frct(4,i,j,k)
+     >          + ty3 * c3 * c4 * ( flux(4,i,j+1,k) - flux(4,i,j,k) )
+     >              + dy4 * ty1 * (            rsd(4,i,j-1,k)
+     >                             - 2.0d+00 * rsd(4,i,j,k)
+     >                             +           rsd(4,i,j+1,k) )
+               frct(5,i,j,k) = frct(5,i,j,k)
+     >          + ty3 * c3 * c4 * ( flux(5,i,j+1,k) - flux(5,i,j,k) )
+     >              + dy5 * ty1 * (            rsd(5,i,j-1,k)
+     >                             - 2.0d+00 * rsd(5,i,j,k)
+     >                             +           rsd(5,i,j+1,k) )
+            end do
+         end do
+
+c---------------------------------------------------------------------
+c   fourth-order dissipation
+c---------------------------------------------------------------------
+         IF (west.eq.-1) then
+            do i = ist, iend
+             do m = 1, 5
+               frct(m,i,2,k) = frct(m,i,2,k)
+     >           - dsspm * ( + 5.0d+00 * rsd(m,i,2,k)
+     >                       - 4.0d+00 * rsd(m,i,3,k)
+     >                       +           rsd(m,i,4,k) )
+               frct(m,i,3,k) = frct(m,i,3,k)
+     >           - dsspm * ( - 4.0d+00 * rsd(m,i,2,k)
+     >                       + 6.0d+00 * rsd(m,i,3,k)
+     >                       - 4.0d+00 * rsd(m,i,4,k)
+     >                       +           rsd(m,i,5,k) )
+             end do
+            end do
+         END IF
+
+         do j = jst1, jend1
+            do i = ist, iend
+               do m = 1, 5
+                  frct(m,i,j,k) = frct(m,i,j,k)
+     >              - dsspm * (            rsd(m,i,j-2,k)
+     >                        - 4.0d+00 * rsd(m,i,j-1,k)
+     >                        + 6.0d+00 * rsd(m,i,j,k)
+     >                        - 4.0d+00 * rsd(m,i,j+1,k)
+     >                        +           rsd(m,i,j+2,k) )
+               end do
+            end do
+         end do
+
+         IF (east.eq.-1) then
+            do i = ist, iend
+             do m = 1, 5
+               frct(m,i,ny-2,k) = frct(m,i,ny-2,k)
+     >           - dsspm * (             rsd(m,i,ny-4,k)
+     >                       - 4.0d+00 * rsd(m,i,ny-3,k)
+     >                       + 6.0d+00 * rsd(m,i,ny-2,k)
+     >                       - 4.0d+00 * rsd(m,i,ny-1,k)  )
+               frct(m,i,ny-1,k) = frct(m,i,ny-1,k)
+     >           - dsspm * (             rsd(m,i,ny-3,k)
+     >                       - 4.0d+00 * rsd(m,i,ny-2,k)
+     >                       + 5.0d+00 * rsd(m,i,ny-1,k)  )
+             end do
+            end do
+         END IF
+
+      end do
+
+c---------------------------------------------------------------------
+c   zeta-direction flux differences
+c---------------------------------------------------------------------
+      do k = 1, nz
+         do j = jst, jend
+            do i = ist, iend
+               flux(1,i,j,k) = rsd(4,i,j,k)
+               u41 = rsd(4,i,j,k) / rsd(1,i,j,k)
+               q = 0.50d+00 * (  rsd(2,i,j,k) * rsd(2,i,j,k)
+     >                         + rsd(3,i,j,k) * rsd(3,i,j,k)
+     >                         + rsd(4,i,j,k) * rsd(4,i,j,k) )
+     >                      / rsd(1,i,j,k)
+               flux(2,i,j,k) = rsd(2,i,j,k) * u41 
+               flux(3,i,j,k) = rsd(3,i,j,k) * u41 
+               flux(4,i,j,k) = rsd(4,i,j,k) * u41 + c2 * 
+     >                         ( rsd(5,i,j,k) - q )
+               flux(5,i,j,k) = ( c1 * rsd(5,i,j,k) - c2 * q ) * u41
+            end do
+         end do
+      end do
+
+      do k = 2, nz - 1
+         do j = jst, jend
+            do i = ist, iend
+               do m = 1, 5
+                  frct(m,i,j,k) =  frct(m,i,j,k)
+     >                  - tz2 * ( flux(m,i,j,k+1) - flux(m,i,j,k-1) )
+               end do
+            end do
+         end do
+      end do
+
+      do k = 2, nz
+         do j = jst, jend
+            do i = ist, iend
+               tmp = 1.0d+00 / rsd(1,i,j,k)
+
+               u21k = tmp * rsd(2,i,j,k)
+               u31k = tmp * rsd(3,i,j,k)
+               u41k = tmp * rsd(4,i,j,k)
+               u51k = tmp * rsd(5,i,j,k)
+
+               tmp = 1.0d+00 / rsd(1,i,j,k-1)
+
+               u21km1 = tmp * rsd(2,i,j,k-1)
+               u31km1 = tmp * rsd(3,i,j,k-1)
+               u41km1 = tmp * rsd(4,i,j,k-1)
+               u51km1 = tmp * rsd(5,i,j,k-1)
+
+               flux(2,i,j,k) = tz3 * ( u21k - u21km1 )
+               flux(3,i,j,k) = tz3 * ( u31k - u31km1 )
+               flux(4,i,j,k) = (4.0d+00/3.0d+00) * tz3 * ( u41k 
+     >                       - u41km1 )
+               flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
+     >              * tz3 * ( ( u21k  **2 + u31k  **2 + u41k  **2 )
+     >                      - ( u21km1**2 + u31km1**2 + u41km1**2 ) )
+     >              + (1.0d+00/6.0d+00)
+     >              * tz3 * ( u41k**2 - u41km1**2 )
+     >              + c1 * c5 * tz3 * ( u51k - u51km1 )
+            end do
+         end do
+      end do
+
+      do k = 2, nz - 1
+         do j = jst, jend
+            do i = ist, iend
+               frct(1,i,j,k) = frct(1,i,j,k)
+     >              + dz1 * tz1 * (            rsd(1,i,j,k+1)
+     >                             - 2.0d+00 * rsd(1,i,j,k)
+     >                             +           rsd(1,i,j,k-1) )
+               frct(2,i,j,k) = frct(2,i,j,k)
+     >          + tz3 * c3 * c4 * ( flux(2,i,j,k+1) - flux(2,i,j,k) )
+     >              + dz2 * tz1 * (            rsd(2,i,j,k+1)
+     >                             - 2.0d+00 * rsd(2,i,j,k)
+     >                             +           rsd(2,i,j,k-1) )
+               frct(3,i,j,k) = frct(3,i,j,k)
+     >          + tz3 * c3 * c4 * ( flux(3,i,j,k+1) - flux(3,i,j,k) )
+     >              + dz3 * tz1 * (            rsd(3,i,j,k+1)
+     >                             - 2.0d+00 * rsd(3,i,j,k)
+     >                             +           rsd(3,i,j,k-1) )
+               frct(4,i,j,k) = frct(4,i,j,k)
+     >          + tz3 * c3 * c4 * ( flux(4,i,j,k+1) - flux(4,i,j,k) )
+     >              + dz4 * tz1 * (            rsd(4,i,j,k+1)
+     >                             - 2.0d+00 * rsd(4,i,j,k)
+     >                             +           rsd(4,i,j,k-1) )
+               frct(5,i,j,k) = frct(5,i,j,k)
+     >          + tz3 * c3 * c4 * ( flux(5,i,j,k+1) - flux(5,i,j,k) )
+     >              + dz5 * tz1 * (            rsd(5,i,j,k+1)
+     >                             - 2.0d+00 * rsd(5,i,j,k)
+     >                             +           rsd(5,i,j,k-1) )
+            end do
+         end do
+      end do
+
+c---------------------------------------------------------------------
+c   fourth-order dissipation
+c---------------------------------------------------------------------
+      do j = jst, jend
+         do i = ist, iend
+            do m = 1, 5
+               frct(m,i,j,2) = frct(m,i,j,2)
+     >           - dsspm * ( + 5.0d+00 * rsd(m,i,j,2)
+     >                       - 4.0d+00 * rsd(m,i,j,3)
+     >                       +           rsd(m,i,j,4) )
+               frct(m,i,j,3) = frct(m,i,j,3)
+     >           - dsspm * (- 4.0d+00 * rsd(m,i,j,2)
+     >                      + 6.0d+00 * rsd(m,i,j,3)
+     >                      - 4.0d+00 * rsd(m,i,j,4)
+     >                      +           rsd(m,i,j,5) )
+            end do
+         end do
+      end do
+
+      do k = 4, nz - 3
+         do j = jst, jend
+            do i = ist, iend
+               do m = 1, 5
+                  frct(m,i,j,k) = frct(m,i,j,k)
+     >              - dsspm * (           rsd(m,i,j,k-2)
+     >                        - 4.0d+00 * rsd(m,i,j,k-1)
+     >                        + 6.0d+00 * rsd(m,i,j,k)
+     >                        - 4.0d+00 * rsd(m,i,j,k+1)
+     >                        +           rsd(m,i,j,k+2) )
+               end do
+            end do
+         end do
+      end do
+
+      do j = jst, jend
+         do i = ist, iend
+            do m = 1, 5
+               frct(m,i,j,nz-2) = frct(m,i,j,nz-2)
+     >           - dsspm * (            rsd(m,i,j,nz-4)
+     >                      - 4.0d+00 * rsd(m,i,j,nz-3)
+     >                      + 6.0d+00 * rsd(m,i,j,nz-2)
+     >                      - 4.0d+00 * rsd(m,i,j,nz-1)  )
+               frct(m,i,j,nz-1) = frct(m,i,j,nz-1)
+     >           - dsspm * (             rsd(m,i,j,nz-3)
+     >                       - 4.0d+00 * rsd(m,i,j,nz-2)
+     >                       + 5.0d+00 * rsd(m,i,j,nz-1)  )
+            end do
+         end do
+      end do
+
+      return
+      end
diff --git a/examples/smpi/NAS/LU/error.f b/examples/smpi/NAS/LU/error.f
new file mode 100644
index 0000000000..e83f74912f
--- /dev/null
+++ b/examples/smpi/NAS/LU/error.f
@@ -0,0 +1,81 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine error
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c   compute the solution error
+c
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer i, j, k, m
+      integer iglob, jglob
+      double precision  tmp
+      double precision  u000ijk(5), dummy(5)
+
+      integer IERROR
+
+
+      do m = 1, 5
+         errnm(m) = 0.0d+00
+         dummy(m) = 0.0d+00
+      end do
+
+      do k = 2, nz-1
+         do j = jst, jend
+            jglob = jpt + j
+            do i = ist, iend
+               iglob = ipt + i
+               call exact( iglob, jglob, k, u000ijk )
+               do m = 1, 5
+                  tmp = ( u000ijk(m) - u(m,i,j,k) )
+                  dummy(m) = dummy(m) + tmp ** 2
+               end do
+            end do
+         end do
+      end do
+
+c---------------------------------------------------------------------
+c   compute the global sum of individual contributions to dot product.
+c---------------------------------------------------------------------
+      call MPI_ALLREDUCE( dummy,
+     >                    errnm,
+     >                    5,
+     >                    dp_type,
+     >                    MPI_SUM,
+     >                    MPI_COMM_WORLD,
+     >                    IERROR )
+
+      do m = 1, 5
+         errnm(m) = sqrt ( errnm(m) / ( (nx0-2)*(ny0-2)*(nz0-2) ) )
+      end do
+
+c      if (id.eq.0) then
+c        write (*,1002) ( errnm(m), m = 1, 5 )
+c      end if
+
+ 1002 format (1x/1x,'RMS-norm of error in soln. to ',
+     > 'first pde  = ',1pe12.5/,
+     > 1x,'RMS-norm of error in soln. to ',
+     > 'second pde = ',1pe12.5/,
+     > 1x,'RMS-norm of error in soln. to ',
+     > 'third pde  = ',1pe12.5/,
+     > 1x,'RMS-norm of error in soln. to ',
+     > 'fourth pde = ',1pe12.5/,
+     > 1x,'RMS-norm of error in soln. to ',
+     > 'fifth pde  = ',1pe12.5)
+
+      return
+      end
diff --git a/examples/smpi/NAS/LU/exact.f b/examples/smpi/NAS/LU/exact.f
new file mode 100644
index 0000000000..19e14c3d5f
--- /dev/null
+++ b/examples/smpi/NAS/LU/exact.f
@@ -0,0 +1,53 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine exact( i, j, k, u000ijk )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c   compute the exact solution at (i,j,k)
+c
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c  input parameters
+c---------------------------------------------------------------------
+      integer i, j, k
+      double precision u000ijk(*)
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer m
+      double precision xi, eta, zeta
+
+      xi  = ( dble ( i - 1 ) ) / ( nx0 - 1 )
+      eta  = ( dble ( j - 1 ) ) / ( ny0 - 1 )
+      zeta = ( dble ( k - 1 ) ) / ( nz - 1 )
+
+
+      do m = 1, 5
+         u000ijk(m) =  ce(m,1)
+     >        + ce(m,2) * xi
+     >        + ce(m,3) * eta
+     >        + ce(m,4) * zeta
+     >        + ce(m,5) * xi * xi
+     >        + ce(m,6) * eta * eta
+     >        + ce(m,7) * zeta * zeta
+     >        + ce(m,8) * xi * xi * xi
+     >        + ce(m,9) * eta * eta * eta
+     >        + ce(m,10) * zeta * zeta * zeta
+     >        + ce(m,11) * xi * xi * xi * xi
+     >        + ce(m,12) * eta * eta * eta * eta
+     >        + ce(m,13) * zeta * zeta * zeta * zeta
+      end do
+
+      return
+      end
diff --git a/examples/smpi/NAS/LU/exchange_1.f b/examples/smpi/NAS/LU/exchange_1.f
new file mode 100644
index 0000000000..2bf7d28b94
--- /dev/null
+++ b/examples/smpi/NAS/LU/exchange_1.f
@@ -0,0 +1,180 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine exchange_1( g,k,iex )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+
+      implicit none
+
+      include 'mpinpb.h'
+      include 'applu.incl'
+
+      double precision  g(5,-1:isiz1+2,-1:isiz2+2,isiz3)
+      integer k
+      integer iex
+      integer i, j
+      double precision dum(5,isiz1+isiz2), dum1(5,isiz1+isiz2)
+
+      integer STATUS(MPI_STATUS_SIZE)
+      integer IERROR
+
+
+
+      if( iex .eq. 0 ) then
+
+          if( north .ne. -1 ) then
+              call MPI_RECV( dum1(1,jst),
+     >                       5*(jend-jst+1),
+     >                       dp_type,
+     >                       north,
+     >                       from_n,
+     >                       MPI_COMM_WORLD,
+     >                       status,
+     >                       IERROR )
+              do j=jst,jend
+                  g(1,0,j,k) = dum1(1,j)
+                  g(2,0,j,k) = dum1(2,j)
+                  g(3,0,j,k) = dum1(3,j)
+                  g(4,0,j,k) = dum1(4,j)
+                  g(5,0,j,k) = dum1(5,j)
+              enddo
+          endif
+
+          if( west .ne. -1 ) then
+              call MPI_RECV( dum1(1,ist),
+     >                       5*(iend-ist+1),
+     >                       dp_type,
+     >                       west,
+     >                       from_w,
+     >                       MPI_COMM_WORLD,
+     >                       status,
+     >                       IERROR )
+              do i=ist,iend
+                  g(1,i,0,k) = dum1(1,i)
+                  g(2,i,0,k) = dum1(2,i)
+                  g(3,i,0,k) = dum1(3,i)
+                  g(4,i,0,k) = dum1(4,i)
+                  g(5,i,0,k) = dum1(5,i)
+              enddo
+          endif
+
+      else if( iex .eq. 1 ) then
+
+          if( south .ne. -1 ) then
+              call MPI_RECV( dum1(1,jst),
+     >                       5*(jend-jst+1),
+     >                       dp_type,
+     >                       south,
+     >                       from_s,
+     >                       MPI_COMM_WORLD,
+     >                       status,
+     >                       IERROR )
+              do j=jst,jend
+                  g(1,nx+1,j,k) = dum1(1,j)
+                  g(2,nx+1,j,k) = dum1(2,j)
+                  g(3,nx+1,j,k) = dum1(3,j)
+                  g(4,nx+1,j,k) = dum1(4,j)
+                  g(5,nx+1,j,k) = dum1(5,j)
+              enddo
+          endif
+
+          if( east .ne. -1 ) then
+              call MPI_RECV( dum1(1,ist),
+     >                       5*(iend-ist+1),
+     >                       dp_type,
+     >                       east,
+     >                       from_e,
+     >                       MPI_COMM_WORLD,
+     >                       status,
+     >                       IERROR )
+              do i=ist,iend
+                  g(1,i,ny+1,k) = dum1(1,i)
+                  g(2,i,ny+1,k) = dum1(2,i)
+                  g(3,i,ny+1,k) = dum1(3,i)
+                  g(4,i,ny+1,k) = dum1(4,i)
+                  g(5,i,ny+1,k) = dum1(5,i)
+              enddo
+          endif
+
+      else if( iex .eq. 2 ) then
+
+          if( south .ne. -1 ) then
+              do j=jst,jend
+                  dum(1,j) = g(1,nx,j,k) 
+                  dum(2,j) = g(2,nx,j,k) 
+                  dum(3,j) = g(3,nx,j,k) 
+                  dum(4,j) = g(4,nx,j,k) 
+                  dum(5,j) = g(5,nx,j,k) 
+              enddo
+              call MPI_SEND( dum(1,jst), 
+     >                       5*(jend-jst+1), 
+     >                       dp_type, 
+     >                       south, 
+     >                       from_n, 
+     >                       MPI_COMM_WORLD, 
+     >                       IERROR )
+          endif
+
+          if( east .ne. -1 ) then
+              do i=ist,iend
+                  dum(1,i) = g(1,i,ny,k)
+                  dum(2,i) = g(2,i,ny,k)
+                  dum(3,i) = g(3,i,ny,k)
+                  dum(4,i) = g(4,i,ny,k)
+                  dum(5,i) = g(5,i,ny,k)
+              enddo
+              call MPI_SEND( dum(1,ist), 
+     >                       5*(iend-ist+1), 
+     >                       dp_type, 
+     >                       east, 
+     >                       from_w, 
+     >                       MPI_COMM_WORLD, 
+     >                       IERROR )
+          endif
+
+      else
+
+          if( north .ne. -1 ) then
+              do j=jst,jend
+                  dum(1,j) = g(1,1,j,k)
+                  dum(2,j) = g(2,1,j,k)
+                  dum(3,j) = g(3,1,j,k)
+                  dum(4,j) = g(4,1,j,k)
+                  dum(5,j) = g(5,1,j,k)
+              enddo
+              call MPI_SEND( dum(1,jst), 
+     >                       5*(jend-jst+1), 
+     >                       dp_type, 
+     >                       north, 
+     >                       from_s, 
+     >                       MPI_COMM_WORLD, 
+     >                       IERROR )
+          endif
+
+          if( west .ne. -1 ) then
+              do i=ist,iend
+                  dum(1,i) = g(1,i,1,k)
+                  dum(2,i) = g(2,i,1,k)
+                  dum(3,i) = g(3,i,1,k)
+                  dum(4,i) = g(4,i,1,k)
+                  dum(5,i) = g(5,i,1,k)
+              enddo
+              call MPI_SEND( dum(1,ist), 
+     >                       5*(iend-ist+1), 
+     >                       dp_type, 
+     >                       west, 
+     >                       from_e, 
+     >                       MPI_COMM_WORLD, 
+     >                       IERROR )
+          endif
+
+      endif
+
+      end
+
+
+
diff --git a/examples/smpi/NAS/LU/exchange_3.f b/examples/smpi/NAS/LU/exchange_3.f
new file mode 100644
index 0000000000..d52ae7ef4d
--- /dev/null
+++ b/examples/smpi/NAS/LU/exchange_3.f
@@ -0,0 +1,312 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine exchange_3(g,iex)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   compute the right hand side based on exact solution
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c  input parameters
+c---------------------------------------------------------------------
+      double precision  g(5,-1:isiz1+2,-1:isiz2+2,isiz3)
+      integer iex
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer i, j, k
+      integer ipos1, ipos2
+
+      integer mid
+      integer STATUS(MPI_STATUS_SIZE)
+      integer IERROR
+
+
+
+      if (iex.eq.0) then
+c---------------------------------------------------------------------
+c   communicate in the south and north directions
+c---------------------------------------------------------------------
+      if (north.ne.-1) then
+          call MPI_IRECV( buf1,
+     >                    10*ny*nz,
+     >                    dp_type,
+     >                    MPI_ANY_SOURCE,
+     >                    from_n,
+     >                    MPI_COMM_WORLD,
+     >                    mid,
+     >                    IERROR )
+      end if
+
+c---------------------------------------------------------------------
+c   send south
+c---------------------------------------------------------------------
+      if (south.ne.-1) then
+          do k = 1,nz
+            do j = 1,ny
+              ipos1 = (k-1)*ny + j
+              ipos2 = ipos1 + ny*nz
+              buf(1,ipos1) = g(1,nx-1,j,k) 
+              buf(2,ipos1) = g(2,nx-1,j,k) 
+              buf(3,ipos1) = g(3,nx-1,j,k) 
+              buf(4,ipos1) = g(4,nx-1,j,k) 
+              buf(5,ipos1) = g(5,nx-1,j,k) 
+              buf(1,ipos2) = g(1,nx,j,k)
+              buf(2,ipos2) = g(2,nx,j,k)
+              buf(3,ipos2) = g(3,nx,j,k)
+              buf(4,ipos2) = g(4,nx,j,k)
+              buf(5,ipos2) = g(5,nx,j,k)
+            end do
+          end do
+
+          call MPI_SEND( buf,
+     >                   10*ny*nz,
+     >                   dp_type,
+     >                   south,
+     >                   from_n,
+     >                   MPI_COMM_WORLD,
+     >                   IERROR )
+        end if
+
+c---------------------------------------------------------------------
+c   receive from north
+c---------------------------------------------------------------------
+        if (north.ne.-1) then
+          call MPI_WAIT( mid, STATUS, IERROR )
+
+          do k = 1,nz
+            do j = 1,ny
+              ipos1 = (k-1)*ny + j
+              ipos2 = ipos1 + ny*nz
+              g(1,-1,j,k) = buf1(1,ipos1)
+              g(2,-1,j,k) = buf1(2,ipos1)
+              g(3,-1,j,k) = buf1(3,ipos1)
+              g(4,-1,j,k) = buf1(4,ipos1)
+              g(5,-1,j,k) = buf1(5,ipos1)
+              g(1,0,j,k) = buf1(1,ipos2)
+              g(2,0,j,k) = buf1(2,ipos2)
+              g(3,0,j,k) = buf1(3,ipos2)
+              g(4,0,j,k) = buf1(4,ipos2)
+              g(5,0,j,k) = buf1(5,ipos2)
+            end do
+          end do
+
+        end if
+
+      if (south.ne.-1) then
+          call MPI_IRECV( buf1,
+     >                    10*ny*nz,
+     >                    dp_type,
+     >                    MPI_ANY_SOURCE,
+     >                    from_s,
+     >                    MPI_COMM_WORLD,
+     >                    mid,
+     >                    IERROR )
+      end if
+
+c---------------------------------------------------------------------
+c   send north
+c---------------------------------------------------------------------
+        if (north.ne.-1) then
+          do k = 1,nz
+            do j = 1,ny
+              ipos1 = (k-1)*ny + j
+              ipos2 = ipos1 + ny*nz
+              buf(1,ipos1) = g(1,2,j,k)
+              buf(2,ipos1) = g(2,2,j,k)
+              buf(3,ipos1) = g(3,2,j,k)
+              buf(4,ipos1) = g(4,2,j,k)
+              buf(5,ipos1) = g(5,2,j,k)
+              buf(1,ipos2) = g(1,1,j,k)
+              buf(2,ipos2) = g(2,1,j,k)
+              buf(3,ipos2) = g(3,1,j,k)
+              buf(4,ipos2) = g(4,1,j,k)
+              buf(5,ipos2) = g(5,1,j,k)
+            end do
+          end do
+
+          call MPI_SEND( buf,
+     >                   10*ny*nz,
+     >                   dp_type,
+     >                   north,
+     >                   from_s,
+     >                   MPI_COMM_WORLD,
+     >                   IERROR )
+        end if
+
+c---------------------------------------------------------------------
+c   receive from south
+c---------------------------------------------------------------------
+        if (south.ne.-1) then
+          call MPI_WAIT( mid, STATUS, IERROR )
+
+          do k = 1,nz
+            do j = 1,ny
+              ipos1 = (k-1)*ny + j
+              ipos2 = ipos1 + ny*nz
+              g(1,nx+2,j,k)  = buf1(1,ipos1)
+              g(2,nx+2,j,k)  = buf1(2,ipos1)
+              g(3,nx+2,j,k)  = buf1(3,ipos1)
+              g(4,nx+2,j,k)  = buf1(4,ipos1)
+              g(5,nx+2,j,k)  = buf1(5,ipos1)
+              g(1,nx+1,j,k) = buf1(1,ipos2)
+              g(2,nx+1,j,k) = buf1(2,ipos2)
+              g(3,nx+1,j,k) = buf1(3,ipos2)
+              g(4,nx+1,j,k) = buf1(4,ipos2)
+              g(5,nx+1,j,k) = buf1(5,ipos2)
+            end do
+          end do
+        end if
+
+      else
+
+c---------------------------------------------------------------------
+c   communicate in the east and west directions
+c---------------------------------------------------------------------
+      if (west.ne.-1) then
+          call MPI_IRECV( buf1,
+     >                    10*nx*nz,
+     >                    dp_type,
+     >                    MPI_ANY_SOURCE,
+     >                    from_w,
+     >                    MPI_COMM_WORLD,
+     >                    mid,
+     >                    IERROR )
+      end if
+
+c---------------------------------------------------------------------
+c   send east
+c---------------------------------------------------------------------
+        if (east.ne.-1) then
+          do k = 1,nz
+            do i = 1,nx
+              ipos1 = (k-1)*nx + i
+              ipos2 = ipos1 + nx*nz
+              buf(1,ipos1) = g(1,i,ny-1,k)
+              buf(2,ipos1) = g(2,i,ny-1,k)
+              buf(3,ipos1) = g(3,i,ny-1,k)
+              buf(4,ipos1) = g(4,i,ny-1,k)
+              buf(5,ipos1) = g(5,i,ny-1,k)
+              buf(1,ipos2) = g(1,i,ny,k)
+              buf(2,ipos2) = g(2,i,ny,k)
+              buf(3,ipos2) = g(3,i,ny,k)
+              buf(4,ipos2) = g(4,i,ny,k)
+              buf(5,ipos2) = g(5,i,ny,k)
+            end do
+          end do
+
+          call MPI_SEND( buf,
+     >                   10*nx*nz,
+     >                   dp_type,
+     >                   east,
+     >                   from_w,
+     >                   MPI_COMM_WORLD,
+     >                   IERROR )
+        end if
+
+c---------------------------------------------------------------------
+c   receive from west
+c---------------------------------------------------------------------
+        if (west.ne.-1) then
+          call MPI_WAIT( mid, STATUS, IERROR )
+
+          do k = 1,nz
+            do i = 1,nx
+              ipos1 = (k-1)*nx + i
+              ipos2 = ipos1 + nx*nz
+              g(1,i,-1,k) = buf1(1,ipos1)
+              g(2,i,-1,k) = buf1(2,ipos1)
+              g(3,i,-1,k) = buf1(3,ipos1)
+              g(4,i,-1,k) = buf1(4,ipos1)
+              g(5,i,-1,k) = buf1(5,ipos1)
+              g(1,i,0,k) = buf1(1,ipos2)
+              g(2,i,0,k) = buf1(2,ipos2)
+              g(3,i,0,k) = buf1(3,ipos2)
+              g(4,i,0,k) = buf1(4,ipos2)
+              g(5,i,0,k) = buf1(5,ipos2)
+            end do
+          end do
+
+        end if
+
+      if (east.ne.-1) then
+          call MPI_IRECV( buf1,
+     >                    10*nx*nz,
+     >                    dp_type,
+     >                    MPI_ANY_SOURCE,
+     >                    from_e,
+     >                    MPI_COMM_WORLD,
+     >                    mid,
+     >                    IERROR )
+      end if
+
+c---------------------------------------------------------------------
+c   send west
+c---------------------------------------------------------------------
+      if (west.ne.-1) then
+          do k = 1,nz
+            do i = 1,nx
+              ipos1 = (k-1)*nx + i
+              ipos2 = ipos1 + nx*nz
+              buf(1,ipos1) = g(1,i,2,k)
+              buf(2,ipos1) = g(2,i,2,k)
+              buf(3,ipos1) = g(3,i,2,k)
+              buf(4,ipos1) = g(4,i,2,k)
+              buf(5,ipos1) = g(5,i,2,k)
+              buf(1,ipos2) = g(1,i,1,k)
+              buf(2,ipos2) = g(2,i,1,k)
+              buf(3,ipos2) = g(3,i,1,k)
+              buf(4,ipos2) = g(4,i,1,k)
+              buf(5,ipos2) = g(5,i,1,k)
+            end do
+          end do
+
+          call MPI_SEND( buf,
+     >                   10*nx*nz,
+     >                   dp_type,
+     >                   west,
+     >                   from_e,
+     >                   MPI_COMM_WORLD,
+     >                   IERROR )
+        end if
+
+c---------------------------------------------------------------------
+c   receive from east
+c---------------------------------------------------------------------
+        if (east.ne.-1) then
+          call MPI_WAIT( mid, STATUS, IERROR )
+
+          do k = 1,nz
+            do i = 1,nx
+              ipos1 = (k-1)*nx + i
+              ipos2 = ipos1 + nx*nz
+              g(1,i,ny+2,k)  = buf1(1,ipos1)
+              g(2,i,ny+2,k)  = buf1(2,ipos1)
+              g(3,i,ny+2,k)  = buf1(3,ipos1)
+              g(4,i,ny+2,k)  = buf1(4,ipos1)
+              g(5,i,ny+2,k)  = buf1(5,ipos1)
+              g(1,i,ny+1,k) = buf1(1,ipos2)
+              g(2,i,ny+1,k) = buf1(2,ipos2)
+              g(3,i,ny+1,k) = buf1(3,ipos2)
+              g(4,i,ny+1,k) = buf1(4,ipos2)
+              g(5,i,ny+1,k) = buf1(5,ipos2)
+            end do
+          end do
+
+        end if
+
+      end if
+
+      return
+      end     
diff --git a/examples/smpi/NAS/LU/exchange_4.f b/examples/smpi/NAS/LU/exchange_4.f
new file mode 100644
index 0000000000..1c4c38e218
--- /dev/null
+++ b/examples/smpi/NAS/LU/exchange_4.f
@@ -0,0 +1,133 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine exchange_4(g,h,ibeg,ifin1,jbeg,jfin1)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   compute the right hand side based on exact solution
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c  input parameters
+c---------------------------------------------------------------------
+      double precision  g(0:isiz2+1,0:isiz3+1), 
+     >        h(0:isiz2+1,0:isiz3+1)
+      integer ibeg, ifin1
+      integer jbeg, jfin1
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer i, j
+      integer ny2
+      double precision  dum(1024)
+
+      integer msgid1, msgid3
+      integer STATUS(MPI_STATUS_SIZE)
+      integer IERROR
+
+
+
+      ny2 = ny + 2
+
+c---------------------------------------------------------------------
+c   communicate in the east and west directions
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   receive from east
+c---------------------------------------------------------------------
+      if (jfin1.eq.ny) then
+        call MPI_IRECV( dum,
+     >                  2*nx,
+     >                  dp_type,
+     >                  MPI_ANY_SOURCE,
+     >                  from_e,
+     >                  MPI_COMM_WORLD,
+     >                  msgid3,
+     >                  IERROR )
+
+        call MPI_WAIT( msgid3, STATUS, IERROR )
+
+        do i = 1,nx
+          g(i,ny+1) = dum(i)
+          h(i,ny+1) = dum(i+nx)
+        end do
+
+      end if
+
+c---------------------------------------------------------------------
+c   send west
+c---------------------------------------------------------------------
+      if (jbeg.eq.1) then
+        do i = 1,nx
+          dum(i) = g(i,1)
+          dum(i+nx) = h(i,1)
+        end do
+
+        call MPI_SEND( dum,
+     >                 2*nx,
+     >                 dp_type,
+     >                 west,
+     >                 from_e,
+     >                 MPI_COMM_WORLD,
+     >                 IERROR )
+
+      end if
+
+c---------------------------------------------------------------------
+c   communicate in the south and north directions
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   receive from south
+c---------------------------------------------------------------------
+      if (ifin1.eq.nx) then
+        call MPI_IRECV( dum,
+     >                  2*ny2,
+     >                  dp_type,
+     >                  MPI_ANY_SOURCE,
+     >                  from_s,
+     >                  MPI_COMM_WORLD,
+     >                  msgid1,
+     >                  IERROR )
+
+        call MPI_WAIT( msgid1, STATUS, IERROR )
+
+        do j = 0,ny+1
+          g(nx+1,j) = dum(j+1)
+          h(nx+1,j) = dum(j+ny2+1)
+        end do
+
+      end if
+
+c---------------------------------------------------------------------
+c   send north
+c---------------------------------------------------------------------
+      if (ibeg.eq.1) then
+        do j = 0,ny+1
+          dum(j+1) = g(1,j)
+          dum(j+ny2+1) = h(1,j)
+        end do
+
+        call MPI_SEND( dum,
+     >                 2*ny2,
+     >                 dp_type,
+     >                 north,
+     >                 from_s,
+     >                 MPI_COMM_WORLD,
+     >                 IERROR )
+
+      end if
+
+      return
+      end     
diff --git a/examples/smpi/NAS/LU/exchange_5.f b/examples/smpi/NAS/LU/exchange_5.f
new file mode 100644
index 0000000000..e4cc66f5bf
--- /dev/null
+++ b/examples/smpi/NAS/LU/exchange_5.f
@@ -0,0 +1,81 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine exchange_5(g,ibeg,ifin1)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   compute the right hand side based on exact solution
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c  input parameters
+c---------------------------------------------------------------------
+      double precision  g(0:isiz2+1,0:isiz3+1)
+      integer ibeg, ifin1
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer k
+      double precision  dum(1024)
+
+      integer msgid1
+      integer STATUS(MPI_STATUS_SIZE)
+      integer IERROR
+
+
+
+c---------------------------------------------------------------------
+c   communicate in the south and north directions
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   receive from south
+c---------------------------------------------------------------------
+      if (ifin1.eq.nx) then
+        call MPI_IRECV( dum,
+     >                  nz,
+     >                  dp_type,
+     >                  MPI_ANY_SOURCE,
+     >                  from_s,
+     >                  MPI_COMM_WORLD,
+     >                  msgid1,
+     >                  IERROR )
+
+        call MPI_WAIT( msgid1, STATUS, IERROR )
+
+        do k = 1,nz
+          g(nx+1,k) = dum(k)
+        end do
+
+      end if
+
+c---------------------------------------------------------------------
+c   send north
+c---------------------------------------------------------------------
+      if (ibeg.eq.1) then
+        do k = 1,nz
+          dum(k) = g(1,k)
+        end do
+
+        call MPI_SEND( dum,
+     >                 nz,
+     >                 dp_type,
+     >                 north,
+     >                 from_s,
+     >                 MPI_COMM_WORLD,
+     >                 IERROR )
+
+      end if
+
+      return
+      end     
diff --git a/examples/smpi/NAS/LU/exchange_6.f b/examples/smpi/NAS/LU/exchange_6.f
new file mode 100644
index 0000000000..0626609547
--- /dev/null
+++ b/examples/smpi/NAS/LU/exchange_6.f
@@ -0,0 +1,81 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine exchange_6(g,jbeg,jfin1)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   compute the right hand side based on exact solution
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c  input parameters
+c---------------------------------------------------------------------
+      double precision  g(0:isiz2+1,0:isiz3+1)
+      integer jbeg, jfin1
+
+c---------------------------------------------------------------------
+c  local parameters
+c---------------------------------------------------------------------
+      integer k
+      double precision  dum(1024)
+
+      integer msgid3
+      integer STATUS(MPI_STATUS_SIZE)
+      integer IERROR
+
+
+
+c---------------------------------------------------------------------
+c   communicate in the east and west directions
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   receive from east
+c---------------------------------------------------------------------
+      if (jfin1.eq.ny) then
+        call MPI_IRECV( dum,
+     >                  nz,
+     >                  dp_type,
+     >                  MPI_ANY_SOURCE,
+     >                  from_e,
+     >                  MPI_COMM_WORLD,
+     >                  msgid3,
+     >                  IERROR )
+
+        call MPI_WAIT( msgid3, STATUS, IERROR )
+
+        do k = 1,nz
+          g(ny+1,k) = dum(k)
+        end do
+
+      end if
+
+c---------------------------------------------------------------------
+c   send west
+c---------------------------------------------------------------------
+      if (jbeg.eq.1) then
+        do k = 1,nz
+          dum(k) = g(1,k)
+        end do
+
+        call MPI_SEND( dum,
+     >                 nz,
+     >                 dp_type,
+     >                 west,
+     >                 from_e,
+     >                 MPI_COMM_WORLD,
+     >                 IERROR )
+
+      end if
+
+      return
+      end     
diff --git a/examples/smpi/NAS/LU/init_comm.f b/examples/smpi/NAS/LU/init_comm.f
new file mode 100644
index 0000000000..72ece00ac4
--- /dev/null
+++ b/examples/smpi/NAS/LU/init_comm.f
@@ -0,0 +1,57 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine init_comm 
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c   initialize MPI and establish rank and size
+c
+c This is a module in the MPI implementation of LUSSOR
+c pseudo application from the NAS Parallel Benchmarks. 
+c
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+      include 'applu.incl'
+
+      integer nodedim
+      integer IERROR
+
+
+c---------------------------------------------------------------------
+c    initialize MPI communication
+c---------------------------------------------------------------------
+      call MPI_INIT( IERROR )
+
+c---------------------------------------------------------------------
+c   establish the global rank of this process
+c---------------------------------------------------------------------
+      call MPI_COMM_RANK( MPI_COMM_WORLD,
+     >                     id,
+     >                     IERROR )
+
+c---------------------------------------------------------------------
+c   establish the size of the global group
+c---------------------------------------------------------------------
+      call MPI_COMM_SIZE( MPI_COMM_WORLD,
+     >                     num,
+     >                     IERROR )
+
+      ndim   = nodedim(num)
+
+      if (.not. convertdouble) then
+         dp_type = MPI_DOUBLE_PRECISION
+      else
+         dp_type = MPI_REAL
+      endif
+
+
+      return
+      end
diff --git a/examples/smpi/NAS/LU/inputlu.data.sample b/examples/smpi/NAS/LU/inputlu.data.sample
new file mode 100644
index 0000000000..9ef5a7be00
--- /dev/null
+++ b/examples/smpi/NAS/LU/inputlu.data.sample
@@ -0,0 +1,24 @@
+c
+c***controls printing of the progress of iterations: ipr    inorm
+                                                      1      250
+c
+c***the maximum no. of pseudo-time steps to be performed: nitmax
+                                                             250
+c
+c***magnitude of the time step: dt 
+                               2.0e+00
+c
+c***relaxation factor for SSOR iterations: omega
+                                            1.2
+c
+c***tolerance levels for steady-state residuals: tolnwt(m),m=1,5
+                             1.0e-08   1.0e-08   1.0e-08  1.0e-08  1.0e-08 
+c
+c***number of grid points in xi and eta and zeta directions: nx   ny   nz
+                                                            64  64  64
+c
+
+
+
+
+
diff --git a/examples/smpi/NAS/LU/jacld.f b/examples/smpi/NAS/LU/jacld.f
new file mode 100644
index 0000000000..9580d080ab
--- /dev/null
+++ b/examples/smpi/NAS/LU/jacld.f
@@ -0,0 +1,384 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine jacld(k)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+
+c---------------------------------------------------------------------
+c   compute the lower triangular part of the jacobian matrix
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c  input parameters
+c---------------------------------------------------------------------
+      integer k
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer i, j
+      double precision  r43
+      double precision  c1345
+      double precision  c34
+      double precision  tmp1, tmp2, tmp3
+
+
+
+      r43 = ( 4.0d+00 / 3.0d+00 )
+      c1345 = c1 * c3 * c4 * c5
+      c34 = c3 * c4
+
+         do j = jst, jend
+            do i = ist, iend
+
+c---------------------------------------------------------------------
+c   form the block daigonal
+c---------------------------------------------------------------------
+               tmp1 = 1.0d+00 / u(1,i,j,k)
+               tmp2 = tmp1 * tmp1
+               tmp3 = tmp1 * tmp2
+
+               d(1,1,i,j) =  1.0d+00
+     >                       + dt * 2.0d+00 * (   tx1 * dx1
+     >                                          + ty1 * dy1
+     >                                          + tz1 * dz1 )
+               d(1,2,i,j) =  0.0d+00
+               d(1,3,i,j) =  0.0d+00
+               d(1,4,i,j) =  0.0d+00
+               d(1,5,i,j) =  0.0d+00
+
+               d(2,1,i,j) =  dt * 2.0d+00
+     >          * (  tx1 * ( - r43 * c34 * tmp2 * u(2,i,j,k) )
+     >             + ty1 * ( -       c34 * tmp2 * u(2,i,j,k) )
+     >             + tz1 * ( -       c34 * tmp2 * u(2,i,j,k) ) )
+               d(2,2,i,j) =  1.0d+00
+     >          + dt * 2.0d+00 
+     >          * (  tx1 * r43 * c34 * tmp1
+     >             + ty1 *       c34 * tmp1
+     >             + tz1 *       c34 * tmp1 )
+     >          + dt * 2.0d+00 * (   tx1 * dx2
+     >                             + ty1 * dy2
+     >                             + tz1 * dz2  )
+               d(2,3,i,j) = 0.0d+00
+               d(2,4,i,j) = 0.0d+00
+               d(2,5,i,j) = 0.0d+00
+
+               d(3,1,i,j) = dt * 2.0d+00
+     >      * (  tx1 * ( -       c34 * tmp2 * u(3,i,j,k) )
+     >         + ty1 * ( - r43 * c34 * tmp2 * u(3,i,j,k) )
+     >         + tz1 * ( -       c34 * tmp2 * u(3,i,j,k) ) )
+               d(3,2,i,j) = 0.0d+00
+               d(3,3,i,j) = 1.0d+00
+     >         + dt * 2.0d+00
+     >              * (  tx1 *       c34 * tmp1
+     >                 + ty1 * r43 * c34 * tmp1
+     >                 + tz1 *       c34 * tmp1 )
+     >         + dt * 2.0d+00 * (  tx1 * dx3
+     >                           + ty1 * dy3
+     >                           + tz1 * dz3 )
+               d(3,4,i,j) = 0.0d+00
+               d(3,5,i,j) = 0.0d+00
+
+               d(4,1,i,j) = dt * 2.0d+00
+     >      * (  tx1 * ( -       c34 * tmp2 * u(4,i,j,k) )
+     >         + ty1 * ( -       c34 * tmp2 * u(4,i,j,k) )
+     >         + tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k) ) )
+               d(4,2,i,j) = 0.0d+00
+               d(4,3,i,j) = 0.0d+00
+               d(4,4,i,j) = 1.0d+00
+     >         + dt * 2.0d+00
+     >              * (  tx1 *       c34 * tmp1
+     >                 + ty1 *       c34 * tmp1
+     >                 + tz1 * r43 * c34 * tmp1 )
+     >         + dt * 2.0d+00 * (  tx1 * dx4
+     >                           + ty1 * dy4
+     >                           + tz1 * dz4 )
+               d(4,5,i,j) = 0.0d+00
+
+               d(5,1,i,j) = dt * 2.0d+00
+     > * ( tx1 * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
+     >             - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
+     >             - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
+     >             - ( c1345 ) * tmp2 * u(5,i,j,k) )
+     >   + ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
+     >             - ( r43*c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
+     >             - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
+     >             - ( c1345 ) * tmp2 * u(5,i,j,k) )
+     >   + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
+     >             - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
+     >             - ( r43*c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
+     >             - ( c1345 ) * tmp2 * u(5,i,j,k) ) )
+               d(5,2,i,j) = dt * 2.0d+00
+     > * ( tx1 * ( r43*c34 - c1345 ) * tmp2 * u(2,i,j,k)
+     >   + ty1 * (     c34 - c1345 ) * tmp2 * u(2,i,j,k)
+     >   + tz1 * (     c34 - c1345 ) * tmp2 * u(2,i,j,k) )
+               d(5,3,i,j) = dt * 2.0d+00
+     > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k)
+     >   + ty1 * ( r43*c34 -c1345 ) * tmp2 * u(3,i,j,k)
+     >   + tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k) )
+               d(5,4,i,j) = dt * 2.0d+00
+     > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k)
+     >   + ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k)
+     >   + tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k) )
+               d(5,5,i,j) = 1.0d+00
+     >   + dt * 2.0d+00 * ( tx1 * c1345 * tmp1
+     >                    + ty1 * c1345 * tmp1
+     >                    + tz1 * c1345 * tmp1 )
+     >   + dt * 2.0d+00 * (  tx1 * dx5
+     >                    +  ty1 * dy5
+     >                    +  tz1 * dz5 )
+
+c---------------------------------------------------------------------
+c   form the first block sub-diagonal
+c---------------------------------------------------------------------
+               tmp1 = 1.0d+00 / u(1,i,j,k-1)
+               tmp2 = tmp1 * tmp1
+               tmp3 = tmp1 * tmp2
+
+               a(1,1,i,j) = - dt * tz1 * dz1
+               a(1,2,i,j) =   0.0d+00
+               a(1,3,i,j) =   0.0d+00
+               a(1,4,i,j) = - dt * tz2
+               a(1,5,i,j) =   0.0d+00
+
+               a(2,1,i,j) = - dt * tz2
+     >           * ( - ( u(2,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 )
+     >           - dt * tz1 * ( - c34 * tmp2 * u(2,i,j,k-1) )
+               a(2,2,i,j) = - dt * tz2 * ( u(4,i,j,k-1) * tmp1 )
+     >           - dt * tz1 * c34 * tmp1
+     >           - dt * tz1 * dz2 
+               a(2,3,i,j) = 0.0d+00
+               a(2,4,i,j) = - dt * tz2 * ( u(2,i,j,k-1) * tmp1 )
+               a(2,5,i,j) = 0.0d+00
+
+               a(3,1,i,j) = - dt * tz2
+     >           * ( - ( u(3,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 )
+     >           - dt * tz1 * ( - c34 * tmp2 * u(3,i,j,k-1) )
+               a(3,2,i,j) = 0.0d+00
+               a(3,3,i,j) = - dt * tz2 * ( u(4,i,j,k-1) * tmp1 )
+     >           - dt * tz1 * ( c34 * tmp1 )
+     >           - dt * tz1 * dz3
+               a(3,4,i,j) = - dt * tz2 * ( u(3,i,j,k-1) * tmp1 )
+               a(3,5,i,j) = 0.0d+00
+
+               a(4,1,i,j) = - dt * tz2
+     >        * ( - ( u(4,i,j,k-1) * tmp1 ) ** 2
+     >            + 0.50d+00 * c2
+     >            * ( ( u(2,i,j,k-1) * u(2,i,j,k-1)
+     >                + u(3,i,j,k-1) * u(3,i,j,k-1)
+     >                + u(4,i,j,k-1) * u(4,i,j,k-1) ) * tmp2 ) )
+     >        - dt * tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k-1) )
+               a(4,2,i,j) = - dt * tz2
+     >             * ( - c2 * ( u(2,i,j,k-1) * tmp1 ) )
+               a(4,3,i,j) = - dt * tz2
+     >             * ( - c2 * ( u(3,i,j,k-1) * tmp1 ) )
+               a(4,4,i,j) = - dt * tz2 * ( 2.0d+00 - c2 )
+     >             * ( u(4,i,j,k-1) * tmp1 )
+     >             - dt * tz1 * ( r43 * c34 * tmp1 )
+     >             - dt * tz1 * dz4
+               a(4,5,i,j) = - dt * tz2 * c2
+
+               a(5,1,i,j) = - dt * tz2
+     >     * ( ( c2 * (  u(2,i,j,k-1) * u(2,i,j,k-1)
+     >                 + u(3,i,j,k-1) * u(3,i,j,k-1)
+     >                 + u(4,i,j,k-1) * u(4,i,j,k-1) ) * tmp2
+     >       - c1 * ( u(5,i,j,k-1) * tmp1 ) )
+     >            * ( u(4,i,j,k-1) * tmp1 ) )
+     >       - dt * tz1
+     >       * ( - ( c34 - c1345 ) * tmp3 * (u(2,i,j,k-1)**2)
+     >           - ( c34 - c1345 ) * tmp3 * (u(3,i,j,k-1)**2)
+     >           - ( r43*c34 - c1345 )* tmp3 * (u(4,i,j,k-1)**2)
+     >          - c1345 * tmp2 * u(5,i,j,k-1) )
+               a(5,2,i,j) = - dt * tz2
+     >       * ( - c2 * ( u(2,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 )
+     >       - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k-1)
+               a(5,3,i,j) = - dt * tz2
+     >       * ( - c2 * ( u(3,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 )
+     >       - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k-1)
+               a(5,4,i,j) = - dt * tz2
+     >       * ( c1 * ( u(5,i,j,k-1) * tmp1 )
+     >       - 0.50d+00 * c2
+     >       * ( (  u(2,i,j,k-1)*u(2,i,j,k-1)
+     >            + u(3,i,j,k-1)*u(3,i,j,k-1)
+     >            + 3.0d+00*u(4,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 ) )
+     >       - dt * tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k-1)
+               a(5,5,i,j) = - dt * tz2
+     >       * ( c1 * ( u(4,i,j,k-1) * tmp1 ) )
+     >       - dt * tz1 * c1345 * tmp1
+     >       - dt * tz1 * dz5
+
+c---------------------------------------------------------------------
+c   form the second block sub-diagonal
+c---------------------------------------------------------------------
+               tmp1 = 1.0d+00 / u(1,i,j-1,k)
+               tmp2 = tmp1 * tmp1
+               tmp3 = tmp1 * tmp2
+
+               b(1,1,i,j) = - dt * ty1 * dy1
+               b(1,2,i,j) =   0.0d+00
+               b(1,3,i,j) = - dt * ty2
+               b(1,4,i,j) =   0.0d+00
+               b(1,5,i,j) =   0.0d+00
+
+               b(2,1,i,j) = - dt * ty2
+     >           * ( - ( u(2,i,j-1,k)*u(3,i,j-1,k) ) * tmp2 )
+     >           - dt * ty1 * ( - c34 * tmp2 * u(2,i,j-1,k) )
+               b(2,2,i,j) = - dt * ty2 * ( u(3,i,j-1,k) * tmp1 )
+     >          - dt * ty1 * ( c34 * tmp1 )
+     >          - dt * ty1 * dy2
+               b(2,3,i,j) = - dt * ty2 * ( u(2,i,j-1,k) * tmp1 )
+               b(2,4,i,j) = 0.0d+00
+               b(2,5,i,j) = 0.0d+00
+
+               b(3,1,i,j) = - dt * ty2
+     >           * ( - ( u(3,i,j-1,k) * tmp1 ) ** 2
+     >      + 0.50d+00 * c2 * ( (  u(2,i,j-1,k) * u(2,i,j-1,k)
+     >                           + u(3,i,j-1,k) * u(3,i,j-1,k)
+     >                           + u(4,i,j-1,k) * u(4,i,j-1,k) )
+     >                          * tmp2 ) )
+     >       - dt * ty1 * ( - r43 * c34 * tmp2 * u(3,i,j-1,k) )
+               b(3,2,i,j) = - dt * ty2
+     >                   * ( - c2 * ( u(2,i,j-1,k) * tmp1 ) )
+               b(3,3,i,j) = - dt * ty2 * ( ( 2.0d+00 - c2 )
+     >                   * ( u(3,i,j-1,k) * tmp1 ) )
+     >       - dt * ty1 * ( r43 * c34 * tmp1 )
+     >       - dt * ty1 * dy3
+               b(3,4,i,j) = - dt * ty2
+     >                   * ( - c2 * ( u(4,i,j-1,k) * tmp1 ) )
+               b(3,5,i,j) = - dt * ty2 * c2
+
+               b(4,1,i,j) = - dt * ty2
+     >              * ( - ( u(3,i,j-1,k)*u(4,i,j-1,k) ) * tmp2 )
+     >       - dt * ty1 * ( - c34 * tmp2 * u(4,i,j-1,k) )
+               b(4,2,i,j) = 0.0d+00
+               b(4,3,i,j) = - dt * ty2 * ( u(4,i,j-1,k) * tmp1 )
+               b(4,4,i,j) = - dt * ty2 * ( u(3,i,j-1,k) * tmp1 )
+     >                        - dt * ty1 * ( c34 * tmp1 )
+     >                        - dt * ty1 * dy4
+               b(4,5,i,j) = 0.0d+00
+
+               b(5,1,i,j) = - dt * ty2
+     >          * ( ( c2 * (  u(2,i,j-1,k) * u(2,i,j-1,k)
+     >                      + u(3,i,j-1,k) * u(3,i,j-1,k)
+     >                      + u(4,i,j-1,k) * u(4,i,j-1,k) ) * tmp2
+     >               - c1 * ( u(5,i,j-1,k) * tmp1 ) )
+     >          * ( u(3,i,j-1,k) * tmp1 ) )
+     >          - dt * ty1
+     >          * ( - (     c34 - c1345 )*tmp3*(u(2,i,j-1,k)**2)
+     >              - ( r43*c34 - c1345 )*tmp3*(u(3,i,j-1,k)**2)
+     >              - (     c34 - c1345 )*tmp3*(u(4,i,j-1,k)**2)
+     >              - c1345*tmp2*u(5,i,j-1,k) )
+               b(5,2,i,j) = - dt * ty2
+     >          * ( - c2 * ( u(2,i,j-1,k)*u(3,i,j-1,k) ) * tmp2 )
+     >          - dt * ty1
+     >          * ( c34 - c1345 ) * tmp2 * u(2,i,j-1,k)
+               b(5,3,i,j) = - dt * ty2
+     >          * ( c1 * ( u(5,i,j-1,k) * tmp1 )
+     >          - 0.50d+00 * c2 
+     >          * ( (  u(2,i,j-1,k)*u(2,i,j-1,k)
+     >               + 3.0d+00 * u(3,i,j-1,k)*u(3,i,j-1,k)
+     >               + u(4,i,j-1,k)*u(4,i,j-1,k) ) * tmp2 ) )
+     >          - dt * ty1
+     >          * ( r43*c34 - c1345 ) * tmp2 * u(3,i,j-1,k)
+               b(5,4,i,j) = - dt * ty2
+     >          * ( - c2 * ( u(3,i,j-1,k)*u(4,i,j-1,k) ) * tmp2 )
+     >          - dt * ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j-1,k)
+               b(5,5,i,j) = - dt * ty2
+     >          * ( c1 * ( u(3,i,j-1,k) * tmp1 ) )
+     >          - dt * ty1 * c1345 * tmp1
+     >          - dt * ty1 * dy5
+
+c---------------------------------------------------------------------
+c   form the third block sub-diagonal
+c---------------------------------------------------------------------
+               tmp1 = 1.0d+00 / u(1,i-1,j,k)
+               tmp2 = tmp1 * tmp1
+               tmp3 = tmp1 * tmp2
+
+               c(1,1,i,j) = - dt * tx1 * dx1
+               c(1,2,i,j) = - dt * tx2
+               c(1,3,i,j) =   0.0d+00
+               c(1,4,i,j) =   0.0d+00
+               c(1,5,i,j) =   0.0d+00
+
+               c(2,1,i,j) = - dt * tx2
+     >          * ( - ( u(2,i-1,j,k) * tmp1 ) ** 2
+     >     + c2 * 0.50d+00 * (  u(2,i-1,j,k) * u(2,i-1,j,k)
+     >                        + u(3,i-1,j,k) * u(3,i-1,j,k)
+     >                        + u(4,i-1,j,k) * u(4,i-1,j,k) ) * tmp2 )
+     >          - dt * tx1 * ( - r43 * c34 * tmp2 * u(2,i-1,j,k) )
+               c(2,2,i,j) = - dt * tx2
+     >          * ( ( 2.0d+00 - c2 ) * ( u(2,i-1,j,k) * tmp1 ) )
+     >          - dt * tx1 * ( r43 * c34 * tmp1 )
+     >          - dt * tx1 * dx2
+               c(2,3,i,j) = - dt * tx2
+     >              * ( - c2 * ( u(3,i-1,j,k) * tmp1 ) )
+               c(2,4,i,j) = - dt * tx2
+     >              * ( - c2 * ( u(4,i-1,j,k) * tmp1 ) )
+               c(2,5,i,j) = - dt * tx2 * c2 
+
+               c(3,1,i,j) = - dt * tx2
+     >              * ( - ( u(2,i-1,j,k) * u(3,i-1,j,k) ) * tmp2 )
+     >         - dt * tx1 * ( - c34 * tmp2 * u(3,i-1,j,k) )
+               c(3,2,i,j) = - dt * tx2 * ( u(3,i-1,j,k) * tmp1 )
+               c(3,3,i,j) = - dt * tx2 * ( u(2,i-1,j,k) * tmp1 )
+     >          - dt * tx1 * ( c34 * tmp1 )
+     >          - dt * tx1 * dx3
+               c(3,4,i,j) = 0.0d+00
+               c(3,5,i,j) = 0.0d+00
+
+               c(4,1,i,j) = - dt * tx2
+     >          * ( - ( u(2,i-1,j,k)*u(4,i-1,j,k) ) * tmp2 )
+     >          - dt * tx1 * ( - c34 * tmp2 * u(4,i-1,j,k) )
+               c(4,2,i,j) = - dt * tx2 * ( u(4,i-1,j,k) * tmp1 )
+               c(4,3,i,j) = 0.0d+00
+               c(4,4,i,j) = - dt * tx2 * ( u(2,i-1,j,k) * tmp1 )
+     >          - dt * tx1 * ( c34 * tmp1 )
+     >          - dt * tx1 * dx4
+               c(4,5,i,j) = 0.0d+00
+
+               c(5,1,i,j) = - dt * tx2
+     >          * ( ( c2 * (  u(2,i-1,j,k) * u(2,i-1,j,k)
+     >                      + u(3,i-1,j,k) * u(3,i-1,j,k)
+     >                      + u(4,i-1,j,k) * u(4,i-1,j,k) ) * tmp2
+     >              - c1 * ( u(5,i-1,j,k) * tmp1 ) )
+     >          * ( u(2,i-1,j,k) * tmp1 ) )
+     >          - dt * tx1
+     >          * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i-1,j,k)**2 )
+     >              - (     c34 - c1345 ) * tmp3 * ( u(3,i-1,j,k)**2 )
+     >              - (     c34 - c1345 ) * tmp3 * ( u(4,i-1,j,k)**2 )
+     >              - c1345 * tmp2 * u(5,i-1,j,k) )
+               c(5,2,i,j) = - dt * tx2
+     >          * ( c1 * ( u(5,i-1,j,k) * tmp1 )
+     >             - 0.50d+00 * c2
+     >             * ( (  3.0d+00*u(2,i-1,j,k)*u(2,i-1,j,k)
+     >                  + u(3,i-1,j,k)*u(3,i-1,j,k)
+     >                  + u(4,i-1,j,k)*u(4,i-1,j,k) ) * tmp2 ) )
+     >           - dt * tx1
+     >           * ( r43*c34 - c1345 ) * tmp2 * u(2,i-1,j,k)
+               c(5,3,i,j) = - dt * tx2
+     >           * ( - c2 * ( u(3,i-1,j,k)*u(2,i-1,j,k) ) * tmp2 )
+     >           - dt * tx1
+     >           * (  c34 - c1345 ) * tmp2 * u(3,i-1,j,k)
+               c(5,4,i,j) = - dt * tx2
+     >           * ( - c2 * ( u(4,i-1,j,k)*u(2,i-1,j,k) ) * tmp2 )
+     >           - dt * tx1
+     >           * (  c34 - c1345 ) * tmp2 * u(4,i-1,j,k)
+               c(5,5,i,j) = - dt * tx2
+     >           * ( c1 * ( u(2,i-1,j,k) * tmp1 ) )
+     >           - dt * tx1 * c1345 * tmp1
+     >           - dt * tx1 * dx5
+
+            end do
+         end do
+
+      return
+      end
diff --git a/examples/smpi/NAS/LU/jacu.f b/examples/smpi/NAS/LU/jacu.f
new file mode 100644
index 0000000000..6a3c5b8ddb
--- /dev/null
+++ b/examples/smpi/NAS/LU/jacu.f
@@ -0,0 +1,384 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine jacu(k)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   compute the upper triangular part of the jacobian matrix
+c---------------------------------------------------------------------
+
+
+      implicit none
+
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c  input parameters
+c---------------------------------------------------------------------
+      integer k
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer i, j
+      double precision  r43
+      double precision  c1345
+      double precision  c34
+      double precision  tmp1, tmp2, tmp3
+
+
+
+      r43 = ( 4.0d+00 / 3.0d+00 )
+      c1345 = c1 * c3 * c4 * c5
+      c34 = c3 * c4
+
+         do j = jst, jend
+            do i = ist, iend
+
+c---------------------------------------------------------------------
+c   form the block daigonal
+c---------------------------------------------------------------------
+               tmp1 = 1.0d+00 / u(1,i,j,k)
+               tmp2 = tmp1 * tmp1
+               tmp3 = tmp1 * tmp2
+
+               d(1,1,i,j) =  1.0d+00
+     >                       + dt * 2.0d+00 * (   tx1 * dx1
+     >                                          + ty1 * dy1
+     >                                          + tz1 * dz1 )
+               d(1,2,i,j) =  0.0d+00
+               d(1,3,i,j) =  0.0d+00
+               d(1,4,i,j) =  0.0d+00
+               d(1,5,i,j) =  0.0d+00
+
+               d(2,1,i,j) =  dt * 2.0d+00
+     >          * (  tx1 * ( - r43 * c34 * tmp2 * u(2,i,j,k) )
+     >             + ty1 * ( -       c34 * tmp2 * u(2,i,j,k) )
+     >             + tz1 * ( -       c34 * tmp2 * u(2,i,j,k) ) )
+               d(2,2,i,j) =  1.0d+00
+     >          + dt * 2.0d+00 
+     >          * (  tx1 * r43 * c34 * tmp1
+     >             + ty1 *       c34 * tmp1
+     >             + tz1 *       c34 * tmp1 )
+     >          + dt * 2.0d+00 * (   tx1 * dx2
+     >                             + ty1 * dy2
+     >                             + tz1 * dz2  )
+               d(2,3,i,j) = 0.0d+00
+               d(2,4,i,j) = 0.0d+00
+               d(2,5,i,j) = 0.0d+00
+
+               d(3,1,i,j) = dt * 2.0d+00
+     >      * (  tx1 * ( -       c34 * tmp2 * u(3,i,j,k) )
+     >         + ty1 * ( - r43 * c34 * tmp2 * u(3,i,j,k) )
+     >         + tz1 * ( -       c34 * tmp2 * u(3,i,j,k) ) )
+               d(3,2,i,j) = 0.0d+00
+               d(3,3,i,j) = 1.0d+00
+     >         + dt * 2.0d+00
+     >              * (  tx1 *       c34 * tmp1
+     >                 + ty1 * r43 * c34 * tmp1
+     >                 + tz1 *       c34 * tmp1 )
+     >         + dt * 2.0d+00 * (  tx1 * dx3
+     >                           + ty1 * dy3
+     >                           + tz1 * dz3 )
+               d(3,4,i,j) = 0.0d+00
+               d(3,5,i,j) = 0.0d+00
+
+               d(4,1,i,j) = dt * 2.0d+00
+     >      * (  tx1 * ( -       c34 * tmp2 * u(4,i,j,k) )
+     >         + ty1 * ( -       c34 * tmp2 * u(4,i,j,k) )
+     >         + tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k) ) )
+               d(4,2,i,j) = 0.0d+00
+               d(4,3,i,j) = 0.0d+00
+               d(4,4,i,j) = 1.0d+00
+     >         + dt * 2.0d+00
+     >              * (  tx1 *       c34 * tmp1
+     >                 + ty1 *       c34 * tmp1
+     >                 + tz1 * r43 * c34 * tmp1 )
+     >         + dt * 2.0d+00 * (  tx1 * dx4
+     >                           + ty1 * dy4
+     >                           + tz1 * dz4 )
+               d(4,5,i,j) = 0.0d+00
+
+               d(5,1,i,j) = dt * 2.0d+00
+     > * ( tx1 * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
+     >             - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
+     >             - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
+     >             - ( c1345 ) * tmp2 * u(5,i,j,k) )
+     >   + ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
+     >             - ( r43*c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
+     >             - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
+     >             - ( c1345 ) * tmp2 * u(5,i,j,k) )
+     >   + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
+     >             - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
+     >             - ( r43*c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
+     >             - ( c1345 ) * tmp2 * u(5,i,j,k) ) )
+               d(5,2,i,j) = dt * 2.0d+00
+     > * ( tx1 * ( r43*c34 - c1345 ) * tmp2 * u(2,i,j,k)
+     >   + ty1 * (     c34 - c1345 ) * tmp2 * u(2,i,j,k)
+     >   + tz1 * (     c34 - c1345 ) * tmp2 * u(2,i,j,k) )
+               d(5,3,i,j) = dt * 2.0d+00
+     > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k)
+     >   + ty1 * ( r43*c34 -c1345 ) * tmp2 * u(3,i,j,k)
+     >   + tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k) )
+               d(5,4,i,j) = dt * 2.0d+00
+     > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k)
+     >   + ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k)
+     >   + tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k) )
+               d(5,5,i,j) = 1.0d+00
+     >   + dt * 2.0d+00 * ( tx1 * c1345 * tmp1
+     >                    + ty1 * c1345 * tmp1
+     >                    + tz1 * c1345 * tmp1 )
+     >   + dt * 2.0d+00 * (  tx1 * dx5
+     >                    +  ty1 * dy5
+     >                    +  tz1 * dz5 )
+
+c---------------------------------------------------------------------
+c   form the first block sub-diagonal
+c---------------------------------------------------------------------
+               tmp1 = 1.0d+00 / u(1,i+1,j,k)
+               tmp2 = tmp1 * tmp1
+               tmp3 = tmp1 * tmp2
+
+               a(1,1,i,j) = - dt * tx1 * dx1
+               a(1,2,i,j) =   dt * tx2
+               a(1,3,i,j) =   0.0d+00
+               a(1,4,i,j) =   0.0d+00
+               a(1,5,i,j) =   0.0d+00
+
+               a(2,1,i,j) =  dt * tx2
+     >          * ( - ( u(2,i+1,j,k) * tmp1 ) ** 2
+     >     + c2 * 0.50d+00 * (  u(2,i+1,j,k) * u(2,i+1,j,k)
+     >                        + u(3,i+1,j,k) * u(3,i+1,j,k)
+     >                        + u(4,i+1,j,k) * u(4,i+1,j,k) ) * tmp2 )
+     >          - dt * tx1 * ( - r43 * c34 * tmp2 * u(2,i+1,j,k) )
+               a(2,2,i,j) =  dt * tx2
+     >          * ( ( 2.0d+00 - c2 ) * ( u(2,i+1,j,k) * tmp1 ) )
+     >          - dt * tx1 * ( r43 * c34 * tmp1 )
+     >          - dt * tx1 * dx2
+               a(2,3,i,j) =  dt * tx2
+     >              * ( - c2 * ( u(3,i+1,j,k) * tmp1 ) )
+               a(2,4,i,j) =  dt * tx2
+     >              * ( - c2 * ( u(4,i+1,j,k) * tmp1 ) )
+               a(2,5,i,j) =  dt * tx2 * c2 
+
+               a(3,1,i,j) =  dt * tx2
+     >              * ( - ( u(2,i+1,j,k) * u(3,i+1,j,k) ) * tmp2 )
+     >         - dt * tx1 * ( - c34 * tmp2 * u(3,i+1,j,k) )
+               a(3,2,i,j) =  dt * tx2 * ( u(3,i+1,j,k) * tmp1 )
+               a(3,3,i,j) =  dt * tx2 * ( u(2,i+1,j,k) * tmp1 )
+     >          - dt * tx1 * ( c34 * tmp1 )
+     >          - dt * tx1 * dx3
+               a(3,4,i,j) = 0.0d+00
+               a(3,5,i,j) = 0.0d+00
+
+               a(4,1,i,j) = dt * tx2
+     >          * ( - ( u(2,i+1,j,k)*u(4,i+1,j,k) ) * tmp2 )
+     >          - dt * tx1 * ( - c34 * tmp2 * u(4,i+1,j,k) )
+               a(4,2,i,j) = dt * tx2 * ( u(4,i+1,j,k) * tmp1 )
+               a(4,3,i,j) = 0.0d+00
+               a(4,4,i,j) = dt * tx2 * ( u(2,i+1,j,k) * tmp1 )
+     >          - dt * tx1 * ( c34 * tmp1 )
+     >          - dt * tx1 * dx4
+               a(4,5,i,j) = 0.0d+00
+
+               a(5,1,i,j) = dt * tx2
+     >          * ( ( c2 * (  u(2,i+1,j,k) * u(2,i+1,j,k)
+     >                      + u(3,i+1,j,k) * u(3,i+1,j,k)
+     >                      + u(4,i+1,j,k) * u(4,i+1,j,k) ) * tmp2
+     >              - c1 * ( u(5,i+1,j,k) * tmp1 ) )
+     >          * ( u(2,i+1,j,k) * tmp1 ) )
+     >          - dt * tx1
+     >          * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i+1,j,k)**2 )
+     >              - (     c34 - c1345 ) * tmp3 * ( u(3,i+1,j,k)**2 )
+     >              - (     c34 - c1345 ) * tmp3 * ( u(4,i+1,j,k)**2 )
+     >              - c1345 * tmp2 * u(5,i+1,j,k) )
+               a(5,2,i,j) = dt * tx2
+     >          * ( c1 * ( u(5,i+1,j,k) * tmp1 )
+     >             - 0.50d+00 * c2
+     >             * ( (  3.0d+00*u(2,i+1,j,k)*u(2,i+1,j,k)
+     >                  + u(3,i+1,j,k)*u(3,i+1,j,k)
+     >                  + u(4,i+1,j,k)*u(4,i+1,j,k) ) * tmp2 ) )
+     >           - dt * tx1
+     >           * ( r43*c34 - c1345 ) * tmp2 * u(2,i+1,j,k)
+               a(5,3,i,j) = dt * tx2
+     >           * ( - c2 * ( u(3,i+1,j,k)*u(2,i+1,j,k) ) * tmp2 )
+     >           - dt * tx1
+     >           * (  c34 - c1345 ) * tmp2 * u(3,i+1,j,k)
+               a(5,4,i,j) = dt * tx2
+     >           * ( - c2 * ( u(4,i+1,j,k)*u(2,i+1,j,k) ) * tmp2 )
+     >           - dt * tx1
+     >           * (  c34 - c1345 ) * tmp2 * u(4,i+1,j,k)
+               a(5,5,i,j) = dt * tx2
+     >           * ( c1 * ( u(2,i+1,j,k) * tmp1 ) )
+     >           - dt * tx1 * c1345 * tmp1
+     >           - dt * tx1 * dx5
+
+c---------------------------------------------------------------------
+c   form the second block sub-diagonal
+c---------------------------------------------------------------------
+               tmp1 = 1.0d+00 / u(1,i,j+1,k)
+               tmp2 = tmp1 * tmp1
+               tmp3 = tmp1 * tmp2
+
+               b(1,1,i,j) = - dt * ty1 * dy1
+               b(1,2,i,j) =   0.0d+00
+               b(1,3,i,j) =  dt * ty2
+               b(1,4,i,j) =   0.0d+00
+               b(1,5,i,j) =   0.0d+00
+
+               b(2,1,i,j) =  dt * ty2
+     >           * ( - ( u(2,i,j+1,k)*u(3,i,j+1,k) ) * tmp2 )
+     >           - dt * ty1 * ( - c34 * tmp2 * u(2,i,j+1,k) )
+               b(2,2,i,j) =  dt * ty2 * ( u(3,i,j+1,k) * tmp1 )
+     >          - dt * ty1 * ( c34 * tmp1 )
+     >          - dt * ty1 * dy2
+               b(2,3,i,j) =  dt * ty2 * ( u(2,i,j+1,k) * tmp1 )
+               b(2,4,i,j) = 0.0d+00
+               b(2,5,i,j) = 0.0d+00
+
+               b(3,1,i,j) =  dt * ty2
+     >           * ( - ( u(3,i,j+1,k) * tmp1 ) ** 2
+     >      + 0.50d+00 * c2 * ( (  u(2,i,j+1,k) * u(2,i,j+1,k)
+     >                           + u(3,i,j+1,k) * u(3,i,j+1,k)
+     >                           + u(4,i,j+1,k) * u(4,i,j+1,k) )
+     >                          * tmp2 ) )
+     >       - dt * ty1 * ( - r43 * c34 * tmp2 * u(3,i,j+1,k) )
+               b(3,2,i,j) =  dt * ty2
+     >                   * ( - c2 * ( u(2,i,j+1,k) * tmp1 ) )
+               b(3,3,i,j) =  dt * ty2 * ( ( 2.0d+00 - c2 )
+     >                   * ( u(3,i,j+1,k) * tmp1 ) )
+     >       - dt * ty1 * ( r43 * c34 * tmp1 )
+     >       - dt * ty1 * dy3
+               b(3,4,i,j) =  dt * ty2
+     >                   * ( - c2 * ( u(4,i,j+1,k) * tmp1 ) )
+               b(3,5,i,j) =  dt * ty2 * c2
+
+               b(4,1,i,j) =  dt * ty2
+     >              * ( - ( u(3,i,j+1,k)*u(4,i,j+1,k) ) * tmp2 )
+     >       - dt * ty1 * ( - c34 * tmp2 * u(4,i,j+1,k) )
+               b(4,2,i,j) = 0.0d+00
+               b(4,3,i,j) =  dt * ty2 * ( u(4,i,j+1,k) * tmp1 )
+               b(4,4,i,j) =  dt * ty2 * ( u(3,i,j+1,k) * tmp1 )
+     >                        - dt * ty1 * ( c34 * tmp1 )
+     >                        - dt * ty1 * dy4
+               b(4,5,i,j) = 0.0d+00
+
+               b(5,1,i,j) =  dt * ty2
+     >          * ( ( c2 * (  u(2,i,j+1,k) * u(2,i,j+1,k)
+     >                      + u(3,i,j+1,k) * u(3,i,j+1,k)
+     >                      + u(4,i,j+1,k) * u(4,i,j+1,k) ) * tmp2
+     >               - c1 * ( u(5,i,j+1,k) * tmp1 ) )
+     >          * ( u(3,i,j+1,k) * tmp1 ) )
+     >          - dt * ty1
+     >          * ( - (     c34 - c1345 )*tmp3*(u(2,i,j+1,k)**2)
+     >              - ( r43*c34 - c1345 )*tmp3*(u(3,i,j+1,k)**2)
+     >              - (     c34 - c1345 )*tmp3*(u(4,i,j+1,k)**2)
+     >              - c1345*tmp2*u(5,i,j+1,k) )
+               b(5,2,i,j) =  dt * ty2
+     >          * ( - c2 * ( u(2,i,j+1,k)*u(3,i,j+1,k) ) * tmp2 )
+     >          - dt * ty1
+     >          * ( c34 - c1345 ) * tmp2 * u(2,i,j+1,k)
+               b(5,3,i,j) =  dt * ty2
+     >          * ( c1 * ( u(5,i,j+1,k) * tmp1 )
+     >          - 0.50d+00 * c2 
+     >          * ( (  u(2,i,j+1,k)*u(2,i,j+1,k)
+     >               + 3.0d+00 * u(3,i,j+1,k)*u(3,i,j+1,k)
+     >               + u(4,i,j+1,k)*u(4,i,j+1,k) ) * tmp2 ) )
+     >          - dt * ty1
+     >          * ( r43*c34 - c1345 ) * tmp2 * u(3,i,j+1,k)
+               b(5,4,i,j) =  dt * ty2
+     >          * ( - c2 * ( u(3,i,j+1,k)*u(4,i,j+1,k) ) * tmp2 )
+     >          - dt * ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j+1,k)
+               b(5,5,i,j) =  dt * ty2
+     >          * ( c1 * ( u(3,i,j+1,k) * tmp1 ) )
+     >          - dt * ty1 * c1345 * tmp1
+     >          - dt * ty1 * dy5
+
+c---------------------------------------------------------------------
+c   form the third block sub-diagonal
+c---------------------------------------------------------------------
+               tmp1 = 1.0d+00 / u(1,i,j,k+1)
+               tmp2 = tmp1 * tmp1
+               tmp3 = tmp1 * tmp2
+
+               c(1,1,i,j) = - dt * tz1 * dz1
+               c(1,2,i,j) =   0.0d+00
+               c(1,3,i,j) =   0.0d+00
+               c(1,4,i,j) = dt * tz2
+               c(1,5,i,j) =   0.0d+00
+
+               c(2,1,i,j) = dt * tz2
+     >           * ( - ( u(2,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 )
+     >           - dt * tz1 * ( - c34 * tmp2 * u(2,i,j,k+1) )
+               c(2,2,i,j) = dt * tz2 * ( u(4,i,j,k+1) * tmp1 )
+     >           - dt * tz1 * c34 * tmp1
+     >           - dt * tz1 * dz2 
+               c(2,3,i,j) = 0.0d+00
+               c(2,4,i,j) = dt * tz2 * ( u(2,i,j,k+1) * tmp1 )
+               c(2,5,i,j) = 0.0d+00
+
+               c(3,1,i,j) = dt * tz2
+     >           * ( - ( u(3,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 )
+     >           - dt * tz1 * ( - c34 * tmp2 * u(3,i,j,k+1) )
+               c(3,2,i,j) = 0.0d+00
+               c(3,3,i,j) = dt * tz2 * ( u(4,i,j,k+1) * tmp1 )
+     >           - dt * tz1 * ( c34 * tmp1 )
+     >           - dt * tz1 * dz3
+               c(3,4,i,j) = dt * tz2 * ( u(3,i,j,k+1) * tmp1 )
+               c(3,5,i,j) = 0.0d+00
+
+               c(4,1,i,j) = dt * tz2
+     >        * ( - ( u(4,i,j,k+1) * tmp1 ) ** 2
+     >            + 0.50d+00 * c2
+     >            * ( ( u(2,i,j,k+1) * u(2,i,j,k+1)
+     >                + u(3,i,j,k+1) * u(3,i,j,k+1)
+     >                + u(4,i,j,k+1) * u(4,i,j,k+1) ) * tmp2 ) )
+     >        - dt * tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k+1) )
+               c(4,2,i,j) = dt * tz2
+     >             * ( - c2 * ( u(2,i,j,k+1) * tmp1 ) )
+               c(4,3,i,j) = dt * tz2
+     >             * ( - c2 * ( u(3,i,j,k+1) * tmp1 ) )
+               c(4,4,i,j) = dt * tz2 * ( 2.0d+00 - c2 )
+     >             * ( u(4,i,j,k+1) * tmp1 )
+     >             - dt * tz1 * ( r43 * c34 * tmp1 )
+     >             - dt * tz1 * dz4
+               c(4,5,i,j) = dt * tz2 * c2
+
+               c(5,1,i,j) = dt * tz2
+     >     * ( ( c2 * (  u(2,i,j,k+1) * u(2,i,j,k+1)
+     >                 + u(3,i,j,k+1) * u(3,i,j,k+1)
+     >                 + u(4,i,j,k+1) * u(4,i,j,k+1) ) * tmp2
+     >       - c1 * ( u(5,i,j,k+1) * tmp1 ) )
+     >            * ( u(4,i,j,k+1) * tmp1 ) )
+     >       - dt * tz1
+     >       * ( - ( c34 - c1345 ) * tmp3 * (u(2,i,j,k+1)**2)
+     >           - ( c34 - c1345 ) * tmp3 * (u(3,i,j,k+1)**2)
+     >           - ( r43*c34 - c1345 )* tmp3 * (u(4,i,j,k+1)**2)
+     >          - c1345 * tmp2 * u(5,i,j,k+1) )
+               c(5,2,i,j) = dt * tz2
+     >       * ( - c2 * ( u(2,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 )
+     >       - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k+1)
+               c(5,3,i,j) = dt * tz2
+     >       * ( - c2 * ( u(3,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 )
+     >       - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k+1)
+               c(5,4,i,j) = dt * tz2
+     >       * ( c1 * ( u(5,i,j,k+1) * tmp1 )
+     >       - 0.50d+00 * c2
+     >       * ( (  u(2,i,j,k+1)*u(2,i,j,k+1)
+     >            + u(3,i,j,k+1)*u(3,i,j,k+1)
+     >            + 3.0d+00*u(4,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 ) )
+     >       - dt * tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k+1)
+               c(5,5,i,j) = dt * tz2
+     >       * ( c1 * ( u(4,i,j,k+1) * tmp1 ) )
+     >       - dt * tz1 * c1345 * tmp1
+     >       - dt * tz1 * dz5
+
+            end do
+         end do
+
+      return
+      end
diff --git a/examples/smpi/NAS/LU/l2norm.f b/examples/smpi/NAS/LU/l2norm.f
new file mode 100644
index 0000000000..147b21deb4
--- /dev/null
+++ b/examples/smpi/NAS/LU/l2norm.f
@@ -0,0 +1,68 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      subroutine l2norm ( ldx, ldy, ldz, 
+     >                    nx0, ny0, nz0,
+     >                    ist, iend, 
+     >                    jst, jend,
+     >                    v, sum )
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   to compute the l2-norm of vector v.
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+
+c---------------------------------------------------------------------
+c  input parameters
+c---------------------------------------------------------------------
+      integer ldx, ldy, ldz
+      integer nx0, ny0, nz0
+      integer ist, iend
+      integer jst, jend
+      double precision  v(5,-1:ldx+2,-1:ldy+2,*), sum(5)
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer i, j, k, m
+      double precision  dummy(5)
+
+      integer IERROR
+
+
+      do m = 1, 5
+         dummy(m) = 0.0d+00
+      end do
+
+      do k = 2, nz0-1
+         do j = jst, jend
+            do i = ist, iend
+               do m = 1, 5
+                  dummy(m) = dummy(m) + v(m,i,j,k) * v(m,i,j,k)
+               end do
+            end do
+         end do
+      end do
+
+c---------------------------------------------------------------------
+c   compute the global sum of individual contributions to dot product.
+c---------------------------------------------------------------------
+      call MPI_ALLREDUCE( dummy,
+     >                    sum,
+     >                    5,
+     >                    dp_type,
+     >                    MPI_SUM,
+     >                    MPI_COMM_WORLD,
+     >                    IERROR )
+
+      do m = 1, 5
+         sum(m) = sqrt ( sum(m) / ( (nx0-2)*(ny0-2)*(nz0-2) ) )
+      end do
+
+      return
+      end
diff --git a/examples/smpi/NAS/LU/lu.f b/examples/smpi/NAS/LU/lu.f
new file mode 100644
index 0000000000..543463a07b
--- /dev/null
+++ b/examples/smpi/NAS/LU/lu.f
@@ -0,0 +1,164 @@
+!-------------------------------------------------------------------------!
+!                                                                         !
+!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
+!                                                                         !
+!                                   L U                                   !
+!                                                                         !
+!-------------------------------------------------------------------------!
+!                                                                         !
+!    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
+!    It is described in NAS Technical Reports 95-020 and 02-007           !
+!                                                                         !
+!    Permission to use, copy, distribute and modify this software         !
+!    for any purpose with or without fee is hereby granted.  We           !
+!    request, however, that all derived work reference the NAS            !
+!    Parallel Benchmarks 3.3. This software is provided "as is"           !
+!    without express or implied warranty.                                 !
+!                                                                         !
+!    Information on NPB 3.3, including the technical report, the          !
+!    original specifications, source code, results and information        !
+!    on how to submit new results, is available at:                       !
+!                                                                         !
+!           http://www.nas.nasa.gov/Software/NPB/                         !
+!                                                                         !
+!    Send comments or suggestions to  npb@nas.nasa.gov                    !
+!                                                                         !
+!          NAS Parallel Benchmarks Group                                  !
+!          NASA Ames Research Center                                      !
+!          Mail Stop: T27A-1                                              !
+!          Moffett Field, CA   94035-1000                                 !
+!                                                                         !
+!          E-mail:  npb@nas.nasa.gov                                      !
+!          Fax:     (650) 604-3957                                        !
+!                                                                         !
+!-------------------------------------------------------------------------!
+
+c---------------------------------------------------------------------
+c
+c Authors: S. Weeratunga
+c          V. Venkatakrishnan
+c          E. Barszcz
+c          M. Yarrow
+c
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+      program applu
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c   driver for the performance evaluation of the solver for
+c   five coupled parabolic/elliptic partial differential equations.
+c
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'applu.incl'
+      character class
+      logical verified
+      double precision mflops
+      integer ierr
+
+c---------------------------------------------------------------------
+c   initialize communications
+c---------------------------------------------------------------------
+      call init_comm()
+
+c---------------------------------------------------------------------
+c   read input data
+c---------------------------------------------------------------------
+      call read_input()
+
+c---------------------------------------------------------------------
+c   set up processor grid
+c---------------------------------------------------------------------
+      call proc_grid()
+
+c---------------------------------------------------------------------
+c   determine the neighbors
+c---------------------------------------------------------------------
+      call neighbors()
+
+c---------------------------------------------------------------------
+c   set up sub-domain sizes
+c---------------------------------------------------------------------
+      call subdomain()
+
+c---------------------------------------------------------------------
+c   set up coefficients
+c---------------------------------------------------------------------
+      call setcoeff()
+
+c---------------------------------------------------------------------
+c   set the masks required for comm
+c---------------------------------------------------------------------
+      call sethyper()
+
+c---------------------------------------------------------------------
+c   set the boundary values for dependent variables
+c---------------------------------------------------------------------
+      call setbv()
+
+c---------------------------------------------------------------------
+c   set the initial values for dependent variables
+c---------------------------------------------------------------------
+      call setiv()
+
+c---------------------------------------------------------------------
+c   compute the forcing term based on prescribed exact solution
+c---------------------------------------------------------------------
+      call erhs()
+
+c---------------------------------------------------------------------
+c   perform one SSOR iteration to touch all data and program pages 
+c---------------------------------------------------------------------
+      call ssor(1)
+
+c---------------------------------------------------------------------
+c   reset the boundary and initial values
+c---------------------------------------------------------------------
+      call setbv()
+      call setiv()
+
+c---------------------------------------------------------------------
+c   perform the SSOR iterations
+c---------------------------------------------------------------------
+      call ssor(itmax)
+
+c---------------------------------------------------------------------
+c   compute the solution error
+c---------------------------------------------------------------------
+      call error()
+
+c---------------------------------------------------------------------
+c   compute the surface integral
+c---------------------------------------------------------------------
+      call pintgr()
+
+c---------------------------------------------------------------------
+c   verification test
+c---------------------------------------------------------------------
+      IF (id.eq.0) THEN
+         call verify ( rsdnm, errnm, frc, class, verified )
+         mflops = float(itmax)*(1984.77*float( nx0 )
+     >        *float( ny0 )
+     >        *float( nz0 )
+     >        -10923.3*(float( nx0+ny0+nz0 )/3.)**2 
+     >        +27770.9* float( nx0+ny0+nz0 )/3.
+     >        -144010.)
+     >        / (maxtime*1000000.)
+
+         call print_results('LU', class, nx0,
+     >     ny0, nz0, itmax, nnodes_compiled,
+     >     num, maxtime, mflops, '          floating point', verified, 
+     >     npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, 
+     >     '(none)')
+
+      END IF
+
+      call mpi_finalize(ierr)
+      end
+
+
diff --git a/examples/smpi/NAS/LU/mpinpb.h b/examples/smpi/NAS/LU/mpinpb.h
new file mode 100644
index 0000000000..ddbf1515f0
--- /dev/null
+++ b/examples/smpi/NAS/LU/mpinpb.h
@@ -0,0 +1,11 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'mpif.h'
+
+      integer           node, no_nodes, root, comm_setup, 
+     >                  comm_solve, comm_rhs, dp_type
+      common /mpistuff/ node, no_nodes, root, comm_setup, 
+     >                  comm_solve, comm_rhs, dp_type
+
diff --git a/examples/smpi/NAS/LU/neighbors.f b/examples/smpi/NAS/LU/neighbors.f
new file mode 100644
index 0000000000..ed8a3126af
--- /dev/null
+++ b/examples/smpi/NAS/LU/neighbors.f
@@ -0,0 +1,48 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine neighbors ()
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c     figure out the neighbors and their wrap numbers for each processor
+c---------------------------------------------------------------------
+
+        south = -1
+        east  = -1
+        north = -1
+        west  = -1
+
+      if (row.gt.1) then
+              north = id -1
+      else
+              north = -1
+      end if
+
+      if (row.lt.xdim) then
+              south = id + 1
+      else
+              south = -1
+      end if
+
+      if (col.gt.1) then
+              west = id- xdim
+      else
+              west = -1
+      end if
+
+      if (col.lt.ydim) then
+              east = id + xdim
+      else 
+              east = -1
+      end if
+
+      return
+      end
diff --git a/examples/smpi/NAS/LU/nodedim.f b/examples/smpi/NAS/LU/nodedim.f
new file mode 100644
index 0000000000..f4def3a01e
--- /dev/null
+++ b/examples/smpi/NAS/LU/nodedim.f
@@ -0,0 +1,36 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      integer function nodedim(num)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c  compute the exponent where num = 2**nodedim
+c  NOTE: assumes a power-of-two number of nodes
+c
+c---------------------------------------------------------------------
+
+      implicit none
+
+c---------------------------------------------------------------------
+c  input parameters
+c---------------------------------------------------------------------
+      integer num
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      double precision fnum
+
+
+      fnum = dble(num)
+      nodedim = log(fnum)/log(2.0d+0) + 0.00001
+
+      return
+      end
+
+
diff --git a/examples/smpi/NAS/LU/pintgr.f b/examples/smpi/NAS/LU/pintgr.f
new file mode 100644
index 0000000000..de514cccd2
--- /dev/null
+++ b/examples/smpi/NAS/LU/pintgr.f
@@ -0,0 +1,288 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine pintgr
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer i, j, k
+      integer ibeg, ifin, ifin1
+      integer jbeg, jfin, jfin1
+      integer iglob, iglob1, iglob2
+      integer jglob, jglob1, jglob2
+      integer ind1, ind2
+      double precision  phi1(0:isiz2+1,0:isiz3+1),
+     >                  phi2(0:isiz2+1,0:isiz3+1)
+      double precision  frc1, frc2, frc3
+      double precision  dummy
+
+      integer IERROR
+
+
+c---------------------------------------------------------------------
+c   set up the sub-domains for integeration in each processor
+c---------------------------------------------------------------------
+      ibeg = nx + 1
+      ifin = 0
+      iglob1 = ipt + 1
+      iglob2 = ipt + nx
+      if (iglob1.ge.ii1.and.iglob2.lt.ii2+nx) ibeg = 1
+      if (iglob1.gt.ii1-nx.and.iglob2.le.ii2) ifin = nx
+      if (ii1.ge.iglob1.and.ii1.le.iglob2) ibeg = ii1 - ipt
+      if (ii2.ge.iglob1.and.ii2.le.iglob2) ifin = ii2 - ipt
+      jbeg = ny + 1
+      jfin = 0
+      jglob1 = jpt + 1
+      jglob2 = jpt + ny
+      if (jglob1.ge.ji1.and.jglob2.lt.ji2+ny) jbeg = 1
+      if (jglob1.gt.ji1-ny.and.jglob2.le.ji2) jfin = ny
+      if (ji1.ge.jglob1.and.ji1.le.jglob2) jbeg = ji1 - jpt
+      if (ji2.ge.jglob1.and.ji2.le.jglob2) jfin = ji2 - jpt
+      ifin1 = ifin
+      jfin1 = jfin
+      if (ipt + ifin1.eq.ii2) ifin1 = ifin -1
+      if (jpt + jfin1.eq.ji2) jfin1 = jfin -1
+
+c---------------------------------------------------------------------
+c   initialize
+c---------------------------------------------------------------------
+      do i = 0,isiz2+1
+        do k = 0,isiz3+1
+          phi1(i,k) = 0.
+          phi2(i,k) = 0.
+        end do
+      end do
+
+      do j = jbeg,jfin
+         jglob = jpt + j
+         do i = ibeg,ifin
+            iglob = ipt + i
+
+            k = ki1
+
+            phi1(i,j) = c2*(  u(5,i,j,k)
+     >           - 0.50d+00 * (  u(2,i,j,k) ** 2
+     >                         + u(3,i,j,k) ** 2
+     >                         + u(4,i,j,k) ** 2 )
+     >                        / u(1,i,j,k) )
+
+            k = ki2
+
+            phi2(i,j) = c2*(  u(5,i,j,k)
+     >           - 0.50d+00 * (  u(2,i,j,k) ** 2
+     >                         + u(3,i,j,k) ** 2
+     >                         + u(4,i,j,k) ** 2 )
+     >                        / u(1,i,j,k) )
+         end do
+      end do
+
+c---------------------------------------------------------------------
+c  communicate in i and j directions
+c---------------------------------------------------------------------
+      call exchange_4(phi1,phi2,ibeg,ifin1,jbeg,jfin1)
+
+      frc1 = 0.0d+00
+
+      do j = jbeg,jfin1
+         do i = ibeg, ifin1
+            frc1 = frc1 + (  phi1(i,j)
+     >                     + phi1(i+1,j)
+     >                     + phi1(i,j+1)
+     >                     + phi1(i+1,j+1)
+     >                     + phi2(i,j)
+     >                     + phi2(i+1,j)
+     >                     + phi2(i,j+1)
+     >                     + phi2(i+1,j+1) )
+         end do
+      end do
+
+c---------------------------------------------------------------------
+c  compute the global sum of individual contributions to frc1
+c---------------------------------------------------------------------
+      dummy = frc1
+      call MPI_ALLREDUCE( dummy,
+     >                    frc1,
+     >                    1,
+     >                    dp_type,
+     >                    MPI_SUM,
+     >                    MPI_COMM_WORLD,
+     >                    IERROR )
+
+      frc1 = dxi * deta * frc1
+
+c---------------------------------------------------------------------
+c   initialize
+c---------------------------------------------------------------------
+      do i = 0,isiz2+1
+        do k = 0,isiz3+1
+          phi1(i,k) = 0.
+          phi2(i,k) = 0.
+        end do
+      end do
+      jglob = jpt + jbeg
+      ind1 = 0
+      if (jglob.eq.ji1) then
+        ind1 = 1
+        do k = ki1, ki2
+           do i = ibeg, ifin
+              iglob = ipt + i
+              phi1(i,k) = c2*(  u(5,i,jbeg,k)
+     >             - 0.50d+00 * (  u(2,i,jbeg,k) ** 2
+     >                           + u(3,i,jbeg,k) ** 2
+     >                           + u(4,i,jbeg,k) ** 2 )
+     >                          / u(1,i,jbeg,k) )
+           end do
+        end do
+      end if
+
+      jglob = jpt + jfin
+      ind2 = 0
+      if (jglob.eq.ji2) then
+        ind2 = 1
+        do k = ki1, ki2
+           do i = ibeg, ifin
+              iglob = ipt + i
+              phi2(i,k) = c2*(  u(5,i,jfin,k)
+     >             - 0.50d+00 * (  u(2,i,jfin,k) ** 2
+     >                           + u(3,i,jfin,k) ** 2
+     >                           + u(4,i,jfin,k) ** 2 )
+     >                          / u(1,i,jfin,k) )
+           end do
+        end do
+      end if
+
+c---------------------------------------------------------------------
+c  communicate in i direction
+c---------------------------------------------------------------------
+      if (ind1.eq.1) then
+        call exchange_5(phi1,ibeg,ifin1)
+      end if
+      if (ind2.eq.1) then
+        call exchange_5 (phi2,ibeg,ifin1)
+      end if
+
+      frc2 = 0.0d+00
+      do k = ki1, ki2-1
+         do i = ibeg, ifin1
+            frc2 = frc2 + (  phi1(i,k)
+     >                     + phi1(i+1,k)
+     >                     + phi1(i,k+1)
+     >                     + phi1(i+1,k+1)
+     >                     + phi2(i,k)
+     >                     + phi2(i+1,k)
+     >                     + phi2(i,k+1)
+     >                     + phi2(i+1,k+1) )
+         end do
+      end do
+
+c---------------------------------------------------------------------
+c  compute the global sum of individual contributions to frc2
+c---------------------------------------------------------------------
+      dummy = frc2
+      call MPI_ALLREDUCE( dummy,
+     >                    frc2,
+     >                    1,
+     >                    dp_type,
+     >                    MPI_SUM,
+     >                    MPI_COMM_WORLD,
+     >                    IERROR )
+
+      frc2 = dxi * dzeta * frc2
+
+c---------------------------------------------------------------------
+c   initialize
+c---------------------------------------------------------------------
+      do i = 0,isiz2+1
+        do k = 0,isiz3+1
+          phi1(i,k) = 0.
+          phi2(i,k) = 0.
+        end do
+      end do
+      iglob = ipt + ibeg
+      ind1 = 0
+      if (iglob.eq.ii1) then
+        ind1 = 1
+        do k = ki1, ki2
+           do j = jbeg, jfin
+              jglob = jpt + j
+              phi1(j,k) = c2*(  u(5,ibeg,j,k)
+     >             - 0.50d+00 * (  u(2,ibeg,j,k) ** 2
+     >                           + u(3,ibeg,j,k) ** 2
+     >                           + u(4,ibeg,j,k) ** 2 )
+     >                          / u(1,ibeg,j,k) )
+           end do
+        end do
+      end if
+
+      iglob = ipt + ifin
+      ind2 = 0
+      if (iglob.eq.ii2) then
+        ind2 = 1
+        do k = ki1, ki2
+           do j = jbeg, jfin
+              jglob = jpt + j
+              phi2(j,k) = c2*(  u(5,ifin,j,k)
+     >             - 0.50d+00 * (  u(2,ifin,j,k) ** 2
+     >                           + u(3,ifin,j,k) ** 2
+     >                           + u(4,ifin,j,k) ** 2 )
+     >                          / u(1,ifin,j,k) )
+           end do
+        end do
+      end if
+
+c---------------------------------------------------------------------
+c  communicate in j direction
+c---------------------------------------------------------------------
+      if (ind1.eq.1) then
+        call exchange_6(phi1,jbeg,jfin1)
+      end if
+      if (ind2.eq.1) then
+        call exchange_6(phi2,jbeg,jfin1)
+      end if
+
+      frc3 = 0.0d+00
+
+      do k = ki1, ki2-1
+         do j = jbeg, jfin1
+            frc3 = frc3 + (  phi1(j,k)
+     >                     + phi1(j+1,k)
+     >                     + phi1(j,k+1)
+     >                     + phi1(j+1,k+1)
+     >                     + phi2(j,k)
+     >                     + phi2(j+1,k)
+     >                     + phi2(j,k+1)
+     >                     + phi2(j+1,k+1) )
+         end do
+      end do
+
+c---------------------------------------------------------------------
+c  compute the global sum of individual contributions to frc3
+c---------------------------------------------------------------------
+      dummy = frc3
+      call MPI_ALLREDUCE( dummy,
+     >                    frc3,
+     >                    1,
+     >                    dp_type,
+     >                    MPI_SUM,
+     >                    MPI_COMM_WORLD,
+     >                    IERROR )
+
+      frc3 = deta * dzeta * frc3
+      frc = 0.25d+00 * ( frc1 + frc2 + frc3 )
+c      if (id.eq.0) write (*,1001) frc
+
+      return
+
+ 1001 format (//5x,'surface integral = ',1pe12.5//)
+
+      end
diff --git a/examples/smpi/NAS/LU/proc_grid.f b/examples/smpi/NAS/LU/proc_grid.f
new file mode 100644
index 0000000000..40271c135e
--- /dev/null
+++ b/examples/smpi/NAS/LU/proc_grid.f
@@ -0,0 +1,36 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine proc_grid
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c   set up a two-d grid for processors: column-major ordering of unknowns
+c   NOTE: assumes a power-of-two number of processors
+c
+c---------------------------------------------------------------------
+
+      xdim   = 2**(ndim/2)
+      if (mod(ndim,2).eq.1) xdim = xdim + xdim
+      ydim   = num/xdim
+
+      row    = mod(id,xdim) + 1
+      col    = id/xdim + 1
+
+
+      return
+      end
+
+
diff --git a/examples/smpi/NAS/LU/read_input.f b/examples/smpi/NAS/LU/read_input.f
new file mode 100644
index 0000000000..b2e5ff1e79
--- /dev/null
+++ b/examples/smpi/NAS/LU/read_input.f
@@ -0,0 +1,127 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine read_input
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+      include 'applu.incl'
+
+      integer IERROR, fstatus, nnodes
+
+
+c---------------------------------------------------------------------
+c    only root reads the input file
+c    if input file does not exist, it uses defaults
+c       ipr = 1 for detailed progress output
+c       inorm = how often the norm is printed (once every inorm iterations)
+c       itmax = number of pseudo time steps
+c       dt = time step
+c       omega 1 over-relaxation factor for SSOR
+c       tolrsd = steady state residual tolerance levels
+c       nx, ny, nz = number of grid points in x, y, z directions
+c---------------------------------------------------------------------
+      ROOT = 0
+      if (id .eq. ROOT) then
+
+         write(*, 1000)
+
+         open (unit=3,file='inputlu.data',status='old',
+     >         access='sequential',form='formatted', iostat=fstatus)
+         if (fstatus .eq. 0) then
+
+            write(*, *) 'Reading from input file inputlu.data'
+
+            read (3,*)
+            read (3,*)
+            read (3,*) ipr, inorm
+            read (3,*)
+            read (3,*)
+            read (3,*) itmax
+            read (3,*)
+            read (3,*)
+            read (3,*) dt
+            read (3,*)
+            read (3,*)
+            read (3,*) omega
+            read (3,*)
+            read (3,*)
+            read (3,*) tolrsd(1),tolrsd(2),tolrsd(3),tolrsd(4),tolrsd(5)
+            read (3,*)
+            read (3,*)
+            read (3,*) nx0, ny0, nz0
+            close(3)
+         else
+            ipr = ipr_default
+            inorm = inorm_default
+            itmax = itmax_default
+            dt = dt_default
+            omega = omega_default
+            tolrsd(1) = tolrsd1_def
+            tolrsd(2) = tolrsd2_def
+            tolrsd(3) = tolrsd3_def
+            tolrsd(4) = tolrsd4_def
+            tolrsd(5) = tolrsd5_def
+            nx0 = isiz01
+            ny0 = isiz02
+            nz0 = isiz03
+         endif
+
+c---------------------------------------------------------------------
+c   check problem size
+c---------------------------------------------------------------------
+         call MPI_COMM_SIZE(MPI_COMM_WORLD, nnodes, ierror)
+         if (nnodes .ne. nnodes_compiled) then
+            write (*, 2000) nnodes, nnodes_compiled
+ 2000       format (5x,'Warning: program is running on',i3,' processors'
+     >             /5x,'but was compiled for ', i3)
+         endif
+
+         if ( ( nx0 .lt. 4 ) .or.
+     >        ( ny0 .lt. 4 ) .or.
+     >        ( nz0 .lt. 4 ) ) then
+
+            write (*,2001)
+ 2001       format (5x,'PROBLEM SIZE IS TOO SMALL - ',
+     >           /5x,'SET EACH OF NX, NY AND NZ AT LEAST EQUAL TO 5')
+            CALL MPI_ABORT( MPI_COMM_WORLD, MPI_ERR_OTHER, IERROR )
+
+         end if
+
+         if ( ( nx0 .gt. isiz01 ) .or.
+     >        ( ny0 .gt. isiz02 ) .or.
+     >        ( nz0 .gt. isiz03 ) ) then
+
+            write (*,2002)
+ 2002       format (5x,'PROBLEM SIZE IS TOO LARGE - ',
+     >           /5x,'NX, NY AND NZ SHOULD BE LESS THAN OR EQUAL TO ',
+     >           /5x,'ISIZ01, ISIZ02 AND ISIZ03 RESPECTIVELY')
+            CALL MPI_ABORT( MPI_COMM_WORLD, MPI_ERR_OTHER, IERROR )
+
+         end if
+
+
+         write(*, 1001) nx0, ny0, nz0
+         write(*, 1002) itmax
+         write(*, 1003) nnodes
+
+ 1000 format(//, ' NAS Parallel Benchmarks 3.3 -- LU Benchmark',/)
+ 1001    format(' Size: ', i4, 'x', i4, 'x', i4)
+ 1002    format(' Iterations: ', i4)
+ 1003    format(' Number of processes: ', i5, /)
+         
+
+
+      end if
+
+      call bcast_inputs
+
+      return
+      end
+
+
diff --git a/examples/smpi/NAS/LU/rhs.f b/examples/smpi/NAS/LU/rhs.f
new file mode 100644
index 0000000000..3da39117a4
--- /dev/null
+++ b/examples/smpi/NAS/LU/rhs.f
@@ -0,0 +1,504 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine rhs
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   compute the right hand sides
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer i, j, k, m
+      integer iex
+      integer L1, L2
+      integer ist1, iend1
+      integer jst1, jend1
+      double precision  q
+      double precision  u21, u31, u41
+      double precision  tmp
+      double precision  u21i, u31i, u41i, u51i
+      double precision  u21j, u31j, u41j, u51j
+      double precision  u21k, u31k, u41k, u51k
+      double precision  u21im1, u31im1, u41im1, u51im1
+      double precision  u21jm1, u31jm1, u41jm1, u51jm1
+      double precision  u21km1, u31km1, u41km1, u51km1
+
+
+
+      do k = 1, nz
+         do j = 1, ny
+            do i = 1, nx
+               do m = 1, 5
+                  rsd(m,i,j,k) = - frct(m,i,j,k)
+               end do
+            end do
+         end do
+      end do
+
+c---------------------------------------------------------------------
+c   xi-direction flux differences
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   iex = flag : iex = 0  north/south communication
+c              : iex = 1  east/west communication
+c---------------------------------------------------------------------
+      iex   = 0
+
+c---------------------------------------------------------------------
+c   communicate and receive/send two rows of data
+c---------------------------------------------------------------------
+      call exchange_3(u,iex)
+
+      L1 = 0
+      if (north.eq.-1) L1 = 1
+      L2 = nx + 1
+      if (south.eq.-1) L2 = nx
+
+      ist1 = 1
+      iend1 = nx
+      if (north.eq.-1) ist1 = 4
+      if (south.eq.-1) iend1 = nx - 3
+
+      do k = 2, nz - 1
+         do j = jst, jend
+            do i = L1, L2
+               flux(1,i,j,k) = u(2,i,j,k)
+               u21 = u(2,i,j,k) / u(1,i,j,k)
+
+               q = 0.50d+00 * (  u(2,i,j,k) * u(2,i,j,k)
+     >                         + u(3,i,j,k) * u(3,i,j,k)
+     >                         + u(4,i,j,k) * u(4,i,j,k) )
+     >                      / u(1,i,j,k)
+
+               flux(2,i,j,k) = u(2,i,j,k) * u21 + c2 * 
+     >                        ( u(5,i,j,k) - q )
+               flux(3,i,j,k) = u(3,i,j,k) * u21
+               flux(4,i,j,k) = u(4,i,j,k) * u21
+               flux(5,i,j,k) = ( c1 * u(5,i,j,k) - c2 * q ) * u21
+            end do
+
+            do i = ist, iend
+               do m = 1, 5
+                  rsd(m,i,j,k) =  rsd(m,i,j,k)
+     >                 - tx2 * ( flux(m,i+1,j,k) - flux(m,i-1,j,k) )
+               end do
+            end do
+
+            do i = ist, L2
+               tmp = 1.0d+00 / u(1,i,j,k)
+
+               u21i = tmp * u(2,i,j,k)
+               u31i = tmp * u(3,i,j,k)
+               u41i = tmp * u(4,i,j,k)
+               u51i = tmp * u(5,i,j,k)
+
+               tmp = 1.0d+00 / u(1,i-1,j,k)
+
+               u21im1 = tmp * u(2,i-1,j,k)
+               u31im1 = tmp * u(3,i-1,j,k)
+               u41im1 = tmp * u(4,i-1,j,k)
+               u51im1 = tmp * u(5,i-1,j,k)
+
+               flux(2,i,j,k) = (4.0d+00/3.0d+00) * tx3 * (u21i-u21im1)
+               flux(3,i,j,k) = tx3 * ( u31i - u31im1 )
+               flux(4,i,j,k) = tx3 * ( u41i - u41im1 )
+               flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
+     >              * tx3 * ( ( u21i  **2 + u31i  **2 + u41i  **2 )
+     >                      - ( u21im1**2 + u31im1**2 + u41im1**2 ) )
+     >              + (1.0d+00/6.0d+00)
+     >              * tx3 * ( u21i**2 - u21im1**2 )
+     >              + c1 * c5 * tx3 * ( u51i - u51im1 )
+            end do
+
+            do i = ist, iend
+               rsd(1,i,j,k) = rsd(1,i,j,k)
+     >              + dx1 * tx1 * (            u(1,i-1,j,k)
+     >                             - 2.0d+00 * u(1,i,j,k)
+     >                             +           u(1,i+1,j,k) )
+               rsd(2,i,j,k) = rsd(2,i,j,k)
+     >          + tx3 * c3 * c4 * ( flux(2,i+1,j,k) - flux(2,i,j,k) )
+     >              + dx2 * tx1 * (            u(2,i-1,j,k)
+     >                             - 2.0d+00 * u(2,i,j,k)
+     >                             +           u(2,i+1,j,k) )
+               rsd(3,i,j,k) = rsd(3,i,j,k)
+     >          + tx3 * c3 * c4 * ( flux(3,i+1,j,k) - flux(3,i,j,k) )
+     >              + dx3 * tx1 * (            u(3,i-1,j,k)
+     >                             - 2.0d+00 * u(3,i,j,k)
+     >                             +           u(3,i+1,j,k) )
+               rsd(4,i,j,k) = rsd(4,i,j,k)
+     >          + tx3 * c3 * c4 * ( flux(4,i+1,j,k) - flux(4,i,j,k) )
+     >              + dx4 * tx1 * (            u(4,i-1,j,k)
+     >                             - 2.0d+00 * u(4,i,j,k)
+     >                             +           u(4,i+1,j,k) )
+               rsd(5,i,j,k) = rsd(5,i,j,k)
+     >          + tx3 * c3 * c4 * ( flux(5,i+1,j,k) - flux(5,i,j,k) )
+     >              + dx5 * tx1 * (            u(5,i-1,j,k)
+     >                             - 2.0d+00 * u(5,i,j,k)
+     >                             +           u(5,i+1,j,k) )
+            end do
+
+c---------------------------------------------------------------------
+c   Fourth-order dissipation
+c---------------------------------------------------------------------
+            IF (north.eq.-1) then
+             do m = 1, 5
+               rsd(m,2,j,k) = rsd(m,2,j,k)
+     >           - dssp * ( + 5.0d+00 * u(m,2,j,k)
+     >                      - 4.0d+00 * u(m,3,j,k)
+     >                      +           u(m,4,j,k) )
+               rsd(m,3,j,k) = rsd(m,3,j,k)
+     >           - dssp * ( - 4.0d+00 * u(m,2,j,k)
+     >                      + 6.0d+00 * u(m,3,j,k)
+     >                      - 4.0d+00 * u(m,4,j,k)
+     >                      +           u(m,5,j,k) )
+             end do
+            END IF
+
+            do i = ist1,iend1
+               do m = 1, 5
+                  rsd(m,i,j,k) = rsd(m,i,j,k)
+     >              - dssp * (            u(m,i-2,j,k)
+     >                        - 4.0d+00 * u(m,i-1,j,k)
+     >                        + 6.0d+00 * u(m,i,j,k)
+     >                        - 4.0d+00 * u(m,i+1,j,k)
+     >                        +           u(m,i+2,j,k) )
+               end do
+            end do
+
+            IF (south.eq.-1) then
+             do m = 1, 5
+               rsd(m,nx-2,j,k) = rsd(m,nx-2,j,k)
+     >           - dssp * (             u(m,nx-4,j,k)
+     >                      - 4.0d+00 * u(m,nx-3,j,k)
+     >                      + 6.0d+00 * u(m,nx-2,j,k)
+     >                      - 4.0d+00 * u(m,nx-1,j,k)  )
+               rsd(m,nx-1,j,k) = rsd(m,nx-1,j,k)
+     >           - dssp * (             u(m,nx-3,j,k)
+     >                      - 4.0d+00 * u(m,nx-2,j,k)
+     >                      + 5.0d+00 * u(m,nx-1,j,k) )
+             end do
+            END IF
+
+         end do
+      end do 
+
+c---------------------------------------------------------------------
+c   eta-direction flux differences
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   iex = flag : iex = 0  north/south communication
+c---------------------------------------------------------------------
+      iex   = 1
+
+c---------------------------------------------------------------------
+c   communicate and receive/send two rows of data
+c---------------------------------------------------------------------
+      call exchange_3(u,iex)
+
+      L1 = 0
+      if (west.eq.-1) L1 = 1
+      L2 = ny + 1
+      if (east.eq.-1) L2 = ny
+
+      jst1 = 1
+      jend1 = ny
+      if (west.eq.-1) jst1 = 4
+      if (east.eq.-1) jend1 = ny - 3
+
+      do k = 2, nz - 1
+         do j = L1, L2
+            do i = ist, iend
+               flux(1,i,j,k) = u(3,i,j,k)
+               u31 = u(3,i,j,k) / u(1,i,j,k)
+
+               q = 0.50d+00 * (  u(2,i,j,k) * u(2,i,j,k)
+     >                         + u(3,i,j,k) * u(3,i,j,k)
+     >                         + u(4,i,j,k) * u(4,i,j,k) )
+     >                      / u(1,i,j,k)
+
+               flux(2,i,j,k) = u(2,i,j,k) * u31 
+               flux(3,i,j,k) = u(3,i,j,k) * u31 + c2 * (u(5,i,j,k)-q)
+               flux(4,i,j,k) = u(4,i,j,k) * u31
+               flux(5,i,j,k) = ( c1 * u(5,i,j,k) - c2 * q ) * u31
+            end do
+         end do
+
+         do j = jst, jend
+            do i = ist, iend
+               do m = 1, 5
+                  rsd(m,i,j,k) =  rsd(m,i,j,k)
+     >                   - ty2 * ( flux(m,i,j+1,k) - flux(m,i,j-1,k) )
+               end do
+            end do
+         end do
+
+         do j = jst, L2
+            do i = ist, iend
+               tmp = 1.0d+00 / u(1,i,j,k)
+
+               u21j = tmp * u(2,i,j,k)
+               u31j = tmp * u(3,i,j,k)
+               u41j = tmp * u(4,i,j,k)
+               u51j = tmp * u(5,i,j,k)
+
+               tmp = 1.0d+00 / u(1,i,j-1,k)
+               u21jm1 = tmp * u(2,i,j-1,k)
+               u31jm1 = tmp * u(3,i,j-1,k)
+               u41jm1 = tmp * u(4,i,j-1,k)
+               u51jm1 = tmp * u(5,i,j-1,k)
+
+               flux(2,i,j,k) = ty3 * ( u21j - u21jm1 )
+               flux(3,i,j,k) = (4.0d+00/3.0d+00) * ty3 * (u31j-u31jm1)
+               flux(4,i,j,k) = ty3 * ( u41j - u41jm1 )
+               flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
+     >              * ty3 * ( ( u21j  **2 + u31j  **2 + u41j  **2 )
+     >                      - ( u21jm1**2 + u31jm1**2 + u41jm1**2 ) )
+     >              + (1.0d+00/6.0d+00)
+     >              * ty3 * ( u31j**2 - u31jm1**2 )
+     >              + c1 * c5 * ty3 * ( u51j - u51jm1 )
+            end do
+         end do
+
+         do j = jst, jend
+            do i = ist, iend
+
+               rsd(1,i,j,k) = rsd(1,i,j,k)
+     >              + dy1 * ty1 * (            u(1,i,j-1,k)
+     >                             - 2.0d+00 * u(1,i,j,k)
+     >                             +           u(1,i,j+1,k) )
+
+               rsd(2,i,j,k) = rsd(2,i,j,k)
+     >          + ty3 * c3 * c4 * ( flux(2,i,j+1,k) - flux(2,i,j,k) )
+     >              + dy2 * ty1 * (            u(2,i,j-1,k)
+     >                             - 2.0d+00 * u(2,i,j,k)
+     >                             +           u(2,i,j+1,k) )
+
+               rsd(3,i,j,k) = rsd(3,i,j,k)
+     >          + ty3 * c3 * c4 * ( flux(3,i,j+1,k) - flux(3,i,j,k) )
+     >              + dy3 * ty1 * (            u(3,i,j-1,k)
+     >                             - 2.0d+00 * u(3,i,j,k)
+     >                             +           u(3,i,j+1,k) )
+
+               rsd(4,i,j,k) = rsd(4,i,j,k)
+     >          + ty3 * c3 * c4 * ( flux(4,i,j+1,k) - flux(4,i,j,k) )
+     >              + dy4 * ty1 * (            u(4,i,j-1,k)
+     >                             - 2.0d+00 * u(4,i,j,k)
+     >                             +           u(4,i,j+1,k) )
+
+               rsd(5,i,j,k) = rsd(5,i,j,k)
+     >          + ty3 * c3 * c4 * ( flux(5,i,j+1,k) - flux(5,i,j,k) )
+     >              + dy5 * ty1 * (            u(5,i,j-1,k)
+     >                             - 2.0d+00 * u(5,i,j,k)
+     >                             +           u(5,i,j+1,k) )
+
+            end do
+         end do
+
+c---------------------------------------------------------------------
+c   fourth-order dissipation
+c---------------------------------------------------------------------
+         IF (west.eq.-1) then
+            do i = ist, iend
+             do m = 1, 5
+               rsd(m,i,2,k) = rsd(m,i,2,k)
+     >           - dssp * ( + 5.0d+00 * u(m,i,2,k)
+     >                      - 4.0d+00 * u(m,i,3,k)
+     >                      +           u(m,i,4,k) )
+               rsd(m,i,3,k) = rsd(m,i,3,k)
+     >           - dssp * ( - 4.0d+00 * u(m,i,2,k)
+     >                      + 6.0d+00 * u(m,i,3,k)
+     >                      - 4.0d+00 * u(m,i,4,k)
+     >                      +           u(m,i,5,k) )
+             end do
+            end do
+         END IF
+
+         do j = jst1, jend1
+            do i = ist, iend
+               do m = 1, 5
+                  rsd(m,i,j,k) = rsd(m,i,j,k)
+     >              - dssp * (            u(m,i,j-2,k)
+     >                        - 4.0d+00 * u(m,i,j-1,k)
+     >                        + 6.0d+00 * u(m,i,j,k)
+     >                        - 4.0d+00 * u(m,i,j+1,k)
+     >                        +           u(m,i,j+2,k) )
+               end do
+            end do
+         end do
+
+         IF (east.eq.-1) then
+            do i = ist, iend
+             do m = 1, 5
+               rsd(m,i,ny-2,k) = rsd(m,i,ny-2,k)
+     >           - dssp * (             u(m,i,ny-4,k)
+     >                      - 4.0d+00 * u(m,i,ny-3,k)
+     >                      + 6.0d+00 * u(m,i,ny-2,k)
+     >                      - 4.0d+00 * u(m,i,ny-1,k)  )
+               rsd(m,i,ny-1,k) = rsd(m,i,ny-1,k)
+     >           - dssp * (             u(m,i,ny-3,k)
+     >                      - 4.0d+00 * u(m,i,ny-2,k)
+     >                      + 5.0d+00 * u(m,i,ny-1,k) )
+             end do
+            end do
+         END IF
+
+      end do
+
+c---------------------------------------------------------------------
+c   zeta-direction flux differences
+c---------------------------------------------------------------------
+      do k = 1, nz
+         do j = jst, jend
+            do i = ist, iend
+               flux(1,i,j,k) = u(4,i,j,k)
+               u41 = u(4,i,j,k) / u(1,i,j,k)
+
+               q = 0.50d+00 * (  u(2,i,j,k) * u(2,i,j,k)
+     >                         + u(3,i,j,k) * u(3,i,j,k)
+     >                         + u(4,i,j,k) * u(4,i,j,k) )
+     >                      / u(1,i,j,k)
+
+               flux(2,i,j,k) = u(2,i,j,k) * u41 
+               flux(3,i,j,k) = u(3,i,j,k) * u41 
+               flux(4,i,j,k) = u(4,i,j,k) * u41 + c2 * (u(5,i,j,k)-q)
+               flux(5,i,j,k) = ( c1 * u(5,i,j,k) - c2 * q ) * u41
+            end do
+         end do
+      end do
+
+      do k = 2, nz - 1
+         do j = jst, jend
+            do i = ist, iend
+               do m = 1, 5
+                  rsd(m,i,j,k) =  rsd(m,i,j,k)
+     >                - tz2 * ( flux(m,i,j,k+1) - flux(m,i,j,k-1) )
+               end do
+            end do
+         end do
+      end do
+
+      do k = 2, nz
+         do j = jst, jend
+            do i = ist, iend
+               tmp = 1.0d+00 / u(1,i,j,k)
+
+               u21k = tmp * u(2,i,j,k)
+               u31k = tmp * u(3,i,j,k)
+               u41k = tmp * u(4,i,j,k)
+               u51k = tmp * u(5,i,j,k)
+
+               tmp = 1.0d+00 / u(1,i,j,k-1)
+
+               u21km1 = tmp * u(2,i,j,k-1)
+               u31km1 = tmp * u(3,i,j,k-1)
+               u41km1 = tmp * u(4,i,j,k-1)
+               u51km1 = tmp * u(5,i,j,k-1)
+
+               flux(2,i,j,k) = tz3 * ( u21k - u21km1 )
+               flux(3,i,j,k) = tz3 * ( u31k - u31km1 )
+               flux(4,i,j,k) = (4.0d+00/3.0d+00) * tz3 * (u41k-u41km1)
+               flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
+     >              * tz3 * ( ( u21k  **2 + u31k  **2 + u41k  **2 )
+     >                      - ( u21km1**2 + u31km1**2 + u41km1**2 ) )
+     >              + (1.0d+00/6.0d+00)
+     >              * tz3 * ( u41k**2 - u41km1**2 )
+     >              + c1 * c5 * tz3 * ( u51k - u51km1 )
+            end do
+         end do
+      end do
+
+      do k = 2, nz - 1
+         do j = jst, jend
+            do i = ist, iend
+               rsd(1,i,j,k) = rsd(1,i,j,k)
+     >              + dz1 * tz1 * (            u(1,i,j,k-1)
+     >                             - 2.0d+00 * u(1,i,j,k)
+     >                             +           u(1,i,j,k+1) )
+               rsd(2,i,j,k) = rsd(2,i,j,k)
+     >          + tz3 * c3 * c4 * ( flux(2,i,j,k+1) - flux(2,i,j,k) )
+     >              + dz2 * tz1 * (            u(2,i,j,k-1)
+     >                             - 2.0d+00 * u(2,i,j,k)
+     >                             +           u(2,i,j,k+1) )
+               rsd(3,i,j,k) = rsd(3,i,j,k)
+     >          + tz3 * c3 * c4 * ( flux(3,i,j,k+1) - flux(3,i,j,k) )
+     >              + dz3 * tz1 * (            u(3,i,j,k-1)
+     >                             - 2.0d+00 * u(3,i,j,k)
+     >                             +           u(3,i,j,k+1) )
+               rsd(4,i,j,k) = rsd(4,i,j,k)
+     >          + tz3 * c3 * c4 * ( flux(4,i,j,k+1) - flux(4,i,j,k) )
+     >              + dz4 * tz1 * (            u(4,i,j,k-1)
+     >                             - 2.0d+00 * u(4,i,j,k)
+     >                             +           u(4,i,j,k+1) )
+               rsd(5,i,j,k) = rsd(5,i,j,k)
+     >          + tz3 * c3 * c4 * ( flux(5,i,j,k+1) - flux(5,i,j,k) )
+     >              + dz5 * tz1 * (            u(5,i,j,k-1)
+     >                             - 2.0d+00 * u(5,i,j,k)
+     >                             +           u(5,i,j,k+1) )
+            end do
+         end do
+      end do
+
+c---------------------------------------------------------------------
+c   fourth-order dissipation
+c---------------------------------------------------------------------
+      do j = jst, jend
+         do i = ist, iend
+            do m = 1, 5
+               rsd(m,i,j,2) = rsd(m,i,j,2)
+     >           - dssp * ( + 5.0d+00 * u(m,i,j,2)
+     >                      - 4.0d+00 * u(m,i,j,3)
+     >                      +           u(m,i,j,4) )
+               rsd(m,i,j,3) = rsd(m,i,j,3)
+     >           - dssp * ( - 4.0d+00 * u(m,i,j,2)
+     >                      + 6.0d+00 * u(m,i,j,3)
+     >                      - 4.0d+00 * u(m,i,j,4)
+     >                      +           u(m,i,j,5) )
+            end do
+         end do
+      end do
+
+      do k = 4, nz - 3
+         do j = jst, jend
+            do i = ist, iend
+               do m = 1, 5
+                  rsd(m,i,j,k) = rsd(m,i,j,k)
+     >              - dssp * (            u(m,i,j,k-2)
+     >                        - 4.0d+00 * u(m,i,j,k-1)
+     >                        + 6.0d+00 * u(m,i,j,k)
+     >                        - 4.0d+00 * u(m,i,j,k+1)
+     >                        +           u(m,i,j,k+2) )
+               end do
+            end do
+         end do
+      end do
+
+      do j = jst, jend
+         do i = ist, iend
+            do m = 1, 5
+               rsd(m,i,j,nz-2) = rsd(m,i,j,nz-2)
+     >           - dssp * (             u(m,i,j,nz-4)
+     >                      - 4.0d+00 * u(m,i,j,nz-3)
+     >                      + 6.0d+00 * u(m,i,j,nz-2)
+     >                      - 4.0d+00 * u(m,i,j,nz-1)  )
+               rsd(m,i,j,nz-1) = rsd(m,i,j,nz-1)
+     >           - dssp * (             u(m,i,j,nz-3)
+     >                      - 4.0d+00 * u(m,i,j,nz-2)
+     >                      + 5.0d+00 * u(m,i,j,nz-1) )
+            end do
+         end do
+      end do
+
+      return
+      end
diff --git a/examples/smpi/NAS/LU/setbv.f b/examples/smpi/NAS/LU/setbv.f
new file mode 100644
index 0000000000..56b0edf967
--- /dev/null
+++ b/examples/smpi/NAS/LU/setbv.f
@@ -0,0 +1,79 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine setbv
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   set the boundary values of dependent variables
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c   local variables
+c---------------------------------------------------------------------
+      integer i, j, k
+      integer iglob, jglob
+
+c---------------------------------------------------------------------
+c   set the dependent variable values along the top and bottom faces
+c---------------------------------------------------------------------
+      do j = 1, ny
+         jglob = jpt + j
+         do i = 1, nx
+           iglob = ipt + i
+            call exact( iglob, jglob, 1, u( 1, i, j, 1 ) )
+            call exact( iglob, jglob, nz, u( 1, i, j, nz ) )
+         end do
+      end do
+
+c---------------------------------------------------------------------
+c   set the dependent variable values along north and south faces
+c---------------------------------------------------------------------
+      IF (west.eq.-1) then
+         do k = 1, nz
+            do i = 1, nx
+               iglob = ipt + i
+               call exact( iglob, 1, k, u( 1, i, 1, k ) )
+            end do
+         end do
+      END IF
+
+      IF (east.eq.-1) then
+          do k = 1, nz
+             do i = 1, nx
+                iglob = ipt + i
+                call exact( iglob, ny0, k, u( 1, i, ny, k ) )
+             end do
+          end do
+      END IF
+
+c---------------------------------------------------------------------
+c   set the dependent variable values along east and west faces
+c---------------------------------------------------------------------
+      IF (north.eq.-1) then
+         do k = 1, nz
+            do j = 1, ny
+               jglob = jpt + j
+               call exact( 1, jglob, k, u( 1, 1, j, k ) )
+            end do
+         end do
+      END IF
+
+      IF (south.eq.-1) then
+         do k = 1, nz
+            do j = 1, ny
+                  jglob = jpt + j
+            call exact( nx0, jglob, k, u( 1, nx, j, k ) )
+            end do
+         end do
+      END IF
+
+      return
+      end
diff --git a/examples/smpi/NAS/LU/setcoeff.f b/examples/smpi/NAS/LU/setcoeff.f
new file mode 100644
index 0000000000..8fc5c1840c
--- /dev/null
+++ b/examples/smpi/NAS/LU/setcoeff.f
@@ -0,0 +1,159 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine setcoeff
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+
+
+c---------------------------------------------------------------------
+c   set up coefficients
+c---------------------------------------------------------------------
+      dxi = 1.0d+00 / ( nx0 - 1 )
+      deta = 1.0d+00 / ( ny0 - 1 )
+      dzeta = 1.0d+00 / ( nz0 - 1 )
+
+      tx1 = 1.0d+00 / ( dxi * dxi )
+      tx2 = 1.0d+00 / ( 2.0d+00 * dxi )
+      tx3 = 1.0d+00 / dxi
+
+      ty1 = 1.0d+00 / ( deta * deta )
+      ty2 = 1.0d+00 / ( 2.0d+00 * deta )
+      ty3 = 1.0d+00 / deta
+
+      tz1 = 1.0d+00 / ( dzeta * dzeta )
+      tz2 = 1.0d+00 / ( 2.0d+00 * dzeta )
+      tz3 = 1.0d+00 / dzeta
+
+      ii1 = 2
+      ii2 = nx0 - 1
+      ji1 = 2
+      ji2 = ny0 - 2
+      ki1 = 3
+      ki2 = nz0 - 1
+
+c---------------------------------------------------------------------
+c   diffusion coefficients
+c---------------------------------------------------------------------
+      dx1 = 0.75d+00
+      dx2 = dx1
+      dx3 = dx1
+      dx4 = dx1
+      dx5 = dx1
+
+      dy1 = 0.75d+00
+      dy2 = dy1
+      dy3 = dy1
+      dy4 = dy1
+      dy5 = dy1
+
+      dz1 = 1.00d+00
+      dz2 = dz1
+      dz3 = dz1
+      dz4 = dz1
+      dz5 = dz1
+
+c---------------------------------------------------------------------
+c   fourth difference dissipation
+c---------------------------------------------------------------------
+      dssp = ( max (dx1, dy1, dz1 ) ) / 4.0d+00
+
+c---------------------------------------------------------------------
+c   coefficients of the exact solution to the first pde
+c---------------------------------------------------------------------
+      ce(1,1) = 2.0d+00
+      ce(1,2) = 0.0d+00
+      ce(1,3) = 0.0d+00
+      ce(1,4) = 4.0d+00
+      ce(1,5) = 5.0d+00
+      ce(1,6) = 3.0d+00
+      ce(1,7) = 5.0d-01
+      ce(1,8) = 2.0d-02
+      ce(1,9) = 1.0d-02
+      ce(1,10) = 3.0d-02
+      ce(1,11) = 5.0d-01
+      ce(1,12) = 4.0d-01
+      ce(1,13) = 3.0d-01
+
+c---------------------------------------------------------------------
+c   coefficients of the exact solution to the second pde
+c---------------------------------------------------------------------
+      ce(2,1) = 1.0d+00
+      ce(2,2) = 0.0d+00
+      ce(2,3) = 0.0d+00
+      ce(2,4) = 0.0d+00
+      ce(2,5) = 1.0d+00
+      ce(2,6) = 2.0d+00
+      ce(2,7) = 3.0d+00
+      ce(2,8) = 1.0d-02
+      ce(2,9) = 3.0d-02
+      ce(2,10) = 2.0d-02
+      ce(2,11) = 4.0d-01
+      ce(2,12) = 3.0d-01
+      ce(2,13) = 5.0d-01
+
+c---------------------------------------------------------------------
+c   coefficients of the exact solution to the third pde
+c---------------------------------------------------------------------
+      ce(3,1) = 2.0d+00
+      ce(3,2) = 2.0d+00
+      ce(3,3) = 0.0d+00
+      ce(3,4) = 0.0d+00
+      ce(3,5) = 0.0d+00
+      ce(3,6) = 2.0d+00
+      ce(3,7) = 3.0d+00
+      ce(3,8) = 4.0d-02
+      ce(3,9) = 3.0d-02
+      ce(3,10) = 5.0d-02
+      ce(3,11) = 3.0d-01
+      ce(3,12) = 5.0d-01
+      ce(3,13) = 4.0d-01
+
+c---------------------------------------------------------------------
+c   coefficients of the exact solution to the fourth pde
+c---------------------------------------------------------------------
+      ce(4,1) = 2.0d+00
+      ce(4,2) = 2.0d+00
+      ce(4,3) = 0.0d+00
+      ce(4,4) = 0.0d+00
+      ce(4,5) = 0.0d+00
+      ce(4,6) = 2.0d+00
+      ce(4,7) = 3.0d+00
+      ce(4,8) = 3.0d-02
+      ce(4,9) = 5.0d-02
+      ce(4,10) = 4.0d-02
+      ce(4,11) = 2.0d-01
+      ce(4,12) = 1.0d-01
+      ce(4,13) = 3.0d-01
+
+c---------------------------------------------------------------------
+c   coefficients of the exact solution to the fifth pde
+c---------------------------------------------------------------------
+      ce(5,1) = 5.0d+00
+      ce(5,2) = 4.0d+00
+      ce(5,3) = 3.0d+00
+      ce(5,4) = 2.0d+00
+      ce(5,5) = 1.0d-01
+      ce(5,6) = 4.0d-01
+      ce(5,7) = 3.0d-01
+      ce(5,8) = 5.0d-02
+      ce(5,9) = 4.0d-02
+      ce(5,10) = 3.0d-02
+      ce(5,11) = 1.0d-01
+      ce(5,12) = 3.0d-01
+      ce(5,13) = 2.0d-01
+
+      return
+      end
+
+
diff --git a/examples/smpi/NAS/LU/sethyper.f b/examples/smpi/NAS/LU/sethyper.f
new file mode 100644
index 0000000000..15245d4c21
--- /dev/null
+++ b/examples/smpi/NAS/LU/sethyper.f
@@ -0,0 +1,94 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine sethyper
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c    for each column in a hyperplane, istart = first row,
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer i, j
+      integer iglob, jglob
+      integer kp
+
+c---------------------------------------------------------------------
+c compute the pointers for hyperplanes
+c---------------------------------------------------------------------
+        do kp = 2,nx0+ny0
+          icomms(kp) = .false.
+          icommn(kp) = .false.
+          icomme(kp) = .false.
+          icommw(kp) = .false.
+
+c---------------------------------------------------------------------
+c  check to see if comm. to south is required
+c---------------------------------------------------------------------
+          if (south.ne.-1) then
+            i     = iend
+            iglob = ipt + i
+            jglob = kp - iglob
+            j     = jglob - jpt
+            if (jglob.ge.2.and.jglob.le.ny0-1.and.j.ge.jst.and.
+     >         j.le.jend) icomms(kp) = .true.
+          end if
+
+c---------------------------------------------------------------------
+c  check to see if comm. to north is required
+c---------------------------------------------------------------------
+          if (north.ne.-1) then
+            i     = ist
+            iglob = ipt + i
+            jglob = kp - iglob
+            j     = jglob - jpt
+            if (jglob.ge.2.and.jglob.le.ny0-1.and.j.ge.jst.and.
+     >         j.le.jend) icommn(kp) = .true.
+          end if
+
+c---------------------------------------------------------------------
+c  check to see if comm. to east is required
+c---------------------------------------------------------------------
+          if (east.ne.-1) then
+            j     = jend
+            jglob = jpt + j
+            iglob = kp - jglob
+            i     = iglob - ipt
+            if (iglob.ge.2.and.iglob.le.nx0-1.and.i.ge.ist.and.
+     >         i.le.iend) icomme(kp) = .true.
+          end if
+
+c---------------------------------------------------------------------
+c  check to see if comm. to west is required
+c---------------------------------------------------------------------
+          if (west.ne.-1) then
+            j = jst
+            jglob = jpt + j
+            iglob = kp - jglob
+            i     = iglob - ipt
+            if (iglob.ge.2.and.iglob.le.nx0-1.and.i.ge.ist.and.
+     >         i.le.iend) icommw(kp) = .true.
+          end if
+
+        end do
+
+        icomms(1) = .false.
+        icommn(1) = .false.
+        icomme(1) = .false.
+        icommw(1) = .false.
+        icomms(nx0+ny0+1) = .false.
+        icommn(nx0+ny0+1) = .false.
+        icomme(nx0+ny0+1) = .false.
+        icommw(nx0+ny0+1) = .false.
+
+      return
+      end
diff --git a/examples/smpi/NAS/LU/setiv.f b/examples/smpi/NAS/LU/setiv.f
new file mode 100644
index 0000000000..73725cbe5c
--- /dev/null
+++ b/examples/smpi/NAS/LU/setiv.f
@@ -0,0 +1,67 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      subroutine setiv
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c   set the initial values of independent variables based on tri-linear
+c   interpolation of boundary values in the computational space.
+c
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer i, j, k, m
+      integer iglob, jglob
+      double precision  xi, eta, zeta
+      double precision  pxi, peta, pzeta
+      double precision  ue_1jk(5),ue_nx0jk(5),ue_i1k(5),
+     >        ue_iny0k(5),ue_ij1(5),ue_ijnz(5)
+
+
+      do k = 2, nz - 1
+         zeta = ( dble (k-1) ) / (nz-1)
+         do j = 1, ny
+          jglob = jpt + j
+          IF (jglob.ne.1.and.jglob.ne.ny0) then
+            eta = ( dble (jglob-1) ) / (ny0-1)
+            do i = 1, nx
+              iglob = ipt + i
+              IF (iglob.ne.1.and.iglob.ne.nx0) then
+               xi = ( dble (iglob-1) ) / (nx0-1)
+               call exact (1,jglob,k,ue_1jk)
+               call exact (nx0,jglob,k,ue_nx0jk)
+               call exact (iglob,1,k,ue_i1k)
+               call exact (iglob,ny0,k,ue_iny0k)
+               call exact (iglob,jglob,1,ue_ij1)
+               call exact (iglob,jglob,nz,ue_ijnz)
+               do m = 1, 5
+                  pxi =   ( 1.0d+00 - xi ) * ue_1jk(m)
+     >                              + xi   * ue_nx0jk(m)
+                  peta =  ( 1.0d+00 - eta ) * ue_i1k(m)
+     >                              + eta   * ue_iny0k(m)
+                  pzeta = ( 1.0d+00 - zeta ) * ue_ij1(m)
+     >                              + zeta   * ue_ijnz(m)
+
+                  u( m, i, j, k ) = pxi + peta + pzeta
+     >                 - pxi * peta - peta * pzeta - pzeta * pxi
+     >                 + pxi * peta * pzeta
+
+               end do
+              END IF
+            end do
+          END IF
+         end do
+      end do
+
+      return
+      end
diff --git a/examples/smpi/NAS/LU/ssor.f b/examples/smpi/NAS/LU/ssor.f
new file mode 100644
index 0000000000..cf4eed0eb7
--- /dev/null
+++ b/examples/smpi/NAS/LU/ssor.f
@@ -0,0 +1,241 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine ssor(niter)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   to perform pseudo-time stepping SSOR iterations
+c   for five nonlinear pde's.
+c---------------------------------------------------------------------
+
+      implicit none
+      integer  niter
+
+      include 'mpinpb.h'
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer i, j, k, m
+      integer istep
+      double precision  tmp
+      double precision  delunm(5), tv(5,isiz1,isiz2)
+
+      external timer_read
+      double precision wtime, timer_read
+
+      integer IERROR
+
+ 
+      ROOT = 0
+ 
+c---------------------------------------------------------------------
+c   begin pseudo-time stepping iterations
+c---------------------------------------------------------------------
+      tmp = 1.0d+00 / ( omega * ( 2.0d+00 - omega ) ) 
+
+c---------------------------------------------------------------------
+c   initialize a,b,c,d to zero (guarantees that page tables have been
+c   formed, if applicable on given architecture, before timestepping).
+c---------------------------------------------------------------------
+      do m=1,isiz2
+         do k=1,isiz1
+            do j=1,5
+               do i=1,5
+                  a(i,j,k,m) = 0.d0
+                  b(i,j,k,m) = 0.d0
+                  c(i,j,k,m) = 0.d0
+                  d(i,j,k,m) = 0.d0
+               enddo
+            enddo
+         enddo
+      enddo
+
+c---------------------------------------------------------------------
+c   compute the steady-state residuals
+c---------------------------------------------------------------------
+      call rhs
+ 
+c---------------------------------------------------------------------
+c   compute the L2 norms of newton iteration residuals
+c---------------------------------------------------------------------
+      call l2norm( isiz1, isiz2, isiz3, nx0, ny0, nz0,
+     >             ist, iend, jst, jend,
+     >             rsd, rsdnm )
+  
+      call MPI_BARRIER( MPI_COMM_WORLD, IERROR )
+ 
+      call timer_clear(1)
+      call timer_start(1)
+
+c---------------------------------------------------------------------
+c   the timestep loop
+c---------------------------------------------------------------------
+      do istep = 1, niter
+
+         if (id .eq. 0) then
+            if (mod ( istep, 20) .eq. 0 .or.
+     >            istep .eq. itmax .or.
+     >            istep .eq. 1) then
+               if (niter .gt. 1) write( *, 200) istep
+ 200           format(' Time step ', i4)
+            endif
+         endif
+ 
+c---------------------------------------------------------------------
+c   perform SSOR iteration
+c---------------------------------------------------------------------
+         do k = 2, nz - 1
+            do j = jst, jend
+               do i = ist, iend
+                  do m = 1, 5
+                     rsd(m,i,j,k) = dt * rsd(m,i,j,k)
+                  end do
+               end do
+            end do
+         end do
+ 
+         DO k = 2, nz -1 
+c---------------------------------------------------------------------
+c   form the lower triangular part of the jacobian matrix
+c---------------------------------------------------------------------
+            call jacld(k)
+ 
+c---------------------------------------------------------------------
+c   perform the lower triangular solution
+c---------------------------------------------------------------------
+            call blts( isiz1, isiz2, isiz3,
+     >                 nx, ny, nz, k,
+     >                 omega,
+     >                 rsd,
+     >                 a, b, c, d,
+     >                 ist, iend, jst, jend, 
+     >                 nx0, ny0, ipt, jpt)
+          END DO
+ 
+          DO k = nz - 1, 2, -1
+c---------------------------------------------------------------------
+c   form the strictly upper triangular part of the jacobian matrix
+c---------------------------------------------------------------------
+            call jacu(k)
+
+c---------------------------------------------------------------------
+c   perform the upper triangular solution
+c---------------------------------------------------------------------
+            call buts( isiz1, isiz2, isiz3,
+     >                 nx, ny, nz, k,
+     >                 omega,
+     >                 rsd, tv,
+     >                 d, a, b, c,
+     >                 ist, iend, jst, jend,
+     >                 nx0, ny0, ipt, jpt)
+          END DO
+ 
+c---------------------------------------------------------------------
+c   update the variables
+c---------------------------------------------------------------------
+ 
+         do k = 2, nz-1
+            do j = jst, jend
+               do i = ist, iend
+                  do m = 1, 5
+                     u( m, i, j, k ) = u( m, i, j, k )
+     >                    + tmp * rsd( m, i, j, k )
+                  end do
+               end do
+            end do
+         end do
+ 
+c---------------------------------------------------------------------
+c   compute the max-norms of newton iteration corrections
+c---------------------------------------------------------------------
+         if ( mod ( istep, inorm ) .eq. 0 ) then
+            call l2norm( isiz1, isiz2, isiz3, nx0, ny0, nz0,
+     >                   ist, iend, jst, jend,
+     >                   rsd, delunm )
+c            if ( ipr .eq. 1 .and. id .eq. 0 ) then
+c                write (*,1006) ( delunm(m), m = 1, 5 )
+c            else if ( ipr .eq. 2 .and. id .eq. 0 ) then
+c                write (*,'(i5,f15.6)') istep,delunm(5)
+c            end if
+         end if
+ 
+c---------------------------------------------------------------------
+c   compute the steady-state residuals
+c---------------------------------------------------------------------
+         call rhs
+ 
+c---------------------------------------------------------------------
+c   compute the max-norms of newton iteration residuals
+c---------------------------------------------------------------------
+         if ( ( mod ( istep, inorm ) .eq. 0 ) .or.
+     >        ( istep .eq. itmax ) ) then
+            call l2norm( isiz1, isiz2, isiz3, nx0, ny0, nz0,
+     >                   ist, iend, jst, jend,
+     >                   rsd, rsdnm )
+c            if ( ipr .eq. 1.and.id.eq.0 ) then
+c                write (*,1007) ( rsdnm(m), m = 1, 5 )
+c            end if
+         end if
+
+c---------------------------------------------------------------------
+c   check the newton-iteration residuals against the tolerance levels
+c---------------------------------------------------------------------
+         if ( ( rsdnm(1) .lt. tolrsd(1) ) .and.
+     >        ( rsdnm(2) .lt. tolrsd(2) ) .and.
+     >        ( rsdnm(3) .lt. tolrsd(3) ) .and.
+     >        ( rsdnm(4) .lt. tolrsd(4) ) .and.
+     >        ( rsdnm(5) .lt. tolrsd(5) ) ) then
+c            if (ipr .eq. 1 .and. id.eq.0) then
+c               write (*,1004) istep
+c            end if
+            return
+         end if
+ 
+      end do
+ 
+      call timer_stop(1)
+      wtime = timer_read(1)
+ 
+
+      call MPI_ALLREDUCE( wtime, 
+     >                    maxtime, 
+     >                    1, 
+     >                    MPI_DOUBLE_PRECISION, 
+     >                    MPI_MAX, 
+     >                    MPI_COMM_WORLD,
+     >                    IERROR )
+ 
+
+
+      return
+      
+ 1001 format (1x/5x,'pseudo-time SSOR iteration no.=',i4/)
+ 1004 format (1x/1x,'convergence was achieved after ',i4,
+     >   ' pseudo-time steps' )
+ 1006 format (1x/1x,'RMS-norm of SSOR-iteration correction ',
+     > 'for first pde  = ',1pe12.5/,
+     > 1x,'RMS-norm of SSOR-iteration correction ',
+     > 'for second pde = ',1pe12.5/,
+     > 1x,'RMS-norm of SSOR-iteration correction ',
+     > 'for third pde  = ',1pe12.5/,
+     > 1x,'RMS-norm of SSOR-iteration correction ',
+     > 'for fourth pde = ',1pe12.5/,
+     > 1x,'RMS-norm of SSOR-iteration correction ',
+     > 'for fifth pde  = ',1pe12.5)
+ 1007 format (1x/1x,'RMS-norm of steady-state residual for ',
+     > 'first pde  = ',1pe12.5/,
+     > 1x,'RMS-norm of steady-state residual for ',
+     > 'second pde = ',1pe12.5/,
+     > 1x,'RMS-norm of steady-state residual for ',
+     > 'third pde  = ',1pe12.5/,
+     > 1x,'RMS-norm of steady-state residual for ',
+     > 'fourth pde = ',1pe12.5/,
+     > 1x,'RMS-norm of steady-state residual for ',
+     > 'fifth pde  = ',1pe12.5)
+ 
+      end
diff --git a/examples/smpi/NAS/LU/subdomain.f b/examples/smpi/NAS/LU/subdomain.f
new file mode 100644
index 0000000000..388bbf4494
--- /dev/null
+++ b/examples/smpi/NAS/LU/subdomain.f
@@ -0,0 +1,103 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine subdomain
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+      include 'applu.incl'
+
+c---------------------------------------------------------------------
+c  local variables
+c---------------------------------------------------------------------
+      integer mm, ierror, errorcode
+
+
+c---------------------------------------------------------------------
+c
+c   set up the sub-domain sizes
+c
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   x dimension
+c---------------------------------------------------------------------
+      mm   = mod(nx0,xdim)
+      if (row.le.mm) then
+        nx = nx0/xdim + 1
+        ipt = (row-1)*nx
+      else
+        nx = nx0/xdim
+        ipt = (row-1)*nx + mm
+      end if
+
+c---------------------------------------------------------------------
+c   y dimension
+c---------------------------------------------------------------------
+      mm   = mod(ny0,ydim)
+      if (col.le.mm) then
+        ny = ny0/ydim + 1
+        jpt = (col-1)*ny
+      else
+        ny = ny0/ydim
+        jpt = (col-1)*ny + mm
+      end if
+
+c---------------------------------------------------------------------
+c   z dimension
+c---------------------------------------------------------------------
+      nz = nz0
+
+c---------------------------------------------------------------------
+c   check the sub-domain size
+c---------------------------------------------------------------------
+      if ( ( nx .lt. 4 ) .or.
+     >     ( ny .lt. 4 ) .or.
+     >     ( nz .lt. 4 ) ) then
+         write (*,2001) nx, ny, nz
+ 2001    format (5x,'SUBDOMAIN SIZE IS TOO SMALL - ',
+     >        /5x,'ADJUST PROBLEM SIZE OR NUMBER OF PROCESSORS',
+     >        /5x,'SO THAT NX, NY AND NZ ARE GREATER THAN OR EQUAL',
+     >        /5x,'TO 4 THEY ARE CURRENTLY', 3I3)
+          CALL MPI_ABORT( MPI_COMM_WORLD,
+     >                    ERRORCODE,
+     >                    IERROR )
+      end if
+
+      if ( ( nx .gt. isiz1 ) .or.
+     >     ( ny .gt. isiz2 ) .or.
+     >     ( nz .gt. isiz3 ) ) then
+         write (*,2002) nx, ny, nz
+ 2002    format (5x,'SUBDOMAIN SIZE IS TOO LARGE - ',
+     >        /5x,'ADJUST PROBLEM SIZE OR NUMBER OF PROCESSORS',
+     >        /5x,'SO THAT NX, NY AND NZ ARE LESS THAN OR EQUAL TO ',
+     >        /5x,'ISIZ1, ISIZ2 AND ISIZ3 RESPECTIVELY.  THEY ARE',
+     >        /5x,'CURRENTLY', 3I4)
+          CALL MPI_ABORT( MPI_COMM_WORLD,
+     >                    ERRORCODE,
+     >                    IERROR )
+      end if
+
+
+c---------------------------------------------------------------------
+c   set up the start and end in i and j extents for all processors
+c---------------------------------------------------------------------
+      ist = 1
+      iend = nx
+      if (north.eq.-1) ist = 2
+      if (south.eq.-1) iend = nx - 1
+
+      jst = 1
+      jend = ny
+      if (west.eq.-1) jst = 2
+      if (east.eq.-1) jend = ny - 1
+
+      return
+      end
+
+
diff --git a/examples/smpi/NAS/LU/verify.f b/examples/smpi/NAS/LU/verify.f
new file mode 100644
index 0000000000..2572441a44
--- /dev/null
+++ b/examples/smpi/NAS/LU/verify.f
@@ -0,0 +1,403 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+        subroutine verify(xcr, xce, xci, class, verified)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c  verification routine                         
+c---------------------------------------------------------------------
+
+        implicit none
+        include 'mpinpb.h'
+        include 'applu.incl'
+
+        double precision xcr(5), xce(5), xci
+        double precision xcrref(5),xceref(5),xciref, 
+     >                   xcrdif(5),xcedif(5),xcidif,
+     >                   epsilon, dtref
+        integer m
+        character class
+        logical verified
+
+c---------------------------------------------------------------------
+c   tolerance level
+c---------------------------------------------------------------------
+        epsilon = 1.0d-08
+
+        class = 'U'
+        verified = .true.
+
+        do m = 1,5
+           xcrref(m) = 1.0
+           xceref(m) = 1.0
+        end do
+        xciref = 1.0
+
+        if ( (nx0  .eq. 12     ) .and. 
+     >       (ny0  .eq. 12     ) .and.
+     >       (nz0  .eq. 12     ) .and.
+     >       (itmax   .eq. 50    ))  then
+
+           class = 'S'
+           dtref = 5.0d-1
+c---------------------------------------------------------------------
+c   Reference values of RMS-norms of residual, for the (12X12X12) grid,
+c   after 50 time steps, with  DT = 5.0d-01
+c---------------------------------------------------------------------
+         xcrref(1) = 1.6196343210976702d-02
+         xcrref(2) = 2.1976745164821318d-03
+         xcrref(3) = 1.5179927653399185d-03
+         xcrref(4) = 1.5029584435994323d-03
+         xcrref(5) = 3.4264073155896461d-02
+
+c---------------------------------------------------------------------
+c   Reference values of RMS-norms of solution error, for the (12X12X12) grid,
+c   after 50 time steps, with  DT = 5.0d-01
+c---------------------------------------------------------------------
+         xceref(1) = 6.4223319957960924d-04
+         xceref(2) = 8.4144342047347926d-05
+         xceref(3) = 5.8588269616485186d-05
+         xceref(4) = 5.8474222595157350d-05
+         xceref(5) = 1.3103347914111294d-03
+
+c---------------------------------------------------------------------
+c   Reference value of surface integral, for the (12X12X12) grid,
+c   after 50 time steps, with DT = 5.0d-01
+c---------------------------------------------------------------------
+         xciref = 7.8418928865937083d+00
+
+
+        elseif ( (nx0 .eq. 33) .and. 
+     >           (ny0 .eq. 33) .and.
+     >           (nz0 .eq. 33) .and.
+     >           (itmax . eq. 300) ) then
+
+           class = 'W'   !SPEC95fp size
+           dtref = 1.5d-3
+c---------------------------------------------------------------------
+c   Reference values of RMS-norms of residual, for the (33x33x33) grid,
+c   after 300 time steps, with  DT = 1.5d-3
+c---------------------------------------------------------------------
+           xcrref(1) =   0.1236511638192d+02
+           xcrref(2) =   0.1317228477799d+01
+           xcrref(3) =   0.2550120713095d+01
+           xcrref(4) =   0.2326187750252d+01
+           xcrref(5) =   0.2826799444189d+02
+
+
+c---------------------------------------------------------------------
+c   Reference values of RMS-norms of solution error, for the (33X33X33) grid,
+c---------------------------------------------------------------------
+           xceref(1) =   0.4867877144216d+00
+           xceref(2) =   0.5064652880982d-01
+           xceref(3) =   0.9281818101960d-01
+           xceref(4) =   0.8570126542733d-01
+           xceref(5) =   0.1084277417792d+01
+
+
+c---------------------------------------------------------------------
+c   Reference value of surface integral, for the (33X33X33) grid,
+c   after 300 time steps, with  DT = 1.5d-3
+c---------------------------------------------------------------------
+           xciref    =   0.1161399311023d+02
+
+        elseif ( (nx0 .eq. 64) .and. 
+     >           (ny0 .eq. 64) .and.
+     >           (nz0 .eq. 64) .and.
+     >           (itmax . eq. 250) ) then
+
+           class = 'A'
+           dtref = 2.0d+0
+c---------------------------------------------------------------------
+c   Reference values of RMS-norms of residual, for the (64X64X64) grid,
+c   after 250 time steps, with  DT = 2.0d+00
+c---------------------------------------------------------------------
+         xcrref(1) = 7.7902107606689367d+02
+         xcrref(2) = 6.3402765259692870d+01
+         xcrref(3) = 1.9499249727292479d+02
+         xcrref(4) = 1.7845301160418537d+02
+         xcrref(5) = 1.8384760349464247d+03
+
+c---------------------------------------------------------------------
+c   Reference values of RMS-norms of solution error, for the (64X64X64) grid,
+c   after 250 time steps, with  DT = 2.0d+00
+c---------------------------------------------------------------------
+         xceref(1) = 2.9964085685471943d+01
+         xceref(2) = 2.8194576365003349d+00
+         xceref(3) = 7.3473412698774742d+00
+         xceref(4) = 6.7139225687777051d+00
+         xceref(5) = 7.0715315688392578d+01
+
+c---------------------------------------------------------------------
+c   Reference value of surface integral, for the (64X64X64) grid,
+c   after 250 time steps, with DT = 2.0d+00
+c---------------------------------------------------------------------
+         xciref = 2.6030925604886277d+01
+
+
+        elseif ( (nx0 .eq. 102) .and. 
+     >           (ny0 .eq. 102) .and.
+     >           (nz0 .eq. 102) .and.
+     >           (itmax . eq. 250) ) then
+
+           class = 'B'
+           dtref = 2.0d+0
+
+c---------------------------------------------------------------------
+c   Reference values of RMS-norms of residual, for the (102X102X102) grid,
+c   after 250 time steps, with  DT = 2.0d+00
+c---------------------------------------------------------------------
+         xcrref(1) = 3.5532672969982736d+03
+         xcrref(2) = 2.6214750795310692d+02
+         xcrref(3) = 8.8333721850952190d+02
+         xcrref(4) = 7.7812774739425265d+02
+         xcrref(5) = 7.3087969592545314d+03
+
+c---------------------------------------------------------------------
+c   Reference values of RMS-norms of solution error, for the (102X102X102) 
+c   grid, after 250 time steps, with  DT = 2.0d+00
+c---------------------------------------------------------------------
+         xceref(1) = 1.1401176380212709d+02
+         xceref(2) = 8.1098963655421574d+00
+         xceref(3) = 2.8480597317698308d+01
+         xceref(4) = 2.5905394567832939d+01
+         xceref(5) = 2.6054907504857413d+02
+
+c---------------------------------------------------------------------
+c   Reference value of surface integral, for the (102X102X102) grid,
+c   after 250 time steps, with DT = 2.0d+00
+c---------------------------------------------------------------------
+         xciref = 4.7887162703308227d+01
+
+        elseif ( (nx0 .eq. 162) .and. 
+     >           (ny0 .eq. 162) .and.
+     >           (nz0 .eq. 162) .and.
+     >           (itmax . eq. 250) ) then
+
+           class = 'C'
+           dtref = 2.0d+0
+
+c---------------------------------------------------------------------
+c   Reference values of RMS-norms of residual, for the (162X162X162) grid,
+c   after 250 time steps, with  DT = 2.0d+00
+c---------------------------------------------------------------------
+         xcrref(1) = 1.03766980323537846d+04
+         xcrref(2) = 8.92212458801008552d+02
+         xcrref(3) = 2.56238814582660871d+03
+         xcrref(4) = 2.19194343857831427d+03
+         xcrref(5) = 1.78078057261061185d+04
+
+c---------------------------------------------------------------------
+c   Reference values of RMS-norms of solution error, for the (162X162X162) 
+c   grid, after 250 time steps, with  DT = 2.0d+00
+c---------------------------------------------------------------------
+         xceref(1) = 2.15986399716949279d+02
+         xceref(2) = 1.55789559239863600d+01
+         xceref(3) = 5.41318863077207766d+01
+         xceref(4) = 4.82262643154045421d+01
+         xceref(5) = 4.55902910043250358d+02
+
+c---------------------------------------------------------------------
+c   Reference value of surface integral, for the (162X162X162) grid,
+c   after 250 time steps, with DT = 2.0d+00
+c---------------------------------------------------------------------
+         xciref = 6.66404553572181300d+01
+
+        elseif ( (nx0 .eq. 408) .and. 
+     >           (ny0 .eq. 408) .and.
+     >           (nz0 .eq. 408) .and.
+     >           (itmax . eq. 300) ) then
+
+           class = 'D'
+           dtref = 1.0d+0
+
+c---------------------------------------------------------------------
+c   Reference values of RMS-norms of residual, for the (408X408X408) grid,
+c   after 300 time steps, with  DT = 1.0d+00
+c---------------------------------------------------------------------
+         xcrref(1) = 0.4868417937025d+05
+         xcrref(2) = 0.4696371050071d+04
+         xcrref(3) = 0.1218114549776d+05 
+         xcrref(4) = 0.1033801493461d+05
+         xcrref(5) = 0.7142398413817d+05
+
+c---------------------------------------------------------------------
+c   Reference values of RMS-norms of solution error, for the (408X408X408) 
+c   grid, after 300 time steps, with  DT = 1.0d+00
+c---------------------------------------------------------------------
+         xceref(1) = 0.3752393004482d+03
+         xceref(2) = 0.3084128893659d+02
+         xceref(3) = 0.9434276905469d+02
+         xceref(4) = 0.8230686681928d+02
+         xceref(5) = 0.7002620636210d+03
+
+c---------------------------------------------------------------------
+c   Reference value of surface integral, for the (408X408X408) grid,
+c   after 300 time steps, with DT = 1.0d+00
+c---------------------------------------------------------------------
+         xciref =    0.8334101392503d+02
+
+        elseif ( (nx0 .eq. 1020) .and. 
+     >           (ny0 .eq. 1020) .and.
+     >           (nz0 .eq. 1020) .and.
+     >           (itmax . eq. 300) ) then
+
+           class = 'E'
+           dtref = 0.5d+0
+
+c---------------------------------------------------------------------
+c   Reference values of RMS-norms of residual, for the (1020X1020X1020) grid,
+c   after 300 time steps, with  DT = 0.5d+00
+c---------------------------------------------------------------------
+         xcrref(1) = 0.2099641687874d+06
+         xcrref(2) = 0.2130403143165d+05
+         xcrref(3) = 0.5319228789371d+05 
+         xcrref(4) = 0.4509761639833d+05
+         xcrref(5) = 0.2932360006590d+06
+
+c---------------------------------------------------------------------
+c   Reference values of RMS-norms of solution error, for the (1020X1020X1020) 
+c   grid, after 300 time steps, with  DT = 0.5d+00
+c---------------------------------------------------------------------
+         xceref(1) = 0.4800572578333d+03
+         xceref(2) = 0.4221993400184d+02
+         xceref(3) = 0.1210851906824d+03
+         xceref(4) = 0.1047888986770d+03
+         xceref(5) = 0.8363028257389d+03
+
+c---------------------------------------------------------------------
+c   Reference value of surface integral, for the (1020X1020X1020) grid,
+c   after 300 time steps, with DT = 0.5d+00
+c---------------------------------------------------------------------
+         xciref =    0.9512163272273d+02
+
+        else
+           verified = .FALSE.
+        endif
+
+c---------------------------------------------------------------------
+c    verification test for residuals if gridsize is one of 
+c    the defined grid sizes above (class .ne. 'U')
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c    Compute the difference of solution values and the known reference values.
+c---------------------------------------------------------------------
+        do m = 1, 5
+           
+           xcrdif(m) = dabs((xcr(m)-xcrref(m))/xcrref(m)) 
+           xcedif(m) = dabs((xce(m)-xceref(m))/xceref(m))
+           
+        enddo
+        xcidif = dabs((xci - xciref)/xciref)
+
+
+c---------------------------------------------------------------------
+c    Output the comparison of computed results to known cases.
+c---------------------------------------------------------------------
+
+        if (class .ne. 'U') then
+           write(*, 1990) class
+ 1990      format(/, ' Verification being performed for class ', a)
+           write (*,2000) epsilon
+ 2000      format(' Accuracy setting for epsilon = ', E20.13)
+           verified = (dabs(dt-dtref) .le. epsilon)
+           if (.not.verified) then  
+              class = 'U'
+              write (*,1000) dtref
+ 1000         format(' DT does not match the reference value of ', 
+     >                 E15.8)
+           endif
+        else 
+           write(*, 1995)
+ 1995      format(' Unknown class')
+        endif
+
+
+        if (class .ne. 'U') then
+           write (*,2001) 
+        else
+           write (*, 2005)
+        endif
+
+ 2001   format(' Comparison of RMS-norms of residual')
+ 2005   format(' RMS-norms of residual')
+        do m = 1, 5
+           if (class .eq. 'U') then
+              write(*, 2015) m, xcr(m)
+           else if (xcrdif(m) .le. epsilon) then
+              write (*,2011) m,xcr(m),xcrref(m),xcrdif(m)
+           else 
+              verified = .false.
+              write (*,2010) m,xcr(m),xcrref(m),xcrdif(m)
+           endif
+        enddo
+
+        if (class .ne. 'U') then
+           write (*,2002)
+        else
+           write (*,2006)
+        endif
+ 2002   format(' Comparison of RMS-norms of solution error')
+ 2006   format(' RMS-norms of solution error')
+        
+        do m = 1, 5
+           if (class .eq. 'U') then
+              write(*, 2015) m, xce(m)
+           else if (xcedif(m) .le. epsilon) then
+              write (*,2011) m,xce(m),xceref(m),xcedif(m)
+           else
+              verified = .false.
+              write (*,2010) m,xce(m),xceref(m),xcedif(m)
+           endif
+        enddo
+        
+ 2010   format(' FAILURE: ', i2, 2x, E20.13, E20.13, E20.13)
+ 2011   format('          ', i2, 2x, E20.13, E20.13, E20.13)
+ 2015   format('          ', i2, 2x, E20.13)
+        
+        if (class .ne. 'U') then
+           write (*,2025)
+        else
+           write (*,2026)
+        endif
+ 2025   format(' Comparison of surface integral')
+ 2026   format(' Surface integral')
+
+
+        if (class .eq. 'U') then
+           write(*, 2030) xci
+        else if (xcidif .le. epsilon) then
+           write(*, 2032) xci, xciref, xcidif
+        else
+           verified = .false.
+           write(*, 2031) xci, xciref, xcidif
+        endif
+
+ 2030   format('          ', 4x, E20.13)
+ 2031   format(' FAILURE: ', 4x, E20.13, E20.13, E20.13)
+ 2032   format('          ', 4x, E20.13, E20.13, E20.13)
+
+
+
+        if (class .eq. 'U') then
+           write(*, 2022)
+           write(*, 2023)
+ 2022      format(' No reference values provided')
+ 2023      format(' No verification performed')
+        else if (verified) then
+           write(*, 2020)
+ 2020      format(' Verification Successful')
+        else
+           write(*, 2021)
+ 2021      format(' Verification failed')
+        endif
+
+        return
+
+
+        end
diff --git a/examples/smpi/NAS/MG/Makefile b/examples/smpi/NAS/MG/Makefile
new file mode 100644
index 0000000000..1554bedeea
--- /dev/null
+++ b/examples/smpi/NAS/MG/Makefile
@@ -0,0 +1,23 @@
+SHELL=/bin/sh
+BENCHMARK=mg
+BENCHMARKU=MG
+
+include ../config/make.def
+
+OBJS = mg.o ${COMMON}/print_results.o  \
+       ${COMMON}/${RAND}.o ${COMMON}/timers.o
+
+include ../sys/make.common
+
+${PROGRAM}: config ${OBJS}
+	${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
+
+mg.o:		mg.f  globals.h mpinpb.h npbparams.h
+	${FCOMPILE} mg.f
+
+clean:
+	- rm -f *.o *~ 
+	- rm -f npbparams.h core
+
+
+
diff --git a/examples/smpi/NAS/MG/README b/examples/smpi/NAS/MG/README
new file mode 100644
index 0000000000..6c03f78527
--- /dev/null
+++ b/examples/smpi/NAS/MG/README
@@ -0,0 +1,138 @@
+Some info about the MG benchmark
+================================
+    
+'mg_demo' demonstrates the capabilities of a very simple multigrid
+solver in computing a three dimensional potential field.  This is
+a simplified multigrid solver in two important respects:
+
+  (1) it solves only a constant coefficient equation,
+  and that only on a uniform cubical grid,
+    
+  (2) it solves only a single equation, representing
+  a scalar field rather than a vector field.
+
+We chose it for its portability and simplicity, and expect that a
+supercomputer which can run it effectively will also be able to
+run more complex multigrid programs at least as well.
+     
+     Eric Barszcz                         Paul Frederickson
+     RIACS
+     NASA Ames Research Center            NASA Ames Research Center
+
+========================================================================
+Running the program:  (Note: also see parameter lm information in the
+                       two sections immediately below this section)
+
+The program may be run with or without an input deck (called "mg.input"). 
+The following describes a few things about the input deck if you want to 
+use one. 
+
+The four lines below are the "mg.input" file required to run a
+problem of total size 256x256x256, for 4 iterations (Class "A"),
+and presumes the use of 8 processors:
+
+   8 = top level
+   256 256 256 = nx ny nz
+   4 = nit
+   0 0 0 0 0 0 0 0 = debug_vec
+
+The first line of input indicates how many levels of multi-grid
+cycle will be applied to a particular subpartition.  Presuming that
+8 processors are solving this problem (recall that the number of 
+processors is specified to MPI as a run parameter, and MPI subsequently
+determines this for the code via an MPI subroutine call), a 2x2x2 
+processor grid is  formed, and thus each partition on a processor is 
+of size 128x128x128.  Therefore, a maximum of 8 multi-grid levels may 
+be used.  These are of size 128,64,32,16,8,4,2,1, with the coarsest 
+level being a single point on a given processor.
+
+
+Next, consider the same size problem but running on 1 processor.  The
+following "mg.input" file is appropriate:
+
+    9 = top level
+    256 256 256 = nx ny nz
+    4 = nit
+    0 0 0 0 0 0 0 0 = debug_vec
+
+Since this processor must solve the full 256x256x256 problem, this
+permits 9 multi-grid levels (256,128,64,32,16,8,4,2,1), resulting in 
+a coarsest multi-grid level of a single point on the processor
+
+
+Next, consider the same size problem but running on 2 processors.  The
+following "mg.input" file is required:
+
+    8 = top level
+    256 256 256 = nx ny nz
+    4 = nit
+    0 0 0 0 0 0 0 0 = debug_vec
+
+The algorithm for partitioning the full grid onto some power of 2 number 
+of processors is to start by splitting the last dimension of the grid
+(z dimension) in 2: the problem is now partitioned onto 2 processors.
+Next the middle dimension (y dimension) is split in 2: the problem is now
+partitioned onto 4 processors.  Next, first dimension (x dimension) is
+split in 2: the problem is now partitioned onto 8 processors.  Next, the
+last dimension (z dimension) is split again in 2: the problem is now
+partitioned onto 16 processors.  This partitioning is repeated until all 
+of the power of 2 processors have been allocated.
+
+Thus to run the above problem on 2 processors, the grid partitioning 
+algorithm will allocate the two processors across the last dimension, 
+creating two partitions each of size 256x256x128. The coarsest level of 
+multi-grid must be a single point surrounded by a cubic number of grid 
+points.  Therefore, each of the two processor partitions will contain 4 
+coarsest multi-grid level points, each surrounded by a cube of grid points 
+of size 128x128x128, indicated by a top level of 8.
+
+
+Next, consider the same size problem but running on 4 processors.  The
+following "mg.input" file is required:
+
+    8 = top level
+    256 256 256 = nx ny nz
+    4 = nit
+    0 0 0 0 0 0 0 0 = debug_vec
+
+The partitioning algorithm will create 4 partitions, each of size
+256x128x128.  Each partition will contain 2 coarsest multi-grid level
+points each surrounded by a cube of grid points of size 128x128x128, 
+indicated by a top level of 8.
+
+
+Next, consider the same size problem but running on 16 processors.  The
+following "mg.input" file is required:
+
+    7 = top level
+    256 256 256 = nx ny nz
+    4 = nit
+    0 0 0 0 0 0 0 0 = debug_vec
+
+On each node a partition of size 128x128x64 will be created.  A maximum
+of 7 multi-grid levels (64,32,16,8,4,2,1) may be used, resulting in each 
+partions containing 4 coarsest multi-grid level points, each surrounded 
+by a cube of grid points of size 64x64x64, indicated by a top level of 7.
+
+
+
+
+Note that non-cubic problem sizes may also be considered:
+
+The four lines below are the "mg.input" file appropriate for running a
+problem of total size 256x512x512, for 20 iterations and presumes the 
+use of 32 processors (note: this is NOT a class C problem):
+
+    8 = top level
+    256 512 512 = nx ny nz
+    20 = nit
+    0 0 0 0 0 0 0 0 = debug_vec
+
+The first line of input indicates how many levels of multi-grid
+cycle will be applied to a particular subpartition.  Presuming that
+32 processors are solving this problem, a 2x4x4 processor grid is
+formed, and thus each partition on a processor is of size 128x128x128.
+Therefore, a maximum of 8 multi-grid levels may be used.  These are of
+size 128,64,32,16,8,4,2,1, with the coarsest level being a single 
+point on a given processor.
+
diff --git a/examples/smpi/NAS/MG/globals.h b/examples/smpi/NAS/MG/globals.h
new file mode 100644
index 0000000000..99573e3c20
--- /dev/null
+++ b/examples/smpi/NAS/MG/globals.h
@@ -0,0 +1,55 @@
+c---------------------------------------------------------------------
+c  Parameter lm (declared and set in "npbparams.h") is the log-base2 of 
+c  the edge size max for the partition on a given node, so must be changed 
+c  either to save space (if running a small case) or made bigger for larger 
+c  cases, for example, 512^3. Thus lm=7 means that the largest dimension 
+c  of a partition that can be solved on a node is 2^7 = 128. lm is set 
+c  automatically in npbparams.h
+c  Parameters ndim1, ndim2, ndim3 are the local problem dimensions. 
+c---------------------------------------------------------------------
+
+      include 'npbparams.h'
+
+      integer nm      ! actual dimension including ghost cells for communications
+     >      , nv      ! size of rhs array
+     >      , nr      ! size of residual array
+     >      , nm2     ! size of communication buffer
+     >      , maxlevel! maximum number of levels
+
+      parameter( nm=2+2**lm, nv=(2+2**ndim1)*(2+2**ndim2)*(2+2**ndim3) )
+      parameter( nm2=2*nm*nm, maxlevel=(lt_default+1) )
+      parameter( nr = (8*(nv+nm**2+5*nm+14*lt_default-7*lm))/7 )
+      integer maxprocs
+      parameter( maxprocs = 131072 )  ! this is the upper proc limit that 
+                                      ! the current "nr" parameter can handle
+c---------------------------------------------------------------------
+      integer nbr(3,-1:1,maxlevel), msg_type(3,-1:1)
+      integer  msg_id(3,-1:1,2),nx(maxlevel),ny(maxlevel),nz(maxlevel)
+      common /mg3/ nbr,msg_type,msg_id,nx,ny,nz
+
+      character class
+      common /ClassType/class
+
+      integer debug_vec(0:7)
+      common /my_debug/ debug_vec
+
+      integer ir(maxlevel), m1(maxlevel), m2(maxlevel), m3(maxlevel)
+      integer lt, lb
+      common /fap/ ir,m1,m2,m3,lt,lb
+
+      logical dead(maxlevel), give_ex(3,maxlevel), take_ex(3,maxlevel)
+      common /comm_ex/ dead, give_ex, take_ex
+
+c---------------------------------------------------------------------
+c  Set at m=1024, can handle cases up to 1024^3 case
+c---------------------------------------------------------------------
+      integer m
+c      parameter( m=1037 )
+      parameter( m=nm+1 )
+
+      double precision buff(nm2,4)
+      common /buffer/ buff
+
+
+
+
diff --git a/examples/smpi/NAS/MG/mg.f b/examples/smpi/NAS/MG/mg.f
new file mode 100644
index 0000000000..b0352ae2df
--- /dev/null
+++ b/examples/smpi/NAS/MG/mg.f
@@ -0,0 +1,2479 @@
+!-------------------------------------------------------------------------!
+!                                                                         !
+!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
+!                                                                         !
+!                                   M G                                   !
+!                                                                         !
+!-------------------------------------------------------------------------!
+!                                                                         !
+!    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
+!    It is described in NAS Technical Reports 95-020 and 02-007           !
+!                                                                         !
+!    Permission to use, copy, distribute and modify this software         !
+!    for any purpose with or without fee is hereby granted.  We           !
+!    request, however, that all derived work reference the NAS            !
+!    Parallel Benchmarks 3.3. This software is provided "as is"           !
+!    without express or implied warranty.                                 !
+!                                                                         !
+!    Information on NPB 3.3, including the technical report, the          !
+!    original specifications, source code, results and information        !
+!    on how to submit new results, is available at:                       !
+!                                                                         !
+!           http://www.nas.nasa.gov/Software/NPB/                         !
+!                                                                         !
+!    Send comments or suggestions to  npb@nas.nasa.gov                    !
+!                                                                         !
+!          NAS Parallel Benchmarks Group                                  !
+!          NASA Ames Research Center                                      !
+!          Mail Stop: T27A-1                                              !
+!          Moffett Field, CA   94035-1000                                 !
+!                                                                         !
+!          E-mail:  npb@nas.nasa.gov                                      !
+!          Fax:     (650) 604-3957                                        !
+!                                                                         !
+!-------------------------------------------------------------------------!
+
+
+c---------------------------------------------------------------------
+c
+c Authors: E. Barszcz
+c          P. Frederickson
+c          A. Woo
+c          M. Yarrow
+c          R. F. Van der Wijngaart
+c
+c---------------------------------------------------------------------
+
+
+c---------------------------------------------------------------------
+      program mg_mpi
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+      include 'globals.h'
+
+c---------------------------------------------------------------------------c
+c k is the current level. It is passed down through subroutine args
+c and is NOT global. it is the current iteration
+c---------------------------------------------------------------------------c
+
+      integer k, it
+      
+      external timer_read
+      double precision t, t0, tinit, mflops, timer_read
+
+c---------------------------------------------------------------------------c
+c These arrays are in common because they are quite large
+c and probably shouldn't be allocated on the stack. They
+c are always passed as subroutine args. 
+c---------------------------------------------------------------------------c
+
+      double precision u(nr),v(nv),r(nr),a(0:3),c(0:3)
+      common /noautom/ u,v,r   
+
+      double precision rnm2, rnmu, old2, oldu, epsilon
+      integer n1, n2, n3, nit
+      double precision nn, verify_value, err
+      logical verified
+
+      integer ierr,i, fstatus
+      integer T_bench, T_init
+      parameter (T_bench=1, T_init=2)
+
+      call mpi_init(ierr)
+      call mpi_comm_rank(mpi_comm_world, me, ierr)
+      call mpi_comm_size(mpi_comm_world, nprocs, ierr)
+
+      root = 0
+      if (nprocs_compiled .gt. maxprocs) then
+         if (me .eq. root) write(*,20) nprocs_compiled, maxprocs
+ 20      format(' ERROR: compiled for ',i8,' processes'//
+     &          ' The maximum size allowed for this benchmark is ',i6)
+         call mpi_abort(MPI_COMM_WORLD, ierr)
+         stop
+      endif
+
+      if (.not. convertdouble) then
+         dp_type = MPI_DOUBLE_PRECISION
+      else
+         dp_type = MPI_REAL
+      endif
+
+
+      call timer_clear(T_bench)
+      call timer_clear(T_init)
+
+      call mpi_barrier(MPI_COMM_WORLD, ierr)
+
+      call timer_start(T_init)
+      
+
+c---------------------------------------------------------------------
+c Read in and broadcast input data
+c---------------------------------------------------------------------
+
+      if( me .eq. root )then
+         write (*, 1000) 
+
+         open(unit=7,file="mg.input", status="old", iostat=fstatus)
+         if (fstatus .eq. 0) then
+            write(*,50) 
+ 50         format(' Reading from input file mg.input')
+            read(7,*) lt
+            read(7,*) nx(lt), ny(lt), nz(lt)
+            read(7,*) nit
+            read(7,*) (debug_vec(i),i=0,7)
+         else
+            write(*,51) 
+ 51         format(' No input file. Using compiled defaults ')
+            lt = lt_default
+            nit = nit_default
+            nx(lt) = nx_default
+            ny(lt) = ny_default
+            nz(lt) = nz_default
+            do i = 0,7
+               debug_vec(i) = debug_default
+            end do
+         endif
+      endif
+
+      call mpi_bcast(lt, 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
+      call mpi_bcast(nit, 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
+      call mpi_bcast(nx(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
+      call mpi_bcast(ny(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
+      call mpi_bcast(nz(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
+      call mpi_bcast(debug_vec(0), 8, MPI_INTEGER, 0, 
+     >               mpi_comm_world, ierr)
+
+      if ( (nx(lt) .ne. ny(lt)) .or. (nx(lt) .ne. nz(lt)) ) then
+         Class = 'U' 
+      else if( nx(lt) .eq. 32 .and. nit .eq. 4 ) then
+         Class = 'S'
+      else if( nx(lt) .eq. 128 .and. nit .eq. 4 ) then
+         Class = 'W'
+      else if( nx(lt) .eq. 256 .and. nit .eq. 4 ) then  
+         Class = 'A'
+      else if( nx(lt) .eq. 256 .and. nit .eq. 20 ) then
+         Class = 'B'
+      else if( nx(lt) .eq. 512 .and. nit .eq. 20 ) then  
+         Class = 'C'
+      else if( nx(lt) .eq. 1024 .and. nit .eq. 50 ) then  
+         Class = 'D'
+      else if( nx(lt) .eq. 2048 .and. nit .eq. 50 ) then  
+         Class = 'E'
+      else
+         Class = 'U'
+      endif
+
+c---------------------------------------------------------------------
+c  Use these for debug info:
+c---------------------------------------------------------------------
+c     debug_vec(0) = 1 !=> report all norms
+c     debug_vec(1) = 1 !=> some setup information
+c     debug_vec(1) = 2 !=> more setup information
+c     debug_vec(2) = k => at level k or below, show result of resid
+c     debug_vec(3) = k => at level k or below, show result of psinv
+c     debug_vec(4) = k => at level k or below, show result of rprj
+c     debug_vec(5) = k => at level k or below, show result of interp
+c     debug_vec(6) = 1 => (unused)
+c     debug_vec(7) = 1 => (unused)
+c---------------------------------------------------------------------
+      a(0) = -8.0D0/3.0D0 
+      a(1) =  0.0D0 
+      a(2) =  1.0D0/6.0D0 
+      a(3) =  1.0D0/12.0D0
+      
+      if(Class .eq. 'A' .or. Class .eq. 'S'.or. Class .eq.'W') then
+c---------------------------------------------------------------------
+c     Coefficients for the S(a) smoother
+c---------------------------------------------------------------------
+         c(0) =  -3.0D0/8.0D0
+         c(1) =  +1.0D0/32.0D0
+         c(2) =  -1.0D0/64.0D0
+         c(3) =   0.0D0
+      else
+c---------------------------------------------------------------------
+c     Coefficients for the S(b) smoother
+c---------------------------------------------------------------------
+         c(0) =  -3.0D0/17.0D0
+         c(1) =  +1.0D0/33.0D0
+         c(2) =  -1.0D0/61.0D0
+         c(3) =   0.0D0
+      endif
+      lb = 1
+      k  = lt
+
+      call setup(n1,n2,n3,k)
+      call zero3(u,n1,n2,n3)
+      call zran3(v,n1,n2,n3,nx(lt),ny(lt),k)
+
+      call norm2u3(v,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
+
+      if( me .eq. root )then
+         write (*, 1001) nx(lt),ny(lt),nz(lt), Class
+         write (*, 1002) nit
+
+ 1000 format(//,' NAS Parallel Benchmarks 3.3 -- MG Benchmark', /)
+ 1001    format(' Size: ', i4, 'x', i4, 'x', i4, '  (class ', A, ')' )
+ 1002    format(' Iterations: ', i4)
+ 1003    format(' Number of processes: ', i6)
+         if (nprocs .ne. nprocs_compiled) then
+           write (*, 1004) nprocs_compiled
+           write (*, 1005) nprocs
+ 1004      format(' WARNING: compiled for ', i6, ' processes ')
+ 1005      format(' Number of active processes: ', i6, /)
+         else
+           write (*, 1003) nprocs
+         endif
+      endif
+
+      call resid(u,v,r,n1,n2,n3,a,k)
+      call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
+      old2 = rnm2
+      oldu = rnmu
+
+c---------------------------------------------------------------------
+c     One iteration for startup
+c---------------------------------------------------------------------
+      call mg3P(u,v,r,a,c,n1,n2,n3,k)
+      call resid(u,v,r,n1,n2,n3,a,k)
+      call setup(n1,n2,n3,k)
+      call zero3(u,n1,n2,n3)
+      call zran3(v,n1,n2,n3,nx(lt),ny(lt),k)
+
+      call timer_stop(T_init)
+      if( me .eq. root )then
+         tinit = timer_read(T_init)
+         write( *,'(/A,F15.3,A/)' ) 
+     >        ' Initialization time: ',tinit, ' seconds'
+      endif
+
+      call mpi_barrier(mpi_comm_world,ierr)
+
+      call timer_start(T_bench)
+
+      call resid(u,v,r,n1,n2,n3,a,k)
+      call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
+      old2 = rnm2
+      oldu = rnmu
+
+      do  it=1,nit
+         if (it.eq.1 .or. it.eq.nit .or. mod(it,5).eq.0) then
+            if (me .eq. root) write(*,80) it
+   80       format('  iter ',i4)
+         endif
+         call mg3P(u,v,r,a,c,n1,n2,n3,k)
+         call resid(u,v,r,n1,n2,n3,a,k)
+      enddo
+
+
+      call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
+
+      call timer_stop(T_bench)
+
+      t0 = timer_read(T_bench)
+
+      call mpi_reduce(t0,t,1,dp_type,
+     >     mpi_max,root,mpi_comm_world,ierr)
+      verified = .FALSE.
+      verify_value = 0.0
+      if( me .eq. root )then
+         write(*,100)
+ 100     format(/' Benchmark completed ')
+
+         epsilon = 1.d-8
+         if (Class .ne. 'U') then
+            if(Class.eq.'S') then
+               verify_value = 0.5307707005734d-04
+            elseif(Class.eq.'W') then
+               verify_value = 0.6467329375339d-05
+            elseif(Class.eq.'A') then
+               verify_value = 0.2433365309069d-05
+            elseif(Class.eq.'B') then
+               verify_value = 0.1800564401355d-05
+            elseif(Class.eq.'C') then
+               verify_value = 0.5706732285740d-06
+            elseif(Class.eq.'D') then
+               verify_value = 0.1583275060440d-09
+            elseif(Class.eq.'E') then
+               verify_value = 0.5630442584711d-10
+            endif
+
+            err = abs( rnm2 - verify_value ) / verify_value
+            if( err .le. epsilon ) then
+               verified = .TRUE.
+               write(*, 200)
+               write(*, 201) rnm2
+               write(*, 202) err
+ 200           format(' VERIFICATION SUCCESSFUL ')
+ 201           format(' L2 Norm is ', E20.13)
+ 202           format(' Error is   ', E20.13)
+            else
+               verified = .FALSE.
+               write(*, 300) 
+               write(*, 301) rnm2
+               write(*, 302) verify_value
+ 300           format(' VERIFICATION FAILED')
+ 301           format(' L2 Norm is             ', E20.13)
+ 302           format(' The correct L2 Norm is ', E20.13)
+            endif
+         else
+            verified = .FALSE.
+            write (*, 400)
+            write (*, 401)
+            write (*, 201) rnm2
+ 400        format(' Problem size unknown')
+ 401        format(' NO VERIFICATION PERFORMED')
+         endif
+
+         nn = 1.0d0*nx(lt)*ny(lt)*nz(lt)
+
+         if( t .ne. 0. ) then
+            mflops = 58.*1.0D-6*nit*nn / t
+         else
+            mflops = 0.0
+         endif
+
+         call print_results('MG', class, nx(lt), ny(lt), nz(lt), 
+     >                      nit, nprocs_compiled, nprocs, t,
+     >                      mflops, '          floating point', 
+     >                      verified, npbversion, compiletime,
+     >                      cs1, cs2, cs3, cs4, cs5, cs6, cs7)
+
+
+      endif
+
+
+      call mpi_finalize(ierr)
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine setup(n1,n2,n3,k)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+      include 'globals.h'
+
+      integer  is1, is2, is3, ie1, ie2, ie3
+      common /grid/ is1,is2,is3,ie1,ie2,ie3
+
+      integer n1,n2,n3,k
+      integer dx, dy, log_p, d, i, j
+
+      integer ax, next(3),mi(3,maxlevel),mip(3,maxlevel)
+      integer ng(3,maxlevel)
+      integer idi(3), pi(3), idin(3,-1:1)
+      integer s, dir,ierr
+
+      do  j=-1,1,1
+         do  d=1,3
+            msg_type(d,j) = 100*(j+2+10*d)
+         enddo
+      enddo
+
+      ng(1,lt) = nx(lt)
+      ng(2,lt) = ny(lt)
+      ng(3,lt) = nz(lt)
+      do  ax=1,3
+         next(ax) = 1
+         do  k=lt-1,1,-1
+            ng(ax,k) = ng(ax,k+1)/2
+         enddo
+      enddo
+ 61   format(10i4)
+      do  k=lt,1,-1
+         nx(k) = ng(1,k)
+         ny(k) = ng(2,k)
+         nz(k) = ng(3,k)
+      enddo
+
+      log_p  = log(float(nprocs)+0.0001)/log(2.0)
+      dx     = log_p/3
+      pi(1)  = 2**dx
+      idi(1) = mod(me,pi(1))
+
+      dy     = (log_p-dx)/2
+      pi(2)  = 2**dy
+      idi(2) = mod((me/pi(1)),pi(2))
+
+      pi(3)  = nprocs/(pi(1)*pi(2))
+      idi(3) = me/(pi(1)*pi(2))
+
+      do  k = lt,1,-1
+         dead(k) = .false.
+         do  ax = 1,3
+            take_ex(ax,k) = .false.
+            give_ex(ax,k) = .false.
+
+            mi(ax,k) = 2 + 
+     >           ((idi(ax)+1)*ng(ax,k))/pi(ax) -
+     >           ((idi(ax)+0)*ng(ax,k))/pi(ax)
+            mip(ax,k) = 2 + 
+     >           ((next(ax)+idi(ax)+1)*ng(ax,k))/pi(ax) -
+     >           ((next(ax)+idi(ax)+0)*ng(ax,k))/pi(ax) 
+
+            if(mip(ax,k).eq.2.or.mi(ax,k).eq.2)then
+               next(ax) = 2*next(ax)
+            endif
+
+            if( k+1 .le. lt )then
+               if((mip(ax,k).eq.2).and.(mi(ax,k).eq.3))then
+                  give_ex(ax,k+1) = .true.
+               endif
+               if((mip(ax,k).eq.3).and.(mi(ax,k).eq.2))then
+                  take_ex(ax,k+1) = .true.
+               endif
+            endif
+         enddo
+
+         if( mi(1,k).eq.2 .or. 
+     >        mi(2,k).eq.2 .or. 
+     >        mi(3,k).eq.2      )then
+            dead(k) = .true.
+         endif
+         m1(k) = mi(1,k)
+         m2(k) = mi(2,k)
+         m3(k) = mi(3,k)
+
+         do  ax=1,3
+            idin(ax,+1) = mod( idi(ax) + next(ax) + pi(ax) , pi(ax) )
+            idin(ax,-1) = mod( idi(ax) - next(ax) + pi(ax) , pi(ax) )
+         enddo
+         do  dir = 1,-1,-2
+            nbr(1,dir,k) = idin(1,dir) + pi(1)
+     >           *(idi(2)      + pi(2)
+     >           * idi(3))
+            nbr(2,dir,k) = idi(1)      + pi(1)
+     >           *(idin(2,dir) + pi(2)
+     >           * idi(3))
+            nbr(3,dir,k) = idi(1)      + pi(1)
+     >           *(idi(2)      + pi(2)
+     >           * idin(3,dir))
+         enddo
+      enddo
+
+      k = lt
+      is1 = 2 + ng(1,k) - ((pi(1)  -idi(1))*ng(1,lt))/pi(1)
+      ie1 = 1 + ng(1,k) - ((pi(1)-1-idi(1))*ng(1,lt))/pi(1)
+      n1 = 3 + ie1 - is1
+      is2 = 2 + ng(2,k) - ((pi(2)  -idi(2))*ng(2,lt))/pi(2)
+      ie2 = 1 + ng(2,k) - ((pi(2)-1-idi(2))*ng(2,lt))/pi(2)
+      n2 = 3 + ie2 - is2
+      is3 = 2 + ng(3,k) - ((pi(3)  -idi(3))*ng(3,lt))/pi(3)
+      ie3 = 1 + ng(3,k) - ((pi(3)-1-idi(3))*ng(3,lt))/pi(3)
+      n3 = 3 + ie3 - is3
+
+
+      ir(lt)=1
+      do  j = lt-1, 1, -1
+         ir(j)=ir(j+1)+m1(j+1)*m2(j+1)*m3(j+1)
+      enddo
+
+
+      if( debug_vec(1) .ge. 1 )then
+         if( me .eq. root )write(*,*)' in setup, '
+         if( me .eq. root )write(*,*)' me   k  lt  nx  ny  nz ',
+     >        ' n1  n2  n3 is1 is2 is3 ie1 ie2 ie3'
+         do  i=0,nprocs-1
+            if( me .eq. i )then
+               write(*,9) me,k,lt,ng(1,k),ng(2,k),ng(3,k),
+     >              n1,n2,n3,is1,is2,is3,ie1,ie2,ie3
+ 9             format(15i4)
+            endif
+            call mpi_barrier(mpi_comm_world,ierr)
+         enddo
+      endif
+      if( debug_vec(1) .ge. 2 )then
+         do  i=0,nprocs-1
+            if( me .eq. i )then
+               write(*,*)' '
+               write(*,*)' processor =',me
+               do  k=lt,1,-1
+                  write(*,7)k,idi(1),idi(2),idi(3),
+     >                 ((nbr(d,j,k),j=-1,1,2),d=1,3),
+     >                 (mi(d,k),d=1,3)
+               enddo
+ 7             format(i4,'idi=',3i4,'nbr=',3(2i4,'  '),'mi=',3i4,' ')
+               write(*,*)'idi(s) = ',(idi(s),s=1,3)
+               write(*,*)'dead(2), dead(1) = ',dead(2),dead(1)
+               do  ax=1,3
+                  write(*,*)'give_ex(ax,2)= ',give_ex(ax,2)
+                  write(*,*)'take_ex(ax,2)= ',take_ex(ax,2)
+               enddo
+            endif
+            call mpi_barrier(mpi_comm_world,ierr)
+         enddo
+      endif
+
+      k = lt
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine mg3P(u,v,r,a,c,n1,n2,n3,k)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     multigrid V-cycle routine
+c---------------------------------------------------------------------
+      implicit none
+
+      include 'mpinpb.h'
+      include 'globals.h'
+
+      integer n1, n2, n3, k
+      double precision u(nr),v(nv),r(nr)
+      double precision a(0:3),c(0:3)
+
+      integer j
+
+c---------------------------------------------------------------------
+c     down cycle.
+c     restrict the residual from the find grid to the coarse
+c---------------------------------------------------------------------
+
+      do  k= lt, lb+1 , -1
+         j = k-1
+         call rprj3(r(ir(k)),m1(k),m2(k),m3(k),
+     >        r(ir(j)),m1(j),m2(j),m3(j),k)
+      enddo
+
+      k = lb
+c---------------------------------------------------------------------
+c     compute an approximate solution on the coarsest grid
+c---------------------------------------------------------------------
+      call zero3(u(ir(k)),m1(k),m2(k),m3(k))
+      call psinv(r(ir(k)),u(ir(k)),m1(k),m2(k),m3(k),c,k)
+
+      do  k = lb+1, lt-1     
+          j = k-1
+c---------------------------------------------------------------------
+c        prolongate from level k-1  to k
+c---------------------------------------------------------------------
+         call zero3(u(ir(k)),m1(k),m2(k),m3(k))
+         call interp(u(ir(j)),m1(j),m2(j),m3(j),
+     >               u(ir(k)),m1(k),m2(k),m3(k),k)
+c---------------------------------------------------------------------
+c        compute residual for level k
+c---------------------------------------------------------------------
+         call resid(u(ir(k)),r(ir(k)),r(ir(k)),m1(k),m2(k),m3(k),a,k)
+c---------------------------------------------------------------------
+c        apply smoother
+c---------------------------------------------------------------------
+         call psinv(r(ir(k)),u(ir(k)),m1(k),m2(k),m3(k),c,k)
+      enddo
+ 200  continue
+      j = lt - 1
+      k = lt
+      call interp(u(ir(j)),m1(j),m2(j),m3(j),u,n1,n2,n3,k)
+      call resid(u,v,r,n1,n2,n3,a,k)
+      call psinv(r,u,n1,n2,n3,c,k)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine psinv( r,u,n1,n2,n3,c,k)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     psinv applies an approximate inverse as smoother:  u = u + Cr
+c
+c     This  implementation costs  15A + 4M per result, where
+c     A and M denote the costs of Addition and Multiplication.  
+c     Presuming coefficient c(3) is zero (the NPB assumes this,
+c     but it is thus not a general case), 2A + 1M may be eliminated,
+c     resulting in 13A + 3M.
+c     Note that this vectorizes, and is also fine for cache 
+c     based machines.  
+c---------------------------------------------------------------------
+      implicit none
+
+      include 'globals.h'
+
+      integer n1,n2,n3,k
+      double precision u(n1,n2,n3),r(n1,n2,n3),c(0:3)
+      integer i3, i2, i1
+
+      double precision r1(m), r2(m)
+      
+      do i3=2,n3-1
+         do i2=2,n2-1
+            do i1=1,n1
+               r1(i1) = r(i1,i2-1,i3) + r(i1,i2+1,i3)
+     >                + r(i1,i2,i3-1) + r(i1,i2,i3+1)
+               r2(i1) = r(i1,i2-1,i3-1) + r(i1,i2+1,i3-1)
+     >                + r(i1,i2-1,i3+1) + r(i1,i2+1,i3+1)
+            enddo
+            do i1=2,n1-1
+               u(i1,i2,i3) = u(i1,i2,i3)
+     >                     + c(0) * r(i1,i2,i3)
+     >                     + c(1) * ( r(i1-1,i2,i3) + r(i1+1,i2,i3)
+     >                              + r1(i1) )
+     >                     + c(2) * ( r2(i1) + r1(i1-1) + r1(i1+1) )
+c---------------------------------------------------------------------
+c  Assume c(3) = 0    (Enable line below if c(3) not= 0)
+c---------------------------------------------------------------------
+c    >                     + c(3) * ( r2(i1-1) + r2(i1+1) )
+c---------------------------------------------------------------------
+            enddo
+         enddo
+      enddo
+
+c---------------------------------------------------------------------
+c     exchange boundary points
+c---------------------------------------------------------------------
+      call comm3(u,n1,n2,n3,k)
+
+      if( debug_vec(0) .ge. 1 )then
+         call rep_nrm(u,n1,n2,n3,'   psinv',k)
+      endif
+
+      if( debug_vec(3) .ge. k )then
+         call showall(u,n1,n2,n3)
+      endif
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine resid( u,v,r,n1,n2,n3,a,k )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     resid computes the residual:  r = v - Au
+c
+c     This  implementation costs  15A + 4M per result, where
+c     A and M denote the costs of Addition (or Subtraction) and 
+c     Multiplication, respectively. 
+c     Presuming coefficient a(1) is zero (the NPB assumes this,
+c     but it is thus not a general case), 3A + 1M may be eliminated,
+c     resulting in 12A + 3M.
+c     Note that this vectorizes, and is also fine for cache 
+c     based machines.  
+c---------------------------------------------------------------------
+      implicit none
+
+      include 'globals.h'
+
+      integer n1,n2,n3,k
+      double precision u(n1,n2,n3),v(n1,n2,n3),r(n1,n2,n3),a(0:3)
+      integer i3, i2, i1
+      double precision u1(m), u2(m)
+
+      do i3=2,n3-1
+         do i2=2,n2-1
+            do i1=1,n1
+               u1(i1) = u(i1,i2-1,i3) + u(i1,i2+1,i3)
+     >                + u(i1,i2,i3-1) + u(i1,i2,i3+1)
+               u2(i1) = u(i1,i2-1,i3-1) + u(i1,i2+1,i3-1)
+     >                + u(i1,i2-1,i3+1) + u(i1,i2+1,i3+1)
+            enddo
+            do i1=2,n1-1
+               r(i1,i2,i3) = v(i1,i2,i3)
+     >                     - a(0) * u(i1,i2,i3)
+c---------------------------------------------------------------------
+c  Assume a(1) = 0      (Enable 2 lines below if a(1) not= 0)
+c---------------------------------------------------------------------
+c    >                     - a(1) * ( u(i1-1,i2,i3) + u(i1+1,i2,i3)
+c    >                              + u1(i1) )
+c---------------------------------------------------------------------
+     >                     - a(2) * ( u2(i1) + u1(i1-1) + u1(i1+1) )
+     >                     - a(3) * ( u2(i1-1) + u2(i1+1) )
+            enddo
+         enddo
+      enddo
+
+c---------------------------------------------------------------------
+c     exchange boundary data
+c---------------------------------------------------------------------
+      call comm3(r,n1,n2,n3,k)
+
+      if( debug_vec(0) .ge. 1 )then
+         call rep_nrm(r,n1,n2,n3,'   resid',k)
+      endif
+
+      if( debug_vec(2) .ge. k )then
+         call showall(r,n1,n2,n3)
+      endif
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine rprj3( r,m1k,m2k,m3k,s,m1j,m2j,m3j,k )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     rprj3 projects onto the next coarser grid, 
+c     using a trilinear Finite Element projection:  s = r' = P r
+c     
+c     This  implementation costs  20A + 4M per result, where
+c     A and M denote the costs of Addition and Multiplication.  
+c     Note that this vectorizes, and is also fine for cache 
+c     based machines.  
+c---------------------------------------------------------------------
+      implicit none
+
+      include 'mpinpb.h'
+      include 'globals.h'
+
+      integer m1k, m2k, m3k, m1j, m2j, m3j,k
+      double precision r(m1k,m2k,m3k), s(m1j,m2j,m3j)
+      integer j3, j2, j1, i3, i2, i1, d1, d2, d3, j
+
+      double precision x1(m), y1(m), x2,y2
+
+
+      if(m1k.eq.3)then
+        d1 = 2
+      else
+        d1 = 1
+      endif
+
+      if(m2k.eq.3)then
+        d2 = 2
+      else
+        d2 = 1
+      endif
+
+      if(m3k.eq.3)then
+        d3 = 2
+      else
+        d3 = 1
+      endif
+
+      do  j3=2,m3j-1
+         i3 = 2*j3-d3
+C        i3 = 2*j3-1
+         do  j2=2,m2j-1
+            i2 = 2*j2-d2
+C           i2 = 2*j2-1
+
+            do j1=2,m1j
+              i1 = 2*j1-d1
+C             i1 = 2*j1-1
+              x1(i1-1) = r(i1-1,i2-1,i3  ) + r(i1-1,i2+1,i3  )
+     >                 + r(i1-1,i2,  i3-1) + r(i1-1,i2,  i3+1)
+              y1(i1-1) = r(i1-1,i2-1,i3-1) + r(i1-1,i2-1,i3+1)
+     >                 + r(i1-1,i2+1,i3-1) + r(i1-1,i2+1,i3+1)
+            enddo
+
+            do  j1=2,m1j-1
+              i1 = 2*j1-d1
+C             i1 = 2*j1-1
+              y2 = r(i1,  i2-1,i3-1) + r(i1,  i2-1,i3+1)
+     >           + r(i1,  i2+1,i3-1) + r(i1,  i2+1,i3+1)
+              x2 = r(i1,  i2-1,i3  ) + r(i1,  i2+1,i3  )
+     >           + r(i1,  i2,  i3-1) + r(i1,  i2,  i3+1)
+              s(j1,j2,j3) =
+     >               0.5D0 * r(i1,i2,i3)
+     >             + 0.25D0 * ( r(i1-1,i2,i3) + r(i1+1,i2,i3) + x2)
+     >             + 0.125D0 * ( x1(i1-1) + x1(i1+1) + y2)
+     >             + 0.0625D0 * ( y1(i1-1) + y1(i1+1) )
+            enddo
+
+         enddo
+      enddo
+
+
+      j = k-1
+      call comm3(s,m1j,m2j,m3j,j)
+
+      if( debug_vec(0) .ge. 1 )then
+         call rep_nrm(s,m1j,m2j,m3j,'   rprj3',k-1)
+      endif
+
+      if( debug_vec(4) .ge. k )then
+         call showall(s,m1j,m2j,m3j)
+      endif
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine interp( z,mm1,mm2,mm3,u,n1,n2,n3,k )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     interp adds the trilinear interpolation of the correction
+c     from the coarser grid to the current approximation:  u = u + Qu'
+c     
+c     Observe that this  implementation costs  16A + 4M, where
+c     A and M denote the costs of Addition and Multiplication.  
+c     Note that this vectorizes, and is also fine for cache 
+c     based machines.  Vector machines may get slightly better 
+c     performance however, with 8 separate "do i1" loops, rather than 4.
+c---------------------------------------------------------------------
+      implicit none
+
+      include 'mpinpb.h'
+      include 'globals.h'
+
+      integer mm1, mm2, mm3, n1, n2, n3,k
+      double precision z(mm1,mm2,mm3),u(n1,n2,n3)
+      integer i3, i2, i1, d1, d2, d3, t1, t2, t3
+
+c note that m = 1037 in globals.h but for this only need to be
+c 535 to handle up to 1024^3
+c      integer m
+c      parameter( m=535 )
+      double precision z1(m),z2(m),z3(m)
+
+
+      if( n1 .ne. 3 .and. n2 .ne. 3 .and. n3 .ne. 3 ) then
+
+         do  i3=1,mm3-1
+            do  i2=1,mm2-1
+
+               do i1=1,mm1
+                  z1(i1) = z(i1,i2+1,i3) + z(i1,i2,i3)
+                  z2(i1) = z(i1,i2,i3+1) + z(i1,i2,i3)
+                  z3(i1) = z(i1,i2+1,i3+1) + z(i1,i2,i3+1) + z1(i1)
+               enddo
+
+               do  i1=1,mm1-1
+                  u(2*i1-1,2*i2-1,2*i3-1)=u(2*i1-1,2*i2-1,2*i3-1)
+     >                 +z(i1,i2,i3)
+                  u(2*i1,2*i2-1,2*i3-1)=u(2*i1,2*i2-1,2*i3-1)
+     >                 +0.5d0*(z(i1+1,i2,i3)+z(i1,i2,i3))
+               enddo
+               do i1=1,mm1-1
+                  u(2*i1-1,2*i2,2*i3-1)=u(2*i1-1,2*i2,2*i3-1)
+     >                 +0.5d0 * z1(i1)
+                  u(2*i1,2*i2,2*i3-1)=u(2*i1,2*i2,2*i3-1)
+     >                 +0.25d0*( z1(i1) + z1(i1+1) )
+               enddo
+               do i1=1,mm1-1
+                  u(2*i1-1,2*i2-1,2*i3)=u(2*i1-1,2*i2-1,2*i3)
+     >                 +0.5d0 * z2(i1)
+                  u(2*i1,2*i2-1,2*i3)=u(2*i1,2*i2-1,2*i3)
+     >                 +0.25d0*( z2(i1) + z2(i1+1) )
+               enddo
+               do i1=1,mm1-1
+                  u(2*i1-1,2*i2,2*i3)=u(2*i1-1,2*i2,2*i3)
+     >                 +0.25d0* z3(i1)
+                  u(2*i1,2*i2,2*i3)=u(2*i1,2*i2,2*i3)
+     >                 +0.125d0*( z3(i1) + z3(i1+1) )
+               enddo
+            enddo
+         enddo
+
+      else
+
+         if(n1.eq.3)then
+            d1 = 2
+            t1 = 1
+         else
+            d1 = 1
+            t1 = 0
+         endif
+         
+         if(n2.eq.3)then
+            d2 = 2
+            t2 = 1
+         else
+            d2 = 1
+            t2 = 0
+         endif
+         
+         if(n3.eq.3)then
+            d3 = 2
+            t3 = 1
+         else
+            d3 = 1
+            t3 = 0
+         endif
+         
+         do  i3=d3,mm3-1
+            do  i2=d2,mm2-1
+               do  i1=d1,mm1-1
+                  u(2*i1-d1,2*i2-d2,2*i3-d3)=u(2*i1-d1,2*i2-d2,2*i3-d3)
+     >                 +z(i1,i2,i3)
+               enddo
+               do  i1=1,mm1-1
+                  u(2*i1-t1,2*i2-d2,2*i3-d3)=u(2*i1-t1,2*i2-d2,2*i3-d3)
+     >                 +0.5D0*(z(i1+1,i2,i3)+z(i1,i2,i3))
+               enddo
+            enddo
+            do  i2=1,mm2-1
+               do  i1=d1,mm1-1
+                  u(2*i1-d1,2*i2-t2,2*i3-d3)=u(2*i1-d1,2*i2-t2,2*i3-d3)
+     >                 +0.5D0*(z(i1,i2+1,i3)+z(i1,i2,i3))
+               enddo
+               do  i1=1,mm1-1
+                  u(2*i1-t1,2*i2-t2,2*i3-d3)=u(2*i1-t1,2*i2-t2,2*i3-d3)
+     >                 +0.25D0*(z(i1+1,i2+1,i3)+z(i1+1,i2,i3)
+     >                 +z(i1,  i2+1,i3)+z(i1,  i2,i3))
+               enddo
+            enddo
+         enddo
+
+         do  i3=1,mm3-1
+            do  i2=d2,mm2-1
+               do  i1=d1,mm1-1
+                  u(2*i1-d1,2*i2-d2,2*i3-t3)=u(2*i1-d1,2*i2-d2,2*i3-t3)
+     >                 +0.5D0*(z(i1,i2,i3+1)+z(i1,i2,i3))
+               enddo
+               do  i1=1,mm1-1
+                  u(2*i1-t1,2*i2-d2,2*i3-t3)=u(2*i1-t1,2*i2-d2,2*i3-t3)
+     >                 +0.25D0*(z(i1+1,i2,i3+1)+z(i1,i2,i3+1)
+     >                 +z(i1+1,i2,i3  )+z(i1,i2,i3  ))
+               enddo
+            enddo
+            do  i2=1,mm2-1
+               do  i1=d1,mm1-1
+                  u(2*i1-d1,2*i2-t2,2*i3-t3)=u(2*i1-d1,2*i2-t2,2*i3-t3)
+     >                 +0.25D0*(z(i1,i2+1,i3+1)+z(i1,i2,i3+1)
+     >                 +z(i1,i2+1,i3  )+z(i1,i2,i3  ))
+               enddo
+               do  i1=1,mm1-1
+                  u(2*i1-t1,2*i2-t2,2*i3-t3)=u(2*i1-t1,2*i2-t2,2*i3-t3)
+     >                 +0.125D0*(z(i1+1,i2+1,i3+1)+z(i1+1,i2,i3+1)
+     >                 +z(i1  ,i2+1,i3+1)+z(i1  ,i2,i3+1)
+     >                 +z(i1+1,i2+1,i3  )+z(i1+1,i2,i3  )
+     >                 +z(i1  ,i2+1,i3  )+z(i1  ,i2,i3  ))
+               enddo
+            enddo
+         enddo
+
+      endif
+
+      call comm3_ex(u,n1,n2,n3,k)
+
+      if( debug_vec(0) .ge. 1 )then
+         call rep_nrm(z,mm1,mm2,mm3,'z: inter',k-1)
+         call rep_nrm(u,n1,n2,n3,'u: inter',k)
+      endif
+
+      if( debug_vec(5) .ge. k )then
+         call showall(z,mm1,mm2,mm3)
+         call showall(u,n1,n2,n3)
+      endif
+
+      return 
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine norm2u3(r,n1,n2,n3,rnm2,rnmu,nx,ny,nz)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     norm2u3 evaluates approximations to the L2 norm and the
+c     uniform (or L-infinity or Chebyshev) norm, under the
+c     assumption that the boundaries are periodic or zero.  Add the
+c     boundaries in with half weight (quarter weight on the edges
+c     and eighth weight at the corners) for inhomogeneous boundaries.
+c---------------------------------------------------------------------
+      implicit none
+
+      include 'mpinpb.h'
+
+      integer n1, n2, n3, nx, ny, nz
+      double precision rnm2, rnmu, r(n1,n2,n3)
+      double precision s, a, ss
+      integer i3, i2, i1, ierr
+
+      double precision dn
+
+      dn = 1.0d0*nx*ny*nz
+
+      s=0.0D0
+      rnmu = 0.0D0
+      do  i3=2,n3-1
+         do  i2=2,n2-1
+            do  i1=2,n1-1
+               s=s+r(i1,i2,i3)**2
+               a=abs(r(i1,i2,i3))
+               if(a.gt.rnmu)rnmu=a
+            enddo
+         enddo
+      enddo
+
+      call mpi_allreduce(rnmu,ss,1,dp_type,
+     >     mpi_max,mpi_comm_world,ierr)
+      rnmu = ss
+      call mpi_allreduce(s, ss, 1, dp_type,
+     >     mpi_sum,mpi_comm_world,ierr)
+      s = ss
+      rnm2=sqrt( s / dn )
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine rep_nrm(u,n1,n2,n3,title,kk)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     report on norm
+c---------------------------------------------------------------------
+      implicit none
+
+      include 'mpinpb.h'
+      include 'globals.h'
+
+      integer n1, n2, n3, kk
+      double precision u(n1,n2,n3)
+      character*8 title
+
+      double precision rnm2, rnmu
+
+
+      call norm2u3(u,n1,n2,n3,rnm2,rnmu,nx(kk),ny(kk),nz(kk))
+      if( me .eq. root )then
+         write(*,7)kk,title,rnm2,rnmu
+ 7       format(' Level',i2,' in ',a8,': norms =',D21.14,D21.14)
+      endif
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine comm3(u,n1,n2,n3,kk)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     comm3 organizes the communication on all borders 
+c---------------------------------------------------------------------
+      implicit none
+
+      include 'mpinpb.h'
+      include 'globals.h'
+
+      integer n1, n2, n3, kk
+      double precision u(n1,n2,n3)
+      integer axis
+
+      if( .not. dead(kk) )then
+         do  axis = 1, 3
+            if( nprocs .ne. 1) then
+   
+               call ready( axis, -1, kk )
+               call ready( axis, +1, kk )
+   
+               call give3( axis, +1, u, n1, n2, n3, kk )
+               call give3( axis, -1, u, n1, n2, n3, kk )
+   
+               call take3( axis, -1, u, n1, n2, n3 )
+               call take3( axis, +1, u, n1, n2, n3 )
+   
+            else
+               call comm1p( axis, u, n1, n2, n3, kk )
+            endif
+         enddo
+      else
+         call zero3(u,n1,n2,n3)
+      endif
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine comm3_ex(u,n1,n2,n3,kk)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     comm3_ex  communicates to expand the number of processors
+c---------------------------------------------------------------------
+      implicit none
+
+      include 'mpinpb.h'
+      include 'globals.h'
+
+      integer n1, n2, n3, kk
+      double precision u(n1,n2,n3)
+      integer axis
+
+      do  axis = 1, 3
+         if( nprocs .ne. 1 ) then
+            if( take_ex( axis, kk ) )then
+               call ready( axis, -1, kk )
+               call ready( axis, +1, kk )
+               call take3_ex( axis, -1, u, n1, n2, n3 )
+               call take3_ex( axis, +1, u, n1, n2, n3 )
+            endif
+   
+            if( give_ex( axis, kk ) )then
+               call give3_ex( axis, +1, u, n1, n2, n3, kk )
+               call give3_ex( axis, -1, u, n1, n2, n3, kk )
+            endif
+         else
+            call comm1p_ex( axis, u, n1, n2, n3, kk )
+         endif
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine ready( axis, dir, k )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     ready allocates a buffer to take in a message
+c---------------------------------------------------------------------
+      implicit none
+
+      include 'mpinpb.h'
+      include 'globals.h'
+
+      integer axis, dir, k
+      integer buff_id,buff_len,i,ierr
+
+      buff_id = 3 + dir
+      buff_len = nm2
+
+      do  i=1,nm2
+         buff(i,buff_id) = 0.0D0
+      enddo
+
+
+c---------------------------------------------------------------------
+c     fake message request type
+c---------------------------------------------------------------------
+      msg_id(axis,dir,1) = msg_type(axis,dir) +1000*me
+
+      call mpi_irecv( buff(1,buff_id), buff_len,
+     >     dp_type, nbr(axis,-dir,k), msg_type(axis,dir), 
+     >     mpi_comm_world, msg_id(axis,dir,1), ierr)
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine give3( axis, dir, u, n1, n2, n3, k )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     give3 sends border data out in the requested direction
+c---------------------------------------------------------------------
+      implicit none
+
+      include 'mpinpb.h'
+      include 'globals.h'
+
+      integer axis, dir, n1, n2, n3, k, ierr
+      double precision u( n1, n2, n3 )
+
+      integer i3, i2, i1, buff_len,buff_id
+
+      buff_id = 2 + dir 
+      buff_len = 0
+
+      if( axis .eq.  1 )then
+         if( dir .eq. -1 )then
+
+            do  i3=2,n3-1
+               do  i2=2,n2-1
+                  buff_len = buff_len + 1
+                  buff(buff_len,buff_id ) = u( 2,  i2,i3)
+               enddo
+            enddo
+
+            call mpi_send( 
+     >           buff(1, buff_id ), buff_len,dp_type,
+     >           nbr( axis, dir, k ), msg_type(axis,dir), 
+     >           mpi_comm_world, ierr)
+
+         else if( dir .eq. +1 ) then
+
+            do  i3=2,n3-1
+               do  i2=2,n2-1
+                  buff_len = buff_len + 1
+                  buff(buff_len, buff_id ) = u( n1-1, i2,i3)
+               enddo
+            enddo
+
+            call mpi_send( 
+     >           buff(1, buff_id ), buff_len,dp_type,
+     >           nbr( axis, dir, k ), msg_type(axis,dir), 
+     >           mpi_comm_world, ierr)
+
+         endif
+      endif
+
+      if( axis .eq.  2 )then
+         if( dir .eq. -1 )then
+
+            do  i3=2,n3-1
+               do  i1=1,n1
+                  buff_len = buff_len + 1
+                  buff(buff_len, buff_id ) = u( i1,  2,i3)
+               enddo
+            enddo
+
+            call mpi_send( 
+     >           buff(1, buff_id ), buff_len,dp_type,
+     >           nbr( axis, dir, k ), msg_type(axis,dir), 
+     >           mpi_comm_world, ierr)
+
+         else if( dir .eq. +1 ) then
+
+            do  i3=2,n3-1
+               do  i1=1,n1
+                  buff_len = buff_len + 1
+                  buff(buff_len,  buff_id )= u( i1,n2-1,i3)
+               enddo
+            enddo
+
+            call mpi_send( 
+     >           buff(1, buff_id ), buff_len,dp_type,
+     >           nbr( axis, dir, k ), msg_type(axis,dir), 
+     >           mpi_comm_world, ierr)
+
+         endif
+      endif
+
+      if( axis .eq.  3 )then
+         if( dir .eq. -1 )then
+
+            do  i2=1,n2
+               do  i1=1,n1
+                  buff_len = buff_len + 1
+                  buff(buff_len, buff_id ) = u( i1,i2,2)
+               enddo
+            enddo
+
+            call mpi_send( 
+     >           buff(1, buff_id ), buff_len,dp_type,
+     >           nbr( axis, dir, k ), msg_type(axis,dir), 
+     >           mpi_comm_world, ierr)
+
+         else if( dir .eq. +1 ) then
+
+            do  i2=1,n2
+               do  i1=1,n1
+                  buff_len = buff_len + 1
+                  buff(buff_len, buff_id ) = u( i1,i2,n3-1)
+               enddo
+            enddo
+
+            call mpi_send( 
+     >           buff(1, buff_id ), buff_len,dp_type,
+     >           nbr( axis, dir, k ), msg_type(axis,dir), 
+     >           mpi_comm_world, ierr)
+
+         endif
+      endif
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine take3( axis, dir, u, n1, n2, n3 )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     take3 copies in border data from the requested direction
+c---------------------------------------------------------------------
+      implicit none
+
+      include 'mpinpb.h'
+      include 'globals.h'
+
+      integer axis, dir, n1, n2, n3
+      double precision u( n1, n2, n3 )
+
+      integer buff_id, indx
+
+      integer status(mpi_status_size), ierr
+
+      integer i3, i2, i1
+
+      call mpi_wait( msg_id( axis, dir, 1 ),status,ierr)
+      buff_id = 3 + dir
+      indx = 0
+
+      if( axis .eq.  1 )then
+         if( dir .eq. -1 )then
+
+            do  i3=2,n3-1
+               do  i2=2,n2-1
+                  indx = indx + 1
+                  u(n1,i2,i3) = buff(indx, buff_id )
+               enddo
+            enddo
+
+         else if( dir .eq. +1 ) then
+
+            do  i3=2,n3-1
+               do  i2=2,n2-1
+                  indx = indx + 1
+                  u(1,i2,i3) = buff(indx, buff_id )
+               enddo
+            enddo
+
+         endif
+      endif
+
+      if( axis .eq.  2 )then
+         if( dir .eq. -1 )then
+
+            do  i3=2,n3-1
+               do  i1=1,n1
+                  indx = indx + 1
+                  u(i1,n2,i3) = buff(indx, buff_id )
+               enddo
+            enddo
+
+         else if( dir .eq. +1 ) then
+
+            do  i3=2,n3-1
+               do  i1=1,n1
+                  indx = indx + 1
+                  u(i1,1,i3) = buff(indx, buff_id )
+               enddo
+            enddo
+
+         endif
+      endif
+
+      if( axis .eq.  3 )then
+         if( dir .eq. -1 )then
+
+            do  i2=1,n2
+               do  i1=1,n1
+                  indx = indx + 1
+                  u(i1,i2,n3) = buff(indx, buff_id )
+               enddo
+            enddo
+
+         else if( dir .eq. +1 ) then
+
+            do  i2=1,n2
+               do  i1=1,n1
+                  indx = indx + 1
+                  u(i1,i2,1) = buff(indx, buff_id )
+               enddo
+            enddo
+
+         endif
+      endif
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine give3_ex( axis, dir, u, n1, n2, n3, k )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     give3_ex sends border data out to expand number of processors
+c---------------------------------------------------------------------
+      implicit none
+
+      include 'mpinpb.h'
+      include 'globals.h'
+
+      integer axis, dir, n1, n2, n3, k, ierr
+      double precision u( n1, n2, n3 )
+
+      integer i3, i2, i1, buff_len, buff_id
+
+      buff_id = 2 + dir 
+      buff_len = 0
+
+      if( axis .eq.  1 )then
+         if( dir .eq. -1 )then
+
+            do  i3=1,n3
+               do  i2=1,n2
+                  buff_len = buff_len + 1
+                  buff(buff_len,buff_id ) = u( 2,  i2,i3)
+               enddo
+            enddo
+
+            call mpi_send( 
+     >           buff(1, buff_id ), buff_len,dp_type,
+     >           nbr( axis, dir, k ), msg_type(axis,dir), 
+     >           mpi_comm_world, ierr)
+
+         else if( dir .eq. +1 ) then
+
+            do  i3=1,n3
+               do  i2=1,n2
+                  do  i1=n1-1,n1
+                     buff_len = buff_len + 1
+                     buff(buff_len,buff_id)= u(i1,i2,i3)
+                  enddo
+               enddo
+            enddo
+
+            call mpi_send( 
+     >           buff(1, buff_id ), buff_len,dp_type,
+     >           nbr( axis, dir, k ), msg_type(axis,dir), 
+     >           mpi_comm_world, ierr)
+
+         endif
+      endif
+
+      if( axis .eq.  2 )then
+         if( dir .eq. -1 )then
+
+            do  i3=1,n3
+               do  i1=1,n1
+                  buff_len = buff_len + 1
+                  buff(buff_len, buff_id ) = u( i1,  2,i3)
+               enddo
+            enddo
+
+            call mpi_send( 
+     >           buff(1, buff_id ), buff_len,dp_type,
+     >           nbr( axis, dir, k ), msg_type(axis,dir), 
+     >           mpi_comm_world, ierr)
+
+         else if( dir .eq. +1 ) then
+
+            do  i3=1,n3
+               do  i2=n2-1,n2
+                  do  i1=1,n1
+                     buff_len = buff_len + 1
+                     buff(buff_len,buff_id )= u(i1,i2,i3)
+                  enddo
+               enddo
+            enddo
+
+            call mpi_send( 
+     >           buff(1, buff_id ), buff_len,dp_type,
+     >           nbr( axis, dir, k ), msg_type(axis,dir), 
+     >           mpi_comm_world, ierr)
+
+         endif
+      endif
+
+      if( axis .eq.  3 )then
+         if( dir .eq. -1 )then
+
+            do  i2=1,n2
+               do  i1=1,n1
+                  buff_len = buff_len + 1
+                  buff(buff_len, buff_id ) = u( i1,i2,2)
+               enddo
+            enddo
+
+            call mpi_send( 
+     >           buff(1, buff_id ), buff_len,dp_type,
+     >           nbr( axis, dir, k ), msg_type(axis,dir), 
+     >           mpi_comm_world, ierr)
+
+         else if( dir .eq. +1 ) then
+
+            do  i3=n3-1,n3
+               do  i2=1,n2
+                  do  i1=1,n1
+                     buff_len = buff_len + 1
+                     buff(buff_len, buff_id ) = u( i1,i2,i3)
+                  enddo
+               enddo
+            enddo
+
+            call mpi_send( 
+     >           buff(1, buff_id ), buff_len,dp_type,
+     >           nbr( axis, dir, k ), msg_type(axis,dir), 
+     >           mpi_comm_world, ierr)
+
+         endif
+      endif
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine take3_ex( axis, dir, u, n1, n2, n3 )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     take3_ex copies in border data to expand number of processors
+c---------------------------------------------------------------------
+      implicit none
+
+      include 'mpinpb.h'
+      include 'globals.h'
+
+      integer axis, dir, n1, n2, n3
+      double precision u( n1, n2, n3 )
+
+      integer buff_id, indx
+
+      integer status(mpi_status_size) , ierr
+
+      integer i3, i2, i1
+
+      call mpi_wait( msg_id( axis, dir, 1 ),status,ierr)
+      buff_id = 3 + dir
+      indx = 0
+
+      if( axis .eq.  1 )then
+         if( dir .eq. -1 )then
+
+            do  i3=1,n3
+               do  i2=1,n2
+                  indx = indx + 1
+                  u(n1,i2,i3) = buff(indx, buff_id )
+               enddo
+            enddo
+
+         else if( dir .eq. +1 ) then
+
+            do  i3=1,n3
+               do  i2=1,n2
+                  do  i1=1,2
+                     indx = indx + 1
+                     u(i1,i2,i3) = buff(indx,buff_id)
+                  enddo
+               enddo
+            enddo
+
+         endif
+      endif
+
+      if( axis .eq.  2 )then
+         if( dir .eq. -1 )then
+
+            do  i3=1,n3
+               do  i1=1,n1
+                  indx = indx + 1
+                  u(i1,n2,i3) = buff(indx, buff_id )
+               enddo
+            enddo
+
+         else if( dir .eq. +1 ) then
+
+            do  i3=1,n3
+               do  i2=1,2
+                  do  i1=1,n1
+                     indx = indx + 1
+                     u(i1,i2,i3) = buff(indx,buff_id)
+                  enddo
+               enddo
+            enddo
+
+         endif
+      endif
+
+      if( axis .eq.  3 )then
+         if( dir .eq. -1 )then
+
+            do  i2=1,n2
+               do  i1=1,n1
+                  indx = indx + 1
+                  u(i1,i2,n3) = buff(indx, buff_id )
+               enddo
+            enddo
+
+         else if( dir .eq. +1 ) then
+
+            do  i3=1,2
+               do  i2=1,n2
+                  do  i1=1,n1
+                     indx = indx + 1
+                     u(i1,i2,i3) = buff(indx,buff_id)
+                  enddo
+               enddo
+            enddo
+
+         endif
+      endif
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine comm1p( axis, u, n1, n2, n3, kk )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+      include 'globals.h'
+
+      integer axis, dir, n1, n2, n3
+      double precision u( n1, n2, n3 )
+
+      integer i3, i2, i1, buff_len,buff_id
+      integer i, kk, indx
+
+      dir = -1
+
+      buff_id = 3 + dir
+      buff_len = nm2
+
+      do  i=1,nm2
+         buff(i,buff_id) = 0.0D0
+      enddo
+
+
+      dir = +1
+
+      buff_id = 3 + dir
+      buff_len = nm2
+
+      do  i=1,nm2
+         buff(i,buff_id) = 0.0D0
+      enddo
+
+      dir = +1
+
+      buff_id = 2 + dir 
+      buff_len = 0
+
+      if( axis .eq.  1 )then
+         do  i3=2,n3-1
+            do  i2=2,n2-1
+               buff_len = buff_len + 1
+               buff(buff_len, buff_id ) = u( n1-1, i2,i3)
+            enddo
+         enddo
+      endif
+
+      if( axis .eq.  2 )then
+         do  i3=2,n3-1
+            do  i1=1,n1
+               buff_len = buff_len + 1
+               buff(buff_len,  buff_id )= u( i1,n2-1,i3)
+            enddo
+         enddo
+      endif
+
+      if( axis .eq.  3 )then
+         do  i2=1,n2
+            do  i1=1,n1
+               buff_len = buff_len + 1
+               buff(buff_len, buff_id ) = u( i1,i2,n3-1)
+            enddo
+         enddo
+      endif
+
+      dir = -1
+
+      buff_id = 2 + dir 
+      buff_len = 0
+
+      if( axis .eq.  1 )then
+         do  i3=2,n3-1
+            do  i2=2,n2-1
+               buff_len = buff_len + 1
+               buff(buff_len,buff_id ) = u( 2,  i2,i3)
+            enddo
+         enddo
+      endif
+
+      if( axis .eq.  2 )then
+         do  i3=2,n3-1
+            do  i1=1,n1
+               buff_len = buff_len + 1
+               buff(buff_len, buff_id ) = u( i1,  2,i3)
+            enddo
+         enddo
+      endif
+
+      if( axis .eq.  3 )then
+         do  i2=1,n2
+            do  i1=1,n1
+               buff_len = buff_len + 1
+               buff(buff_len, buff_id ) = u( i1,i2,2)
+            enddo
+         enddo
+      endif
+
+      do  i=1,nm2
+         buff(i,4) = buff(i,3)
+         buff(i,2) = buff(i,1)
+      enddo
+
+      dir = -1
+
+      buff_id = 3 + dir
+      indx = 0
+
+      if( axis .eq.  1 )then
+         do  i3=2,n3-1
+            do  i2=2,n2-1
+               indx = indx + 1
+               u(n1,i2,i3) = buff(indx, buff_id )
+            enddo
+         enddo
+      endif
+
+      if( axis .eq.  2 )then
+         do  i3=2,n3-1
+            do  i1=1,n1
+               indx = indx + 1
+               u(i1,n2,i3) = buff(indx, buff_id )
+            enddo
+         enddo
+      endif
+
+      if( axis .eq.  3 )then
+         do  i2=1,n2
+            do  i1=1,n1
+               indx = indx + 1
+               u(i1,i2,n3) = buff(indx, buff_id )
+            enddo
+         enddo
+      endif
+
+
+      dir = +1
+
+      buff_id = 3 + dir
+      indx = 0
+
+      if( axis .eq.  1 )then
+         do  i3=2,n3-1
+            do  i2=2,n2-1
+               indx = indx + 1
+               u(1,i2,i3) = buff(indx, buff_id )
+            enddo
+         enddo
+      endif
+
+      if( axis .eq.  2 )then
+         do  i3=2,n3-1
+            do  i1=1,n1
+               indx = indx + 1
+               u(i1,1,i3) = buff(indx, buff_id )
+            enddo
+         enddo
+      endif
+
+      if( axis .eq.  3 )then
+         do  i2=1,n2
+            do  i1=1,n1
+               indx = indx + 1
+               u(i1,i2,1) = buff(indx, buff_id )
+            enddo
+         enddo
+      endif
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine comm1p_ex( axis, u, n1, n2, n3, kk )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+      include 'globals.h'
+
+      integer axis, dir, n1, n2, n3
+      double precision u( n1, n2, n3 )
+
+      integer i3, i2, i1, buff_len,buff_id
+      integer i, kk, indx
+
+      if( take_ex( axis, kk ) ) then
+
+         dir = -1
+
+         buff_id = 3 + dir
+         buff_len = nm2
+
+         do  i=1,nm2
+            buff(i,buff_id) = 0.0D0
+         enddo
+
+
+         dir = +1
+
+         buff_id = 3 + dir
+         buff_len = nm2
+
+         do  i=1,nm2
+            buff(i,buff_id) = 0.0D0
+         enddo
+
+
+         dir = -1
+
+         buff_id = 3 + dir
+         indx = 0
+
+         if( axis .eq.  1 )then
+            do  i3=1,n3
+               do  i2=1,n2
+                  indx = indx + 1
+                  u(n1,i2,i3) = buff(indx, buff_id )
+               enddo
+            enddo
+         endif
+
+         if( axis .eq.  2 )then
+            do  i3=1,n3
+               do  i1=1,n1
+                  indx = indx + 1
+                  u(i1,n2,i3) = buff(indx, buff_id )
+               enddo
+            enddo
+         endif
+
+         if( axis .eq.  3 )then
+            do  i2=1,n2
+               do  i1=1,n1
+                  indx = indx + 1
+                  u(i1,i2,n3) = buff(indx, buff_id )
+               enddo
+            enddo
+         endif
+
+         dir = +1
+
+         buff_id = 3 + dir
+         indx = 0
+
+         if( axis .eq.  1 )then
+            do  i3=1,n3
+               do  i2=1,n2
+                  do  i1=1,2
+                     indx = indx + 1
+                     u(i1,i2,i3) = buff(indx,buff_id)
+                  enddo
+               enddo
+            enddo
+         endif
+
+         if( axis .eq.  2 )then
+            do  i3=1,n3
+               do  i2=1,2
+                  do  i1=1,n1
+                     indx = indx + 1
+                     u(i1,i2,i3) = buff(indx,buff_id)
+                  enddo
+               enddo
+            enddo
+         endif
+
+         if( axis .eq.  3 )then
+            do  i3=1,2
+               do  i2=1,n2
+                  do  i1=1,n1
+                     indx = indx + 1
+                     u(i1,i2,i3) = buff(indx,buff_id)
+                  enddo
+               enddo
+            enddo
+         endif
+
+      endif
+
+      if( give_ex( axis, kk ) )then
+
+         dir = +1
+
+         buff_id = 2 + dir 
+         buff_len = 0
+
+         if( axis .eq.  1 )then
+            do  i3=1,n3
+               do  i2=1,n2
+                  do  i1=n1-1,n1
+                     buff_len = buff_len + 1
+                     buff(buff_len,buff_id)= u(i1,i2,i3)
+                  enddo
+               enddo
+            enddo
+         endif
+
+         if( axis .eq.  2 )then
+            do  i3=1,n3
+               do  i2=n2-1,n2
+                  do  i1=1,n1
+                     buff_len = buff_len + 1
+                     buff(buff_len,buff_id )= u(i1,i2,i3)
+                  enddo
+               enddo
+            enddo
+         endif
+
+         if( axis .eq.  3 )then
+            do  i3=n3-1,n3
+               do  i2=1,n2
+                  do  i1=1,n1
+                     buff_len = buff_len + 1
+                     buff(buff_len, buff_id ) = u( i1,i2,i3)
+                  enddo
+               enddo
+            enddo
+         endif
+
+         dir = -1
+
+         buff_id = 2 + dir 
+         buff_len = 0
+
+         if( axis .eq.  1 )then
+            do  i3=1,n3
+               do  i2=1,n2
+                  buff_len = buff_len + 1
+                  buff(buff_len,buff_id ) = u( 2,  i2,i3)
+               enddo
+            enddo
+         endif
+
+         if( axis .eq.  2 )then
+            do  i3=1,n3
+               do  i1=1,n1
+                  buff_len = buff_len + 1
+                  buff(buff_len, buff_id ) = u( i1,  2,i3)
+               enddo
+            enddo
+         endif
+
+         if( axis .eq.  3 )then
+            do  i2=1,n2
+               do  i1=1,n1
+                  buff_len = buff_len + 1
+                  buff(buff_len, buff_id ) = u( i1,i2,2)
+               enddo
+            enddo
+         endif
+
+      endif
+
+      do  i=1,nm2
+         buff(i,4) = buff(i,3)
+         buff(i,2) = buff(i,1)
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine zran3(z,n1,n2,n3,nx,ny,k)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     zran3  loads +1 at ten randomly chosen points,
+c     loads -1 at a different ten random points,
+c     and zero elsewhere.
+c---------------------------------------------------------------------
+      implicit none
+
+      include 'mpinpb.h'
+
+      integer  is1, is2, is3, ie1, ie2, ie3
+      common /grid/ is1,is2,is3,ie1,ie2,ie3
+
+      integer n1, n2, n3, k, nx, ny, ierr, i0, m0, m1
+      double precision z(n1,n2,n3)
+
+      integer mm, i1, i2, i3, d1, e1, e2, e3
+      double precision x, a
+      double precision xx, x0, x1, a1, a2, ai, power
+      parameter( mm = 10,  a = 5.D0 ** 13, x = 314159265.D0)
+      double precision ten( mm, 0:1 ), temp, best
+      integer i, j1( mm, 0:1 ), j2( mm, 0:1 ), j3( mm, 0:1 )
+      integer jg( 0:3, mm, 0:1 ), jg_temp(4)
+
+      external randlc
+      double precision randlc, rdummy
+
+      a1 = power( a, nx, 1, 0 )
+      a2 = power( a, nx, ny, 0 )
+
+      call zero3(z,n1,n2,n3)
+
+c      i = is1-2+nx*(is2-2+ny*(is3-2))
+
+      ai = power( a, nx, is2-2+ny*(is3-2), is1-2 )
+      d1 = ie1 - is1 + 1
+      e1 = ie1 - is1 + 2
+      e2 = ie2 - is2 + 2
+      e3 = ie3 - is3 + 2
+      x0 = x
+      rdummy = randlc( x0, ai )
+      do  i3 = 2, e3
+         x1 = x0
+         do  i2 = 2, e2
+            xx = x1
+            call vranlc( d1, xx, a, z( 2, i2, i3 ))
+            rdummy = randlc( x1, a1 )
+         enddo
+         rdummy = randlc( x0, a2 )
+      enddo
+
+c---------------------------------------------------------------------
+c       call comm3(z,n1,n2,n3)
+c       call showall(z,n1,n2,n3)
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     each processor looks for twenty candidates
+c---------------------------------------------------------------------
+      do  i=1,mm
+         ten( i, 1 ) = 0.0D0
+         j1( i, 1 ) = 0
+         j2( i, 1 ) = 0
+         j3( i, 1 ) = 0
+         ten( i, 0 ) = 1.0D0
+         j1( i, 0 ) = 0
+         j2( i, 0 ) = 0
+         j3( i, 0 ) = 0
+      enddo
+
+      do  i3=2,n3-1
+         do  i2=2,n2-1
+            do  i1=2,n1-1
+               if( z(i1,i2,i3) .gt. ten( 1, 1 ) )then
+                  ten(1,1) = z(i1,i2,i3) 
+                  j1(1,1) = i1
+                  j2(1,1) = i2
+                  j3(1,1) = i3
+                  call bubble( ten, j1, j2, j3, mm, 1 )
+               endif
+               if( z(i1,i2,i3) .lt. ten( 1, 0 ) )then
+                  ten(1,0) = z(i1,i2,i3) 
+                  j1(1,0) = i1
+                  j2(1,0) = i2
+                  j3(1,0) = i3
+                  call bubble( ten, j1, j2, j3, mm, 0 )
+               endif
+            enddo
+         enddo
+      enddo
+
+      call mpi_barrier(mpi_comm_world,ierr)
+
+c---------------------------------------------------------------------
+c     Now which of these are globally best?
+c---------------------------------------------------------------------
+      i1 = mm
+      i0 = mm
+      do  i=mm,1,-1
+
+         best = z( j1(i1,1), j2(i1,1), j3(i1,1) )
+         call mpi_allreduce(best,temp,1,dp_type,
+     >        mpi_max,mpi_comm_world,ierr)
+         best = temp
+         if(best.eq.z(j1(i1,1),j2(i1,1),j3(i1,1)))then
+            jg( 0, i, 1) = me
+            jg( 1, i, 1) = is1 - 2 + j1( i1, 1 ) 
+            jg( 2, i, 1) = is2 - 2 + j2( i1, 1 ) 
+            jg( 3, i, 1) = is3 - 2 + j3( i1, 1 ) 
+            i1 = i1-1
+         else
+            jg( 0, i, 1) = 0
+            jg( 1, i, 1) = 0
+            jg( 2, i, 1) = 0
+            jg( 3, i, 1) = 0
+         endif
+         ten( i, 1 ) = best
+         call mpi_allreduce(jg(0,i,1), jg_temp,4,MPI_INTEGER,
+     >        mpi_max,mpi_comm_world,ierr)
+         jg( 0, i, 1) =  jg_temp(1)
+         jg( 1, i, 1) =  jg_temp(2)
+         jg( 2, i, 1) =  jg_temp(3)
+         jg( 3, i, 1) =  jg_temp(4)
+
+         best = z( j1(i0,0), j2(i0,0), j3(i0,0) )
+         call mpi_allreduce(best,temp,1,dp_type,
+     >        mpi_min,mpi_comm_world,ierr)
+         best = temp
+         if(best.eq.z(j1(i0,0),j2(i0,0),j3(i0,0)))then
+            jg( 0, i, 0) = me
+            jg( 1, i, 0) = is1 - 2 + j1( i0, 0 ) 
+            jg( 2, i, 0) = is2 - 2 + j2( i0, 0 ) 
+            jg( 3, i, 0) = is3 - 2 + j3( i0, 0 ) 
+            i0 = i0-1
+         else
+            jg( 0, i, 0) = 0
+            jg( 1, i, 0) = 0
+            jg( 2, i, 0) = 0
+            jg( 3, i, 0) = 0
+         endif
+         ten( i, 0 ) = best
+         call mpi_allreduce(jg(0,i,0), jg_temp,4,MPI_INTEGER,
+     >        mpi_max,mpi_comm_world,ierr)
+         jg( 0, i, 0) =  jg_temp(1)
+         jg( 1, i, 0) =  jg_temp(2)
+         jg( 2, i, 0) =  jg_temp(3)
+         jg( 3, i, 0) =  jg_temp(4)
+
+      enddo
+      m1 = i1+1
+      m0 = i0+1
+
+c      if( me .eq. root) then
+c         write(*,*)' '
+c         write(*,*)' negative charges at'
+c         write(*,9)(jg(1,i,0),jg(2,i,0),jg(3,i,0),i=1,mm)
+c         write(*,*)' positive charges at'
+c         write(*,9)(jg(1,i,1),jg(2,i,1),jg(3,i,1),i=1,mm)
+c         write(*,*)' small random numbers were'
+c         write(*,8)(ten( i,0),i=mm,1,-1)
+c         write(*,*)' and they were found on processor number'
+c         write(*,7)(jg(0,i,0),i=mm,1,-1)
+c         write(*,*)' large random numbers were'
+c         write(*,8)(ten( i,1),i=mm,1,-1)
+c         write(*,*)' and they were found on processor number'
+c         write(*,7)(jg(0,i,1),i=mm,1,-1)
+c      endif
+c 9    format(5(' (',i3,2(',',i3),')'))
+c 8    format(5D15.8)
+c 7    format(10i4)
+      call mpi_barrier(mpi_comm_world,ierr)
+      do  i3=1,n3
+         do  i2=1,n2
+            do  i1=1,n1
+               z(i1,i2,i3) = 0.0D0
+            enddo
+         enddo
+      enddo
+      do  i=mm,m0,-1
+         z( j1(i,0), j2(i,0), j3(i,0) ) = -1.0D0
+      enddo
+      do  i=mm,m1,-1
+         z( j1(i,1), j2(i,1), j3(i,1) ) = +1.0D0
+      enddo
+      call comm3(z,n1,n2,n3,k)
+
+c---------------------------------------------------------------------
+c          call showall(z,n1,n2,n3)
+c---------------------------------------------------------------------
+
+      return 
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine show_l(z,n1,n2,n3)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+
+      integer n1,n2,n3,i1,i2,i3,ierr
+      double precision z(n1,n2,n3)
+      integer m1, m2, m3,i
+
+      m1 = min(n1,18)
+      m2 = min(n2,14)
+      m3 = min(n3,18)
+
+      write(*,*)'  '
+      do  i=0,nprocs-1
+         if( me .eq. i )then
+            write(*,*)' id = ', me
+            do  i3=1,m3
+               do  i1=1,m1
+                  write(*,6)(z(i1,i2,i3),i2=1,m2)
+               enddo
+               write(*,*)' - - - - - - - '
+            enddo
+            write(*,*)'  '
+ 6          format(6f15.11)
+         endif
+         call mpi_barrier(mpi_comm_world,ierr)
+      enddo
+
+      return 
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine showall(z,n1,n2,n3)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+
+      integer n1,n2,n3,i1,i2,i3,i,ierr
+      double precision z(n1,n2,n3)
+      integer m1, m2, m3
+
+      m1 = min(n1,18)
+      m2 = min(n2,14)
+      m3 = min(n3,18)
+
+      write(*,*)'  '
+      do  i=0,nprocs-1
+         if( me .eq. i )then
+            write(*,*)' id = ', me
+            do  i3=1,m3
+               do  i1=1,m1
+                  write(*,6)(z(i1,i2,i3),i2=1,m2)
+               enddo
+               write(*,*)' - - - - - - - '
+            enddo
+            write(*,*)'  '
+ 6          format(15f6.3)
+         endif
+         call mpi_barrier(mpi_comm_world,ierr)
+      enddo
+
+      return 
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine show(z,n1,n2,n3)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+      integer n1,n2,n3,i1,i2,i3,ierr,i
+      double precision z(n1,n2,n3)
+
+      write(*,*)'  '
+      do  i=0,nprocs-1
+         if( me .eq. i )then
+            write(*,*)' id = ', me
+            do  i3=2,n3-1
+               do  i1=2,n1-1
+                  write(*,6)(z(i1,i2,i3),i2=2,n1-1)
+               enddo
+               write(*,*)' - - - - - - - '
+            enddo
+            write(*,*)'  '
+ 6          format(8D10.3)
+         endif
+         call mpi_barrier(mpi_comm_world,ierr)
+      enddo
+
+c     call comm3(z,n1,n2,n3)
+
+      return 
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      double precision function power( a, n1, n2, n3 )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     power  raises an integer, disguised as a double
+c     precision real, to an integer power.
+c     This version tries to avoid integer overflow by treating
+c     it as expressed in a form of "n1*n2+n3".
+c---------------------------------------------------------------------
+      implicit none
+
+      double precision a, aj
+      integer n1, n2, n3
+
+      integer n1j, n2j, nj
+      external randlc
+      double precision randlc, rdummy
+
+      power = 1.0d0
+      aj = a
+      nj = n3
+      n1j = n1
+      n2j = n2
+ 100  continue
+
+      if( n2j .gt. 0 ) then
+         if( mod(n2j,2) .eq. 1 ) nj = nj + n1j
+         n2j = n2j/2
+      else if( nj .eq. 0 ) then
+         go to 200
+      endif
+      if( mod(nj,2) .eq. 1 ) rdummy =  randlc( power, aj )
+      rdummy = randlc( aj, aj )
+      nj = nj/2
+      go to 100
+
+ 200  continue
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine bubble( ten, j1, j2, j3, m, ind )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     bubble        does a bubble sort in direction dir
+c---------------------------------------------------------------------
+      implicit none
+
+      include 'mpinpb.h'
+
+      integer m, ind, j1( m, 0:1 ), j2( m, 0:1 ), j3( m, 0:1 )
+      double precision ten( m, 0:1 )
+      double precision temp
+      integer i, j_temp
+
+      if( ind .eq. 1 )then
+
+         do  i=1,m-1
+            if( ten(i,ind) .gt. ten(i+1,ind) )then
+
+               temp = ten( i+1, ind )
+               ten( i+1, ind ) = ten( i, ind )
+               ten( i, ind ) = temp
+
+               j_temp           = j1( i+1, ind )
+               j1( i+1, ind ) = j1( i,   ind )
+               j1( i,   ind ) = j_temp
+
+               j_temp           = j2( i+1, ind )
+               j2( i+1, ind ) = j2( i,   ind )
+               j2( i,   ind ) = j_temp
+
+               j_temp           = j3( i+1, ind )
+               j3( i+1, ind ) = j3( i,   ind )
+               j3( i,   ind ) = j_temp
+
+            else 
+               return
+            endif
+         enddo
+
+      else
+
+         do  i=1,m-1
+            if( ten(i,ind) .lt. ten(i+1,ind) )then
+
+               temp = ten( i+1, ind )
+               ten( i+1, ind ) = ten( i, ind )
+               ten( i, ind ) = temp
+
+               j_temp           = j1( i+1, ind )
+               j1( i+1, ind ) = j1( i,   ind )
+               j1( i,   ind ) = j_temp
+
+               j_temp           = j2( i+1, ind )
+               j2( i+1, ind ) = j2( i,   ind )
+               j2( i,   ind ) = j_temp
+
+               j_temp           = j3( i+1, ind )
+               j3( i+1, ind ) = j3( i,   ind )
+               j3( i,   ind ) = j_temp
+
+            else 
+               return
+            endif
+         enddo
+
+      endif
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine zero3(z,n1,n2,n3)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+      include 'mpinpb.h'
+
+      integer n1, n2, n3
+      double precision z(n1,n2,n3)
+      integer i1, i2, i3
+
+      do  i3=1,n3
+         do  i2=1,n2
+            do  i1=1,n1
+               z(i1,i2,i3)=0.0D0
+            enddo
+         enddo
+      enddo
+
+      return
+      end
+
+
+c----- end of program ------------------------------------------------
diff --git a/examples/smpi/NAS/MG/mg.input.sample b/examples/smpi/NAS/MG/mg.input.sample
new file mode 100644
index 0000000000..a4dcf81275
--- /dev/null
+++ b/examples/smpi/NAS/MG/mg.input.sample
@@ -0,0 +1,4 @@
+ 8 = top level
+ 256 256 256 = nx ny nz
+ 20 = nit
+ 0 0 0 0 0 0 0 0 = debug_vec
diff --git a/examples/smpi/NAS/MG/mpinpb.h b/examples/smpi/NAS/MG/mpinpb.h
new file mode 100644
index 0000000000..1f0368c0b7
--- /dev/null
+++ b/examples/smpi/NAS/MG/mpinpb.h
@@ -0,0 +1,9 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include           'mpif.h'
+
+      integer           me, nprocs, root, dp_type
+      common /mpistuff/ me, nprocs, root, dp_type
+
diff --git a/examples/smpi/NAS/MPI_dummy/Makefile b/examples/smpi/NAS/MPI_dummy/Makefile
new file mode 100644
index 0000000000..86288d7a1d
--- /dev/null
+++ b/examples/smpi/NAS/MPI_dummy/Makefile
@@ -0,0 +1,38 @@
+# Makefile for MPI dummy library. 
+# Must be edited for a specific machine. Does NOT read in 
+# the make.def file of NPB 2.3
+F77 = f77
+CC = cc
+AR = ar
+
+# Enable if either Cray or IBM: (no such flag for most machines: see wtime.h)
+# MACHINE	=	-DCRAY
+# MACHINE	=	-DIBM
+
+libmpi.a: mpi_dummy.o mpi_dummy_c.o wtime.o
+	$(AR) r libmpi.a mpi_dummy.o mpi_dummy_c.o wtime.o
+
+mpi_dummy.o: mpi_dummy.f mpif.h
+	$(F77) -c mpi_dummy.f
+# For a Cray C90, try:
+#	cf77 -dp -c mpi_dummy.f
+# For an IBM 590, try:
+#	xlf -c mpi_dummy.f
+
+mpi_dummy_c.o: mpi_dummy.c mpi.h
+	$(CC) -c ${MACHINE} -o mpi_dummy_c.o mpi_dummy.c
+
+wtime.o: wtime.c
+# For most machines or CRAY or IBM
+	$(CC) -c ${MACHINE} wtime.c
+# For a precise timer on an SGI Power Challenge, try:
+#	$(CC) -o wtime.o -c wtime_sgi64.c
+
+test: test.f
+	$(F77) -o test -I. test.f -L. -lmpi
+
+
+
+clean: 
+	- rm -f *~ *.o
+	- rm -f test libmpi.a
diff --git a/examples/smpi/NAS/MPI_dummy/README b/examples/smpi/NAS/MPI_dummy/README
new file mode 100644
index 0000000000..9096a0b350
--- /dev/null
+++ b/examples/smpi/NAS/MPI_dummy/README
@@ -0,0 +1,52 @@
+###########################################
+# NAS Parallel Benchmarks 2&3             #
+# MPI/F77/C                               #
+# Revision 3.3                            #
+# NASA Ames Research Center               #
+# npb@nas.nasa.gov                        #
+# http://www.nas.nasa.gov/Software/NPB/   #
+###########################################
+
+MPI Dummy Library
+
+
+The MPI dummy library is supplied as a convenience for people who do
+not have an MPI library but would like to try running on one processor
+anyway. The NPB 2.x/3.x benchmarks are designed so that they do not
+actually try to do any message passing when run on one node. The MPI
+dummy library is just that - a set of dummy MPI routines which don't
+do anything, but allow you to link the benchmarks. Actually they do a
+few things, but nothing important. Note that the dummy library is 
+sufficient only for the NPB 2.x/3.x benchmarks. It probably won't be
+useful for anything else because it implements only a handful of
+functions. 
+
+Because the dummy library is just an extra goody, and since we don't
+have an infinite amount of time, it may be a bit trickier to configure
+than the rest of the benchmarks. You need to:
+
+1. Find out how C and Fortran interact on your machine. On most machines, 
+the fortran functon foo(x) is declared in C as foo_(xp) where xp is 
+a pointer, not a value. On IBMs, it's just foo(xp). On Cray C90s, its
+FOO(xp). You can define CRAY or IBM to get these, or you need to
+edit wtime.c if you've got something else. 
+
+2. Edit the Makefile to compile mpi_dummy.f and wtime.c correctly
+for your machine (including -DCRAY or -DIBM if necessary). 
+
+3. The substitute MPI timer gives wall clock time, not CPU time. 
+If you're running on a timeshared machine, you may want to 
+use a CPU timer. Edit the function mpi_wtime() in mpi_dummy.f
+to change this timer. (NOTE: for official benchmark results, 
+ONLY wall clock times are valid. Using a CPU timer is ok 
+if you want to get things running, but don't report any results
+measured with a CPU timer. )
+
+TROUBLESHOOTING
+
+o Compiling or linking of the benchmark aborts because the dummy MPI
+  header file or the dummy MPI library cannot be found.
+  - the file make.dummy in subdirectory config relies on the use
+    of the -I"path" and -L"path" -l"library" constructs to pass
+    information to the compilers and linkers. Edit this file to conform
+    to your system.
diff --git a/examples/smpi/NAS/MPI_dummy/mpi.h b/examples/smpi/NAS/MPI_dummy/mpi.h
new file mode 100644
index 0000000000..70eb3138b4
--- /dev/null
+++ b/examples/smpi/NAS/MPI_dummy/mpi.h
@@ -0,0 +1,112 @@
+#define MPI_DOUBLE          1
+#define MPI_INT             2
+#define MPI_BYTE            3
+#define MPI_FLOAT           4
+#define MPI_LONG            5
+
+#define MPI_COMM_WORLD      0
+
+#define MPI_MAX             1
+#define MPI_SUM             2
+#define MPI_MIN             3
+
+#define MPI_SUCCESS         0
+#define MPI_ANY_SOURCE     -1
+#define MPI_ERR_OTHER      -1
+#define MPI_STATUS_SIZE     3
+
+
+/* 
+   Status object.  It is the only user-visible MPI data-structure 
+   The "count" field is PRIVATE; use MPI_Get_count to access it. 
+ */
+typedef struct { 
+    int count;
+    int MPI_SOURCE;
+    int MPI_TAG;
+    int MPI_ERROR;
+} MPI_Status;
+
+
+/* MPI request objects */
+typedef int MPI_Request;
+
+/* MPI datatype */
+typedef int MPI_Datatype;
+
+/* MPI comm */
+typedef int MPI_Comm;
+
+/* MPI operation */
+typedef int MPI_Op;
+
+
+
+/* Prototypes: */
+void  mpi_error( void );
+
+int   MPI_Irecv( void         *buf,
+                 int          count,
+                 MPI_Datatype datatype,
+                 int          source,
+                 int          tag,
+                 MPI_Comm     comm,
+                 MPI_Request  *request );
+
+int   MPI_Send( void         *buf,
+                int          count,
+                MPI_Datatype datatype,
+                int          dest,
+                int          tag,
+                MPI_Comm     comm );
+
+int   MPI_Wait( MPI_Request *request,
+                MPI_Status  *status );
+
+int   MPI_Init( int  *argc,
+                char ***argv );
+
+int   MPI_Comm_rank( MPI_Comm comm, 
+                     int      *rank );
+
+int   MPI_Comm_size( MPI_Comm comm, 
+                     int      *size );
+
+double MPI_Wtime( void );
+
+int  MPI_Barrier( MPI_Comm comm );
+
+int  MPI_Finalize( void );
+
+int  MPI_Allreduce( void         *sendbuf,
+                    void         *recvbuf,
+                    int          nitems,
+                    MPI_Datatype type,
+                    MPI_Op       op,
+                    MPI_Comm     comm );
+
+int  MPI_Reduce( void         *sendbuf,
+                 void         *recvbuf,
+                 int          nitems,
+                 MPI_Datatype type,
+                 MPI_Op       op,
+                 int          root,
+                 MPI_Comm     comm );
+
+int  MPI_Alltoall( void         *sendbuf,
+                   int          sendcount,
+                   MPI_Datatype sendtype,
+                   void         *recvbuf,
+                   int          recvcount,
+                   MPI_Datatype recvtype,
+                   MPI_Comm     comm );
+
+int  MPI_Alltoallv( void         *sendbuf,
+                    int          *sendcounts,
+                    int          *senddispl,
+                    MPI_Datatype sendtype,
+                    void         *recvbuf,
+                    int          *recvcounts,
+                    int          *recvdispl,
+                    MPI_Datatype recvtype,
+                    MPI_Comm     comm );
diff --git a/examples/smpi/NAS/MPI_dummy/mpi_dummy.c b/examples/smpi/NAS/MPI_dummy/mpi_dummy.c
new file mode 100644
index 0000000000..d2cbfb84bc
--- /dev/null
+++ b/examples/smpi/NAS/MPI_dummy/mpi_dummy.c
@@ -0,0 +1,267 @@
+#include "mpi.h"
+#include "wtime.h"
+#include <stdlib.h>
+
+
+
+void  mpi_error( void )
+{
+    printf( "mpi_error called\n" );
+    abort();
+}
+
+
+
+
+int   MPI_Irecv( void         *buf,
+                 int          count,
+                 MPI_Datatype datatype,
+                 int          source,
+                 int          tag,
+                 MPI_Comm     comm,
+                 MPI_Request  *request )
+{
+    mpi_error();
+    return( MPI_ERR_OTHER );
+}
+
+
+
+
+int   MPI_Recv( void         *buf,
+                int          count,
+                MPI_Datatype datatype,
+                int          source,
+                int          tag,
+                MPI_Comm     comm,
+                MPI_Status   *status )
+{
+    mpi_error();
+    return( MPI_ERR_OTHER );
+}
+
+
+
+
+int   MPI_Send( void         *buf,
+                int          count,
+                MPI_Datatype datatype,
+                int          dest,
+                int          tag,
+                MPI_Comm     comm )
+{
+    mpi_error();
+    return( MPI_ERR_OTHER );
+}
+
+
+
+
+int   MPI_Wait( MPI_Request *request,
+                MPI_Status  *status )
+{
+    mpi_error();
+    return( MPI_ERR_OTHER );
+}
+
+
+
+
+int   MPI_Init( int  *argc,
+                char ***argv )
+{
+    return( MPI_SUCCESS );
+}
+
+
+
+
+int   MPI_Comm_rank( MPI_Comm comm, 
+                     int      *rank )
+{
+    *rank = 0;
+    return( MPI_SUCCESS );
+}
+
+
+
+
+int   MPI_Comm_size( MPI_Comm comm, 
+                     int      *size )
+{
+    *size = 1;
+    return( MPI_SUCCESS );
+}
+
+
+
+
+double MPI_Wtime( void )
+{
+    void wtime();
+
+    double t;
+    wtime( &t );
+    return( t );
+}
+
+
+
+
+int  MPI_Barrier( MPI_Comm comm )
+{
+    return( MPI_SUCCESS );
+}
+
+
+
+
+int  MPI_Finalize( void )
+{
+    return( MPI_SUCCESS );
+}
+
+
+
+
+int  MPI_Allreduce( void         *sendbuf,
+                    void         *recvbuf,
+                    int          nitems,
+                    MPI_Datatype type,
+                    MPI_Op       op,
+                    MPI_Comm     comm )
+{
+    int i;
+    if( type == MPI_INT )
+    {
+        int *pd_sendbuf, *pd_recvbuf;
+        pd_sendbuf = (int *) sendbuf;    
+        pd_recvbuf = (int *) recvbuf;    
+        for( i=0; i<nitems; i++ )
+            *(pd_recvbuf+i) = *(pd_sendbuf+i);
+    }
+    if( type == MPI_LONG )
+    {
+        long *pd_sendbuf, *pd_recvbuf;
+        pd_sendbuf = (long *) sendbuf;    
+        pd_recvbuf = (long *) recvbuf;    
+        for( i=0; i<nitems; i++ )
+            *(pd_recvbuf+i) = *(pd_sendbuf+i);
+    }
+    if( type == MPI_DOUBLE )
+    {
+        double *pd_sendbuf, *pd_recvbuf;
+        pd_sendbuf = (double *) sendbuf;    
+        pd_recvbuf = (double *) recvbuf;    
+        for( i=0; i<nitems; i++ )
+            *(pd_recvbuf+i) = *(pd_sendbuf+i);
+    }
+    return( MPI_SUCCESS );
+}
+  
+
+
+
+int  MPI_Reduce( void         *sendbuf,
+                 void         *recvbuf,
+                 int          nitems,
+                 MPI_Datatype type,
+                 MPI_Op       op,
+                 int          root,
+                 MPI_Comm     comm )
+{
+    int i;
+    if( type == MPI_INT )
+    {
+        int *pi_sendbuf, *pi_recvbuf;
+        pi_sendbuf = (int *) sendbuf;    
+        pi_recvbuf = (int *) recvbuf;    
+        for( i=0; i<nitems; i++ )
+            *(pi_recvbuf+i) = *(pi_sendbuf+i);
+    }
+    if( type == MPI_LONG )
+    {
+        long *pi_sendbuf, *pi_recvbuf;
+        pi_sendbuf = (long *) sendbuf;    
+        pi_recvbuf = (long *) recvbuf;    
+        for( i=0; i<nitems; i++ )
+            *(pi_recvbuf+i) = *(pi_sendbuf+i);
+    }
+    if( type == MPI_DOUBLE )
+    {
+        double *pd_sendbuf, *pd_recvbuf;
+        pd_sendbuf = (double *) sendbuf;    
+        pd_recvbuf = (double *) recvbuf;    
+        for( i=0; i<nitems; i++ )
+            *(pd_recvbuf+i) = *(pd_sendbuf+i);
+    }
+    return( MPI_SUCCESS );
+}
+  
+
+
+
+int  MPI_Alltoall( void         *sendbuf,
+                   int          sendcount,
+                   MPI_Datatype sendtype,
+                   void         *recvbuf,
+                   int          recvcount,
+                   MPI_Datatype recvtype,
+                   MPI_Comm     comm )
+{
+    int i;
+    if( recvtype == MPI_INT )
+    {
+        int *pd_sendbuf, *pd_recvbuf;
+        pd_sendbuf = (int *) sendbuf;    
+        pd_recvbuf = (int *) recvbuf;    
+        for( i=0; i<sendcount; i++ )
+            *(pd_recvbuf+i) = *(pd_sendbuf+i);
+    }
+    if( recvtype == MPI_LONG )
+    {
+        long *pd_sendbuf, *pd_recvbuf;
+        pd_sendbuf = (long *) sendbuf;    
+        pd_recvbuf = (long *) recvbuf;    
+        for( i=0; i<sendcount; i++ )
+            *(pd_recvbuf+i) = *(pd_sendbuf+i);
+    }
+    return( MPI_SUCCESS );
+}
+  
+
+
+
+int  MPI_Alltoallv( void         *sendbuf,
+                    int          *sendcounts,
+                    int          *senddispl,
+                    MPI_Datatype sendtype,
+                    void         *recvbuf,
+                    int          *recvcounts,
+                    int          *recvdispl,
+                    MPI_Datatype recvtype,
+                    MPI_Comm     comm )
+{
+    int i;
+    if( recvtype == MPI_INT )
+    {
+        int *pd_sendbuf, *pd_recvbuf;
+        pd_sendbuf = (int *) sendbuf;    
+        pd_recvbuf = (int *) recvbuf;    
+        for( i=0; i<sendcounts[0]; i++ )
+            *(pd_recvbuf+i+recvdispl[0]) = *(pd_sendbuf+i+senddispl[0]);
+    }
+    if( recvtype == MPI_LONG )
+    {
+        long *pd_sendbuf, *pd_recvbuf;
+        pd_sendbuf = (long *) sendbuf;    
+        pd_recvbuf = (long *) recvbuf;    
+        for( i=0; i<sendcounts[0]; i++ )
+            *(pd_recvbuf+i+recvdispl[0]) = *(pd_sendbuf+i+senddispl[0]);
+    }
+    return( MPI_SUCCESS );
+}
+  
+
+
+
diff --git a/examples/smpi/NAS/MPI_dummy/mpi_dummy.f b/examples/smpi/NAS/MPI_dummy/mpi_dummy.f
new file mode 100644
index 0000000000..2550aa3452
--- /dev/null
+++ b/examples/smpi/NAS/MPI_dummy/mpi_dummy.f
@@ -0,0 +1,309 @@
+      subroutine mpi_isend(buf,count,datatype,source,
+     & tag,comm,request,ierror)
+      integer buf(*), count,datatype,source,tag,comm,
+     & request,ierror
+      call mpi_error()
+      return
+      end  
+
+      subroutine mpi_irecv(buf,count,datatype,source,
+     & tag,comm,request,ierror)
+      integer buf(*), count,datatype,source,tag,comm,
+     & request,ierror
+      call mpi_error()
+      return
+      end
+
+      subroutine mpi_send(buf,count,datatype,dest,tag,comm,ierror)
+      integer buf(*), count,datatype,dest,tag,comm,ierror
+      call mpi_error()
+      return
+      end
+      
+      subroutine mpi_recv(buf,count,datatype,source,
+     & tag,comm,status,ierror)
+      integer buf(*), count,datatype,source,tag,comm,
+     & status(*),ierror
+      call mpi_error()
+      return
+      end
+
+      subroutine mpi_comm_split(comm,color,key,newcomm,ierror)
+      integer comm,color,key,newcomm,ierror
+      return
+      end
+
+      subroutine mpi_comm_rank(comm, rank,ierr)
+      implicit none
+      integer comm, rank,ierr
+      rank = 0
+      return
+      end
+
+      subroutine mpi_comm_size(comm, size, ierr)
+      implicit none
+      integer comm, size, ierr
+      size = 1
+      return
+      end
+
+      double precision function mpi_wtime()
+      implicit none
+      double precision t
+c This function must measure wall clock time, not CPU time. 
+c Since there is no portable timer in Fortran (77)
+c we call a routine compiled in C (though the C source may have
+c to be tweaked). 
+      call wtime(t)
+c The following is not ok for "official" results because it reports
+c CPU time not wall clock time. It may be useful for developing/testing
+c on timeshared Crays, though. 
+c     call second(t)
+
+      mpi_wtime = t
+
+      return
+      end
+
+
+c may be valid to call this in single processor case
+      subroutine mpi_barrier(comm,ierror)
+      return
+      end
+
+c may be valid to call this in single processor case
+      subroutine mpi_bcast(buf, nitems, type, root, comm, ierr)
+      implicit none
+      integer buf(*), nitems, type, root, comm, ierr
+      return
+      end
+
+      subroutine mpi_comm_dup(oldcomm, newcomm,ierror)
+      integer oldcomm, newcomm,ierror
+      newcomm= oldcomm
+      return
+      end
+
+      subroutine mpi_error()
+      print *, 'mpi_error called'
+      stop
+      end 
+
+      subroutine mpi_abort(comm, errcode, ierr)
+      implicit none
+      integer comm, errcode, ierr
+      print *, 'mpi_abort called'
+      stop
+      end
+
+      subroutine mpi_finalize(ierr)
+      return
+      end
+
+      subroutine mpi_init(ierr)
+      return
+      end
+
+
+c assume double precision, which is all SP uses 
+      subroutine mpi_reduce(inbuf, outbuf, nitems, 
+     $                      type, op, root, comm, ierr)
+      implicit none
+      include 'mpif.h'
+      integer nitems, type, op, root, comm, ierr
+      double precision inbuf(*), outbuf(*)
+
+      if (type .eq. mpi_double_precision) then
+         call mpi_reduce_dp(inbuf, outbuf, nitems, 
+     $                      type, op, root, comm, ierr)
+      else if (type .eq.  mpi_double_complex) then
+         call mpi_reduce_dc(inbuf, outbuf, nitems, 
+     $                      type, op, root, comm, ierr)
+      else if (type .eq.  mpi_complex) then
+         call mpi_reduce_complex(inbuf, outbuf, nitems, 
+     $                      type, op, root, comm, ierr)
+      else if (type .eq.  mpi_real) then
+         call mpi_reduce_real(inbuf, outbuf, nitems, 
+     $                      type, op, root, comm, ierr)
+      else if (type .eq.  mpi_integer) then
+         call mpi_reduce_int(inbuf, outbuf, nitems, 
+     $                      type, op, root, comm, ierr)
+      else 
+         print *, 'mpi_reduce: unknown type ', type
+      end if
+      return
+      end
+
+
+      subroutine mpi_reduce_real(inbuf, outbuf, nitems, 
+     $                      type, op, root, comm, ierr)
+      implicit none
+      integer nitems, type, op, root, comm, ierr, i
+      real inbuf(*), outbuf(*)
+      do i = 1, nitems
+         outbuf(i) = inbuf(i)
+      end do
+      
+      return
+      end
+
+      subroutine mpi_reduce_dp(inbuf, outbuf, nitems, 
+     $                      type, op, root, comm, ierr)
+      implicit none
+      integer nitems, type, op, root, comm, ierr, i
+      double precision inbuf(*), outbuf(*)
+      do i = 1, nitems
+         outbuf(i) = inbuf(i)
+      end do
+      
+      return
+      end
+
+      subroutine mpi_reduce_dc(inbuf, outbuf, nitems, 
+     $                      type, op, root, comm, ierr)
+      implicit none
+      integer nitems, type, op, root, comm, ierr, i
+      double complex inbuf(*), outbuf(*)
+      do i = 1, nitems
+         outbuf(i) = inbuf(i)
+      end do
+      
+      return
+      end
+
+
+      subroutine mpi_reduce_complex(inbuf, outbuf, nitems, 
+     $                      type, op, root, comm, ierr)
+      implicit none
+      integer nitems, type, op, root, comm, ierr, i
+      complex inbuf(*), outbuf(*)
+      do i = 1, nitems
+         outbuf(i) = inbuf(i)
+      end do
+      
+      return
+      end
+
+      subroutine mpi_reduce_int(inbuf, outbuf, nitems, 
+     $                      type, op, root, comm, ierr)
+      implicit none
+      integer nitems, type, op, root, comm, ierr, i
+      integer inbuf(*), outbuf(*)
+      do i = 1, nitems
+         outbuf(i) = inbuf(i)
+      end do
+      
+      return
+      end
+
+      subroutine mpi_allreduce(inbuf, outbuf, nitems, 
+     $                      type, op, comm, ierr)
+      implicit none
+      integer nitems, type, op, comm, ierr
+      double precision inbuf(*), outbuf(*)
+
+      call mpi_reduce(inbuf, outbuf, nitems, 
+     $                      type, op, 0, comm, ierr)
+      return
+      end
+
+      subroutine mpi_alltoall(inbuf, nitems, type, outbuf, nitems_dum, 
+     $                        type_dum, comm, ierr)
+      implicit none
+      include 'mpif.h'
+      integer nitems, type, comm, ierr, nitems_dum, type_dum
+      double precision inbuf(*), outbuf(*)
+      if (type .eq. mpi_double_precision) then
+         call mpi_alltoall_dp(inbuf, outbuf, nitems, 
+     $                      type, comm, ierr)
+      else if (type .eq.  mpi_double_complex) then
+         call mpi_alltoall_dc(inbuf, outbuf, nitems, 
+     $                      type, comm, ierr)
+      else if (type .eq.  mpi_complex) then
+         call mpi_alltoall_complex(inbuf, outbuf, nitems, 
+     $                      type, comm, ierr)
+      else if (type .eq.  mpi_real) then
+         call mpi_alltoall_real(inbuf, outbuf, nitems, 
+     $                      type, comm, ierr)
+      else if (type .eq.  mpi_integer) then
+         call mpi_alltoall_int(inbuf, outbuf, nitems, 
+     $                      type, comm, ierr)
+      else 
+         print *, 'mpi_alltoall: unknown type ', type
+      end if
+      return
+      end
+
+      subroutine mpi_alltoall_dc(inbuf, outbuf, nitems, 
+     $                           type, comm, ierr)
+      implicit none
+      integer nitems, type, comm, ierr, i
+      double complex inbuf(*), outbuf(*)
+      do i = 1, nitems
+         outbuf(i) = inbuf(i)
+      end do
+      
+      return
+      end
+
+
+      subroutine mpi_alltoall_complex(inbuf, outbuf, nitems, 
+     $                           type, comm, ierr)
+      implicit none
+      integer nitems, type, comm, ierr, i
+      double complex inbuf(*), outbuf(*)
+      do i = 1, nitems
+         outbuf(i) = inbuf(i)
+      end do
+      
+      return
+      end
+
+      subroutine mpi_alltoall_dp(inbuf, outbuf, nitems, 
+     $                           type, comm, ierr)
+      implicit none
+      integer nitems, type, comm, ierr, i
+      double precision inbuf(*), outbuf(*)
+      do i = 1, nitems
+         outbuf(i) = inbuf(i)
+      end do
+      
+      return
+      end
+
+      subroutine mpi_alltoall_real(inbuf, outbuf, nitems, 
+     $                             type, comm, ierr)
+      implicit none
+      integer nitems, type, comm, ierr, i
+      real inbuf(*), outbuf(*)
+      do i = 1, nitems
+         outbuf(i) = inbuf(i)
+      end do
+      
+      return
+      end
+
+      subroutine mpi_alltoall_int(inbuf, outbuf, nitems, 
+     $                            type, comm, ierr)
+      implicit none
+      integer nitems, type, comm, ierr, i
+      integer inbuf(*), outbuf(*)
+      do i = 1, nitems
+         outbuf(i) = inbuf(i)
+      end do
+      
+      return
+      end
+
+      subroutine mpi_wait(request,status,ierror)
+      integer request,status,ierror
+      call mpi_error()
+      return
+      end
+
+      subroutine mpi_waitall(count,requests,status,ierror)
+      integer count,requests(*),status(*),ierror
+      call mpi_error()
+      return
+      end
+
diff --git a/examples/smpi/NAS/MPI_dummy/mpif.h b/examples/smpi/NAS/MPI_dummy/mpif.h
new file mode 100644
index 0000000000..92686aa361
--- /dev/null
+++ b/examples/smpi/NAS/MPI_dummy/mpif.h
@@ -0,0 +1,27 @@
+      integer mpi_comm_world
+      parameter (mpi_comm_world = 0)
+
+      integer mpi_max, mpi_min, mpi_sum
+      parameter (mpi_max = 1, mpi_sum = 2, mpi_min = 3)
+
+      integer mpi_byte, mpi_integer, mpi_real,
+     >                  mpi_double_precision,  mpi_complex,
+     >                  mpi_double_complex
+      parameter (mpi_double_precision = 1,
+     $           mpi_integer = 2, 
+     $           mpi_byte = 3, 
+     $           mpi_real= 4, 
+     $           mpi_complex = 5,
+     $           mpi_double_complex = 6)
+
+      integer mpi_any_source
+      parameter (mpi_any_source = -1)
+
+      integer mpi_err_other
+      parameter (mpi_err_other = -1)
+
+      double precision mpi_wtime
+      external mpi_wtime
+
+      integer mpi_status_size
+      parameter (mpi_status_size=3)
diff --git a/examples/smpi/NAS/MPI_dummy/test.f b/examples/smpi/NAS/MPI_dummy/test.f
new file mode 100644
index 0000000000..081c73c72f
--- /dev/null
+++ b/examples/smpi/NAS/MPI_dummy/test.f
@@ -0,0 +1,10 @@
+      program
+      implicit none
+      double precision t, mpi_wtime
+      external mpi_wtime
+      t = 0.0
+      t = mpi_wtime()
+      print *, t
+      t = mpi_wtime()
+      print *, t
+      end
diff --git a/examples/smpi/NAS/MPI_dummy/wtime.c b/examples/smpi/NAS/MPI_dummy/wtime.c
new file mode 100644
index 0000000000..221d2225ae
--- /dev/null
+++ b/examples/smpi/NAS/MPI_dummy/wtime.c
@@ -0,0 +1,13 @@
+#include "wtime.h"
+#include <sys/time.h>
+
+void wtime(double *t)
+{
+  static int sec = -1;
+  struct timeval tv;
+  gettimeofday(&tv, (void *)0);
+  if (sec < 0) sec = tv.tv_sec;
+  *t = (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec;
+}
+
+    
diff --git a/examples/smpi/NAS/MPI_dummy/wtime.f b/examples/smpi/NAS/MPI_dummy/wtime.f
new file mode 100644
index 0000000000..a1cfde9aa3
--- /dev/null
+++ b/examples/smpi/NAS/MPI_dummy/wtime.f
@@ -0,0 +1,12 @@
+      subroutine wtime(tim)
+      real*8 tim
+      dimension tarray(2)
+      call etime(tarray)
+      tim = tarray(1)
+      return
+      end
+
+
+
+
+
diff --git a/examples/smpi/NAS/MPI_dummy/wtime.h b/examples/smpi/NAS/MPI_dummy/wtime.h
new file mode 100644
index 0000000000..12eb0cb0ee
--- /dev/null
+++ b/examples/smpi/NAS/MPI_dummy/wtime.h
@@ -0,0 +1,12 @@
+/* C/Fortran interface is different on different machines. 
+ * You may need to tweak this.
+ */
+
+
+#if defined(IBM)
+#define wtime wtime
+#elif defined(CRAY)
+#define wtime WTIME
+#else
+#define wtime wtime_
+#endif
diff --git a/examples/smpi/NAS/MPI_dummy/wtime_sgi64.c b/examples/smpi/NAS/MPI_dummy/wtime_sgi64.c
new file mode 100644
index 0000000000..d08d50cd34
--- /dev/null
+++ b/examples/smpi/NAS/MPI_dummy/wtime_sgi64.c
@@ -0,0 +1,74 @@
+#include <sys/types.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/syssgi.h>
+#include <sys/immu.h>
+#include <errno.h>
+#include <stdio.h>
+
+/* The following works on SGI Power Challenge systems */
+
+typedef unsigned long iotimer_t;
+
+unsigned int cycleval;
+volatile iotimer_t *iotimer_addr, base_counter;
+double resolution;
+
+/* address_t is an integer type big enough to hold an address */
+typedef unsigned long address_t;
+
+
+
+void timer_init() 
+{
+  
+  int fd;
+  char *virt_addr;
+  address_t phys_addr, page_offset, pagemask, pagebase_addr;
+  
+  pagemask = getpagesize() - 1;
+  errno = 0;
+  phys_addr = syssgi(SGI_QUERY_CYCLECNTR, &cycleval);
+  if (errno != 0) {
+    perror("SGI_QUERY_CYCLECNTR");
+    exit(1);
+  }
+  /* rel_addr = page offset of physical address */
+  page_offset = phys_addr & pagemask;
+  pagebase_addr = phys_addr - page_offset;
+  fd = open("/dev/mmem", O_RDONLY);
+
+  virt_addr = mmap(0, pagemask, PROT_READ, MAP_PRIVATE, fd, pagebase_addr);
+  virt_addr = virt_addr + page_offset;
+  iotimer_addr = (iotimer_t *)virt_addr;
+  /* cycleval in picoseconds to this gives resolution in seconds */
+  resolution = 1.0e-12*cycleval; 
+  base_counter = *iotimer_addr;
+}
+
+void wtime_(double *time) 
+{
+  static int initialized = 0;
+  volatile iotimer_t counter_value;
+  if (!initialized) { 
+    timer_init();
+    initialized = 1;
+  }
+  counter_value = *iotimer_addr - base_counter;
+  *time = (double)counter_value * resolution;
+}
+
+
+void wtime(double *time) 
+{
+  static int initialized = 0;
+  volatile iotimer_t counter_value;
+  if (!initialized) { 
+    timer_init();
+    initialized = 1;
+  }
+  counter_value = *iotimer_addr - base_counter;
+  *time = (double)counter_value * resolution;
+}
+
+
diff --git a/examples/smpi/NAS/Makefile b/examples/smpi/NAS/Makefile
new file mode 100644
index 0000000000..8f356aa71d
--- /dev/null
+++ b/examples/smpi/NAS/Makefile
@@ -0,0 +1,69 @@
+SHELL=/bin/sh
+CLASS=U
+NPROCS=1
+SUBTYPE=
+VERSION=
+SFILE=config/suite.def
+
+default: header
+	@ sys/print_instructions
+
+BT: bt
+bt: header
+	cd BT; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) SUBTYPE=$(SUBTYPE) VERSION=$(VERSION)
+
+SP: sp
+sp: header
+	cd SP; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
+
+LU: lu
+lu: header
+	cd LU; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) VERSION=$(VERSION)
+
+MG: mg
+mg: header
+	cd MG; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
+
+FT: ft
+ft: header
+	cd FT; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
+
+IS: is
+is: header
+	cd IS; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
+
+CG: cg
+cg: header
+	cd CG; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
+
+EP: ep
+ep: header
+	cd EP; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
+
+DT: dt
+dt: header
+	cd DT; $(MAKE) CLASS=$(CLASS)
+
+# Awk script courtesy cmg@cray.com, modified by Haoqiang Jin
+suite:
+	@ awk -f sys/suite.awk SMAKE=$(MAKE) $(SFILE) | $(SHELL)
+
+
+# It would be nice to make clean in each subdirectory (the targets
+# are defined) but on a really clean system this will won't work
+# because those makefiles need config/make.def
+clean:
+	- rm -f core 
+	- rm -f *~ */core */*~ */*.o */npbparams.h */*.obj */*.exe
+	- rm -f MPI_dummy/test MPI_dummy/libmpi.a
+	- rm -f sys/setparams sys/makesuite sys/setparams.h
+	- rm -f btio.*.out*
+
+veryclean: clean
+	- rm -f config/make.def config/suite.def 
+	- rm -f bin/sp.* bin/lu.* bin/mg.* bin/ft.* bin/bt.* bin/is.* 
+	- rm -f bin/ep.* bin/cg.* bin/dt.*
+
+header:
+	@ sys/print_header
+
diff --git a/examples/smpi/NAS/README b/examples/smpi/NAS/README
new file mode 100644
index 0000000000..a80f5d6d70
--- /dev/null
+++ b/examples/smpi/NAS/README
@@ -0,0 +1,52 @@
+*** Warning ***
+
+This version of benchmarks IS, DT and EP contain special
+tweaks to work with SMPI.
+
+
+
+The MPI implementation of NPB 3.3 (NPB3.3-MPI)
+--------------------------------------------------
+
+For problem reports and suggestions on the implementation, 
+please contact:
+
+   NAS Parallel Benchmark Team
+   npb@nas.nasa.gov
+
+   http://www.nas.nasa.gov/Software/NPB
+
+
+This directory contains the MPI implementation of the NAS
+Parallel Benchmarks, Version 3.3 (NPB3.3-MPI).  A brief
+summary of the new features introduced in this version is
+given below.
+
+For changes from different versions, see the Changes.log file
+included in the upper directory of this distribution.
+
+For explanation of compilation and running of the benchmarks,
+please refer to README.install.  For a special note on DT, please
+see the README file in the DT subdirectory.
+
+
+New features in NPB3.3-MPI:
+  * NPB3.3-MPI introduces a new problem size (class E) to seven of  
+    the benchmarks (BT, SP, LU, CG, MG, FT, and EP).  The version 
+    also includes a new problem size (class D) for the IS benchmark, 
+    which was not present in the previous releases.
+
+  * The release is merged with the vector codes for the BT and LU
+    benchmarks, which can be selected with the VERSION=VEC option
+    during compilation.  However, it should be noted that successful
+    vectorization highly depends on the compiler used.  Some changes
+    to compiler directives for vectorization in the current codes
+    (see *_vec.f files) may be required.
+
+  * New improvements to BTIO (BT with IO subtypes):
+    - added I/O stats (I/O timing, data size written, I/O data rate)
+    - added an option for interleaving reads between writes through
+      the inputbt.data file.  Although the data file size would be
+      smaller as a result, the total amount of data written is still
+      the same.
+
diff --git a/examples/smpi/NAS/README.install b/examples/smpi/NAS/README.install
new file mode 100644
index 0000000000..47a3139562
--- /dev/null
+++ b/examples/smpi/NAS/README.install
@@ -0,0 +1,156 @@
+Some explanations on the MPI implementation of NPB 3.3 (NPB3.3-MPI)
+----------------------------------------------------------------------
+
+NPB-MPI is a sample MPI implementation based on NPB2.4 and NPB3.0-SER.
+This implementation contains all eight original benchmarks:
+Seven in Fortran: BT, SP, LU, FT, CG, MG, and EP; one in C: IS,
+as well as the DT benchmark, written in C, introduced in NPB3.2-MPI.
+
+For changes from different versions, see the Changes.log file
+included in the upper directory of this distribution.
+
+This version has been tested, among others, on an SGI Origin3000 and
+an SGI Altix.  For problem reports and suggestions on the implementation, 
+please contact
+
+   NAS Parallel Benchmark Team
+   npb@nas.nasa.gov
+
+
+CAUTION *********************************
+When running the I/O benchmark, one or more data files will be written
+in the directory from which the executable is invoked. They are not
+deleted at the end of the program. A new run will overwrite the old
+file(s). If not enough space is available in the user partition, the
+program will fail. For classes C and D the disk space required is
+3 GB and 135 GB, respectively.
+*****************************************
+
+
+1. Compilation
+
+   NPB3-MPI uses the same directory tree as NPB3-SER (and NPB2.x) does.
+   Before compilation, one needs to check the configuration file
+   'make.def' in the config directory and modify the file if necessary.  
+   If it does not (yet) exist, copy 'make.def.template' or one of the
+   sample files in the NAS.samples subdirectory to 'make.def' and
+   edit the content for site- and machine-specific data.  Then
+
+       make <benchmark-name> NPROCS=<number> CLASS=<class> \
+         [SUBTYPE=<type>] [VERSION=VEC]
+
+   where <benchmark-name>  is "bt", "cg", "dt", "ep", "ft", "is", 
+                              "lu", "mg", or "sp"
+         <number>          is the number of processes
+         <class>           is "S", "W", "A", "B", "C", "D", or "E"
+
+   Classes C, D and E are not available for DT.
+   Class E is not available for IS.
+
+   The "VERSION=VEC" option is used for selecting the vectorized 
+   versions of BT and LU.
+
+   Only when making the I/O benchmark:
+         <benchmark-name>  is "bt"
+         <number>, <class> as above
+         <type>            is "full", "simple", "fortran", or "epio"
+
+   Three parameters not used in the original BT benchmark are present in
+   the I/O benchmark. Two are set by default in the file BT/bt.f. 
+   Changing them is optional.
+   One is set in make.def. It must be specified.
+
+   bt.f: collbuf_nodes: number of processes used to buffer data before
+                        writing to file in the collective buffering mode
+                        (<type> is "full").
+         collbuf_size:  size of buffer (in bytes) per process used in
+                        collective buffering
+
+   make.def: -DFORTRAN_REC_SIZE: Fortran I/O record length in bytes. This
+                        is a system-specific value. It is part of the
+                        definition string of variable CONVERTFLAG. Syntax:
+                        "CONVERTFLAG = -DFORTRAN_REC_SIZE=n", where n is
+                        the record length.
+
+   When <type> is "full" or "simple", the code must be linked with an
+   MPI library that contains the subset of IO routines defined in MPI 2.
+
+
+   Class D for IS (Integer Sort) requires a compiler/system that 
+   supports the "long" type in C to be 64-bit.  As examples, the SGI 
+   MIPS compiler for the SGI Origin using the "-64" compilation flag and
+   the Intel compiler for IA64 are known to work.
+
+
+   The above procedure allows you to build one benchmark
+   at a time. To build a whole suite, you can type "make suite"
+   Make will look in file "config/suite.def" for a list of 
+   executables to build. The file contains one line per specification, 
+   with comments preceded by "#". Each line contains the name
+   of a benchmark, the class, and the number of processors, separated
+   by spaces or tabs. config/suite.def.template contains an example
+   of such a file.
+
+
+   The benchmarks have been designed so that they can be run
+   on a single processor without an MPI library. A few "dummy" 
+   MPI routines are still required for linking. For convenience
+   such a library is supplied in the "MPI_dummy" subdirectory of
+   the distribution. It contains an mpif.h and mpi.f include files
+   which must be used as well. The dummy library is built and
+   linked automatically and paths to the include files are defined
+   by inserting the line "include ../config/make.dummy" into the
+   make.def file (see example in make.def.template). Make sure to 
+   read the warnings in the README file in "MPI_dummy".The use of
+   the library is fragile and can produce unexpected errors.
+
+
+   ================================
+   
+   The "RAND" variable in make.def
+   --------------------------------
+   
+   Most of the NPBs use a random number generator. In two of the NPBs (FT
+   and EP) the computation of random numbers is included in the timed
+   part of the calculation, and it is important that the random number
+   generator be efficient.  The default random number generator package
+   provided is called "randi8" and should be used where possible. It has 
+   the following requirements:
+   
+   randi8:
+     1. Uses integer*8 arithmetic. Compiler must support integer*8
+     2. Uses the Fortran 90 IAND intrinsic. Compiler must support IAND.
+     3. Assumes overflow bits are discarded by the hardware. In particular, 
+        that the lowest 46 bits of a*b are always correct, even if the 
+        result a*b is larger than 2^64. 
+   
+   Since randi8 may not work on all machines, we supply the following
+   alternatives:
+   
+   randi8_safe
+     1. Uses integer*8 arithmetic
+     2. Uses the Fortran 90 IBITS intrinsic. 
+     3. Does not make any assumptions about overflow. Should always
+        work correctly if compiler supports integer*8 and IBITS. 
+   
+   randdp
+     1. Uses double precision arithmetic (to simulate integer*8 operations). 
+        Should work with any system with support for 64-bit floating
+        point arithmetic.      
+   
+   randdpvec
+     1. Similar to randdp but written to be easier to vectorize. 
+   
+   
+2. Execution
+
+   The executable is named <benchmark-name>.<class>.<nprocs>[.<suffix>],
+   where <suffix> is "fortran_io", "mpi_io_simple",  "ep_io", or 
+                     "mpi_io_full"
+   The executable is placed in the bin subdirectory (or in the directory 
+   BINDIR specified in make.def, if you've defined it). The method for 
+   running the MPI program depends on your local system.
+   When any of the I/O benchmarks is run (non-empty subtype), one or 
+   more output files are created, and placed in the directory from which
+   the program was started. These are not removed automatically, and 
+   will be overwritten the next time an IO benchmark is run.
diff --git a/examples/smpi/NAS/SP/Makefile b/examples/smpi/NAS/SP/Makefile
new file mode 100644
index 0000000000..01508aa935
--- /dev/null
+++ b/examples/smpi/NAS/SP/Makefile
@@ -0,0 +1,60 @@
+SHELL=/bin/sh
+BENCHMARK=sp
+BENCHMARKU=SP
+
+include ../config/make.def
+
+
+OBJS = sp.o make_set.o initialize.o exact_solution.o exact_rhs.o \
+       set_constants.o adi.o define.o copy_faces.o rhs.o      \
+       lhsx.o lhsy.o lhsz.o x_solve.o ninvr.o y_solve.o pinvr.o    \
+       z_solve.o tzetar.o add.o txinvr.o error.o verify.o setup_mpi.o \
+       ${COMMON}/print_results.o ${COMMON}/timers.o
+
+include ../sys/make.common
+
+# npbparams.h is included by header.h
+# The following rule should do the trick but many make programs (not gmake)
+# will do the wrong thing and rebuild the world every time (because the
+# mod time on header.h is not changed. One solution would be to 
+# touch header.h but this might cause confusion if someone has
+# accidentally deleted it. Instead, make the dependency on npbparams.h
+# explicit in all the lines below (even though dependence is indirect). 
+
+# header.h: npbparams.h
+
+${PROGRAM}: config ${OBJS}
+	${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
+
+.f.o:
+	${FCOMPILE} $<
+
+sp.o:             sp.f  header.h npbparams.h  mpinpb.h
+make_set.o:       make_set.f  header.h npbparams.h  mpinpb.h
+initialize.o:     initialize.f  header.h npbparams.h
+exact_solution.o: exact_solution.f  header.h npbparams.h
+exact_rhs.o:      exact_rhs.f  header.h npbparams.h
+set_constants.o:  set_constants.f  header.h npbparams.h
+adi.o:            adi.f  header.h npbparams.h
+define.o:         define.f  header.h npbparams.h
+copy_faces.o:     copy_faces.f  header.h npbparams.h  mpinpb.h
+rhs.o:            rhs.f  header.h npbparams.h
+lhsx.o:           lhsx.f  header.h npbparams.h
+lhsy.o:           lhsy.f  header.h npbparams.h
+lhsz.o:           lhsz.f  header.h npbparams.h
+x_solve.o:        x_solve.f  header.h npbparams.h  mpinpb.h
+ninvr.o:          ninvr.f  header.h npbparams.h
+y_solve.o:        y_solve.f  header.h npbparams.h  mpinpb.h
+pinvr.o:          pinvr.f  header.h npbparams.h
+z_solve.o:        z_solve.f  header.h npbparams.h  mpinpb.h
+tzetar.o:         tzetar.f  header.h npbparams.h
+add.o:            add.f  header.h npbparams.h
+txinvr.o:         txinvr.f  header.h npbparams.h
+error.o:          error.f  header.h npbparams.h  mpinpb.h
+verify.o:         verify.f  header.h npbparams.h  mpinpb.h
+setup_mpi.o:      setup_mpi.f mpinpb.h npbparams.h 
+
+
+clean:
+	- rm -f *.o *~ mputil*
+	- rm -f npbparams.h core
diff --git a/examples/smpi/NAS/SP/README b/examples/smpi/NAS/SP/README
new file mode 100644
index 0000000000..fe423db43f
--- /dev/null
+++ b/examples/smpi/NAS/SP/README
@@ -0,0 +1,17 @@
+
+This code implements a 3D Multi-partition algorithm for the solution 
+of the uncoupled systems of linear equations resulting from 
+Beam-Warming approximate factorization.  Consequently, the program 
+must be run on a square number of processors.  The included file 
+"npbparams.h" contains a parameter statement which sets "maxcells" 
+and "problem_size".  The parameter maxcells must be set to the 
+square root of the number of processors.  For example, if running 
+on 25 processors, then set max_cells=5.  The standard problem sizes 
+are problem_size=64 for class A, 102 for class B, and 162 for class C.
+
+The number of time steps and the time step size dt are set in the 
+npbparams.h but may be overridden in the input deck "inputsp.data".  
+The number of time steps is 400 for all three 
+standard problems, and the appropriate time step sizes "dt" are 
+0.0015d0 for class A, 0.001d0 for class B, and 0.00067 for class C.  
+
diff --git a/examples/smpi/NAS/SP/add.f b/examples/smpi/NAS/SP/add.f
new file mode 100644
index 0000000000..cdc4765cbf
--- /dev/null
+++ b/examples/smpi/NAS/SP/add.f
@@ -0,0 +1,31 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine  add
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c addition of update to the vector u
+c---------------------------------------------------------------------
+
+       include 'header.h'
+
+       integer  c, i, j, k, m
+
+       do  c = 1, ncells
+          do m = 1, 5
+             do  k = start(3,c), cell_size(3,c)-end(3,c)-1
+                do  j = start(2,c), cell_size(2,c)-end(2,c)-1
+                   do  i = start(1,c), cell_size(1,c)-end(1,c)-1
+                      u(i,j,k,m,c) = u(i,j,k,m,c) + rhs(i,j,k,m,c)
+                   end do
+                end do
+             end do
+          end do
+       end do
+
+       return
+       end
diff --git a/examples/smpi/NAS/SP/adi.f b/examples/smpi/NAS/SP/adi.f
new file mode 100644
index 0000000000..e55cfd60da
--- /dev/null
+++ b/examples/smpi/NAS/SP/adi.f
@@ -0,0 +1,24 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine  adi
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       call copy_faces
+
+       call txinvr
+
+       call x_solve
+
+       call y_solve
+
+       call z_solve
+
+       call add
+
+       return
+       end
+
diff --git a/examples/smpi/NAS/SP/copy_faces.f b/examples/smpi/NAS/SP/copy_faces.f
new file mode 100644
index 0000000000..41824d2198
--- /dev/null
+++ b/examples/smpi/NAS/SP/copy_faces.f
@@ -0,0 +1,306 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine copy_faces
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c this function copies the face values of a variable defined on a set 
+c of cells to the overlap locations of the adjacent sets of cells. 
+c Because a set of cells interfaces in each direction with exactly one 
+c other set, we only need to fill six different buffers. We could try to 
+c overlap communication with computation, by computing
+c some internal values while communicating boundary values, but this
+c adds so much overhead that it's not clearly useful. 
+c---------------------------------------------------------------------
+
+       include 'header.h'
+       include 'mpinpb.h'
+
+       integer i, j, k, c, m, requests(0:11), p0, p1, 
+     >         p2, p3, p4, p5, b_size(0:5), ss(0:5), 
+     >         sr(0:5), error, statuses(MPI_STATUS_SIZE, 0:11)
+
+c---------------------------------------------------------------------
+c      exit immediately if there are no faces to be copied           
+c---------------------------------------------------------------------
+       if (no_nodes .eq. 1) then
+          call compute_rhs
+          return
+       endif
+
+
+       ss(0) = start_send_east
+       ss(1) = start_send_west
+       ss(2) = start_send_north
+       ss(3) = start_send_south
+       ss(4) = start_send_top
+       ss(5) = start_send_bottom
+
+       sr(0) = start_recv_east
+       sr(1) = start_recv_west
+       sr(2) = start_recv_north
+       sr(3) = start_recv_south
+       sr(4) = start_recv_top
+       sr(5) = start_recv_bottom
+
+       b_size(0) = east_size   
+       b_size(1) = west_size   
+       b_size(2) = north_size  
+       b_size(3) = south_size  
+       b_size(4) = top_size    
+       b_size(5) = bottom_size 
+
+c---------------------------------------------------------------------
+c because the difference stencil for the diagonalized scheme is 
+c orthogonal, we do not have to perform the staged copying of faces, 
+c but can send all face information simultaneously to the neighboring 
+c cells in all directions          
+c---------------------------------------------------------------------
+       p0 = 0
+       p1 = 0
+       p2 = 0
+       p3 = 0
+       p4 = 0
+       p5 = 0
+
+       do  c = 1, ncells
+          do   m = 1, 5
+
+c---------------------------------------------------------------------
+c            fill the buffer to be sent to eastern neighbors (i-dir)
+c---------------------------------------------------------------------
+             if (cell_coord(1,c) .ne. ncells) then
+                do   k = 0, cell_size(3,c)-1
+                   do   j = 0, cell_size(2,c)-1
+                      do   i = cell_size(1,c)-2, cell_size(1,c)-1
+                         out_buffer(ss(0)+p0) = u(i,j,k,m,c)
+                         p0 = p0 + 1
+                      end do
+                   end do
+                end do
+             endif
+
+c---------------------------------------------------------------------
+c            fill the buffer to be sent to western neighbors 
+c---------------------------------------------------------------------
+             if (cell_coord(1,c) .ne. 1) then
+                do   k = 0, cell_size(3,c)-1
+                   do   j = 0, cell_size(2,c)-1
+                      do   i = 0, 1
+                         out_buffer(ss(1)+p1) = u(i,j,k,m,c)
+                         p1 = p1 + 1
+                      end do
+                   end do
+                end do
+
+
+             endif
+
+c---------------------------------------------------------------------
+c            fill the buffer to be sent to northern neighbors (j_dir)
+c---------------------------------------------------------------------
+             if (cell_coord(2,c) .ne. ncells) then
+                do   k = 0, cell_size(3,c)-1
+                   do   j = cell_size(2,c)-2, cell_size(2,c)-1
+                      do   i = 0, cell_size(1,c)-1
+                         out_buffer(ss(2)+p2) = u(i,j,k,m,c)
+                         p2 = p2 + 1
+                      end do
+                   end do
+                end do
+             endif
+
+c---------------------------------------------------------------------
+c            fill the buffer to be sent to southern neighbors 
+c---------------------------------------------------------------------
+             if (cell_coord(2,c).ne. 1) then
+                do   k = 0, cell_size(3,c)-1
+                   do   j = 0, 1
+                      do   i = 0, cell_size(1,c)-1   
+                         out_buffer(ss(3)+p3) = u(i,j,k,m,c)
+                         p3 = p3 + 1
+                      end do
+                   end do
+                end do
+             endif
+
+c---------------------------------------------------------------------
+c            fill the buffer to be sent to top neighbors (k-dir)
+c---------------------------------------------------------------------
+             if (cell_coord(3,c) .ne. ncells) then
+                do   k = cell_size(3,c)-2, cell_size(3,c)-1
+                   do   j = 0, cell_size(2,c)-1
+                      do   i = 0, cell_size(1,c)-1
+                         out_buffer(ss(4)+p4) = u(i,j,k,m,c)
+                         p4 = p4 + 1
+                      end do
+                   end do
+                end do
+             endif
+
+c---------------------------------------------------------------------
+c            fill the buffer to be sent to bottom neighbors
+c---------------------------------------------------------------------
+             if (cell_coord(3,c).ne. 1) then
+                 do    k=0, 1
+                    do   j = 0, cell_size(2,c)-1
+                       do   i = 0, cell_size(1,c)-1
+                          out_buffer(ss(5)+p5) = u(i,j,k,m,c)
+                          p5 = p5 + 1
+                       end do
+                    end do
+                 end do
+              endif
+
+c---------------------------------------------------------------------
+c          m loop
+c---------------------------------------------------------------------
+           end do
+
+c---------------------------------------------------------------------
+c       cell loop
+c---------------------------------------------------------------------
+        end do
+
+       call mpi_irecv(in_buffer(sr(0)), b_size(0), 
+     >                dp_type, successor(1), WEST,  
+     >                comm_rhs, requests(0), error)
+       call mpi_irecv(in_buffer(sr(1)), b_size(1), 
+     >                dp_type, predecessor(1), EAST,  
+     >                comm_rhs, requests(1), error)
+       call mpi_irecv(in_buffer(sr(2)), b_size(2), 
+     >                dp_type, successor(2), SOUTH, 
+     >                comm_rhs, requests(2), error)
+       call mpi_irecv(in_buffer(sr(3)), b_size(3), 
+     >                dp_type, predecessor(2), NORTH, 
+     >                comm_rhs, requests(3), error)
+       call mpi_irecv(in_buffer(sr(4)), b_size(4), 
+     >                dp_type, successor(3), BOTTOM,
+     >                comm_rhs, requests(4), error)
+       call mpi_irecv(in_buffer(sr(5)), b_size(5), 
+     >                dp_type, predecessor(3), TOP,   
+     >                comm_rhs, requests(5), error)
+
+       call mpi_isend(out_buffer(ss(0)), b_size(0), 
+     >                dp_type, successor(1),   EAST, 
+     >                comm_rhs, requests(6), error)
+       call mpi_isend(out_buffer(ss(1)), b_size(1), 
+     >                dp_type, predecessor(1), WEST, 
+     >                comm_rhs, requests(7), error)
+       call mpi_isend(out_buffer(ss(2)), b_size(2), 
+     >                dp_type,successor(2),   NORTH, 
+     >                comm_rhs, requests(8), error)
+       call mpi_isend(out_buffer(ss(3)), b_size(3), 
+     >                dp_type,predecessor(2), SOUTH, 
+     >                comm_rhs, requests(9), error)
+       call mpi_isend(out_buffer(ss(4)), b_size(4), 
+     >                dp_type,successor(3),   TOP, 
+     >                comm_rhs,   requests(10), error)
+       call mpi_isend(out_buffer(ss(5)), b_size(5), 
+     >                dp_type,predecessor(3), BOTTOM, 
+     >                comm_rhs,requests(11), error)
+
+
+       call mpi_waitall(12, requests, statuses, error)
+
+c---------------------------------------------------------------------
+c unpack the data that has just been received;             
+c---------------------------------------------------------------------
+       p0 = 0
+       p1 = 0
+       p2 = 0
+       p3 = 0
+       p4 = 0
+       p5 = 0
+
+       do   c = 1, ncells
+          do    m = 1, 5
+
+             if (cell_coord(1,c) .ne. 1) then
+                do   k = 0, cell_size(3,c)-1
+                   do   j = 0, cell_size(2,c)-1
+                      do   i = -2, -1
+                         u(i,j,k,m,c) = in_buffer(sr(1)+p0)
+                         p0 = p0 + 1
+                      end do
+                   end do
+                end do
+             endif
+
+             if (cell_coord(1,c) .ne. ncells) then
+                do  k = 0, cell_size(3,c)-1
+                   do  j = 0, cell_size(2,c)-1
+                      do  i = cell_size(1,c), cell_size(1,c)+1
+                         u(i,j,k,m,c) = in_buffer(sr(0)+p1)
+                         p1 = p1 + 1
+                      end do
+                   end do
+                end do
+             end if
+ 
+             if (cell_coord(2,c) .ne. 1) then
+                do  k = 0, cell_size(3,c)-1
+                   do   j = -2, -1
+                      do  i = 0, cell_size(1,c)-1
+                         u(i,j,k,m,c) = in_buffer(sr(3)+p2)
+                         p2 = p2 + 1
+                      end do
+                   end do
+                end do
+
+             endif
+ 
+             if (cell_coord(2,c) .ne. ncells) then
+                do  k = 0, cell_size(3,c)-1
+                   do   j = cell_size(2,c), cell_size(2,c)+1
+                      do  i = 0, cell_size(1,c)-1
+                         u(i,j,k,m,c) = in_buffer(sr(2)+p3)
+                         p3 = p3 + 1
+                      end do
+                   end do
+                end do
+             endif
+
+             if (cell_coord(3,c) .ne. 1) then
+                do  k = -2, -1
+                   do  j = 0, cell_size(2,c)-1
+                      do  i = 0, cell_size(1,c)-1
+                         u(i,j,k,m,c) = in_buffer(sr(5)+p4)
+                         p4 = p4 + 1
+                      end do
+                   end do
+                end do
+             endif
+
+             if (cell_coord(3,c) .ne. ncells) then
+                do  k = cell_size(3,c), cell_size(3,c)+1
+                   do  j = 0, cell_size(2,c)-1
+                      do  i = 0, cell_size(1,c)-1
+                         u(i,j,k,m,c) = in_buffer(sr(4)+p5)
+                         p5 = p5 + 1
+                      end do
+                   end do
+                end do
+             endif
+
+c---------------------------------------------------------------------
+c         m loop            
+c---------------------------------------------------------------------
+          end do
+
+c---------------------------------------------------------------------
+c      cells loop
+c---------------------------------------------------------------------
+       end do
+
+c---------------------------------------------------------------------
+c now that we have all the data, compute the rhs
+c---------------------------------------------------------------------
+       call compute_rhs
+
+       return
+       end
diff --git a/examples/smpi/NAS/SP/define.f b/examples/smpi/NAS/SP/define.f
new file mode 100644
index 0000000000..c465533f9a
--- /dev/null
+++ b/examples/smpi/NAS/SP/define.f
@@ -0,0 +1,66 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine compute_buffer_size(dim)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       include 'header.h'
+
+       integer  c, dim, face_size
+
+       if (ncells .eq. 1) return
+
+c---------------------------------------------------------------------
+c      compute the actual sizes of the buffers; note that there is 
+c      always one cell face that doesn't need buffer space, because it 
+c      is at the boundary of the grid
+c---------------------------------------------------------------------
+
+       west_size = 0
+       east_size = 0
+
+       do   c = 1, ncells
+          face_size = cell_size(2,c) * cell_size(3,c) * dim * 2
+          if (cell_coord(1,c).ne.1) west_size = west_size + face_size
+          if (cell_coord(1,c).ne.ncells) east_size = east_size + 
+     >                                                 face_size 
+       end do
+
+       north_size = 0
+       south_size = 0
+       do   c = 1, ncells
+          face_size = cell_size(1,c)*cell_size(3,c) * dim * 2
+          if (cell_coord(2,c).ne.1) south_size = south_size + face_size
+          if (cell_coord(2,c).ne.ncells) north_size = north_size + 
+     >                                                  face_size 
+       end do
+
+       top_size = 0
+       bottom_size = 0
+       do   c = 1, ncells
+          face_size = cell_size(1,c) * cell_size(2,c) * dim * 2
+          if (cell_coord(3,c).ne.1) bottom_size = bottom_size + 
+     >                                            face_size
+          if (cell_coord(3,c).ne.ncells) top_size = top_size +
+     >                                                face_size     
+       end do
+
+       start_send_west   = 1
+       start_send_east   = start_send_west   + west_size
+       start_send_south  = start_send_east   + east_size
+       start_send_north  = start_send_south  + south_size
+       start_send_bottom = start_send_north  + north_size
+       start_send_top    = start_send_bottom + bottom_size
+       start_recv_west   = 1
+       start_recv_east   = start_recv_west   + west_size
+       start_recv_south  = start_recv_east   + east_size
+       start_recv_north  = start_recv_south  + south_size
+       start_recv_bottom = start_recv_north  + north_size
+       start_recv_top    = start_recv_bottom + bottom_size
+
+       return
+       end
+
diff --git a/examples/smpi/NAS/SP/error.f b/examples/smpi/NAS/SP/error.f
new file mode 100644
index 0000000000..fd9aab37b3
--- /dev/null
+++ b/examples/smpi/NAS/SP/error.f
@@ -0,0 +1,105 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine error_norm(rms)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c this function computes the norm of the difference between the
+c computed solution and the exact solution
+c---------------------------------------------------------------------
+
+       include 'header.h'
+       include 'mpinpb.h'
+
+       integer c, i, j, k, m, ii, jj, kk, d, error
+       double precision xi, eta, zeta, u_exact(5), rms(5), rms_work(5),
+     >                  add
+
+       do   m = 1, 5 
+          rms_work(m) = 0.0d0
+       end do
+
+       do   c = 1, ncells
+          kk = 0
+          do   k = cell_low(3,c), cell_high(3,c)
+             zeta = dble(k) * dnzm1
+             jj = 0
+             do   j = cell_low(2,c), cell_high(2,c)
+                eta = dble(j) * dnym1
+                ii = 0
+                do   i = cell_low(1,c), cell_high(1,c)
+                   xi = dble(i) * dnxm1
+                   call exact_solution(xi, eta, zeta, u_exact)
+
+                   do   m = 1, 5
+                      add = u(ii,jj,kk,m,c)-u_exact(m)
+                      rms_work(m) = rms_work(m) + add*add
+                   end do
+                   ii = ii + 1
+                end do
+                jj = jj + 1
+             end do
+             kk = kk + 1
+          end do
+       end do
+
+       call mpi_allreduce(rms_work, rms, 5, dp_type, 
+     >                 MPI_SUM, comm_setup, error)
+
+       do    m = 1, 5
+          do    d = 1, 3
+             rms(m) = rms(m) / dble(grid_points(d)-2)
+          end do
+          rms(m) = dsqrt(rms(m))
+       end do
+
+       return
+       end
+
+
+
+       subroutine rhs_norm(rms)
+
+       include 'header.h'
+       include 'mpinpb.h'
+
+       integer c, i, j, k, d, m, error
+       double precision rms(5), rms_work(5), add
+
+       do    m = 1, 5
+          rms_work(m) = 0.0d0
+       end do
+
+       do   c = 1, ncells
+          do   k = start(3,c), cell_size(3,c)-end(3,c)-1
+             do   j = start(2,c), cell_size(2,c)-end(2,c)-1
+                do   i = start(1,c), cell_size(1,c)-end(1,c)-1
+                   do   m = 1, 5
+                      add = rhs(i,j,k,m,c)
+                      rms_work(m) = rms_work(m) + add*add
+                   end do
+                end do
+             end do
+          end do
+       end do
+
+
+
+       call mpi_allreduce(rms_work, rms, 5, dp_type, 
+     >                 MPI_SUM, comm_setup, error)
+
+       do   m = 1, 5
+          do   d = 1, 3
+             rms(m) = rms(m) / dble(grid_points(d)-2)
+          end do
+          rms(m) = dsqrt(rms(m))
+       end do
+
+       return
+       end
+
+
diff --git a/examples/smpi/NAS/SP/exact_rhs.f b/examples/smpi/NAS/SP/exact_rhs.f
new file mode 100644
index 0000000000..b589668126
--- /dev/null
+++ b/examples/smpi/NAS/SP/exact_rhs.f
@@ -0,0 +1,363 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine exact_rhs
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c compute the right hand side based on exact solution
+c---------------------------------------------------------------------
+
+       include 'header.h'
+
+       double precision dtemp(5), xi, eta, zeta, dtpp
+       integer          c, m, i, j, k, ip1, im1, jp1, 
+     >                  jm1, km1, kp1
+
+c---------------------------------------------------------------------
+c loop over all cells owned by this node                   
+c---------------------------------------------------------------------
+       do   c = 1, ncells
+
+c---------------------------------------------------------------------
+c         initialize                                  
+c---------------------------------------------------------------------
+          do   m = 1, 5
+             do   k= 0, cell_size(3,c)-1
+                do   j = 0, cell_size(2,c)-1
+                   do   i = 0, cell_size(1,c)-1
+                      forcing(i,j,k,m,c) = 0.0d0
+                   end do
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c xi-direction flux differences                      
+c---------------------------------------------------------------------
+          do   k = start(3,c), cell_size(3,c)-end(3,c)-1
+             zeta = dble(k+cell_low(3,c)) * dnzm1
+             do   j = start(2,c), cell_size(2,c)-end(2,c)-1
+                eta = dble(j+cell_low(2,c)) * dnym1
+
+                do  i=-2*(1-start(1,c)), cell_size(1,c)+1-2*end(1,c)
+                   xi = dble(i+cell_low(1,c)) * dnxm1
+
+                   call exact_solution(xi, eta, zeta, dtemp)
+                   do  m = 1, 5
+                      ue(i,m) = dtemp(m)
+                   end do
+
+                   dtpp = 1.0d0 / dtemp(1)
+
+                   do  m = 2, 5
+                      buf(i,m) = dtpp * dtemp(m)
+                   end do
+
+                   cuf(i)   = buf(i,2) * buf(i,2)
+                   buf(i,1) = cuf(i) + buf(i,3) * buf(i,3) + 
+     >                        buf(i,4) * buf(i,4) 
+                   q(i) = 0.5d0*(buf(i,2)*ue(i,2) + buf(i,3)*ue(i,3) +
+     >                           buf(i,4)*ue(i,4))
+
+                end do
+ 
+                do  i = start(1,c), cell_size(1,c)-end(1,c)-1
+                   im1 = i-1
+                   ip1 = i+1
+
+                   forcing(i,j,k,1,c) = forcing(i,j,k,1,c) -
+     >                 tx2*( ue(ip1,2)-ue(im1,2) )+
+     >                 dx1tx1*(ue(ip1,1)-2.0d0*ue(i,1)+ue(im1,1))
+
+                   forcing(i,j,k,2,c) = forcing(i,j,k,2,c) - tx2 * (
+     >                (ue(ip1,2)*buf(ip1,2)+c2*(ue(ip1,5)-q(ip1)))-
+     >                (ue(im1,2)*buf(im1,2)+c2*(ue(im1,5)-q(im1))))+
+     >                 xxcon1*(buf(ip1,2)-2.0d0*buf(i,2)+buf(im1,2))+
+     >                 dx2tx1*( ue(ip1,2)-2.0d0* ue(i,2)+ue(im1,2))
+
+                   forcing(i,j,k,3,c) = forcing(i,j,k,3,c) - tx2 * (
+     >                 ue(ip1,3)*buf(ip1,2)-ue(im1,3)*buf(im1,2))+
+     >                 xxcon2*(buf(ip1,3)-2.0d0*buf(i,3)+buf(im1,3))+
+     >                 dx3tx1*( ue(ip1,3)-2.0d0*ue(i,3) +ue(im1,3))
+                  
+                   forcing(i,j,k,4,c) = forcing(i,j,k,4,c) - tx2*(
+     >                 ue(ip1,4)*buf(ip1,2)-ue(im1,4)*buf(im1,2))+
+     >                 xxcon2*(buf(ip1,4)-2.0d0*buf(i,4)+buf(im1,4))+
+     >                 dx4tx1*( ue(ip1,4)-2.0d0* ue(i,4)+ ue(im1,4))
+
+                   forcing(i,j,k,5,c) = forcing(i,j,k,5,c) - tx2*(
+     >                 buf(ip1,2)*(c1*ue(ip1,5)-c2*q(ip1))-
+     >                 buf(im1,2)*(c1*ue(im1,5)-c2*q(im1)))+
+     >                 0.5d0*xxcon3*(buf(ip1,1)-2.0d0*buf(i,1)+
+     >                               buf(im1,1))+
+     >                 xxcon4*(cuf(ip1)-2.0d0*cuf(i)+cuf(im1))+
+     >                 xxcon5*(buf(ip1,5)-2.0d0*buf(i,5)+buf(im1,5))+
+     >                 dx5tx1*( ue(ip1,5)-2.0d0* ue(i,5)+ ue(im1,5))
+                end do
+
+c---------------------------------------------------------------------
+c Fourth-order dissipation                         
+c---------------------------------------------------------------------
+                if (start(1,c) .gt. 0) then
+                   do   m = 1, 5
+                      i = 1
+                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+     >                    (5.0d0*ue(i,m) - 4.0d0*ue(i+1,m) +ue(i+2,m))
+                      i = 2
+                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+     >                   (-4.0d0*ue(i-1,m) + 6.0d0*ue(i,m) -
+     >                     4.0d0*ue(i+1,m) +       ue(i+2,m))
+                   end do
+                endif
+
+                do   m = 1, 5
+                   do  i = start(1,c)*3, cell_size(1,c)-3*end(1,c)-1
+                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp*
+     >                   (ue(i-2,m) - 4.0d0*ue(i-1,m) +
+     >                    6.0d0*ue(i,m) - 4.0d0*ue(i+1,m) + ue(i+2,m))
+                   end do
+                end do
+
+                if (end(1,c) .gt. 0) then
+                   do   m = 1, 5
+                      i = cell_size(1,c)-3
+                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+     >                   (ue(i-2,m) - 4.0d0*ue(i-1,m) +
+     >                    6.0d0*ue(i,m) - 4.0d0*ue(i+1,m))
+                      i = cell_size(1,c)-2
+                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+     >                   (ue(i-2,m) - 4.0d0*ue(i-1,m) + 5.0d0*ue(i,m))
+                   end do
+                endif
+
+             end do
+          end do
+c---------------------------------------------------------------------
+c  eta-direction flux differences             
+c---------------------------------------------------------------------
+          do   k = start(3,c), cell_size(3,c)-end(3,c)-1          
+             zeta = dble(k+cell_low(3,c)) * dnzm1
+             do   i=start(1,c), cell_size(1,c)-end(1,c)-1
+                xi = dble(i+cell_low(1,c)) * dnxm1
+
+                do  j=-2*(1-start(2,c)), cell_size(2,c)+1-2*end(2,c)
+                   eta = dble(j+cell_low(2,c)) * dnym1
+
+                   call exact_solution(xi, eta, zeta, dtemp)
+                   do   m = 1, 5 
+                      ue(j,m) = dtemp(m)
+                   end do
+                   dtpp = 1.0d0/dtemp(1)
+
+                   do  m = 2, 5
+                      buf(j,m) = dtpp * dtemp(m)
+                   end do
+
+                   cuf(j)   = buf(j,3) * buf(j,3)
+                   buf(j,1) = cuf(j) + buf(j,2) * buf(j,2) + 
+     >                        buf(j,4) * buf(j,4)
+                   q(j) = 0.5d0*(buf(j,2)*ue(j,2) + buf(j,3)*ue(j,3) +
+     >                           buf(j,4)*ue(j,4))
+                end do
+
+                do  j = start(2,c), cell_size(2,c)-end(2,c)-1
+                   jm1 = j-1
+                   jp1 = j+1
+                  
+                   forcing(i,j,k,1,c) = forcing(i,j,k,1,c) -
+     >                ty2*( ue(jp1,3)-ue(jm1,3) )+
+     >                dy1ty1*(ue(jp1,1)-2.0d0*ue(j,1)+ue(jm1,1))
+
+                   forcing(i,j,k,2,c) = forcing(i,j,k,2,c) - ty2*(
+     >                ue(jp1,2)*buf(jp1,3)-ue(jm1,2)*buf(jm1,3))+
+     >                yycon2*(buf(jp1,2)-2.0d0*buf(j,2)+buf(jm1,2))+
+     >                dy2ty1*( ue(jp1,2)-2.0* ue(j,2)+ ue(jm1,2))
+
+                   forcing(i,j,k,3,c) = forcing(i,j,k,3,c) - ty2*(
+     >                (ue(jp1,3)*buf(jp1,3)+c2*(ue(jp1,5)-q(jp1)))-
+     >                (ue(jm1,3)*buf(jm1,3)+c2*(ue(jm1,5)-q(jm1))))+
+     >                yycon1*(buf(jp1,3)-2.0d0*buf(j,3)+buf(jm1,3))+
+     >                dy3ty1*( ue(jp1,3)-2.0d0*ue(j,3) +ue(jm1,3))
+
+                   forcing(i,j,k,4,c) = forcing(i,j,k,4,c) - ty2*(
+     >                ue(jp1,4)*buf(jp1,3)-ue(jm1,4)*buf(jm1,3))+
+     >                yycon2*(buf(jp1,4)-2.0d0*buf(j,4)+buf(jm1,4))+
+     >                dy4ty1*( ue(jp1,4)-2.0d0*ue(j,4)+ ue(jm1,4))
+
+                   forcing(i,j,k,5,c) = forcing(i,j,k,5,c) - ty2*(
+     >                buf(jp1,3)*(c1*ue(jp1,5)-c2*q(jp1))-
+     >                buf(jm1,3)*(c1*ue(jm1,5)-c2*q(jm1)))+
+     >                0.5d0*yycon3*(buf(jp1,1)-2.0d0*buf(j,1)+
+     >                              buf(jm1,1))+
+     >                yycon4*(cuf(jp1)-2.0d0*cuf(j)+cuf(jm1))+
+     >                yycon5*(buf(jp1,5)-2.0d0*buf(j,5)+buf(jm1,5))+
+     >                dy5ty1*(ue(jp1,5)-2.0d0*ue(j,5)+ue(jm1,5))
+                end do
+
+c---------------------------------------------------------------------
+c Fourth-order dissipation                      
+c---------------------------------------------------------------------
+                if (start(2,c) .gt. 0) then
+                   do   m = 1, 5
+                      j = 1
+                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+     >                    (5.0d0*ue(j,m) - 4.0d0*ue(j+1,m) +ue(j+2,m))
+                      j = 2
+                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+     >                   (-4.0d0*ue(j-1,m) + 6.0d0*ue(j,m) -
+     >                     4.0d0*ue(j+1,m) +       ue(j+2,m))
+                   end do
+                endif
+
+                do   m = 1, 5
+                   do  j = start(2,c)*3, cell_size(2,c)-3*end(2,c)-1
+                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp*
+     >                   (ue(j-2,m) - 4.0d0*ue(j-1,m) +
+     >                    6.0d0*ue(j,m) - 4.0d0*ue(j+1,m) + ue(j+2,m))
+                   end do
+                end do
+                if (end(2,c) .gt. 0) then
+                   do   m = 1, 5
+                      j = cell_size(2,c)-3
+                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+     >                   (ue(j-2,m) - 4.0d0*ue(j-1,m) +
+     >                    6.0d0*ue(j,m) - 4.0d0*ue(j+1,m))
+                      j = cell_size(2,c)-2
+                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+     >                   (ue(j-2,m) - 4.0d0*ue(j-1,m) + 5.0d0*ue(j,m))
+
+                   end do
+                endif
+
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c zeta-direction flux differences                      
+c---------------------------------------------------------------------
+          do  j=start(2,c), cell_size(2,c)-end(2,c)-1
+             eta = dble(j+cell_low(2,c)) * dnym1
+             do   i = start(1,c), cell_size(1,c)-end(1,c)-1
+                xi = dble(i+cell_low(1,c)) * dnxm1
+
+                do k=-2*(1-start(3,c)), cell_size(3,c)+1-2*end(3,c)
+                   zeta = dble(k+cell_low(3,c)) * dnzm1
+
+                   call exact_solution(xi, eta, zeta, dtemp)
+                   do   m = 1, 5
+                      ue(k,m) = dtemp(m)
+                   end do
+
+                   dtpp = 1.0d0/dtemp(1)
+
+                   do   m = 2, 5
+                      buf(k,m) = dtpp * dtemp(m)
+                   end do
+
+                   cuf(k)   = buf(k,4) * buf(k,4)
+                   buf(k,1) = cuf(k) + buf(k,2) * buf(k,2) + 
+     >                        buf(k,3) * buf(k,3)
+                   q(k) = 0.5d0*(buf(k,2)*ue(k,2) + buf(k,3)*ue(k,3) +
+     >                           buf(k,4)*ue(k,4))
+                end do
+
+                do    k=start(3,c), cell_size(3,c)-end(3,c)-1
+                   km1 = k-1
+                   kp1 = k+1
+                  
+                   forcing(i,j,k,1,c) = forcing(i,j,k,1,c) -
+     >                 tz2*( ue(kp1,4)-ue(km1,4) )+
+     >                 dz1tz1*(ue(kp1,1)-2.0d0*ue(k,1)+ue(km1,1))
+
+                   forcing(i,j,k,2,c) = forcing(i,j,k,2,c) - tz2 * (
+     >                 ue(kp1,2)*buf(kp1,4)-ue(km1,2)*buf(km1,4))+
+     >                 zzcon2*(buf(kp1,2)-2.0d0*buf(k,2)+buf(km1,2))+
+     >                 dz2tz1*( ue(kp1,2)-2.0d0* ue(k,2)+ ue(km1,2))
+
+                   forcing(i,j,k,3,c) = forcing(i,j,k,3,c) - tz2 * (
+     >                 ue(kp1,3)*buf(kp1,4)-ue(km1,3)*buf(km1,4))+
+     >                 zzcon2*(buf(kp1,3)-2.0d0*buf(k,3)+buf(km1,3))+
+     >                 dz3tz1*(ue(kp1,3)-2.0d0*ue(k,3)+ue(km1,3))
+
+                   forcing(i,j,k,4,c) = forcing(i,j,k,4,c) - tz2 * (
+     >                (ue(kp1,4)*buf(kp1,4)+c2*(ue(kp1,5)-q(kp1)))-
+     >                (ue(km1,4)*buf(km1,4)+c2*(ue(km1,5)-q(km1))))+
+     >                zzcon1*(buf(kp1,4)-2.0d0*buf(k,4)+buf(km1,4))+
+     >                dz4tz1*( ue(kp1,4)-2.0d0*ue(k,4) +ue(km1,4))
+
+                   forcing(i,j,k,5,c) = forcing(i,j,k,5,c) - tz2 * (
+     >                 buf(kp1,4)*(c1*ue(kp1,5)-c2*q(kp1))-
+     >                 buf(km1,4)*(c1*ue(km1,5)-c2*q(km1)))+
+     >                 0.5d0*zzcon3*(buf(kp1,1)-2.0d0*buf(k,1)
+     >                              +buf(km1,1))+
+     >                 zzcon4*(cuf(kp1)-2.0d0*cuf(k)+cuf(km1))+
+     >                 zzcon5*(buf(kp1,5)-2.0d0*buf(k,5)+buf(km1,5))+
+     >                 dz5tz1*( ue(kp1,5)-2.0d0*ue(k,5)+ ue(km1,5))
+                end do
+
+c---------------------------------------------------------------------
+c Fourth-order dissipation                        
+c---------------------------------------------------------------------
+                if (start(3,c) .gt. 0) then
+                   do   m = 1, 5
+                      k = 1
+                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+     >                    (5.0d0*ue(k,m) - 4.0d0*ue(k+1,m) +ue(k+2,m))
+                      k = 2
+                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+     >                   (-4.0d0*ue(k-1,m) + 6.0d0*ue(k,m) -
+     >                     4.0d0*ue(k+1,m) +       ue(k+2,m))
+                   end do
+                endif
+
+                do   m = 1, 5
+                   do  k = start(3,c)*3, cell_size(3,c)-3*end(3,c)-1
+                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp*
+     >                   (ue(k-2,m) - 4.0d0*ue(k-1,m) +
+     >                    6.0d0*ue(k,m) - 4.0d0*ue(k+1,m) + ue(k+2,m))
+                   end do
+                end do
+
+                if (end(3,c) .gt. 0) then
+                   do    m = 1, 5
+                      k = cell_size(3,c)-3
+                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+     >                   (ue(k-2,m) - 4.0d0*ue(k-1,m) +
+     >                    6.0d0*ue(k,m) - 4.0d0*ue(k+1,m))
+                      k = cell_size(3,c)-2
+                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+     >                   (ue(k-2,m) - 4.0d0*ue(k-1,m) + 5.0d0*ue(k,m))
+                   end do
+                endif
+
+             end do
+          end do
+c---------------------------------------------------------------------
+c now change the sign of the forcing function, 
+c---------------------------------------------------------------------
+          do   m = 1, 5
+             do   k = start(3,c), cell_size(3,c)-end(3,c)-1
+                do   j = start(2,c), cell_size(2,c)-end(2,c)-1
+                   do   i = start(1,c), cell_size(1,c)-end(1,c)-1
+                      forcing(i,j,k,m,c) = -1.d0 * forcing(i,j,k,m,c)
+                   end do
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c      cell loop
+c---------------------------------------------------------------------
+       end do
+
+       return
+       end
+
+
+
+
+
diff --git a/examples/smpi/NAS/SP/exact_solution.f b/examples/smpi/NAS/SP/exact_solution.f
new file mode 100644
index 0000000000..2644f0b8f9
--- /dev/null
+++ b/examples/smpi/NAS/SP/exact_solution.f
@@ -0,0 +1,30 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine exact_solution(xi,eta,zeta,dtemp)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c this function returns the exact solution at point xi, eta, zeta  
+c---------------------------------------------------------------------
+
+       include 'header.h'
+
+       double precision  xi, eta, zeta, dtemp(5)
+       integer m
+
+       do  m = 1, 5
+          dtemp(m) =  ce(m,1) +
+     >    xi*(ce(m,2) + xi*(ce(m,5) + xi*(ce(m,8) + xi*ce(m,11)))) +
+     >    eta*(ce(m,3) + eta*(ce(m,6) + eta*(ce(m,9) + eta*ce(m,12))))+
+     >    zeta*(ce(m,4) + zeta*(ce(m,7) + zeta*(ce(m,10) + 
+     >    zeta*ce(m,13))))
+       end do
+
+       return
+       end
+
+
diff --git a/examples/smpi/NAS/SP/header.h b/examples/smpi/NAS/SP/header.h
new file mode 100644
index 0000000000..663515a104
--- /dev/null
+++ b/examples/smpi/NAS/SP/header.h
@@ -0,0 +1,113 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+
+c---------------------------------------------------------------------
+c The following include file is generated automatically by the
+c "setparams" utility. It defines 
+c      maxcells:      the square root of the maximum number of processors
+c      problem_size:  12, 64, 102, 162 (for class T, A, B, C)
+c      dt_default:    default time step for this problem size if no
+c                     config file
+c      niter_default: default number of iterations for this problem size
+c---------------------------------------------------------------------
+
+      include 'npbparams.h'
+
+      integer           ncells, grid_points(3)
+      common /global/   ncells, grid_points
+
+      double precision  tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, 
+     >                  dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, 
+     >                  dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, 
+     >                  ce(5,13), dxmax, dymax, dzmax, xxcon1, xxcon2, 
+     >                  xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
+     >                  dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
+     >                  yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
+     >                  zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, 
+     >                  dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, 
+     >                  dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, 
+     >                  c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
+     >                  dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, 
+     >                  c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, 
+     >                  c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16
+
+      common /constants/ tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3,
+     >                  dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, 
+     >                  dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, 
+     >                  ce, dxmax, dymax, dzmax, xxcon1, xxcon2, 
+     >                  xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
+     >                  dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
+     >                  yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
+     >                  zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, 
+     >                  dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, 
+     >                  dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, 
+     >                  c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
+     >                  dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, 
+     >                  c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, 
+     >                  c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16
+
+      integer           EAST, WEST, NORTH, SOUTH, 
+     >                  BOTTOM, TOP
+
+      parameter (EAST=2000, WEST=3000,      NORTH=4000, SOUTH=5000,
+     >           BOTTOM=6000, TOP=7000)
+
+      integer cell_coord (3,maxcells), cell_low (3,maxcells), 
+     >        cell_high  (3,maxcells), cell_size(3,maxcells),
+     >        predecessor(3),          slice    (3,maxcells),
+     >        grid_size  (3),          successor(3),
+     >        start      (3,maxcells), end      (3,maxcells)
+      common /partition/ cell_coord, cell_low, cell_high, cell_size,
+     >                   grid_size, successor, predecessor, slice,
+     >                   start, end
+
+      integer IMAX, JMAX, KMAX, MAX_CELL_DIM, BUF_SIZE, IMAXP, JMAXP
+
+      parameter (MAX_CELL_DIM = (problem_size/maxcells)+1)
+
+      parameter (IMAX=MAX_CELL_DIM,JMAX=MAX_CELL_DIM,KMAX=MAX_CELL_DIM)
+      parameter (IMAXP=IMAX/2*2+1,JMAXP=JMAX/2*2+1)
+
+c---------------------------------------------------------------------
+c +1 at end to avoid zero length arrays for 1 node
+c---------------------------------------------------------------------
+      parameter (BUF_SIZE=MAX_CELL_DIM*MAX_CELL_DIM*(maxcells-1)*60*2+1)
+
+      double precision 
+     >   u       (-2:IMAXP+1,-2:JMAXP+1,-2:KMAX+1, 5,maxcells),
+     >   us      (-1:IMAX,   -1:JMAX,   -1:KMAX,     maxcells),
+     >   vs      (-1:IMAX,   -1:JMAX,   -1:KMAX,     maxcells),
+     >   ws      (-1:IMAX,   -1:JMAX,   -1:KMAX,     maxcells),
+     >   qs      (-1:IMAX,   -1:JMAX,   -1:KMAX,     maxcells),
+     >   ainv    (-1:IMAX,   -1:JMAX,   -1:KMAX,     maxcells),
+     >   rho_i   (-1:IMAX,   -1:JMAX,   -1:KMAX,     maxcells),
+     >   speed   (-1:IMAX,   -1:JMAX,   -1:KMAX,     maxcells),
+     >   square  (-1:IMAX,   -1:JMAX,   -1:KMAX,     maxcells),
+     >   rhs     ( 0:IMAXP-1, 0:JMAXP-1, 0:KMAX-1, 5,maxcells),
+     >   forcing ( 0:IMAXP-1, 0:JMAXP-1, 0:KMAX-1, 5,maxcells),
+     >   lhs     ( 0:IMAXP-1, 0:JMAXP-1, 0:KMAX-1,15,maxcells),
+     >   in_buffer(BUF_SIZE), out_buffer(BUF_SIZE)
+      common /fields/  u, us, vs, ws, qs, ainv, rho_i, speed, square, 
+     >                 rhs, forcing, lhs, in_buffer, out_buffer
+
+      double precision cv(-2:MAX_CELL_DIM+1),   rhon(-2:MAX_CELL_DIM+1),
+     >                 rhos(-2:MAX_CELL_DIM+1), rhoq(-2:MAX_CELL_DIM+1),
+     >                 cuf(-2:MAX_CELL_DIM+1),  q(-2:MAX_CELL_DIM+1),
+     >                 ue(-2:MAX_CELL_DIM+1,5), buf(-2:MAX_CELL_DIM+1,5)
+      common /work_1d/ cv, rhon, rhos, rhoq, cuf, q, ue, buf
+
+      integer  west_size, east_size, bottom_size, top_size,
+     >         north_size, south_size, start_send_west, 
+     >         start_send_east, start_send_south, start_send_north,
+     >         start_send_bottom, start_send_top, start_recv_west,
+     >         start_recv_east, start_recv_south, start_recv_north,
+     >         start_recv_bottom, start_recv_top
+      common /box/ west_size, east_size, bottom_size,
+     >             top_size, north_size, south_size, 
+     >             start_send_west, start_send_east, start_send_south,
+     >             start_send_north, start_send_bottom, start_send_top,
+     >             start_recv_west, start_recv_east, start_recv_south,
+     >             start_recv_north, start_recv_bottom, start_recv_top
diff --git a/examples/smpi/NAS/SP/initialize.f b/examples/smpi/NAS/SP/initialize.f
new file mode 100644
index 0000000000..655c8d9369
--- /dev/null
+++ b/examples/smpi/NAS/SP/initialize.f
@@ -0,0 +1,286 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine  initialize
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c This subroutine initializes the field variable u using 
+c tri-linear transfinite interpolation of the boundary values     
+c---------------------------------------------------------------------
+
+       include 'header.h'
+  
+       integer c, i, j, k, m, ii, jj, kk, ix, iy, iz
+       double precision  xi, eta, zeta, Pface(5,3,2), Pxi, Peta, 
+     >                   Pzeta, temp(5)
+
+
+c---------------------------------------------------------------------
+c  Later (in compute_rhs) we compute 1/u for every element. A few of 
+c  the corner elements are not used, but it convenient (and faster) 
+c  to compute the whole thing with a simple loop. Make sure those 
+c  values are nonzero by initializing the whole thing here. 
+c---------------------------------------------------------------------
+      do c = 1, ncells
+         do kk = -1, IMAX
+            do jj = -1, IMAX
+               do ii = -1, IMAX
+                  u(ii, jj, kk, 1, c) = 1.0
+                  u(ii, jj, kk, 2, c) = 0.0
+                  u(ii, jj, kk, 3, c) = 0.0
+                  u(ii, jj, kk, 4, c) = 0.0
+                  u(ii, jj, kk, 5, c) = 1.0
+               end do
+            end do
+         end do
+      end do
+
+c---------------------------------------------------------------------
+c first store the "interpolated" values everywhere on the grid    
+c---------------------------------------------------------------------
+       do  c=1, ncells
+          kk = 0
+          do  k = cell_low(3,c), cell_high(3,c)
+             zeta = dble(k) * dnzm1
+             jj = 0
+             do  j = cell_low(2,c), cell_high(2,c)
+                eta = dble(j) * dnym1
+                ii = 0
+                do   i = cell_low(1,c), cell_high(1,c)
+                   xi = dble(i) * dnxm1
+                  
+                   do ix = 1, 2
+                      call exact_solution(dble(ix-1), eta, zeta, 
+     >                                    Pface(1,1,ix))
+                   end do
+
+                   do    iy = 1, 2
+                      call exact_solution(xi, dble(iy-1) , zeta, 
+     >                                    Pface(1,2,iy))
+                   end do
+
+                   do    iz = 1, 2
+                      call exact_solution(xi, eta, dble(iz-1),   
+     >                                    Pface(1,3,iz))
+                   end do
+
+                   do   m = 1, 5
+                      Pxi   = xi   * Pface(m,1,2) + 
+     >                        (1.0d0-xi)   * Pface(m,1,1)
+                      Peta  = eta  * Pface(m,2,2) + 
+     >                        (1.0d0-eta)  * Pface(m,2,1)
+                      Pzeta = zeta * Pface(m,3,2) + 
+     >                        (1.0d0-zeta) * Pface(m,3,1)
+ 
+                      u(ii,jj,kk,m,c) = Pxi + Peta + Pzeta - 
+     >                          Pxi*Peta - Pxi*Pzeta - Peta*Pzeta + 
+     >                          Pxi*Peta*Pzeta
+
+                   end do
+                   ii = ii + 1
+                end do
+                jj = jj + 1
+             end do
+             kk = kk+1
+          end do
+       end do
+
+c---------------------------------------------------------------------
+c now store the exact values on the boundaries        
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c west face                                                  
+c---------------------------------------------------------------------
+       c = slice(1,1)
+       ii = 0
+       xi = 0.0d0
+       kk = 0
+       do  k = cell_low(3,c), cell_high(3,c)
+          zeta = dble(k) * dnzm1
+          jj = 0
+          do   j = cell_low(2,c), cell_high(2,c)
+             eta = dble(j) * dnym1
+             call exact_solution(xi, eta, zeta, temp)
+             do   m = 1, 5
+                u(ii,jj,kk,m,c) = temp(m)
+             end do
+             jj = jj + 1
+          end do
+          kk = kk + 1
+       end do
+
+c---------------------------------------------------------------------
+c east face                                                      
+c---------------------------------------------------------------------
+       c  = slice(1,ncells)
+       ii = cell_size(1,c)-1
+       xi = 1.0d0
+       kk = 0
+       do   k = cell_low(3,c), cell_high(3,c)
+          zeta = dble(k) * dnzm1
+          jj = 0
+          do   j = cell_low(2,c), cell_high(2,c)
+             eta = dble(j) * dnym1
+             call exact_solution(xi, eta, zeta, temp)
+             do   m = 1, 5
+                u(ii,jj,kk,m,c) = temp(m)
+             end do
+             jj = jj + 1
+          end do
+          kk = kk + 1
+       end do
+
+c---------------------------------------------------------------------
+c south face                                                 
+c---------------------------------------------------------------------
+       c = slice(2,1)
+       jj = 0
+       eta = 0.0d0
+       kk = 0
+       do  k = cell_low(3,c), cell_high(3,c)
+          zeta = dble(k) * dnzm1
+          ii = 0
+          do   i = cell_low(1,c), cell_high(1,c)
+             xi = dble(i) * dnxm1
+             call exact_solution(xi, eta, zeta, temp)
+             do   m = 1, 5
+                u(ii,jj,kk,m,c) = temp(m)
+             end do
+             ii = ii + 1
+          end do
+          kk = kk + 1
+       end do
+
+
+c---------------------------------------------------------------------
+c north face                                    
+c---------------------------------------------------------------------
+       c = slice(2,ncells)
+       jj = cell_size(2,c)-1
+       eta = 1.0d0
+       kk = 0
+       do   k = cell_low(3,c), cell_high(3,c)
+          zeta = dble(k) * dnzm1
+          ii = 0
+          do   i = cell_low(1,c), cell_high(1,c)
+             xi = dble(i) * dnxm1
+             call exact_solution(xi, eta, zeta, temp)
+             do   m = 1, 5
+                u(ii,jj,kk,m,c) = temp(m)
+             end do
+             ii = ii + 1
+          end do
+          kk = kk + 1
+       end do
+
+c---------------------------------------------------------------------
+c bottom face                                       
+c---------------------------------------------------------------------
+       c = slice(3,1)
+       kk = 0
+       zeta = 0.0d0
+       jj = 0
+       do   j = cell_low(2,c), cell_high(2,c)
+          eta = dble(j) * dnym1
+          ii = 0
+          do   i =cell_low(1,c), cell_high(1,c)
+             xi = dble(i) *dnxm1
+             call exact_solution(xi, eta, zeta, temp)
+             do   m = 1, 5
+                u(ii,jj,kk,m,c) = temp(m)
+             end do
+             ii = ii + 1
+          end do
+          jj = jj + 1
+       end do
+
+c---------------------------------------------------------------------
+c top face     
+c---------------------------------------------------------------------
+       c = slice(3,ncells)
+       kk = cell_size(3,c)-1
+       zeta = 1.0d0
+       jj = 0
+       do   j = cell_low(2,c), cell_high(2,c)
+          eta = dble(j) * dnym1
+          ii = 0
+          do   i =cell_low(1,c), cell_high(1,c)
+             xi = dble(i) * dnxm1
+             call exact_solution(xi, eta, zeta, temp)
+             do   m = 1, 5
+                u(ii,jj,kk,m,c) = temp(m)
+             end do
+             ii = ii + 1
+          end do
+          jj = jj + 1
+       end do
+
+       return
+       end
+
+
+       subroutine lhsinit
+
+       include 'header.h'
+       
+       integer i, j, k, d, c, n
+
+c---------------------------------------------------------------------
+c loop over all cells                                       
+c---------------------------------------------------------------------
+       do  c = 1, ncells
+
+c---------------------------------------------------------------------
+c         first, initialize the start and end arrays
+c---------------------------------------------------------------------
+          do  d = 1, 3
+             if (cell_coord(d,c) .eq. 1) then
+                start(d,c) = 1
+             else 
+                start(d,c) = 0
+             endif
+             if (cell_coord(d,c) .eq. ncells) then
+                end(d,c) = 1
+             else
+                end(d,c) = 0
+             endif
+          end do
+
+c---------------------------------------------------------------------
+c     zap the whole left hand side for starters
+c---------------------------------------------------------------------
+          do  n = 1, 15
+             do  k = 0, cell_size(3,c)-1
+                do  j = 0, cell_size(2,c)-1
+                   do  i = 0, cell_size(1,c)-1
+                      lhs(i,j,k,n,c) = 0.0d0
+                   end do
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c next, set all diagonal values to 1. This is overkill, but convenient
+c---------------------------------------------------------------------
+          do   n = 1, 3
+             do   k = 0, cell_size(3,c)-1
+                do   j = 0, cell_size(2,c)-1
+                   do   i = 0, cell_size(1,c)-1
+                      lhs(i,j,k,5*n-2,c) = 1.0d0
+                   end do
+                end do
+             end do
+          end do
+
+       end do
+
+      return
+      end
+
+
+
diff --git a/examples/smpi/NAS/SP/inputsp.data.sample b/examples/smpi/NAS/SP/inputsp.data.sample
new file mode 100644
index 0000000000..ae3801fdb7
--- /dev/null
+++ b/examples/smpi/NAS/SP/inputsp.data.sample
@@ -0,0 +1,3 @@
+400       number of time steps
+0.0015d0  dt for class A = 0.0015d0. class B = 0.001d0  class C = 0.00067d0
+64 64 64
diff --git a/examples/smpi/NAS/SP/lhsx.f b/examples/smpi/NAS/SP/lhsx.f
new file mode 100644
index 0000000000..cae7779122
--- /dev/null
+++ b/examples/smpi/NAS/SP/lhsx.f
@@ -0,0 +1,124 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine lhsx(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c This function computes the left hand side for the three x-factors  
+c---------------------------------------------------------------------
+
+       include 'header.h'
+
+       double precision ru1
+       integer          i, j, k, c
+
+
+c---------------------------------------------------------------------
+c      treat only cell c             
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c      first fill the lhs for the u-eigenvalue                   
+c---------------------------------------------------------------------
+       do  k = start(3,c), cell_size(3,c)-end(3,c)-1
+          do  j = start(2,c), cell_size(2,c)-end(2,c)-1
+             do  i = start(1,c)-1, cell_size(1,c)-end(1,c)
+                ru1 = c3c4*rho_i(i,j,k,c)
+                cv(i) = us(i,j,k,c)
+                rhon(i) = dmax1(dx2+con43*ru1, 
+     >                          dx5+c1c5*ru1,
+     >                          dxmax+ru1,
+     >                          dx1)
+             end do
+
+             do  i = start(1,c), cell_size(1,c)-end(1,c)-1
+                lhs(i,j,k,1,c) =   0.0d0
+                lhs(i,j,k,2,c) = - dttx2 * cv(i-1) - dttx1 * rhon(i-1)
+                lhs(i,j,k,3,c) =   1.0d0 + c2dttx1 * rhon(i)
+                lhs(i,j,k,4,c) =   dttx2 * cv(i+1) - dttx1 * rhon(i+1)
+                lhs(i,j,k,5,c) =   0.0d0
+             end do
+          end do
+       end do
+
+c---------------------------------------------------------------------
+c      add fourth order dissipation                             
+c---------------------------------------------------------------------
+       if (start(1,c) .gt. 0) then
+          i = 1
+          do   k = start(3,c), cell_size(3,c)-end(3,c)-1
+             do   j = start(2,c), cell_size(2,c)-end(2,c)-1
+                lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz5
+                lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
+                lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
+  
+                lhs(i+1,j,k,2,c) = lhs(i+1,j,k,2,c) - comz4
+                lhs(i+1,j,k,3,c) = lhs(i+1,j,k,3,c) + comz6
+                lhs(i+1,j,k,4,c) = lhs(i+1,j,k,4,c) - comz4
+                lhs(i+1,j,k,5,c) = lhs(i+1,j,k,5,c) + comz1
+             end do
+          end do
+       endif
+
+       do   k = start(3,c), cell_size(3,c)-end(3,c)-1
+          do   j = start(2,c), cell_size(2,c)-end(2,c)-1
+             do   i=3*start(1,c), cell_size(1,c)-3*end(1,c)-1
+                lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
+                lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
+                lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
+                lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
+                lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
+             end do
+          end do
+       end do
+
+       if (end(1,c) .gt. 0) then
+          i = cell_size(1,c)-3
+          do   k = start(3,c), cell_size(3,c)-end(3,c)-1
+             do   j = start(2,c), cell_size(2,c)-end(2,c)-1
+                lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
+                lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
+                lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
+                lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
+
+                lhs(i+1,j,k,1,c) = lhs(i+1,j,k,1,c) + comz1
+                lhs(i+1,j,k,2,c) = lhs(i+1,j,k,2,c) - comz4
+                lhs(i+1,j,k,3,c) = lhs(i+1,j,k,3,c) + comz5
+             end do
+          end do
+       endif
+
+c---------------------------------------------------------------------
+c      subsequently, fill the other factors (u+c), (u-c) by a4ing to 
+c      the first  
+c---------------------------------------------------------------------
+       do   k = start(3,c), cell_size(3,c)-end(3,c)-1
+          do   j = start(2,c), cell_size(2,c)-end(2,c)-1
+             do   i = start(1,c), cell_size(1,c)-end(1,c)-1
+                lhs(i,j,k,1+5,c)  = lhs(i,j,k,1,c)
+                lhs(i,j,k,2+5,c)  = lhs(i,j,k,2,c) - 
+     >                            dttx2 * speed(i-1,j,k,c)
+                lhs(i,j,k,3+5,c)  = lhs(i,j,k,3,c)
+                lhs(i,j,k,4+5,c)  = lhs(i,j,k,4,c) + 
+     >                            dttx2 * speed(i+1,j,k,c)
+                lhs(i,j,k,5+5,c) = lhs(i,j,k,5,c)
+                lhs(i,j,k,1+10,c) = lhs(i,j,k,1,c)
+                lhs(i,j,k,2+10,c) = lhs(i,j,k,2,c) + 
+     >                            dttx2 * speed(i-1,j,k,c)
+                lhs(i,j,k,3+10,c) = lhs(i,j,k,3,c)
+                lhs(i,j,k,4+10,c) = lhs(i,j,k,4,c) - 
+     >                            dttx2 * speed(i+1,j,k,c)
+                lhs(i,j,k,5+10,c) = lhs(i,j,k,5,c)
+             end do
+          end do
+       end do
+
+       return
+       end
+
+
+
diff --git a/examples/smpi/NAS/SP/lhsy.f b/examples/smpi/NAS/SP/lhsy.f
new file mode 100644
index 0000000000..9c07a35538
--- /dev/null
+++ b/examples/smpi/NAS/SP/lhsy.f
@@ -0,0 +1,125 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine lhsy(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c This function computes the left hand side for the three y-factors   
+c---------------------------------------------------------------------
+
+       include 'header.h'
+
+       double precision ru1
+       integer          i, j, k, c
+
+c---------------------------------------------------------------------
+c      treat only cell c
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c      first fill the lhs for the u-eigenvalue         
+c---------------------------------------------------------------------
+       do  k = start(3,c), cell_size(3,c)-end(3,c)-1
+          do  i = start(1,c), cell_size(1,c)-end(1,c)-1
+
+             do  j = start(2,c)-1, cell_size(2,c)-end(2,c)
+                ru1 = c3c4*rho_i(i,j,k,c)
+                cv(j) = vs(i,j,k,c)
+                rhoq(j) = dmax1( dy3 + con43 * ru1,
+     >                           dy5 + c1c5*ru1,
+     >                           dymax + ru1,
+     >                           dy1)
+             end do
+            
+             do  j = start(2,c), cell_size(2,c)-end(2,c)-1
+                lhs(i,j,k,1,c) =  0.0d0
+                lhs(i,j,k,2,c) = -dtty2 * cv(j-1) - dtty1 * rhoq(j-1)
+                lhs(i,j,k,3,c) =  1.0 + c2dtty1 * rhoq(j)
+                lhs(i,j,k,4,c) =  dtty2 * cv(j+1) - dtty1 * rhoq(j+1)
+                lhs(i,j,k,5,c) =  0.0d0
+             end do
+          end do
+       end do
+
+c---------------------------------------------------------------------
+c      add fourth order dissipation                             
+c---------------------------------------------------------------------
+       if (start(2,c) .gt. 0) then
+          j = 1
+          do   k = start(3,c), cell_size(3,c)-end(3,c)-1
+             do   i = start(1,c), cell_size(1,c)-end(1,c)-1
+
+                lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz5
+                lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
+                lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
+       
+                lhs(i,j+1,k,2,c) = lhs(i,j+1,k,2,c) - comz4
+                lhs(i,j+1,k,3,c) = lhs(i,j+1,k,3,c) + comz6
+                lhs(i,j+1,k,4,c) = lhs(i,j+1,k,4,c) - comz4
+                lhs(i,j+1,k,5,c) = lhs(i,j+1,k,5,c) + comz1
+             end do
+          end do
+       endif
+
+       do   k = start(3,c), cell_size(3,c)-end(3,c)-1
+          do   j=3*start(2,c), cell_size(2,c)-3*end(2,c)-1
+             do   i = start(1,c), cell_size(1,c)-end(1,c)-1
+
+                lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
+                lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
+                lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
+                lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
+                lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
+             end do
+          end do
+       end do
+
+       if (end(2,c) .gt. 0) then
+          j = cell_size(2,c)-3
+          do   k = start(3,c), cell_size(3,c)-end(3,c)-1
+             do   i = start(1,c), cell_size(1,c)-end(1,c)-1
+                lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
+                lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
+                lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
+                lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
+
+                lhs(i,j+1,k,1,c) = lhs(i,j+1,k,1,c) + comz1
+                lhs(i,j+1,k,2,c) = lhs(i,j+1,k,2,c) - comz4
+                lhs(i,j+1,k,3,c) = lhs(i,j+1,k,3,c) + comz5
+             end do
+          end do
+       endif
+
+c---------------------------------------------------------------------
+c      subsequently, do the other two factors                    
+c---------------------------------------------------------------------
+       do    k = start(3,c), cell_size(3,c)-end(3,c)-1
+          do    j = start(2,c), cell_size(2,c)-end(2,c)-1
+             do    i = start(1,c), cell_size(1,c)-end(1,c)-1
+                lhs(i,j,k,1+5,c)  = lhs(i,j,k,1,c)
+                lhs(i,j,k,2+5,c)  = lhs(i,j,k,2,c) - 
+     >                            dtty2 * speed(i,j-1,k,c)
+                lhs(i,j,k,3+5,c)  = lhs(i,j,k,3,c)
+                lhs(i,j,k,4+5,c)  = lhs(i,j,k,4,c) + 
+     >                            dtty2 * speed(i,j+1,k,c)
+                lhs(i,j,k,5+5,c) = lhs(i,j,k,5,c)
+                lhs(i,j,k,1+10,c) = lhs(i,j,k,1,c)
+                lhs(i,j,k,2+10,c) = lhs(i,j,k,2,c) + 
+     >                            dtty2 * speed(i,j-1,k,c)
+                lhs(i,j,k,3+10,c) = lhs(i,j,k,3,c)
+                lhs(i,j,k,4+10,c) = lhs(i,j,k,4,c) - 
+     >                            dtty2 * speed(i,j+1,k,c)
+                lhs(i,j,k,5+10,c) = lhs(i,j,k,5,c)
+             end do
+          end do
+       end do
+
+       return
+       end
+
+
+
diff --git a/examples/smpi/NAS/SP/lhsz.f b/examples/smpi/NAS/SP/lhsz.f
new file mode 100644
index 0000000000..08ea0bc24d
--- /dev/null
+++ b/examples/smpi/NAS/SP/lhsz.f
@@ -0,0 +1,123 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine lhsz(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c This function computes the left hand side for the three z-factors   
+c---------------------------------------------------------------------
+
+       include 'header.h'
+
+       double precision ru1
+       integer i, j, k, c
+
+c---------------------------------------------------------------------
+c      treat only cell c                                         
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c first fill the lhs for the u-eigenvalue                          
+c---------------------------------------------------------------------
+       do   j = start(2,c), cell_size(2,c)-end(2,c)-1
+          do   i = start(1,c), cell_size(1,c)-end(1,c)-1
+
+             do   k = start(3,c)-1, cell_size(3,c)-end(3,c)
+                ru1 = c3c4*rho_i(i,j,k,c)
+                cv(k) = ws(i,j,k,c)
+                rhos(k) = dmax1(dz4 + con43 * ru1,
+     >                          dz5 + c1c5 * ru1,
+     >                          dzmax + ru1,
+     >                          dz1)
+             end do
+
+             do   k =  start(3,c), cell_size(3,c)-end(3,c)-1
+                lhs(i,j,k,1,c) =  0.0d0
+                lhs(i,j,k,2,c) = -dttz2 * cv(k-1) - dttz1 * rhos(k-1)
+                lhs(i,j,k,3,c) =  1.0 + c2dttz1 * rhos(k)
+                lhs(i,j,k,4,c) =  dttz2 * cv(k+1) - dttz1 * rhos(k+1)
+                lhs(i,j,k,5,c) =  0.0d0
+             end do
+          end do
+       end do
+
+c---------------------------------------------------------------------
+c      add fourth order dissipation                                  
+c---------------------------------------------------------------------
+       if (start(3,c) .gt. 0) then
+          k = 1
+          do    j = start(2,c), cell_size(2,c)-end(2,c)-1
+             do    i = start(1,c), cell_size(1,c)-end(1,c)-1
+                lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz5
+                lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
+                lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
+
+                lhs(i,j,k+1,2,c) = lhs(i,j,k+1,2,c) - comz4
+                lhs(i,j,k+1,3,c) = lhs(i,j,k+1,3,c) + comz6
+                lhs(i,j,k+1,4,c) = lhs(i,j,k+1,4,c) - comz4
+                lhs(i,j,k+1,5,c) = lhs(i,j,k+1,5,c) + comz1
+             end do
+          end do
+       endif
+
+       do    k = 3*start(3,c), cell_size(3,c)-3*end(3,c)-1
+          do    j = start(2,c), cell_size(2,c)-end(2,c)-1
+             do    i = start(1,c), cell_size(1,c)-end(1,c)-1
+                lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
+                lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
+                lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
+                lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
+                lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
+             end do
+          end do
+       end do
+
+       if (end(3,c) .gt. 0) then
+          k = cell_size(3,c)-3 
+          do    j = start(2,c), cell_size(2,c)-end(2,c)-1
+             do    i = start(1,c), cell_size(1,c)-end(1,c)-1
+                lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
+                lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
+                lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
+                lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
+
+                lhs(i,j,k+1,1,c) = lhs(i,j,k+1,1,c) + comz1
+                lhs(i,j,k+1,2,c) = lhs(i,j,k+1,2,c) - comz4
+                lhs(i,j,k+1,3,c) = lhs(i,j,k+1,3,c) + comz5
+             end do
+          end do
+       endif
+
+
+c---------------------------------------------------------------------
+c      subsequently, fill the other factors (u+c), (u-c) 
+c---------------------------------------------------------------------
+       do    k = start(3,c), cell_size(3,c)-end(3,c)-1
+          do    j = start(2,c), cell_size(2,c)-end(2,c)-1
+             do    i = start(1,c), cell_size(1,c)-end(1,c)-1
+                lhs(i,j,k,1+5,c)  = lhs(i,j,k,1,c)
+                lhs(i,j,k,2+5,c)  = lhs(i,j,k,2,c) - 
+     >                            dttz2 * speed(i,j,k-1,c)
+                lhs(i,j,k,3+5,c)  = lhs(i,j,k,3,c)
+                lhs(i,j,k,4+5,c)  = lhs(i,j,k,4,c) + 
+     >                            dttz2 * speed(i,j,k+1,c)
+                lhs(i,j,k,5+5,c) = lhs(i,j,k,5,c)
+                lhs(i,j,k,1+10,c) = lhs(i,j,k,1,c)
+                lhs(i,j,k,2+10,c) = lhs(i,j,k,2,c) + 
+     >                            dttz2 * speed(i,j,k-1,c)
+                lhs(i,j,k,3+10,c) = lhs(i,j,k,3,c)
+                lhs(i,j,k,4+10,c) = lhs(i,j,k,4,c) - 
+     >                            dttz2 * speed(i,j,k+1,c)
+                lhs(i,j,k,5+10,c) = lhs(i,j,k,5,c)
+             end do
+          end do
+       end do
+
+       return
+       end
+
+
diff --git a/examples/smpi/NAS/SP/make_set.f b/examples/smpi/NAS/SP/make_set.f
new file mode 100644
index 0000000000..7a84e93010
--- /dev/null
+++ b/examples/smpi/NAS/SP/make_set.f
@@ -0,0 +1,120 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine make_set
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c This function allocates space for a set of cells and fills the set     
+c such that communication between cells on different nodes is only
+c nearest neighbor                                                   
+c---------------------------------------------------------------------
+
+       include 'header.h'
+       include 'mpinpb.h'
+
+       integer p, i, j, c, dir, size, excess, ierr,ierrcode
+
+c---------------------------------------------------------------------
+c     compute square root; add small number to allow for roundoff
+c     (note: this is computed in setup_mpi.f also, but prefer to do
+c     it twice because of some include file problems).
+c---------------------------------------------------------------------
+      ncells = dint(dsqrt(dble(no_nodes) + 0.00001d0))
+
+c---------------------------------------------------------------------
+c      this makes coding easier
+c---------------------------------------------------------------------
+       p = ncells
+   
+c---------------------------------------------------------------------
+c      determine the location of the cell at the bottom of the 3D 
+c      array of cells
+c---------------------------------------------------------------------
+       cell_coord(1,1) = mod(node,p) 
+       cell_coord(2,1) = node/p 
+       cell_coord(3,1) = 0
+
+c---------------------------------------------------------------------
+c      set the cell_coords for cells in the rest of the z-layers; 
+c      this comes down to a simple linear numbering in the z-direct-
+c      ion, and to the doubly-cyclic numbering in the other dirs     
+c---------------------------------------------------------------------
+       do    c=2, p
+          cell_coord(1,c) = mod(cell_coord(1,c-1)+1,p) 
+          cell_coord(2,c) = mod(cell_coord(2,c-1)-1+p,p) 
+          cell_coord(3,c) = c-1
+       end do
+
+c---------------------------------------------------------------------
+c      offset all the coordinates by 1 to adjust for Fortran arrays
+c---------------------------------------------------------------------
+       do    dir = 1, 3
+          do    c = 1, p
+             cell_coord(dir,c) = cell_coord(dir,c) + 1
+          end do
+       end do
+   
+c---------------------------------------------------------------------
+c      slice(dir,n) contains the sequence number of the cell that is in
+c      coordinate plane n in the dir direction
+c---------------------------------------------------------------------
+       do   dir = 1, 3
+          do   c = 1, p
+             slice(dir,cell_coord(dir,c)) = c
+          end do
+       end do
+
+
+c---------------------------------------------------------------------
+c      fill the predecessor and successor entries, using the indices 
+c      of the bottom cells (they are the same at each level of k 
+c      anyway) acting as if full periodicity pertains; note that p is
+c      added to those arguments to the mod functions that might
+c      otherwise return wrong values when using the modulo function
+c---------------------------------------------------------------------
+       i = cell_coord(1,1)-1
+       j = cell_coord(2,1)-1
+
+       predecessor(1) = mod(i-1+p,p) + p*j
+       predecessor(2) = i + p*mod(j-1+p,p)
+       predecessor(3) = mod(i+1,p) + p*mod(j-1+p,p)
+       successor(1)   = mod(i+1,p) + p*j
+       successor(2)   = i + p*mod(j+1,p)
+       successor(3)   = mod(i-1+p,p) + p*mod(j+1,p)
+
+c---------------------------------------------------------------------
+c now compute the sizes of the cells                                    
+c---------------------------------------------------------------------
+       do    dir= 1, 3
+c---------------------------------------------------------------------
+c         set cell_coord range for each direction                            
+c---------------------------------------------------------------------
+          size   = grid_points(dir)/p
+          excess = mod(grid_points(dir),p)
+          do    c=1, ncells
+             if (cell_coord(dir,c) .le. excess) then
+                cell_size(dir,c) = size+1
+                cell_low(dir,c) = (cell_coord(dir,c)-1)*(size+1)
+                cell_high(dir,c) = cell_low(dir,c)+size
+             else 
+                cell_size(dir,c) = size
+                cell_low(dir,c)  = excess*(size+1)+
+     >                   (cell_coord(dir,c)-excess-1)*size
+                cell_high(dir,c) = cell_low(dir,c)+size-1
+             endif
+             if (cell_size(dir, c) .le. 2) then
+                write(*,50)
+ 50             format(' Error: Cell size too small. Min size is 3')
+                call MPI_Abort(mpi_comm_world,ierrcode,ierr)
+                stop
+             endif
+          end do
+       end do
+
+       return
+       end
+
diff --git a/examples/smpi/NAS/SP/mpinpb.h b/examples/smpi/NAS/SP/mpinpb.h
new file mode 100644
index 0000000000..439db34f60
--- /dev/null
+++ b/examples/smpi/NAS/SP/mpinpb.h
@@ -0,0 +1,13 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'mpif.h'
+
+      integer           node, no_nodes, total_nodes, root, comm_setup, 
+     >                  comm_solve, comm_rhs, dp_type
+      logical           active
+      common /mpistuff/ node, no_nodes, total_nodes, root, comm_setup, 
+     >                  comm_solve, comm_rhs, dp_type, active
+      integer           DEFAULT_TAG
+      parameter         (DEFAULT_TAG = 0)
diff --git a/examples/smpi/NAS/SP/ninvr.f b/examples/smpi/NAS/SP/ninvr.f
new file mode 100644
index 0000000000..146d046e8b
--- /dev/null
+++ b/examples/smpi/NAS/SP/ninvr.f
@@ -0,0 +1,45 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine  ninvr(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   block-diagonal matrix-vector multiplication              
+c---------------------------------------------------------------------
+
+       include 'header.h'
+
+       integer  c,  i, j, k
+       double precision r1, r2, r3, r4, r5, t1, t2
+
+c---------------------------------------------------------------------
+c      treat only one cell                           
+c---------------------------------------------------------------------
+       do k = start(3,c), cell_size(3,c)-end(3,c)-1
+          do j = start(2,c), cell_size(2,c)-end(2,c)-1
+             do i = start(1,c), cell_size(1,c)-end(1,c)-1
+
+                r1 = rhs(i,j,k,1,c)
+                r2 = rhs(i,j,k,2,c)
+                r3 = rhs(i,j,k,3,c)
+                r4 = rhs(i,j,k,4,c)
+                r5 = rhs(i,j,k,5,c)
+               
+                t1 = bt * r3
+                t2 = 0.5d0 * ( r4 + r5 )
+
+                rhs(i,j,k,1,c) = -r2
+                rhs(i,j,k,2,c) =  r1
+                rhs(i,j,k,3,c) = bt * ( r4 - r5 )
+                rhs(i,j,k,4,c) = -t1 + t2
+                rhs(i,j,k,5,c) =  t1 + t2
+             enddo    
+          enddo
+       enddo
+
+       return
+       end
diff --git a/examples/smpi/NAS/SP/pinvr.f b/examples/smpi/NAS/SP/pinvr.f
new file mode 100644
index 0000000000..060f0a57ef
--- /dev/null
+++ b/examples/smpi/NAS/SP/pinvr.f
@@ -0,0 +1,48 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine pinvr(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   block-diagonal matrix-vector multiplication                       
+c---------------------------------------------------------------------
+
+       include 'header.h'
+
+       integer i, j, k, c
+       double precision r1, r2, r3, r4, r5, t1, t2
+
+c---------------------------------------------------------------------
+c      treat only one cell                                   
+c---------------------------------------------------------------------
+       do   k = start(3,c), cell_size(3,c)-end(3,c)-1
+          do   j = start(2,c), cell_size(2,c)-end(2,c)-1
+             do   i = start(1,c), cell_size(1,c)-end(1,c)-1
+
+                r1 = rhs(i,j,k,1,c)
+                r2 = rhs(i,j,k,2,c)
+                r3 = rhs(i,j,k,3,c)
+                r4 = rhs(i,j,k,4,c)
+                r5 = rhs(i,j,k,5,c)
+
+                t1 = bt * r1
+                t2 = 0.5d0 * ( r4 + r5 )
+
+                rhs(i,j,k,1,c) =  bt * ( r4 - r5 )
+                rhs(i,j,k,2,c) = -r3
+                rhs(i,j,k,3,c) =  r2
+                rhs(i,j,k,4,c) = -t1 + t2
+                rhs(i,j,k,5,c) =  t1 + t2
+             end do
+          end do
+       end do
+
+       return
+       end
+
+
+
diff --git a/examples/smpi/NAS/SP/rhs.f b/examples/smpi/NAS/SP/rhs.f
new file mode 100644
index 0000000000..34e562a4e3
--- /dev/null
+++ b/examples/smpi/NAS/SP/rhs.f
@@ -0,0 +1,446 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine compute_rhs
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       include 'header.h'
+
+       integer c, i, j, k, m
+       double precision aux, rho_inv, uijk, up1, um1, vijk, vp1, vm1,
+     >                  wijk, wp1, wm1
+
+
+c---------------------------------------------------------------------
+c loop over all cells owned by this node                           
+c---------------------------------------------------------------------
+       do    c = 1, ncells
+
+c---------------------------------------------------------------------
+c         compute the reciprocal of density, and the kinetic energy, 
+c         and the speed of sound. 
+c---------------------------------------------------------------------
+
+          do    k = -1, cell_size(3,c)
+             do    j = -1, cell_size(2,c)
+                do    i = -1, cell_size(1,c)
+                   rho_inv = 1.0d0/u(i,j,k,1,c)
+                   rho_i(i,j,k,c) = rho_inv
+                   us(i,j,k,c) = u(i,j,k,2,c) * rho_inv
+                   vs(i,j,k,c) = u(i,j,k,3,c) * rho_inv
+                   ws(i,j,k,c) = u(i,j,k,4,c) * rho_inv
+                   square(i,j,k,c)     = 0.5d0* (
+     >                        u(i,j,k,2,c)*u(i,j,k,2,c) + 
+     >                        u(i,j,k,3,c)*u(i,j,k,3,c) +
+     >                        u(i,j,k,4,c)*u(i,j,k,4,c) ) * rho_inv
+                   qs(i,j,k,c) = square(i,j,k,c) * rho_inv
+c---------------------------------------------------------------------
+c                  (don't need speed and ainx until the lhs computation)
+c---------------------------------------------------------------------
+                   aux = c1c2*rho_inv* (u(i,j,k,5,c) - square(i,j,k,c))
+                   aux = dsqrt(aux)
+                   speed(i,j,k,c) = aux
+                   ainv(i,j,k,c)  = 1.0d0/aux
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c copy the exact forcing term to the right hand side;  because 
+c this forcing term is known, we can store it on the whole of every 
+c cell,  including the boundary                   
+c---------------------------------------------------------------------
+
+          do   m = 1, 5
+             do   k = 0, cell_size(3,c)-1
+                do   j = 0, cell_size(2,c)-1
+                   do   i = 0, cell_size(1,c)-1
+                      rhs(i,j,k,m,c) = forcing(i,j,k,m,c)
+                   end do
+                end do
+             end do
+          end do
+
+
+c---------------------------------------------------------------------
+c         compute xi-direction fluxes 
+c---------------------------------------------------------------------
+          do    k = start(3,c), cell_size(3,c)-end(3,c)-1
+             do    j = start(2,c), cell_size(2,c)-end(2,c)-1
+                do    i = start(1,c), cell_size(1,c)-end(1,c)-1
+                   uijk = us(i,j,k,c)
+                   up1  = us(i+1,j,k,c)
+                   um1  = us(i-1,j,k,c)
+
+                   rhs(i,j,k,1,c) = rhs(i,j,k,1,c) + dx1tx1 * 
+     >                    (u(i+1,j,k,1,c) - 2.0d0*u(i,j,k,1,c) + 
+     >                     u(i-1,j,k,1,c)) -
+     >                    tx2 * (u(i+1,j,k,2,c) - u(i-1,j,k,2,c))
+
+                   rhs(i,j,k,2,c) = rhs(i,j,k,2,c) + dx2tx1 * 
+     >                    (u(i+1,j,k,2,c) - 2.0d0*u(i,j,k,2,c) + 
+     >                     u(i-1,j,k,2,c)) +
+     >                    xxcon2*con43 * (up1 - 2.0d0*uijk + um1) -
+     >                    tx2 * (u(i+1,j,k,2,c)*up1 - 
+     >                           u(i-1,j,k,2,c)*um1 +
+     >                           (u(i+1,j,k,5,c)- square(i+1,j,k,c)-
+     >                            u(i-1,j,k,5,c)+ square(i-1,j,k,c))*
+     >                            c2)
+
+                   rhs(i,j,k,3,c) = rhs(i,j,k,3,c) + dx3tx1 * 
+     >                    (u(i+1,j,k,3,c) - 2.0d0*u(i,j,k,3,c) +
+     >                     u(i-1,j,k,3,c)) +
+     >                    xxcon2 * (vs(i+1,j,k,c) - 2.0d0*vs(i,j,k,c) +
+     >                              vs(i-1,j,k,c)) -
+     >                    tx2 * (u(i+1,j,k,3,c)*up1 - 
+     >                           u(i-1,j,k,3,c)*um1)
+
+                   rhs(i,j,k,4,c) = rhs(i,j,k,4,c) + dx4tx1 * 
+     >                    (u(i+1,j,k,4,c) - 2.0d0*u(i,j,k,4,c) +
+     >                     u(i-1,j,k,4,c)) +
+     >                    xxcon2 * (ws(i+1,j,k,c) - 2.0d0*ws(i,j,k,c) +
+     >                              ws(i-1,j,k,c)) -
+     >                    tx2 * (u(i+1,j,k,4,c)*up1 - 
+     >                           u(i-1,j,k,4,c)*um1)
+
+                   rhs(i,j,k,5,c) = rhs(i,j,k,5,c) + dx5tx1 * 
+     >                    (u(i+1,j,k,5,c) - 2.0d0*u(i,j,k,5,c) +
+     >                     u(i-1,j,k,5,c)) +
+     >                    xxcon3 * (qs(i+1,j,k,c) - 2.0d0*qs(i,j,k,c) +
+     >                              qs(i-1,j,k,c)) +
+     >                    xxcon4 * (up1*up1 -       2.0d0*uijk*uijk + 
+     >                              um1*um1) +
+     >                    xxcon5 * (u(i+1,j,k,5,c)*rho_i(i+1,j,k,c) - 
+     >                              2.0d0*u(i,j,k,5,c)*rho_i(i,j,k,c) +
+     >                              u(i-1,j,k,5,c)*rho_i(i-1,j,k,c)) -
+     >                    tx2 * ( (c1*u(i+1,j,k,5,c) - 
+     >                             c2*square(i+1,j,k,c))*up1 -
+     >                            (c1*u(i-1,j,k,5,c) - 
+     >                             c2*square(i-1,j,k,c))*um1 )
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c         add fourth order xi-direction dissipation               
+c---------------------------------------------------------------------
+          if (start(1,c) .gt. 0) then
+             i = 1
+             do    m = 1, 5
+                do    k = start(3,c), cell_size(3,c)-end(3,c)-1
+                   do    j = start(2,c), cell_size(2,c)-end(2,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c)- dssp * 
+     >                    ( 5.0d0*u(i,j,k,m,c) - 4.0d0*u(i+1,j,k,m,c) +
+     >                            u(i+2,j,k,m,c))
+                   end do
+                end do
+             end do
+
+             i = 2
+             do    m = 1, 5
+                do    k = start(3,c), cell_size(3,c)-end(3,c)-1
+                   do    j = start(2,c), cell_size(2,c)-end(2,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * 
+     >                    (-4.0d0*u(i-1,j,k,m,c) + 6.0d0*u(i,j,k,m,c) -
+     >                      4.0d0*u(i+1,j,k,m,c) + u(i+2,j,k,m,c))
+                   end do
+                end do
+             end do
+          endif
+
+          do     m = 1, 5
+             do     k = start(3,c), cell_size(3,c)-end(3,c)-1
+                do     j = start(2,c), cell_size(2,c)-end(2,c)-1
+                   do  i = 3*start(1,c),cell_size(1,c)-3*end(1,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * 
+     >                    (  u(i-2,j,k,m,c) - 4.0d0*u(i-1,j,k,m,c) + 
+     >                     6.0*u(i,j,k,m,c) - 4.0d0*u(i+1,j,k,m,c) + 
+     >                         u(i+2,j,k,m,c) )
+                   end do
+                end do
+             end do
+          end do
+ 
+
+          if (end(1,c) .gt. 0) then
+             i = cell_size(1,c)-3
+             do     m = 1, 5
+                do     k = start(3,c), cell_size(3,c)-end(3,c)-1
+                   do     j = start(2,c), cell_size(2,c)-end(2,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
+     >                    ( u(i-2,j,k,m,c) - 4.0d0*u(i-1,j,k,m,c) + 
+     >                      6.0d0*u(i,j,k,m,c) - 4.0d0*u(i+1,j,k,m,c) )
+                   end do
+                end do
+             end do
+
+             i = cell_size(1,c)-2
+             do     m = 1, 5
+                do     k = start(3,c), cell_size(3,c)-end(3,c)-1
+                   do     j = start(2,c), cell_size(2,c)-end(2,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
+     >                    ( u(i-2,j,k,m,c) - 4.d0*u(i-1,j,k,m,c) +
+     >                      5.d0*u(i,j,k,m,c) )
+                   end do
+                end do
+             end do
+          endif
+
+c---------------------------------------------------------------------
+c         compute eta-direction fluxes 
+c---------------------------------------------------------------------
+          do     k = start(3,c), cell_size(3,c)-end(3,c)-1
+             do     j = start(2,c), cell_size(2,c)-end(2,c)-1
+                do     i = start(1,c), cell_size(1,c)-end(1,c)-1
+                   vijk = vs(i,j,k,c)
+                   vp1  = vs(i,j+1,k,c)
+                   vm1  = vs(i,j-1,k,c)
+                   rhs(i,j,k,1,c) = rhs(i,j,k,1,c) + dy1ty1 * 
+     >                   (u(i,j+1,k,1,c) - 2.0d0*u(i,j,k,1,c) + 
+     >                    u(i,j-1,k,1,c)) -
+     >                   ty2 * (u(i,j+1,k,3,c) - u(i,j-1,k,3,c))
+                   rhs(i,j,k,2,c) = rhs(i,j,k,2,c) + dy2ty1 * 
+     >                   (u(i,j+1,k,2,c) - 2.0d0*u(i,j,k,2,c) + 
+     >                    u(i,j-1,k,2,c)) +
+     >                   yycon2 * (us(i,j+1,k,c) - 2.0d0*us(i,j,k,c) + 
+     >                             us(i,j-1,k,c)) -
+     >                   ty2 * (u(i,j+1,k,2,c)*vp1 - 
+     >                          u(i,j-1,k,2,c)*vm1)
+                   rhs(i,j,k,3,c) = rhs(i,j,k,3,c) + dy3ty1 * 
+     >                   (u(i,j+1,k,3,c) - 2.0d0*u(i,j,k,3,c) + 
+     >                    u(i,j-1,k,3,c)) +
+     >                   yycon2*con43 * (vp1 - 2.0d0*vijk + vm1) -
+     >                   ty2 * (u(i,j+1,k,3,c)*vp1 - 
+     >                          u(i,j-1,k,3,c)*vm1 +
+     >                          (u(i,j+1,k,5,c) - square(i,j+1,k,c) - 
+     >                           u(i,j-1,k,5,c) + square(i,j-1,k,c))
+     >                          *c2)
+                   rhs(i,j,k,4,c) = rhs(i,j,k,4,c) + dy4ty1 * 
+     >                   (u(i,j+1,k,4,c) - 2.0d0*u(i,j,k,4,c) + 
+     >                    u(i,j-1,k,4,c)) +
+     >                   yycon2 * (ws(i,j+1,k,c) - 2.0d0*ws(i,j,k,c) + 
+     >                             ws(i,j-1,k,c)) -
+     >                   ty2 * (u(i,j+1,k,4,c)*vp1 - 
+     >                          u(i,j-1,k,4,c)*vm1)
+                   rhs(i,j,k,5,c) = rhs(i,j,k,5,c) + dy5ty1 * 
+     >                   (u(i,j+1,k,5,c) - 2.0d0*u(i,j,k,5,c) + 
+     >                    u(i,j-1,k,5,c)) +
+     >                   yycon3 * (qs(i,j+1,k,c) - 2.0d0*qs(i,j,k,c) + 
+     >                             qs(i,j-1,k,c)) +
+     >                   yycon4 * (vp1*vp1       - 2.0d0*vijk*vijk + 
+     >                             vm1*vm1) +
+     >                   yycon5 * (u(i,j+1,k,5,c)*rho_i(i,j+1,k,c) - 
+     >                             2.0d0*u(i,j,k,5,c)*rho_i(i,j,k,c) +
+     >                             u(i,j-1,k,5,c)*rho_i(i,j-1,k,c)) -
+     >                   ty2 * ((c1*u(i,j+1,k,5,c) - 
+     >                           c2*square(i,j+1,k,c)) * vp1 -
+     >                          (c1*u(i,j-1,k,5,c) - 
+     >                           c2*square(i,j-1,k,c)) * vm1)
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c         add fourth order eta-direction dissipation         
+c---------------------------------------------------------------------
+          if (start(2,c) .gt. 0) then
+             j = 1
+             do     m = 1, 5
+                do     k = start(3,c), cell_size(3,c)-end(3,c)-1
+                   do     i = start(1,c), cell_size(1,c)-end(1,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c)- dssp * 
+     >                    ( 5.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j+1,k,m,c) +
+     >                            u(i,j+2,k,m,c))
+                   end do
+                end do
+             end do
+
+             j = 2
+             do     m = 1, 5
+                do     k = start(3,c), cell_size(3,c)-end(3,c)-1
+                   do     i = start(1,c), cell_size(1,c)-end(1,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * 
+     >                    (-4.0d0*u(i,j-1,k,m,c) + 6.0d0*u(i,j,k,m,c) -
+     >                      4.0d0*u(i,j+1,k,m,c) + u(i,j+2,k,m,c))
+                   end do
+                end do
+             end do
+          endif
+
+          do     m = 1, 5
+             do     k = start(3,c), cell_size(3,c)-end(3,c)-1
+                do    j = 3*start(2,c), cell_size(2,c)-3*end(2,c)-1
+                   do  i = start(1,c),cell_size(1,c)-end(1,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * 
+     >                    (  u(i,j-2,k,m,c) - 4.0d0*u(i,j-1,k,m,c) + 
+     >                     6.0*u(i,j,k,m,c) - 4.0d0*u(i,j+1,k,m,c) + 
+     >                         u(i,j+2,k,m,c) )
+                   end do
+                end do
+             end do
+          end do
+ 
+          if (end(2,c) .gt. 0) then
+             j = cell_size(2,c)-3
+             do     m = 1, 5
+                do     k = start(3,c), cell_size(3,c)-end(3,c)-1
+                   do     i = start(1,c), cell_size(1,c)-end(1,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
+     >                    ( u(i,j-2,k,m,c) - 4.0d0*u(i,j-1,k,m,c) + 
+     >                      6.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j+1,k,m,c) )
+                   end do
+                end do
+             end do
+
+             j = cell_size(2,c)-2
+             do     m = 1, 5
+                do     k = start(3,c), cell_size(3,c)-end(3,c)-1
+                   do     i = start(1,c), cell_size(1,c)-end(1,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
+     >                    ( u(i,j-2,k,m,c) - 4.d0*u(i,j-1,k,m,c) +
+     >                      5.d0*u(i,j,k,m,c) )
+                   end do
+                end do
+             end do
+          endif
+
+
+c---------------------------------------------------------------------
+c         compute zeta-direction fluxes 
+c---------------------------------------------------------------------
+          do    k = start(3,c), cell_size(3,c)-end(3,c)-1
+             do     j = start(2,c), cell_size(2,c)-end(2,c)-1
+                do     i = start(1,c), cell_size(1,c)-end(1,c)-1
+                   wijk = ws(i,j,k,c)
+                   wp1  = ws(i,j,k+1,c)
+                   wm1  = ws(i,j,k-1,c)
+
+                   rhs(i,j,k,1,c) = rhs(i,j,k,1,c) + dz1tz1 * 
+     >                   (u(i,j,k+1,1,c) - 2.0d0*u(i,j,k,1,c) + 
+     >                    u(i,j,k-1,1,c)) -
+     >                   tz2 * (u(i,j,k+1,4,c) - u(i,j,k-1,4,c))
+                   rhs(i,j,k,2,c) = rhs(i,j,k,2,c) + dz2tz1 * 
+     >                   (u(i,j,k+1,2,c) - 2.0d0*u(i,j,k,2,c) + 
+     >                    u(i,j,k-1,2,c)) +
+     >                   zzcon2 * (us(i,j,k+1,c) - 2.0d0*us(i,j,k,c) + 
+     >                             us(i,j,k-1,c)) -
+     >                   tz2 * (u(i,j,k+1,2,c)*wp1 - 
+     >                          u(i,j,k-1,2,c)*wm1)
+                   rhs(i,j,k,3,c) = rhs(i,j,k,3,c) + dz3tz1 * 
+     >                   (u(i,j,k+1,3,c) - 2.0d0*u(i,j,k,3,c) + 
+     >                    u(i,j,k-1,3,c)) +
+     >                   zzcon2 * (vs(i,j,k+1,c) - 2.0d0*vs(i,j,k,c) + 
+     >                             vs(i,j,k-1,c)) -
+     >                   tz2 * (u(i,j,k+1,3,c)*wp1 - 
+     >                          u(i,j,k-1,3,c)*wm1)
+                   rhs(i,j,k,4,c) = rhs(i,j,k,4,c) + dz4tz1 * 
+     >                   (u(i,j,k+1,4,c) - 2.0d0*u(i,j,k,4,c) + 
+     >                    u(i,j,k-1,4,c)) +
+     >                   zzcon2*con43 * (wp1 - 2.0d0*wijk + wm1) -
+     >                   tz2 * (u(i,j,k+1,4,c)*wp1 - 
+     >                          u(i,j,k-1,4,c)*wm1 +
+     >                          (u(i,j,k+1,5,c) - square(i,j,k+1,c) - 
+     >                           u(i,j,k-1,5,c) + square(i,j,k-1,c))
+     >                          *c2)
+                   rhs(i,j,k,5,c) = rhs(i,j,k,5,c) + dz5tz1 * 
+     >                   (u(i,j,k+1,5,c) - 2.0d0*u(i,j,k,5,c) + 
+     >                    u(i,j,k-1,5,c)) +
+     >                   zzcon3 * (qs(i,j,k+1,c) - 2.0d0*qs(i,j,k,c) + 
+     >                             qs(i,j,k-1,c)) +
+     >                   zzcon4 * (wp1*wp1 - 2.0d0*wijk*wijk + 
+     >                             wm1*wm1) +
+     >                   zzcon5 * (u(i,j,k+1,5,c)*rho_i(i,j,k+1,c) - 
+     >                             2.0d0*u(i,j,k,5,c)*rho_i(i,j,k,c) +
+     >                             u(i,j,k-1,5,c)*rho_i(i,j,k-1,c)) -
+     >                   tz2 * ( (c1*u(i,j,k+1,5,c) - 
+     >                            c2*square(i,j,k+1,c))*wp1 -
+     >                           (c1*u(i,j,k-1,5,c) - 
+     >                            c2*square(i,j,k-1,c))*wm1)
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c         add fourth order zeta-direction dissipation                
+c---------------------------------------------------------------------
+          if (start(3,c) .gt. 0) then
+             k = 1
+             do     m = 1, 5
+                do     j = start(2,c), cell_size(2,c)-end(2,c)-1
+                   do     i = start(1,c), cell_size(1,c)-end(1,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c)- dssp * 
+     >                    ( 5.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j,k+1,m,c) +
+     >                            u(i,j,k+2,m,c))
+                   end do
+                end do
+             end do
+
+             k = 2
+             do     m = 1, 5
+                do     j = start(2,c), cell_size(2,c)-end(2,c)-1
+                   do     i = start(1,c), cell_size(1,c)-end(1,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * 
+     >                    (-4.0d0*u(i,j,k-1,m,c) + 6.0d0*u(i,j,k,m,c) -
+     >                      4.0d0*u(i,j,k+1,m,c) + u(i,j,k+2,m,c))
+                   end do
+                end do
+             end do
+          endif
+
+          do     m = 1, 5
+             do     k = 3*start(3,c), cell_size(3,c)-3*end(3,c)-1
+                do     j = start(2,c), cell_size(2,c)-end(2,c)-1
+                   do     i = start(1,c),cell_size(1,c)-end(1,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * 
+     >                    (  u(i,j,k-2,m,c) - 4.0d0*u(i,j,k-1,m,c) + 
+     >                     6.0*u(i,j,k,m,c) - 4.0d0*u(i,j,k+1,m,c) + 
+     >                         u(i,j,k+2,m,c) )
+                   end do
+                end do
+             end do
+          end do
+ 
+          if (end(3,c) .gt. 0) then
+             k = cell_size(3,c)-3
+             do     m = 1, 5
+                do     j = start(2,c), cell_size(2,c)-end(2,c)-1
+                   do     i = start(1,c), cell_size(1,c)-end(1,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
+     >                    ( u(i,j,k-2,m,c) - 4.0d0*u(i,j,k-1,m,c) + 
+     >                      6.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j,k+1,m,c) )
+                   end do
+                end do
+             end do
+
+             k = cell_size(3,c)-2
+             do     m = 1, 5
+                do     j = start(2,c), cell_size(2,c)-end(2,c)-1
+                   do     i = start(1,c), cell_size(1,c)-end(1,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
+     >                    ( u(i,j,k-2,m,c) - 4.d0*u(i,j,k-1,m,c) +
+     >                      5.d0*u(i,j,k,m,c) )
+                   end do
+                end do
+             end do
+          endif
+
+          do     m = 1, 5
+             do     k = start(3,c), cell_size(3,c)-end(3,c)-1
+                do     j = start(2,c), cell_size(2,c)-end(2,c)-1
+                   do    i = start(1,c), cell_size(1,c)-end(1,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) * dt
+                   end do
+                end do
+             end do
+          end do
+
+       end do
+    
+       return
+       end
+
+
+
+
diff --git a/examples/smpi/NAS/SP/set_constants.f b/examples/smpi/NAS/SP/set_constants.f
new file mode 100644
index 0000000000..63ce72bb9b
--- /dev/null
+++ b/examples/smpi/NAS/SP/set_constants.f
@@ -0,0 +1,203 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine  set_constants
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       include 'header.h'
+  
+       ce(1,1)  = 2.0d0
+       ce(1,2)  = 0.0d0
+       ce(1,3)  = 0.0d0
+       ce(1,4)  = 4.0d0
+       ce(1,5)  = 5.0d0
+       ce(1,6)  = 3.0d0
+       ce(1,7)  = 0.5d0
+       ce(1,8)  = 0.02d0
+       ce(1,9)  = 0.01d0
+       ce(1,10) = 0.03d0
+       ce(1,11) = 0.5d0
+       ce(1,12) = 0.4d0
+       ce(1,13) = 0.3d0
+ 
+       ce(2,1)  = 1.0d0
+       ce(2,2)  = 0.0d0
+       ce(2,3)  = 0.0d0
+       ce(2,4)  = 0.0d0
+       ce(2,5)  = 1.0d0
+       ce(2,6)  = 2.0d0
+       ce(2,7)  = 3.0d0
+       ce(2,8)  = 0.01d0
+       ce(2,9)  = 0.03d0
+       ce(2,10) = 0.02d0
+       ce(2,11) = 0.4d0
+       ce(2,12) = 0.3d0
+       ce(2,13) = 0.5d0
+
+       ce(3,1)  = 2.0d0
+       ce(3,2)  = 2.0d0
+       ce(3,3)  = 0.0d0
+       ce(3,4)  = 0.0d0
+       ce(3,5)  = 0.0d0
+       ce(3,6)  = 2.0d0
+       ce(3,7)  = 3.0d0
+       ce(3,8)  = 0.04d0
+       ce(3,9)  = 0.03d0
+       ce(3,10) = 0.05d0
+       ce(3,11) = 0.3d0
+       ce(3,12) = 0.5d0
+       ce(3,13) = 0.4d0
+
+       ce(4,1)  = 2.0d0
+       ce(4,2)  = 2.0d0
+       ce(4,3)  = 0.0d0
+       ce(4,4)  = 0.0d0
+       ce(4,5)  = 0.0d0
+       ce(4,6)  = 2.0d0
+       ce(4,7)  = 3.0d0
+       ce(4,8)  = 0.03d0
+       ce(4,9)  = 0.05d0
+       ce(4,10) = 0.04d0
+       ce(4,11) = 0.2d0
+       ce(4,12) = 0.1d0
+       ce(4,13) = 0.3d0
+
+       ce(5,1)  = 5.0d0
+       ce(5,2)  = 4.0d0
+       ce(5,3)  = 3.0d0
+       ce(5,4)  = 2.0d0
+       ce(5,5)  = 0.1d0
+       ce(5,6)  = 0.4d0
+       ce(5,7)  = 0.3d0
+       ce(5,8)  = 0.05d0
+       ce(5,9)  = 0.04d0
+       ce(5,10) = 0.03d0
+       ce(5,11) = 0.1d0
+       ce(5,12) = 0.3d0
+       ce(5,13) = 0.2d0
+
+       c1 = 1.4d0
+       c2 = 0.4d0
+       c3 = 0.1d0
+       c4 = 1.0d0
+       c5 = 1.4d0
+
+       bt = dsqrt(0.5d0)
+
+       dnxm1 = 1.0d0 / dble(grid_points(1)-1)
+       dnym1 = 1.0d0 / dble(grid_points(2)-1)
+       dnzm1 = 1.0d0 / dble(grid_points(3)-1)
+
+       c1c2 = c1 * c2
+       c1c5 = c1 * c5
+       c3c4 = c3 * c4
+       c1345 = c1c5 * c3c4
+
+       conz1 = (1.0d0-c1c5)
+
+       tx1 = 1.0d0 / (dnxm1 * dnxm1)
+       tx2 = 1.0d0 / (2.0d0 * dnxm1)
+       tx3 = 1.0d0 / dnxm1
+
+       ty1 = 1.0d0 / (dnym1 * dnym1)
+       ty2 = 1.0d0 / (2.0d0 * dnym1)
+       ty3 = 1.0d0 / dnym1
+ 
+       tz1 = 1.0d0 / (dnzm1 * dnzm1)
+       tz2 = 1.0d0 / (2.0d0 * dnzm1)
+       tz3 = 1.0d0 / dnzm1
+
+       dx1 = 0.75d0
+       dx2 = 0.75d0
+       dx3 = 0.75d0
+       dx4 = 0.75d0
+       dx5 = 0.75d0
+
+       dy1 = 0.75d0
+       dy2 = 0.75d0
+       dy3 = 0.75d0
+       dy4 = 0.75d0
+       dy5 = 0.75d0
+
+       dz1 = 1.0d0
+       dz2 = 1.0d0
+       dz3 = 1.0d0
+       dz4 = 1.0d0
+       dz5 = 1.0d0
+
+       dxmax = dmax1(dx3, dx4)
+       dymax = dmax1(dy2, dy4)
+       dzmax = dmax1(dz2, dz3)
+
+       dssp = 0.25d0 * dmax1(dx1, dmax1(dy1, dz1) )
+
+       c4dssp = 4.0d0 * dssp
+       c5dssp = 5.0d0 * dssp
+
+       dttx1 = dt*tx1
+       dttx2 = dt*tx2
+       dtty1 = dt*ty1
+       dtty2 = dt*ty2
+       dttz1 = dt*tz1
+       dttz2 = dt*tz2
+
+       c2dttx1 = 2.0d0*dttx1
+       c2dtty1 = 2.0d0*dtty1
+       c2dttz1 = 2.0d0*dttz1
+
+       dtdssp = dt*dssp
+
+       comz1  = dtdssp
+       comz4  = 4.0d0*dtdssp
+       comz5  = 5.0d0*dtdssp
+       comz6  = 6.0d0*dtdssp
+
+       c3c4tx3 = c3c4*tx3
+       c3c4ty3 = c3c4*ty3
+       c3c4tz3 = c3c4*tz3
+
+       dx1tx1 = dx1*tx1
+       dx2tx1 = dx2*tx1
+       dx3tx1 = dx3*tx1
+       dx4tx1 = dx4*tx1
+       dx5tx1 = dx5*tx1
+        
+       dy1ty1 = dy1*ty1
+       dy2ty1 = dy2*ty1
+       dy3ty1 = dy3*ty1
+       dy4ty1 = dy4*ty1
+       dy5ty1 = dy5*ty1
+        
+       dz1tz1 = dz1*tz1
+       dz2tz1 = dz2*tz1
+       dz3tz1 = dz3*tz1
+       dz4tz1 = dz4*tz1
+       dz5tz1 = dz5*tz1
+
+       c2iv  = 2.5d0
+       con43 = 4.0d0/3.0d0
+       con16 = 1.0d0/6.0d0
+        
+       xxcon1 = c3c4tx3*con43*tx3
+       xxcon2 = c3c4tx3*tx3
+       xxcon3 = c3c4tx3*conz1*tx3
+       xxcon4 = c3c4tx3*con16*tx3
+       xxcon5 = c3c4tx3*c1c5*tx3
+
+       yycon1 = c3c4ty3*con43*ty3
+       yycon2 = c3c4ty3*ty3
+       yycon3 = c3c4ty3*conz1*ty3
+       yycon4 = c3c4ty3*con16*ty3
+       yycon5 = c3c4ty3*c1c5*ty3
+
+       zzcon1 = c3c4tz3*con43*tz3
+       zzcon2 = c3c4tz3*tz3
+       zzcon3 = c3c4tz3*conz1*tz3
+       zzcon4 = c3c4tz3*con16*tz3
+       zzcon5 = c3c4tz3*c1c5*tz3
+
+       return
+       end
diff --git a/examples/smpi/NAS/SP/setup_mpi.f b/examples/smpi/NAS/SP/setup_mpi.f
new file mode 100644
index 0000000000..2d98f7dd02
--- /dev/null
+++ b/examples/smpi/NAS/SP/setup_mpi.f
@@ -0,0 +1,65 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine setup_mpi
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c set up MPI stuff
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'mpinpb.h'
+      include 'npbparams.h'
+      integer error, nc, color
+
+      call mpi_init(error)
+      
+      call mpi_comm_size(MPI_COMM_WORLD, total_nodes, error)
+      call mpi_comm_rank(MPI_COMM_WORLD, node, error)
+
+      if (.not. convertdouble) then
+         dp_type = MPI_DOUBLE_PRECISION
+      else
+         dp_type = MPI_REAL
+      endif
+
+c---------------------------------------------------------------------
+c     compute square root; add small number to allow for roundoff
+c---------------------------------------------------------------------
+      nc = dint(dsqrt(dble(total_nodes) + 0.00001d0))
+
+c---------------------------------------------------------------------
+c We handle a non-square number of nodes by making the excess nodes
+c inactive. However, we can never handle more cells than were compiled
+c in. 
+c---------------------------------------------------------------------
+
+      if (nc .gt. maxcells) nc = maxcells
+
+      if (node .ge. nc*nc) then
+         active = .false.
+         color = 1
+      else
+         active = .true.
+         color = 0
+      end if
+      
+      call mpi_comm_split(MPI_COMM_WORLD,color,node,comm_setup,error)
+      if (.not. active) return
+
+      call mpi_comm_size(comm_setup, no_nodes, error)
+      call mpi_comm_dup(comm_setup, comm_solve, error)
+      call mpi_comm_dup(comm_setup, comm_rhs, error)
+      
+c---------------------------------------------------------------------
+c     let node 0 be the root for the group (there is only one)
+c---------------------------------------------------------------------
+      root = 0
+
+      return
+      end
+
diff --git a/examples/smpi/NAS/SP/sp.f b/examples/smpi/NAS/SP/sp.f
new file mode 100644
index 0000000000..740cadee46
--- /dev/null
+++ b/examples/smpi/NAS/SP/sp.f
@@ -0,0 +1,194 @@
+!-------------------------------------------------------------------------!
+!                                                                         !
+!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
+!                                                                         !
+!                                   S P                                   !
+!                                                                         !
+!-------------------------------------------------------------------------!
+!                                                                         !
+!    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
+!    It is described in NAS Technical Reports 95-020 and 02-007           !
+!                                                                         !
+!    Permission to use, copy, distribute and modify this software         !
+!    for any purpose with or without fee is hereby granted.  We           !
+!    request, however, that all derived work reference the NAS            !
+!    Parallel Benchmarks 3.3. This software is provided "as is"           !
+!    without express or implied warranty.                                 !
+!                                                                         !
+!    Information on NPB 3.3, including the technical report, the          !
+!    original specifications, source code, results and information        !
+!    on how to submit new results, is available at:                       !
+!                                                                         !
+!           http://www.nas.nasa.gov/Software/NPB/                         !
+!                                                                         !
+!    Send comments or suggestions to  npb@nas.nasa.gov                    !
+!                                                                         !
+!          NAS Parallel Benchmarks Group                                  !
+!          NASA Ames Research Center                                      !
+!          Mail Stop: T27A-1                                              !
+!          Moffett Field, CA   94035-1000                                 !
+!                                                                         !
+!          E-mail:  npb@nas.nasa.gov                                      !
+!          Fax:     (650) 604-3957                                        !
+!                                                                         !
+!-------------------------------------------------------------------------!
+
+
+c---------------------------------------------------------------------
+c
+c Authors: R. F. Van der Wijngaart
+c          W. Saphir
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+       program MPSP
+c---------------------------------------------------------------------
+
+       include  'header.h'
+       include  'mpinpb.h'
+      
+       integer          i, niter, step, c, error, fstatus
+       external timer_read
+       double precision mflops, t, tmax, timer_read
+       logical          verified
+       character        class
+
+       call setup_mpi
+       if (.not. active) goto 999
+
+c---------------------------------------------------------------------
+c      Root node reads input file (if it exists) else takes
+c      defaults from parameters
+c---------------------------------------------------------------------
+       if (node .eq. root) then
+          
+          write(*, 1000)
+          open (unit=2,file='inputsp.data',status='old', iostat=fstatus)
+c
+          if (fstatus .eq. 0) then
+            write(*,233) 
+ 233        format(' Reading from input file inputsp.data')
+            read (2,*) niter
+            read (2,*) dt
+            read (2,*) grid_points(1), grid_points(2), grid_points(3)
+            close(2)
+          else
+            write(*,234) 
+            niter = niter_default
+            dt    = dt_default
+            grid_points(1) = problem_size
+            grid_points(2) = problem_size
+            grid_points(3) = problem_size
+          endif
+ 234      format(' No input file inputsp.data. Using compiled defaults')
+
+          write(*, 1001) grid_points(1), grid_points(2), grid_points(3)
+          write(*, 1002) niter, dt
+          if (no_nodes .ne. total_nodes) write(*, 1004) total_nodes
+          if (no_nodes .ne. maxcells*maxcells) 
+     >        write(*, 1005) maxcells*maxcells
+          write(*, 1003) no_nodes
+
+ 1000 format(//,' NAS Parallel Benchmarks 3.3 -- SP Benchmark',/)
+ 1001     format(' Size: ', i4, 'x', i4, 'x', i4)
+ 1002     format(' Iterations: ', i4, '    dt: ', F11.7)
+ 1004     format(' Total number of processes: ', i5)
+ 1005     format(' WARNING: compiled for ', i5, ' processes ')
+ 1003     format(' Number of active processes: ', i5, /)
+
+       endif
+
+       call mpi_bcast(niter, 1, MPI_INTEGER, 
+     >                root, comm_setup, error)
+
+       call mpi_bcast(dt, 1, dp_type, 
+     >                root, comm_setup, error)
+
+       call mpi_bcast(grid_points(1), 3, MPI_INTEGER, 
+     >                root, comm_setup, error)
+
+
+       call make_set
+
+       do  c = 1, ncells
+          if ( (cell_size(1,c) .gt. IMAX) .or.
+     >         (cell_size(2,c) .gt. JMAX) .or.
+     >         (cell_size(3,c) .gt. KMAX) ) then
+             print *,node, c, (cell_size(i,c),i=1,3)
+             print *,' Problem size too big for compiled array sizes'
+             goto 999
+          endif
+       end do
+
+       call set_constants
+
+       call initialize
+
+c       call mpi_finalize(error)
+c       stop
+
+       call lhsinit
+
+       call exact_rhs
+
+       call compute_buffer_size(5)
+
+c---------------------------------------------------------------------
+c      do one time step to touch all code, and reinitialize
+c---------------------------------------------------------------------
+       call adi
+       call initialize
+
+c---------------------------------------------------------------------
+c      Synchronize before placing time stamp
+c---------------------------------------------------------------------
+       call mpi_barrier(comm_setup, error)
+
+       call timer_clear(1)
+       call timer_start(1)
+
+       do  step = 1, niter
+
+          if (node .eq. root) then
+             if (mod(step, 20) .eq. 0 .or. 
+     >           step .eq. 1) then
+                write(*, 200) step
+ 200            format(' Time step ', i4)
+              endif
+          endif
+
+          call adi
+
+       end do
+
+       call timer_stop(1)
+       t = timer_read(1)
+       
+       call verify(niter, class, verified)
+
+       call mpi_reduce(t, tmax, 1, 
+     >                 dp_type, MPI_MAX, 
+     >                 root, comm_setup, error)
+
+       if( node .eq. root ) then
+          if( tmax .ne. 0. ) then
+             mflops = (881.174*float( problem_size )**3
+     >                -4683.91*float( problem_size )**2
+     >                +11484.5*float( problem_size )
+     >                -19272.4) * float( niter ) / (tmax*1000000.0d0)
+          else
+             mflops = 0.0
+          endif
+
+         call print_results('SP', class, grid_points(1), 
+     >     grid_points(2), grid_points(3), niter, maxcells*maxcells, 
+     >     total_nodes, tmax, mflops, '          floating point', 
+     >     verified, npbversion,compiletime, cs1, cs2, cs3, cs4, cs5, 
+     >     cs6, '(none)')
+       endif
+
+ 999   continue
+       call mpi_barrier(MPI_COMM_WORLD, error)
+       call mpi_finalize(error)
+
+       end
diff --git a/examples/smpi/NAS/SP/txinvr.f b/examples/smpi/NAS/SP/txinvr.f
new file mode 100644
index 0000000000..b5ca4616f5
--- /dev/null
+++ b/examples/smpi/NAS/SP/txinvr.f
@@ -0,0 +1,59 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine  txinvr
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c block-diagonal matrix-vector multiplication                  
+c---------------------------------------------------------------------
+
+       include 'header.h'
+
+       integer c, i, j, k
+       double precision t1, t2, t3, ac, ru1, uu, vv, ww, r1, r2, r3, 
+     >                  r4, r5, ac2inv
+
+c---------------------------------------------------------------------
+c      loop over all cells owned by this node          
+c---------------------------------------------------------------------
+       do   c = 1, ncells
+          do    k = start(3,c), cell_size(3,c)-end(3,c)-1
+             do    j = start(2,c), cell_size(2,c)-end(2,c)-1
+                do    i = start(1,c), cell_size(1,c)-end(1,c)-1
+
+                   ru1 = rho_i(i,j,k,c)
+                   uu = us(i,j,k,c)
+                   vv = vs(i,j,k,c)
+                   ww = ws(i,j,k,c)
+                   ac = speed(i,j,k,c)
+                   ac2inv = ainv(i,j,k,c)*ainv(i,j,k,c)
+
+                   r1 = rhs(i,j,k,1,c)
+                   r2 = rhs(i,j,k,2,c)
+                   r3 = rhs(i,j,k,3,c)
+                   r4 = rhs(i,j,k,4,c)
+                   r5 = rhs(i,j,k,5,c)
+
+                   t1 = c2 * ac2inv * ( qs(i,j,k,c)*r1 - uu*r2  - 
+     >                  vv*r3 - ww*r4 + r5 )
+                   t2 = bt * ru1 * ( uu * r1 - r2 )
+                   t3 = ( bt * ru1 * ac ) * t1
+
+                   rhs(i,j,k,1,c) = r1 - t1
+                   rhs(i,j,k,2,c) = - ru1 * ( ww*r1 - r4 )
+                   rhs(i,j,k,3,c) =   ru1 * ( vv*r1 - r3 )
+                   rhs(i,j,k,4,c) = - t2 + t3
+                   rhs(i,j,k,5,c) =   t2 + t3
+                end do
+             end do
+          end do
+       end do
+
+       return
+       end
+
+
diff --git a/examples/smpi/NAS/SP/tzetar.f b/examples/smpi/NAS/SP/tzetar.f
new file mode 100644
index 0000000000..554066d6fc
--- /dev/null
+++ b/examples/smpi/NAS/SP/tzetar.f
@@ -0,0 +1,60 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine  tzetar(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   block-diagonal matrix-vector multiplication                       
+c---------------------------------------------------------------------
+
+       include 'header.h'
+
+       integer i, j, k, c
+       double precision  t1, t2, t3, ac, xvel, yvel, zvel, r1, r2, r3, 
+     >                   r4, r5, btuz, acinv, ac2u, uzik1
+
+c---------------------------------------------------------------------
+c      treat only one cell                                             
+c---------------------------------------------------------------------
+       do    k = start(3,c), cell_size(3,c)-end(3,c)-1
+          do    j = start(2,c), cell_size(2,c)-end(2,c)-1
+             do    i = start(1,c), cell_size(1,c)-end(1,c)-1
+
+                xvel = us(i,j,k,c)
+                yvel = vs(i,j,k,c)
+                zvel = ws(i,j,k,c)
+                ac   = speed(i,j,k,c)
+                acinv = ainv(i,j,k,c)
+
+                ac2u = ac*ac
+
+                r1 = rhs(i,j,k,1,c)
+                r2 = rhs(i,j,k,2,c)
+                r3 = rhs(i,j,k,3,c)
+                r4 = rhs(i,j,k,4,c)
+                r5 = rhs(i,j,k,5,c)      
+
+                uzik1 = u(i,j,k,1,c)
+                btuz  = bt * uzik1
+
+                t1 = btuz*acinv * (r4 + r5)
+                t2 = r3 + t1
+                t3 = btuz * (r4 - r5)
+
+                rhs(i,j,k,1,c) = t2
+                rhs(i,j,k,2,c) = -uzik1*r2 + xvel*t2
+                rhs(i,j,k,3,c) =  uzik1*r1 + yvel*t2
+                rhs(i,j,k,4,c) =  zvel*t2  + t3
+                rhs(i,j,k,5,c) =  uzik1*(-xvel*r2 + yvel*r1) + 
+     >                    qs(i,j,k,c)*t2 + c2iv*ac2u*t1 + zvel*t3
+
+             end do
+          end do
+       end do
+
+       return
+       end
diff --git a/examples/smpi/NAS/SP/verify.f b/examples/smpi/NAS/SP/verify.f
new file mode 100644
index 0000000000..08be79c8a6
--- /dev/null
+++ b/examples/smpi/NAS/SP/verify.f
@@ -0,0 +1,358 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+        subroutine verify(no_time_steps, class, verified)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c  verification routine                         
+c---------------------------------------------------------------------
+
+        include 'header.h'
+        include 'mpinpb.h'
+
+        double precision xcrref(5),xceref(5),xcrdif(5),xcedif(5), 
+     >                   epsilon, xce(5), xcr(5), dtref
+        integer m, no_time_steps
+        character class
+        logical verified
+
+c---------------------------------------------------------------------
+c   tolerance level
+c---------------------------------------------------------------------
+        epsilon = 1.0d-08
+
+
+c---------------------------------------------------------------------
+c   compute the error norm and the residual norm, and exit if not printing
+c---------------------------------------------------------------------
+        call error_norm(xce)
+        call copy_faces
+
+        call rhs_norm(xcr)
+
+        do m = 1, 5
+           xcr(m) = xcr(m) / dt
+        enddo
+
+        if (node .ne. 0) return
+
+        class = 'U'
+        verified = .true.
+
+        do m = 1,5
+           xcrref(m) = 1.0
+           xceref(m) = 1.0
+        end do
+
+c---------------------------------------------------------------------
+c    reference data for 12X12X12 grids after 100 time steps, with DT = 1.50d-02
+c---------------------------------------------------------------------
+        if ( (grid_points(1)  .eq. 12     ) .and. 
+     >       (grid_points(2)  .eq. 12     ) .and.
+     >       (grid_points(3)  .eq. 12     ) .and.
+     >       (no_time_steps   .eq. 100    ))  then
+
+           class = 'S'
+           dtref = 1.5d-2
+
+c---------------------------------------------------------------------
+c    Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+           xcrref(1) = 2.7470315451339479d-02
+           xcrref(2) = 1.0360746705285417d-02
+           xcrref(3) = 1.6235745065095532d-02
+           xcrref(4) = 1.5840557224455615d-02
+           xcrref(5) = 3.4849040609362460d-02
+
+c---------------------------------------------------------------------
+c    Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+           xceref(1) = 2.7289258557377227d-05
+           xceref(2) = 1.0364446640837285d-05
+           xceref(3) = 1.6154798287166471d-05
+           xceref(4) = 1.5750704994480102d-05
+           xceref(5) = 3.4177666183390531d-05
+
+
+c---------------------------------------------------------------------
+c    reference data for 36X36X36 grids after 400 time steps, with DT = 1.5d-03
+c---------------------------------------------------------------------
+        elseif ( (grid_points(1) .eq. 36) .and. 
+     >           (grid_points(2) .eq. 36) .and.
+     >           (grid_points(3) .eq. 36) .and.
+     >           (no_time_steps . eq. 400) ) then
+
+           class = 'W'
+           dtref = 1.5d-3
+
+c---------------------------------------------------------------------
+c    Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+           xcrref(1) = 0.1893253733584d-02
+           xcrref(2) = 0.1717075447775d-03
+           xcrref(3) = 0.2778153350936d-03
+           xcrref(4) = 0.2887475409984d-03
+           xcrref(5) = 0.3143611161242d-02
+
+c---------------------------------------------------------------------
+c    Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+           xceref(1) = 0.7542088599534d-04
+           xceref(2) = 0.6512852253086d-05
+           xceref(3) = 0.1049092285688d-04
+           xceref(4) = 0.1128838671535d-04
+           xceref(5) = 0.1212845639773d-03
+
+c---------------------------------------------------------------------
+c    reference data for 64X64X64 grids after 400 time steps, with DT = 1.5d-03
+c---------------------------------------------------------------------
+        elseif ( (grid_points(1) .eq. 64) .and. 
+     >           (grid_points(2) .eq. 64) .and.
+     >           (grid_points(3) .eq. 64) .and.
+     >           (no_time_steps . eq. 400) ) then
+
+           class = 'A'
+           dtref = 1.5d-3
+
+c---------------------------------------------------------------------
+c    Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+           xcrref(1) = 2.4799822399300195d0
+           xcrref(2) = 1.1276337964368832d0
+           xcrref(3) = 1.5028977888770491d0
+           xcrref(4) = 1.4217816211695179d0
+           xcrref(5) = 2.1292113035138280d0
+
+c---------------------------------------------------------------------
+c    Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+           xceref(1) = 1.0900140297820550d-04
+           xceref(2) = 3.7343951769282091d-05
+           xceref(3) = 5.0092785406541633d-05
+           xceref(4) = 4.7671093939528255d-05
+           xceref(5) = 1.3621613399213001d-04
+
+c---------------------------------------------------------------------
+c    reference data for 102X102X102 grids after 400 time steps,
+c    with DT = 1.0d-03
+c---------------------------------------------------------------------
+        elseif ( (grid_points(1) .eq. 102) .and. 
+     >           (grid_points(2) .eq. 102) .and.
+     >           (grid_points(3) .eq. 102) .and.
+     >           (no_time_steps . eq. 400) ) then
+
+           class = 'B'
+           dtref = 1.0d-3
+
+c---------------------------------------------------------------------
+c    Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+           xcrref(1) = 0.6903293579998d+02
+           xcrref(2) = 0.3095134488084d+02
+           xcrref(3) = 0.4103336647017d+02
+           xcrref(4) = 0.3864769009604d+02
+           xcrref(5) = 0.5643482272596d+02
+
+c---------------------------------------------------------------------
+c    Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+           xceref(1) = 0.9810006190188d-02
+           xceref(2) = 0.1022827905670d-02
+           xceref(3) = 0.1720597911692d-02
+           xceref(4) = 0.1694479428231d-02
+           xceref(5) = 0.1847456263981d-01
+
+c---------------------------------------------------------------------
+c    reference data for 162X162X162 grids after 400 time steps,
+c    with DT = 0.67d-03
+c---------------------------------------------------------------------
+        elseif ( (grid_points(1) .eq. 162) .and. 
+     >           (grid_points(2) .eq. 162) .and.
+     >           (grid_points(3) .eq. 162) .and.
+     >           (no_time_steps . eq. 400) ) then
+
+           class = 'C'
+           dtref = 0.67d-3
+
+c---------------------------------------------------------------------
+c    Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+           xcrref(1) = 0.5881691581829d+03
+           xcrref(2) = 0.2454417603569d+03
+           xcrref(3) = 0.3293829191851d+03
+           xcrref(4) = 0.3081924971891d+03
+           xcrref(5) = 0.4597223799176d+03
+
+c---------------------------------------------------------------------
+c    Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+           xceref(1) = 0.2598120500183d+00
+           xceref(2) = 0.2590888922315d-01
+           xceref(3) = 0.5132886416320d-01
+           xceref(4) = 0.4806073419454d-01
+           xceref(5) = 0.5483377491301d+00
+
+c---------------------------------------------------------------------
+c    reference data for 408X408X408 grids after 500 time steps,
+c    with DT = 0.3d-03
+c---------------------------------------------------------------------
+        elseif ( (grid_points(1) .eq. 408) .and. 
+     >           (grid_points(2) .eq. 408) .and.
+     >           (grid_points(3) .eq. 408) .and.
+     >           (no_time_steps . eq. 500) ) then
+
+           class = 'D'
+           dtref = 0.30d-3
+
+c---------------------------------------------------------------------
+c    Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+           xcrref(1) = 0.1044696216887d+05
+           xcrref(2) = 0.3204427762578d+04
+           xcrref(3) = 0.4648680733032d+04
+           xcrref(4) = 0.4238923283697d+04
+           xcrref(5) = 0.7588412036136d+04
+
+c---------------------------------------------------------------------
+c    Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+           xceref(1) = 0.5089471423669d+01
+           xceref(2) = 0.5323514855894d+00
+           xceref(3) = 0.1187051008971d+01
+           xceref(4) = 0.1083734951938d+01
+           xceref(5) = 0.1164108338568d+02
+
+c---------------------------------------------------------------------
+c    reference data for 1020X1020X1020 grids after 500 time steps,
+c    with DT = 0.1d-03
+c---------------------------------------------------------------------
+        elseif ( (grid_points(1) .eq. 1020) .and. 
+     >           (grid_points(2) .eq. 1020) .and.
+     >           (grid_points(3) .eq. 1020) .and.
+     >           (no_time_steps . eq. 500) ) then
+
+           class = 'E'
+           dtref = 0.10d-3
+
+c---------------------------------------------------------------------
+c    Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+           xcrref(1) = 0.6255387422609d+05
+           xcrref(2) = 0.1495317020012d+05
+           xcrref(3) = 0.2347595750586d+05
+           xcrref(4) = 0.2091099783534d+05
+           xcrref(5) = 0.4770412841218d+05
+
+c---------------------------------------------------------------------
+c    Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+           xceref(1) = 0.6742735164909d+02
+           xceref(2) = 0.5390656036938d+01
+           xceref(3) = 0.1680647196477d+02
+           xceref(4) = 0.1536963126457d+02
+           xceref(5) = 0.1575330146156d+03
+
+        else
+           verified = .false.
+        endif
+
+c---------------------------------------------------------------------
+c    verification test for residuals if gridsize is one of 
+c    the defined grid sizes above (class .ne. 'U')
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c    Compute the difference of solution values and the known reference values.
+c---------------------------------------------------------------------
+        do m = 1, 5
+           
+           xcrdif(m) = dabs((xcr(m)-xcrref(m))/xcrref(m)) 
+           xcedif(m) = dabs((xce(m)-xceref(m))/xceref(m))
+           
+        enddo
+
+c---------------------------------------------------------------------
+c    Output the comparison of computed results to known cases.
+c---------------------------------------------------------------------
+
+        if (class .ne. 'U') then
+           write(*, 1990) class
+ 1990      format(' Verification being performed for class ', a)
+           write (*,2000) epsilon
+ 2000      format(' accuracy setting for epsilon = ', E20.13)
+           verified = (dabs(dt-dtref) .le. epsilon)
+           if (.not.verified) then  
+              class = 'U'
+              write (*,1000) dtref
+ 1000         format(' DT does not match the reference value of ', 
+     >                 E15.8)
+           endif
+        else 
+           write(*, 1995)
+ 1995      format(' Unknown class')
+        endif
+
+
+        if (class .ne. 'U') then
+           write (*,2001) 
+        else
+           write (*, 2005)
+        endif
+
+ 2001   format(' Comparison of RMS-norms of residual')
+ 2005   format(' RMS-norms of residual')
+        do m = 1, 5
+           if (class .eq. 'U') then
+              write(*, 2015) m, xcr(m)
+           else if (xcrdif(m) .le. epsilon) then
+              write (*,2011) m,xcr(m),xcrref(m),xcrdif(m)
+           else 
+              verified = .false.
+              write (*,2010) m,xcr(m),xcrref(m),xcrdif(m)
+           endif
+        enddo
+
+        if (class .ne. 'U') then
+           write (*,2002)
+        else
+           write (*,2006)
+        endif
+ 2002   format(' Comparison of RMS-norms of solution error')
+ 2006   format(' RMS-norms of solution error')
+        
+        do m = 1, 5
+           if (class .eq. 'U') then
+              write(*, 2015) m, xce(m)
+           else if (xcedif(m) .le. epsilon) then
+              write (*,2011) m,xce(m),xceref(m),xcedif(m)
+           else
+              verified = .false.
+              write (*,2010) m,xce(m),xceref(m),xcedif(m)
+           endif
+        enddo
+        
+ 2010   format(' FAILURE: ', i2, E20.13, E20.13, E20.13)
+ 2011   format('          ', i2, E20.13, E20.13, E20.13)
+ 2015   format('          ', i2, E20.13)
+        
+        if (class .eq. 'U') then
+           write(*, 2022)
+           write(*, 2023)
+ 2022      format(' No reference values provided')
+ 2023      format(' No verification performed')
+        else if (verified) then
+           write(*, 2020)
+ 2020      format(' Verification Successful')
+        else
+           write(*, 2021)
+ 2021      format(' Verification failed')
+        endif
+
+        return
+
+
+        end
diff --git a/examples/smpi/NAS/SP/x_solve.f b/examples/smpi/NAS/SP/x_solve.f
new file mode 100644
index 0000000000..cd40756ec3
--- /dev/null
+++ b/examples/smpi/NAS/SP/x_solve.f
@@ -0,0 +1,545 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine x_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c this function performs the solution of the approximate factorization
+c step in the x-direction for all five matrix components
+c simultaneously. The Thomas algorithm is employed to solve the
+c systems for the x-lines. Boundary conditions are non-periodic
+c---------------------------------------------------------------------
+
+       include 'header.h'
+       include 'mpinpb.h'
+
+
+       integer i, j, k, jp, kp, n, iend, jsize, ksize, i1, i2,
+     >         buffer_size, c, m, p, istart, stage, error,
+     >         requests(2), statuses(MPI_STATUS_SIZE, 2)
+       double precision  r1, r2, d, e, s(5), sm1, sm2,
+     >                   fac1, fac2
+
+
+
+c---------------------------------------------------------------------
+c      OK, now we know that there are multiple processors
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
+c on this node in the direction of increasing i for the forward sweep,
+c and after that reversing the direction for the backsubstitution.
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c                          FORWARD ELIMINATION  
+c---------------------------------------------------------------------
+       do    stage = 1, ncells
+          c         = slice(1,stage)
+
+          istart = 0
+          iend   = cell_size(1,c)-1
+
+          jsize     = cell_size(2,c)
+          ksize     = cell_size(3,c)
+          jp        = cell_coord(2,c)-1
+          kp        = cell_coord(3,c)-1
+
+          buffer_size = (jsize-start(2,c)-end(2,c)) * 
+     >                  (ksize-start(3,c)-end(3,c))
+
+          if ( stage .ne. 1) then
+
+c---------------------------------------------------------------------
+c            if this is not the first processor in this row of cells, 
+c            receive data from predecessor containing the right hand
+c            sides and the upper diagonal elements of the previous two rows
+c---------------------------------------------------------------------
+             call mpi_irecv(in_buffer, 22*buffer_size, 
+     >                      dp_type, predecessor(1), 
+     >                      DEFAULT_TAG,  comm_solve, 
+     >                      requests(1), error)
+
+
+c---------------------------------------------------------------------
+c            communication has already been started. 
+c            compute the left hand side while waiting for the msg
+c---------------------------------------------------------------------
+             call lhsx(c)
+
+c---------------------------------------------------------------------
+c            wait for pending communication to complete
+c            This waits on the current receive and on the send
+c            from the previous stage. They always come in pairs. 
+c---------------------------------------------------------------------
+
+             call mpi_waitall(2, requests, statuses, error)
+
+c---------------------------------------------------------------------
+c            unpack the buffer                                 
+c---------------------------------------------------------------------
+             i  = istart
+             i1 = istart + 1
+             n = 0
+
+c---------------------------------------------------------------------
+c            create a running pointer
+c---------------------------------------------------------------------
+             p = 0
+             do    k = start(3,c), ksize-end(3,c)-1
+                do    j = start(2,c), jsize-end(2,c)-1
+                   lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
+     >                       in_buffer(p+1) * lhs(i,j,k,n+1,c)
+                   lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
+     >                       in_buffer(p+2) * lhs(i,j,k,n+1,c)
+                   do    m = 1, 3
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+     >                       in_buffer(p+2+m) * lhs(i,j,k,n+1,c)
+                   end do
+                   d            = in_buffer(p+6)
+                   e            = in_buffer(p+7)
+                   do    m = 1, 3
+                      s(m) = in_buffer(p+7+m)
+                   end do
+                   r1 = lhs(i,j,k,n+2,c)
+                   lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
+                   lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
+                   do    m = 1, 3
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
+                   end do
+                   r2 = lhs(i1,j,k,n+1,c)
+                   lhs(i1,j,k,n+2,c) = lhs(i1,j,k,n+2,c) - d * r2
+                   lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - e * r2
+                   do    m = 1, 3
+                      rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - s(m) * r2
+                   end do
+                   p = p + 10
+                end do
+             end do
+
+             do    m = 4, 5
+                n = (m-3)*5
+                do    k = start(3,c), ksize-end(3,c)-1
+                   do    j = start(2,c), jsize-end(2,c)-1
+                      lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
+     >                          in_buffer(p+1) * lhs(i,j,k,n+1,c)
+                      lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
+     >                          in_buffer(p+2) * lhs(i,j,k,n+1,c)
+                      rhs(i,j,k,m,c)   = rhs(i,j,k,m,c) -
+     >                          in_buffer(p+3) * lhs(i,j,k,n+1,c)
+                      d                = in_buffer(p+4)
+                      e                = in_buffer(p+5)
+                      s(m)             = in_buffer(p+6)
+                      r1 = lhs(i,j,k,n+2,c)
+                      lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
+                      lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
+                      rhs(i,j,k,m,c)   = rhs(i,j,k,m,c) - s(m) * r1
+                      r2 = lhs(i1,j,k,n+1,c)
+                      lhs(i1,j,k,n+2,c) = lhs(i1,j,k,n+2,c) - d * r2
+                      lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - e * r2
+                      rhs(i1,j,k,m,c)   = rhs(i1,j,k,m,c) - s(m) * r2
+                      p = p + 6
+                   end do
+                end do
+             end do
+
+          else            
+
+c---------------------------------------------------------------------
+c            if this IS the first cell, we still compute the lhs
+c---------------------------------------------------------------------
+             call lhsx(c)
+          endif
+
+c---------------------------------------------------------------------
+c         perform the Thomas algorithm; first, FORWARD ELIMINATION     
+c---------------------------------------------------------------------
+          n = 0
+
+          do    k = start(3,c), ksize-end(3,c)-1
+             do    j = start(2,c), jsize-end(2,c)-1
+                do    i = istart, iend-2
+                   i1 = i  + 1
+                   i2 = i  + 2
+                   fac1               = 1.d0/lhs(i,j,k,n+3,c)
+                   lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
+                   lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
+                   do    m = 1, 3
+                      rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+                   end do
+                   lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
+     >                         lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
+                   lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
+     >                         lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
+                   do    m = 1, 3
+                      rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
+     >                         lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
+                   end do
+                   lhs(i2,j,k,n+2,c) = lhs(i2,j,k,n+2,c) -
+     >                         lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+4,c)
+                   lhs(i2,j,k,n+3,c) = lhs(i2,j,k,n+3,c) -
+     >                         lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+5,c)
+                   do    m = 1, 3
+                      rhs(i2,j,k,m,c) = rhs(i2,j,k,m,c) -
+     >                         lhs(i2,j,k,n+1,c)*rhs(i,j,k,m,c)
+                   end do
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c         The last two rows in this grid block are a bit different, 
+c         since they do not have two more rows available for the
+c         elimination of off-diagonal entries
+c---------------------------------------------------------------------
+
+          i  = iend - 1
+          i1 = iend
+          do    k = start(3,c), ksize-end(3,c)-1
+             do    j = start(2,c), jsize-end(2,c)-1
+                fac1               = 1.d0/lhs(i,j,k,n+3,c)
+                lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
+                lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
+                do    m = 1, 3
+                   rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+                end do
+                lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
+     >                      lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
+                lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
+     >                      lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
+                do    m = 1, 3
+                   rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
+     >                      lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
+                end do
+c---------------------------------------------------------------------
+c               scale the last row immediately (some of this is
+c               overkill in case this is the last cell)
+c---------------------------------------------------------------------
+                fac2               = 1.d0/lhs(i1,j,k,n+3,c)
+                lhs(i1,j,k,n+4,c) = fac2*lhs(i1,j,k,n+4,c)
+                lhs(i1,j,k,n+5,c) = fac2*lhs(i1,j,k,n+5,c)  
+                do    m = 1, 3
+                   rhs(i1,j,k,m,c) = fac2*rhs(i1,j,k,m,c)
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c         do the u+c and the u-c factors                 
+c---------------------------------------------------------------------
+
+          do    m = 4, 5
+             n = (m-3)*5
+             do    k = start(3,c), ksize-end(3,c)-1
+                do    j = start(2,c), jsize-end(2,c)-1
+                   do    i = istart, iend-2
+                   i1 = i  + 1
+                   i2 = i  + 2
+                   fac1               = 1.d0/lhs(i,j,k,n+3,c)
+                   lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
+                   lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
+                   rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+                   lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
+     >                         lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
+                   lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
+     >                         lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
+                   rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
+     >                         lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
+                   lhs(i2,j,k,n+2,c) = lhs(i2,j,k,n+2,c) -
+     >                         lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+4,c)
+                   lhs(i2,j,k,n+3,c) = lhs(i2,j,k,n+3,c) -
+     >                         lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+5,c)
+                   rhs(i2,j,k,m,c) = rhs(i2,j,k,m,c) -
+     >                         lhs(i2,j,k,n+1,c)*rhs(i,j,k,m,c)
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c            And again the last two rows separately
+c---------------------------------------------------------------------
+             i  = iend - 1
+             i1 = iend
+             do    k = start(3,c), ksize-end(3,c)-1
+                do    j = start(2,c), jsize-end(2,c)-1
+                fac1               = 1.d0/lhs(i,j,k,n+3,c)
+                lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
+                lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
+                rhs(i,j,k,m,c)     = fac1*rhs(i,j,k,m,c)
+                lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
+     >                      lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
+                lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
+     >                      lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
+                rhs(i1,j,k,m,c)   = rhs(i1,j,k,m,c) -
+     >                      lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
+c---------------------------------------------------------------------
+c               Scale the last row immediately (some of this is overkill
+c               if this is the last cell)
+c---------------------------------------------------------------------
+                fac2               = 1.d0/lhs(i1,j,k,n+3,c)
+                lhs(i1,j,k,n+4,c) = fac2*lhs(i1,j,k,n+4,c)
+                lhs(i1,j,k,n+5,c) = fac2*lhs(i1,j,k,n+5,c)
+                rhs(i1,j,k,m,c)   = fac2*rhs(i1,j,k,m,c)
+
+             end do
+          end do
+       end do
+
+c---------------------------------------------------------------------
+c         send information to the next processor, except when this
+c         is the last grid block
+c---------------------------------------------------------------------
+          if (stage .ne. ncells) then
+
+c---------------------------------------------------------------------
+c            create a running pointer for the send buffer  
+c---------------------------------------------------------------------
+             p = 0
+             n = 0
+             do    k = start(3,c), ksize-end(3,c)-1
+                do    j = start(2,c), jsize-end(2,c)-1
+                   do    i = iend-1, iend
+                      out_buffer(p+1) = lhs(i,j,k,n+4,c)
+                      out_buffer(p+2) = lhs(i,j,k,n+5,c)
+                      do    m = 1, 3
+                         out_buffer(p+2+m) = rhs(i,j,k,m,c)
+                      end do
+                      p = p+5
+                   end do
+                end do
+             end do
+
+             do    m = 4, 5
+                n = (m-3)*5
+                do    k = start(3,c), ksize-end(3,c)-1
+                   do    j = start(2,c), jsize-end(2,c)-1
+                      do    i = iend-1, iend
+                         out_buffer(p+1) = lhs(i,j,k,n+4,c)
+                         out_buffer(p+2) = lhs(i,j,k,n+5,c)
+                         out_buffer(p+3) = rhs(i,j,k,m,c)
+                         p = p + 3
+                      end do
+                   end do
+                end do
+             end do
+
+c---------------------------------------------------------------------
+c send data to next phase
+c can't receive data yet because buffer size will be wrong 
+c---------------------------------------------------------------------
+             call mpi_isend(out_buffer, 22*buffer_size, 
+     >                     dp_type, successor(1), 
+     >                     DEFAULT_TAG, comm_solve, 
+     >                     requests(2), error)
+
+          endif
+       end do
+
+c---------------------------------------------------------------------
+c      now go in the reverse direction                      
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c                         BACKSUBSTITUTION 
+c---------------------------------------------------------------------
+       do    stage = ncells, 1, -1
+          c = slice(1,stage)
+
+          istart = 0
+          iend   = cell_size(1,c)-1
+
+          jsize = cell_size(2,c)
+          ksize = cell_size(3,c)
+          jp    = cell_coord(2,c)-1
+          kp    = cell_coord(3,c)-1
+
+          buffer_size = (jsize-start(2,c)-end(2,c)) * 
+     >                  (ksize-start(3,c)-end(3,c))
+
+          if (stage .ne. ncells) then
+
+c---------------------------------------------------------------------
+c            if this is not the starting cell in this row of cells, 
+c            wait for a message to be received, containing the 
+c            solution of the previous two stations     
+c---------------------------------------------------------------------
+             call mpi_irecv(in_buffer, 10*buffer_size, 
+     >                      dp_type, successor(1), 
+     >                      DEFAULT_TAG, comm_solve, 
+     >                      requests(1), error)
+
+
+c---------------------------------------------------------------------
+c            communication has already been started
+c            while waiting, do the block-diagonal inversion for the 
+c            cell that was just finished                
+c---------------------------------------------------------------------
+
+             call ninvr(slice(1,stage+1))
+
+c---------------------------------------------------------------------
+c            wait for pending communication to complete
+c---------------------------------------------------------------------
+             call mpi_waitall(2, requests, statuses, error)
+
+c---------------------------------------------------------------------
+c            unpack the buffer for the first three factors         
+c---------------------------------------------------------------------
+             n = 0
+             p = 0
+             i  = iend
+             i1 = i - 1
+             do    m = 1, 3
+                do   k = start(3,c), ksize-end(3,c)-1
+                   do   j = start(2,c), jsize-end(2,c)-1
+                      sm1 = in_buffer(p+1)
+                      sm2 = in_buffer(p+2)
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
+     >                        lhs(i,j,k,n+4,c)*sm1 -
+     >                        lhs(i,j,k,n+5,c)*sm2
+                      rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
+     >                        lhs(i1,j,k,n+4,c) * rhs(i,j,k,m,c) - 
+     >                        lhs(i1,j,k,n+5,c) * sm1
+                      p = p + 2
+                   end do
+                end do
+             end do
+
+c---------------------------------------------------------------------
+c            now unpack the buffer for the remaining two factors
+c---------------------------------------------------------------------
+             do    m = 4, 5
+                n = (m-3)*5
+                do   k = start(3,c), ksize-end(3,c)-1
+                   do   j = start(2,c), jsize-end(2,c)-1
+                      sm1 = in_buffer(p+1)
+                      sm2 = in_buffer(p+2)
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
+     >                        lhs(i,j,k,n+4,c)*sm1 -
+     >                        lhs(i,j,k,n+5,c)*sm2
+                      rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
+     >                        lhs(i1,j,k,n+4,c) * rhs(i,j,k,m,c) - 
+     >                        lhs(i1,j,k,n+5,c) * sm1
+                      p = p + 2
+                   end do
+                end do
+             end do
+
+          else
+
+c---------------------------------------------------------------------
+c            now we know this is the first grid block on the back sweep,
+c            so we don't need a message to start the substitution. 
+c---------------------------------------------------------------------
+             i  = iend-1
+             i1 = iend
+             n = 0
+             do   m = 1, 3
+                do   k = start(3,c), ksize-end(3,c)-1
+                   do   j = start(2,c), jsize-end(2,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+     >                             lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c)
+                   end do
+                end do
+             end do
+
+             do    m = 4, 5
+                n = (m-3)*5
+                do   k = start(3,c), ksize-end(3,c)-1
+                   do   j = start(2,c), jsize-end(2,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+     >                             lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c)
+                   end do
+                end do
+             end do
+          endif
+
+c---------------------------------------------------------------------
+c         Whether or not this is the last processor, we always have
+c         to complete the back-substitution 
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c         The first three factors
+c---------------------------------------------------------------------
+          n = 0
+          do   m = 1, 3
+             do   k = start(3,c), ksize-end(3,c)-1
+                do   j = start(2,c), jsize-end(2,c)-1
+                   do    i = iend-2, istart, -1
+                      i1 = i  + 1
+                      i2 = i  + 2
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
+     >                          lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) -
+     >                          lhs(i,j,k,n+5,c)*rhs(i2,j,k,m,c)
+                   end do
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c         And the remaining two
+c---------------------------------------------------------------------
+          do    m = 4, 5
+             n = (m-3)*5
+             do   k = start(3,c), ksize-end(3,c)-1
+                do   j = start(2,c), jsize-end(2,c)-1
+                   do    i = iend-2, istart, -1
+                      i1 = i  + 1
+                      i2 = i  + 2
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
+     >                          lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) -
+     >                          lhs(i,j,k,n+5,c)*rhs(i2,j,k,m,c)
+                   end do
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c         send on information to the previous processor, if needed
+c---------------------------------------------------------------------
+          if (stage .ne.  1) then
+             i  = istart
+             i1 = istart+1
+             p = 0
+             do    m = 1, 5
+                do    k = start(3,c), ksize-end(3,c)-1
+                   do    j = start(2,c), jsize-end(2,c)-1
+                      out_buffer(p+1) = rhs(i,j,k,m,c)
+                       out_buffer(p+2) = rhs(i1,j,k,m,c)
+                      p = p + 2
+                   end do
+                end do
+             end do
+
+c---------------------------------------------------------------------
+c            pack and send the buffer
+c---------------------------------------------------------------------
+             call mpi_isend(out_buffer, 10*buffer_size, 
+     >                     dp_type, predecessor(1), 
+     >                     DEFAULT_TAG, comm_solve, 
+     >                     requests(2), error)
+
+          endif
+
+c---------------------------------------------------------------------
+c         If this was the last stage, do the block-diagonal inversion          
+c---------------------------------------------------------------------
+          if (stage .eq. 1) call ninvr(c)
+
+       end do
+
+       return
+       end
+    
+
+
+
+
+
+
diff --git a/examples/smpi/NAS/SP/y_solve.f b/examples/smpi/NAS/SP/y_solve.f
new file mode 100644
index 0000000000..fdcbb4d03f
--- /dev/null
+++ b/examples/smpi/NAS/SP/y_solve.f
@@ -0,0 +1,538 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine y_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c this function performs the solution of the approximate factorization
+c step in the y-direction for all five matrix components
+c simultaneously. The Thomas algorithm is employed to solve the
+c systems for the y-lines. Boundary conditions are non-periodic
+c---------------------------------------------------------------------
+
+       include 'header.h'
+       include 'mpinpb.h'
+
+       integer i, j, k, stage, ip, kp, n, isize, jend, ksize, j1, j2,
+     >         buffer_size, c, m, p, jstart, error,
+     >         requests(2), statuses(MPI_STATUS_SIZE, 2)
+       double precision  r1, r2, d, e, s(5), sm1, sm2,
+     >                   fac1, fac2
+
+
+c---------------------------------------------------------------------
+c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
+c on this node in the direction of increasing i for the forward sweep,
+c and after that reversing the direction for the backsubstitution  
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c                          FORWARD ELIMINATION  
+c---------------------------------------------------------------------
+       do    stage = 1, ncells
+          c      = slice(2,stage)
+
+          jstart = 0
+          jend   = cell_size(2,c)-1
+
+          isize     = cell_size(1,c)
+          ksize     = cell_size(3,c)
+          ip        = cell_coord(1,c)-1
+          kp        = cell_coord(3,c)-1
+
+          buffer_size = (isize-start(1,c)-end(1,c)) * 
+     >                  (ksize-start(3,c)-end(3,c))
+
+          if ( stage .ne. 1) then
+
+c---------------------------------------------------------------------
+c            if this is not the first processor in this row of cells, 
+c            receive data from predecessor containing the right hand
+c            sides and the upper diagonal elements of the previous two rows
+c---------------------------------------------------------------------
+
+             call mpi_irecv(in_buffer, 22*buffer_size, 
+     >                      dp_type, predecessor(2), 
+     >                      DEFAULT_TAG, comm_solve, 
+     >                      requests(1), error)
+
+c---------------------------------------------------------------------
+c            communication has already been started. 
+c            compute the left hand side while waiting for the msg
+c---------------------------------------------------------------------
+             call lhsy(c)
+
+c---------------------------------------------------------------------
+c            wait for pending communication to complete
+c            This waits on the current receive and on the send
+c            from the previous stage. They always come in pairs. 
+c---------------------------------------------------------------------
+             call mpi_waitall(2, requests, statuses, error)
+
+c---------------------------------------------------------------------
+c            unpack the buffer                                 
+c---------------------------------------------------------------------
+             j  = jstart
+             j1 = jstart + 1
+             n = 0
+c---------------------------------------------------------------------
+c            create a running pointer
+c---------------------------------------------------------------------
+             p = 0
+             do    k = start(3,c), ksize-end(3,c)-1
+                do    i = start(1,c), isize-end(1,c)-1
+                   lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
+     >                       in_buffer(p+1) * lhs(i,j,k,n+1,c)
+                   lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
+     >                       in_buffer(p+2) * lhs(i,j,k,n+1,c)
+                   do    m = 1, 3
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+     >                       in_buffer(p+2+m) * lhs(i,j,k,n+1,c)
+                   end do
+                   d            = in_buffer(p+6)
+                   e            = in_buffer(p+7)
+                   do    m = 1, 3
+                      s(m) = in_buffer(p+7+m)
+                   end do
+                   r1 = lhs(i,j,k,n+2,c)
+                   lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
+                   lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
+                   do    m = 1, 3
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
+                   end do
+                   r2 = lhs(i,j1,k,n+1,c)
+                   lhs(i,j1,k,n+2,c) = lhs(i,j1,k,n+2,c) - d * r2
+                   lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - e * r2
+                   do    m = 1, 3
+                      rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - s(m) * r2
+                   end do
+                   p = p + 10
+                end do
+             end do
+
+             do    m = 4, 5
+                n = (m-3)*5
+                do    k = start(3,c), ksize-end(3,c)-1
+                   do    i = start(1,c), isize-end(1,c)-1
+                      lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
+     >                          in_buffer(p+1) * lhs(i,j,k,n+1,c)
+                      lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
+     >                          in_buffer(p+2) * lhs(i,j,k,n+1,c)
+                      rhs(i,j,k,m,c)   = rhs(i,j,k,m,c) -
+     >                          in_buffer(p+3) * lhs(i,j,k,n+1,c)
+                      d                = in_buffer(p+4)
+                      e                = in_buffer(p+5)
+                      s(m)             = in_buffer(p+6)
+                      r1 = lhs(i,j,k,n+2,c)
+                      lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
+                      lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
+                      rhs(i,j,k,m,c)   = rhs(i,j,k,m,c) - s(m) * r1
+                      r2 = lhs(i,j1,k,n+1,c)
+                      lhs(i,j1,k,n+2,c) = lhs(i,j1,k,n+2,c) - d * r2
+                      lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - e * r2
+                      rhs(i,j1,k,m,c)   = rhs(i,j1,k,m,c) - s(m) * r2
+                      p = p + 6
+                   end do
+                end do
+             end do
+
+          else            
+
+c---------------------------------------------------------------------
+c            if this IS the first cell, we still compute the lhs
+c---------------------------------------------------------------------
+             call lhsy(c)
+          endif
+
+c---------------------------------------------------------------------
+c         perform the Thomas algorithm; first, FORWARD ELIMINATION     
+c---------------------------------------------------------------------
+          n = 0
+
+          do    k = start(3,c), ksize-end(3,c)-1
+             do    j = jstart, jend-2
+                do    i = start(1,c), isize-end(1,c)-1
+                   j1 = j  + 1
+                   j2 = j  + 2
+                   fac1               = 1.d0/lhs(i,j,k,n+3,c)
+                   lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
+                   lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
+                   do    m = 1, 3
+                      rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+                   end do
+                   lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
+     >                         lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
+                   lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
+     >                         lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
+                   do    m = 1, 3
+                      rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
+     >                         lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
+                   end do
+                   lhs(i,j2,k,n+2,c) = lhs(i,j2,k,n+2,c) -
+     >                         lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+4,c)
+                   lhs(i,j2,k,n+3,c) = lhs(i,j2,k,n+3,c) -
+     >                         lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+5,c)
+                   do    m = 1, 3
+                      rhs(i,j2,k,m,c) = rhs(i,j2,k,m,c) -
+     >                         lhs(i,j2,k,n+1,c)*rhs(i,j,k,m,c)
+                   end do
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c         The last two rows in this grid block are a bit different, 
+c         since they do not have two more rows available for the
+c         elimination of off-diagonal entries
+c---------------------------------------------------------------------
+
+          j  = jend - 1
+          j1 = jend
+          do    k = start(3,c), ksize-end(3,c)-1
+             do    i = start(1,c), isize-end(1,c)-1
+                fac1               = 1.d0/lhs(i,j,k,n+3,c)
+                lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
+                lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
+                do    m = 1, 3
+                   rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+                end do
+                lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
+     >                      lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
+                lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
+     >                      lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
+                do    m = 1, 3
+                   rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
+     >                      lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
+                end do
+c---------------------------------------------------------------------
+c               scale the last row immediately (some of this is
+c               overkill in case this is the last cell)
+c---------------------------------------------------------------------
+                fac2               = 1.d0/lhs(i,j1,k,n+3,c)
+                lhs(i,j1,k,n+4,c) = fac2*lhs(i,j1,k,n+4,c)
+                lhs(i,j1,k,n+5,c) = fac2*lhs(i,j1,k,n+5,c)  
+                do    m = 1, 3
+                   rhs(i,j1,k,m,c) = fac2*rhs(i,j1,k,m,c)
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c         do the u+c and the u-c factors                 
+c---------------------------------------------------------------------
+          do    m = 4, 5
+             n = (m-3)*5
+             do    k = start(3,c), ksize-end(3,c)-1
+                do    j = jstart, jend-2
+                   do    i = start(1,c), isize-end(1,c)-1
+                   j1 = j  + 1
+                   j2 = j  + 2
+                   fac1               = 1.d0/lhs(i,j,k,n+3,c)
+                   lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
+                   lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
+                   rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+                   lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
+     >                         lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
+                   lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
+     >                         lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
+                   rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
+     >                         lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
+                   lhs(i,j2,k,n+2,c) = lhs(i,j2,k,n+2,c) -
+     >                         lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+4,c)
+                   lhs(i,j2,k,n+3,c) = lhs(i,j2,k,n+3,c) -
+     >                         lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+5,c)
+                   rhs(i,j2,k,m,c) = rhs(i,j2,k,m,c) -
+     >                         lhs(i,j2,k,n+1,c)*rhs(i,j,k,m,c)
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c            And again the last two rows separately
+c---------------------------------------------------------------------
+             j  = jend - 1
+             j1 = jend
+             do    k = start(3,c), ksize-end(3,c)-1
+                do    i = start(1,c), isize-end(1,c)-1
+                fac1               = 1.d0/lhs(i,j,k,n+3,c)
+                lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
+                lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
+                rhs(i,j,k,m,c)     = fac1*rhs(i,j,k,m,c)
+                lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
+     >                      lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
+                lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
+     >                      lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
+                rhs(i,j1,k,m,c)   = rhs(i,j1,k,m,c) -
+     >                      lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
+c---------------------------------------------------------------------
+c               Scale the last row immediately (some of this is overkill
+c               if this is the last cell)
+c---------------------------------------------------------------------
+                fac2               = 1.d0/lhs(i,j1,k,n+3,c)
+                lhs(i,j1,k,n+4,c) = fac2*lhs(i,j1,k,n+4,c)
+                lhs(i,j1,k,n+5,c) = fac2*lhs(i,j1,k,n+5,c)
+                rhs(i,j1,k,m,c)   = fac2*rhs(i,j1,k,m,c)
+
+             end do
+          end do
+       end do
+
+c---------------------------------------------------------------------
+c         send information to the next processor, except when this
+c         is the last grid block;
+c---------------------------------------------------------------------
+
+          if (stage .ne. ncells) then
+
+c---------------------------------------------------------------------
+c            create a running pointer for the send buffer  
+c---------------------------------------------------------------------
+             p = 0
+             n = 0
+             do    k = start(3,c), ksize-end(3,c)-1
+                do    i = start(1,c), isize-end(1,c)-1
+                   do    j = jend-1, jend
+                      out_buffer(p+1) = lhs(i,j,k,n+4,c)
+                      out_buffer(p+2) = lhs(i,j,k,n+5,c)
+                      do    m = 1, 3
+                         out_buffer(p+2+m) = rhs(i,j,k,m,c)
+                      end do
+                      p = p+5
+                   end do
+                end do
+             end do
+
+             do    m = 4, 5
+                n = (m-3)*5
+                do    k = start(3,c), ksize-end(3,c)-1
+                   do    i = start(1,c), isize-end(1,c)-1
+                      do    j = jend-1, jend
+                         out_buffer(p+1) = lhs(i,j,k,n+4,c)
+                         out_buffer(p+2) = lhs(i,j,k,n+5,c)
+                         out_buffer(p+3) = rhs(i,j,k,m,c)
+                         p = p + 3
+                      end do
+                   end do
+                end do
+             end do
+
+c---------------------------------------------------------------------
+c            pack and send the buffer
+c---------------------------------------------------------------------
+             call mpi_isend(out_buffer, 22*buffer_size, 
+     >                     dp_type, successor(2), 
+     >                     DEFAULT_TAG, comm_solve, 
+     >                     requests(2), error)
+
+          endif
+       end do
+
+c---------------------------------------------------------------------
+c      now go in the reverse direction                      
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c                         BACKSUBSTITUTION 
+c---------------------------------------------------------------------
+       do    stage = ncells, 1, -1
+          c = slice(2,stage)
+
+          jstart = 0
+          jend   = cell_size(2,c)-1
+
+          isize = cell_size(1,c)
+          ksize = cell_size(3,c)
+          ip    = cell_coord(1,c)-1
+          kp    = cell_coord(3,c)-1
+
+          buffer_size = (isize-start(1,c)-end(1,c)) * 
+     >                  (ksize-start(3,c)-end(3,c))
+
+          if (stage .ne. ncells) then
+
+c---------------------------------------------------------------------
+c            if this is not the starting cell in this row of cells, 
+c            wait for a message to be received, containing the 
+c            solution of the previous two stations     
+c---------------------------------------------------------------------
+
+             call mpi_irecv(in_buffer, 10*buffer_size, 
+     >                      dp_type, successor(2), 
+     >                      DEFAULT_TAG, comm_solve, 
+     >                      requests(1), error)
+
+
+c---------------------------------------------------------------------
+c            communication has already been started
+c            while waiting, do the block-diagonal inversion for the 
+c            cell that was just finished                
+c---------------------------------------------------------------------
+
+             call pinvr(slice(2,stage+1))
+
+c---------------------------------------------------------------------
+c            wait for pending communication to complete
+c---------------------------------------------------------------------
+             call mpi_waitall(2, requests, statuses, error)
+
+c---------------------------------------------------------------------
+c            unpack the buffer for the first three factors         
+c---------------------------------------------------------------------
+             n = 0
+             p = 0
+             j  = jend
+             j1 = j - 1
+             do    m = 1, 3
+                do   k = start(3,c), ksize-end(3,c)-1
+                   do   i = start(1,c), isize-end(1,c)-1
+                      sm1 = in_buffer(p+1)
+                      sm2 = in_buffer(p+2)
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
+     >                        lhs(i,j,k,n+4,c)*sm1 -
+     >                        lhs(i,j,k,n+5,c)*sm2
+                      rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
+     >                        lhs(i,j1,k,n+4,c) * rhs(i,j,k,m,c) - 
+     >                        lhs(i,j1,k,n+5,c) * sm1
+                      p = p + 2
+                   end do
+                end do
+             end do
+
+c---------------------------------------------------------------------
+c            now unpack the buffer for the remaining two factors
+c---------------------------------------------------------------------
+             do    m = 4, 5
+                n = (m-3)*5
+                do   k = start(3,c), ksize-end(3,c)-1
+                   do   i = start(1,c), isize-end(1,c)-1
+                      sm1 = in_buffer(p+1)
+                      sm2 = in_buffer(p+2)
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
+     >                        lhs(i,j,k,n+4,c)*sm1 -
+     >                        lhs(i,j,k,n+5,c)*sm2
+                      rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
+     >                        lhs(i,j1,k,n+4,c) * rhs(i,j,k,m,c) - 
+     >                        lhs(i,j1,k,n+5,c) * sm1
+                      p = p + 2
+                   end do
+                end do
+             end do
+
+          else
+c---------------------------------------------------------------------
+c            now we know this is the first grid block on the back sweep,
+c            so we don't need a message to start the substitution. 
+c---------------------------------------------------------------------
+
+             j  = jend - 1
+             j1 = jend
+             n = 0
+             do   m = 1, 3
+                do   k = start(3,c), ksize-end(3,c)-1
+                   do   i = start(1,c), isize-end(1,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+     >                             lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c)
+                   end do
+                end do
+             end do
+
+             do    m = 4, 5
+                n = (m-3)*5
+                do   k = start(3,c), ksize-end(3,c)-1
+                   do   i = start(1,c), isize-end(1,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+     >                             lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c)
+                   end do
+                end do
+             end do
+          endif
+
+c---------------------------------------------------------------------
+c         Whether or not this is the last processor, we always have
+c         to complete the back-substitution 
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c         The first three factors
+c---------------------------------------------------------------------
+          n = 0
+          do   m = 1, 3
+             do   k = start(3,c), ksize-end(3,c)-1
+                do   j = jend-2, jstart, -1
+                   do    i = start(1,c), isize-end(1,c)-1
+                      j1 = j  + 1
+                      j2 = j  + 2
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
+     >                          lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) -
+     >                          lhs(i,j,k,n+5,c)*rhs(i,j2,k,m,c)
+                   end do
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c         And the remaining two
+c---------------------------------------------------------------------
+          do    m = 4, 5
+             n = (m-3)*5
+             do   k = start(3,c), ksize-end(3,c)-1
+                do   j = jend-2, jstart, -1
+                   do    i = start(1,c), isize-end(1,c)-1
+                      j1 = j  + 1
+                      j2 = j1 + 1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
+     >                          lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) -
+     >                          lhs(i,j,k,n+5,c)*rhs(i,j2,k,m,c)
+                   end do
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c         send on information to the previous processor, if needed
+c---------------------------------------------------------------------
+          if (stage .ne.  1) then
+             j  = jstart
+             j1 = jstart + 1
+             p = 0
+             do    m = 1, 5
+                do    k = start(3,c), ksize-end(3,c)-1
+                   do    i = start(1,c), isize-end(1,c)-1
+                      out_buffer(p+1) = rhs(i,j,k,m,c)
+                      out_buffer(p+2) = rhs(i,j1,k,m,c)
+                      p = p + 2
+                   end do
+                end do
+             end do
+
+c---------------------------------------------------------------------
+c            pack and send the buffer
+c---------------------------------------------------------------------
+
+             call mpi_isend(out_buffer, 10*buffer_size, 
+     >                     dp_type, predecessor(2), 
+     >                     DEFAULT_TAG, comm_solve, 
+     >                     requests(2), error)
+
+          endif
+
+c---------------------------------------------------------------------
+c         If this was the last stage, do the block-diagonal inversion          
+c---------------------------------------------------------------------
+          if (stage .eq. 1) call pinvr(c)
+
+       end do
+
+       return
+       end
+    
+
+
+
+
+
+
diff --git a/examples/smpi/NAS/SP/z_solve.f b/examples/smpi/NAS/SP/z_solve.f
new file mode 100644
index 0000000000..ad0dc7e727
--- /dev/null
+++ b/examples/smpi/NAS/SP/z_solve.f
@@ -0,0 +1,532 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+       subroutine z_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c this function performs the solution of the approximate factorization
+c step in the z-direction for all five matrix components
+c simultaneously. The Thomas algorithm is employed to solve the
+c systems for the z-lines. Boundary conditions are non-periodic
+c---------------------------------------------------------------------
+
+       include 'header.h'
+       include 'mpinpb.h'
+
+       integer i, j, k, stage, ip, jp, n, isize, jsize, kend, k1, k2,
+     >         buffer_size, c, m, p, kstart, error,
+     >         requests(2), statuses(MPI_STATUS_SIZE, 2)
+       double precision  r1, r2, d, e, s(5), sm1, sm2,
+     >                   fac1, fac2
+
+c---------------------------------------------------------------------
+c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
+c on this node in the direction of increasing i for the forward sweep,
+c and after that reversing the direction for the backsubstitution  
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c                          FORWARD ELIMINATION  
+c---------------------------------------------------------------------
+       do    stage = 1, ncells
+          c         = slice(3,stage)
+
+          kstart = 0
+          kend   = cell_size(3,c)-1
+
+          isize     = cell_size(1,c)
+          jsize     = cell_size(2,c)
+          ip        = cell_coord(1,c)-1
+          jp        = cell_coord(2,c)-1
+
+          buffer_size = (isize-start(1,c)-end(1,c)) * 
+     >                  (jsize-start(2,c)-end(2,c))
+
+          if (stage .ne. 1) then
+
+
+c---------------------------------------------------------------------
+c            if this is not the first processor in this row of cells, 
+c            receive data from predecessor containing the right hand
+c            sides and the upper diagonal elements of the previous two rows
+c---------------------------------------------------------------------
+
+             call mpi_irecv(in_buffer, 22*buffer_size, 
+     >                      dp_type, predecessor(3), 
+     >                      DEFAULT_TAG, comm_solve, 
+     >                      requests(1), error)
+
+
+c---------------------------------------------------------------------
+c            communication has already been started. 
+c            compute the left hand side while waiting for the msg
+c---------------------------------------------------------------------
+             call lhsz(c)
+
+c---------------------------------------------------------------------
+c            wait for pending communication to complete
+c---------------------------------------------------------------------
+             call mpi_waitall(2, requests, statuses, error)
+             
+c---------------------------------------------------------------------
+c            unpack the buffer                                 
+c---------------------------------------------------------------------
+             k  = kstart
+             k1 = kstart + 1
+             n = 0
+
+c---------------------------------------------------------------------
+c            create a running pointer
+c---------------------------------------------------------------------
+             p = 0
+             do    j = start(2,c), jsize-end(2,c)-1
+                do    i = start(1,c), isize-end(1,c)-1
+                   lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
+     >                       in_buffer(p+1) * lhs(i,j,k,n+1,c)
+                   lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
+     >                       in_buffer(p+2) * lhs(i,j,k,n+1,c)
+                   do    m = 1, 3
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+     >                       in_buffer(p+2+m) * lhs(i,j,k,n+1,c)
+                   end do
+                   d            = in_buffer(p+6)
+                   e            = in_buffer(p+7)
+                   do    m = 1, 3
+                      s(m) = in_buffer(p+7+m)
+                   end do
+                   r1 = lhs(i,j,k,n+2,c)
+                   lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
+                   lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
+                   do    m = 1, 3
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
+                   end do
+                   r2 = lhs(i,j,k1,n+1,c)
+                   lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2
+                   lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2
+                   do    m = 1, 3
+                      rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - s(m) * r2
+                   end do
+                   p = p + 10
+                end do
+             end do
+
+             do    m = 4, 5
+                n = (m-3)*5
+                do    j = start(2,c), jsize-end(2,c)-1
+                   do    i = start(1,c), isize-end(1,c)-1
+                      lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
+     >                          in_buffer(p+1) * lhs(i,j,k,n+1,c)
+                      lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
+     >                          in_buffer(p+2) * lhs(i,j,k,n+1,c)
+                      rhs(i,j,k,m,c)   = rhs(i,j,k,m,c) -
+     >                          in_buffer(p+3) * lhs(i,j,k,n+1,c)
+                      d                = in_buffer(p+4)
+                      e                = in_buffer(p+5)
+                      s(m)             = in_buffer(p+6)
+                      r1 = lhs(i,j,k,n+2,c)
+                      lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
+                      lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
+                      rhs(i,j,k,m,c)   = rhs(i,j,k,m,c) - s(m) * r1
+                      r2 = lhs(i,j,k1,n+1,c)
+                      lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2
+                      lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2
+                      rhs(i,j,k1,m,c)   = rhs(i,j,k1,m,c) - s(m) * r2
+                      p = p + 6
+                   end do
+                end do
+             end do
+
+          else            
+
+c---------------------------------------------------------------------
+c            if this IS the first cell, we still compute the lhs
+c---------------------------------------------------------------------
+             call lhsz(c)
+          endif
+
+c---------------------------------------------------------------------
+c         perform the Thomas algorithm; first, FORWARD ELIMINATION     
+c---------------------------------------------------------------------
+          n = 0
+
+          do    k = kstart, kend-2
+             do    j = start(2,c), jsize-end(2,c)-1
+                do    i = start(1,c), isize-end(1,c)-1
+                   k1 = k  + 1
+                   k2 = k  + 2
+                   fac1               = 1.d0/lhs(i,j,k,n+3,c)
+                   lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
+                   lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
+                   do    m = 1, 3
+                      rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+                   end do
+                   lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
+     >                         lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
+                   lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
+     >                         lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
+                   do    m = 1, 3
+                      rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
+     >                         lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
+                   end do
+                   lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) -
+     >                         lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c)
+                   lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) -
+     >                         lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c)
+                   do    m = 1, 3
+                      rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) -
+     >                         lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c)
+                   end do
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c         The last two rows in this grid block are a bit different, 
+c         since they do not have two more rows available for the
+c         elimination of off-diagonal entries
+c---------------------------------------------------------------------
+          k  = kend - 1
+          k1 = kend
+          do    j = start(2,c), jsize-end(2,c)-1
+             do    i = start(1,c), isize-end(1,c)-1
+                fac1               = 1.d0/lhs(i,j,k,n+3,c)
+                lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
+                lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
+                do    m = 1, 3
+                   rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+                end do
+                lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
+     >                      lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
+                lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
+     >                      lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
+                do    m = 1, 3
+                   rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
+     >                      lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
+                end do
+c---------------------------------------------------------------------
+c               scale the last row immediately (some of this is
+c               overkill in case this is the last cell)
+c---------------------------------------------------------------------
+                fac2               = 1.d0/lhs(i,j,k1,n+3,c)
+                lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c)
+                lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c)  
+                do    m = 1, 3
+                   rhs(i,j,k1,m,c) = fac2*rhs(i,j,k1,m,c)
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c         do the u+c and the u-c factors               
+c---------------------------------------------------------------------
+          do   m = 4, 5
+             n = (m-3)*5
+             do    k = kstart, kend-2
+                do    j = start(2,c), jsize-end(2,c)-1
+                   do    i = start(1,c), isize-end(1,c)-1
+                   k1 = k  + 1
+                   k2 = k  + 2
+                   fac1               = 1.d0/lhs(i,j,k,n+3,c)
+                   lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
+                   lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
+                   rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+                   lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
+     >                         lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
+                   lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
+     >                         lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
+                   rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
+     >                         lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
+                   lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) -
+     >                         lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c)
+                   lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) -
+     >                         lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c)
+                   rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) -
+     >                         lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c)
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c            And again the last two rows separately
+c---------------------------------------------------------------------
+             k  = kend - 1
+             k1 = kend
+             do    j = start(2,c), jsize-end(2,c)-1
+                do    i = start(1,c), isize-end(1,c)-1
+                fac1               = 1.d0/lhs(i,j,k,n+3,c)
+                lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
+                lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
+                rhs(i,j,k,m,c)     = fac1*rhs(i,j,k,m,c)
+                lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
+     >                      lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
+                lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
+     >                      lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
+                rhs(i,j,k1,m,c)   = rhs(i,j,k1,m,c) -
+     >                      lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
+c---------------------------------------------------------------------
+c               Scale the last row immediately (some of this is overkill
+c               if this is the last cell)
+c---------------------------------------------------------------------
+                fac2               = 1.d0/lhs(i,j,k1,n+3,c)
+                lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c)
+                lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c)
+                rhs(i,j,k1,m,c)   = fac2*rhs(i,j,k1,m,c)
+
+             end do
+          end do
+       end do
+
+c---------------------------------------------------------------------
+c         send information to the next processor, except when this
+c         is the last grid block,
+c---------------------------------------------------------------------
+
+          if (stage .ne. ncells) then
+
+c---------------------------------------------------------------------
+c            create a running pointer for the send buffer  
+c---------------------------------------------------------------------
+             p = 0
+             n = 0
+             do    j = start(2,c), jsize-end(2,c)-1
+                do    i = start(1,c), isize-end(1,c)-1
+                   do    k = kend-1, kend
+                      out_buffer(p+1) = lhs(i,j,k,n+4,c)
+                      out_buffer(p+2) = lhs(i,j,k,n+5,c)
+                      do    m = 1, 3
+                         out_buffer(p+2+m) = rhs(i,j,k,m,c)
+                      end do
+                      p = p+5
+                   end do
+                end do
+             end do
+
+             do    m = 4, 5
+                n = (m-3)*5
+                do    j = start(2,c), jsize-end(2,c)-1
+                   do    i = start(1,c), isize-end(1,c)-1
+                      do    k = kend-1, kend
+                         out_buffer(p+1) = lhs(i,j,k,n+4,c)
+                         out_buffer(p+2) = lhs(i,j,k,n+5,c)
+                         out_buffer(p+3) = rhs(i,j,k,m,c)
+                         p = p + 3
+                      end do
+                   end do
+                end do
+             end do
+
+
+             call mpi_isend(out_buffer, 22*buffer_size, 
+     >                     dp_type, successor(3), 
+     >                     DEFAULT_TAG, comm_solve, 
+     >                     requests(2), error)
+
+          endif
+       end do
+
+c---------------------------------------------------------------------
+c      now go in the reverse direction                      
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c                         BACKSUBSTITUTION 
+c---------------------------------------------------------------------
+       do    stage = ncells, 1, -1
+          c = slice(3,stage)
+
+          kstart = 0
+          kend   = cell_size(3,c)-1
+
+          isize     = cell_size(1,c)
+          jsize     = cell_size(2,c)
+          ip        = cell_coord(1,c)-1
+          jp        = cell_coord(2,c)-1
+
+          buffer_size = (isize-start(1,c)-end(1,c)) * 
+     >                  (jsize-start(2,c)-end(2,c))
+
+          if (stage .ne. ncells) then
+
+c---------------------------------------------------------------------
+c            if this is not the starting cell in this row of cells, 
+c            wait for a message to be received, containing the 
+c            solution of the previous two stations     
+c---------------------------------------------------------------------
+
+             call mpi_irecv(in_buffer, 10*buffer_size, 
+     >                      dp_type, successor(3), 
+     >                      DEFAULT_TAG, comm_solve, 
+     >                      requests(1), error)
+
+
+c---------------------------------------------------------------------
+c            communication has already been started
+c            while waiting, do the  block-diagonal inversion for the 
+c            cell that was just finished                
+c---------------------------------------------------------------------
+
+             call tzetar(slice(3,stage+1))
+
+c---------------------------------------------------------------------
+c            wait for pending communication to complete
+c---------------------------------------------------------------------
+             call mpi_waitall(2, requests, statuses, error)
+
+c---------------------------------------------------------------------
+c            unpack the buffer for the first three factors         
+c---------------------------------------------------------------------
+             n = 0
+             p = 0
+             k  = kend
+             k1 = k - 1
+             do    m = 1, 3
+                do   j = start(2,c), jsize-end(2,c)-1
+                   do   i = start(1,c), isize-end(1,c)-1
+                      sm1 = in_buffer(p+1)
+                      sm2 = in_buffer(p+2)
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
+     >                        lhs(i,j,k,n+4,c)*sm1 -
+     >                        lhs(i,j,k,n+5,c)*sm2
+                      rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
+     >                        lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) - 
+     >                        lhs(i,j,k1,n+5,c) * sm1
+                      p = p + 2
+                   end do
+                end do
+             end do
+
+c---------------------------------------------------------------------
+c            now unpack the buffer for the remaining two factors
+c---------------------------------------------------------------------
+             do    m = 4, 5
+                n = (m-3)*5
+                do   j = start(2,c), jsize-end(2,c)-1
+                   do   i = start(1,c), isize-end(1,c)-1
+                      sm1 = in_buffer(p+1)
+                      sm2 = in_buffer(p+2)
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
+     >                        lhs(i,j,k,n+4,c)*sm1 -
+     >                        lhs(i,j,k,n+5,c)*sm2
+                      rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
+     >                        lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) - 
+     >                        lhs(i,j,k1,n+5,c) * sm1
+                      p = p + 2
+                   end do
+                end do
+             end do
+
+          else
+
+c---------------------------------------------------------------------
+c            now we know this is the first grid block on the back sweep,
+c            so we don't need a message to start the substitution. 
+c---------------------------------------------------------------------
+
+             k  = kend - 1
+             k1 = kend
+             n = 0
+             do   m = 1, 3
+                do   j = start(2,c), jsize-end(2,c)-1
+                   do   i = start(1,c), isize-end(1,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+     >                             lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c)
+                   end do
+                end do
+             end do
+
+             do    m = 4, 5
+                n = (m-3)*5
+                do   j = start(2,c), jsize-end(2,c)-1
+                   do   i = start(1,c), isize-end(1,c)-1
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+     >                             lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c)
+                   end do
+                end do
+             end do
+          endif
+
+c---------------------------------------------------------------------
+c         Whether or not this is the last processor, we always have
+c         to complete the back-substitution 
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c         The first three factors
+c---------------------------------------------------------------------
+          n = 0
+          do   m = 1, 3
+             do   k = kend-2, kstart, -1
+                do   j = start(2,c), jsize-end(2,c)-1
+                   do    i = start(1,c), isize-end(1,c)-1
+                      k1 = k  + 1
+                      k2 = k  + 2
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
+     >                          lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) -
+     >                          lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c)
+                   end do
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c         And the remaining two
+c---------------------------------------------------------------------
+          do    m = 4, 5
+             n = (m-3)*5
+             do   k = kend-2, kstart, -1
+                do   j = start(2,c), jsize-end(2,c)-1
+                   do    i = start(1,c), isize-end(1,c)-1
+                      k1 = k  + 1
+                      k2 = k  + 2
+                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
+     >                          lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) -
+     >                          lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c)
+                   end do
+                end do
+             end do
+          end do
+
+c---------------------------------------------------------------------
+c         send on information to the previous processor, if needed
+c---------------------------------------------------------------------
+          if (stage .ne.  1) then
+             k  = kstart
+             k1 = kstart + 1
+             p = 0
+             do    m = 1, 5
+                do    j = start(2,c), jsize-end(2,c)-1
+                   do    i = start(1,c), isize-end(1,c)-1
+                      out_buffer(p+1) = rhs(i,j,k,m,c)
+                      out_buffer(p+2) = rhs(i,j,k1,m,c)
+                      p = p + 2
+                   end do
+                end do
+             end do
+
+             call mpi_isend(out_buffer, 10*buffer_size, 
+     >                     dp_type, predecessor(3), 
+     >                     DEFAULT_TAG, comm_solve, 
+     >                     requests(2), error)
+
+          endif
+
+c---------------------------------------------------------------------
+c         If this was the last stage, do the block-diagonal inversion
+c---------------------------------------------------------------------
+          if (stage .eq. 1) call tzetar(c)
+
+       end do
+
+       return
+       end
+    
+
+
+
+
+
+
diff --git a/examples/smpi/NAS/common/c_print_results.c b/examples/smpi/NAS/common/c_print_results.c
new file mode 100644
index 0000000000..d7417fba15
--- /dev/null
+++ b/examples/smpi/NAS/common/c_print_results.c
@@ -0,0 +1,94 @@
+/*****************************************************************/
+/******     C  _  P  R  I  N  T  _  R  E  S  U  L  T  S     ******/
+/*****************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+
+void c_print_results( char   *name,
+                      char   class,
+                      int    n1, 
+                      int    n2,
+                      int    n3,
+                      int    niter,
+                      int    nprocs_compiled,
+                      int    nprocs_total,
+                      double t,
+                      double mops,
+		      char   *optype,
+                      int    passed_verification,
+                      char   *npbversion,
+                      char   *compiletime,
+                      char   *mpicc,
+                      char   *clink,
+                      char   *cmpi_lib,
+                      char   *cmpi_inc,
+                      char   *cflags,
+                      char   *clinkflags )
+{
+    char *evalue="1000";
+
+    printf( "\n\n %s Benchmark Completed\n", name ); 
+
+    printf( " Class           =                        %c\n", class );
+
+    if( n3 == 0 ) {
+        long nn = n1;
+        if ( n2 != 0 ) nn *= n2;
+        printf( " Size            =             %12ld\n", nn );   /* as in IS */
+    }
+    else
+        printf( " Size            =              %3dx %3dx %3d\n", n1,n2,n3 );
+
+    printf( " Iterations      =             %12d\n", niter );
+ 
+    printf( " Time in seconds =             %12.2f\n", t );
+
+    printf( " Total processes =             %12d\n", nprocs_total );
+
+    if ( nprocs_compiled != 0 )
+        printf( " Compiled procs  =             %12d\n", nprocs_compiled );
+
+    printf( " Mop/s total     =             %12.2f\n", mops );
+
+    printf( " Mop/s/process   =             %12.2f\n", mops/((float) nprocs_total) );
+
+    printf( " Operation type  = %24s\n", optype);
+
+    if( passed_verification )
+        printf( " Verification    =               SUCCESSFUL\n" );
+    else
+        printf( " Verification    =             UNSUCCESSFUL\n" );
+
+    printf( " Version         =             %12s\n", npbversion );
+
+    printf( " Compile date    =             %12s\n", compiletime );
+
+    printf( "\n Compile options:\n" );
+
+    printf( "    MPICC        = %s\n", mpicc );
+
+    printf( "    CLINK        = %s\n", clink );
+
+    printf( "    CMPI_LIB     = %s\n", cmpi_lib );
+
+    printf( "    CMPI_INC     = %s\n", cmpi_inc );
+
+    printf( "    CFLAGS       = %s\n", cflags );
+
+    printf( "    CLINKFLAGS   = %s\n", clinkflags );
+#ifdef SMP
+    evalue = getenv("MP_SET_NUMTHREADS");
+    printf( "   MULTICPUS = %s\n", evalue );
+#endif
+
+    printf( "\n\n" );
+    printf( " Please send the results of this run to:\n\n" );
+    printf( " NPB Development Team\n" );
+    printf( " Internet: npb@nas.nasa.gov\n \n" );
+    printf( " If email is not available, send this to:\n\n" );
+    printf( " MS T27A-1\n" );
+    printf( " NASA Ames Research Center\n" );
+    printf( " Moffett Field, CA  94035-1000\n\n" );
+    printf( " Fax: 650-604-3957\n\n" );
+}
+ 
diff --git a/examples/smpi/NAS/common/c_timers.c b/examples/smpi/NAS/common/c_timers.c
new file mode 100644
index 0000000000..c8c81e7c7f
--- /dev/null
+++ b/examples/smpi/NAS/common/c_timers.c
@@ -0,0 +1,45 @@
+
+#include "mpi.h"
+
+double start[64], elapsed[64];
+
+/*****************************************************************/
+/******            T  I  M  E  R  _  C  L  E  A  R          ******/
+/*****************************************************************/
+void timer_clear( int n )
+{
+    elapsed[n] = 0.0;
+}
+
+
+/*****************************************************************/
+/******            T  I  M  E  R  _  S  T  A  R  T          ******/
+/*****************************************************************/
+void timer_start( int n )
+{
+    start[n] = MPI_Wtime();
+}
+
+
+/*****************************************************************/
+/******            T  I  M  E  R  _  S  T  O  P             ******/
+/*****************************************************************/
+void timer_stop( int n )
+{
+    double t, now;
+
+    now = MPI_Wtime();
+    t = now - start[n];
+    elapsed[n] += t;
+
+}
+
+
+/*****************************************************************/
+/******            T  I  M  E  R  _  R  E  A  D             ******/
+/*****************************************************************/
+double timer_read( int n )
+{
+    return( elapsed[n] );
+}
+
diff --git a/examples/smpi/NAS/common/print_results.f b/examples/smpi/NAS/common/print_results.f
new file mode 100644
index 0000000000..9feddac482
--- /dev/null
+++ b/examples/smpi/NAS/common/print_results.f
@@ -0,0 +1,115 @@
+
+      subroutine print_results(name, class, n1, n2, n3, niter, 
+     >               nprocs_compiled, nprocs_total,
+     >               t, mops, optype, verified, npbversion, 
+     >               compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7)
+      
+      implicit none
+      character*2 name
+      character*1 class
+      integer n1, n2, n3, niter, nprocs_compiled, nprocs_total, j
+      double precision t, mops
+      character optype*24, size*15
+      logical verified
+      character*(*) npbversion, compiletime, 
+     >              cs1, cs2, cs3, cs4, cs5, cs6, cs7
+
+         write (*, 2) name 
+ 2       format(//, ' ', A2, ' Benchmark Completed.')
+
+         write (*, 3) Class
+ 3       format(' Class           = ', 12x, a12)
+
+c   If this is not a grid-based problem (EP, FT, CG), then
+c   we only print n1, which contains some measure of the
+c   problem size. In that case, n2 and n3 are both zero.
+c   Otherwise, we print the grid size n1xn2xn3
+
+         if ((n2 .eq. 0) .and. (n3 .eq. 0)) then
+            if (name(1:2) .eq. 'EP') then
+               write(size, '(f15.0)' ) 2.d0**n1
+               j = 15
+               if (size(j:j) .eq. '.') j = j - 1
+               write (*,42) size(1:j)
+ 42            format(' Size            = ',9x, a15)
+            else
+               write (*,44) n1
+ 44            format(' Size            = ',12x, i12)
+            endif
+         else
+            write (*, 4) n1,n2,n3
+ 4          format(' Size            =  ',9x, i4,'x',i4,'x',i4)
+         endif
+
+         write (*, 5) niter
+ 5       format(' Iterations      = ', 12x, i12)
+         
+         write (*, 6) t
+ 6       format(' Time in seconds = ',12x, f12.2)
+         
+         write (*,7) nprocs_total
+ 7       format(' Total processes = ', 12x, i12)
+         
+         write (*,8) nprocs_compiled
+ 8       format(' Compiled procs  = ', 12x, i12)
+
+         write (*,9) mops
+ 9       format(' Mop/s total     = ',12x, f12.2)
+
+         write (*,10) mops/float( nprocs_total )
+ 10      format(' Mop/s/process   = ', 12x, f12.2)        
+         
+         write(*, 11) optype
+ 11      format(' Operation type  = ', a24)
+
+         if (verified) then 
+            write(*,12) '  SUCCESSFUL'
+         else
+            write(*,12) 'UNSUCCESSFUL'
+         endif
+ 12      format(' Verification    = ', 12x, a)
+
+         write(*,13) npbversion
+ 13      format(' Version         = ', 12x, a12)
+
+         write(*,14) compiletime
+ 14      format(' Compile date    = ', 12x, a12)
+
+
+         write (*,121) cs1
+ 121     format(/, ' Compile options:', /, 
+     >          '    MPIF77       = ', A)
+
+         write (*,122) cs2
+ 122     format('    FLINK        = ', A)
+
+         write (*,123) cs3
+ 123     format('    FMPI_LIB     = ', A)
+
+         write (*,124) cs4
+ 124     format('    FMPI_INC     = ', A)
+
+         write (*,125) cs5
+ 125     format('    FFLAGS       = ', A)
+
+         write (*,126) cs6
+ 126     format('    FLINKFLAGS   = ', A)
+
+         write(*, 127) cs7
+ 127     format('    RAND         = ', A)
+        
+         write (*,130)
+ 130     format(//' Please send the results of this run to:'//
+     >            ' NPB Development Team '/
+     >            ' Internet: npb@nas.nasa.gov'/
+     >            ' '/
+     >            ' If email is not available, send this to:'//
+     >            ' MS T27A-1'/
+     >            ' NASA Ames Research Center'/
+     >            ' Moffett Field, CA  94035-1000'//
+     >            ' Fax: 650-604-3957'//)
+
+
+         return
+         end
+
diff --git a/examples/smpi/NAS/common/randdp.c b/examples/smpi/NAS/common/randdp.c
new file mode 100644
index 0000000000..676624795d
--- /dev/null
+++ b/examples/smpi/NAS/common/randdp.c
@@ -0,0 +1,64 @@
+//---------------------------------------------------------------------
+//   This function is C verson of random number generator randdp.f 
+//---------------------------------------------------------------------
+
+double	randlc(X, A)
+double *X;
+double *A;
+{
+      static int        KS=0;
+      static double	R23, R46, T23, T46;
+      double		T1, T2, T3, T4;
+      double		A1;
+      double		A2;
+      double		X1;
+      double		X2;
+      double		Z;
+      int     		i, j;
+
+      if (KS == 0) 
+      {
+        R23 = 1.0;
+        R46 = 1.0;
+        T23 = 1.0;
+        T46 = 1.0;
+    
+        for (i=1; i<=23; i++)
+        {
+          R23 = 0.50 * R23;
+          T23 = 2.0 * T23;
+        }
+        for (i=1; i<=46; i++)
+        {
+          R46 = 0.50 * R46;
+          T46 = 2.0 * T46;
+        }
+        KS = 1;
+      }
+
+/*  Break A into two parts such that A = 2^23 * A1 + A2 and set X = N.  */
+
+      T1 = R23 * *A;
+      j  = T1;
+      A1 = j;
+      A2 = *A - T23 * A1;
+
+/*  Break X into two parts such that X = 2^23 * X1 + X2, compute
+    Z = A1 * X2 + A2 * X1  (mod 2^23), and then
+    X = 2^23 * Z + A2 * X2  (mod 2^46).                            */
+
+      T1 = R23 * *X;
+      j  = T1;
+      X1 = j;
+      X2 = *X - T23 * X1;
+      T1 = A1 * X2 + A2 * X1;
+      
+      j  = R23 * T1;
+      T2 = j;
+      Z = T1 - T23 * T2;
+      T3 = T23 * Z + A2 * X2;
+      j  = R46 * T3;
+      T4 = j;
+      *X = T3 - T46 * T4;
+      return(R46 * *X);
+} 
diff --git a/examples/smpi/NAS/common/randdp.f b/examples/smpi/NAS/common/randdp.f
new file mode 100644
index 0000000000..64860d96d0
--- /dev/null
+++ b/examples/smpi/NAS/common/randdp.f
@@ -0,0 +1,137 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      double precision function randlc (x, a)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c   This routine returns a uniform pseudorandom double precision number in the
+c   range (0, 1) by using the linear congruential generator
+c
+c   x_{k+1} = a x_k  (mod 2^46)
+c
+c   where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
+c   before repeating.  The argument A is the same as 'a' in the above formula,
+c   and X is the same as x_0.  A and X must be odd double precision integers
+c   in the range (1, 2^46).  The returned value RANDLC is normalized to be
+c   between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
+c   the new seed x_1, so that subsequent calls to RANDLC using the same
+c   arguments will generate a continuous sequence.
+c
+c   This routine should produce the same results on any computer with at least
+c   48 mantissa bits in double precision floating point data.  On 64 bit
+c   systems, double precision should be disabled.
+c
+c   David H. Bailey     October 26, 1990
+c
+c---------------------------------------------------------------------
+
+      implicit none
+
+      double precision r23,r46,t23,t46,a,x,t1,t2,t3,t4,a1,a2,x1,x2,z
+      parameter (r23 = 0.5d0 ** 23, r46 = r23 ** 2, t23 = 2.d0 ** 23,
+     >  t46 = t23 ** 2)
+
+c---------------------------------------------------------------------
+c   Break A into two parts such that A = 2^23 * A1 + A2.
+c---------------------------------------------------------------------
+      t1 = r23 * a
+      a1 = int (t1)
+      a2 = a - t23 * a1
+
+c---------------------------------------------------------------------
+c   Break X into two parts such that X = 2^23 * X1 + X2, compute
+c   Z = A1 * X2 + A2 * X1  (mod 2^23), and then
+c   X = 2^23 * Z + A2 * X2  (mod 2^46).
+c---------------------------------------------------------------------
+      t1 = r23 * x
+      x1 = int (t1)
+      x2 = x - t23 * x1
+      t1 = a1 * x2 + a2 * x1
+      t2 = int (r23 * t1)
+      z = t1 - t23 * t2
+      t3 = t23 * z + a2 * x2
+      t4 = int (r46 * t3)
+      x = t3 - t46 * t4
+      randlc = r46 * x
+
+      return
+      end
+
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine vranlc (n, x, a, y)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c   This routine generates N uniform pseudorandom double precision numbers in
+c   the range (0, 1) by using the linear congruential generator
+c
+c   x_{k+1} = a x_k  (mod 2^46)
+c
+c   where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
+c   before repeating.  The argument A is the same as 'a' in the above formula,
+c   and X is the same as x_0.  A and X must be odd double precision integers
+c   in the range (1, 2^46).  The N results are placed in Y and are normalized
+c   to be between 0 and 1.  X is updated to contain the new seed, so that
+c   subsequent calls to VRANLC using the same arguments will generate a
+c   continuous sequence.  If N is zero, only initialization is performed, and
+c   the variables X, A and Y are ignored.
+c
+c   This routine is the standard version designed for scalar or RISC systems.
+c   However, it should produce the same results on any single processor
+c   computer with at least 48 mantissa bits in double precision floating point
+c   data.  On 64 bit systems, double precision should be disabled.
+c
+c---------------------------------------------------------------------
+
+      implicit none
+
+      integer i,n
+      double precision y,r23,r46,t23,t46,a,x,t1,t2,t3,t4,a1,a2,x1,x2,z
+      dimension y(*)
+      parameter (r23 = 0.5d0 ** 23, r46 = r23 ** 2, t23 = 2.d0 ** 23,
+     >  t46 = t23 ** 2)
+
+
+c---------------------------------------------------------------------
+c   Break A into two parts such that A = 2^23 * A1 + A2.
+c---------------------------------------------------------------------
+      t1 = r23 * a
+      a1 = int (t1)
+      a2 = a - t23 * a1
+
+c---------------------------------------------------------------------
+c   Generate N results.   This loop is not vectorizable.
+c---------------------------------------------------------------------
+      do i = 1, n
+
+c---------------------------------------------------------------------
+c   Break X into two parts such that X = 2^23 * X1 + X2, compute
+c   Z = A1 * X2 + A2 * X1  (mod 2^23), and then
+c   X = 2^23 * Z + A2 * X2  (mod 2^46).
+c---------------------------------------------------------------------
+        t1 = r23 * x
+        x1 = int (t1)
+        x2 = x - t23 * x1
+        t1 = a1 * x2 + a2 * x1
+        t2 = int (r23 * t1)
+        z = t1 - t23 * t2
+        t3 = t23 * z + a2 * x2
+        t4 = int (r46 * t3)
+        x = t3 - t46 * t4
+        y(i) = r46 * x
+      enddo
+
+      return
+      end
diff --git a/examples/smpi/NAS/common/randdpvec.f b/examples/smpi/NAS/common/randdpvec.f
new file mode 100644
index 0000000000..c7080717ce
--- /dev/null
+++ b/examples/smpi/NAS/common/randdpvec.f
@@ -0,0 +1,186 @@
+c---------------------------------------------------------------------
+      double precision function randlc (x, a)
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c   This routine returns a uniform pseudorandom double precision number in the
+c   range (0, 1) by using the linear congruential generator
+c
+c   x_{k+1} = a x_k  (mod 2^46)
+c
+c   where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
+c   before repeating.  The argument A is the same as 'a' in the above formula,
+c   and X is the same as x_0.  A and X must be odd double precision integers
+c   in the range (1, 2^46).  The returned value RANDLC is normalized to be
+c   between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
+c   the new seed x_1, so that subsequent calls to RANDLC using the same
+c   arguments will generate a continuous sequence.
+c
+c   This routine should produce the same results on any computer with at least
+c   48 mantissa bits in double precision floating point data.  On 64 bit
+c   systems, double precision should be disabled.
+c
+c   David H. Bailey     October 26, 1990
+c
+c---------------------------------------------------------------------
+
+      implicit none
+
+      double precision r23,r46,t23,t46,a,x,t1,t2,t3,t4,a1,a2,x1,x2,z
+      parameter (r23 = 0.5d0 ** 23, r46 = r23 ** 2, t23 = 2.d0 ** 23,
+     >  t46 = t23 ** 2)
+
+c---------------------------------------------------------------------
+c   Break A into two parts such that A = 2^23 * A1 + A2.
+c---------------------------------------------------------------------
+      t1 = r23 * a
+      a1 = int (t1)
+      a2 = a - t23 * a1
+
+c---------------------------------------------------------------------
+c   Break X into two parts such that X = 2^23 * X1 + X2, compute
+c   Z = A1 * X2 + A2 * X1  (mod 2^23), and then
+c   X = 2^23 * Z + A2 * X2  (mod 2^46).
+c---------------------------------------------------------------------
+      t1 = r23 * x
+      x1 = int (t1)
+      x2 = x - t23 * x1
+
+
+      t1 = a1 * x2 + a2 * x1
+      t2 = int (r23 * t1)
+      z = t1 - t23 * t2
+      t3 = t23 * z + a2 * x2
+      t4 = int (r46 * t3)
+      x = t3 - t46 * t4
+      randlc = r46 * x
+      return
+      end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine vranlc (n, x, a, y)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c   This routine generates N uniform pseudorandom double precision numbers in
+c   the range (0, 1) by using the linear congruential generator
+c   
+c   x_{k+1} = a x_k  (mod 2^46)
+c   
+c   where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
+c   before repeating.  The argument A is the same as 'a' in the above formula,
+c   and X is the same as x_0.  A and X must be odd double precision integers
+c   in the range (1, 2^46).  The N results are placed in Y and are normalized
+c   to be between 0 and 1.  X is updated to contain the new seed, so that
+c   subsequent calls to RANDLC using the same arguments will generate a
+c   continuous sequence.
+c   
+c   This routine generates the output sequence in batches of length NV, for
+c   convenience on vector computers.  This routine should produce the same
+c   results on any computer with at least 48 mantissa bits in double precision
+c   floating point data.  On Cray systems, double precision should be disabled.
+c   
+c   David H. Bailey    August 30, 1990
+c---------------------------------------------------------------------
+
+      integer n
+      double precision x, a, y(*)
+      
+      double precision r23, r46, t23, t46
+      integer nv
+      parameter (r23 = 2.d0 ** (-23), r46 = r23 * r23, t23 = 2.d0 ** 23,
+     >     t46 = t23 * t23, nv = 64)
+      double precision  xv(nv), t1, t2, t3, t4, an, a1, a2, x1, x2, yy
+      integer n1, i, j
+      external randlc
+      double precision randlc
+
+c---------------------------------------------------------------------
+c     Compute the first NV elements of the sequence using RANDLC.
+c---------------------------------------------------------------------
+      t1 = x
+      n1 = min (n, nv)
+
+      do  i = 1, n1
+         xv(i) = t46 * randlc (t1, a)
+      enddo
+
+c---------------------------------------------------------------------
+c     It is not necessary to compute AN, A1 or A2 unless N is greater than NV.
+c---------------------------------------------------------------------
+      if (n .gt. nv) then
+
+c---------------------------------------------------------------------
+c     Compute AN = AA ^ NV (mod 2^46) using successive calls to RANDLC.
+c---------------------------------------------------------------------
+         t1 = a
+         t2 = r46 * a
+
+         do  i = 1, nv - 1
+            t2 = randlc (t1, a)
+         enddo
+
+         an = t46 * t2
+
+c---------------------------------------------------------------------
+c     Break AN into two parts such that AN = 2^23 * A1 + A2.
+c---------------------------------------------------------------------
+         t1 = r23 * an
+         a1 = aint (t1)
+         a2 = an - t23 * a1
+      endif
+
+c---------------------------------------------------------------------
+c     Compute N pseudorandom results in batches of size NV.
+c---------------------------------------------------------------------
+      do  j = 0, n - 1, nv
+         n1 = min (nv, n - j)
+
+c---------------------------------------------------------------------
+c     Compute up to NV results based on the current seed vector XV.
+c---------------------------------------------------------------------
+         do  i = 1, n1
+            y(i+j) = r46 * xv(i)
+         enddo
+
+c---------------------------------------------------------------------
+c     If this is the last pass through the 140 loop, it is not necessary to
+c     update the XV vector.
+c---------------------------------------------------------------------
+         if (j + n1 .eq. n) goto 150
+
+c---------------------------------------------------------------------
+c     Update the XV vector by multiplying each element by AN (mod 2^46).
+c---------------------------------------------------------------------
+         do  i = 1, nv
+            t1 = r23 * xv(i)
+            x1 = aint (t1)
+            x2 = xv(i) - t23 * x1
+            t1 = a1 * x2 + a2 * x1
+            t2 = aint (r23 * t1)
+            yy = t1 - t23 * t2
+            t3 = t23 * yy + a2 * x2
+            t4 = aint (r46 * t3)
+            xv(i) = t3 - t46 * t4
+         enddo
+
+      enddo
+
+c---------------------------------------------------------------------
+c     Save the last seed in X so that subsequent calls to VRANLC will generate
+c     a continuous sequence.
+c---------------------------------------------------------------------
+ 150  x = xv(n1)
+
+      return
+      end
+
+c----- end of program ------------------------------------------------
+
diff --git a/examples/smpi/NAS/common/randi8.f b/examples/smpi/NAS/common/randi8.f
new file mode 100644
index 0000000000..21ab8815db
--- /dev/null
+++ b/examples/smpi/NAS/common/randi8.f
@@ -0,0 +1,79 @@
+      double precision function randlc(x, a)
+
+c---------------------------------------------------------------------
+c
+c   This routine returns a uniform pseudorandom double precision number in the
+c   range (0, 1) by using the linear congruential generator
+c
+c   x_{k+1} = a x_k  (mod 2^46)
+c
+c   where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
+c   before repeating.  The argument A is the same as 'a' in the above formula,
+c   and X is the same as x_0.  A and X must be odd double precision integers
+c   in the range (1, 2^46).  The returned value RANDLC is normalized to be
+c   between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
+c   the new seed x_1, so that subsequent calls to RANDLC using the same
+c   arguments will generate a continuous sequence.
+
+      implicit none
+      double precision x, a
+      integer*8 i246m1, Lx, La
+      double precision d2m46
+
+      parameter(d2m46=0.5d0**46)
+
+      save i246m1
+      data i246m1/X'00003FFFFFFFFFFF'/
+
+      Lx = X
+      La = A
+
+      Lx   = iand(Lx*La,i246m1)
+      randlc = d2m46*dble(Lx)
+      x    = dble(Lx)
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+
+      SUBROUTINE VRANLC (N, X, A, Y)
+      implicit none
+      integer n, i
+      double precision x, a, y(*)
+      integer*8 i246m1, Lx, La
+      double precision d2m46
+
+c This doesn't work, because the compiler does the calculation in 32
+c bits and overflows. No standard way (without f90 stuff) to specify
+c that the rhs should be done in 64 bit arithmetic. 
+c      parameter(i246m1=2**46-1)
+
+      parameter(d2m46=0.5d0**46)
+
+      save i246m1
+      data i246m1/X'00003FFFFFFFFFFF'/
+
+c Note that the v6 compiler on an R8000 does something stupid with
+c the above. Using the following instead (or various other things)
+c makes the calculation run almost 10 times as fast. 
+c 
+c      save d2m46
+c      data d2m46/0.0d0/
+c      if (d2m46 .eq. 0.0d0) then
+c         d2m46 = 0.5d0**46
+c      endif
+
+      Lx = X
+      La = A
+      do i = 1, N
+         Lx   = iand(Lx*La,i246m1)
+         y(i) = d2m46*dble(Lx)
+      end do
+      x    = dble(Lx)
+
+      return
+      end
+
diff --git a/examples/smpi/NAS/common/randi8_safe.f b/examples/smpi/NAS/common/randi8_safe.f
new file mode 100644
index 0000000000..f725b6a1fb
--- /dev/null
+++ b/examples/smpi/NAS/common/randi8_safe.f
@@ -0,0 +1,64 @@
+      double precision function randlc(x, a)
+
+c---------------------------------------------------------------------
+c
+c   This routine returns a uniform pseudorandom double precision number in the
+c   range (0, 1) by using the linear congruential generator
+c
+c   x_{k+1} = a x_k  (mod 2^46)
+c
+c   where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
+c   before repeating.  The argument A is the same as 'a' in the above formula,
+c   and X is the same as x_0.  A and X must be odd double precision integers
+c   in the range (1, 2^46).  The returned value RANDLC is normalized to be
+c   between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
+c   the new seed x_1, so that subsequent calls to RANDLC using the same
+c   arguments will generate a continuous sequence.
+
+      implicit none
+      double precision x, a
+      integer*8 Lx, La, a1, a2, x1, x2, xa
+      double precision d2m46
+      parameter(d2m46=0.5d0**46)
+
+      Lx = x
+      La = A
+      a1 = ibits(La, 23, 23)
+      a2 = ibits(La, 0, 23)
+      x1 = ibits(Lx, 23, 23)
+      x2 = ibits(Lx, 0, 23)
+      xa = ishft(ibits(a1*x2+a2*x1, 0, 23), 23) + a2*x2
+      Lx   = ibits(xa,0, 46)
+      x    = dble(Lx)
+      randlc = d2m46*x
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+
+      SUBROUTINE VRANLC (N, X, A, Y)
+      implicit none
+      integer n, i
+      double precision x, a, y(*)
+      integer*8 Lx, La, a1, a2, x1, x2, xa
+      double precision d2m46
+      parameter(d2m46=0.5d0**46)
+
+      Lx = X
+      La = A
+      a1 = ibits(La, 23, 23)
+      a2 = ibits(La, 0, 23)
+      do i = 1, N
+         x1 = ibits(Lx, 23, 23)
+         x2 = ibits(Lx, 0, 23)
+         xa = ishft(ibits(a1*x2+a2*x1, 0, 23), 23) + a2*x2
+         Lx   = ibits(xa,0, 46)
+         y(i) = d2m46*dble(Lx)
+      end do
+      x = dble(Lx)
+      return
+      end
+
diff --git a/examples/smpi/NAS/common/timers.f b/examples/smpi/NAS/common/timers.f
new file mode 100644
index 0000000000..7a19ccf56f
--- /dev/null
+++ b/examples/smpi/NAS/common/timers.f
@@ -0,0 +1,78 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine timer_clear(n)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      integer n
+      
+      double precision start(64), elapsed(64)
+      common /tt/ start, elapsed
+
+      elapsed(n) = 0.0
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine timer_start(n)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      integer n
+      include 'mpif.h'
+      double precision start(64), elapsed(64)
+      common /tt/ start, elapsed
+
+      start(n) = MPI_Wtime()
+
+      return
+      end
+      
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine timer_stop(n)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      integer n
+      include 'mpif.h'
+      double precision start(64), elapsed(64)
+      common /tt/ start, elapsed
+      double precision t, now
+      now = MPI_Wtime()
+      t = now - start(n)
+      elapsed(n) = elapsed(n) + t
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      double precision function timer_read(n)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit none
+      integer n
+      double precision start(64), elapsed(64)
+      common /tt/ start, elapsed
+      
+      timer_read = elapsed(n)
+      return
+      end
+
diff --git a/examples/smpi/NAS/config/NAS.samples/README b/examples/smpi/NAS/config/NAS.samples/README
new file mode 100644
index 0000000000..ae535e95cf
--- /dev/null
+++ b/examples/smpi/NAS/config/NAS.samples/README
@@ -0,0 +1,7 @@
+This directory contains examples of make.def files that were used 
+by the NPB team in testing the benchmarks on different platforms. 
+They can be used as starting points for make.def files for your 
+own platform, but you may need to taylor them for best performance 
+on your installation. A clean template can be found in directory 
+`config'.
+Some examples of suite.def files are also provided.
\ No newline at end of file
diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.dec_alpha b/examples/smpi/NAS/config/NAS.samples/make.def.dec_alpha
new file mode 100644
index 0000000000..44f045354d
--- /dev/null
+++ b/examples/smpi/NAS/config/NAS.samples/make.def.dec_alpha
@@ -0,0 +1,18 @@
+#This is for a DEC Alpha 8400. The code will execute on a 
+#single processor
+#Warning: parallel make does not work properly in general
+MPIF77  = f77
+FLINK   = f77
+#Optimization -O5 breaks SP; works fine for all other codes
+FFLAGS  = -O4
+
+MPICC   = cc
+CLINK   = cc
+CFLAGS  = -O5 
+
+include ../config/make.dummy
+
+CC      = cc -g
+BINDIR  = ../bin
+
+RAND   = randi8
diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.irix6.2 b/examples/smpi/NAS/config/NAS.samples/make.def.irix6.2
new file mode 100644
index 0000000000..f76404755a
--- /dev/null
+++ b/examples/smpi/NAS/config/NAS.samples/make.def.irix6.2
@@ -0,0 +1,16 @@
+#This is for a generic single-processor SGI workstation
+MPIF77 = f77
+FLINK	= f77
+FFLAGS	= -O3
+
+MPICC = cc
+CLINK	= cc
+CFLAGS	= -O3 
+
+include ../config/make.dummy
+
+CC	= cc -g
+BINDIR	= ../bin
+
+RAND   = randi8
+
diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.origin b/examples/smpi/NAS/config/NAS.samples/make.def.origin
new file mode 100644
index 0000000000..11c63c975c
--- /dev/null
+++ b/examples/smpi/NAS/config/NAS.samples/make.def.origin
@@ -0,0 +1,20 @@
+# This is for a an SGI Origin 2000 or 3000 with vendor MPI. The Fortran
+# record length is specified, so it can be used for the I/O benchmark.
+# as well
+MPIF77   = f77 
+FMPI_LIB = -lmpi
+FLINK    = f77 -64
+FFLAGS   = -O3 -64
+
+MPICC    = cc
+CMPI_LIB = -lmpi
+CLINK    = cc
+CFLAGS   = -O3 
+
+CC       = cc -g
+BINDIR   = ../bin
+
+RAND   = randi8
+
+CONVERTFLAG = -DFORTRAN_REC_SIZE=4
+
diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.sgi_powerchallenge b/examples/smpi/NAS/config/NAS.samples/make.def.sgi_powerchallenge
new file mode 100644
index 0000000000..379726d057
--- /dev/null
+++ b/examples/smpi/NAS/config/NAS.samples/make.def.sgi_powerchallenge
@@ -0,0 +1,16 @@
+# This is for the SGI PowerChallenge Array at NASA Ames. mrf77 and 
+# mrcc are local scripts that invoke the proper MPI library.
+MPIF77 = mrf77
+FLINK  = mrf77
+FFLAGS = -O3 -OPT:fold_arith_limit=1204
+
+MPICC  = mrcc
+CLINK  = mrcc
+CFLAGS = -O3 -OPT:fold_arith_limit=1204
+
+CC     = cc -g
+BINDIR = ../bin
+
+RAND   = randi8
+
+
diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.sp2_babbage b/examples/smpi/NAS/config/NAS.samples/make.def.sp2_babbage
new file mode 100644
index 0000000000..7896d56f7d
--- /dev/null
+++ b/examples/smpi/NAS/config/NAS.samples/make.def.sp2_babbage
@@ -0,0 +1,17 @@
+#This is for the IBM SP2 at Ames; mrf77 and mrcc are local scripts
+MPIF77     = mrf77
+FLINK      = mrf77
+FFLAGS     = -O3 
+FLINKFLAGS = -bmaxdata:0x60000000
+
+MPICC      = mrcc
+CLINK      = mrcc
+CFLAGS     = -O3 
+CLINKFLAGS = -bmaxdata:0x60000000
+
+CC         = cc -g
+
+BINDIR     = ../bin
+
+RAND       = randi8
+
diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.sun_ultra_sparc b/examples/smpi/NAS/config/NAS.samples/make.def.sun_ultra_sparc
new file mode 100644
index 0000000000..420dfde821
--- /dev/null
+++ b/examples/smpi/NAS/config/NAS.samples/make.def.sun_ultra_sparc
@@ -0,0 +1,30 @@
+#    This is for a Sun SparcCenter or UltraEnterprise machine 
+MPIF77     = f77
+FLINK      = f77
+FMPI_LIB   = -L<your mpich installation tree>/lib/solaris/ch_lfshmem -lmpi
+FMPI_INC   = -I<your mpich installation tree>/include
+#    sparc10,20 SparcCenter{1,2}000 (uname -m returns sun4m)
+#    and f77 -V returns 4.0 or greater
+# FFLAGS   = -fast -xtarget=super -xO4 -depend
+#    Ultra1,2, UltraEnterprise servers (uname -m returns sun4u)
+FFLAGS     = -fast -xtarget=ultra -xarch=v8plus -xO4 -depend
+FLINKFLAGS = -lmopt -lcopt -lsunmath
+
+MPICC      = cc
+CLINK      = cc
+CMPI_LIB   = -L<your mpich installation tree>/lib/solaris/ch_lfshmem -lmpi
+CMPI_INC   = -I<your mpich installation tree>/include
+#    sparc10,20 SparcCenter{1,2}000 (uname -m returns sun4m)
+#    and cc -V returns 4.0 or greater
+#CFLAGS	   =  -fast -xtarget=super -xO4 -xdepend
+#    Ultra1,2, UltraEnterprise servers (uname -m returns sun4u)
+CFLAGS     =  -fast -xtarget=ultra -xarch=v8plus -xO4 -xdepend
+CLINKFLAGS = -fast
+
+CC         = cc -g
+
+BINDIR     = ../bin
+
+#    Cannot use randi8 or randi8-safe on a 32-but machine. Use double precision
+RAND       = randdp
+
diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.t3d_cosmos b/examples/smpi/NAS/config/NAS.samples/make.def.t3d_cosmos
new file mode 100644
index 0000000000..d3b3bbf528
--- /dev/null
+++ b/examples/smpi/NAS/config/NAS.samples/make.def.t3d_cosmos
@@ -0,0 +1,25 @@
+#This is for the Cray T3D at the Jet Propulsion Laboratory
+MPIF77     = cf77
+FLINK      = cf77
+FMPI_LIB   = -L/usr/local/mpp/lib -lmpi
+FMPI_INC   = -I/usr/local/mpp/lib/include/mpp
+FFLAGS     = -dp -Wf-onoieeedivide -C cray-t3d 
+#The following flags provide more effective optimization, but may
+#cause the random number generator randi8(_safe) to break in EP
+#FFLAGS    = -dp -Wf-oaggress -Wf-onoieeedivide -C cray-t3d 
+FLINKFLAGS = -Wl-Drdahead=on -C cray-t3d
+
+MPICC      = cc
+CLINK	   = cc
+CMPI_LIB   = -L/usr/local/mpp/lib -lmpi
+CMPI_INC   = -I/usr/local/mpp/lib/include/mpp
+CFLAGS	   = -O3 -Tcray-t3d
+CLINKFLAGS = -Tcray-t3d
+
+CC	   = cc -g -Tcray-ymp
+BINDIR	   = ../bin
+
+CONVERTFLAG= -DCONVERTDOUBLE
+
+RAND       = randi8
+
diff --git a/examples/smpi/NAS/config/NAS.samples/make.def_sun_mpich b/examples/smpi/NAS/config/NAS.samples/make.def_sun_mpich
new file mode 100644
index 0000000000..99b0b69041
--- /dev/null
+++ b/examples/smpi/NAS/config/NAS.samples/make.def_sun_mpich
@@ -0,0 +1,165 @@
+#---------------------------------------------------------------------------
+#
+#                SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS. 
+#
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# Items in this file will need to be changed for each platform.
+# (Note these definitions are inconsistent with NPB2.1.)
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# Parallel Fortran:
+#
+# For CG, EP, FT, MG, LU, SP and BT, which are in Fortran, the following must 
+# be defined:
+#
+# MPIF77     - Fortran compiler
+# FFLAGS     - Fortran compilation arguments
+# FMPI_INC   - any -I arguments required for compiling MPI/Fortran 
+# FLINK      - Fortran linker
+# FLINKFLAGS - Fortran linker arguments
+# FMPI_LIB   - any -L and -l arguments required for linking MPI/Fortran 
+# 
+# compilations are done with $(MPIF77) $(FMPI_INC) $(FFLAGS) or
+#                            $(MPIF77) $(FFLAGS)
+# linking is done with       $(FLINK) $(FMPI_LIB) $(FLINKFLAGS)
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# This is the fortran compiler used for MPI programs
+#---------------------------------------------------------------------------
+MPIF77 = mpif77
+# This links MPI fortran programs; usually the same as ${MPIF77}
+FLINK	= $(MPIF77)
+
+#---------------------------------------------------------------------------
+# These macros are passed to the linker to help link with MPI correctly
+#---------------------------------------------------------------------------
+FMPI_LIB  =
+
+#---------------------------------------------------------------------------
+# These macros are passed to the compiler to help find 'mpif.h'
+#---------------------------------------------------------------------------
+FMPI_INC =
+
+#---------------------------------------------------------------------------
+# Global *compile time* flags for Fortran programs
+#---------------------------------------------------------------------------
+FFLAGS	= -fast
+# FFLAGS = -g
+
+#---------------------------------------------------------------------------
+# Global *link time* flags. Flags for increasing maximum executable 
+# size usually go here. 
+#---------------------------------------------------------------------------
+FLINKFLAGS = -fast
+
+
+#---------------------------------------------------------------------------
+# Parallel C:
+#
+# For IS, which is in C, the following must be defined:
+#
+# MPICC      - C compiler 
+# CFLAGS     - C compilation arguments
+# CMPI_INC   - any -I arguments required for compiling MPI/C 
+# CLINK      - C linker
+# CLINKFLAGS - C linker flags
+# CMPI_LIB   - any -L and -l arguments required for linking MPI/C 
+#
+# compilations are done with $(MPICC) $(CMPI_INC) $(CFLAGS) or
+#                            $(MPICC) $(CFLAGS)
+# linking is done with       $(CLINK) $(CMPI_LIB) $(CLINKFLAGS)
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# This is the C compiler used for MPI programs
+#---------------------------------------------------------------------------
+MPICC = mpicc
+# This links MPI C programs; usually the same as ${MPICC}
+CLINK	= $(MPICC)
+
+#---------------------------------------------------------------------------
+# These macros are passed to the linker to help link with MPI correctly
+#---------------------------------------------------------------------------
+CMPI_LIB  =
+
+#---------------------------------------------------------------------------
+# These macros are passed to the compiler to help find 'mpi.h'
+#---------------------------------------------------------------------------
+CMPI_INC =
+
+#---------------------------------------------------------------------------
+# Global *compile time* flags for C programs
+#---------------------------------------------------------------------------
+CFLAGS	= -fast
+# CFLAGS = -g
+
+#---------------------------------------------------------------------------
+# Global *link time* flags. Flags for increasing maximum executable 
+# size usually go here. 
+#---------------------------------------------------------------------------
+CLINKFLAGS = -fast
+
+
+#---------------------------------------------------------------------------
+# MPI dummy library:
+#
+# Uncomment if you want to use the MPI dummy library supplied by NAS instead 
+# of the true message-passing library. The include file redefines several of
+# the above macros. It also invokes make in subdirectory MPI_dummy. Make 
+# sure that no spaces or tabs precede include.
+#---------------------------------------------------------------------------
+# include ../config/make.dummy
+
+
+#---------------------------------------------------------------------------
+# Utilities C:
+#
+# This is the C compiler used to compile C utilities.  Flags required by 
+# this compiler go here also; typically there are few flags required; hence 
+# there are no separate macros provided for such flags.
+#---------------------------------------------------------------------------
+CC	= cc -g
+
+
+#---------------------------------------------------------------------------
+# Destination of executables, relative to subdirs of the main directory. . 
+#---------------------------------------------------------------------------
+BINDIR	= ../bin
+
+
+#---------------------------------------------------------------------------
+# Some machines (e.g. Crays) have 128-bit DOUBLE PRECISION numbers, which
+# is twice the precision required for the NPB suite. A compiler flag 
+# (e.g. -dp) can usually be used to change DOUBLE PRECISION variables to
+# 64 bits, but the MPI library may continue to send 128 bits. Short of
+# recompiling MPI, the solution is to use MPI_REAL to send these 64-bit
+# numbers, and MPI_COMPLEX to send their complex counterparts. Uncomment
+# the following line to enable this substitution. 
+# 
+# NOTE: IF THE I/O BENCHMARK IS BEING BUILT, WE USE CONVERTFLAG TO
+#       SPECIFIY THE FORTRAN RECORD LENGTH UNIT. IT IS A SYSTEM-SPECIFIC
+#       VALUE (USUALLY 1 OR 4). UNCOMMENT THE SECOND LINE AND SUBSTITUTE
+#       THE CORRECT VALUE FOR "length".
+#       IF BOTH 128-BIT DOUBLE PRECISION NUMBERS AND I/O ARE TO BE ENABLED,
+#       UNCOMMENT THE THIRD LINE AND SUBSTITUTE THE CORRECT VALUE FOR
+#       "length"
+#---------------------------------------------------------------------------
+# CONVERTFLAG	= -DCONVERTDOUBLE
+CONVERTFLAG	= -DFORTRAN_REC_SIZE=1
+# CONVERTFLAG	= -DCONVERTDOUBLE -DFORTRAN_REC_SIZE=length
+
+
+#---------------------------------------------------------------------------
+# The variable RAND controls which random number generator 
+# is used. It is described in detail in Doc/README.install. 
+# Use "randi8" unless there is a reason to use another one. 
+# Other allowed values are "randi8_safe", "randdp" and "randdpvec"
+#---------------------------------------------------------------------------
+RAND   = randi8
+# The following is highly reliable but may be slow:
+# RAND   = randdp
+
diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.bt b/examples/smpi/NAS/config/NAS.samples/suite.def.bt
new file mode 100644
index 0000000000..f330636aa0
--- /dev/null
+++ b/examples/smpi/NAS/config/NAS.samples/suite.def.bt
@@ -0,0 +1,37 @@
+bt	S	1
+bt	S	4
+bt	S	9
+bt	S	16
+bt	A	1
+bt	A	4
+bt	A	9
+bt	A	16
+bt	A	25
+bt	A	36
+bt	A	49
+bt	A	64
+bt	A	81
+bt	A	100
+bt	A	121
+bt	B	1
+bt	B	4
+bt	B	9
+bt	B	16
+bt	B	25
+bt	B	36
+bt	B	49
+bt	B	64
+bt	B	81
+bt	B	100
+bt	B	121
+bt	C	1
+bt	C	4
+bt	C	9
+bt	C	16
+bt	C	25
+bt	C	36
+bt	C	49
+bt	C	64
+bt	C	81
+bt	C	100
+bt	C	121
diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.cg b/examples/smpi/NAS/config/NAS.samples/suite.def.cg
new file mode 100644
index 0000000000..393bc508f6
--- /dev/null
+++ b/examples/smpi/NAS/config/NAS.samples/suite.def.cg
@@ -0,0 +1,29 @@
+cg	S	1
+cg	S	2
+cg	S	4
+cg	S	8
+cg	S	16
+cg	A	1
+cg	A	2
+cg	A	4
+cg	A	8
+cg	A	16
+cg	A	32
+cg	A	64
+cg	A	128
+cg	B	1
+cg	B	2
+cg	B	4
+cg	B	8
+cg	B	16
+cg	B	32
+cg	B	64
+cg	B	128
+cg	C	1
+cg	C	2
+cg	C	4
+cg	C	8
+cg	C	16
+cg	C	32
+cg	C	64
+cg	C	128
diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.ep b/examples/smpi/NAS/config/NAS.samples/suite.def.ep
new file mode 100644
index 0000000000..e2ca3cd007
--- /dev/null
+++ b/examples/smpi/NAS/config/NAS.samples/suite.def.ep
@@ -0,0 +1,29 @@
+ep	S	1
+ep	S	2
+ep	S	4
+ep	S	8
+ep	S	16
+ep	A	1
+ep	A	2
+ep	A	4
+ep	A	8
+ep	A	16
+ep	A	32
+ep	A	64
+ep	A	128
+ep	B	1
+ep	B	2
+ep	B	4
+ep	B	8
+ep	B	16
+ep	B	32
+ep	B	64
+ep	B	128
+ep	C	1
+ep	C	2
+ep	C	4
+ep	C	8
+ep	C	16
+ep	C	32
+ep	C	64
+ep	C	128
diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.ft b/examples/smpi/NAS/config/NAS.samples/suite.def.ft
new file mode 100644
index 0000000000..6f05189094
--- /dev/null
+++ b/examples/smpi/NAS/config/NAS.samples/suite.def.ft
@@ -0,0 +1,29 @@
+ft	S	1
+ft	S	2
+ft	S	4
+ft	S	8
+ft	S	16
+ft	A	1
+ft	A	2
+ft	A	4
+ft	A	8
+ft	A	16
+ft	A	32
+ft	A	64
+ft	A	128
+ft	B	1
+ft	B	2
+ft	B	4
+ft	B	8
+ft	B	16
+ft	B	32
+ft	B	64
+ft	B	128
+ft	C	1
+ft	C	2
+ft	C	4
+ft	C	8
+ft	C	16
+ft	C	32
+ft	C	64
+ft	C	128
diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.is b/examples/smpi/NAS/config/NAS.samples/suite.def.is
new file mode 100644
index 0000000000..97e898d132
--- /dev/null
+++ b/examples/smpi/NAS/config/NAS.samples/suite.def.is
@@ -0,0 +1,29 @@
+is	S	1
+is	S	2
+is	S	4
+is	S	8
+is	S	16
+is	A	1
+is	A	2
+is	A	4
+is	A	8
+is	A	16
+is	A	32
+is	A	64
+is	A	128
+is	B	1
+is	B	2
+is	B	4
+is	B	8
+is	B	16
+is	B	32
+is	B	64
+is	B	128
+is	C	1
+is	C	2
+is	C	4
+is	C	8
+is	C	16
+is	C	32
+is	C	64
+is	C	128
diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.lu b/examples/smpi/NAS/config/NAS.samples/suite.def.lu
new file mode 100644
index 0000000000..442e0b6842
--- /dev/null
+++ b/examples/smpi/NAS/config/NAS.samples/suite.def.lu
@@ -0,0 +1,29 @@
+lu	S	1
+lu	S	2
+lu	S	4
+lu	S	8
+lu	S	16
+lu	A	1
+lu	A	2
+lu	A	4
+lu	A	8
+lu	A	16
+lu	A	32
+lu	A	64
+lu	A	128
+lu	B	1
+lu	B	2
+lu	B	4
+lu	B	8
+lu	B	16
+lu	B	32
+lu	B	64
+lu	B	128
+lu	C	1
+lu	C	2
+lu	C	4
+lu	C	8
+lu	C	16
+lu	C	32
+lu	C	64
+lu	C	128
diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.mg b/examples/smpi/NAS/config/NAS.samples/suite.def.mg
new file mode 100644
index 0000000000..b5c01d4557
--- /dev/null
+++ b/examples/smpi/NAS/config/NAS.samples/suite.def.mg
@@ -0,0 +1,29 @@
+mg	S	1
+mg	S	2
+mg	S	4
+mg	S	8
+mg	S	16
+mg	A	1
+mg	A	2
+mg	A	4
+mg	A	8
+mg	A	16
+mg	A	32
+mg	A	64
+mg	A	128
+mg	B	1
+mg	B	2
+mg	B	4
+mg	B	8
+mg	B	16
+mg	B	32
+mg	B	64
+mg	B	128
+mg	C	1
+mg	C	2
+mg	C	4
+mg	C	8
+mg	C	16
+mg	C	32
+mg	C	64
+mg	C	128
diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.small b/examples/smpi/NAS/config/NAS.samples/suite.def.small
new file mode 100644
index 0000000000..5a094042bd
--- /dev/null
+++ b/examples/smpi/NAS/config/NAS.samples/suite.def.small
@@ -0,0 +1,8 @@
+bt	S	1
+cg	S	1
+ep	S	1
+ft	S	1
+is	S	1
+lu	S	1
+mg	S	1
+sp	S	1
diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.sp b/examples/smpi/NAS/config/NAS.samples/suite.def.sp
new file mode 100644
index 0000000000..f8113a256e
--- /dev/null
+++ b/examples/smpi/NAS/config/NAS.samples/suite.def.sp
@@ -0,0 +1,37 @@
+sp	S	1
+sp	S	4
+sp	S	9
+sp	S	16
+sp	A	1
+sp	A	4
+sp	A	9
+sp	A	16
+sp	A	25
+sp	A	36
+sp	A	49
+sp	A	64
+sp	A	81
+sp	A	100
+sp	A	121
+sp	B	1
+sp	B	4
+sp	B	9
+sp	B	16
+sp	B	25
+sp	B	36
+sp	B	49
+sp	B	64
+sp	B	81
+sp	B	100
+sp	B	121
+sp	C	1
+sp	C	4
+sp	C	9
+sp	C	16
+sp	C	25
+sp	C	36
+sp	C	49
+sp	C	64
+sp	C	81
+sp	C	100
+sp	C	121
diff --git a/examples/smpi/NAS/config/make.def b/examples/smpi/NAS/config/make.def
new file mode 100644
index 0000000000..ffcfda2bef
--- /dev/null
+++ b/examples/smpi/NAS/config/make.def
@@ -0,0 +1,162 @@
+#---------------------------------------------------------------------------
+#
+#                SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS. 
+#
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# Items in this file will need to be changed for each platform.
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# Parallel Fortran:
+#
+# For CG, EP, FT, MG, LU, SP and BT, which are in Fortran, the following must 
+# be defined:
+#
+# MPIF77     - Fortran compiler
+# FFLAGS     - Fortran compilation arguments
+# FMPI_INC   - any -I arguments required for compiling MPI/Fortran 
+# FLINK      - Fortran linker
+# FLINKFLAGS - Fortran linker arguments
+# FMPI_LIB   - any -L and -l arguments required for linking MPI/Fortran 
+# 
+# compilations are done with $(MPIF77) $(FMPI_INC) $(FFLAGS) or
+#                            $(MPIF77) $(FFLAGS)
+# linking is done with       $(FLINK) $(FMPI_LIB) $(FLINKFLAGS)
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# This is the fortran compiler used for MPI programs
+#---------------------------------------------------------------------------
+MPIF77 = smpicc
+# This links MPI fortran programs; usually the same as ${MPIF77}
+FLINK	= $(MPIF77)
+
+#---------------------------------------------------------------------------
+# These macros are passed to the linker to help link with MPI correctly
+#---------------------------------------------------------------------------
+FMPI_LIB  = -lgfortran
+
+#---------------------------------------------------------------------------
+# These macros are passed to the compiler to help find 'mpif.h'
+#---------------------------------------------------------------------------
+FMPI_INC = -I/usr/lib/openmpi/include
+
+#---------------------------------------------------------------------------
+# Global *compile time* flags for Fortran programs
+#---------------------------------------------------------------------------
+FFLAGS	= -O2
+
+#---------------------------------------------------------------------------
+# Global *link time* flags. Flags for increasing maximum executable 
+# size usually go here. 
+#---------------------------------------------------------------------------
+FLINKFLAGS = -O2
+
+
+#---------------------------------------------------------------------------
+# Parallel C:
+#
+# For IS, which is in C, the following must be defined:
+#
+# MPICC      - C compiler 
+# CFLAGS     - C compilation arguments
+# CMPI_INC   - any -I arguments required for compiling MPI/C 
+# CLINK      - C linker
+# CLINKFLAGS - C linker flags
+# CMPI_LIB   - any -L and -l arguments required for linking MPI/C 
+#
+# compilations are done with $(MPICC) $(CMPI_INC) $(CFLAGS) or
+#                            $(MPICC) $(CFLAGS)
+# linking is done with       $(CLINK) $(CMPI_LIB) $(CLINKFLAGS)
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# This is the C compiler used for MPI programs
+#---------------------------------------------------------------------------
+MPICC = smpicc
+# This links MPI C programs; usually the same as ${MPICC}
+CLINK	= $(MPICC)
+
+#---------------------------------------------------------------------------
+# These macros are passed to the linker to help link with MPI correctly
+#---------------------------------------------------------------------------
+CMPI_LIB  =
+
+#---------------------------------------------------------------------------
+# These macros are passed to the compiler to help find 'mpi.h'
+#---------------------------------------------------------------------------
+CMPI_INC =
+
+#---------------------------------------------------------------------------
+# Global *compile time* flags for C programs
+#---------------------------------------------------------------------------
+CFLAGS	= -O2
+
+#---------------------------------------------------------------------------
+# Global *link time* flags. Flags for increasing maximum executable 
+# size usually go here. 
+#---------------------------------------------------------------------------
+CLINKFLAGS = -O2
+
+
+#---------------------------------------------------------------------------
+# MPI dummy library:
+#
+# Uncomment if you want to use the MPI dummy library supplied by NAS instead 
+# of the true message-passing library. The include file redefines several of
+# the above macros. It also invokes make in subdirectory MPI_dummy. Make 
+# sure that no spaces or tabs precede include.
+#---------------------------------------------------------------------------
+# include ../config/make.dummy
+
+
+#---------------------------------------------------------------------------
+# Utilities C:
+#
+# This is the C compiler used to compile C utilities.  Flags required by 
+# this compiler go here also; typically there are few flags required; hence 
+# there are no separate macros provided for such flags.
+#---------------------------------------------------------------------------
+CC	= gcc -g
+
+
+#---------------------------------------------------------------------------
+# Destination of executables, relative to subdirs of the main directory. . 
+#---------------------------------------------------------------------------
+BINDIR	= ../bin
+
+
+#---------------------------------------------------------------------------
+# Some machines (e.g. Crays) have 128-bit DOUBLE PRECISION numbers, which
+# is twice the precision required for the NPB suite. A compiler flag 
+# (e.g. -dp) can usually be used to change DOUBLE PRECISION variables to
+# 64 bits, but the MPI library may continue to send 128 bits. Short of
+# recompiling MPI, the solution is to use MPI_REAL to send these 64-bit
+# numbers, and MPI_COMPLEX to send their complex counterparts. Uncomment
+# the following line to enable this substitution. 
+# 
+# NOTE: IF THE I/O BENCHMARK IS BEING BUILT, WE USE CONVERTFLAG TO
+#       SPECIFIY THE FORTRAN RECORD LENGTH UNIT. IT IS A SYSTEM-SPECIFIC
+#       VALUE (USUALLY 1 OR 4). UNCOMMENT THE SECOND LINE AND SUBSTITUTE
+#       THE CORRECT VALUE FOR "length".
+#       IF BOTH 128-BIT DOUBLE PRECISION NUMBERS AND I/O ARE TO BE ENABLED,
+#       UNCOMMENT THE THIRD LINE AND SUBSTITUTE THE CORRECT VALUE FOR
+#       "length"
+#---------------------------------------------------------------------------
+# CONVERTFLAG	= -DCONVERTDOUBLE
+# CONVERTFLAG	= -DFORTRAN_REC_SIZE=length
+# CONVERTFLAG	= -DCONVERTDOUBLE -DFORTRAN_REC_SIZE=length
+
+
+#---------------------------------------------------------------------------
+# The variable RAND controls which random number generator 
+# is used. It is described in detail in README.install. 
+# Use "randi8" unless there is a reason to use another one. 
+# Other allowed values are "randi8_safe", "randdp" and "randdpvec"
+#---------------------------------------------------------------------------
+RAND   = randi8
+# The following is highly reliable but may be slow:
+# RAND   = randdp
+
diff --git a/examples/smpi/NAS/config/make.def.template b/examples/smpi/NAS/config/make.def.template
new file mode 100644
index 0000000000..8cccc29439
--- /dev/null
+++ b/examples/smpi/NAS/config/make.def.template
@@ -0,0 +1,162 @@
+#---------------------------------------------------------------------------
+#
+#                SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS. 
+#
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# Items in this file will need to be changed for each platform.
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# Parallel Fortran:
+#
+# For CG, EP, FT, MG, LU, SP and BT, which are in Fortran, the following must 
+# be defined:
+#
+# MPIF77     - Fortran compiler
+# FFLAGS     - Fortran compilation arguments
+# FMPI_INC   - any -I arguments required for compiling MPI/Fortran 
+# FLINK      - Fortran linker
+# FLINKFLAGS - Fortran linker arguments
+# FMPI_LIB   - any -L and -l arguments required for linking MPI/Fortran 
+# 
+# compilations are done with $(MPIF77) $(FMPI_INC) $(FFLAGS) or
+#                            $(MPIF77) $(FFLAGS)
+# linking is done with       $(FLINK) $(FMPI_LIB) $(FLINKFLAGS)
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# This is the fortran compiler used for MPI programs
+#---------------------------------------------------------------------------
+MPIF77 = f77
+# This links MPI fortran programs; usually the same as ${MPIF77}
+FLINK	= $(MPIF77)
+
+#---------------------------------------------------------------------------
+# These macros are passed to the linker to help link with MPI correctly
+#---------------------------------------------------------------------------
+FMPI_LIB  = -L/usr/local/lib -lmpi
+
+#---------------------------------------------------------------------------
+# These macros are passed to the compiler to help find 'mpif.h'
+#---------------------------------------------------------------------------
+FMPI_INC = -I/usr/local/include
+
+#---------------------------------------------------------------------------
+# Global *compile time* flags for Fortran programs
+#---------------------------------------------------------------------------
+FFLAGS	= -O
+
+#---------------------------------------------------------------------------
+# Global *link time* flags. Flags for increasing maximum executable 
+# size usually go here. 
+#---------------------------------------------------------------------------
+FLINKFLAGS = -O
+
+
+#---------------------------------------------------------------------------
+# Parallel C:
+#
+# For IS, which is in C, the following must be defined:
+#
+# MPICC      - C compiler 
+# CFLAGS     - C compilation arguments
+# CMPI_INC   - any -I arguments required for compiling MPI/C 
+# CLINK      - C linker
+# CLINKFLAGS - C linker flags
+# CMPI_LIB   - any -L and -l arguments required for linking MPI/C 
+#
+# compilations are done with $(MPICC) $(CMPI_INC) $(CFLAGS) or
+#                            $(MPICC) $(CFLAGS)
+# linking is done with       $(CLINK) $(CMPI_LIB) $(CLINKFLAGS)
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# This is the C compiler used for MPI programs
+#---------------------------------------------------------------------------
+MPICC = cc
+# This links MPI C programs; usually the same as ${MPICC}
+CLINK	= $(MPICC)
+
+#---------------------------------------------------------------------------
+# These macros are passed to the linker to help link with MPI correctly
+#---------------------------------------------------------------------------
+CMPI_LIB  = -L/usr/local/lib -lmpi
+
+#---------------------------------------------------------------------------
+# These macros are passed to the compiler to help find 'mpi.h'
+#---------------------------------------------------------------------------
+CMPI_INC = -I/usr/local/include
+
+#---------------------------------------------------------------------------
+# Global *compile time* flags for C programs
+#---------------------------------------------------------------------------
+CFLAGS	= -O
+
+#---------------------------------------------------------------------------
+# Global *link time* flags. Flags for increasing maximum executable 
+# size usually go here. 
+#---------------------------------------------------------------------------
+CLINKFLAGS = -O
+
+
+#---------------------------------------------------------------------------
+# MPI dummy library:
+#
+# Uncomment if you want to use the MPI dummy library supplied by NAS instead 
+# of the true message-passing library. The include file redefines several of
+# the above macros. It also invokes make in subdirectory MPI_dummy. Make 
+# sure that no spaces or tabs precede include.
+#---------------------------------------------------------------------------
+# include ../config/make.dummy
+
+
+#---------------------------------------------------------------------------
+# Utilities C:
+#
+# This is the C compiler used to compile C utilities.  Flags required by 
+# this compiler go here also; typically there are few flags required; hence 
+# there are no separate macros provided for such flags.
+#---------------------------------------------------------------------------
+CC	= cc -g
+
+
+#---------------------------------------------------------------------------
+# Destination of executables, relative to subdirs of the main directory. . 
+#---------------------------------------------------------------------------
+BINDIR	= ../bin
+
+
+#---------------------------------------------------------------------------
+# Some machines (e.g. Crays) have 128-bit DOUBLE PRECISION numbers, which
+# is twice the precision required for the NPB suite. A compiler flag 
+# (e.g. -dp) can usually be used to change DOUBLE PRECISION variables to
+# 64 bits, but the MPI library may continue to send 128 bits. Short of
+# recompiling MPI, the solution is to use MPI_REAL to send these 64-bit
+# numbers, and MPI_COMPLEX to send their complex counterparts. Uncomment
+# the following line to enable this substitution. 
+# 
+# NOTE: IF THE I/O BENCHMARK IS BEING BUILT, WE USE CONVERTFLAG TO
+#       SPECIFIY THE FORTRAN RECORD LENGTH UNIT. IT IS A SYSTEM-SPECIFIC
+#       VALUE (USUALLY 1 OR 4). UNCOMMENT THE SECOND LINE AND SUBSTITUTE
+#       THE CORRECT VALUE FOR "length".
+#       IF BOTH 128-BIT DOUBLE PRECISION NUMBERS AND I/O ARE TO BE ENABLED,
+#       UNCOMMENT THE THIRD LINE AND SUBSTITUTE THE CORRECT VALUE FOR
+#       "length"
+#---------------------------------------------------------------------------
+# CONVERTFLAG	= -DCONVERTDOUBLE
+# CONVERTFLAG	= -DFORTRAN_REC_SIZE=length
+# CONVERTFLAG	= -DCONVERTDOUBLE -DFORTRAN_REC_SIZE=length
+
+
+#---------------------------------------------------------------------------
+# The variable RAND controls which random number generator 
+# is used. It is described in detail in README.install. 
+# Use "randi8" unless there is a reason to use another one. 
+# Other allowed values are "randi8_safe", "randdp" and "randdpvec"
+#---------------------------------------------------------------------------
+RAND   = randi8
+# The following is highly reliable but may be slow:
+# RAND   = randdp
+
diff --git a/examples/smpi/NAS/config/make.dummy b/examples/smpi/NAS/config/make.dummy
new file mode 100644
index 0000000000..16b2350667
--- /dev/null
+++ b/examples/smpi/NAS/config/make.dummy
@@ -0,0 +1,7 @@
+FMPI_LIB  = -L../MPI_dummy -lmpi
+FMPI_INC  = -I../MPI_dummy
+CMPI_LIB  = -L../MPI_dummy -lmpi
+CMPI_INC  = -I../MPI_dummy
+default:: ${PROGRAM} libmpi.a
+libmpi.a: 
+	cd ../MPI_dummy; $(MAKE) F77=$(MPIF77) CC=$(MPICC)
diff --git a/examples/smpi/NAS/config/suite.def.template b/examples/smpi/NAS/config/suite.def.template
new file mode 100644
index 0000000000..aea8b2358e
--- /dev/null
+++ b/examples/smpi/NAS/config/suite.def.template
@@ -0,0 +1,24 @@
+# config/suite.def
+# This file is used to build several benchmarks with a single command. 
+# Typing "make suite" in the main directory will build all the benchmarks
+# specified in this file. 
+# Each line of this file contains a benchmark name, class, and number
+# of nodes. The name is one of "cg", "is", "ep", mg", "ft", "sp", "bt", 
+# "lu", and "dt". 
+# The class is one of "S", "W", "A", "B", "C", "D", and "E"
+# (except that no classes C, D and E for DT, and no class E for IS).
+# The number of nodes must be a legal number for a particular
+# benchmark. The utility which parses this file is primitive, so
+# formatting is inflexible. Separate name/class/number by tabs. 
+# Comments start with "#" as the first character on a line. 
+# No blank lines. 
+# The following example builds 1 processor sample sizes of all benchmarks. 
+ft	S	1
+mg	S	1
+sp	S	1
+lu	S	1
+bt	S	1
+is	S	1
+ep	S	1
+cg	S	1
+dt	S	1
diff --git a/examples/smpi/NAS/sys/Makefile b/examples/smpi/NAS/sys/Makefile
new file mode 100644
index 0000000000..56d1c444bd
--- /dev/null
+++ b/examples/smpi/NAS/sys/Makefile
@@ -0,0 +1,22 @@
+include ../config/make.def
+
+# Note that COMPILE is also defined in make.common and should
+# be the same. We can't include make.common because it has a lot
+# of other garbage. LINK is not defined in make.common because
+# ${MPI_LIB} needs to go at the end of the line. 
+FCOMPILE = $(MPIF77) -c $(FMPI_INC) $(FFLAGS)
+
+all: setparams 
+
+# setparams creates an npbparam.h file for each benchmark 
+# configuration. npbparams.h also contains info about how a benchmark
+# was compiled and linked
+
+setparams: setparams.c ../config/make.def
+	$(CC) ${CONVERTFLAG} -o setparams setparams.c
+
+
+clean: 
+	-rm -f setparams setparams.h npbparams.h
+	-rm -f *~ *.o
+
diff --git a/examples/smpi/NAS/sys/README b/examples/smpi/NAS/sys/README
new file mode 100644
index 0000000000..3c97c524c3
--- /dev/null
+++ b/examples/smpi/NAS/sys/README
@@ -0,0 +1,39 @@
+This directory contains utilities and files used by the 
+build process. You should not need to change anything
+in this directory. 
+
+Original Files
+--------------
+setparams.c:
+        Source for the setparams program. This program is used internally
+        in the build process to create the file "npbparams.h" for each 
+        benchmark. npbparams.h contains Fortran or C parameters to build a 
+        benchmark for a specific class and number of nodes. The setparams 
+        program is never run directly by a user. Its invocation syntax is 
+        "setparams benchmark-name nprocs class". 
+        It examines the file "npbparams.h" in the current directory. If 
+        the specified parameters are the same as those in the npbparams.h 
+        file, nothing it changed. If the file does not exist or corresponds 
+        to a different class/number of nodes, it is (re)built. 
+	One of the more complicated things in npbparams.h is that it 
+        contains, in a Fortran string, the compiler flags used to build a 
+        benchmark, so that a benchmark can print out how it was compiled. 
+
+make.common
+        A makefile segment that is included in each individual benchmark
+        program makefile. It sets up some standard macros (COMPILE, etc) 
+        and makes sure everything is configured correctly (npbparams.h)
+
+Makefile
+        Builds  setparams
+
+README
+        This file. 
+
+
+Created files
+-------------
+
+setparams
+	See descriptions above
+
diff --git a/examples/smpi/NAS/sys/make.common b/examples/smpi/NAS/sys/make.common
new file mode 100644
index 0000000000..4469596990
--- /dev/null
+++ b/examples/smpi/NAS/sys/make.common
@@ -0,0 +1,54 @@
+PROGRAM  = $(BINDIR)/$(BENCHMARK).$(CLASS).$(NPROCS)
+FCOMPILE = $(MPIF77) -c $(FMPI_INC) $(FFLAGS)
+CCOMPILE = $(MPICC)  -c $(CMPI_INC) $(CFLAGS)
+
+# Class "U" is used internally by the setparams program to mean
+# "unknown". This means that if you don't specify CLASS=
+# on the command line, you'll get an error. It would be nice
+# to be able to avoid this, but we'd have to get information
+# from the setparams back to the make program, which isn't easy. 
+CLASS=U
+NPROCS=1
+
+default:: ${PROGRAM}
+
+# This makes sure the configuration utility setparams 
+# is up to date. 
+# Note that this must be run every time, which is why the
+# target does not exist and is not created. 
+# If you create a file called "config" you will break things. 
+config:
+	@cd ../sys; ${MAKE} all
+	../sys/setparams ${BENCHMARK} ${NPROCS} ${CLASS} ${SUBTYPE}
+
+COMMON=../common
+${COMMON}/${RAND}.o: ${COMMON}/${RAND}.f
+	cd ${COMMON}; ${FCOMPILE} ${RAND}.f
+${COMMON}/c_randdp.o: ${COMMON}/randdp.c
+	cd ${COMMON}; ${CCOMPILE} -o c_randdp.o randdp.c
+
+${COMMON}/print_results.o: ${COMMON}/print_results.f
+	cd ${COMMON}; ${FCOMPILE} print_results.f
+
+${COMMON}/c_print_results.o: ${COMMON}/c_print_results.c
+	cd ${COMMON}; ${CCOMPILE} c_print_results.c
+
+${COMMON}/timers.o: ${COMMON}/timers.f
+	cd ${COMMON}; ${FCOMPILE} timers.f
+
+${COMMON}/c_timers.o: ${COMMON}/c_timers.c
+	cd ${COMMON}; ${CCOMPILE} c_timers.c
+
+# Normally setparams updates npbparams.h only if the settings (CLASS/NPROCS)
+# have changed. However, we also want to update if the compile options
+# may have changed (set in ../config/make.def). 
+npbparams.h: ../config/make.def
+	@ echo make.def modified. Rebuilding npbparams.h just in case
+	rm -f npbparams.h
+	../sys/setparams ${BENCHMARK} ${NPROCS} ${CLASS} ${SUBTYPE}
+
+# So that "make benchmark-name" works
+${BENCHMARK}:  default
+${BENCHMARKU}: default
+
+
diff --git a/examples/smpi/NAS/sys/print_header b/examples/smpi/NAS/sys/print_header
new file mode 100755
index 0000000000..4fdb5785d2
--- /dev/null
+++ b/examples/smpi/NAS/sys/print_header
@@ -0,0 +1,5 @@
+echo '   ========================================='
+echo '   =      NAS Parallel Benchmarks 3.3      ='
+echo '   =      MPI/F77/C                        ='
+echo '   ========================================='
+echo ''
diff --git a/examples/smpi/NAS/sys/print_instructions b/examples/smpi/NAS/sys/print_instructions
new file mode 100755
index 0000000000..d2f1999b7f
--- /dev/null
+++ b/examples/smpi/NAS/sys/print_instructions
@@ -0,0 +1,26 @@
+echo ''
+echo '   To make a NAS benchmark type '
+echo ''
+echo '         make <benchmark-name> NPROCS=<number> CLASS=<class> [SUBTYPE=<type>]'
+echo ''
+echo '   where <benchmark-name>  is "bt", "cg", "ep", "ft", "is", "lu",'
+echo '                              "mg", or "sp"'
+echo '         <number>          is the number of processors'
+echo '         <class>           is "S", "W", "A", "B", "C", or "D"'
+echo ''
+echo '   Only when making the I/O benchmark:'
+echo ''
+echo '         <benchmark-name>  is "bt"'
+echo '         <number>, <class> as above'
+echo '         <type>            is "full", "simple", "fortran", or "epio"'
+echo ''
+echo '   To make a set of benchmarks, create the file config/suite.def'
+echo '   according to the instructions in config/suite.def.template and type'
+echo ''
+echo '         make suite'
+echo ''
+echo ' ***************************************************************'
+echo ' * Remember to edit the file config/make.def for site specific *'
+echo ' * information as described in the README file                 *'
+echo ' ***************************************************************'
+
diff --git a/examples/smpi/NAS/sys/setparams.c b/examples/smpi/NAS/sys/setparams.c
new file mode 100644
index 0000000000..9a4fba590c
--- /dev/null
+++ b/examples/smpi/NAS/sys/setparams.c
@@ -0,0 +1,1210 @@
+/* 
+ * This utility configures a NPB to be built for a specific number
+ * of nodes and a specific class. It creates a file "npbparams.h" 
+ * in the source directory. This file keeps state information about 
+ * which size of benchmark is currently being built (so that nothing
+ * if unnecessarily rebuilt) and defines (through PARAMETER statements)
+ * the number of nodes and class for which a benchmark is being built. 
+
+ * The utility takes 3 arguments: 
+ *       setparams benchmark-name nprocs class
+ *    benchmark-name is "sp", "bt", etc
+ *    nprocs is the number of processors to run on
+ *    class is the size of the benchmark
+ * These parameters are checked for the current benchmark. If they
+ * are invalid, this program prints a message and aborts. 
+ * If the parameters are ok, the current npbsize.h (actually just
+ * the first line) is read in. If the new parameters are the same as 
+ * the old, nothing is done, but an exit code is returned to force the
+ * user to specify (otherwise the make procedure succeeds but builds a
+ * binary of the wrong name).  Otherwise the file is rewritten. 
+ * Errors write a message (to stdout) and abort. 
+ * 
+ * This program makes use of two extra benchmark "classes"
+ * class "X" means an invalid specification. It is returned if
+ * there is an error parsing the config file. 
+ * class "U" is an external specification meaning "unknown class"
+ * 
+ * Unfortunately everything has to be case sensitive. This is
+ * because we can always convert lower to upper or v.v. but
+ * can't feed this information back to the makefile, so typing
+ * make CLASS=a and make CLASS=A will produce different binaries.
+ *
+ * 
+ */
+
+#include <sys/types.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <time.h>
+
+/*
+ * This is the master version number for this set of 
+ * NPB benchmarks. It is in an obscure place so people
+ * won't accidentally change it. 
+ */
+
+#define VERSION "3.3"
+
+/* controls verbose output from setparams */
+/* #define VERBOSE */
+
+#define FILENAME "npbparams.h"
+#define DESC_LINE "c NPROCS = %d CLASS = %c\n"
+#define BT_DESC_LINE "c NPROCS = %d CLASS = %c SUBTYPE = %s\n"
+#define DEF_CLASS_LINE     "#define CLASS '%c'\n"
+#define DEF_NUM_PROCS_LINE "#define NUM_PROCS %d\n"
+#define FINDENT  "        "
+#define CONTINUE "     > "
+
+#ifdef FORTRAN_REC_SIZE
+int fortran_rec_size = FORTRAN_REC_SIZE;
+#else
+int fortran_rec_size = 4;
+#endif
+
+void get_info(int argc, char *argv[], int *typep, int *nprocsp, char *classp,
+	      int* subtypep);
+void check_info(int type, int nprocs, char class);
+void read_info(int type, int *nprocsp, char *classp, int *subtypep);
+void write_info(int type, int nprocs, char class, int subtype);
+void write_sp_info(FILE *fp, int nprocs, char class);
+void write_bt_info(FILE *fp, int nprocs, char class, int io);
+void write_lu_info(FILE *fp, int nprocs, char class);
+void write_mg_info(FILE *fp, int nprocs, char class);
+void write_cg_info(FILE *fp, int nprocs, char class);
+void write_ft_info(FILE *fp, int nprocs, char class);
+void write_ep_info(FILE *fp, int nprocs, char class);
+void write_ep_info_C(FILE *fp, int nprocs, char class);  /* after C translation */
+void write_is_info(FILE *fp, int nprocs, char class);
+void write_dt_info(FILE *fp, int nprocs, char class);
+void write_compiler_info(int type, FILE *fp);
+void write_convertdouble_info(int type, FILE *fp);
+void check_line(char *line, char *label, char *val);
+int  check_include_line(char *line, char *filename);
+void put_string(FILE *fp, char *name, char *val);
+void put_def_string(FILE *fp, char *name, char *val);
+void put_def_variable(FILE *fp, char *name, char *val);
+int isqrt(int i);
+int ilog2(int i);
+int ipow2(int i);
+
+enum benchmark_types {SP, BT, LU, MG, FT, IS, DT, EP, CG};
+enum iotypes { NONE = 0, FULL, SIMPLE, EPIO, FORTRAN};
+
+int main(int argc, char *argv[])
+{
+  int nprocs, nprocs_old, type;
+  char class, class_old;
+  int subtype = -1, old_subtype = -1;
+  
+  /* Get command line arguments. Make sure they're ok. */
+  get_info(argc, argv, &type, &nprocs, &class, &subtype);
+  if (class != 'U') {
+#ifdef VERBOSE
+    printf("setparams: For benchmark %s: number of processors = %d class = %c\n", 
+	   argv[1], nprocs, class); 
+#endif
+    check_info(type, nprocs, class);
+  }
+
+  /* Get old information. */
+  read_info(type, &nprocs_old, &class_old, &old_subtype);
+  if (class != 'U') {
+    if (class_old != 'X') {
+#ifdef VERBOSE
+      printf("setparams:     old settings: number of processors = %d class = %c\n", 
+	     nprocs_old, class_old); 
+#endif
+    }
+  } else {
+    printf("setparams:\n\
+  *********************************************************************\n\
+  * You must specify NPROCS and CLASS to build this benchmark         *\n\
+  * For example, to build a class A benchmark for 4 processors, type  *\n\
+  *       make {benchmark-name} NPROCS=4 CLASS=A                      *\n\
+  *********************************************************************\n\n"); 
+
+    if (class_old != 'X') {
+#ifdef VERBOSE
+      printf("setparams: Previous settings were CLASS=%c NPROCS=%d\n", 
+	     class_old, nprocs_old); 
+#endif
+    }
+    exit(1); /* exit on class==U */
+  }
+
+  /* Write out new information if it's different. */
+  if (nprocs != nprocs_old || class != class_old || subtype != old_subtype) {
+#ifdef VERBOSE
+    printf("setparams: Writing %s\n", FILENAME); 
+#endif
+    write_info(type, nprocs, class, subtype);
+  } else {
+#ifdef VERBOSE
+    printf("setparams: Settings unchanged. %s unmodified\n", FILENAME); 
+#endif
+  }
+
+  return 0;
+}
+
+
+/*
+ *  get_info(): Get parameters from command line 
+ */
+
+void get_info(int argc, char *argv[], int *typep, int *nprocsp, char *classp,
+	      int *subtypep) 
+{
+
+  if (argc < 4) {
+    printf("Usage: %s (%d) benchmark-name nprocs class\n", argv[0], argc);
+    exit(1);
+  }
+
+  *nprocsp = atoi(argv[2]);
+
+  *classp = *argv[3];
+
+  if      (!strcmp(argv[1], "sp") || !strcmp(argv[1], "SP")) *typep = SP;
+  else if (!strcmp(argv[1], "ft") || !strcmp(argv[1], "FT")) *typep = FT;
+  else if (!strcmp(argv[1], "lu") || !strcmp(argv[1], "LU")) *typep = LU;
+  else if (!strcmp(argv[1], "mg") || !strcmp(argv[1], "MG")) *typep = MG;
+  else if (!strcmp(argv[1], "is") || !strcmp(argv[1], "IS")) *typep = IS;
+  else if (!strcmp(argv[1], "dt") || !strcmp(argv[1], "DT")) *typep = DT;
+  else if (!strcmp(argv[1], "ep") || !strcmp(argv[1], "EP")) *typep = EP;
+  else if (!strcmp(argv[1], "cg") || !strcmp(argv[1], "CG")) *typep = CG;
+  else if (!strcmp(argv[1], "bt") || !strcmp(argv[1], "BT")) {
+    *typep = BT;
+    if (argc != 5) {
+      /* printf("Usage: %s (%d) benchmark-name nprocs class\n", argv[0], argc); */
+      /* exit(1); */
+      *subtypep = NONE;
+    } else {
+      if (!strcmp(argv[4], "full") || !strcmp(argv[4], "FULL")) {
+        *subtypep = FULL;
+      } else if (!strcmp(argv[4], "simple") || !strcmp(argv[4], "SIMPLE")) {
+        *subtypep = SIMPLE;
+      } else if (!strcmp(argv[4], "epio") || !strcmp(argv[4], "EPIO")) {
+        *subtypep = EPIO;
+      } else if (!strcmp(argv[4], "fortran") || !strcmp(argv[4], "FORTRAN")) {
+        *subtypep = FORTRAN;
+      } else if (!strcmp(argv[4], "none") || !strcmp(argv[4], "NONE")) {
+        *subtypep = NONE;
+      } else {
+        printf("setparams: Error: unknown btio type %s\n", argv[4]);
+        exit(1);
+      }
+    }
+  } else {
+    printf("setparams: Error: unknown benchmark type %s\n", argv[1]);
+    exit(1);
+  }
+}
+
+/*
+ *  check_info(): Make sure command line data is ok for this benchmark 
+ */
+
+void check_info(int type, int nprocs, char class) 
+{
+  int rootprocs, logprocs; 
+
+  /* check number of processors */
+  if (nprocs <= 0) {
+    printf("setparams: Number of processors must be greater than zero\n");
+    exit(1);
+  }
+  switch(type) {
+
+  case SP:
+  case BT:
+    rootprocs = isqrt(nprocs);
+    if (rootprocs < 0) {
+      printf("setparams: Number of processors %d must be a square (1,4,9,...) for this benchmark", 
+              nprocs);
+      exit(1);
+    }
+    if (class == 'S' && nprocs > 16) {
+      printf("setparams: BT and SP sample sizes cannot be run on more\n");
+      printf("           than 16 processors because the cell size would be too small.\n");
+      exit(1);
+    }
+    break;
+
+  case CG:
+  case FT:
+  case MG:
+  case IS:
+  case LU:
+    logprocs = ilog2(nprocs);
+    if (logprocs < 0) {
+      printf("setparams: Number of processors must be a power of two (1,2,4,...) for this benchmark\n");
+      exit(1);
+    }
+
+    break;
+
+  case EP:
+  case DT:
+    break;
+
+  default:
+    /* never should have gotten this far with a bad name */
+    printf("setparams: (Internal Error) Benchmark type %d unknown to this program\n", type); 
+    exit(1);
+  }
+
+  /* check class */
+  if (class != 'S' && 
+      class != 'W' && 
+      class != 'A' && 
+      class != 'B' && 
+      class != 'C' && 
+      class != 'D' && 
+      class != 'E') {
+    printf("setparams: Unknown benchmark class %c\n", class); 
+    printf("setparams: Allowed classes are \"S\", \"W\", and \"A\" through \"E\"\n");
+    exit(1);
+  }
+
+  if (class == 'E' && (type == IS || type == DT)) {
+    printf("setparams: Benchmark class %c not defined for IS or DT\n", class);
+    exit(1);
+  }
+
+  if (class == 'D' && type == IS && nprocs < 4) {
+    printf("setparams: IS class D size cannot be run on less than 4 processors\n");
+    exit(1);
+  }
+}
+
+
+/* 
+ * read_info(): Read previous information from file. 
+ *              Not an error if file doesn't exist, because this
+ *              may be the first time we're running. 
+ *              Assumes the first line of the file is in a special
+ *              format that we understand (since we wrote it). 
+ */
+
+void read_info(int type, int *nprocsp, char *classp, int *subtypep)
+{
+  int nread = 0;
+  FILE *fp;
+  fp = fopen(FILENAME, "r");
+  if (fp == NULL) {
+#ifdef VERBOSE
+    printf("setparams: INFO: configuration file %s does not exist (yet)\n", FILENAME); 
+#endif
+    goto abort;
+  }
+  
+  /* first line of file contains info (fortran), first two lines (C) */
+
+  switch(type) {
+      case BT: {
+	  char subtype_str[100];
+          nread = fscanf(fp, BT_DESC_LINE, nprocsp, classp, subtype_str);
+          if (nread != 3) {
+            if (nread != 2) {
+              printf("setparams: Error parsing config file %s. Ignoring previous settings\n", FILENAME);
+              goto abort;
+	    }
+	    *subtypep = 0;
+	    break;
+          }
+          if (!strcmp(subtype_str, "full") || !strcmp(subtype_str, "FULL")) {
+		*subtypep = FULL;
+          } else if (!strcmp(subtype_str, "simple") ||
+		     !strcmp(subtype_str, "SIMPLE")) {
+		*subtypep = SIMPLE;
+          } else if (!strcmp(subtype_str, "epio") || !strcmp(subtype_str, "EPIO")) {
+		*subtypep = EPIO;
+          } else if (!strcmp(subtype_str, "fortran") ||
+		     !strcmp(subtype_str, "FORTRAN")) {
+		*subtypep = FORTRAN;
+          } else {
+		*subtypep = -1;
+	  }
+          break;
+      }
+
+      case SP:
+      case FT:
+      case MG:
+      case LU:
+      //case EP:
+      case CG:
+          nread = fscanf(fp, DESC_LINE, nprocsp, classp);
+          if (nread != 2) {
+            printf("setparams: Error line %d parsing config file %s. Ignoring previous settings\n", __LINE__,FILENAME);
+            goto abort;
+          }
+          break;
+      case IS:
+      case EP:
+      case DT:
+          nread = fscanf(fp, DEF_CLASS_LINE, classp);
+          nread += fscanf(fp, DEF_NUM_PROCS_LINE, nprocsp);
+          if (nread != 2) {
+            printf("setparams: Error line %d parsing config file %s. Ignoring previous settings\n", __LINE__,FILENAME);
+            goto abort;
+          }
+          break;
+      default:
+        /* never should have gotten this far with a bad name */
+        printf("setparams: (Internal Error) Benchmark type %d unknown to this program\n", type); 
+        exit(1);
+  }
+
+  fclose(fp);
+
+
+  return;
+
+ abort:
+  *nprocsp = -1;
+  *classp = 'X';
+  *subtypep = -1;
+  return;
+}
+
+
+/* 
+ * write_info(): Write new information to config file. 
+ *               First line is in a special format so we can read
+ *               it in again. Then comes a warning. The rest is all
+ *               specific to a particular benchmark. 
+ */
+
+void write_info(int type, int nprocs, char class, int subtype) 
+{
+  FILE *fp;
+  char *BT_TYPES[] = {"NONE", "FULL", "SIMPLE", "EPIO", "FORTRAN"};
+
+  fp = fopen(FILENAME, "w");
+  if (fp == NULL) {
+    printf("setparams: Can't open file %s for writing\n", FILENAME);
+    exit(1);
+  }
+
+  switch(type) {
+      case BT:
+          /* Write out the header */
+	  if (subtype == -1 || subtype == 0) {
+            fprintf(fp, DESC_LINE, nprocs, class);
+	  } else {
+            fprintf(fp, BT_DESC_LINE, nprocs, class, BT_TYPES[subtype]);
+	  }
+          /* Print out a warning so bozos don't mess with the file */
+          fprintf(fp, "\
+c  \n\
+c  \n\
+c  This file is generated automatically by the setparams utility.\n\
+c  It sets the number of processors and the class of the NPB\n\
+c  in this directory. Do not modify it by hand.\n\
+c  \n");
+
+          break;
+	
+      case SP:
+      case FT:
+      case MG:
+      case LU:
+      //case EP:
+      case CG:
+          /* Write out the header */
+          fprintf(fp, DESC_LINE, nprocs, class);
+          /* Print out a warning so bozos don't mess with the file */
+          fprintf(fp, "\
+c  \n\
+c  \n\
+c  This file is generated automatically by the setparams utility.\n\
+c  It sets the number of processors and the class of the NPB\n\
+c  in this directory. Do not modify it by hand.\n\
+c  \n");
+
+          break;
+	case EP:
+      case IS:
+      case DT:
+          fprintf(fp, DEF_CLASS_LINE, class);
+          fprintf(fp, DEF_NUM_PROCS_LINE, nprocs);
+          fprintf(fp, "\
+/*\n\
+   This file is generated automatically by the setparams utility.\n\
+   It sets the number of processors and the class of the NPB\n\
+   in this directory. Do not modify it by hand.   */\n\
+   \n");
+          break;
+      default:
+          printf("setparams: (Internal error): Unknown benchmark type %d\n", 
+                                                                         type);
+          exit(1);
+  }
+
+  /* Now do benchmark-specific stuff */
+  switch(type) {
+  case SP:
+    write_sp_info(fp, nprocs, class);
+    break;
+  case LU:
+    write_lu_info(fp, nprocs, class);
+    break;
+  case MG:
+    write_mg_info(fp, nprocs, class);
+    break;
+  case IS:
+    write_is_info(fp, nprocs, class);  
+    break;
+  case DT:
+    write_dt_info(fp, nprocs, class);  
+    break;
+  case FT:
+    write_ft_info(fp, nprocs, class);
+    break;
+  case EP:
+    //write_ep_info(fp, nprocs, class);
+    write_ep_info_C(fp, nprocs, class);
+    break;
+  case CG:
+    write_cg_info(fp, nprocs, class);
+    break;
+  case BT:
+    write_bt_info(fp, nprocs, class, subtype);
+    break;
+  default:
+    printf("setparams: (Internal error): Unknown benchmark type %d\n", type);
+    exit(1);
+  }
+  write_convertdouble_info(type, fp);
+  write_compiler_info(type, fp);
+  fclose(fp);
+  return;
+}
+
+
+/* 
+ * write_sp_info(): Write SP specific info to config file
+ */
+
+void write_sp_info(FILE *fp, int nprocs, char class) 
+{
+  int maxcells, problem_size, niter;
+  char *dt;
+  maxcells = isqrt(nprocs);
+  if      (class == 'S') { problem_size = 12;  dt = "0.015d0";   niter = 100; }
+  else if (class == 'W') { problem_size = 36;  dt = "0.0015d0";  niter = 400; }
+  else if (class == 'A') { problem_size = 64;  dt = "0.0015d0";  niter = 400; }
+  else if (class == 'B') { problem_size = 102; dt = "0.001d0";   niter = 400; }
+  else if (class == 'C') { problem_size = 162; dt = "0.00067d0"; niter = 400; }
+  else if (class == 'D') { problem_size = 408; dt = "0.00030d0"; niter = 500; }
+  else if (class == 'E') { problem_size = 1020; dt = "0.0001d0"; niter = 500; }
+  else {
+    printf("setparams: Internal error: invalid class %c\n", class);
+    exit(1);
+  }
+  fprintf(fp, "%sinteger maxcells, problem_size, niter_default\n", FINDENT);
+  fprintf(fp, "%sparameter (maxcells=%d, problem_size=%d, niter_default=%d)\n", 
+	       FINDENT, maxcells, problem_size, niter);
+  fprintf(fp, "%sdouble precision dt_default\n", FINDENT);
+  fprintf(fp, "%sparameter (dt_default = %s)\n", FINDENT, dt);
+}
+  
+/* 
+ * write_bt_info(): Write BT specific info to config file
+ */
+
+void write_bt_info(FILE *fp, int nprocs, char class, int io) 
+{
+  int maxcells, problem_size, niter, wr_interval;
+  char *dt;
+  maxcells = isqrt(nprocs);
+  if      (class == 'S') { problem_size = 12;  dt = "0.010d0";    niter = 60;  }
+  else if (class == 'W') { problem_size = 24;  dt = "0.0008d0";   niter = 200; }
+  else if (class == 'A') { problem_size = 64;  dt = "0.0008d0";   niter = 200; }
+  else if (class == 'B') { problem_size = 102; dt = "0.0003d0";   niter = 200; }
+  else if (class == 'C') { problem_size = 162; dt = "0.0001d0";   niter = 200; }
+  else if (class == 'D') { problem_size = 408; dt = "0.00002d0";  niter = 250; }
+  else if (class == 'E') { problem_size = 1020; dt = "0.4d-5";    niter = 250; }
+  else {
+    printf("setparams: Internal error: invalid class %c\n", class);
+    exit(1);
+  }
+  wr_interval = 5;
+  fprintf(fp, "%sinteger maxcells, problem_size, niter_default\n", FINDENT);
+  fprintf(fp, "%sparameter (maxcells=%d, problem_size=%d, niter_default=%d)\n", 
+	       FINDENT, maxcells, problem_size, niter);
+  fprintf(fp, "%sdouble precision dt_default\n", FINDENT);
+  fprintf(fp, "%sparameter (dt_default = %s)\n", FINDENT, dt);
+  fprintf(fp, "%sinteger wr_default\n", FINDENT);
+  fprintf(fp, "%sparameter (wr_default = %d)\n", FINDENT, wr_interval);
+  fprintf(fp, "%sinteger iotype\n", FINDENT);
+  fprintf(fp, "%sparameter (iotype = %d)\n", FINDENT, io);
+  if (io) {
+    fprintf(fp, "%scharacter*(*) filenm\n", FINDENT);
+    switch (io) {
+	case FULL:
+	    fprintf(fp, "%sparameter (filenm = 'btio.full.out')\n", FINDENT);
+	    break;
+	case SIMPLE:
+	    fprintf(fp, "%sparameter (filenm = 'btio.simple.out')\n", FINDENT);
+	    break;
+	case EPIO:
+	    fprintf(fp, "%sparameter (filenm = 'btio.epio.out')\n", FINDENT);
+	    break;
+	case FORTRAN:
+	    fprintf(fp, "%sparameter (filenm = 'btio.fortran.out')\n", FINDENT);
+	    fprintf(fp, "%sinteger fortran_rec_sz\n", FINDENT);
+	    fprintf(fp, "%sparameter (fortran_rec_sz = %d)\n",
+		    FINDENT, fortran_rec_size);
+	    break;
+	default:
+	    break;
+    }
+  }
+}
+  
+
+
+/* 
+ * write_lu_info(): Write SP specific info to config file
+ */
+
+void write_lu_info(FILE *fp, int nprocs, char class) 
+{
+  int isiz1, isiz2, itmax, inorm, problem_size;
+  int xdiv, ydiv; /* number of cells in x and y direction */
+  char *dt_default;
+
+  if      (class == 'S') { problem_size = 12;  dt_default = "0.5d0";  itmax = 50; }
+  else if (class == 'W') { problem_size = 33;  dt_default = "1.5d-3"; itmax = 300; }
+  else if (class == 'A') { problem_size = 64;  dt_default = "2.0d0";  itmax = 250; }
+  else if (class == 'B') { problem_size = 102; dt_default = "2.0d0";  itmax = 250; }
+  else if (class == 'C') { problem_size = 162; dt_default = "2.0d0";  itmax = 250; }
+  else if (class == 'D') { problem_size = 408; dt_default = "1.0d0";  itmax = 300; }
+  else if (class == 'E') { problem_size = 1020; dt_default = "0.5d0"; itmax = 300; }
+  else {
+    printf("setparams: Internal error: invalid class %c\n", class);
+    exit(1);
+  }
+  inorm = itmax;
+  xdiv = ydiv = ilog2(nprocs)/2;
+  if (xdiv+ydiv != ilog2(nprocs)) xdiv += 1;
+  xdiv = ipow2(xdiv); ydiv = ipow2(ydiv);
+  isiz1 = problem_size/xdiv; if (isiz1*xdiv < problem_size) isiz1++;
+  isiz2 = problem_size/ydiv; if (isiz2*ydiv < problem_size) isiz2++;
+  
+
+  fprintf(fp, "\nc number of nodes for which this version is compiled\n");
+  fprintf(fp, "%sinteger nnodes_compiled\n", FINDENT);
+  fprintf(fp, "%sparameter (nnodes_compiled = %d)\n", FINDENT, nprocs);
+
+  fprintf(fp, "\nc full problem size\n");
+  fprintf(fp, "%sinteger isiz01, isiz02, isiz03\n", FINDENT);
+  fprintf(fp, "%sparameter (isiz01=%d, isiz02=%d, isiz03=%d)\n", 
+	  FINDENT, problem_size, problem_size, problem_size);
+
+  fprintf(fp, "\nc sub-domain array size\n");
+  fprintf(fp, "%sinteger isiz1, isiz2, isiz3\n", FINDENT);
+  fprintf(fp, "%sparameter (isiz1=%d, isiz2=%d, isiz3=isiz03)\n", 
+	       FINDENT, isiz1, isiz2);
+
+  fprintf(fp, "\nc number of iterations and how often to print the norm\n");
+  fprintf(fp, "%sinteger itmax_default, inorm_default\n", FINDENT);
+  fprintf(fp, "%sparameter (itmax_default=%d, inorm_default=%d)\n", 
+	  FINDENT, itmax, inorm);
+
+  fprintf(fp, "%sdouble precision dt_default\n", FINDENT);
+  fprintf(fp, "%sparameter (dt_default = %s)\n", FINDENT, dt_default);
+  
+}
+
+/* 
+ * write_mg_info(): Write MG specific info to config file
+ */
+
+void write_mg_info(FILE *fp, int nprocs, char class) 
+{
+  int problem_size, nit, log2_size, log2_nprocs, lt_default, lm;
+  int ndim1, ndim2, ndim3;
+  if      (class == 'S') { problem_size = 32;   nit = 4; }
+  else if (class == 'W') { problem_size = 128;  nit = 4; }
+  else if (class == 'A') { problem_size = 256;  nit = 4; }
+  else if (class == 'B') { problem_size = 256;  nit = 20; }
+  else if (class == 'C') { problem_size = 512;  nit = 20; }
+  else if (class == 'D') { problem_size = 1024; nit = 50; }
+  else if (class == 'E') { problem_size = 2048; nit = 50; }
+  else {
+    printf("setparams: Internal error: invalid class type %c\n", class);
+    exit(1);
+  }
+  log2_size = ilog2(problem_size);
+  log2_nprocs = ilog2(nprocs);
+  /* lt is log of largest total dimension */
+  lt_default = log2_size;
+  /* log of log of maximum dimension on a node */
+  lm = log2_size - log2_nprocs/3;
+  ndim1 = lm;
+  ndim3 = log2_size - (log2_nprocs+2)/3;
+  ndim2 = log2_size - (log2_nprocs+1)/3;
+
+  fprintf(fp, "%sinteger nprocs_compiled\n", FINDENT);
+  fprintf(fp, "%sparameter (nprocs_compiled = %d)\n", FINDENT, nprocs);
+  fprintf(fp, "%sinteger nx_default, ny_default, nz_default\n", FINDENT);
+  fprintf(fp, "%sparameter (nx_default=%d, ny_default=%d, nz_default=%d)\n", 
+	  FINDENT, problem_size, problem_size, problem_size);
+  fprintf(fp, "%sinteger nit_default, lm, lt_default\n", FINDENT);
+  fprintf(fp, "%sparameter (nit_default=%d, lm = %d, lt_default=%d)\n", 
+	  FINDENT, nit, lm, lt_default);
+  fprintf(fp, "%sinteger debug_default\n", FINDENT);
+  fprintf(fp, "%sparameter (debug_default=%d)\n", FINDENT, 0);
+  fprintf(fp, "%sinteger ndim1, ndim2, ndim3\n", FINDENT);
+  fprintf(fp, "%sparameter (ndim1 = %d, ndim2 = %d, ndim3 = %d)\n", 
+	  FINDENT, ndim1, ndim2, ndim3);
+}
+
+
+/* 
+ * write_dt_info(): Write DT specific info to config file
+ */
+
+void write_dt_info(FILE *fp, int nprocs, char class) 
+{
+  int num_samples,deviation,num_sources;
+  if      (class == 'S') { num_samples=1728; deviation=128; num_sources=4; }
+  else if (class == 'W') { num_samples=1728*8; deviation=128*2; num_sources=4*2; }
+  else if (class == 'A') { num_samples=1728*64; deviation=128*4; num_sources=4*4; }
+  else if (class == 'B') { num_samples=1728*512; deviation=128*8; num_sources=4*8; }
+  else if (class == 'C') { num_samples=1728*4096; deviation=128*16; num_sources=4*16; }
+  else if (class == 'D') { num_samples=1728*4096*8; deviation=128*32; num_sources=4*32; }
+  else {
+    printf("setparams: Internal error: invalid class type %c\n", class);
+    exit(1);
+  }
+  fprintf(fp, "#define NUM_SAMPLES %d\n", num_samples);
+  fprintf(fp, "#define STD_DEVIATION %d\n", deviation);
+  fprintf(fp, "#define NUM_SOURCES %d\n", num_sources);
+}
+
+/* 
+ * write_is_info(): Write IS specific info to config file
+ */
+
+void write_is_info(FILE *fp, int nprocs, char class) 
+{
+  if( class != 'S' &&
+      class != 'W' &&
+      class != 'A' &&
+      class != 'B' &&
+      class != 'C' &&
+      class != 'D' )
+  {
+    printf("setparams: Internal error: invalid class type %c\n", class);
+    exit(1);
+  }
+}
+
+/* 
+ * write_cg_info(): Write CG specific info to config file
+ */
+
+void write_cg_info(FILE *fp, int nprocs, char class) 
+{
+  int na,nonzer,niter;
+  char *shift,*rcond="1.0d-1";
+  char *shiftS="10.",
+       *shiftW="12.",
+       *shiftA="20.",
+       *shiftB="60.",
+       *shiftC="110.",
+       *shiftD="500.",
+       *shiftE="1.5d3";
+
+  int num_proc_cols, num_proc_rows;
+
+
+  if( class == 'S' )
+  { na=1400;    nonzer=7;  niter=15;  shift=shiftS; }
+  else if( class == 'W' )
+  { na=7000;    nonzer=8;  niter=15;  shift=shiftW; }
+  else if( class == 'A' )
+  { na=14000;   nonzer=11; niter=15;  shift=shiftA; }
+  else if( class == 'B' )
+  { na=75000;   nonzer=13; niter=75;  shift=shiftB; }
+  else if( class == 'C' )
+  { na=150000;  nonzer=15; niter=75;  shift=shiftC; }
+  else if( class == 'D' )
+  { na=1500000; nonzer=21; niter=100; shift=shiftD; }
+  else if( class == 'E' )
+  { na=9000000; nonzer=26; niter=100; shift=shiftE; }
+  else
+  {
+    printf("setparams: Internal error: invalid class type %c\n", class);
+    exit(1);
+  }
+  fprintf( fp, "%sinteger            na, nonzer, niter\n", FINDENT );
+  fprintf( fp, "%sdouble precision   shift, rcond\n", FINDENT );
+  fprintf( fp, "%sparameter(  na=%d,\n", FINDENT, na );
+  fprintf( fp, "%s             nonzer=%d,\n", CONTINUE, nonzer );
+  fprintf( fp, "%s             niter=%d,\n", CONTINUE, niter );
+  fprintf( fp, "%s             shift=%s,\n", CONTINUE, shift );
+  fprintf( fp, "%s             rcond=%s )\n", CONTINUE, rcond );
+
+
+  num_proc_cols = num_proc_rows = ilog2(nprocs)/2;
+  if (num_proc_cols+num_proc_rows != ilog2(nprocs)) num_proc_cols += 1;
+  num_proc_cols = ipow2(num_proc_cols); num_proc_rows = ipow2(num_proc_rows);
+  
+  fprintf( fp, "\nc number of nodes for which this version is compiled\n" );
+  fprintf( fp, "%sinteger    nnodes_compiled\n", FINDENT );
+  fprintf( fp, "%sparameter( nnodes_compiled = %d)\n", FINDENT, nprocs );
+  fprintf( fp, "%sinteger    num_proc_cols, num_proc_rows\n", FINDENT );
+  fprintf( fp, "%sparameter( num_proc_cols=%d, num_proc_rows=%d )\n", 
+                                                          FINDENT,
+                                                          num_proc_cols,
+                                                          num_proc_rows );
+}
+
+
+/* 
+ * write_ft_info(): Write FT specific info to config file
+ */
+
+void write_ft_info(FILE *fp, int nprocs, char class) 
+{
+  /* easiest way (given the way the benchmark is written)
+   * is to specify log of number of grid points in each
+   * direction m1, m2, m3. nt is the number of iterations
+   */
+  int nx, ny, nz, maxdim, niter;
+  if      (class == 'S') { nx = 64;   ny = 64;   nz = 64;   niter = 6;}
+  else if (class == 'W') { nx = 128;  ny = 128;  nz = 32;   niter = 6;}
+  else if (class == 'A') { nx = 256;  ny = 256;  nz = 128;  niter = 6;}
+  else if (class == 'B') { nx = 512;  ny = 256;  nz = 256;  niter =20;}
+  else if (class == 'C') { nx = 512;  ny = 512;  nz = 512;  niter =20;}
+  else if (class == 'D') { nx = 2048; ny = 1024; nz = 1024; niter =25;}
+  else if (class == 'E') { nx = 4096; ny = 2048; nz = 2048; niter =25;}
+  else {
+    printf("setparams: Internal error: invalid class type %c\n", class);
+    exit(1);
+  }
+  maxdim = nx;
+  if (ny > maxdim) maxdim = ny;
+  if (nz > maxdim) maxdim = nz;
+  fprintf(fp, "%sinteger nx, ny, nz, maxdim, niter_default, ntdivnp, np_min\n", FINDENT);
+  fprintf(fp, "%sparameter (nx=%d, ny=%d, nz=%d, maxdim=%d)\n", 
+          FINDENT, nx, ny, nz, maxdim);
+  fprintf(fp, "%sparameter (niter_default=%d)\n", FINDENT, niter);
+  fprintf(fp, "%sparameter (np_min = %d)\n", FINDENT, nprocs);
+  fprintf(fp, "%sparameter (ntdivnp=((nx*ny)/np_min)*nz)\n", FINDENT);
+  fprintf(fp, "%sdouble precision ntotal_f\n", FINDENT);
+  fprintf(fp, "%sparameter (ntotal_f=1.d0*nx*ny*nz)\n", FINDENT);
+}
+
+/*
+ * write_ep_info(): Write EP specific info to config file
+ */
+
+void write_ep_info(FILE *fp, int nprocs, char class)
+{
+  /* easiest way (given the way the benchmark is written)
+   * is to specify log of number of grid points in each
+   * direction m1, m2, m3. nt is the number of iterations
+   */
+  int m;
+  if      (class == 'S') { m = 24; }
+  else if (class == 'W') { m = 25; }
+  else if (class == 'A') { m = 28; }
+  else if (class == 'B') { m = 30; }
+  else if (class == 'C') { m = 32; }
+  else if (class == 'D') { m = 36; }
+  else if (class == 'E') { m = 40; }
+  else {
+    printf("setparams: Internal error: invalid class type %c\n", class);
+    exit(1);
+  }
+  /* number of processors given by "npm" */
+
+
+  fprintf(fp, "%scharacter class\n",FINDENT);
+  fprintf(fp, "%sparameter (class =\'%c\')\n",
+                  FINDENT, class);
+  fprintf(fp, "%sinteger m, npm\n", FINDENT);
+  fprintf(fp, "%sparameter (m=%d, npm=%d)\n",
+          FINDENT, m, nprocs);
+}
+/*
+ * write_ep_info_C(): Write EP specific info to config file
+ */
+
+
+void write_ep_info_C(FILE *fp, int nprocs, char class)
+{
+  /* easiest way (given the way the benchmark is written)
+   * is to specify log of number of grid points in each
+   * direction m1, m2, m3. nt is the number of iterations
+   */
+  int m;
+  if      (class == 'S') { m = 24; }
+  else if (class == 'W') { m = 25; }
+  else if (class == 'A') { m = 28; }
+  else if (class == 'B') { m = 30; }
+  else if (class == 'C') { m = 32; }
+  else if (class == 'D') { m = 36; }
+  else if (class == 'E') { m = 40; }
+  else {
+    printf("setparams: Internal error: invalid class type %c\n", class);
+    exit(1);
+  }
+  /* number of processors given by "npm" */
+
+
+  fprintf(fp, "%schar *_class=\"%c\";\n",FINDENT,class);
+  fprintf(fp, "%sint m=%d;\n", FINDENT,m);
+  fprintf(fp, "%sint npm=%d;\n", FINDENT,nprocs);
+}
+/* 
+ * This is a gross hack to allow the benchmarks to 
+ * print out how they were compiled. Various other ways
+ * of doing this have been tried and they all fail on
+ * some machine - due to a broken "make" program, or
+ * F77 limitations, of whatever. Hopefully this will
+ * always work because it uses very portable C. Unfortunately
+ * it relies on parsing the make.def file - YUK. 
+ * If your machine doesn't have <string.h> or <ctype.h>, happy hacking!
+ * 
+ */
+
+#define VERBOSE
+#define LL 400
+#include <stdio.h>
+#define DEFFILE "../config/make.def"
+#define DEFAULT_MESSAGE "(none)"
+FILE *deffile;
+void write_compiler_info(int type, FILE *fp)
+{
+  char line[LL];
+  char mpif77[LL], flink[LL], fmpi_lib[LL], fmpi_inc[LL], fflags[LL], flinkflags[LL];
+  char compiletime[LL], randfile[LL];
+  char mpicc[LL], cflags[LL], clink[LL], clinkflags[LL],
+       cmpi_lib[LL], cmpi_inc[LL];
+  struct tm *tmp;
+  time_t t;
+  deffile = fopen(DEFFILE, "r");
+  if (deffile == NULL) {
+    printf("\n\
+setparams: File %s doesn't exist. To build the NAS benchmarks\n\
+           you need to create is according to the instructions\n\
+           in the README in the main directory and comments in \n\
+           the file config/make.def.template\n", DEFFILE);
+    exit(1);
+  }
+  strcpy(mpif77, DEFAULT_MESSAGE);
+  strcpy(flink, DEFAULT_MESSAGE);
+  strcpy(fmpi_lib, DEFAULT_MESSAGE);
+  strcpy(fmpi_inc, DEFAULT_MESSAGE);
+  strcpy(fflags, DEFAULT_MESSAGE);
+  strcpy(flinkflags, DEFAULT_MESSAGE);
+  strcpy(randfile, DEFAULT_MESSAGE);
+  strcpy(mpicc, DEFAULT_MESSAGE);
+  strcpy(cflags, DEFAULT_MESSAGE);
+  strcpy(clink, DEFAULT_MESSAGE);
+  strcpy(clinkflags, DEFAULT_MESSAGE);
+  strcpy(cmpi_lib, DEFAULT_MESSAGE);
+  strcpy(cmpi_inc, DEFAULT_MESSAGE);
+
+  while (fgets(line, LL, deffile) != NULL) {
+    if (*line == '#') continue;
+    /* yes, this is inefficient. but it's simple! */
+    check_line(line, "MPIF77", mpif77);
+    check_line(line, "FLINK", flink);
+    check_line(line, "FMPI_LIB", fmpi_lib);
+    check_line(line, "FMPI_INC", fmpi_inc);
+    check_line(line, "FFLAGS", fflags);
+    check_line(line, "FLINKFLAGS", flinkflags);
+    check_line(line, "RAND", randfile);
+    check_line(line, "MPICC", mpicc);
+    check_line(line, "CFLAGS", cflags);
+    check_line(line, "CLINK", clink);
+    check_line(line, "CLINKFLAGS", clinkflags);
+    check_line(line, "CMPI_LIB", cmpi_lib);
+    check_line(line, "CMPI_INC", cmpi_inc);
+    /* if the dummy library is used by including make.dummy, we set the
+       Fortran and C paths to libraries and headers accordingly     */
+    if(check_include_line(line, "../config/make.dummy")) {
+       strcpy(fmpi_lib, "-L../MPI_dummy -lmpi");
+       strcpy(fmpi_inc, "-I../MPI_dummy");
+       strcpy(cmpi_lib, "-L../MPI_dummy -lmpi");
+       strcpy(cmpi_inc, "-I../MPI_dummy");
+    }
+  }
+
+  
+  (void) time(&t);
+  tmp = localtime(&t);
+  (void) strftime(compiletime, (size_t)LL, "%d %b %Y", tmp);
+
+
+  switch(type) {
+      case FT:
+      case SP:
+      case BT:
+      case MG:
+      case LU:
+      //case EP:
+      case CG:
+          put_string(fp, "compiletime", compiletime);
+          put_string(fp, "npbversion", VERSION);
+          put_string(fp, "cs1", mpif77);
+          put_string(fp, "cs2", flink);
+          put_string(fp, "cs3", fmpi_lib);
+          put_string(fp, "cs4", fmpi_inc);
+          put_string(fp, "cs5", fflags);
+          put_string(fp, "cs6", flinkflags);
+	  put_string(fp, "cs7", randfile);
+          break;
+      case IS:
+      case EP:
+      case DT:
+          put_def_string(fp, "COMPILETIME", compiletime);
+          put_def_string(fp, "NPBVERSION", VERSION);
+          put_def_string(fp, "MPICC", mpicc);
+          put_def_string(fp, "CFLAGS", cflags);
+          put_def_string(fp, "CLINK", clink);
+          put_def_string(fp, "CLINKFLAGS", clinkflags);
+          put_def_string(fp, "CMPI_LIB", cmpi_lib);
+          put_def_string(fp, "CMPI_INC", cmpi_inc);
+          break;
+      default:
+          printf("setparams: (Internal error): Unknown benchmark type %d\n", 
+                                                                         type);
+          exit(1);
+  }
+
+}
+
+void check_line(char *line, char *label, char *val)
+{
+  char *original_line;
+  int n;
+  original_line = line;
+  /* compare beginning of line and label */
+  while (*label != '\0' && *line == *label) {
+    line++; label++; 
+  }
+  /* if *label is not EOS, we must have had a mismatch */
+  if (*label != '\0') return;
+  /* if *line is not a space, actual label is longer than test label */
+  if (!isspace(*line) && *line != '=') return ; 
+  /* skip over white space */
+  while (isspace(*line)) line++;
+  /* next char should be '=' */
+  if (*line != '=') return;
+  /* skip over white space */
+  while (isspace(*++line));
+  /* if EOS, nothing was specified */
+  if (*line == '\0') return;
+  /* finally we've come to the value */
+  strcpy(val, line);
+  /* chop off the newline at the end */
+  n = strlen(val)-1;
+  if (n >= 0 && val[n] == '\n')
+    val[n--] = '\0';
+  if (n >= 0 && val[n] == '\r')
+    val[n--] = '\0';
+  /* treat continuation */
+  while (val[n] == '\\' && fgets(original_line, LL, deffile)) {
+     line = original_line;
+     while (isspace(*line)) line++;
+     if (isspace(*original_line)) val[n++] = ' ';
+     while (*line && *line != '\n' && *line != '\r' && n < LL-1)
+       val[n++] = *line++;
+     val[n] = '\0';
+     n--;
+  }
+/*  if (val[strlen(val) - 1] == '\\') {
+    printf("\n\
+setparams: Error in file make.def. Because of the way in which\n\
+           command line arguments are incorporated into the\n\
+           executable benchmark, you can't have any continued\n\
+           lines in the file make.def, that is, lines ending\n\
+           with the character \"\\\". Although it may be ugly, \n\
+           you should be able to reformat without continuation\n\
+           lines. The offending line is\n\
+  %s\n", original_line);
+    exit(1);
+  } */
+}
+
+int check_include_line(char *line, char *filename)
+{
+  char *include_string = "include";
+  /* compare beginning of line and "include" */
+  while (*include_string != '\0' && *line == *include_string) {
+    line++; include_string++; 
+  }
+  /* if *include_string is not EOS, we must have had a mismatch */
+  if (*include_string != '\0') return(0);
+  /* if *line is not a space, first word is not "include" */
+  if (!isspace(*line)) return(0); 
+  /* skip over white space */
+  while (isspace(*++line));
+  /* if EOS, nothing was specified */
+  if (*line == '\0') return(0);
+  /* next keyword should be name of include file in *filename */
+  while (*filename != '\0' && *line == *filename) {
+    line++; filename++; 
+  }  
+  if (*filename != '\0' || 
+      (*line != ' ' && *line != '\0' && *line !='\n')) return(0);
+  else return(1);
+}
+
+
+#define MAXL 46
+void put_string(FILE *fp, char *name, char *val)
+{
+  int len;
+  len = strlen(val);
+  if (len > MAXL) {
+    val[MAXL] = '\0';
+    val[MAXL-1] = '.';
+    val[MAXL-2] = '.';
+    val[MAXL-3] = '.';
+    len = MAXL;
+  }
+  fprintf(fp, "%scharacter*%d %s\n", FINDENT, len, name);
+  fprintf(fp, "%sparameter (%s=\'%s\')\n", FINDENT, name, val);
+}
+
+/* NOTE: is the ... stuff necessary in C? */
+void put_def_string(FILE *fp, char *name, char *val)
+{
+  int len;
+  len = strlen(val);
+  if (len > MAXL) {
+    val[MAXL] = '\0';
+    val[MAXL-1] = '.';
+    val[MAXL-2] = '.';
+    val[MAXL-3] = '.';
+    len = MAXL;
+  }
+  fprintf(fp, "#define %s \"%s\"\n", name, val);
+}
+
+void put_def_variable(FILE *fp, char *name, char *val)
+{
+  int len;
+  len = strlen(val);
+  if (len > MAXL) {
+    val[MAXL] = '\0';
+    val[MAXL-1] = '.';
+    val[MAXL-2] = '.';
+    val[MAXL-3] = '.';
+    len = MAXL;
+  }
+  fprintf(fp, "#define %s %s\n", name, val);
+}
+
+
+
+#if 0
+
+/* this version allows arbitrarily long lines but 
+ * some compilers don't like that and they're rarely
+ * useful 
+ */
+
+#define LINELEN 65
+void put_string(FILE *fp, char *name, char *val)
+{
+  int len, nlines, pos, i;
+  char line[100];
+  len = strlen(val);
+  nlines = len/LINELEN;
+  if (nlines*LINELEN < len) nlines++;
+  fprintf(fp, "%scharacter*%d %s\n", FINDENT, nlines*LINELEN, name);
+  fprintf(fp, "%sparameter (%s = \n", FINDENT, name);
+  for (i = 0; i < nlines; i++) {
+    pos = i*LINELEN;
+    if (i == 0) fprintf(fp, "%s\'", CONTINUE);
+    else        fprintf(fp, "%s", CONTINUE);
+    /* number should be same as LINELEN */
+    fprintf(fp, "%.65s", val+pos);
+    if (i == nlines-1) fprintf(fp, "\')\n");
+    else             fprintf(fp, "\n");
+  }
+}
+
+#endif
+
+
+/* integer square root. Return error if argument isn't
+ * a perfect square or is less than or equal to zero 
+ */
+
+int isqrt(int i)
+{
+  int root, square;
+  if (i <= 0) return(-1);
+  square = 0;
+  for (root = 1; square <= i; root++) {
+    square = root*root;
+    if (square == i) return(root);
+  }
+  return(-1);
+}
+  
+
+/* integer log base two. Return error is argument isn't
+ * a power of two or is less than or equal to zero 
+ */
+
+int ilog2(int i)
+{
+  int log2;
+  int exp2 = 1;
+  if (i <= 0) return(-1);
+
+  for (log2 = 0; log2 < 20; log2++) {
+    if (exp2 == i) return(log2);
+    exp2 *= 2;
+  }
+  return(-1);
+}
+
+int ipow2(int i)
+{
+  int pow2 = 1;
+  if (i < 0) return(-1);
+  if (i == 0) return(1);
+  while(i--) pow2 *= 2;
+  return(pow2);
+}
+ 
+
+
+void write_convertdouble_info(int type, FILE *fp)
+{
+  switch(type) {
+  case SP:
+  case BT:
+  case LU:
+  case FT:
+  case MG:
+  //case EP:
+  case CG:
+    fprintf(fp, "%slogical  convertdouble\n", FINDENT);
+#ifdef CONVERTDOUBLE
+    fprintf(fp, "%sparameter (convertdouble = .true.)\n", FINDENT);
+#else
+    fprintf(fp, "%sparameter (convertdouble = .false.)\n", FINDENT);
+#endif
+    break;
+  }
+}
diff --git a/examples/smpi/NAS/sys/suite.awk b/examples/smpi/NAS/sys/suite.awk
new file mode 100644
index 0000000000..2e5fc3137c
--- /dev/null
+++ b/examples/smpi/NAS/sys/suite.awk
@@ -0,0 +1,20 @@
+BEGIN { SMAKE = "make" } {
+  if ($1 !~ /^#/ &&  NF > 2) {
+    printf "cd `echo %s|tr '[a-z]' '[A-Z]'`; %s clean;", $1, SMAKE;
+    printf "%s CLASS=%s NPROCS=%s", SMAKE, $2, $3;
+    if ( NF > 3 ) {
+      if ( $4 ~ /^vec/ ||  $4 ~ /^VEC/ ) {
+        printf " VERSION=%s", $4;
+        if ( NF > 4 ) {
+          printf " SUBTYPE=%s", $5;
+        }
+      } else {
+        printf " SUBTYPE=%s", $4;
+        if ( NF > 4 ) {
+          printf " VERSION=%s", $5;
+        }
+      }
+    }
+    printf "; cd ..\n";
+  }
+}
-- 
2.20.1