From: Martin Quinson <martin.quinson@loria.fr>
Date: Thu, 13 Nov 2014 20:59:09 +0000 (+0100)
Subject: Remove the unmodified NAS examples as they are really useless nowadays
X-Git-Tag: v3_12~732^2~211
X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/d8983d99631ddba747941cadb391ce80243a5529

Remove the unmodified NAS examples as they are really useless nowadays

I'm still unsure of what to do with the modified ones. I vote for
removing them if we have enough examples already.
---

diff --git a/examples/smpi/NAS/BT/Makefile b/examples/smpi/NAS/BT/Makefile
deleted file mode 100644
index dd27503221..0000000000
--- a/examples/smpi/NAS/BT/Makefile
+++ /dev/null
@@ -1,106 +0,0 @@
-SHELL=/bin/sh
-BENCHMARK=bt
-BENCHMARKU=BT
-VEC=
-
-include ../config/make.def
-
-
-OBJS = bt.o make_set.o initialize.o exact_solution.o exact_rhs.o \
-       set_constants.o adi.o define.o copy_faces.o rhs.o solve_subs.o \
-       x_solve$(VEC).o y_solve$(VEC).o z_solve$(VEC).o add.o error.o \
-       verify.o setup_mpi.o \
-       ${COMMON}/print_results.o ${COMMON}/timers.o
-
-include ../sys/make.common
-
-# npbparams.h is included by header.h
-# The following rule should do the trick but many make programs (not gmake)
-# will do the wrong thing and rebuild the world every time (because the
-# mod time on header.h is not changed. One solution would be to 
-# touch header.h but this might cause confusion if someone has
-# accidentally deleted it. Instead, make the dependency on npbparams.h
-# explicit in all the lines below (even though dependence is indirect). 
-
-# header.h: npbparams.h
-
-${PROGRAM}: config
-	@if [ x$(VERSION) = xvec ] ; then	\
-		${MAKE} VEC=_vec exec;		\
-	elif [ x$(VERSION) = xVEC ] ; then	\
-		${MAKE} VEC=_vec exec;		\
-	else					\
-		${MAKE} exec;			\
-	fi
-
-exec: $(OBJS)
-	@if [ x$(SUBTYPE) = xfull ] ; then	\
-		${MAKE} bt-full;		\
-	elif [ x$(SUBTYPE) = xFULL ] ; then	\
-		${MAKE} bt-full;		\
-	elif [ x$(SUBTYPE) = xsimple ] ; then	\
-		${MAKE} bt-simple;		\
-	elif [ x$(SUBTYPE) = xSIMPLE ] ; then	\
-		${MAKE} bt-simple;		\
-	elif [ x$(SUBTYPE) = xfortran ] ; then	\
-		${MAKE} bt-fortran;		\
-	elif [ x$(SUBTYPE) = xFORTRAN ] ; then	\
-		${MAKE} bt-fortran;		\
-	elif [ x$(SUBTYPE) = xepio ] ; then	\
-		${MAKE} bt-epio;		\
-	elif [ x$(SUBTYPE) = xEPIO ] ; then	\
-		${MAKE} bt-epio;		\
-	else					\
-		${MAKE} bt-bt;			\
-	fi
-
-bt-bt: ${OBJS} btio.o
-	${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} btio.o ${FMPI_LIB}
-
-bt-full: ${OBJS} full_mpiio.o btio_common.o
-	${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.mpi_io_full ${OBJS} btio_common.o full_mpiio.o ${FMPI_LIB}
-
-bt-simple: ${OBJS} simple_mpiio.o btio_common.o
-	${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.mpi_io_simple ${OBJS} btio_common.o simple_mpiio.o ${FMPI_LIB}
-
-bt-fortran: ${OBJS} fortran_io.o btio_common.o
-	${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.fortran_io ${OBJS} btio_common.o fortran_io.o ${FMPI_LIB}
-
-bt-epio: ${OBJS} epio.o btio_common.o
-	${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.ep_io ${OBJS} btio_common.o epio.o ${FMPI_LIB}
-
-.f.o:
-	${FCOMPILE} $<
-
-.c.o:
-	${CCOMPILE} $<
-
-
-bt.o:             bt.f  header.h npbparams.h  mpinpb.h
-make_set.o:       make_set.f  header.h npbparams.h  mpinpb.h
-initialize.o:     initialize.f  header.h npbparams.h
-exact_solution.o: exact_solution.f  header.h npbparams.h
-exact_rhs.o:      exact_rhs.f  header.h npbparams.h
-set_constants.o:  set_constants.f  header.h npbparams.h
-adi.o:            adi.f  header.h npbparams.h
-define.o:         define.f  header.h npbparams.h
-copy_faces.o:     copy_faces.f  header.h npbparams.h  mpinpb.h
-rhs.o:            rhs.f  header.h npbparams.h
-x_solve$(VEC).o:  x_solve$(VEC).f  header.h work_lhs$(VEC).h npbparams.h  mpinpb.h
-y_solve$(VEC).o:  y_solve$(VEC).f  header.h work_lhs$(VEC).h npbparams.h  mpinpb.h
-z_solve$(VEC).o:  z_solve$(VEC).f  header.h work_lhs$(VEC).h npbparams.h  mpinpb.h
-solve_subs.o:     solve_subs.f  npbparams.h
-add.o:            add.f  header.h npbparams.h
-error.o:          error.f  header.h npbparams.h  mpinpb.h
-verify.o:         verify.f  header.h npbparams.h  mpinpb.h
-setup_mpi.o:      setup_mpi.f mpinpb.h npbparams.h 
-btio.o:           btio.f  header.h npbparams.h
-btio_common.o:    btio_common.f mpinpb.h npbparams.h 
-fortran_io.o:     fortran_io.f mpinpb.h npbparams.h 
-simple_mpiio.o:   simple_mpiio.f mpinpb.h npbparams.h 
-full_mpiio.o:     full_mpiio.f mpinpb.h npbparams.h 
-epio.o:           epio.f mpinpb.h npbparams.h 
-
-clean:
-	- rm -f *.o *~ mputil*
-	- rm -f  npbparams.h core
diff --git a/examples/smpi/NAS/BT/add.f b/examples/smpi/NAS/BT/add.f
deleted file mode 100644
index e14cde46ef..0000000000
--- a/examples/smpi/NAS/BT/add.f
+++ /dev/null
@@ -1,30 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine  add
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     addition of update to the vector u
-c---------------------------------------------------------------------
-
-      include 'header.h'
-
-      integer  c, i, j, k, m
-
-      do     c = 1, ncells
-         do     k = start(3,c), cell_size(3,c)-end(3,c)-1
-            do     j = start(2,c), cell_size(2,c)-end(2,c)-1
-               do     i = start(1,c), cell_size(1,c)-end(1,c)-1
-                  do    m = 1, 5
-                     u(m,i,j,k,c) = u(m,i,j,k,c) + rhs(m,i,j,k,c)
-                  enddo
-               enddo
-            enddo
-         enddo
-      enddo
-
-      return
-      end
diff --git a/examples/smpi/NAS/BT/adi.f b/examples/smpi/NAS/BT/adi.f
deleted file mode 100644
index 58450c028e..0000000000
--- a/examples/smpi/NAS/BT/adi.f
+++ /dev/null
@@ -1,21 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine  adi
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      call copy_faces
-
-      call x_solve
-
-      call y_solve
-
-      call z_solve
-
-      call add
-
-      return
-      end
-
diff --git a/examples/smpi/NAS/BT/bt.f b/examples/smpi/NAS/BT/bt.f
deleted file mode 100644
index 36e50781b3..0000000000
--- a/examples/smpi/NAS/BT/bt.f
+++ /dev/null
@@ -1,275 +0,0 @@
-!-------------------------------------------------------------------------!
-!                                                                         !
-!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
-!                                                                         !
-!                                   B T                                   !
-!                                                                         !
-!-------------------------------------------------------------------------!
-!                                                                         !
-!    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
-!    It is described in NAS Technical Reports 95-020 and 02-007.          !
-!                                                                         !
-!    Permission to use, copy, distribute and modify this software         !
-!    for any purpose with or without fee is hereby granted.  We           !
-!    request, however, that all derived work reference the NAS            !
-!    Parallel Benchmarks 3.3. This software is provided "as is"           !
-!    without express or implied warranty.                                 !
-!                                                                         !
-!    Information on NPB 3.3, including the technical report, the          !
-!    original specifications, source code, results and information        !
-!    on how to submit new results, is available at:                       !
-!                                                                         !
-!           http://www.nas.nasa.gov/Software/NPB/                         !
-!                                                                         !
-!    Send comments or suggestions to  npb@nas.nasa.gov                    !
-!                                                                         !
-!          NAS Parallel Benchmarks Group                                  !
-!          NASA Ames Research Center                                      !
-!          Mail Stop: T27A-1                                              !
-!          Moffett Field, CA   94035-1000                                 !
-!                                                                         !
-!          E-mail:  npb@nas.nasa.gov                                      !
-!          Fax:     (650) 604-3957                                        !
-!                                                                         !
-!-------------------------------------------------------------------------!
-
-c---------------------------------------------------------------------
-c
-c Authors: R. F. Van der Wijngaart
-c          T. Harris
-c          M. Yarrow
-c
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-       program MPBT
-c---------------------------------------------------------------------
-
-       include  'header.h'
-       include  'mpinpb.h'
-      
-       integer i, niter, step, c, error, fstatus
-       double precision navg, mflops, mbytes, n3
-
-       external timer_read
-       double precision t, tmax, tiominv, tpc, timer_read
-       logical verified
-       character class, cbuff*40
-
-       integer wr_interval
-
-       call setup_mpi
-       if (.not. active) goto 999
-
-c---------------------------------------------------------------------
-c      Root node reads input file (if it exists) else takes
-c      defaults from parameters
-c---------------------------------------------------------------------
-       if (node .eq. root) then
-          
-          write(*, 1000)
-          open (unit=2,file='inputbt.data',status='old', iostat=fstatus)
-c
-          rd_interval = 0
-          if (fstatus .eq. 0) then
-            write(*,233) 
- 233        format(' Reading from input file inputbt.data')
-            read (2,*) niter
-            read (2,*) dt
-            read (2,*) grid_points(1), grid_points(2), grid_points(3)
-            if (iotype .ne. 0) then
-                read (2,'(A)') cbuff
-                read (cbuff,*,iostat=i) wr_interval, rd_interval
-                if (i .ne. 0) rd_interval = 0
-                if (wr_interval .le. 0) wr_interval = wr_default
-            endif
-            if (iotype .eq. 1) then
-                read (2,*) collbuf_nodes, collbuf_size
-                write(*,*) 'collbuf_nodes ', collbuf_nodes
-                write(*,*) 'collbuf_size  ', collbuf_size
-            endif
-            close(2)
-          else
-            write(*,234) 
-            niter = niter_default
-            dt    = dt_default
-            grid_points(1) = problem_size
-            grid_points(2) = problem_size
-            grid_points(3) = problem_size
-            wr_interval = wr_default
-            if (iotype .eq. 1) then
-c             set number of nodes involved in collective buffering to 4,
-c             unless total number of nodes is smaller than that.
-c             set buffer size for collective buffering to 1MB per node
-c             collbuf_nodes = min(4,no_nodes)
-c             set default to No-File-Hints with a value of 0
-              collbuf_nodes = 0
-              collbuf_size = 1000000
-            endif
-          endif
- 234      format(' No input file inputbt.data. Using compiled defaults')
-
-          write(*, 1001) grid_points(1), grid_points(2), grid_points(3)
-          write(*, 1002) niter, dt
-          if (no_nodes .ne. total_nodes) write(*, 1004) total_nodes
-          if (no_nodes .ne. maxcells*maxcells) 
-     >        write(*, 1005) maxcells*maxcells
-          write(*, 1003) no_nodes
-
-          if (iotype .eq. 1) write(*, 1006) 'FULL MPI-IO', wr_interval
-          if (iotype .eq. 2) write(*, 1006) 'SIMPLE MPI-IO', wr_interval
-          if (iotype .eq. 3) write(*, 1006) 'EPIO', wr_interval
-          if (iotype .eq. 4) write(*, 1006) 'FORTRAN IO', wr_interval
-
- 1000 format(//, ' NAS Parallel Benchmarks 3.3 -- BT Benchmark ',/)
- 1001     format(' Size: ', i4, 'x', i4, 'x', i4)
- 1002     format(' Iterations: ', i4, '    dt: ', F11.7)
- 1004     format(' Total number of processes: ', i5)
- 1005     format(' WARNING: compiled for ', i5, ' processes ')
- 1003     format(' Number of active processes: ', i5, /)
- 1006     format(' BTIO -- ', A, ' write interval: ', i3 /)
-
-       endif
-
-       call mpi_bcast(niter, 1, MPI_INTEGER,
-     >                root, comm_setup, error)
-
-       call mpi_bcast(dt, 1, dp_type, 
-     >                root, comm_setup, error)
-
-       call mpi_bcast(grid_points(1), 3, MPI_INTEGER, 
-     >                root, comm_setup, error)
-
-       call mpi_bcast(wr_interval, 1, MPI_INTEGER,
-     >                root, comm_setup, error)
-
-       call mpi_bcast(rd_interval, 1, MPI_INTEGER,
-     >                root, comm_setup, error)
-
-       call make_set
-
-       do  c = 1, maxcells
-          if ( (cell_size(1,c) .gt. IMAX) .or.
-     >         (cell_size(2,c) .gt. JMAX) .or.
-     >         (cell_size(3,c) .gt. KMAX) ) then
-             print *,node, c, (cell_size(i,c),i=1,3)
-             print *,' Problem size too big for compiled array sizes'
-             goto 999
-          endif
-       end do
-
-       call set_constants
-
-       call initialize
-
-       call setup_btio
-       idump = 0
-
-       call lhsinit
-
-       call exact_rhs
-
-       call compute_buffer_size(5)
-
-c---------------------------------------------------------------------
-c      do one time step to touch all code, and reinitialize
-c---------------------------------------------------------------------
-       call adi
-       call initialize
-
-       call timer_clear(2)
-
-c---------------------------------------------------------------------
-c      Synchronize before placing time stamp
-c---------------------------------------------------------------------
-       call mpi_barrier(comm_setup, error)
-
-       call timer_clear(1)
-       call timer_start(1)
-
-       do  step = 1, niter
-
-          if (node .eq. root) then
-             if (mod(step, 20) .eq. 0 .or. step .eq. niter .or.
-     >           step .eq. 1) then
-                write(*, 200) step
- 200            format(' Time step ', i4)
-             endif
-          endif
-
-          call adi
-
-          if (iotype .ne. 0) then
-              call timer_start(2)
-              if (mod(step, wr_interval).eq.0 .or. step .eq. niter) then
-                  if (node .eq. root) then
-                      print *, 'Writing data set, time step', step
-                  endif
-                  if (step .eq. niter .and. rd_interval .gt. 1) then
-                      rd_interval = 1
-                  endif
-                  call output_timestep
-                  idump = idump + 1
-              endif
-              call timer_stop(2)
-          endif
-       end do
-
-       call btio_cleanup
-
-       call timer_stop(1)
-       t = timer_read(1)
-       
-       call verify(niter, class, verified)
-
-       call mpi_reduce(t, tmax, 1, 
-     >                 dp_type, MPI_MAX, 
-     >                 root, comm_setup, error)
-
-       if (iotype .ne. 0) then
-          t = timer_read(2)
-          if (t .ne. 0.d0) t = 1.0d0 / t
-          call mpi_reduce(t, tiominv, 1, 
-     >                    dp_type, MPI_SUM, 
-     >                    root, comm_setup, error)
-       endif
-
-       if( node .eq. root ) then
-          n3 = 1.0d0*grid_points(1)*grid_points(2)*grid_points(3)
-          navg = (grid_points(1)+grid_points(2)+grid_points(3))/3.0
-          if( tmax .ne. 0. ) then
-             mflops = 1.0e-6*float(niter)*
-     >     (3478.8*n3-17655.7*navg**2+28023.7*navg)
-     >     / tmax
-          else
-             mflops = 0.0
-          endif
-
-          if (iotype .ne. 0) then
-             mbytes = n3 * 40.0 * idump * 1.0d-6
-             tiominv = tiominv / no_nodes
-             t = 0.0
-             if (tiominv .ne. 0.) t = 1.d0 / tiominv
-             tpc = 0.0
-             if (tmax .ne. 0.) tpc = t * 100.0 / tmax
-             write(*,1100) t, tpc, mbytes, mbytes*tiominv
- 1100        format(/' BTIO -- statistics:'/
-     >               '   I/O timing in seconds   : ', f14.2/
-     >               '   I/O timing percentage   : ', f14.2/
-     >               '   Total data written (MB) : ', f14.2/
-     >               '   I/O data rate  (MB/sec) : ', f14.2)
-          endif
-
-         call print_results('BT', class, grid_points(1), 
-     >     grid_points(2), grid_points(3), niter, maxcells*maxcells, 
-     >     total_nodes, tmax, mflops, '          floating point', 
-     >     verified, npbversion,compiletime, cs1, cs2, cs3, cs4, cs5, 
-     >     cs6, '(none)')
-       endif
-
- 999   continue
-       call mpi_barrier(MPI_COMM_WORLD, error)
-       call mpi_finalize(error)
-
-       end
-
diff --git a/examples/smpi/NAS/BT/btio.f b/examples/smpi/NAS/BT/btio.f
deleted file mode 100644
index 1fb730b1f6..0000000000
--- a/examples/smpi/NAS/BT/btio.f
+++ /dev/null
@@ -1,72 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine setup_btio
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine output_timestep
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine btio_cleanup
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine btio_verify(verified)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      logical verified
-
-      verified = .true.
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine accumulate_norms(xce_acc)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      double precision xce_acc(5)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine checksum_timestep
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      return
-      end
diff --git a/examples/smpi/NAS/BT/btio_common.f b/examples/smpi/NAS/BT/btio_common.f
deleted file mode 100644
index 9227a12b70..0000000000
--- a/examples/smpi/NAS/BT/btio_common.f
+++ /dev/null
@@ -1,30 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine clear_timestep
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer cio, kio, jio, ix
-
-      do cio=1,ncells
-          do kio=0, cell_size(3,cio)-1
-              do jio=0, cell_size(2,cio)-1
-                  do ix=0,cell_size(1,cio)-1
-                            u(1,ix, jio,kio,cio) = 0
-                            u(2,ix, jio,kio,cio) = 0
-                            u(3,ix, jio,kio,cio) = 0
-                            u(4,ix, jio,kio,cio) = 0
-                            u(5,ix, jio,kio,cio) = 0
-                  enddo
-              enddo
-          enddo
-      enddo
-
-      return
-      end
-
diff --git a/examples/smpi/NAS/BT/copy_faces.f b/examples/smpi/NAS/BT/copy_faces.f
deleted file mode 100644
index 14b82caf83..0000000000
--- a/examples/smpi/NAS/BT/copy_faces.f
+++ /dev/null
@@ -1,316 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine copy_faces
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     
-c This function copies the face values of a variable defined on a set 
-c of cells to the overlap locations of the adjacent sets of cells. 
-c Because a set of cells interfaces in each direction with exactly one 
-c other set, we only need to fill six different buffers. We could try to 
-c overlap communication with computation, by computing
-c some internal values while communicating boundary values, but this
-c adds so much overhead that it's not clearly useful. 
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer i, j, k, c, m, requests(0:11), p0, p1, 
-     >     p2, p3, p4, p5, b_size(0:5), ss(0:5), 
-     >     sr(0:5), error, statuses(MPI_STATUS_SIZE, 0:11)
-
-c---------------------------------------------------------------------
-c     exit immediately if there are no faces to be copied           
-c---------------------------------------------------------------------
-      if (no_nodes .eq. 1) then
-         call compute_rhs
-         return
-      endif
-
-      ss(0) = start_send_east
-      ss(1) = start_send_west
-      ss(2) = start_send_north
-      ss(3) = start_send_south
-      ss(4) = start_send_top
-      ss(5) = start_send_bottom
-
-      sr(0) = start_recv_east
-      sr(1) = start_recv_west
-      sr(2) = start_recv_north
-      sr(3) = start_recv_south
-      sr(4) = start_recv_top
-      sr(5) = start_recv_bottom
-
-      b_size(0) = east_size   
-      b_size(1) = west_size   
-      b_size(2) = north_size  
-      b_size(3) = south_size  
-      b_size(4) = top_size    
-      b_size(5) = bottom_size 
-
-c---------------------------------------------------------------------
-c     because the difference stencil for the diagonalized scheme is 
-c     orthogonal, we do not have to perform the staged copying of faces, 
-c     but can send all face information simultaneously to the neighboring 
-c     cells in all directions          
-c---------------------------------------------------------------------
-      p0 = 0
-      p1 = 0
-      p2 = 0
-      p3 = 0
-      p4 = 0
-      p5 = 0
-
-      do  c = 1, ncells
-
-c---------------------------------------------------------------------
-c     fill the buffer to be sent to eastern neighbors (i-dir)
-c---------------------------------------------------------------------
-         if (cell_coord(1,c) .ne. ncells) then
-            do   k = 0, cell_size(3,c)-1
-               do   j = 0, cell_size(2,c)-1
-                  do   i = cell_size(1,c)-2, cell_size(1,c)-1
-                     do   m = 1, 5
-                        out_buffer(ss(0)+p0) = u(m,i,j,k,c)
-                        p0 = p0 + 1
-                     end do
-                  end do
-               end do
-            end do
-         endif
-
-c---------------------------------------------------------------------
-c     fill the buffer to be sent to western neighbors 
-c---------------------------------------------------------------------
-         if (cell_coord(1,c) .ne. 1) then
-            do   k = 0, cell_size(3,c)-1
-               do   j = 0, cell_size(2,c)-1
-                  do   i = 0, 1
-                     do   m = 1, 5
-                        out_buffer(ss(1)+p1) = u(m,i,j,k,c)
-                        p1 = p1 + 1
-                     end do
-                  end do
-               end do
-            end do
-
-         endif
-
-c---------------------------------------------------------------------
-c     fill the buffer to be sent to northern neighbors (j_dir)
-c---------------------------------------------------------------------
-         if (cell_coord(2,c) .ne. ncells) then
-            do   k = 0, cell_size(3,c)-1
-               do   j = cell_size(2,c)-2, cell_size(2,c)-1
-                  do   i = 0, cell_size(1,c)-1
-                     do   m = 1, 5
-                        out_buffer(ss(2)+p2) = u(m,i,j,k,c)
-                        p2 = p2 + 1
-                     end do
-                  end do
-               end do
-            end do
-         endif
-
-c---------------------------------------------------------------------
-c     fill the buffer to be sent to southern neighbors 
-c---------------------------------------------------------------------
-         if (cell_coord(2,c).ne. 1) then
-            do   k = 0, cell_size(3,c)-1
-               do   j = 0, 1
-                  do   i = 0, cell_size(1,c)-1   
-                     do   m = 1, 5
-                        out_buffer(ss(3)+p3) = u(m,i,j,k,c)
-                        p3 = p3 + 1
-                     end do
-                  end do
-               end do
-            end do
-         endif
-
-c---------------------------------------------------------------------
-c     fill the buffer to be sent to top neighbors (k-dir)
-c---------------------------------------------------------------------
-         if (cell_coord(3,c) .ne. ncells) then
-            do   k = cell_size(3,c)-2, cell_size(3,c)-1
-               do   j = 0, cell_size(2,c)-1
-                  do   i = 0, cell_size(1,c)-1
-                     do   m = 1, 5
-                        out_buffer(ss(4)+p4) = u(m,i,j,k,c)
-                        p4 = p4 + 1
-                     end do
-                  end do
-               end do
-            end do
-         endif
-
-c---------------------------------------------------------------------
-c     fill the buffer to be sent to bottom neighbors
-c---------------------------------------------------------------------
-         if (cell_coord(3,c).ne. 1) then
-            do    k=0, 1
-               do   j = 0, cell_size(2,c)-1
-                  do   i = 0, cell_size(1,c)-1
-                     do   m = 1, 5
-                        out_buffer(ss(5)+p5) = u(m,i,j,k,c)
-                        p5 = p5 + 1
-                     end do
-                  end do
-               end do
-            end do
-         endif
-
-c---------------------------------------------------------------------
-c     cell loop
-c---------------------------------------------------------------------
-      end do
-
-      call mpi_irecv(in_buffer(sr(0)), b_size(0), 
-     >     dp_type, successor(1), WEST,  
-     >     comm_rhs, requests(0), error)
-      call mpi_irecv(in_buffer(sr(1)), b_size(1), 
-     >     dp_type, predecessor(1), EAST,  
-     >     comm_rhs, requests(1), error)
-      call mpi_irecv(in_buffer(sr(2)), b_size(2), 
-     >     dp_type, successor(2), SOUTH, 
-     >     comm_rhs, requests(2), error)
-      call mpi_irecv(in_buffer(sr(3)), b_size(3), 
-     >     dp_type, predecessor(2), NORTH, 
-     >     comm_rhs, requests(3), error)
-      call mpi_irecv(in_buffer(sr(4)), b_size(4), 
-     >     dp_type, successor(3), BOTTOM,
-     >     comm_rhs, requests(4), error)
-      call mpi_irecv(in_buffer(sr(5)), b_size(5), 
-     >     dp_type, predecessor(3), TOP,   
-     >     comm_rhs, requests(5), error)
-
-      call mpi_isend(out_buffer(ss(0)), b_size(0), 
-     >     dp_type, successor(1),   EAST, 
-     >     comm_rhs, requests(6), error)
-      call mpi_isend(out_buffer(ss(1)), b_size(1), 
-     >     dp_type, predecessor(1), WEST, 
-     >     comm_rhs, requests(7), error)
-      call mpi_isend(out_buffer(ss(2)), b_size(2), 
-     >     dp_type,successor(2),   NORTH, 
-     >     comm_rhs, requests(8), error)
-      call mpi_isend(out_buffer(ss(3)), b_size(3), 
-     >     dp_type,predecessor(2), SOUTH, 
-     >     comm_rhs, requests(9), error)
-      call mpi_isend(out_buffer(ss(4)), b_size(4), 
-     >     dp_type,successor(3),   TOP, 
-     >     comm_rhs,   requests(10), error)
-      call mpi_isend(out_buffer(ss(5)), b_size(5), 
-     >     dp_type,predecessor(3), BOTTOM, 
-     >     comm_rhs,requests(11), error)
-
-
-      call mpi_waitall(12, requests, statuses, error)
-
-c---------------------------------------------------------------------
-c     unpack the data that has just been received;             
-c---------------------------------------------------------------------
-      p0 = 0
-      p1 = 0
-      p2 = 0
-      p3 = 0
-      p4 = 0
-      p5 = 0
-
-      do   c = 1, ncells
-
-         if (cell_coord(1,c) .ne. 1) then
-            do   k = 0, cell_size(3,c)-1
-               do   j = 0, cell_size(2,c)-1
-                  do   i = -2, -1
-                     do   m = 1, 5
-                        u(m,i,j,k,c) = in_buffer(sr(1)+p0)
-                        p0 = p0 + 1
-                     end do
-                  end do
-               end do
-            end do
-         endif
-
-         if (cell_coord(1,c) .ne. ncells) then
-            do  k = 0, cell_size(3,c)-1
-               do  j = 0, cell_size(2,c)-1
-                  do  i = cell_size(1,c), cell_size(1,c)+1
-                     do   m = 1, 5
-                        u(m,i,j,k,c) = in_buffer(sr(0)+p1)
-                        p1 = p1 + 1
-                     end do
-                  end do
-               end do
-            end do
-         end if
-            
-         if (cell_coord(2,c) .ne. 1) then
-            do  k = 0, cell_size(3,c)-1
-               do   j = -2, -1
-                  do  i = 0, cell_size(1,c)-1
-                     do   m = 1, 5
-                        u(m,i,j,k,c) = in_buffer(sr(3)+p2)
-                        p2 = p2 + 1
-                     end do
-                  end do
-               end do
-            end do
-
-         endif
-            
-         if (cell_coord(2,c) .ne. ncells) then
-            do  k = 0, cell_size(3,c)-1
-               do   j = cell_size(2,c), cell_size(2,c)+1
-                  do  i = 0, cell_size(1,c)-1
-                     do   m = 1, 5
-                        u(m,i,j,k,c) = in_buffer(sr(2)+p3)
-                        p3 = p3 + 1
-                     end do
-                  end do
-               end do
-            end do
-         endif
-
-         if (cell_coord(3,c) .ne. 1) then
-            do  k = -2, -1
-               do  j = 0, cell_size(2,c)-1
-                  do  i = 0, cell_size(1,c)-1
-                     do   m = 1, 5
-                        u(m,i,j,k,c) = in_buffer(sr(5)+p4)
-                        p4 = p4 + 1
-                     end do
-                  end do
-               end do
-            end do
-         endif
-
-         if (cell_coord(3,c) .ne. ncells) then
-            do  k = cell_size(3,c), cell_size(3,c)+1
-               do  j = 0, cell_size(2,c)-1
-                  do  i = 0, cell_size(1,c)-1
-                     do   m = 1, 5
-                        u(m,i,j,k,c) = in_buffer(sr(4)+p5)
-                        p5 = p5 + 1
-                     end do
-                  end do
-               end do
-            end do
-         endif
-
-c---------------------------------------------------------------------
-c     cells loop
-c---------------------------------------------------------------------
-      end do
-
-c---------------------------------------------------------------------
-c     do the rest of the rhs that uses the copied face values          
-c---------------------------------------------------------------------
-      call compute_rhs
-
-      return
-      end
diff --git a/examples/smpi/NAS/BT/define.f b/examples/smpi/NAS/BT/define.f
deleted file mode 100644
index 03c4c6edd7..0000000000
--- a/examples/smpi/NAS/BT/define.f
+++ /dev/null
@@ -1,64 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine compute_buffer_size(dim)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include 'header.h'
-
-      integer  c, dim, face_size
-
-      if (ncells .eq. 1) return
-
-c---------------------------------------------------------------------
-c     compute the actual sizes of the buffers; note that there is 
-c     always one cell face that doesn't need buffer space, because it 
-c     is at the boundary of the grid
-c---------------------------------------------------------------------
-      west_size = 0
-      east_size = 0
-
-      do   c = 1, ncells
-         face_size = cell_size(2,c) * cell_size(3,c) * dim * 2
-         if (cell_coord(1,c).ne.1) west_size = west_size + face_size
-         if (cell_coord(1,c).ne.ncells) east_size = east_size + 
-     >        face_size 
-      end do
-
-      north_size = 0
-      south_size = 0
-      do   c = 1, ncells
-         face_size = cell_size(1,c)*cell_size(3,c) * dim * 2
-         if (cell_coord(2,c).ne.1) south_size = south_size + face_size
-         if (cell_coord(2,c).ne.ncells) north_size = north_size + 
-     >        face_size 
-      end do
-
-      top_size = 0
-      bottom_size = 0
-      do   c = 1, ncells
-         face_size = cell_size(1,c) * cell_size(2,c) * dim * 2
-         if (cell_coord(3,c).ne.1) bottom_size = bottom_size + 
-     >        face_size
-         if (cell_coord(3,c).ne.ncells) top_size = top_size +
-     >        face_size     
-      end do
-
-      start_send_west   = 1
-      start_send_east   = start_send_west   + west_size
-      start_send_south  = start_send_east   + east_size
-      start_send_north  = start_send_south  + south_size
-      start_send_bottom = start_send_north  + north_size
-      start_send_top    = start_send_bottom + bottom_size
-      start_recv_west   = 1
-      start_recv_east   = start_recv_west   + west_size
-      start_recv_south  = start_recv_east   + east_size
-      start_recv_north  = start_recv_south  + south_size
-      start_recv_bottom = start_recv_north  + north_size
-      start_recv_top    = start_recv_bottom + bottom_size
-
-      return
-      end
-
diff --git a/examples/smpi/NAS/BT/epio.f b/examples/smpi/NAS/BT/epio.f
deleted file mode 100644
index 52b630999d..0000000000
--- a/examples/smpi/NAS/BT/epio.f
+++ /dev/null
@@ -1,165 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine setup_btio
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      character*(128) newfilenm
-      integer m
-
-      if (node .lt. 10000) then
-          write (newfilenm, 996) filenm,node
-      else
-          print *, 'error generating file names (> 10000 nodes)'
-          stop
-      endif
-
-996   format (a,'.',i4.4)
-
-      open (unit=99, file=newfilenm, form='unformatted',
-     $       status='unknown')
-
-      do m = 1, 5
-         xce_sub(m) = 0.d0
-      end do
-
-      idump_sub = 0
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine output_timestep
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer ix, iio, jio, kio, cio, aio
-
-      do cio=1,ncells
-          write(99)
-     $         ((((u(aio,ix, jio,kio,cio),aio=1,5),
-     $             ix=0, cell_size(1,cio)-1),
-     $             jio=0, cell_size(2,cio)-1),
-     $             kio=0, cell_size(3,cio)-1)
-      enddo
-
-      idump_sub = idump_sub + 1
-      if (rd_interval .gt. 0) then
-         if (idump_sub .ge. rd_interval) then
-
-            rewind(99)
-            call acc_sub_norms(idump+1)
-
-            rewind(99)
-            idump_sub = 0
-         endif
-      endif
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine acc_sub_norms(idump_cur)
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer idump_cur
-
-      integer ix, jio, kio, cio, ii, m, ichunk
-      double precision xce_single(5)
-
-      ichunk = idump_cur - idump_sub + 1
-      do ii=0, idump_sub-1
-        do cio=1,ncells
-          read(99)
-     $         ((((u(m,ix, jio,kio,cio),m=1,5),
-     $             ix=0, cell_size(1,cio)-1),
-     $             jio=0, cell_size(2,cio)-1),
-     $             kio=0, cell_size(3,cio)-1)
-        enddo
-
-        if (node .eq. root) print *, 'Reading data set ', ii+ichunk
-
-        call error_norm(xce_single)
-        do m = 1, 5
-           xce_sub(m) = xce_sub(m) + xce_single(m)
-        end do
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine btio_cleanup
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      close(unit=99)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine accumulate_norms(xce_acc)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      double precision xce_acc(5)
-
-      character*(128) newfilenm
-      integer m
-
-      if (rd_interval .gt. 0) goto 20
-
-      if (node .lt. 10000) then
-          write (newfilenm, 996) filenm,node
-      else
-          print *, 'error generating file names (> 10000 nodes)'
-          stop
-      endif
-
-996   format (a,'.',i4.4)
-
-      open (unit=99, file=newfilenm,
-     $      form='unformatted')
-
-c     clear the last time step
-
-      call clear_timestep
-
-c     read back the time steps and accumulate norms
-
-      call acc_sub_norms(idump)
-
-      close(unit=99)
-
- 20   continue
-      do m = 1, 5
-         xce_acc(m) = xce_sub(m) / dble(idump)
-      end do
-
-      return
-      end
diff --git a/examples/smpi/NAS/BT/error.f b/examples/smpi/NAS/BT/error.f
deleted file mode 100644
index 147a582b58..0000000000
--- a/examples/smpi/NAS/BT/error.f
+++ /dev/null
@@ -1,106 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine error_norm(rms)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     this function computes the norm of the difference between the
-c     computed solution and the exact solution
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer c, i, j, k, m, ii, jj, kk, d, error
-      double precision xi, eta, zeta, u_exact(5), rms(5), rms_work(5),
-     >     add
-
-      do m = 1, 5 
-         rms_work(m) = 0.0d0
-      enddo
-
-      do c = 1, ncells
-         kk = 0
-         do k = cell_low(3,c), cell_high(3,c)
-            zeta = dble(k) * dnzm1
-            jj = 0
-            do j = cell_low(2,c), cell_high(2,c)
-               eta = dble(j) * dnym1
-               ii = 0
-               do i = cell_low(1,c), cell_high(1,c)
-                  xi = dble(i) * dnxm1
-                  call exact_solution(xi, eta, zeta, u_exact)
-
-                  do m = 1, 5
-                     add = u(m,ii,jj,kk,c)-u_exact(m)
-                     rms_work(m) = rms_work(m) + add*add
-                  enddo
-                  ii = ii + 1
-               enddo
-               jj = jj + 1
-            enddo
-            kk = kk + 1
-         enddo
-      enddo
-
-      call mpi_allreduce(rms_work, rms, 5, dp_type, 
-     >     MPI_SUM, comm_setup, error)
-
-      do m = 1, 5
-         do d = 1, 3
-            rms(m) = rms(m) / dble(grid_points(d)-2)
-         enddo
-         rms(m) = dsqrt(rms(m))
-      enddo
-
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine rhs_norm(rms)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer c, i, j, k, d, m, error
-      double precision rms(5), rms_work(5), add
-
-      do m = 1, 5
-         rms_work(m) = 0.0d0
-      enddo 
-
-      do c = 1, ncells
-         do k = start(3,c), cell_size(3,c)-end(3,c)-1
-            do j = start(2,c), cell_size(2,c)-end(2,c)-1
-               do i = start(1,c), cell_size(1,c)-end(1,c)-1
-                  do m = 1, 5
-                     add = rhs(m,i,j,k,c)
-                     rms_work(m) = rms_work(m) + add*add
-                  enddo 
-               enddo 
-            enddo 
-         enddo 
-      enddo 
-
-      call mpi_allreduce(rms_work, rms, 5, dp_type, 
-     >     MPI_SUM, comm_setup, error)
-
-      do m = 1, 5
-         do d = 1, 3
-            rms(m) = rms(m) / dble(grid_points(d)-2)
-         enddo 
-         rms(m) = dsqrt(rms(m))
-      enddo 
-
-      return
-      end
-
diff --git a/examples/smpi/NAS/BT/exact_rhs.f b/examples/smpi/NAS/BT/exact_rhs.f
deleted file mode 100644
index 26a2871d20..0000000000
--- a/examples/smpi/NAS/BT/exact_rhs.f
+++ /dev/null
@@ -1,360 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine exact_rhs
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     compute the right hand side based on exact solution
-c---------------------------------------------------------------------
-
-      include 'header.h'
-
-      double precision dtemp(5), xi, eta, zeta, dtpp
-      integer          c, m, i, j, k, ip1, im1, jp1, 
-     >     jm1, km1, kp1
-
-
-c---------------------------------------------------------------------
-c     loop over all cells owned by this node                   
-c---------------------------------------------------------------------
-      do c = 1, ncells
-
-c---------------------------------------------------------------------
-c     initialize                                  
-c---------------------------------------------------------------------
-         do k= 0, cell_size(3,c)-1
-            do j = 0, cell_size(2,c)-1
-               do i = 0, cell_size(1,c)-1
-                  do m = 1, 5
-                     forcing(m,i,j,k,c) = 0.0d0
-                  enddo
-               enddo
-            enddo
-         enddo
-
-c---------------------------------------------------------------------
-c     xi-direction flux differences                      
-c---------------------------------------------------------------------
-         do k = start(3,c), cell_size(3,c)-end(3,c)-1
-            zeta = dble(k+cell_low(3,c)) * dnzm1
-            do j = start(2,c), cell_size(2,c)-end(2,c)-1
-               eta = dble(j+cell_low(2,c)) * dnym1
-
-               do i=-2*(1-start(1,c)), cell_size(1,c)+1-2*end(1,c)
-                  xi = dble(i+cell_low(1,c)) * dnxm1
-
-                  call exact_solution(xi, eta, zeta, dtemp)
-                  do m = 1, 5
-                     ue(i,m) = dtemp(m)
-                  enddo
-
-                  dtpp = 1.0d0 / dtemp(1)
-
-                  do m = 2, 5
-                     buf(i,m) = dtpp * dtemp(m)
-                  enddo
-
-                  cuf(i)   = buf(i,2) * buf(i,2)
-                  buf(i,1) = cuf(i) + buf(i,3) * buf(i,3) + 
-     >                 buf(i,4) * buf(i,4) 
-                  q(i) = 0.5d0*(buf(i,2)*ue(i,2) + buf(i,3)*ue(i,3) +
-     >                 buf(i,4)*ue(i,4))
-
-               enddo
-               
-               do i = start(1,c), cell_size(1,c)-end(1,c)-1
-                  im1 = i-1
-                  ip1 = i+1
-
-                  forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
-     >                 tx2*( ue(ip1,2)-ue(im1,2) )+
-     >                 dx1tx1*(ue(ip1,1)-2.0d0*ue(i,1)+ue(im1,1))
-
-                  forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - tx2 * (
-     >                 (ue(ip1,2)*buf(ip1,2)+c2*(ue(ip1,5)-q(ip1)))-
-     >                 (ue(im1,2)*buf(im1,2)+c2*(ue(im1,5)-q(im1))))+
-     >                 xxcon1*(buf(ip1,2)-2.0d0*buf(i,2)+buf(im1,2))+
-     >                 dx2tx1*( ue(ip1,2)-2.0d0* ue(i,2)+ue(im1,2))
-
-                  forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - tx2 * (
-     >                 ue(ip1,3)*buf(ip1,2)-ue(im1,3)*buf(im1,2))+
-     >                 xxcon2*(buf(ip1,3)-2.0d0*buf(i,3)+buf(im1,3))+
-     >                 dx3tx1*( ue(ip1,3)-2.0d0*ue(i,3) +ue(im1,3))
-                  
-                  forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - tx2*(
-     >                 ue(ip1,4)*buf(ip1,2)-ue(im1,4)*buf(im1,2))+
-     >                 xxcon2*(buf(ip1,4)-2.0d0*buf(i,4)+buf(im1,4))+
-     >                 dx4tx1*( ue(ip1,4)-2.0d0* ue(i,4)+ ue(im1,4))
-
-                  forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - tx2*(
-     >                 buf(ip1,2)*(c1*ue(ip1,5)-c2*q(ip1))-
-     >                 buf(im1,2)*(c1*ue(im1,5)-c2*q(im1)))+
-     >                 0.5d0*xxcon3*(buf(ip1,1)-2.0d0*buf(i,1)+
-     >                 buf(im1,1))+
-     >                 xxcon4*(cuf(ip1)-2.0d0*cuf(i)+cuf(im1))+
-     >                 xxcon5*(buf(ip1,5)-2.0d0*buf(i,5)+buf(im1,5))+
-     >                 dx5tx1*( ue(ip1,5)-2.0d0* ue(i,5)+ ue(im1,5))
-               enddo
-
-c---------------------------------------------------------------------
-c     Fourth-order dissipation                         
-c---------------------------------------------------------------------
-               if (start(1,c) .gt. 0) then
-                  do m = 1, 5
-                     i = 1
-                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
-     >                    (5.0d0*ue(i,m) - 4.0d0*ue(i+1,m) +ue(i+2,m))
-                     i = 2
-                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
-     >                    (-4.0d0*ue(i-1,m) + 6.0d0*ue(i,m) -
-     >                    4.0d0*ue(i+1,m) +       ue(i+2,m))
-                  enddo
-               endif
-
-               do i = start(1,c)*3, cell_size(1,c)-3*end(1,c)-1
-                  do m = 1, 5
-                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
-     >                    (ue(i-2,m) - 4.0d0*ue(i-1,m) +
-     >                    6.0d0*ue(i,m) - 4.0d0*ue(i+1,m) + ue(i+2,m))
-                  enddo
-               enddo
-
-               if (end(1,c) .gt. 0) then
-                  do m = 1, 5
-                     i = cell_size(1,c)-3
-                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
-     >                    (ue(i-2,m) - 4.0d0*ue(i-1,m) +
-     >                    6.0d0*ue(i,m) - 4.0d0*ue(i+1,m))
-                     i = cell_size(1,c)-2
-                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
-     >                    (ue(i-2,m) - 4.0d0*ue(i-1,m) + 5.0d0*ue(i,m))
-                  enddo
-               endif
-
-            enddo
-         enddo
-
-c---------------------------------------------------------------------
-c     eta-direction flux differences             
-c---------------------------------------------------------------------
-         do k = start(3,c), cell_size(3,c)-end(3,c)-1          
-            zeta = dble(k+cell_low(3,c)) * dnzm1
-            do i=start(1,c), cell_size(1,c)-end(1,c)-1
-               xi = dble(i+cell_low(1,c)) * dnxm1
-
-               do j=-2*(1-start(2,c)), cell_size(2,c)+1-2*end(2,c)
-                  eta = dble(j+cell_low(2,c)) * dnym1
-
-                  call exact_solution(xi, eta, zeta, dtemp)
-                  do m = 1, 5 
-                     ue(j,m) = dtemp(m)
-                  enddo
-                  
-                  dtpp = 1.0d0/dtemp(1)
-
-                  do m = 2, 5
-                     buf(j,m) = dtpp * dtemp(m)
-                  enddo
-
-                  cuf(j)   = buf(j,3) * buf(j,3)
-                  buf(j,1) = cuf(j) + buf(j,2) * buf(j,2) + 
-     >                 buf(j,4) * buf(j,4)
-                  q(j) = 0.5d0*(buf(j,2)*ue(j,2) + buf(j,3)*ue(j,3) +
-     >                 buf(j,4)*ue(j,4))
-               enddo
-
-               do j = start(2,c), cell_size(2,c)-end(2,c)-1
-                  jm1 = j-1
-                  jp1 = j+1
-                  
-                  forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
-     >                 ty2*( ue(jp1,3)-ue(jm1,3) )+
-     >                 dy1ty1*(ue(jp1,1)-2.0d0*ue(j,1)+ue(jm1,1))
-
-                  forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - ty2*(
-     >                 ue(jp1,2)*buf(jp1,3)-ue(jm1,2)*buf(jm1,3))+
-     >                 yycon2*(buf(jp1,2)-2.0d0*buf(j,2)+buf(jm1,2))+
-     >                 dy2ty1*( ue(jp1,2)-2.0* ue(j,2)+ ue(jm1,2))
-
-                  forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - ty2*(
-     >                 (ue(jp1,3)*buf(jp1,3)+c2*(ue(jp1,5)-q(jp1)))-
-     >                 (ue(jm1,3)*buf(jm1,3)+c2*(ue(jm1,5)-q(jm1))))+
-     >                 yycon1*(buf(jp1,3)-2.0d0*buf(j,3)+buf(jm1,3))+
-     >                 dy3ty1*( ue(jp1,3)-2.0d0*ue(j,3) +ue(jm1,3))
-
-                  forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - ty2*(
-     >                 ue(jp1,4)*buf(jp1,3)-ue(jm1,4)*buf(jm1,3))+
-     >                 yycon2*(buf(jp1,4)-2.0d0*buf(j,4)+buf(jm1,4))+
-     >                 dy4ty1*( ue(jp1,4)-2.0d0*ue(j,4)+ ue(jm1,4))
-
-                  forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - ty2*(
-     >                 buf(jp1,3)*(c1*ue(jp1,5)-c2*q(jp1))-
-     >                 buf(jm1,3)*(c1*ue(jm1,5)-c2*q(jm1)))+
-     >                 0.5d0*yycon3*(buf(jp1,1)-2.0d0*buf(j,1)+
-     >                 buf(jm1,1))+
-     >                 yycon4*(cuf(jp1)-2.0d0*cuf(j)+cuf(jm1))+
-     >                 yycon5*(buf(jp1,5)-2.0d0*buf(j,5)+buf(jm1,5))+
-     >                 dy5ty1*(ue(jp1,5)-2.0d0*ue(j,5)+ue(jm1,5))
-               enddo
-
-c---------------------------------------------------------------------
-c     Fourth-order dissipation                      
-c---------------------------------------------------------------------
-               if (start(2,c) .gt. 0) then
-                  do m = 1, 5
-                     j = 1
-                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
-     >                    (5.0d0*ue(j,m) - 4.0d0*ue(j+1,m) +ue(j+2,m))
-                     j = 2
-                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
-     >                    (-4.0d0*ue(j-1,m) + 6.0d0*ue(j,m) -
-     >                    4.0d0*ue(j+1,m) +       ue(j+2,m))
-                  enddo
-               endif
-
-               do j = start(2,c)*3, cell_size(2,c)-3*end(2,c)-1
-                  do m = 1, 5
-                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
-     >                    (ue(j-2,m) - 4.0d0*ue(j-1,m) +
-     >                    6.0d0*ue(j,m) - 4.0d0*ue(j+1,m) + ue(j+2,m))
-                  enddo
-               enddo
-
-               if (end(2,c) .gt. 0) then
-                  do m = 1, 5
-                     j = cell_size(2,c)-3
-                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
-     >                    (ue(j-2,m) - 4.0d0*ue(j-1,m) +
-     >                    6.0d0*ue(j,m) - 4.0d0*ue(j+1,m))
-                     j = cell_size(2,c)-2
-                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
-     >                    (ue(j-2,m) - 4.0d0*ue(j-1,m) + 5.0d0*ue(j,m))
-
-                  enddo
-               endif
-
-            enddo
-         enddo
-
-c---------------------------------------------------------------------
-c     zeta-direction flux differences                      
-c---------------------------------------------------------------------
-         do j=start(2,c), cell_size(2,c)-end(2,c)-1
-            eta = dble(j+cell_low(2,c)) * dnym1
-            do i = start(1,c), cell_size(1,c)-end(1,c)-1
-               xi = dble(i+cell_low(1,c)) * dnxm1
-
-               do k=-2*(1-start(3,c)), cell_size(3,c)+1-2*end(3,c)
-                  zeta = dble(k+cell_low(3,c)) * dnzm1
-
-                  call exact_solution(xi, eta, zeta, dtemp)
-                  do m = 1, 5
-                     ue(k,m) = dtemp(m)
-                  enddo
-
-                  dtpp = 1.0d0/dtemp(1)
-
-                  do m = 2, 5
-                     buf(k,m) = dtpp * dtemp(m)
-                  enddo
-
-                  cuf(k)   = buf(k,4) * buf(k,4)
-                  buf(k,1) = cuf(k) + buf(k,2) * buf(k,2) + 
-     >                 buf(k,3) * buf(k,3)
-                  q(k) = 0.5d0*(buf(k,2)*ue(k,2) + buf(k,3)*ue(k,3) +
-     >                 buf(k,4)*ue(k,4))
-               enddo
-
-               do k=start(3,c), cell_size(3,c)-end(3,c)-1
-                  km1 = k-1
-                  kp1 = k+1
-                  
-                  forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
-     >                 tz2*( ue(kp1,4)-ue(km1,4) )+
-     >                 dz1tz1*(ue(kp1,1)-2.0d0*ue(k,1)+ue(km1,1))
-
-                  forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - tz2 * (
-     >                 ue(kp1,2)*buf(kp1,4)-ue(km1,2)*buf(km1,4))+
-     >                 zzcon2*(buf(kp1,2)-2.0d0*buf(k,2)+buf(km1,2))+
-     >                 dz2tz1*( ue(kp1,2)-2.0d0* ue(k,2)+ ue(km1,2))
-
-                  forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - tz2 * (
-     >                 ue(kp1,3)*buf(kp1,4)-ue(km1,3)*buf(km1,4))+
-     >                 zzcon2*(buf(kp1,3)-2.0d0*buf(k,3)+buf(km1,3))+
-     >                 dz3tz1*(ue(kp1,3)-2.0d0*ue(k,3)+ue(km1,3))
-
-                  forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - tz2 * (
-     >                 (ue(kp1,4)*buf(kp1,4)+c2*(ue(kp1,5)-q(kp1)))-
-     >                 (ue(km1,4)*buf(km1,4)+c2*(ue(km1,5)-q(km1))))+
-     >                 zzcon1*(buf(kp1,4)-2.0d0*buf(k,4)+buf(km1,4))+
-     >                 dz4tz1*( ue(kp1,4)-2.0d0*ue(k,4) +ue(km1,4))
-
-                  forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - tz2 * (
-     >                 buf(kp1,4)*(c1*ue(kp1,5)-c2*q(kp1))-
-     >                 buf(km1,4)*(c1*ue(km1,5)-c2*q(km1)))+
-     >                 0.5d0*zzcon3*(buf(kp1,1)-2.0d0*buf(k,1)
-     >                 +buf(km1,1))+
-     >                 zzcon4*(cuf(kp1)-2.0d0*cuf(k)+cuf(km1))+
-     >                 zzcon5*(buf(kp1,5)-2.0d0*buf(k,5)+buf(km1,5))+
-     >                 dz5tz1*( ue(kp1,5)-2.0d0*ue(k,5)+ ue(km1,5))
-               enddo
-
-c---------------------------------------------------------------------
-c     Fourth-order dissipation                        
-c---------------------------------------------------------------------
-               if (start(3,c) .gt. 0) then
-                  do m = 1, 5
-                     k = 1
-                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
-     >                    (5.0d0*ue(k,m) - 4.0d0*ue(k+1,m) +ue(k+2,m))
-                     k = 2
-                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
-     >                    (-4.0d0*ue(k-1,m) + 6.0d0*ue(k,m) -
-     >                    4.0d0*ue(k+1,m) +       ue(k+2,m))
-                  enddo
-               endif
-
-               do k = start(3,c)*3, cell_size(3,c)-3*end(3,c)-1
-                  do m = 1, 5
-                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
-     >                    (ue(k-2,m) - 4.0d0*ue(k-1,m) +
-     >                    6.0d0*ue(k,m) - 4.0d0*ue(k+1,m) + ue(k+2,m))
-                  enddo
-               enddo
-
-               if (end(3,c) .gt. 0) then
-                  do m = 1, 5
-                     k = cell_size(3,c)-3
-                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
-     >                    (ue(k-2,m) - 4.0d0*ue(k-1,m) +
-     >                    6.0d0*ue(k,m) - 4.0d0*ue(k+1,m))
-                     k = cell_size(3,c)-2
-                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
-     >                    (ue(k-2,m) - 4.0d0*ue(k-1,m) + 5.0d0*ue(k,m))
-                  enddo
-               endif
-
-            enddo
-         enddo
-
-c---------------------------------------------------------------------
-c     now change the sign of the forcing function, 
-c---------------------------------------------------------------------
-         do k = start(3,c), cell_size(3,c)-end(3,c)-1
-            do j = start(2,c), cell_size(2,c)-end(2,c)-1
-               do i = start(1,c), cell_size(1,c)-end(1,c)-1
-                  do m = 1, 5
-                     forcing(m,i,j,k,c) = -1.d0 * forcing(m,i,j,k,c)
-                  enddo
-               enddo
-            enddo
-         enddo
-
-      enddo
-
-      return
-      end
diff --git a/examples/smpi/NAS/BT/exact_solution.f b/examples/smpi/NAS/BT/exact_solution.f
deleted file mode 100644
index b093b46d16..0000000000
--- a/examples/smpi/NAS/BT/exact_solution.f
+++ /dev/null
@@ -1,29 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine exact_solution(xi,eta,zeta,dtemp)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     this function returns the exact solution at point xi, eta, zeta  
-c---------------------------------------------------------------------
-
-      include 'header.h'
-
-      double precision  xi, eta, zeta, dtemp(5)
-      integer m
-
-      do m = 1, 5
-         dtemp(m) =  ce(m,1) +
-     >     xi*(ce(m,2) + xi*(ce(m,5) + xi*(ce(m,8) + xi*ce(m,11)))) +
-     >     eta*(ce(m,3) + eta*(ce(m,6) + eta*(ce(m,9) + eta*ce(m,12))))+
-     >     zeta*(ce(m,4) + zeta*(ce(m,7) + zeta*(ce(m,10) + 
-     >     zeta*ce(m,13))))
-      enddo
-
-      return
-      end
-
-
diff --git a/examples/smpi/NAS/BT/fortran_io.f b/examples/smpi/NAS/BT/fortran_io.f
deleted file mode 100644
index d3085a030a..0000000000
--- a/examples/smpi/NAS/BT/fortran_io.f
+++ /dev/null
@@ -1,174 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine setup_btio
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      character*(128) newfilenm
-      integer m, ierr
-
-      if (node.eq.root) record_length = 40/fortran_rec_sz
-      call mpi_bcast(record_length, 1, MPI_INTEGER,
-     >                root, comm_setup, ierr)
-
-      open (unit=99, file=filenm,
-     $      form='unformatted', access='direct',
-     $      recl=record_length)
-
-      do m = 1, 5
-         xce_sub(m) = 0.d0
-      end do
-
-      idump_sub = 0
-
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine output_timestep
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer ix, jio, kio, cio
-
-      do cio=1,ncells
-          do kio=0, cell_size(3,cio)-1
-              do jio=0, cell_size(2,cio)-1
-                  iseek=(cell_low(1,cio) +
-     $                   PROBLEM_SIZE*((cell_low(2,cio)+jio) +
-     $                   PROBLEM_SIZE*((cell_low(3,cio)+kio) +
-     $                   PROBLEM_SIZE*idump_sub)))
-
-                  do ix=0,cell_size(1,cio)-1
-                      write(99, rec=iseek+ix+1)
-     $                      u(1,ix, jio,kio,cio),
-     $                      u(2,ix, jio,kio,cio),
-     $                      u(3,ix, jio,kio,cio),
-     $                      u(4,ix, jio,kio,cio),
-     $                      u(5,ix, jio,kio,cio)
-                  enddo
-              enddo
-          enddo
-      enddo
-
-      idump_sub = idump_sub + 1
-      if (rd_interval .gt. 0) then
-         if (idump_sub .ge. rd_interval) then
-
-            call acc_sub_norms(idump+1)
-
-            idump_sub = 0
-         endif
-      endif
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine acc_sub_norms(idump_cur)
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer idump_cur
-
-      integer ix, jio, kio, cio, ii, m, ichunk
-      double precision xce_single(5)
-
-      ichunk = idump_cur - idump_sub + 1
-      do ii=0, idump_sub-1
-        do cio=1,ncells
-          do kio=0, cell_size(3,cio)-1
-              do jio=0, cell_size(2,cio)-1
-                  iseek=(cell_low(1,cio) +
-     $                   PROBLEM_SIZE*((cell_low(2,cio)+jio) +
-     $                   PROBLEM_SIZE*((cell_low(3,cio)+kio) +
-     $                   PROBLEM_SIZE*ii)))
-
-
-                  do ix=0,cell_size(1,cio)-1
-                      read(99, rec=iseek+ix+1)
-     $                      u(1,ix, jio,kio,cio),
-     $                      u(2,ix, jio,kio,cio),
-     $                      u(3,ix, jio,kio,cio),
-     $                      u(4,ix, jio,kio,cio),
-     $                      u(5,ix, jio,kio,cio)
-                  enddo
-              enddo
-          enddo
-        enddo
-
-        if (node .eq. root) print *, 'Reading data set ', ii+ichunk
-
-        call error_norm(xce_single)
-        do m = 1, 5
-           xce_sub(m) = xce_sub(m) + xce_single(m)
-        end do
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine btio_cleanup
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      close(unit=99)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine accumulate_norms(xce_acc)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      include 'header.h'
-      include 'mpinpb.h'
-
-      double precision xce_acc(5)
-      integer m
-
-      if (rd_interval .gt. 0) goto 20
-
-      open (unit=99, file=filenm,
-     $      form='unformatted', access='direct',
-     $      recl=record_length)
-
-c     clear the last time step
-
-      call clear_timestep
-
-c     read back the time steps and accumulate norms
-
-      call acc_sub_norms(idump)
-
-      close(unit=99)
-
- 20   continue
-      do m = 1, 5
-         xce_acc(m) = xce_sub(m) / dble(idump)
-      end do
-
-      return
-      end
diff --git a/examples/smpi/NAS/BT/full_mpiio.f b/examples/smpi/NAS/BT/full_mpiio.f
deleted file mode 100644
index ecfd41ca73..0000000000
--- a/examples/smpi/NAS/BT/full_mpiio.f
+++ /dev/null
@@ -1,307 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine setup_btio
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer ierr
-      integer mstatus(MPI_STATUS_SIZE)
-      integer sizes(4), starts(4), subsizes(4)
-      integer cell_btype(maxcells), cell_ftype(maxcells)
-      integer cell_blength(maxcells)
-      integer info
-      character*20 cb_nodes, cb_size
-      integer c, m
-      integer cell_disp(maxcells)
-
-       call mpi_bcast(collbuf_nodes, 1, MPI_INTEGER,
-     >                root, comm_setup, ierr)
-
-       call mpi_bcast(collbuf_size, 1, MPI_INTEGER,
-     >                root, comm_setup, ierr)
-
-       if (collbuf_nodes .eq. 0) then
-          info = MPI_INFO_NULL
-       else
-          write (cb_nodes,*) collbuf_nodes
-          write (cb_size,*) collbuf_size
-          call MPI_Info_create(info, ierr)
-          call MPI_Info_set(info, 'cb_nodes', cb_nodes, ierr)
-          call MPI_Info_set(info, 'cb_buffer_size', cb_size, ierr)
-          call MPI_Info_set(info, 'collective_buffering', 'true', ierr)
-       endif
-
-       call MPI_Type_contiguous(5, MPI_DOUBLE_PRECISION,
-     $                          element, ierr)
-       call MPI_Type_commit(element, ierr)
-       call MPI_Type_extent(element, eltext, ierr)
-
-       do  c = 1, ncells
-c
-c Outer array dimensions ar same for every cell
-c
-           sizes(1) = IMAX+4
-           sizes(2) = JMAX+4
-           sizes(3) = KMAX+4
-c
-c 4th dimension is cell number, total of maxcells cells
-c
-           sizes(4) = maxcells
-c
-c Internal dimensions of cells can differ slightly between cells
-c
-           subsizes(1) = cell_size(1, c)
-           subsizes(2) = cell_size(2, c)
-           subsizes(3) = cell_size(3, c)
-c
-c Cell is 4th dimension, 1 cell per cell type to handle varying 
-c cell sub-array sizes
-c
-           subsizes(4) = 1
-
-c
-c type constructors use 0-based start addresses
-c
-           starts(1) = 2 
-           starts(2) = 2
-           starts(3) = 2
-           starts(4) = c-1
-
-c 
-c Create buftype for a cell
-c
-           call MPI_Type_create_subarray(4, sizes, subsizes, 
-     $          starts, MPI_ORDER_FORTRAN, element, 
-     $          cell_btype(c), ierr)
-c
-c block length and displacement for joining cells - 
-c 1 cell buftype per block, cell buftypes have own displacment
-c generated from cell number (4th array dimension)
-c
-           cell_blength(c) = 1
-           cell_disp(c) = 0
-
-       enddo
-c
-c Create combined buftype for all cells
-c
-       call MPI_Type_struct(ncells, cell_blength, cell_disp,
-     $            cell_btype, combined_btype, ierr)
-       call MPI_Type_commit(combined_btype, ierr)
-
-       do  c = 1, ncells
-c
-c Entire array size
-c
-           sizes(1) = PROBLEM_SIZE
-           sizes(2) = PROBLEM_SIZE
-           sizes(3) = PROBLEM_SIZE
-
-c
-c Size of c'th cell
-c
-           subsizes(1) = cell_size(1, c)
-           subsizes(2) = cell_size(2, c)
-           subsizes(3) = cell_size(3, c)
-
-c
-c Starting point in full array of c'th cell
-c
-           starts(1) = cell_low(1,c)
-           starts(2) = cell_low(2,c)
-           starts(3) = cell_low(3,c)
-
-           call MPI_Type_create_subarray(3, sizes, subsizes,
-     $          starts, MPI_ORDER_FORTRAN,
-     $          element, cell_ftype(c), ierr)
-           cell_blength(c) = 1
-           cell_disp(c) = 0
-       enddo
-
-       call MPI_Type_struct(ncells, cell_blength, cell_disp,
-     $            cell_ftype, combined_ftype, ierr)
-       call MPI_Type_commit(combined_ftype, ierr)
-
-       iseek=0
-       if (node .eq. root) then
-          call MPI_File_delete(filenm, MPI_INFO_NULL, ierr)
-       endif
-
-
-      call MPI_Barrier(comm_solve, ierr)
-
-       call MPI_File_open(comm_solve,
-     $          filenm,
-     $          MPI_MODE_RDWR+MPI_MODE_CREATE,
-     $          MPI_INFO_NULL, fp, ierr)
-
-       if (ierr .ne. MPI_SUCCESS) then
-                print *, 'Error opening file'
-                stop
-       endif
-
-        call MPI_File_set_view(fp, iseek, element, 
-     $          combined_ftype, 'native', info, ierr)
-
-       if (ierr .ne. MPI_SUCCESS) then
-                print *, 'Error setting file view'
-                stop
-       endif
-
-      do m = 1, 5
-         xce_sub(m) = 0.d0
-      end do
-
-      idump_sub = 0
-
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine output_timestep
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer mstatus(MPI_STATUS_SIZE)
-      integer ierr
-
-      call MPI_File_write_at_all(fp, iseek, u,
-     $                           1, combined_btype, mstatus, ierr)
-      if (ierr .ne. MPI_SUCCESS) then
-          print *, 'Error writing to file'
-          stop
-      endif
-
-      call MPI_Type_size(combined_btype, iosize, ierr)
-      iseek = iseek + iosize/eltext
-
-      idump_sub = idump_sub + 1
-      if (rd_interval .gt. 0) then
-         if (idump_sub .ge. rd_interval) then
-
-            iseek = 0
-            call acc_sub_norms(idump+1)
-
-            iseek = 0
-            idump_sub = 0
-         endif
-      endif
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine acc_sub_norms(idump_cur)
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer idump_cur
-
-      integer ii, m, ichunk
-      integer ierr
-      integer mstatus(MPI_STATUS_SIZE)
-      double precision xce_single(5)
-
-      ichunk = idump_cur - idump_sub + 1
-      do ii=0, idump_sub-1
-
-        call MPI_File_read_at_all(fp, iseek, u,
-     $                           1, combined_btype, mstatus, ierr)
-        if (ierr .ne. MPI_SUCCESS) then
-           print *, 'Error reading back file'
-           call MPI_File_close(fp, ierr)
-           stop
-        endif
-
-        call MPI_Type_size(combined_btype, iosize, ierr)
-        iseek = iseek + iosize/eltext
-
-        if (node .eq. root) print *, 'Reading data set ', ii+ichunk
-
-        call error_norm(xce_single)
-        do m = 1, 5
-           xce_sub(m) = xce_sub(m) + xce_single(m)
-        end do
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine btio_cleanup
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer ierr
-
-      call MPI_File_close(fp, ierr)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-
-      subroutine accumulate_norms(xce_acc)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      double precision xce_acc(5)
-      integer m, ierr
-
-      if (rd_interval .gt. 0) goto 20
-
-      call MPI_File_open(comm_solve,
-     $          filenm,
-     $          MPI_MODE_RDONLY,
-     $          MPI_INFO_NULL,
-     $          fp,
-     $          ierr)
-
-      iseek = 0
-      call MPI_File_set_view(fp, iseek, element, combined_ftype,
-     $          'native', MPI_INFO_NULL, ierr)
-
-c     clear the last time step
-
-      call clear_timestep
-
-c     read back the time steps and accumulate norms
-
-      call acc_sub_norms(idump)
-
-      call MPI_File_close(fp, ierr)
-
- 20   continue
-      do m = 1, 5
-         xce_acc(m) = xce_sub(m) / dble(idump)
-      end do
-
-      return
-      end
-
diff --git a/examples/smpi/NAS/BT/header.h b/examples/smpi/NAS/BT/header.h
deleted file mode 100644
index 47719da674..0000000000
--- a/examples/smpi/NAS/BT/header.h
+++ /dev/null
@@ -1,137 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-c
-c  header.h
-c
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
- 
-      implicit none
-
-c---------------------------------------------------------------------
-c The following include file is generated automatically by the
-c "setparams" utility. It defines 
-c      maxcells:      the square root of the maximum number of processors
-c      problem_size:  12, 64, 102, 162 (for class T, A, B, C)
-c      dt_default:    default time step for this problem size if no
-c                     config file
-c      niter_default: default number of iterations for this problem size
-c---------------------------------------------------------------------
-
-      include 'npbparams.h'
-
-      integer           aa, bb, cc, BLOCK_SIZE
-      parameter (aa=1, bb=2, cc=3, BLOCK_SIZE=5)
-
-      integer           ncells, grid_points(3)
-      double precision  elapsed_time
-      common /global/   elapsed_time, ncells, grid_points
-
-      double precision  tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, 
-     >                  dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, 
-     >                  dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, 
-     >                  ce(5,13), dxmax, dymax, dzmax, xxcon1, xxcon2, 
-     >                  xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
-     >                  dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
-     >                  yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
-     >                  zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, 
-     >                  dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, 
-     >                  dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, 
-     >                  c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
-     >                  dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, 
-     >                  c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, 
-     >                  c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16
-
-      common /constants/ tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3,
-     >                  dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, 
-     >                  dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, 
-     >                  ce, dxmax, dymax, dzmax, xxcon1, xxcon2, 
-     >                  xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
-     >                  dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
-     >                  yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
-     >                  zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, 
-     >                  dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, 
-     >                  dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, 
-     >                  c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
-     >                  dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, 
-     >                  c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, 
-     >                  c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16
-
-      integer           EAST, WEST, NORTH, SOUTH, 
-     >                  BOTTOM, TOP
-
-      parameter (EAST=2000, WEST=3000,      NORTH=4000, SOUTH=5000,
-     >           BOTTOM=6000, TOP=7000)
-
-      integer cell_coord (3,maxcells), cell_low (3,maxcells), 
-     >        cell_high  (3,maxcells), cell_size(3,maxcells),
-     >        predecessor(3),          slice    (3,maxcells),
-     >        grid_size  (3),          successor(3)         ,
-     >        start      (3,maxcells), end      (3,maxcells)
-      common /partition/ cell_coord, cell_low, cell_high, cell_size,
-     >                   grid_size, successor, predecessor, slice,
-     >                   start, end
-
-      integer IMAX, JMAX, KMAX, MAX_CELL_DIM, BUF_SIZE
-
-      parameter (MAX_CELL_DIM = (problem_size/maxcells)+1)
-
-      parameter (IMAX=MAX_CELL_DIM,JMAX=MAX_CELL_DIM,KMAX=MAX_CELL_DIM)
-
-      parameter (BUF_SIZE=MAX_CELL_DIM*MAX_CELL_DIM*(maxcells-1)*60+1)
-
-      double precision 
-     >   us      (    -1:IMAX,  -1:JMAX,  -1:KMAX,   maxcells),
-     >   vs      (    -1:IMAX,  -1:JMAX,  -1:KMAX,   maxcells),
-     >   ws      (    -1:IMAX,  -1:JMAX,  -1:KMAX,   maxcells),
-     >   qs      (    -1:IMAX,  -1:JMAX,  -1:KMAX,   maxcells),
-     >   rho_i   (    -1:IMAX,  -1:JMAX,  -1:KMAX,   maxcells),
-     >   square  (    -1:IMAX,  -1:JMAX,  -1:KMAX,   maxcells),
-     >   forcing (5,   0:IMAX-1, 0:JMAX-1, 0:KMAX-1, maxcells),
-     >   u       (5,  -2:IMAX+1,-2:JMAX+1,-2:KMAX+1, maxcells),
-     >   rhs     (5,  -1:IMAX-1,-1:JMAX-1,-1:KMAX-1, maxcells),
-     >   lhsc    (5,5,-1:IMAX-1,-1:JMAX-1,-1:KMAX-1, maxcells),
-     >   backsub_info (5, 0:MAX_CELL_DIM, 0:MAX_CELL_DIM, maxcells),
-     >   in_buffer(BUF_SIZE), out_buffer(BUF_SIZE)
-      common /fields/  u, us, vs, ws, qs, rho_i, square, 
-     >                 rhs, forcing, lhsc, in_buffer, out_buffer,
-     >                 backsub_info
-
-      double precision cv(-2:MAX_CELL_DIM+1),   rhon(-2:MAX_CELL_DIM+1),
-     >                 rhos(-2:MAX_CELL_DIM+1), rhoq(-2:MAX_CELL_DIM+1),
-     >                 cuf(-2:MAX_CELL_DIM+1),  q(-2:MAX_CELL_DIM+1),
-     >                 ue(-2:MAX_CELL_DIM+1,5), buf(-2:MAX_CELL_DIM+1,5)
-      common /work_1d/ cv, rhon, rhos, rhoq, cuf, q, ue, buf
-
-      integer  west_size, east_size, bottom_size, top_size,
-     >         north_size, south_size, start_send_west, 
-     >         start_send_east, start_send_south, start_send_north,
-     >         start_send_bottom, start_send_top, start_recv_west,
-     >         start_recv_east, start_recv_south, start_recv_north,
-     >         start_recv_bottom, start_recv_top
-      common /box/ west_size, east_size, bottom_size,
-     >             top_size, north_size, south_size, 
-     >             start_send_west, start_send_east, start_send_south,
-     >             start_send_north, start_send_bottom, start_send_top,
-     >             start_recv_west, start_recv_east, start_recv_south,
-     >             start_recv_north, start_recv_bottom, start_recv_top
-
-      double precision  tmp_block(5,5), b_inverse(5,5), tmp_vec(5)
-      common /work_solve/ tmp_block, b_inverse, tmp_vec
-
-c
-c     These are used by btio
-c
-      integer collbuf_nodes, collbuf_size, iosize, eltext,
-     $        combined_btype, fp, idump, record_length, element,
-     $        combined_ftype, idump_sub, rd_interval
-      common /btio/ collbuf_nodes, collbuf_size, iosize, eltext,
-     $              combined_btype, fp, idump, record_length,
-     $              idump_sub, rd_interval
-      double precision sum(niter_default), xce_sub(5)
-      common /btio/ sum, xce_sub
-      integer*8 iseek
-      common /btio/ iseek, element, combined_ftype
-
-
-
diff --git a/examples/smpi/NAS/BT/initialize.f b/examples/smpi/NAS/BT/initialize.f
deleted file mode 100644
index 274cdb1899..0000000000
--- a/examples/smpi/NAS/BT/initialize.f
+++ /dev/null
@@ -1,308 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine  initialize
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     This subroutine initializes the field variable u using 
-c     tri-linear transfinite interpolation of the boundary values     
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      
-      integer c, i, j, k, m, ii, jj, kk, ix, iy, iz
-      double precision  xi, eta, zeta, Pface(5,3,2), Pxi, Peta, 
-     >     Pzeta, temp(5)
-
-c---------------------------------------------------------------------
-c  Later (in compute_rhs) we compute 1/u for every element. A few of 
-c  the corner elements are not used, but it convenient (and faster) 
-c  to compute the whole thing with a simple loop. Make sure those 
-c  values are nonzero by initializing the whole thing here. 
-c---------------------------------------------------------------------
-      do c = 1, ncells
-         do kk = -1, KMAX
-            do jj = -1, JMAX
-               do ii = -1, IMAX
-                  do m = 1, 5
-                     u(m, ii, jj, kk, c) = 1.0
-                  end do
-               end do
-            end do
-         end do
-      end do
-c---------------------------------------------------------------------
-
-
-
-c---------------------------------------------------------------------
-c     first store the "interpolated" values everywhere on the grid    
-c---------------------------------------------------------------------
-      do c=1, ncells
-         kk = 0
-         do k = cell_low(3,c), cell_high(3,c)
-            zeta = dble(k) * dnzm1
-            jj = 0
-            do j = cell_low(2,c), cell_high(2,c)
-               eta = dble(j) * dnym1
-               ii = 0
-               do i = cell_low(1,c), cell_high(1,c)
-                  xi = dble(i) * dnxm1
-                  
-                  do ix = 1, 2
-                     call exact_solution(dble(ix-1), eta, zeta, 
-     >                    Pface(1,1,ix))
-                  enddo
-
-                  do iy = 1, 2
-                     call exact_solution(xi, dble(iy-1) , zeta, 
-     >                    Pface(1,2,iy))
-                  enddo
-
-                  do iz = 1, 2
-                     call exact_solution(xi, eta, dble(iz-1),   
-     >                    Pface(1,3,iz))
-                  enddo
-
-                  do m = 1, 5
-                     Pxi   = xi   * Pface(m,1,2) + 
-     >                    (1.0d0-xi)   * Pface(m,1,1)
-                     Peta  = eta  * Pface(m,2,2) + 
-     >                    (1.0d0-eta)  * Pface(m,2,1)
-                     Pzeta = zeta * Pface(m,3,2) + 
-     >                    (1.0d0-zeta) * Pface(m,3,1)
-                     
-                     u(m,ii,jj,kk,c) = Pxi + Peta + Pzeta - 
-     >                    Pxi*Peta - Pxi*Pzeta - Peta*Pzeta + 
-     >                    Pxi*Peta*Pzeta
-
-                  enddo
-                  ii = ii + 1
-               enddo
-               jj = jj + 1
-            enddo
-            kk = kk+1
-         enddo
-      enddo
-
-c---------------------------------------------------------------------
-c     now store the exact values on the boundaries        
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     west face                                                  
-c---------------------------------------------------------------------
-      c = slice(1,1)
-      ii = 0
-      xi = 0.0d0
-      kk = 0
-      do k = cell_low(3,c), cell_high(3,c)
-         zeta = dble(k) * dnzm1
-         jj = 0
-         do j = cell_low(2,c), cell_high(2,c)
-            eta = dble(j) * dnym1
-            call exact_solution(xi, eta, zeta, temp)
-            do m = 1, 5
-               u(m,ii,jj,kk,c) = temp(m)
-            enddo
-            jj = jj + 1
-         enddo
-         kk = kk + 1
-      enddo
-
-c---------------------------------------------------------------------
-c     east face                                                      
-c---------------------------------------------------------------------
-      c  = slice(1,ncells)
-      ii = cell_size(1,c)-1
-      xi = 1.0d0
-      kk = 0
-      do k = cell_low(3,c), cell_high(3,c)
-         zeta = dble(k) * dnzm1
-         jj = 0
-         do j = cell_low(2,c), cell_high(2,c)
-            eta = dble(j) * dnym1
-            call exact_solution(xi, eta, zeta, temp)
-            do m = 1, 5
-               u(m,ii,jj,kk,c) = temp(m)
-            enddo
-            jj = jj + 1
-         enddo
-         kk = kk + 1
-      enddo
-
-c---------------------------------------------------------------------
-c     south face                                                 
-c---------------------------------------------------------------------
-      c = slice(2,1)
-      jj = 0
-      eta = 0.0d0
-      kk = 0
-      do k = cell_low(3,c), cell_high(3,c)
-         zeta = dble(k) * dnzm1
-         ii = 0
-         do i = cell_low(1,c), cell_high(1,c)
-            xi = dble(i) * dnxm1
-            call exact_solution(xi, eta, zeta, temp)
-            do m = 1, 5
-               u(m,ii,jj,kk,c) = temp(m)
-            enddo
-            ii = ii + 1
-         enddo
-         kk = kk + 1
-      enddo
-
-
-c---------------------------------------------------------------------
-c     north face                                    
-c---------------------------------------------------------------------
-      c = slice(2,ncells)
-      jj = cell_size(2,c)-1
-      eta = 1.0d0
-      kk = 0
-      do k = cell_low(3,c), cell_high(3,c)
-         zeta = dble(k) * dnzm1
-         ii = 0
-         do i = cell_low(1,c), cell_high(1,c)
-            xi = dble(i) * dnxm1
-            call exact_solution(xi, eta, zeta, temp)
-            do m = 1, 5
-               u(m,ii,jj,kk,c) = temp(m)
-            enddo
-            ii = ii + 1
-         enddo
-         kk = kk + 1
-      enddo
-
-c---------------------------------------------------------------------
-c     bottom face                                       
-c---------------------------------------------------------------------
-      c = slice(3,1)
-      kk = 0
-      zeta = 0.0d0
-      jj = 0
-      do j = cell_low(2,c), cell_high(2,c)
-         eta = dble(j) * dnym1
-         ii = 0
-         do i =cell_low(1,c), cell_high(1,c)
-            xi = dble(i) *dnxm1
-            call exact_solution(xi, eta, zeta, temp)
-            do m = 1, 5
-               u(m,ii,jj,kk,c) = temp(m)
-            enddo
-            ii = ii + 1
-         enddo
-         jj = jj + 1
-      enddo
-
-c---------------------------------------------------------------------
-c     top face     
-c---------------------------------------------------------------------
-      c = slice(3,ncells)
-      kk = cell_size(3,c)-1
-      zeta = 1.0d0
-      jj = 0
-      do j = cell_low(2,c), cell_high(2,c)
-         eta = dble(j) * dnym1
-         ii = 0
-         do i =cell_low(1,c), cell_high(1,c)
-            xi = dble(i) * dnxm1
-            call exact_solution(xi, eta, zeta, temp)
-            do m = 1, 5
-               u(m,ii,jj,kk,c) = temp(m)
-            enddo
-            ii = ii + 1
-         enddo
-         jj = jj + 1
-      enddo
-
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine lhsinit
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      
-      integer i, j, k, d, c, m, n
-
-c---------------------------------------------------------------------
-c     loop over all cells                                       
-c---------------------------------------------------------------------
-      do c = 1, ncells
-
-c---------------------------------------------------------------------
-c     first, initialize the start and end arrays
-c---------------------------------------------------------------------
-         do d = 1, 3
-            if (cell_coord(d,c) .eq. 1) then
-               start(d,c) = 1
-            else 
-               start(d,c) = 0
-            endif
-            if (cell_coord(d,c) .eq. ncells) then
-               end(d,c) = 1
-            else
-               end(d,c) = 0
-            endif
-         enddo
-
-c---------------------------------------------------------------------
-c     zero the whole left hand side for starters
-c---------------------------------------------------------------------
-         do k = 0, cell_size(3,c)-1
-            do j = 0, cell_size(2,c)-1
-               do i = 0, cell_size(1,c)-1
-                  do m = 1,5
-                     do n = 1, 5
-                        lhsc(m,n,i,j,k,c) = 0.0d0
-                     enddo
-                  enddo
-               enddo
-            enddo
-         enddo
-
-      enddo
-
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine lhsabinit(lhsa, lhsb, size)
-      implicit none
-
-      integer size
-      double precision lhsa(5, 5, -1:size), lhsb(5, 5, -1:size)
-
-      integer i, m, n
-
-c---------------------------------------------------------------------
-c     next, set all diagonal values to 1. This is overkill, but convenient
-c---------------------------------------------------------------------
-      do i = 0, size
-         do m = 1, 5
-            do n = 1, 5
-               lhsa(m,n,i) = 0.0d0
-               lhsb(m,n,i) = 0.0d0
-            enddo
-            lhsb(m,m,i) = 1.0d0
-         enddo
-      enddo
-
-      return
-      end
-
-
-
diff --git a/examples/smpi/NAS/BT/inputbt.data.sample b/examples/smpi/NAS/BT/inputbt.data.sample
deleted file mode 100644
index 776654e8d0..0000000000
--- a/examples/smpi/NAS/BT/inputbt.data.sample
+++ /dev/null
@@ -1,5 +0,0 @@
-200       number of time steps
-0.0008d0  dt for class A = 0.0008d0. class B = 0.0003d0  class C = 0.0001d0
-64 64 64
-5 0        write interval (optional read interval) for BTIO
-0 1000000  number of nodes in collective buffering and buffer size for BTIO
diff --git a/examples/smpi/NAS/BT/make_set.f b/examples/smpi/NAS/BT/make_set.f
deleted file mode 100644
index b8d90c65a4..0000000000
--- a/examples/smpi/NAS/BT/make_set.f
+++ /dev/null
@@ -1,124 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine make_set
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     This function allocates space for a set of cells and fills the set     
-c     such that communication between cells on different nodes is only
-c     nearest neighbor                                                   
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-
-      integer p, i, j, c, dir, size, excess, ierr,ierrcode
-
-c---------------------------------------------------------------------
-c     compute square root; add small number to allow for roundoff
-c     (note: this is computed in setup_mpi.f also, but prefer to do
-c     it twice because of some include file problems).
-c---------------------------------------------------------------------
-      ncells = dint(dsqrt(dble(no_nodes) + 0.00001d0))
-
-c---------------------------------------------------------------------
-c     this makes coding easier
-c---------------------------------------------------------------------
-      p = ncells
-      
-c---------------------------------------------------------------------
-c     determine the location of the cell at the bottom of the 3D 
-c     array of cells
-c---------------------------------------------------------------------
-      cell_coord(1,1) = mod(node,p) 
-      cell_coord(2,1) = node/p 
-      cell_coord(3,1) = 0
-
-c---------------------------------------------------------------------
-c     set the cell_coords for cells in the rest of the z-layers; 
-c     this comes down to a simple linear numbering in the z-direct-
-c     ion, and to the doubly-cyclic numbering in the other dirs     
-c---------------------------------------------------------------------
-      do c=2, p
-         cell_coord(1,c) = mod(cell_coord(1,c-1)+1,p) 
-         cell_coord(2,c) = mod(cell_coord(2,c-1)-1+p,p) 
-         cell_coord(3,c) = c-1
-      end do
-
-c---------------------------------------------------------------------
-c     offset all the coordinates by 1 to adjust for Fortran arrays
-c---------------------------------------------------------------------
-      do dir = 1, 3
-         do c = 1, p
-            cell_coord(dir,c) = cell_coord(dir,c) + 1
-         end do
-      end do
-      
-c---------------------------------------------------------------------
-c     slice(dir,n) contains the sequence number of the cell that is in
-c     coordinate plane n in the dir direction
-c---------------------------------------------------------------------
-      do dir = 1, 3
-         do c = 1, p
-            slice(dir,cell_coord(dir,c)) = c
-         end do
-      end do
-
-
-c---------------------------------------------------------------------
-c     fill the predecessor and successor entries, using the indices 
-c     of the bottom cells (they are the same at each level of k 
-c     anyway) acting as if full periodicity pertains; note that p is
-c     added to those arguments to the mod functions that might
-c     otherwise return wrong values when using the modulo function
-c---------------------------------------------------------------------
-      i = cell_coord(1,1)-1
-      j = cell_coord(2,1)-1
-
-      predecessor(1) = mod(i-1+p,p) + p*j
-      predecessor(2) = i + p*mod(j-1+p,p)
-      predecessor(3) = mod(i+1,p) + p*mod(j-1+p,p)
-      successor(1)   = mod(i+1,p) + p*j
-      successor(2)   = i + p*mod(j+1,p)
-      successor(3)   = mod(i-1+p,p) + p*mod(j+1,p)
-
-c---------------------------------------------------------------------
-c     now compute the sizes of the cells                                    
-c---------------------------------------------------------------------
-      do dir= 1, 3
-c---------------------------------------------------------------------
-c     set cell_coord range for each direction                            
-c---------------------------------------------------------------------
-         size   = grid_points(dir)/p
-         excess = mod(grid_points(dir),p)
-         do c=1, ncells
-            if (cell_coord(dir,c) .le. excess) then
-               cell_size(dir,c) = size+1
-               cell_low(dir,c) = (cell_coord(dir,c)-1)*(size+1)
-               cell_high(dir,c) = cell_low(dir,c)+size
-            else 
-               cell_size(dir,c) = size
-               cell_low(dir,c)  = excess*(size+1)+
-     >              (cell_coord(dir,c)-excess-1)*size
-               cell_high(dir,c) = cell_low(dir,c)+size-1
-            endif
-            if (cell_size(dir, c) .le. 2) then
-               write(*,50)
- 50            format(' Error: Cell size too small. Min size is 3')
-               call MPI_Abort(mpi_comm_world,ierrcode,ierr)
-               stop
-            endif
-         end do
-      end do
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-
diff --git a/examples/smpi/NAS/BT/mpinpb.h b/examples/smpi/NAS/BT/mpinpb.h
deleted file mode 100644
index f621f08b64..0000000000
--- a/examples/smpi/NAS/BT/mpinpb.h
+++ /dev/null
@@ -1,12 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include 'mpif.h'
-
-      integer           node, no_nodes, total_nodes, root, comm_setup, 
-     >                  comm_solve, comm_rhs, dp_type
-      logical           active
-      common /mpistuff/ node, no_nodes, total_nodes, root, comm_setup, 
-     >                  comm_solve, comm_rhs, dp_type, active
-
diff --git a/examples/smpi/NAS/BT/rhs.f b/examples/smpi/NAS/BT/rhs.f
deleted file mode 100644
index 89171a6741..0000000000
--- a/examples/smpi/NAS/BT/rhs.f
+++ /dev/null
@@ -1,425 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine compute_rhs
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include 'header.h'
-
-      integer c, i, j, k, m
-      double precision rho_inv, uijk, up1, um1, vijk, vp1, vm1,
-     >     wijk, wp1, wm1
-
-
-c---------------------------------------------------------------------
-c     loop over all cells owned by this node                           
-c---------------------------------------------------------------------
-      do c = 1, ncells
-
-c---------------------------------------------------------------------
-c     compute the reciprocal of density, and the kinetic energy, 
-c     and the speed of sound.
-c---------------------------------------------------------------------
-         do k = -1, cell_size(3,c)
-            do j = -1, cell_size(2,c)
-               do i = -1, cell_size(1,c)
-                  rho_inv = 1.0d0/u(1,i,j,k,c)
-                  rho_i(i,j,k,c) = rho_inv
-                  us(i,j,k,c) = u(2,i,j,k,c) * rho_inv
-                  vs(i,j,k,c) = u(3,i,j,k,c) * rho_inv
-                  ws(i,j,k,c) = u(4,i,j,k,c) * rho_inv
-                  square(i,j,k,c)     = 0.5d0* (
-     >                 u(2,i,j,k,c)*u(2,i,j,k,c) + 
-     >                 u(3,i,j,k,c)*u(3,i,j,k,c) +
-     >                 u(4,i,j,k,c)*u(4,i,j,k,c) ) * rho_inv
-                  qs(i,j,k,c) = square(i,j,k,c) * rho_inv
-               enddo
-            enddo
-         enddo
-
-c---------------------------------------------------------------------
-c copy the exact forcing term to the right hand side;  because 
-c this forcing term is known, we can store it on the whole of every 
-c cell,  including the boundary                   
-c---------------------------------------------------------------------
-
-         do k = 0, cell_size(3,c)-1
-            do j = 0, cell_size(2,c)-1
-               do i = 0, cell_size(1,c)-1
-                  do m = 1, 5
-                     rhs(m,i,j,k,c) = forcing(m,i,j,k,c)
-                  enddo
-               enddo
-            enddo
-         enddo
-
-
-c---------------------------------------------------------------------
-c     compute xi-direction fluxes 
-c---------------------------------------------------------------------
-         do k = start(3,c), cell_size(3,c)-end(3,c)-1
-            do j = start(2,c), cell_size(2,c)-end(2,c)-1
-               do i = start(1,c), cell_size(1,c)-end(1,c)-1
-                  uijk = us(i,j,k,c)
-                  up1  = us(i+1,j,k,c)
-                  um1  = us(i-1,j,k,c)
-
-                  rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dx1tx1 * 
-     >                 (u(1,i+1,j,k,c) - 2.0d0*u(1,i,j,k,c) + 
-     >                 u(1,i-1,j,k,c)) -
-     >                 tx2 * (u(2,i+1,j,k,c) - u(2,i-1,j,k,c))
-
-                  rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dx2tx1 * 
-     >                 (u(2,i+1,j,k,c) - 2.0d0*u(2,i,j,k,c) + 
-     >                 u(2,i-1,j,k,c)) +
-     >                 xxcon2*con43 * (up1 - 2.0d0*uijk + um1) -
-     >                 tx2 * (u(2,i+1,j,k,c)*up1 - 
-     >                 u(2,i-1,j,k,c)*um1 +
-     >                 (u(5,i+1,j,k,c)- square(i+1,j,k,c)-
-     >                 u(5,i-1,j,k,c)+ square(i-1,j,k,c))*
-     >                 c2)
-
-                  rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dx3tx1 * 
-     >                 (u(3,i+1,j,k,c) - 2.0d0*u(3,i,j,k,c) +
-     >                 u(3,i-1,j,k,c)) +
-     >                 xxcon2 * (vs(i+1,j,k,c) - 2.0d0*vs(i,j,k,c) +
-     >                 vs(i-1,j,k,c)) -
-     >                 tx2 * (u(3,i+1,j,k,c)*up1 - 
-     >                 u(3,i-1,j,k,c)*um1)
-
-                  rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dx4tx1 * 
-     >                 (u(4,i+1,j,k,c) - 2.0d0*u(4,i,j,k,c) +
-     >                 u(4,i-1,j,k,c)) +
-     >                 xxcon2 * (ws(i+1,j,k,c) - 2.0d0*ws(i,j,k,c) +
-     >                 ws(i-1,j,k,c)) -
-     >                 tx2 * (u(4,i+1,j,k,c)*up1 - 
-     >                 u(4,i-1,j,k,c)*um1)
-
-                  rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dx5tx1 * 
-     >                 (u(5,i+1,j,k,c) - 2.0d0*u(5,i,j,k,c) +
-     >                 u(5,i-1,j,k,c)) +
-     >                 xxcon3 * (qs(i+1,j,k,c) - 2.0d0*qs(i,j,k,c) +
-     >                 qs(i-1,j,k,c)) +
-     >                 xxcon4 * (up1*up1 -       2.0d0*uijk*uijk + 
-     >                 um1*um1) +
-     >                 xxcon5 * (u(5,i+1,j,k,c)*rho_i(i+1,j,k,c) - 
-     >                 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) +
-     >                 u(5,i-1,j,k,c)*rho_i(i-1,j,k,c)) -
-     >                 tx2 * ( (c1*u(5,i+1,j,k,c) - 
-     >                 c2*square(i+1,j,k,c))*up1 -
-     >                 (c1*u(5,i-1,j,k,c) - 
-     >                 c2*square(i-1,j,k,c))*um1 )
-               enddo
-            enddo
-         enddo
-
-c---------------------------------------------------------------------
-c     add fourth order xi-direction dissipation               
-c---------------------------------------------------------------------
-         if (start(1,c) .gt. 0) then
-            do k = start(3,c), cell_size(3,c)-end(3,c)-1
-               do j = start(2,c), cell_size(2,c)-end(2,c)-1
-                  i = 1
-                  do m = 1, 5
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp * 
-     >                    ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) +
-     >                    u(m,i+2,j,k,c))
-                  enddo
-
-                  i = 2
-                  do m = 1, 5
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
-     >                    (-4.0d0*u(m,i-1,j,k,c) + 6.0d0*u(m,i,j,k,c) -
-     >                    4.0d0*u(m,i+1,j,k,c) + u(m,i+2,j,k,c))
-                  enddo
-               enddo
-            enddo
-         endif
-
-         do k = start(3,c), cell_size(3,c)-end(3,c)-1
-            do j = start(2,c), cell_size(2,c)-end(2,c)-1
-               do i = 3*start(1,c),cell_size(1,c)-3*end(1,c)-1
-                  do m = 1, 5
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
-     >                    (  u(m,i-2,j,k,c) - 4.0d0*u(m,i-1,j,k,c) + 
-     >                    6.0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) + 
-     >                    u(m,i+2,j,k,c) )
-                  enddo
-               enddo
-            enddo
-         enddo
-         
-
-         if (end(1,c) .gt. 0) then
-            do k = start(3,c), cell_size(3,c)-end(3,c)-1
-               do j = start(2,c), cell_size(2,c)-end(2,c)-1
-                  i = cell_size(1,c)-3
-                  do m = 1, 5
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
-     >                    ( u(m,i-2,j,k,c) - 4.0d0*u(m,i-1,j,k,c) + 
-     >                    6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) )
-                  enddo
-
-                  i = cell_size(1,c)-2
-                  do m = 1, 5
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
-     >                    ( u(m,i-2,j,k,c) - 4.d0*u(m,i-1,j,k,c) +
-     >                    5.d0*u(m,i,j,k,c) )
-                  enddo
-               enddo
-            enddo
-         endif
-
-c---------------------------------------------------------------------
-c     compute eta-direction fluxes 
-c---------------------------------------------------------------------
-         do k = start(3,c), cell_size(3,c)-end(3,c)-1
-            do j = start(2,c), cell_size(2,c)-end(2,c)-1
-               do i = start(1,c), cell_size(1,c)-end(1,c)-1
-                  vijk = vs(i,j,k,c)
-                  vp1  = vs(i,j+1,k,c)
-                  vm1  = vs(i,j-1,k,c)
-                  rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dy1ty1 * 
-     >                 (u(1,i,j+1,k,c) - 2.0d0*u(1,i,j,k,c) + 
-     >                 u(1,i,j-1,k,c)) -
-     >                 ty2 * (u(3,i,j+1,k,c) - u(3,i,j-1,k,c))
-                  rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dy2ty1 * 
-     >                 (u(2,i,j+1,k,c) - 2.0d0*u(2,i,j,k,c) + 
-     >                 u(2,i,j-1,k,c)) +
-     >                 yycon2 * (us(i,j+1,k,c) - 2.0d0*us(i,j,k,c) + 
-     >                 us(i,j-1,k,c)) -
-     >                 ty2 * (u(2,i,j+1,k,c)*vp1 - 
-     >                 u(2,i,j-1,k,c)*vm1)
-                  rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dy3ty1 * 
-     >                 (u(3,i,j+1,k,c) - 2.0d0*u(3,i,j,k,c) + 
-     >                 u(3,i,j-1,k,c)) +
-     >                 yycon2*con43 * (vp1 - 2.0d0*vijk + vm1) -
-     >                 ty2 * (u(3,i,j+1,k,c)*vp1 - 
-     >                 u(3,i,j-1,k,c)*vm1 +
-     >                 (u(5,i,j+1,k,c) - square(i,j+1,k,c) - 
-     >                 u(5,i,j-1,k,c) + square(i,j-1,k,c))
-     >                 *c2)
-                  rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dy4ty1 * 
-     >                 (u(4,i,j+1,k,c) - 2.0d0*u(4,i,j,k,c) + 
-     >                 u(4,i,j-1,k,c)) +
-     >                 yycon2 * (ws(i,j+1,k,c) - 2.0d0*ws(i,j,k,c) + 
-     >                 ws(i,j-1,k,c)) -
-     >                 ty2 * (u(4,i,j+1,k,c)*vp1 - 
-     >                 u(4,i,j-1,k,c)*vm1)
-                  rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dy5ty1 * 
-     >                 (u(5,i,j+1,k,c) - 2.0d0*u(5,i,j,k,c) + 
-     >                 u(5,i,j-1,k,c)) +
-     >                 yycon3 * (qs(i,j+1,k,c) - 2.0d0*qs(i,j,k,c) + 
-     >                 qs(i,j-1,k,c)) +
-     >                 yycon4 * (vp1*vp1       - 2.0d0*vijk*vijk + 
-     >                 vm1*vm1) +
-     >                 yycon5 * (u(5,i,j+1,k,c)*rho_i(i,j+1,k,c) - 
-     >                 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) +
-     >                 u(5,i,j-1,k,c)*rho_i(i,j-1,k,c)) -
-     >                 ty2 * ((c1*u(5,i,j+1,k,c) - 
-     >                 c2*square(i,j+1,k,c)) * vp1 -
-     >                 (c1*u(5,i,j-1,k,c) - 
-     >                 c2*square(i,j-1,k,c)) * vm1)
-               enddo
-            enddo
-         enddo
-
-c---------------------------------------------------------------------
-c     add fourth order eta-direction dissipation         
-c---------------------------------------------------------------------
-         if (start(2,c) .gt. 0) then
-            do k = start(3,c), cell_size(3,c)-end(3,c)-1
-               j = 1
-               do i = start(1,c), cell_size(1,c)-end(1,c)-1
-                  do m = 1, 5
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp * 
-     >                    ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) +
-     >                    u(m,i,j+2,k,c))
-                  enddo
-               enddo
-
-               j = 2
-               do i = start(1,c), cell_size(1,c)-end(1,c)-1
-                  do m = 1, 5
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
-     >                    (-4.0d0*u(m,i,j-1,k,c) + 6.0d0*u(m,i,j,k,c) -
-     >                    4.0d0*u(m,i,j+1,k,c) + u(m,i,j+2,k,c))
-                  enddo
-               enddo
-            enddo
-         endif
-
-         do k = start(3,c), cell_size(3,c)-end(3,c)-1
-            do j = 3*start(2,c), cell_size(2,c)-3*end(2,c)-1
-               do i = start(1,c),cell_size(1,c)-end(1,c)-1
-                  do m = 1, 5
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
-     >                    (  u(m,i,j-2,k,c) - 4.0d0*u(m,i,j-1,k,c) + 
-     >                    6.0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) + 
-     >                    u(m,i,j+2,k,c) )
-                  enddo
-               enddo
-            enddo
-         enddo
-         
-         if (end(2,c) .gt. 0) then
-            do k = start(3,c), cell_size(3,c)-end(3,c)-1
-               j = cell_size(2,c)-3
-               do i = start(1,c), cell_size(1,c)-end(1,c)-1
-                  do m = 1, 5
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
-     >                    ( u(m,i,j-2,k,c) - 4.0d0*u(m,i,j-1,k,c) + 
-     >                    6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) )
-                  enddo
-               enddo
-
-               j = cell_size(2,c)-2
-               do i = start(1,c), cell_size(1,c)-end(1,c)-1
-                  do m = 1, 5
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
-     >                    ( u(m,i,j-2,k,c) - 4.d0*u(m,i,j-1,k,c) +
-     >                    5.d0*u(m,i,j,k,c) )
-                  enddo
-               enddo
-            enddo
-         endif
-
-c---------------------------------------------------------------------
-c     compute zeta-direction fluxes 
-c---------------------------------------------------------------------
-         do k = start(3,c), cell_size(3,c)-end(3,c)-1
-            do j = start(2,c), cell_size(2,c)-end(2,c)-1
-               do i = start(1,c), cell_size(1,c)-end(1,c)-1
-                  wijk = ws(i,j,k,c)
-                  wp1  = ws(i,j,k+1,c)
-                  wm1  = ws(i,j,k-1,c)
-
-                  rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dz1tz1 * 
-     >                 (u(1,i,j,k+1,c) - 2.0d0*u(1,i,j,k,c) + 
-     >                 u(1,i,j,k-1,c)) -
-     >                 tz2 * (u(4,i,j,k+1,c) - u(4,i,j,k-1,c))
-                  rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dz2tz1 * 
-     >                 (u(2,i,j,k+1,c) - 2.0d0*u(2,i,j,k,c) + 
-     >                 u(2,i,j,k-1,c)) +
-     >                 zzcon2 * (us(i,j,k+1,c) - 2.0d0*us(i,j,k,c) + 
-     >                 us(i,j,k-1,c)) -
-     >                 tz2 * (u(2,i,j,k+1,c)*wp1 - 
-     >                 u(2,i,j,k-1,c)*wm1)
-                  rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dz3tz1 * 
-     >                 (u(3,i,j,k+1,c) - 2.0d0*u(3,i,j,k,c) + 
-     >                 u(3,i,j,k-1,c)) +
-     >                 zzcon2 * (vs(i,j,k+1,c) - 2.0d0*vs(i,j,k,c) + 
-     >                 vs(i,j,k-1,c)) -
-     >                 tz2 * (u(3,i,j,k+1,c)*wp1 - 
-     >                 u(3,i,j,k-1,c)*wm1)
-                  rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dz4tz1 * 
-     >                 (u(4,i,j,k+1,c) - 2.0d0*u(4,i,j,k,c) + 
-     >                 u(4,i,j,k-1,c)) +
-     >                 zzcon2*con43 * (wp1 - 2.0d0*wijk + wm1) -
-     >                 tz2 * (u(4,i,j,k+1,c)*wp1 - 
-     >                 u(4,i,j,k-1,c)*wm1 +
-     >                 (u(5,i,j,k+1,c) - square(i,j,k+1,c) - 
-     >                 u(5,i,j,k-1,c) + square(i,j,k-1,c))
-     >                 *c2)
-                  rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dz5tz1 * 
-     >                 (u(5,i,j,k+1,c) - 2.0d0*u(5,i,j,k,c) + 
-     >                 u(5,i,j,k-1,c)) +
-     >                 zzcon3 * (qs(i,j,k+1,c) - 2.0d0*qs(i,j,k,c) + 
-     >                 qs(i,j,k-1,c)) +
-     >                 zzcon4 * (wp1*wp1 - 2.0d0*wijk*wijk + 
-     >                 wm1*wm1) +
-     >                 zzcon5 * (u(5,i,j,k+1,c)*rho_i(i,j,k+1,c) - 
-     >                 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) +
-     >                 u(5,i,j,k-1,c)*rho_i(i,j,k-1,c)) -
-     >                 tz2 * ( (c1*u(5,i,j,k+1,c) - 
-     >                 c2*square(i,j,k+1,c))*wp1 -
-     >                 (c1*u(5,i,j,k-1,c) - 
-     >                 c2*square(i,j,k-1,c))*wm1)
-               enddo
-            enddo
-         enddo
-
-c---------------------------------------------------------------------
-c     add fourth order zeta-direction dissipation                
-c---------------------------------------------------------------------
-         if (start(3,c) .gt. 0) then
-            k = 1
-            do j = start(2,c), cell_size(2,c)-end(2,c)-1
-               do i = start(1,c), cell_size(1,c)-end(1,c)-1
-                  do m = 1, 5
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp * 
-     >                    ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) +
-     >                    u(m,i,j,k+2,c))
-                  enddo
-               enddo
-            enddo
-
-            k = 2
-            do j = start(2,c), cell_size(2,c)-end(2,c)-1
-               do i = start(1,c), cell_size(1,c)-end(1,c)-1
-                  do m = 1, 5
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
-     >                    (-4.0d0*u(m,i,j,k-1,c) + 6.0d0*u(m,i,j,k,c) -
-     >                    4.0d0*u(m,i,j,k+1,c) + u(m,i,j,k+2,c))
-                  enddo
-               enddo
-            enddo
-         endif
-
-         do k = 3*start(3,c), cell_size(3,c)-3*end(3,c)-1
-            do j = start(2,c), cell_size(2,c)-end(2,c)-1
-               do i = start(1,c),cell_size(1,c)-end(1,c)-1
-                  do m = 1, 5
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
-     >                    (  u(m,i,j,k-2,c) - 4.0d0*u(m,i,j,k-1,c) + 
-     >                    6.0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) + 
-     >                    u(m,i,j,k+2,c) )
-                  enddo
-               enddo
-            enddo
-         enddo
-         
-         if (end(3,c) .gt. 0) then
-            k = cell_size(3,c)-3
-            do j = start(2,c), cell_size(2,c)-end(2,c)-1
-               do i = start(1,c), cell_size(1,c)-end(1,c)-1
-                  do m = 1, 5
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
-     >                    ( u(m,i,j,k-2,c) - 4.0d0*u(m,i,j,k-1,c) + 
-     >                    6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) )
-                  enddo
-               enddo
-            enddo
-
-            k = cell_size(3,c)-2
-            do j = start(2,c), cell_size(2,c)-end(2,c)-1
-               do i = start(1,c), cell_size(1,c)-end(1,c)-1
-                  do m = 1, 5
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
-     >                    ( u(m,i,j,k-2,c) - 4.d0*u(m,i,j,k-1,c) +
-     >                    5.d0*u(m,i,j,k,c) )
-                  enddo
-               enddo
-            enddo
-         endif
-
-         do k = start(3,c), cell_size(3,c)-end(3,c)-1
-            do j = start(2,c), cell_size(2,c)-end(2,c)-1
-               do i = start(1,c), cell_size(1,c)-end(1,c)-1
-                  do m = 1, 5
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) * dt
-                  enddo
-               enddo
-            enddo
-         enddo
-
-      enddo
-      
-      return
-      end
-
-
-
-
diff --git a/examples/smpi/NAS/BT/set_constants.f b/examples/smpi/NAS/BT/set_constants.f
deleted file mode 100644
index 81397d4bcf..0000000000
--- a/examples/smpi/NAS/BT/set_constants.f
+++ /dev/null
@@ -1,202 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine  set_constants
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      
-      ce(1,1)  = 2.0d0
-      ce(1,2)  = 0.0d0
-      ce(1,3)  = 0.0d0
-      ce(1,4)  = 4.0d0
-      ce(1,5)  = 5.0d0
-      ce(1,6)  = 3.0d0
-      ce(1,7)  = 0.5d0
-      ce(1,8)  = 0.02d0
-      ce(1,9)  = 0.01d0
-      ce(1,10) = 0.03d0
-      ce(1,11) = 0.5d0
-      ce(1,12) = 0.4d0
-      ce(1,13) = 0.3d0
-      
-      ce(2,1)  = 1.0d0
-      ce(2,2)  = 0.0d0
-      ce(2,3)  = 0.0d0
-      ce(2,4)  = 0.0d0
-      ce(2,5)  = 1.0d0
-      ce(2,6)  = 2.0d0
-      ce(2,7)  = 3.0d0
-      ce(2,8)  = 0.01d0
-      ce(2,9)  = 0.03d0
-      ce(2,10) = 0.02d0
-      ce(2,11) = 0.4d0
-      ce(2,12) = 0.3d0
-      ce(2,13) = 0.5d0
-
-      ce(3,1)  = 2.0d0
-      ce(3,2)  = 2.0d0
-      ce(3,3)  = 0.0d0
-      ce(3,4)  = 0.0d0
-      ce(3,5)  = 0.0d0
-      ce(3,6)  = 2.0d0
-      ce(3,7)  = 3.0d0
-      ce(3,8)  = 0.04d0
-      ce(3,9)  = 0.03d0
-      ce(3,10) = 0.05d0
-      ce(3,11) = 0.3d0
-      ce(3,12) = 0.5d0
-      ce(3,13) = 0.4d0
-
-      ce(4,1)  = 2.0d0
-      ce(4,2)  = 2.0d0
-      ce(4,3)  = 0.0d0
-      ce(4,4)  = 0.0d0
-      ce(4,5)  = 0.0d0
-      ce(4,6)  = 2.0d0
-      ce(4,7)  = 3.0d0
-      ce(4,8)  = 0.03d0
-      ce(4,9)  = 0.05d0
-      ce(4,10) = 0.04d0
-      ce(4,11) = 0.2d0
-      ce(4,12) = 0.1d0
-      ce(4,13) = 0.3d0
-
-      ce(5,1)  = 5.0d0
-      ce(5,2)  = 4.0d0
-      ce(5,3)  = 3.0d0
-      ce(5,4)  = 2.0d0
-      ce(5,5)  = 0.1d0
-      ce(5,6)  = 0.4d0
-      ce(5,7)  = 0.3d0
-      ce(5,8)  = 0.05d0
-      ce(5,9)  = 0.04d0
-      ce(5,10) = 0.03d0
-      ce(5,11) = 0.1d0
-      ce(5,12) = 0.3d0
-      ce(5,13) = 0.2d0
-
-      c1 = 1.4d0
-      c2 = 0.4d0
-      c3 = 0.1d0
-      c4 = 1.0d0
-      c5 = 1.4d0
-
-      bt = dsqrt(0.5d0)
-
-      dnxm1 = 1.0d0 / dble(grid_points(1)-1)
-      dnym1 = 1.0d0 / dble(grid_points(2)-1)
-      dnzm1 = 1.0d0 / dble(grid_points(3)-1)
-
-      c1c2 = c1 * c2
-      c1c5 = c1 * c5
-      c3c4 = c3 * c4
-      c1345 = c1c5 * c3c4
-
-      conz1 = (1.0d0-c1c5)
-
-      tx1 = 1.0d0 / (dnxm1 * dnxm1)
-      tx2 = 1.0d0 / (2.0d0 * dnxm1)
-      tx3 = 1.0d0 / dnxm1
-
-      ty1 = 1.0d0 / (dnym1 * dnym1)
-      ty2 = 1.0d0 / (2.0d0 * dnym1)
-      ty3 = 1.0d0 / dnym1
-      
-      tz1 = 1.0d0 / (dnzm1 * dnzm1)
-      tz2 = 1.0d0 / (2.0d0 * dnzm1)
-      tz3 = 1.0d0 / dnzm1
-
-      dx1 = 0.75d0
-      dx2 = 0.75d0
-      dx3 = 0.75d0
-      dx4 = 0.75d0
-      dx5 = 0.75d0
-
-      dy1 = 0.75d0
-      dy2 = 0.75d0
-      dy3 = 0.75d0
-      dy4 = 0.75d0
-      dy5 = 0.75d0
-
-      dz1 = 1.0d0
-      dz2 = 1.0d0
-      dz3 = 1.0d0
-      dz4 = 1.0d0
-      dz5 = 1.0d0
-
-      dxmax = dmax1(dx3, dx4)
-      dymax = dmax1(dy2, dy4)
-      dzmax = dmax1(dz2, dz3)
-
-      dssp = 0.25d0 * dmax1(dx1, dmax1(dy1, dz1) )
-
-      c4dssp = 4.0d0 * dssp
-      c5dssp = 5.0d0 * dssp
-
-      dttx1 = dt*tx1
-      dttx2 = dt*tx2
-      dtty1 = dt*ty1
-      dtty2 = dt*ty2
-      dttz1 = dt*tz1
-      dttz2 = dt*tz2
-
-      c2dttx1 = 2.0d0*dttx1
-      c2dtty1 = 2.0d0*dtty1
-      c2dttz1 = 2.0d0*dttz1
-
-      dtdssp = dt*dssp
-
-      comz1  = dtdssp
-      comz4  = 4.0d0*dtdssp
-      comz5  = 5.0d0*dtdssp
-      comz6  = 6.0d0*dtdssp
-
-      c3c4tx3 = c3c4*tx3
-      c3c4ty3 = c3c4*ty3
-      c3c4tz3 = c3c4*tz3
-
-      dx1tx1 = dx1*tx1
-      dx2tx1 = dx2*tx1
-      dx3tx1 = dx3*tx1
-      dx4tx1 = dx4*tx1
-      dx5tx1 = dx5*tx1
-      
-      dy1ty1 = dy1*ty1
-      dy2ty1 = dy2*ty1
-      dy3ty1 = dy3*ty1
-      dy4ty1 = dy4*ty1
-      dy5ty1 = dy5*ty1
-      
-      dz1tz1 = dz1*tz1
-      dz2tz1 = dz2*tz1
-      dz3tz1 = dz3*tz1
-      dz4tz1 = dz4*tz1
-      dz5tz1 = dz5*tz1
-
-      c2iv  = 2.5d0
-      con43 = 4.0d0/3.0d0
-      con16 = 1.0d0/6.0d0
-      
-      xxcon1 = c3c4tx3*con43*tx3
-      xxcon2 = c3c4tx3*tx3
-      xxcon3 = c3c4tx3*conz1*tx3
-      xxcon4 = c3c4tx3*con16*tx3
-      xxcon5 = c3c4tx3*c1c5*tx3
-
-      yycon1 = c3c4ty3*con43*ty3
-      yycon2 = c3c4ty3*ty3
-      yycon3 = c3c4ty3*conz1*ty3
-      yycon4 = c3c4ty3*con16*ty3
-      yycon5 = c3c4ty3*c1c5*ty3
-
-      zzcon1 = c3c4tz3*con43*tz3
-      zzcon2 = c3c4tz3*tz3
-      zzcon3 = c3c4tz3*conz1*tz3
-      zzcon4 = c3c4tz3*con16*tz3
-      zzcon5 = c3c4tz3*c1c5*tz3
-
-      return
-      end
diff --git a/examples/smpi/NAS/BT/setup_mpi.f b/examples/smpi/NAS/BT/setup_mpi.f
deleted file mode 100644
index 987c6bfba4..0000000000
--- a/examples/smpi/NAS/BT/setup_mpi.f
+++ /dev/null
@@ -1,64 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine setup_mpi
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c set up MPI stuff
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'mpinpb.h'
-      include 'npbparams.h'
-      integer error, color, nc
-
-      call mpi_init(error)
-      
-      call mpi_comm_size(MPI_COMM_WORLD, total_nodes, error)
-      call mpi_comm_rank(MPI_COMM_WORLD, node, error)
-
-      if (.not. convertdouble) then
-         dp_type = MPI_DOUBLE_PRECISION
-      else
-         dp_type = MPI_REAL
-      endif
-
-c---------------------------------------------------------------------
-c     compute square root; add small number to allow for roundoff
-c---------------------------------------------------------------------
-      nc = dint(dsqrt(dble(total_nodes) + 0.00001d0))
-
-c---------------------------------------------------------------------
-c We handle a non-square number of nodes by making the excess nodes
-c inactive. However, we can never handle more cells than were compiled
-c in. 
-c---------------------------------------------------------------------
-
-      if (nc .gt. maxcells) nc = maxcells
-      if (node .ge. nc*nc) then
-         active = .false.
-         color = 1
-      else
-         active = .true.
-         color = 0
-      end if
-      
-      call mpi_comm_split(MPI_COMM_WORLD,color,node,comm_setup,error)
-      if (.not. active) return
-
-      call mpi_comm_size(comm_setup, no_nodes, error)
-      call mpi_comm_dup(comm_setup, comm_solve, error)
-      call mpi_comm_dup(comm_setup, comm_rhs, error)
-      
-c---------------------------------------------------------------------
-c     let node 0 be the root for the group (there is only one)
-c---------------------------------------------------------------------
-      root = 0
-
-      return
-      end
-
diff --git a/examples/smpi/NAS/BT/simple_mpiio.f b/examples/smpi/NAS/BT/simple_mpiio.f
deleted file mode 100644
index 02e2700177..0000000000
--- a/examples/smpi/NAS/BT/simple_mpiio.f
+++ /dev/null
@@ -1,213 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine setup_btio
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer m, ierr
-
-      iseek=0
-
-      if (node .eq. root) then
-          call MPI_File_delete(filenm, MPI_INFO_NULL, ierr)
-      endif
-
-      call MPI_Barrier(comm_solve, ierr)
-
-      call MPI_File_open(comm_solve,
-     $          filenm,
-     $          MPI_MODE_RDWR + MPI_MODE_CREATE,
-     $          MPI_INFO_NULL,
-     $          fp,
-     $          ierr)
-
-      call MPI_File_set_view(fp,
-     $          iseek, MPI_DOUBLE_PRECISION, MPI_DOUBLE_PRECISION,
-     $          'native', MPI_INFO_NULL, ierr)
-
-      if (ierr .ne. MPI_SUCCESS) then
-          print *, 'Error opening file'
-          stop
-      endif
-
-      do m = 1, 5
-         xce_sub(m) = 0.d0
-      end do
-
-      idump_sub = 0
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine output_timestep
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer count, jio, kio, cio, aio
-      integer ierr
-      integer mstatus(MPI_STATUS_SIZE)
-
-      do cio=1,ncells
-          do kio=0, cell_size(3,cio)-1
-              do jio=0, cell_size(2,cio)-1
-                  iseek=5*(cell_low(1,cio) +
-     $                   PROBLEM_SIZE*((cell_low(2,cio)+jio) +
-     $                   PROBLEM_SIZE*((cell_low(3,cio)+kio) +
-     $                   PROBLEM_SIZE*idump_sub)))
-
-                  count=5*cell_size(1,cio)
-
-                  call MPI_File_write_at(fp, iseek,
-     $                  u(1,0,jio,kio,cio),
-     $                  count, MPI_DOUBLE_PRECISION,
-     $                  mstatus, ierr)
-
-                  if (ierr .ne. MPI_SUCCESS) then
-                      print *, 'Error writing to file'
-                      stop
-                  endif
-              enddo
-          enddo
-      enddo
-
-      idump_sub = idump_sub + 1
-      if (rd_interval .gt. 0) then
-         if (idump_sub .ge. rd_interval) then
-
-            call acc_sub_norms(idump+1)
-
-            idump_sub = 0
-         endif
-      endif
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine acc_sub_norms(idump_cur)
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer idump_cur
-
-      integer count, jio, kio, cio, ii, m, ichunk
-      integer ierr
-      integer mstatus(MPI_STATUS_SIZE)
-      double precision xce_single(5)
-
-      ichunk = idump_cur - idump_sub + 1
-      do ii=0, idump_sub-1
-        do cio=1,ncells
-          do kio=0, cell_size(3,cio)-1
-              do jio=0, cell_size(2,cio)-1
-                  iseek=5*(cell_low(1,cio) +
-     $                   PROBLEM_SIZE*((cell_low(2,cio)+jio) +
-     $                   PROBLEM_SIZE*((cell_low(3,cio)+kio) +
-     $                   PROBLEM_SIZE*ii)))
-
-                  count=5*cell_size(1,cio)
-
-                  call MPI_File_read_at(fp, iseek,
-     $                  u(1,0,jio,kio,cio),
-     $                  count, MPI_DOUBLE_PRECISION,
-     $                  mstatus, ierr)
-
-                  if (ierr .ne. MPI_SUCCESS) then
-                      print *, 'Error reading back file'
-                      call MPI_File_close(fp, ierr)
-                      stop
-                  endif
-              enddo
-          enddo
-        enddo
-
-        if (node .eq. root) print *, 'Reading data set ', ii+ichunk
-
-        call error_norm(xce_single)
-        do m = 1, 5
-           xce_sub(m) = xce_sub(m) + xce_single(m)
-        end do
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine btio_cleanup
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer ierr
-
-      call MPI_File_close(fp, ierr)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine accumulate_norms(xce_acc)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      double precision xce_acc(5)
-      integer m, ierr
-
-      if (rd_interval .gt. 0) goto 20
-
-      call MPI_File_open(comm_solve,
-     $          filenm,
-     $          MPI_MODE_RDONLY,
-     $          MPI_INFO_NULL,
-     $          fp,
-     $          ierr)
-
-      iseek = 0
-      call MPI_File_set_view(fp,
-     $          iseek, MPI_DOUBLE_PRECISION, MPI_DOUBLE_PRECISION,
-     $          'native', MPI_INFO_NULL, ierr)
-
-c     clear the last time step
-
-      call clear_timestep
-
-c     read back the time steps and accumulate norms
-
-      call acc_sub_norms(idump)
-
-      call MPI_File_close(fp, ierr)
-
- 20   continue
-      do m = 1, 5
-         xce_acc(m) = xce_sub(m) / dble(idump)
-      end do
-
-      return
-      end
-
diff --git a/examples/smpi/NAS/BT/solve_subs.f b/examples/smpi/NAS/BT/solve_subs.f
deleted file mode 100644
index 351489a313..0000000000
--- a/examples/smpi/NAS/BT/solve_subs.f
+++ /dev/null
@@ -1,642 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine matvec_sub(ablock,avec,bvec)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     subtracts bvec=bvec - ablock*avec
-c---------------------------------------------------------------------
-
-      implicit none
-
-      double precision ablock,avec,bvec
-      dimension ablock(5,5),avec(5),bvec(5)
-
-c---------------------------------------------------------------------
-c            rhs(i,ic,jc,kc,ccell) = rhs(i,ic,jc,kc,ccell) 
-c     $           - lhs(i,1,ablock,ia,ja,ka,acell)*
-c---------------------------------------------------------------------
-         bvec(1) = bvec(1) - ablock(1,1)*avec(1)
-     >                     - ablock(1,2)*avec(2)
-     >                     - ablock(1,3)*avec(3)
-     >                     - ablock(1,4)*avec(4)
-     >                     - ablock(1,5)*avec(5)
-         bvec(2) = bvec(2) - ablock(2,1)*avec(1)
-     >                     - ablock(2,2)*avec(2)
-     >                     - ablock(2,3)*avec(3)
-     >                     - ablock(2,4)*avec(4)
-     >                     - ablock(2,5)*avec(5)
-         bvec(3) = bvec(3) - ablock(3,1)*avec(1)
-     >                     - ablock(3,2)*avec(2)
-     >                     - ablock(3,3)*avec(3)
-     >                     - ablock(3,4)*avec(4)
-     >                     - ablock(3,5)*avec(5)
-         bvec(4) = bvec(4) - ablock(4,1)*avec(1)
-     >                     - ablock(4,2)*avec(2)
-     >                     - ablock(4,3)*avec(3)
-     >                     - ablock(4,4)*avec(4)
-     >                     - ablock(4,5)*avec(5)
-         bvec(5) = bvec(5) - ablock(5,1)*avec(1)
-     >                     - ablock(5,2)*avec(2)
-     >                     - ablock(5,3)*avec(3)
-     >                     - ablock(5,4)*avec(4)
-     >                     - ablock(5,5)*avec(5)
-
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine matmul_sub(ablock, bblock, cblock)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     subtracts a(i,j,k) X b(i,j,k) from c(i,j,k)
-c---------------------------------------------------------------------
-
-      implicit none
-
-      double precision ablock, bblock, cblock
-      dimension ablock(5,5), bblock(5,5), cblock(5,5)
-
-
-         cblock(1,1) = cblock(1,1) - ablock(1,1)*bblock(1,1)
-     >                             - ablock(1,2)*bblock(2,1)
-     >                             - ablock(1,3)*bblock(3,1)
-     >                             - ablock(1,4)*bblock(4,1)
-     >                             - ablock(1,5)*bblock(5,1)
-         cblock(2,1) = cblock(2,1) - ablock(2,1)*bblock(1,1)
-     >                             - ablock(2,2)*bblock(2,1)
-     >                             - ablock(2,3)*bblock(3,1)
-     >                             - ablock(2,4)*bblock(4,1)
-     >                             - ablock(2,5)*bblock(5,1)
-         cblock(3,1) = cblock(3,1) - ablock(3,1)*bblock(1,1)
-     >                             - ablock(3,2)*bblock(2,1)
-     >                             - ablock(3,3)*bblock(3,1)
-     >                             - ablock(3,4)*bblock(4,1)
-     >                             - ablock(3,5)*bblock(5,1)
-         cblock(4,1) = cblock(4,1) - ablock(4,1)*bblock(1,1)
-     >                             - ablock(4,2)*bblock(2,1)
-     >                             - ablock(4,3)*bblock(3,1)
-     >                             - ablock(4,4)*bblock(4,1)
-     >                             - ablock(4,5)*bblock(5,1)
-         cblock(5,1) = cblock(5,1) - ablock(5,1)*bblock(1,1)
-     >                             - ablock(5,2)*bblock(2,1)
-     >                             - ablock(5,3)*bblock(3,1)
-     >                             - ablock(5,4)*bblock(4,1)
-     >                             - ablock(5,5)*bblock(5,1)
-         cblock(1,2) = cblock(1,2) - ablock(1,1)*bblock(1,2)
-     >                             - ablock(1,2)*bblock(2,2)
-     >                             - ablock(1,3)*bblock(3,2)
-     >                             - ablock(1,4)*bblock(4,2)
-     >                             - ablock(1,5)*bblock(5,2)
-         cblock(2,2) = cblock(2,2) - ablock(2,1)*bblock(1,2)
-     >                             - ablock(2,2)*bblock(2,2)
-     >                             - ablock(2,3)*bblock(3,2)
-     >                             - ablock(2,4)*bblock(4,2)
-     >                             - ablock(2,5)*bblock(5,2)
-         cblock(3,2) = cblock(3,2) - ablock(3,1)*bblock(1,2)
-     >                             - ablock(3,2)*bblock(2,2)
-     >                             - ablock(3,3)*bblock(3,2)
-     >                             - ablock(3,4)*bblock(4,2)
-     >                             - ablock(3,5)*bblock(5,2)
-         cblock(4,2) = cblock(4,2) - ablock(4,1)*bblock(1,2)
-     >                             - ablock(4,2)*bblock(2,2)
-     >                             - ablock(4,3)*bblock(3,2)
-     >                             - ablock(4,4)*bblock(4,2)
-     >                             - ablock(4,5)*bblock(5,2)
-         cblock(5,2) = cblock(5,2) - ablock(5,1)*bblock(1,2)
-     >                             - ablock(5,2)*bblock(2,2)
-     >                             - ablock(5,3)*bblock(3,2)
-     >                             - ablock(5,4)*bblock(4,2)
-     >                             - ablock(5,5)*bblock(5,2)
-         cblock(1,3) = cblock(1,3) - ablock(1,1)*bblock(1,3)
-     >                             - ablock(1,2)*bblock(2,3)
-     >                             - ablock(1,3)*bblock(3,3)
-     >                             - ablock(1,4)*bblock(4,3)
-     >                             - ablock(1,5)*bblock(5,3)
-         cblock(2,3) = cblock(2,3) - ablock(2,1)*bblock(1,3)
-     >                             - ablock(2,2)*bblock(2,3)
-     >                             - ablock(2,3)*bblock(3,3)
-     >                             - ablock(2,4)*bblock(4,3)
-     >                             - ablock(2,5)*bblock(5,3)
-         cblock(3,3) = cblock(3,3) - ablock(3,1)*bblock(1,3)
-     >                             - ablock(3,2)*bblock(2,3)
-     >                             - ablock(3,3)*bblock(3,3)
-     >                             - ablock(3,4)*bblock(4,3)
-     >                             - ablock(3,5)*bblock(5,3)
-         cblock(4,3) = cblock(4,3) - ablock(4,1)*bblock(1,3)
-     >                             - ablock(4,2)*bblock(2,3)
-     >                             - ablock(4,3)*bblock(3,3)
-     >                             - ablock(4,4)*bblock(4,3)
-     >                             - ablock(4,5)*bblock(5,3)
-         cblock(5,3) = cblock(5,3) - ablock(5,1)*bblock(1,3)
-     >                             - ablock(5,2)*bblock(2,3)
-     >                             - ablock(5,3)*bblock(3,3)
-     >                             - ablock(5,4)*bblock(4,3)
-     >                             - ablock(5,5)*bblock(5,3)
-         cblock(1,4) = cblock(1,4) - ablock(1,1)*bblock(1,4)
-     >                             - ablock(1,2)*bblock(2,4)
-     >                             - ablock(1,3)*bblock(3,4)
-     >                             - ablock(1,4)*bblock(4,4)
-     >                             - ablock(1,5)*bblock(5,4)
-         cblock(2,4) = cblock(2,4) - ablock(2,1)*bblock(1,4)
-     >                             - ablock(2,2)*bblock(2,4)
-     >                             - ablock(2,3)*bblock(3,4)
-     >                             - ablock(2,4)*bblock(4,4)
-     >                             - ablock(2,5)*bblock(5,4)
-         cblock(3,4) = cblock(3,4) - ablock(3,1)*bblock(1,4)
-     >                             - ablock(3,2)*bblock(2,4)
-     >                             - ablock(3,3)*bblock(3,4)
-     >                             - ablock(3,4)*bblock(4,4)
-     >                             - ablock(3,5)*bblock(5,4)
-         cblock(4,4) = cblock(4,4) - ablock(4,1)*bblock(1,4)
-     >                             - ablock(4,2)*bblock(2,4)
-     >                             - ablock(4,3)*bblock(3,4)
-     >                             - ablock(4,4)*bblock(4,4)
-     >                             - ablock(4,5)*bblock(5,4)
-         cblock(5,4) = cblock(5,4) - ablock(5,1)*bblock(1,4)
-     >                             - ablock(5,2)*bblock(2,4)
-     >                             - ablock(5,3)*bblock(3,4)
-     >                             - ablock(5,4)*bblock(4,4)
-     >                             - ablock(5,5)*bblock(5,4)
-         cblock(1,5) = cblock(1,5) - ablock(1,1)*bblock(1,5)
-     >                             - ablock(1,2)*bblock(2,5)
-     >                             - ablock(1,3)*bblock(3,5)
-     >                             - ablock(1,4)*bblock(4,5)
-     >                             - ablock(1,5)*bblock(5,5)
-         cblock(2,5) = cblock(2,5) - ablock(2,1)*bblock(1,5)
-     >                             - ablock(2,2)*bblock(2,5)
-     >                             - ablock(2,3)*bblock(3,5)
-     >                             - ablock(2,4)*bblock(4,5)
-     >                             - ablock(2,5)*bblock(5,5)
-         cblock(3,5) = cblock(3,5) - ablock(3,1)*bblock(1,5)
-     >                             - ablock(3,2)*bblock(2,5)
-     >                             - ablock(3,3)*bblock(3,5)
-     >                             - ablock(3,4)*bblock(4,5)
-     >                             - ablock(3,5)*bblock(5,5)
-         cblock(4,5) = cblock(4,5) - ablock(4,1)*bblock(1,5)
-     >                             - ablock(4,2)*bblock(2,5)
-     >                             - ablock(4,3)*bblock(3,5)
-     >                             - ablock(4,4)*bblock(4,5)
-     >                             - ablock(4,5)*bblock(5,5)
-         cblock(5,5) = cblock(5,5) - ablock(5,1)*bblock(1,5)
-     >                             - ablock(5,2)*bblock(2,5)
-     >                             - ablock(5,3)*bblock(3,5)
-     >                             - ablock(5,4)*bblock(4,5)
-     >                             - ablock(5,5)*bblock(5,5)
-
-              
-      return
-      end
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine binvcrhs( lhs,c,r )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     
-c---------------------------------------------------------------------
-
-      implicit none
-
-      double precision pivot, coeff, lhs
-      dimension lhs(5,5)
-      double precision c(5,5), r(5)
-
-c---------------------------------------------------------------------
-c     
-c---------------------------------------------------------------------
-
-      pivot = 1.00d0/lhs(1,1)
-      lhs(1,2) = lhs(1,2)*pivot
-      lhs(1,3) = lhs(1,3)*pivot
-      lhs(1,4) = lhs(1,4)*pivot
-      lhs(1,5) = lhs(1,5)*pivot
-      c(1,1) = c(1,1)*pivot
-      c(1,2) = c(1,2)*pivot
-      c(1,3) = c(1,3)*pivot
-      c(1,4) = c(1,4)*pivot
-      c(1,5) = c(1,5)*pivot
-      r(1)   = r(1)  *pivot
-
-      coeff = lhs(2,1)
-      lhs(2,2)= lhs(2,2) - coeff*lhs(1,2)
-      lhs(2,3)= lhs(2,3) - coeff*lhs(1,3)
-      lhs(2,4)= lhs(2,4) - coeff*lhs(1,4)
-      lhs(2,5)= lhs(2,5) - coeff*lhs(1,5)
-      c(2,1) = c(2,1) - coeff*c(1,1)
-      c(2,2) = c(2,2) - coeff*c(1,2)
-      c(2,3) = c(2,3) - coeff*c(1,3)
-      c(2,4) = c(2,4) - coeff*c(1,4)
-      c(2,5) = c(2,5) - coeff*c(1,5)
-      r(2)   = r(2)   - coeff*r(1)
-
-      coeff = lhs(3,1)
-      lhs(3,2)= lhs(3,2) - coeff*lhs(1,2)
-      lhs(3,3)= lhs(3,3) - coeff*lhs(1,3)
-      lhs(3,4)= lhs(3,4) - coeff*lhs(1,4)
-      lhs(3,5)= lhs(3,5) - coeff*lhs(1,5)
-      c(3,1) = c(3,1) - coeff*c(1,1)
-      c(3,2) = c(3,2) - coeff*c(1,2)
-      c(3,3) = c(3,3) - coeff*c(1,3)
-      c(3,4) = c(3,4) - coeff*c(1,4)
-      c(3,5) = c(3,5) - coeff*c(1,5)
-      r(3)   = r(3)   - coeff*r(1)
-
-      coeff = lhs(4,1)
-      lhs(4,2)= lhs(4,2) - coeff*lhs(1,2)
-      lhs(4,3)= lhs(4,3) - coeff*lhs(1,3)
-      lhs(4,4)= lhs(4,4) - coeff*lhs(1,4)
-      lhs(4,5)= lhs(4,5) - coeff*lhs(1,5)
-      c(4,1) = c(4,1) - coeff*c(1,1)
-      c(4,2) = c(4,2) - coeff*c(1,2)
-      c(4,3) = c(4,3) - coeff*c(1,3)
-      c(4,4) = c(4,4) - coeff*c(1,4)
-      c(4,5) = c(4,5) - coeff*c(1,5)
-      r(4)   = r(4)   - coeff*r(1)
-
-      coeff = lhs(5,1)
-      lhs(5,2)= lhs(5,2) - coeff*lhs(1,2)
-      lhs(5,3)= lhs(5,3) - coeff*lhs(1,3)
-      lhs(5,4)= lhs(5,4) - coeff*lhs(1,4)
-      lhs(5,5)= lhs(5,5) - coeff*lhs(1,5)
-      c(5,1) = c(5,1) - coeff*c(1,1)
-      c(5,2) = c(5,2) - coeff*c(1,2)
-      c(5,3) = c(5,3) - coeff*c(1,3)
-      c(5,4) = c(5,4) - coeff*c(1,4)
-      c(5,5) = c(5,5) - coeff*c(1,5)
-      r(5)   = r(5)   - coeff*r(1)
-
-
-      pivot = 1.00d0/lhs(2,2)
-      lhs(2,3) = lhs(2,3)*pivot
-      lhs(2,4) = lhs(2,4)*pivot
-      lhs(2,5) = lhs(2,5)*pivot
-      c(2,1) = c(2,1)*pivot
-      c(2,2) = c(2,2)*pivot
-      c(2,3) = c(2,3)*pivot
-      c(2,4) = c(2,4)*pivot
-      c(2,5) = c(2,5)*pivot
-      r(2)   = r(2)  *pivot
-
-      coeff = lhs(1,2)
-      lhs(1,3)= lhs(1,3) - coeff*lhs(2,3)
-      lhs(1,4)= lhs(1,4) - coeff*lhs(2,4)
-      lhs(1,5)= lhs(1,5) - coeff*lhs(2,5)
-      c(1,1) = c(1,1) - coeff*c(2,1)
-      c(1,2) = c(1,2) - coeff*c(2,2)
-      c(1,3) = c(1,3) - coeff*c(2,3)
-      c(1,4) = c(1,4) - coeff*c(2,4)
-      c(1,5) = c(1,5) - coeff*c(2,5)
-      r(1)   = r(1)   - coeff*r(2)
-
-      coeff = lhs(3,2)
-      lhs(3,3)= lhs(3,3) - coeff*lhs(2,3)
-      lhs(3,4)= lhs(3,4) - coeff*lhs(2,4)
-      lhs(3,5)= lhs(3,5) - coeff*lhs(2,5)
-      c(3,1) = c(3,1) - coeff*c(2,1)
-      c(3,2) = c(3,2) - coeff*c(2,2)
-      c(3,3) = c(3,3) - coeff*c(2,3)
-      c(3,4) = c(3,4) - coeff*c(2,4)
-      c(3,5) = c(3,5) - coeff*c(2,5)
-      r(3)   = r(3)   - coeff*r(2)
-
-      coeff = lhs(4,2)
-      lhs(4,3)= lhs(4,3) - coeff*lhs(2,3)
-      lhs(4,4)= lhs(4,4) - coeff*lhs(2,4)
-      lhs(4,5)= lhs(4,5) - coeff*lhs(2,5)
-      c(4,1) = c(4,1) - coeff*c(2,1)
-      c(4,2) = c(4,2) - coeff*c(2,2)
-      c(4,3) = c(4,3) - coeff*c(2,3)
-      c(4,4) = c(4,4) - coeff*c(2,4)
-      c(4,5) = c(4,5) - coeff*c(2,5)
-      r(4)   = r(4)   - coeff*r(2)
-
-      coeff = lhs(5,2)
-      lhs(5,3)= lhs(5,3) - coeff*lhs(2,3)
-      lhs(5,4)= lhs(5,4) - coeff*lhs(2,4)
-      lhs(5,5)= lhs(5,5) - coeff*lhs(2,5)
-      c(5,1) = c(5,1) - coeff*c(2,1)
-      c(5,2) = c(5,2) - coeff*c(2,2)
-      c(5,3) = c(5,3) - coeff*c(2,3)
-      c(5,4) = c(5,4) - coeff*c(2,4)
-      c(5,5) = c(5,5) - coeff*c(2,5)
-      r(5)   = r(5)   - coeff*r(2)
-
-
-      pivot = 1.00d0/lhs(3,3)
-      lhs(3,4) = lhs(3,4)*pivot
-      lhs(3,5) = lhs(3,5)*pivot
-      c(3,1) = c(3,1)*pivot
-      c(3,2) = c(3,2)*pivot
-      c(3,3) = c(3,3)*pivot
-      c(3,4) = c(3,4)*pivot
-      c(3,5) = c(3,5)*pivot
-      r(3)   = r(3)  *pivot
-
-      coeff = lhs(1,3)
-      lhs(1,4)= lhs(1,4) - coeff*lhs(3,4)
-      lhs(1,5)= lhs(1,5) - coeff*lhs(3,5)
-      c(1,1) = c(1,1) - coeff*c(3,1)
-      c(1,2) = c(1,2) - coeff*c(3,2)
-      c(1,3) = c(1,3) - coeff*c(3,3)
-      c(1,4) = c(1,4) - coeff*c(3,4)
-      c(1,5) = c(1,5) - coeff*c(3,5)
-      r(1)   = r(1)   - coeff*r(3)
-
-      coeff = lhs(2,3)
-      lhs(2,4)= lhs(2,4) - coeff*lhs(3,4)
-      lhs(2,5)= lhs(2,5) - coeff*lhs(3,5)
-      c(2,1) = c(2,1) - coeff*c(3,1)
-      c(2,2) = c(2,2) - coeff*c(3,2)
-      c(2,3) = c(2,3) - coeff*c(3,3)
-      c(2,4) = c(2,4) - coeff*c(3,4)
-      c(2,5) = c(2,5) - coeff*c(3,5)
-      r(2)   = r(2)   - coeff*r(3)
-
-      coeff = lhs(4,3)
-      lhs(4,4)= lhs(4,4) - coeff*lhs(3,4)
-      lhs(4,5)= lhs(4,5) - coeff*lhs(3,5)
-      c(4,1) = c(4,1) - coeff*c(3,1)
-      c(4,2) = c(4,2) - coeff*c(3,2)
-      c(4,3) = c(4,3) - coeff*c(3,3)
-      c(4,4) = c(4,4) - coeff*c(3,4)
-      c(4,5) = c(4,5) - coeff*c(3,5)
-      r(4)   = r(4)   - coeff*r(3)
-
-      coeff = lhs(5,3)
-      lhs(5,4)= lhs(5,4) - coeff*lhs(3,4)
-      lhs(5,5)= lhs(5,5) - coeff*lhs(3,5)
-      c(5,1) = c(5,1) - coeff*c(3,1)
-      c(5,2) = c(5,2) - coeff*c(3,2)
-      c(5,3) = c(5,3) - coeff*c(3,3)
-      c(5,4) = c(5,4) - coeff*c(3,4)
-      c(5,5) = c(5,5) - coeff*c(3,5)
-      r(5)   = r(5)   - coeff*r(3)
-
-
-      pivot = 1.00d0/lhs(4,4)
-      lhs(4,5) = lhs(4,5)*pivot
-      c(4,1) = c(4,1)*pivot
-      c(4,2) = c(4,2)*pivot
-      c(4,3) = c(4,3)*pivot
-      c(4,4) = c(4,4)*pivot
-      c(4,5) = c(4,5)*pivot
-      r(4)   = r(4)  *pivot
-
-      coeff = lhs(1,4)
-      lhs(1,5)= lhs(1,5) - coeff*lhs(4,5)
-      c(1,1) = c(1,1) - coeff*c(4,1)
-      c(1,2) = c(1,2) - coeff*c(4,2)
-      c(1,3) = c(1,3) - coeff*c(4,3)
-      c(1,4) = c(1,4) - coeff*c(4,4)
-      c(1,5) = c(1,5) - coeff*c(4,5)
-      r(1)   = r(1)   - coeff*r(4)
-
-      coeff = lhs(2,4)
-      lhs(2,5)= lhs(2,5) - coeff*lhs(4,5)
-      c(2,1) = c(2,1) - coeff*c(4,1)
-      c(2,2) = c(2,2) - coeff*c(4,2)
-      c(2,3) = c(2,3) - coeff*c(4,3)
-      c(2,4) = c(2,4) - coeff*c(4,4)
-      c(2,5) = c(2,5) - coeff*c(4,5)
-      r(2)   = r(2)   - coeff*r(4)
-
-      coeff = lhs(3,4)
-      lhs(3,5)= lhs(3,5) - coeff*lhs(4,5)
-      c(3,1) = c(3,1) - coeff*c(4,1)
-      c(3,2) = c(3,2) - coeff*c(4,2)
-      c(3,3) = c(3,3) - coeff*c(4,3)
-      c(3,4) = c(3,4) - coeff*c(4,4)
-      c(3,5) = c(3,5) - coeff*c(4,5)
-      r(3)   = r(3)   - coeff*r(4)
-
-      coeff = lhs(5,4)
-      lhs(5,5)= lhs(5,5) - coeff*lhs(4,5)
-      c(5,1) = c(5,1) - coeff*c(4,1)
-      c(5,2) = c(5,2) - coeff*c(4,2)
-      c(5,3) = c(5,3) - coeff*c(4,3)
-      c(5,4) = c(5,4) - coeff*c(4,4)
-      c(5,5) = c(5,5) - coeff*c(4,5)
-      r(5)   = r(5)   - coeff*r(4)
-
-
-      pivot = 1.00d0/lhs(5,5)
-      c(5,1) = c(5,1)*pivot
-      c(5,2) = c(5,2)*pivot
-      c(5,3) = c(5,3)*pivot
-      c(5,4) = c(5,4)*pivot
-      c(5,5) = c(5,5)*pivot
-      r(5)   = r(5)  *pivot
-
-      coeff = lhs(1,5)
-      c(1,1) = c(1,1) - coeff*c(5,1)
-      c(1,2) = c(1,2) - coeff*c(5,2)
-      c(1,3) = c(1,3) - coeff*c(5,3)
-      c(1,4) = c(1,4) - coeff*c(5,4)
-      c(1,5) = c(1,5) - coeff*c(5,5)
-      r(1)   = r(1)   - coeff*r(5)
-
-      coeff = lhs(2,5)
-      c(2,1) = c(2,1) - coeff*c(5,1)
-      c(2,2) = c(2,2) - coeff*c(5,2)
-      c(2,3) = c(2,3) - coeff*c(5,3)
-      c(2,4) = c(2,4) - coeff*c(5,4)
-      c(2,5) = c(2,5) - coeff*c(5,5)
-      r(2)   = r(2)   - coeff*r(5)
-
-      coeff = lhs(3,5)
-      c(3,1) = c(3,1) - coeff*c(5,1)
-      c(3,2) = c(3,2) - coeff*c(5,2)
-      c(3,3) = c(3,3) - coeff*c(5,3)
-      c(3,4) = c(3,4) - coeff*c(5,4)
-      c(3,5) = c(3,5) - coeff*c(5,5)
-      r(3)   = r(3)   - coeff*r(5)
-
-      coeff = lhs(4,5)
-      c(4,1) = c(4,1) - coeff*c(5,1)
-      c(4,2) = c(4,2) - coeff*c(5,2)
-      c(4,3) = c(4,3) - coeff*c(5,3)
-      c(4,4) = c(4,4) - coeff*c(5,4)
-      c(4,5) = c(4,5) - coeff*c(5,5)
-      r(4)   = r(4)   - coeff*r(5)
-
-
-      return
-      end
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine binvrhs( lhs,r )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     
-c---------------------------------------------------------------------
-
-      implicit none
-
-      double precision pivot, coeff, lhs
-      dimension lhs(5,5)
-      double precision r(5)
-
-c---------------------------------------------------------------------
-c     
-c---------------------------------------------------------------------
-
-
-      pivot = 1.00d0/lhs(1,1)
-      lhs(1,2) = lhs(1,2)*pivot
-      lhs(1,3) = lhs(1,3)*pivot
-      lhs(1,4) = lhs(1,4)*pivot
-      lhs(1,5) = lhs(1,5)*pivot
-      r(1)   = r(1)  *pivot
-
-      coeff = lhs(2,1)
-      lhs(2,2)= lhs(2,2) - coeff*lhs(1,2)
-      lhs(2,3)= lhs(2,3) - coeff*lhs(1,3)
-      lhs(2,4)= lhs(2,4) - coeff*lhs(1,4)
-      lhs(2,5)= lhs(2,5) - coeff*lhs(1,5)
-      r(2)   = r(2)   - coeff*r(1)
-
-      coeff = lhs(3,1)
-      lhs(3,2)= lhs(3,2) - coeff*lhs(1,2)
-      lhs(3,3)= lhs(3,3) - coeff*lhs(1,3)
-      lhs(3,4)= lhs(3,4) - coeff*lhs(1,4)
-      lhs(3,5)= lhs(3,5) - coeff*lhs(1,5)
-      r(3)   = r(3)   - coeff*r(1)
-
-      coeff = lhs(4,1)
-      lhs(4,2)= lhs(4,2) - coeff*lhs(1,2)
-      lhs(4,3)= lhs(4,3) - coeff*lhs(1,3)
-      lhs(4,4)= lhs(4,4) - coeff*lhs(1,4)
-      lhs(4,5)= lhs(4,5) - coeff*lhs(1,5)
-      r(4)   = r(4)   - coeff*r(1)
-
-      coeff = lhs(5,1)
-      lhs(5,2)= lhs(5,2) - coeff*lhs(1,2)
-      lhs(5,3)= lhs(5,3) - coeff*lhs(1,3)
-      lhs(5,4)= lhs(5,4) - coeff*lhs(1,4)
-      lhs(5,5)= lhs(5,5) - coeff*lhs(1,5)
-      r(5)   = r(5)   - coeff*r(1)
-
-
-      pivot = 1.00d0/lhs(2,2)
-      lhs(2,3) = lhs(2,3)*pivot
-      lhs(2,4) = lhs(2,4)*pivot
-      lhs(2,5) = lhs(2,5)*pivot
-      r(2)   = r(2)  *pivot
-
-      coeff = lhs(1,2)
-      lhs(1,3)= lhs(1,3) - coeff*lhs(2,3)
-      lhs(1,4)= lhs(1,4) - coeff*lhs(2,4)
-      lhs(1,5)= lhs(1,5) - coeff*lhs(2,5)
-      r(1)   = r(1)   - coeff*r(2)
-
-      coeff = lhs(3,2)
-      lhs(3,3)= lhs(3,3) - coeff*lhs(2,3)
-      lhs(3,4)= lhs(3,4) - coeff*lhs(2,4)
-      lhs(3,5)= lhs(3,5) - coeff*lhs(2,5)
-      r(3)   = r(3)   - coeff*r(2)
-
-      coeff = lhs(4,2)
-      lhs(4,3)= lhs(4,3) - coeff*lhs(2,3)
-      lhs(4,4)= lhs(4,4) - coeff*lhs(2,4)
-      lhs(4,5)= lhs(4,5) - coeff*lhs(2,5)
-      r(4)   = r(4)   - coeff*r(2)
-
-      coeff = lhs(5,2)
-      lhs(5,3)= lhs(5,3) - coeff*lhs(2,3)
-      lhs(5,4)= lhs(5,4) - coeff*lhs(2,4)
-      lhs(5,5)= lhs(5,5) - coeff*lhs(2,5)
-      r(5)   = r(5)   - coeff*r(2)
-
-
-      pivot = 1.00d0/lhs(3,3)
-      lhs(3,4) = lhs(3,4)*pivot
-      lhs(3,5) = lhs(3,5)*pivot
-      r(3)   = r(3)  *pivot
-
-      coeff = lhs(1,3)
-      lhs(1,4)= lhs(1,4) - coeff*lhs(3,4)
-      lhs(1,5)= lhs(1,5) - coeff*lhs(3,5)
-      r(1)   = r(1)   - coeff*r(3)
-
-      coeff = lhs(2,3)
-      lhs(2,4)= lhs(2,4) - coeff*lhs(3,4)
-      lhs(2,5)= lhs(2,5) - coeff*lhs(3,5)
-      r(2)   = r(2)   - coeff*r(3)
-
-      coeff = lhs(4,3)
-      lhs(4,4)= lhs(4,4) - coeff*lhs(3,4)
-      lhs(4,5)= lhs(4,5) - coeff*lhs(3,5)
-      r(4)   = r(4)   - coeff*r(3)
-
-      coeff = lhs(5,3)
-      lhs(5,4)= lhs(5,4) - coeff*lhs(3,4)
-      lhs(5,5)= lhs(5,5) - coeff*lhs(3,5)
-      r(5)   = r(5)   - coeff*r(3)
-
-
-      pivot = 1.00d0/lhs(4,4)
-      lhs(4,5) = lhs(4,5)*pivot
-      r(4)   = r(4)  *pivot
-
-      coeff = lhs(1,4)
-      lhs(1,5)= lhs(1,5) - coeff*lhs(4,5)
-      r(1)   = r(1)   - coeff*r(4)
-
-      coeff = lhs(2,4)
-      lhs(2,5)= lhs(2,5) - coeff*lhs(4,5)
-      r(2)   = r(2)   - coeff*r(4)
-
-      coeff = lhs(3,4)
-      lhs(3,5)= lhs(3,5) - coeff*lhs(4,5)
-      r(3)   = r(3)   - coeff*r(4)
-
-      coeff = lhs(5,4)
-      lhs(5,5)= lhs(5,5) - coeff*lhs(4,5)
-      r(5)   = r(5)   - coeff*r(4)
-
-
-      pivot = 1.00d0/lhs(5,5)
-      r(5)   = r(5)  *pivot
-
-      coeff = lhs(1,5)
-      r(1)   = r(1)   - coeff*r(5)
-
-      coeff = lhs(2,5)
-      r(2)   = r(2)   - coeff*r(5)
-
-      coeff = lhs(3,5)
-      r(3)   = r(3)   - coeff*r(5)
-
-      coeff = lhs(4,5)
-      r(4)   = r(4)   - coeff*r(5)
-
-
-      return
-      end
-
-
-
diff --git a/examples/smpi/NAS/BT/verify.f b/examples/smpi/NAS/BT/verify.f
deleted file mode 100644
index 7dbc8a96a2..0000000000
--- a/examples/smpi/NAS/BT/verify.f
+++ /dev/null
@@ -1,435 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-        subroutine verify(no_time_steps, class, verified)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c  verification routine                         
-c---------------------------------------------------------------------
-
-        include 'header.h'
-        include 'mpinpb.h'
-
-        double precision xcrref(5),xceref(5),xcrdif(5),xcedif(5), 
-     >                   epsilon, xce(5), xcr(5), dtref
-        integer m, no_time_steps
-        character class
-        logical verified
-
-c---------------------------------------------------------------------
-c   tolerance level
-c---------------------------------------------------------------------
-        epsilon = 1.0d-08
-        verified = .true.
-
-c---------------------------------------------------------------------
-c   compute the error norm and the residual norm, and exit if not printing
-c---------------------------------------------------------------------
-
-        if (iotype .ne. 0) then
-           call accumulate_norms(xce)
-        else
-           call error_norm(xce)
-        endif
-
-        call copy_faces
-
-        call rhs_norm(xcr)
-
-        do m = 1, 5
-           xcr(m) = xcr(m) / dt
-        enddo
-
-        if (node .ne. 0) return
-
-        class = 'U'
-
-        do m = 1,5
-           xcrref(m) = 1.0
-           xceref(m) = 1.0
-        end do
-
-c---------------------------------------------------------------------
-c    reference data for 12X12X12 grids after 60 time steps, with DT = 1.0d-02
-c---------------------------------------------------------------------
-        if ( (grid_points(1)  .eq. 12     ) .and. 
-     >       (grid_points(2)  .eq. 12     ) .and.
-     >       (grid_points(3)  .eq. 12     ) .and.
-     >       (no_time_steps   .eq. 60    ))  then
-
-           class = 'S'
-           dtref = 1.0d-2
-
-c---------------------------------------------------------------------
-c  Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
-         xcrref(1) = 1.7034283709541311d-01
-         xcrref(2) = 1.2975252070034097d-02
-         xcrref(3) = 3.2527926989486055d-02
-         xcrref(4) = 2.6436421275166801d-02
-         xcrref(5) = 1.9211784131744430d-01
-
-c---------------------------------------------------------------------
-c  Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-
-         if (iotype .eq. 0) then
-           xceref(1) = 4.9976913345811579d-04
-           xceref(2) = 4.5195666782961927d-05
-           xceref(3) = 7.3973765172921357d-05
-           xceref(4) = 7.3821238632439731d-05
-           xceref(5) = 8.9269630987491446d-04
-         else
-           xceref(1) = 0.1149036328945d+02
-           xceref(2) = 0.9156788904727d+00
-           xceref(3) = 0.2857899428614d+01
-           xceref(4) = 0.2598273346734d+01
-           xceref(5) = 0.2652795397547d+02
-         endif
-
-c---------------------------------------------------------------------
-c    reference data for 24X24X24 grids after 200 time steps, with DT = 0.8d-3
-c---------------------------------------------------------------------
-        elseif ( (grid_points(1) .eq. 24) .and. 
-     >           (grid_points(2) .eq. 24) .and.
-     >           (grid_points(3) .eq. 24) .and.
-     >           (no_time_steps . eq. 200) ) then
-
-           class = 'W'
-           dtref = 0.8d-3
-c---------------------------------------------------------------------
-c  Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
-           xcrref(1) = 0.1125590409344d+03
-           xcrref(2) = 0.1180007595731d+02
-           xcrref(3) = 0.2710329767846d+02
-           xcrref(4) = 0.2469174937669d+02
-           xcrref(5) = 0.2638427874317d+03
-
-c---------------------------------------------------------------------
-c  Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-
-         if (iotype .eq. 0) then
-           xceref(1) = 0.4419655736008d+01
-           xceref(2) = 0.4638531260002d+00
-           xceref(3) = 0.1011551749967d+01
-           xceref(4) = 0.9235878729944d+00
-           xceref(5) = 0.1018045837718d+02
-         else
-           xceref(1) = 0.6729594398612d+02
-           xceref(2) = 0.5264523081690d+01
-           xceref(3) = 0.1677107142637d+02
-           xceref(4) = 0.1508721463436d+02
-           xceref(5) = 0.1477018363393d+03
-         endif
-
-
-c---------------------------------------------------------------------
-c    reference data for 64X64X64 grids after 200 time steps, with DT = 0.8d-3
-c---------------------------------------------------------------------
-        elseif ( (grid_points(1) .eq. 64) .and. 
-     >           (grid_points(2) .eq. 64) .and.
-     >           (grid_points(3) .eq. 64) .and.
-     >           (no_time_steps . eq. 200) ) then
-
-           class = 'A'
-           dtref = 0.8d-3
-c---------------------------------------------------------------------
-c  Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
-         xcrref(1) = 1.0806346714637264d+02
-         xcrref(2) = 1.1319730901220813d+01
-         xcrref(3) = 2.5974354511582465d+01
-         xcrref(4) = 2.3665622544678910d+01
-         xcrref(5) = 2.5278963211748344d+02
-
-c---------------------------------------------------------------------
-c  Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-
-         if (iotype .eq. 0) then
-           xceref(1) = 4.2348416040525025d+00
-           xceref(2) = 4.4390282496995698d-01
-           xceref(3) = 9.6692480136345650d-01
-           xceref(4) = 8.8302063039765474d-01
-           xceref(5) = 9.7379901770829278d+00
-         else
-           xceref(1) = 0.6482218724961d+02
-           xceref(2) = 0.5066461714527d+01
-           xceref(3) = 0.1613931961359d+02
-           xceref(4) = 0.1452010201481d+02
-           xceref(5) = 0.1420099377681d+03
-         endif
-
-c---------------------------------------------------------------------
-c    reference data for 102X102X102 grids after 200 time steps,
-c    with DT = 3.0d-04
-c---------------------------------------------------------------------
-        elseif ( (grid_points(1) .eq. 102) .and. 
-     >           (grid_points(2) .eq. 102) .and.
-     >           (grid_points(3) .eq. 102) .and.
-     >           (no_time_steps . eq. 200) ) then
-
-           class = 'B'
-           dtref = 3.0d-4
-
-c---------------------------------------------------------------------
-c  Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
-         xcrref(1) = 1.4233597229287254d+03
-         xcrref(2) = 9.9330522590150238d+01
-         xcrref(3) = 3.5646025644535285d+02
-         xcrref(4) = 3.2485447959084092d+02
-         xcrref(5) = 3.2707541254659363d+03
-
-c---------------------------------------------------------------------
-c  Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-
-         if (iotype .eq. 0) then
-           xceref(1) = 5.2969847140936856d+01
-           xceref(2) = 4.4632896115670668d+00
-           xceref(3) = 1.3122573342210174d+01
-           xceref(4) = 1.2006925323559144d+01
-           xceref(5) = 1.2459576151035986d+02
-         else
-           xceref(1) = 0.1477545106464d+03
-           xceref(2) = 0.1108895555053d+02
-           xceref(3) = 0.3698065590331d+02
-           xceref(4) = 0.3310505581440d+02
-           xceref(5) = 0.3157928282563d+03
-         endif
-
-c---------------------------------------------------------------------
-c    reference data for 162X162X162 grids after 200 time steps,
-c    with DT = 1.0d-04
-c---------------------------------------------------------------------
-        elseif ( (grid_points(1) .eq. 162) .and. 
-     >           (grid_points(2) .eq. 162) .and.
-     >           (grid_points(3) .eq. 162) .and.
-     >           (no_time_steps . eq. 200) ) then
-
-           class = 'C'
-           dtref = 1.0d-4
-
-c---------------------------------------------------------------------
-c  Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
-         xcrref(1) = 0.62398116551764615d+04
-         xcrref(2) = 0.50793239190423964d+03
-         xcrref(3) = 0.15423530093013596d+04
-         xcrref(4) = 0.13302387929291190d+04
-         xcrref(5) = 0.11604087428436455d+05
-
-c---------------------------------------------------------------------
-c  Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-
-         if (iotype .eq. 0) then
-           xceref(1) = 0.16462008369091265d+03
-           xceref(2) = 0.11497107903824313d+02
-           xceref(3) = 0.41207446207461508d+02
-           xceref(4) = 0.37087651059694167d+02
-           xceref(5) = 0.36211053051841265d+03
-         else
-           xceref(1) = 0.2597156483475d+03
-           xceref(2) = 0.1985384289495d+02
-           xceref(3) = 0.6517950485788d+02
-           xceref(4) = 0.5757235541520d+02
-           xceref(5) = 0.5215668188726d+03
-         endif 
-
-
-c---------------------------------------------------------------------
-c    reference data for 408x408x408 grids after 250 time steps,
-c    with DT = 0.2d-04
-c---------------------------------------------------------------------
-        elseif ( (grid_points(1) .eq. 408) .and. 
-     >           (grid_points(2) .eq. 408) .and.
-     >           (grid_points(3) .eq. 408) .and.
-     >           (no_time_steps . eq. 250) ) then
-
-           class = 'D'
-           dtref = 0.2d-4
-
-c---------------------------------------------------------------------
-c  Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
-         xcrref(1) = 0.2533188551738d+05
-         xcrref(2) = 0.2346393716980d+04
-         xcrref(3) = 0.6294554366904d+04
-         xcrref(4) = 0.5352565376030d+04
-         xcrref(5) = 0.3905864038618d+05
-
-c---------------------------------------------------------------------
-c  Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-
-         if (iotype .eq. 0) then
-           xceref(1) = 0.3100009377557d+03
-           xceref(2) = 0.2424086324913d+02
-           xceref(3) = 0.7782212022645d+02
-           xceref(4) = 0.6835623860116d+02
-           xceref(5) = 0.6065737200368d+03
-         else
-           xceref(1) = 0.3813781566713d+03
-           xceref(2) = 0.3160872966198d+02
-           xceref(3) = 0.9593576357290d+02
-           xceref(4) = 0.8363391989815d+02
-           xceref(5) = 0.7063466087423d+03
-         endif
-
-
-c---------------------------------------------------------------------
-c    reference data for 1020x1020x1020 grids after 250 time steps,
-c    with DT = 0.4d-05
-c---------------------------------------------------------------------
-        elseif ( (grid_points(1) .eq. 1020) .and. 
-     >           (grid_points(2) .eq. 1020) .and.
-     >           (grid_points(3) .eq. 1020) .and.
-     >           (no_time_steps . eq. 250) ) then
-
-           class = 'E'
-           dtref = 0.4d-5
-
-c---------------------------------------------------------------------
-c  Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
-         xcrref(1) = 0.9795372484517d+05
-         xcrref(2) = 0.9739814511521d+04
-         xcrref(3) = 0.2467606342965d+05
-         xcrref(4) = 0.2092419572860d+05
-         xcrref(5) = 0.1392138856939d+06
-
-c---------------------------------------------------------------------
-c  Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-
-         if (iotype .eq. 0) then
-           xceref(1) = 0.4327562208414d+03
-           xceref(2) = 0.3699051964887d+02
-           xceref(3) = 0.1089845040954d+03
-           xceref(4) = 0.9462517622043d+02
-           xceref(5) = 0.7765512765309d+03
-         else
-c  wr_interval = 5
-           xceref(1) = 0.4729898413058d+03
-           xceref(2) = 0.4145899331704d+02
-           xceref(3) = 0.1192850917138d+03
-           xceref(4) = 0.1032746026932d+03
-           xceref(5) = 0.8270322177634d+03
-c  wr_interval = 10
-c          xceref(1) = 0.4718135916251d+03
-c          xceref(2) = 0.4132620259096d+02
-c          xceref(3) = 0.1189831133503d+03
-c          xceref(4) = 0.1030212798803d+03
-c          xceref(5) = 0.8255924078458d+03
-        endif
-
-        else
-           verified = .false.
-        endif
-
-c---------------------------------------------------------------------
-c    verification test for residuals if gridsize is one of 
-c    the defined grid sizes above (class .ne. 'U')
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c    Compute the difference of solution values and the known reference 
-c    values.
-c---------------------------------------------------------------------
-        do m = 1, 5
-           
-           xcrdif(m) = dabs((xcr(m)-xcrref(m))/xcrref(m)) 
-           xcedif(m) = dabs((xce(m)-xceref(m))/xceref(m))
-           
-        enddo
-
-c---------------------------------------------------------------------
-c    Output the comparison of computed results to known cases.
-c---------------------------------------------------------------------
-
-        if (class .ne. 'U') then
-           write(*, 1990) class
- 1990      format(' Verification being performed for class ', a)
-           write (*,2000) epsilon
- 2000      format(' accuracy setting for epsilon = ', E20.13)
-           verified = (dabs(dt-dtref) .le. epsilon)
-           if (.not.verified) then  
-              verified = .false.
-              class = 'U'
-              write (*,1000) dtref
- 1000         format(' DT does not match the reference value of ', 
-     >                 E15.8)
-           endif
-        else 
-           write(*, 1995)
- 1995      format(' Unknown class')
-        endif
-
-
-        if (class .ne. 'U') then
-           write (*,2001) 
-        else
-           write (*, 2005)
-        endif
-
- 2001   format(' Comparison of RMS-norms of residual')
- 2005   format(' RMS-norms of residual')
-        do m = 1, 5
-           if (class .eq. 'U') then
-              write(*, 2015) m, xcr(m)
-           else if (xcrdif(m) .le. epsilon) then
-              write (*,2011) m,xcr(m),xcrref(m),xcrdif(m)
-           else 
-              verified = .false.
-              write (*,2010) m,xcr(m),xcrref(m),xcrdif(m)
-           endif
-        enddo
-
-        if (class .ne. 'U') then
-           write (*,2002)
-        else
-           write (*,2006)
-        endif
- 2002   format(' Comparison of RMS-norms of solution error')
- 2006   format(' RMS-norms of solution error')
-        
-        do m = 1, 5
-           if (class .eq. 'U') then
-              write(*, 2015) m, xce(m)
-           else if (xcedif(m) .le. epsilon) then
-              write (*,2011) m,xce(m),xceref(m),xcedif(m)
-           else
-              verified = .false.
-              write (*,2010) m,xce(m),xceref(m),xcedif(m)
-           endif
-        enddo
-        
- 2010   format(' FAILURE: ', i2, E20.13, E20.13, E20.13)
- 2011   format('          ', i2, E20.13, E20.13, E20.13)
- 2015   format('          ', i2, E20.13)
-        
-        if (class .eq. 'U') then
-           write(*, 2022)
-           write(*, 2023)
- 2022      format(' No reference values provided')
- 2023      format(' No verification performed')
-        else if (verified) then
-           write(*, 2020)
- 2020      format(' Verification Successful')
-        else
-           write(*, 2021)
- 2021      format(' Verification failed')
-        endif
-
-        return
-
-
-        end
diff --git a/examples/smpi/NAS/BT/work_lhs.h b/examples/smpi/NAS/BT/work_lhs.h
deleted file mode 100644
index d9bc9e4d66..0000000000
--- a/examples/smpi/NAS/BT/work_lhs.h
+++ /dev/null
@@ -1,14 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-c
-c  work_lhs.h
-c
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      double precision fjac(5, 5, -2:MAX_CELL_DIM+1),
-     >                 njac(5, 5, -2:MAX_CELL_DIM+1),
-     >                 lhsa(5, 5, -1:MAX_CELL_DIM),
-     >                 lhsb(5, 5, -1:MAX_CELL_DIM),
-     >                 tmp1, tmp2, tmp3
-      common /work_lhs/ fjac, njac, lhsa, lhsb, tmp1, tmp2, tmp3
diff --git a/examples/smpi/NAS/BT/work_lhs_vec.h b/examples/smpi/NAS/BT/work_lhs_vec.h
deleted file mode 100644
index a97054f419..0000000000
--- a/examples/smpi/NAS/BT/work_lhs_vec.h
+++ /dev/null
@@ -1,14 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-c
-c  work_lhs_vec.h
-c
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      double precision fjac(5, 5, -2:MAX_CELL_DIM+1, -2:MAX_CELL_DIM+1),
-     >                 njac(5, 5, -2:MAX_CELL_DIM+1, -2:MAX_CELL_DIM+1),
-     >                 lhsa(5, 5, -1:MAX_CELL_DIM,   -1:MAX_CELL_DIM),
-     >                 lhsb(5, 5, -1:MAX_CELL_DIM,   -1:MAX_CELL_DIM),
-     >                 tmp1, tmp2, tmp3
-      common /work_lhs/ fjac, njac, lhsa, lhsb, tmp1, tmp2, tmp3
diff --git a/examples/smpi/NAS/BT/x_solve.f b/examples/smpi/NAS/BT/x_solve.f
deleted file mode 100644
index 5386732616..0000000000
--- a/examples/smpi/NAS/BT/x_solve.f
+++ /dev/null
@@ -1,761 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine x_solve
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     
-c     Performs line solves in X direction by first factoring
-c     the block-tridiagonal matrix into an upper triangular matrix, 
-c     and then performing back substitution to solve for the unknow
-c     vectors of each line.  
-c     
-c     Make sure we treat elements zero to cell_size in the direction
-c     of the sweep.
-c     
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-      integer  c, istart, stage,
-     >     first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
-     >     isize,jsize,ksize,send_id
-
-      istart = 0
-
-c---------------------------------------------------------------------
-c     in our terminology stage is the number of the cell in the x-direction
-c     i.e. stage = 1 means the start of the line stage=ncells means end
-c---------------------------------------------------------------------
-      do stage = 1,ncells
-         c = slice(1,stage)
-         isize = cell_size(1,c) - 1
-         jsize = cell_size(2,c) - 1
-         ksize = cell_size(3,c) - 1
-         
-c---------------------------------------------------------------------
-c     set last-cell flag
-c---------------------------------------------------------------------
-         if (stage .eq. ncells) then
-            last = 1
-         else
-            last = 0
-         endif
-
-         if (stage .eq. 1) then
-c---------------------------------------------------------------------
-c     This is the first cell, so solve without receiving data
-c---------------------------------------------------------------------
-            first = 1
-c            call lhsx(c)
-            call x_solve_cell(first,last,c)
-         else
-c---------------------------------------------------------------------
-c     Not the first cell of this line, so receive info from
-c     processor working on preceeding cell
-c---------------------------------------------------------------------
-            first = 0
-            call x_receive_solve_info(recv_id,c)
-c---------------------------------------------------------------------
-c     overlap computations and communications
-c---------------------------------------------------------------------
-c            call lhsx(c)
-c---------------------------------------------------------------------
-c     wait for completion
-c---------------------------------------------------------------------
-            call mpi_wait(send_id,r_status,error)
-            call mpi_wait(recv_id,r_status,error)
-c---------------------------------------------------------------------
-c     install C'(istart) and rhs'(istart) to be used in this cell
-c---------------------------------------------------------------------
-            call x_unpack_solve_info(c)
-            call x_solve_cell(first,last,c)
-         endif
-
-         if (last .eq. 0) call x_send_solve_info(send_id,c)
-      enddo
-
-c---------------------------------------------------------------------
-c     now perform backsubstitution in reverse direction
-c---------------------------------------------------------------------
-      do stage = ncells, 1, -1
-         c = slice(1,stage)
-         first = 0
-         last = 0
-         if (stage .eq. 1) first = 1
-         if (stage .eq. ncells) then
-            last = 1
-c---------------------------------------------------------------------
-c     last cell, so perform back substitute without waiting
-c---------------------------------------------------------------------
-            call x_backsubstitute(first, last,c)
-         else
-            call x_receive_backsub_info(recv_id,c)
-            call mpi_wait(send_id,r_status,error)
-            call mpi_wait(recv_id,r_status,error)
-            call x_unpack_backsub_info(c)
-            call x_backsubstitute(first,last,c)
-         endif
-         if (first .eq. 0) call x_send_backsub_info(send_id,c)
-      enddo
-
-
-      return
-      end
-      
-      
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine x_unpack_solve_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     unpack C'(-1) and rhs'(-1) for
-c     all j and k
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      integer j,k,m,n,ptr,c,istart 
-
-      istart = 0
-      ptr = 0
-      do k=0,KMAX-1
-         do j=0,JMAX-1
-            do m=1,BLOCK_SIZE
-               do n=1,BLOCK_SIZE
-                  lhsc(m,n,istart-1,j,k,c) = out_buffer(ptr+n)
-               enddo
-               ptr = ptr+BLOCK_SIZE
-            enddo
-            do n=1,BLOCK_SIZE
-               rhs(n,istart-1,j,k,c) = out_buffer(ptr+n)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      
-      subroutine x_send_solve_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     pack up and send C'(iend) and rhs'(iend) for
-c     all j and k
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer j,k,m,n,isize,ptr,c,jp,kp
-      integer error,send_id,buffer_size 
-
-      isize = cell_size(1,c)-1
-      jp = cell_coord(2,c) - 1
-      kp = cell_coord(3,c) - 1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
-     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
-
-c---------------------------------------------------------------------
-c     pack up buffer
-c---------------------------------------------------------------------
-      ptr = 0
-      do k=0,KMAX-1
-         do j=0,JMAX-1
-            do m=1,BLOCK_SIZE
-               do n=1,BLOCK_SIZE
-                  in_buffer(ptr+n) = lhsc(m,n,isize,j,k,c)
-               enddo
-               ptr = ptr+BLOCK_SIZE
-            enddo
-            do n=1,BLOCK_SIZE
-               in_buffer(ptr+n) = rhs(n,isize,j,k,c)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-c---------------------------------------------------------------------
-c     send buffer 
-c---------------------------------------------------------------------
-      call mpi_isend(in_buffer, buffer_size,
-     >     dp_type, successor(1),
-     >     WEST+jp+kp*NCELLS, comm_solve,
-     >     send_id,error)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine x_send_backsub_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     pack up and send U(istart) for all j and k
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer j,k,n,ptr,c,istart,jp,kp
-      integer error,send_id,buffer_size
-
-c---------------------------------------------------------------------
-c     Send element 0 to previous processor
-c---------------------------------------------------------------------
-      istart = 0
-      jp = cell_coord(2,c)-1
-      kp = cell_coord(3,c)-1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
-      ptr = 0
-      do k=0,KMAX-1
-         do j=0,JMAX-1
-            do n=1,BLOCK_SIZE
-               in_buffer(ptr+n) = rhs(n,istart,j,k,c)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-      call mpi_isend(in_buffer, buffer_size,
-     >     dp_type, predecessor(1), 
-     >     EAST+jp+kp*NCELLS, comm_solve, 
-     >     send_id,error)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine x_unpack_backsub_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     unpack U(isize) for all j and k
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      integer j,k,n,ptr,c
-
-      ptr = 0
-      do k=0,KMAX-1
-         do j=0,JMAX-1
-            do n=1,BLOCK_SIZE
-               backsub_info(n,j,k,c) = out_buffer(ptr+n)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine x_receive_backsub_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     post mpi receives
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer error,recv_id,jp,kp,c,buffer_size
-      jp = cell_coord(2,c) - 1
-      kp = cell_coord(3,c) - 1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
-      call mpi_irecv(out_buffer, buffer_size,
-     >     dp_type, successor(1), 
-     >     EAST+jp+kp*NCELLS, comm_solve, 
-     >     recv_id, error)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine x_receive_solve_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     post mpi receives 
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer jp,kp,recv_id,error,c,buffer_size
-      jp = cell_coord(2,c) - 1
-      kp = cell_coord(3,c) - 1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
-     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
-      call mpi_irecv(out_buffer, buffer_size,
-     >     dp_type, predecessor(1), 
-     >     WEST+jp+kp*NCELLS,  comm_solve, 
-     >     recv_id, error)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      
-      subroutine x_backsubstitute(first, last, c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     back solve: if last cell, then generate U(isize)=rhs(isize)
-c     else assume U(isize) is loaded in un pack backsub_info
-c     so just use it
-c     after call u(istart) will be sent to next cell
-c---------------------------------------------------------------------
-
-      include 'header.h'
-
-      integer first, last, c, i, j, k
-      integer m,n,isize,jsize,ksize,istart
-      
-      istart = 0
-      isize = cell_size(1,c)-1
-      jsize = cell_size(2,c)-end(2,c)-1      
-      ksize = cell_size(3,c)-end(3,c)-1
-      if (last .eq. 0) then
-         do k=start(3,c),ksize
-            do j=start(2,c),jsize
-c---------------------------------------------------------------------
-c     U(isize) uses info from previous cell if not last cell
-c---------------------------------------------------------------------
-               do m=1,BLOCK_SIZE
-                  do n=1,BLOCK_SIZE
-                     rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) 
-     >                    - lhsc(m,n,isize,j,k,c)*
-     >                    backsub_info(n,j,k,c)
-c---------------------------------------------------------------------
-c     rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) 
-c     $                    - lhsc(m,n,isize,j,k,c)*rhs(n,isize+1,j,k,c)
-c---------------------------------------------------------------------
-                  enddo
-               enddo
-            enddo
-         enddo
-      endif
-      do k=start(3,c),ksize
-         do j=start(2,c),jsize
-            do i=isize-1,istart,-1
-               do m=1,BLOCK_SIZE
-                  do n=1,BLOCK_SIZE
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) 
-     >                    - lhsc(m,n,i,j,k,c)*rhs(n,i+1,j,k,c)
-                  enddo
-               enddo
-            enddo
-         enddo
-      enddo
-
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine x_solve_cell(first,last,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     performs guaussian elimination on this cell.
-c     
-c     assumes that unpacking routines for non-first cells 
-c     preload C' and rhs' from previous cell.
-c     
-c     assumed send happens outside this routine, but that
-c     c'(IMAX) and rhs'(IMAX) will be sent to next cell
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'work_lhs.h'
-
-      integer first,last,c
-      integer i,j,k,isize,ksize,jsize,istart
-
-      istart = 0
-      isize = cell_size(1,c)-1
-      jsize = cell_size(2,c)-end(2,c)-1
-      ksize = cell_size(3,c)-end(3,c)-1
-
-      call lhsabinit(lhsa, lhsb, isize)
-
-      do k=start(3,c),ksize 
-         do j=start(2,c),jsize
-
-c---------------------------------------------------------------------
-c     This function computes the left hand side in the xi-direction
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     determine a (labeled f) and n jacobians for cell c
-c---------------------------------------------------------------------
-            do i = start(1,c)-1, cell_size(1,c) - end(1,c)
-
-               tmp1 = rho_i(i,j,k,c)
-               tmp2 = tmp1 * tmp1
-               tmp3 = tmp1 * tmp2
-c---------------------------------------------------------------------
-c     
-c---------------------------------------------------------------------
-               fjac(1,1,i) = 0.0d+00
-               fjac(1,2,i) = 1.0d+00
-               fjac(1,3,i) = 0.0d+00
-               fjac(1,4,i) = 0.0d+00
-               fjac(1,5,i) = 0.0d+00
-
-               fjac(2,1,i) = -(u(2,i,j,k,c) * tmp2 * 
-     >              u(2,i,j,k,c))
-     >              + c2 * qs(i,j,k,c)
-               fjac(2,2,i) = ( 2.0d+00 - c2 )
-     >              * ( u(2,i,j,k,c) * tmp1 )
-               fjac(2,3,i) = - c2 * ( u(3,i,j,k,c) * tmp1 )
-               fjac(2,4,i) = - c2 * ( u(4,i,j,k,c) * tmp1 )
-               fjac(2,5,i) = c2
-
-               fjac(3,1,i) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) ) * tmp2
-               fjac(3,2,i) = u(3,i,j,k,c) * tmp1
-               fjac(3,3,i) = u(2,i,j,k,c) * tmp1
-               fjac(3,4,i) = 0.0d+00
-               fjac(3,5,i) = 0.0d+00
-
-               fjac(4,1,i) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) * tmp2
-               fjac(4,2,i) = u(4,i,j,k,c) * tmp1
-               fjac(4,3,i) = 0.0d+00
-               fjac(4,4,i) = u(2,i,j,k,c) * tmp1
-               fjac(4,5,i) = 0.0d+00
-
-               fjac(5,1,i) = ( c2 * 2.0d0 * qs(i,j,k,c)
-     >              - c1 * ( u(5,i,j,k,c) * tmp1 ) )
-     >              * ( u(2,i,j,k,c) * tmp1 )
-               fjac(5,2,i) = c1 *  u(5,i,j,k,c) * tmp1 
-     >              - c2
-     >              * ( u(2,i,j,k,c)*u(2,i,j,k,c) * tmp2
-     >              + qs(i,j,k,c) )
-               fjac(5,3,i) = - c2 * ( u(3,i,j,k,c)*u(2,i,j,k,c) )
-     >              * tmp2
-               fjac(5,4,i) = - c2 * ( u(4,i,j,k,c)*u(2,i,j,k,c) )
-     >              * tmp2
-               fjac(5,5,i) = c1 * ( u(2,i,j,k,c) * tmp1 )
-
-               njac(1,1,i) = 0.0d+00
-               njac(1,2,i) = 0.0d+00
-               njac(1,3,i) = 0.0d+00
-               njac(1,4,i) = 0.0d+00
-               njac(1,5,i) = 0.0d+00
-
-               njac(2,1,i) = - con43 * c3c4 * tmp2 * u(2,i,j,k,c)
-               njac(2,2,i) =   con43 * c3c4 * tmp1
-               njac(2,3,i) =   0.0d+00
-               njac(2,4,i) =   0.0d+00
-               njac(2,5,i) =   0.0d+00
-
-               njac(3,1,i) = - c3c4 * tmp2 * u(3,i,j,k,c)
-               njac(3,2,i) =   0.0d+00
-               njac(3,3,i) =   c3c4 * tmp1
-               njac(3,4,i) =   0.0d+00
-               njac(3,5,i) =   0.0d+00
-
-               njac(4,1,i) = - c3c4 * tmp2 * u(4,i,j,k,c)
-               njac(4,2,i) =   0.0d+00 
-               njac(4,3,i) =   0.0d+00
-               njac(4,4,i) =   c3c4 * tmp1
-               njac(4,5,i) =   0.0d+00
-
-               njac(5,1,i) = - ( con43 * c3c4
-     >              - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
-     >              - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
-     >              - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
-     >              - c1345 * tmp2 * u(5,i,j,k,c)
-
-               njac(5,2,i) = ( con43 * c3c4
-     >              - c1345 ) * tmp2 * u(2,i,j,k,c)
-               njac(5,3,i) = ( c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c)
-               njac(5,4,i) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c)
-               njac(5,5,i) = ( c1345 ) * tmp1
-
-            enddo
-c---------------------------------------------------------------------
-c     now jacobians set, so form left hand side in x direction
-c---------------------------------------------------------------------
-            do i = start(1,c), isize - end(1,c)
-
-               tmp1 = dt * tx1
-               tmp2 = dt * tx2
-
-               lhsa(1,1,i) = - tmp2 * fjac(1,1,i-1)
-     >              - tmp1 * njac(1,1,i-1)
-     >              - tmp1 * dx1 
-               lhsa(1,2,i) = - tmp2 * fjac(1,2,i-1)
-     >              - tmp1 * njac(1,2,i-1)
-               lhsa(1,3,i) = - tmp2 * fjac(1,3,i-1)
-     >              - tmp1 * njac(1,3,i-1)
-               lhsa(1,4,i) = - tmp2 * fjac(1,4,i-1)
-     >              - tmp1 * njac(1,4,i-1)
-               lhsa(1,5,i) = - tmp2 * fjac(1,5,i-1)
-     >              - tmp1 * njac(1,5,i-1)
-
-               lhsa(2,1,i) = - tmp2 * fjac(2,1,i-1)
-     >              - tmp1 * njac(2,1,i-1)
-               lhsa(2,2,i) = - tmp2 * fjac(2,2,i-1)
-     >              - tmp1 * njac(2,2,i-1)
-     >              - tmp1 * dx2
-               lhsa(2,3,i) = - tmp2 * fjac(2,3,i-1)
-     >              - tmp1 * njac(2,3,i-1)
-               lhsa(2,4,i) = - tmp2 * fjac(2,4,i-1)
-     >              - tmp1 * njac(2,4,i-1)
-               lhsa(2,5,i) = - tmp2 * fjac(2,5,i-1)
-     >              - tmp1 * njac(2,5,i-1)
-
-               lhsa(3,1,i) = - tmp2 * fjac(3,1,i-1)
-     >              - tmp1 * njac(3,1,i-1)
-               lhsa(3,2,i) = - tmp2 * fjac(3,2,i-1)
-     >              - tmp1 * njac(3,2,i-1)
-               lhsa(3,3,i) = - tmp2 * fjac(3,3,i-1)
-     >              - tmp1 * njac(3,3,i-1)
-     >              - tmp1 * dx3 
-               lhsa(3,4,i) = - tmp2 * fjac(3,4,i-1)
-     >              - tmp1 * njac(3,4,i-1)
-               lhsa(3,5,i) = - tmp2 * fjac(3,5,i-1)
-     >              - tmp1 * njac(3,5,i-1)
-
-               lhsa(4,1,i) = - tmp2 * fjac(4,1,i-1)
-     >              - tmp1 * njac(4,1,i-1)
-               lhsa(4,2,i) = - tmp2 * fjac(4,2,i-1)
-     >              - tmp1 * njac(4,2,i-1)
-               lhsa(4,3,i) = - tmp2 * fjac(4,3,i-1)
-     >              - tmp1 * njac(4,3,i-1)
-               lhsa(4,4,i) = - tmp2 * fjac(4,4,i-1)
-     >              - tmp1 * njac(4,4,i-1)
-     >              - tmp1 * dx4
-               lhsa(4,5,i) = - tmp2 * fjac(4,5,i-1)
-     >              - tmp1 * njac(4,5,i-1)
-
-               lhsa(5,1,i) = - tmp2 * fjac(5,1,i-1)
-     >              - tmp1 * njac(5,1,i-1)
-               lhsa(5,2,i) = - tmp2 * fjac(5,2,i-1)
-     >              - tmp1 * njac(5,2,i-1)
-               lhsa(5,3,i) = - tmp2 * fjac(5,3,i-1)
-     >              - tmp1 * njac(5,3,i-1)
-               lhsa(5,4,i) = - tmp2 * fjac(5,4,i-1)
-     >              - tmp1 * njac(5,4,i-1)
-               lhsa(5,5,i) = - tmp2 * fjac(5,5,i-1)
-     >              - tmp1 * njac(5,5,i-1)
-     >              - tmp1 * dx5
-
-               lhsb(1,1,i) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(1,1,i)
-     >              + tmp1 * 2.0d+00 * dx1
-               lhsb(1,2,i) = tmp1 * 2.0d+00 * njac(1,2,i)
-               lhsb(1,3,i) = tmp1 * 2.0d+00 * njac(1,3,i)
-               lhsb(1,4,i) = tmp1 * 2.0d+00 * njac(1,4,i)
-               lhsb(1,5,i) = tmp1 * 2.0d+00 * njac(1,5,i)
-
-               lhsb(2,1,i) = tmp1 * 2.0d+00 * njac(2,1,i)
-               lhsb(2,2,i) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(2,2,i)
-     >              + tmp1 * 2.0d+00 * dx2
-               lhsb(2,3,i) = tmp1 * 2.0d+00 * njac(2,3,i)
-               lhsb(2,4,i) = tmp1 * 2.0d+00 * njac(2,4,i)
-               lhsb(2,5,i) = tmp1 * 2.0d+00 * njac(2,5,i)
-
-               lhsb(3,1,i) = tmp1 * 2.0d+00 * njac(3,1,i)
-               lhsb(3,2,i) = tmp1 * 2.0d+00 * njac(3,2,i)
-               lhsb(3,3,i) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(3,3,i)
-     >              + tmp1 * 2.0d+00 * dx3
-               lhsb(3,4,i) = tmp1 * 2.0d+00 * njac(3,4,i)
-               lhsb(3,5,i) = tmp1 * 2.0d+00 * njac(3,5,i)
-
-               lhsb(4,1,i) = tmp1 * 2.0d+00 * njac(4,1,i)
-               lhsb(4,2,i) = tmp1 * 2.0d+00 * njac(4,2,i)
-               lhsb(4,3,i) = tmp1 * 2.0d+00 * njac(4,3,i)
-               lhsb(4,4,i) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(4,4,i)
-     >              + tmp1 * 2.0d+00 * dx4
-               lhsb(4,5,i) = tmp1 * 2.0d+00 * njac(4,5,i)
-
-               lhsb(5,1,i) = tmp1 * 2.0d+00 * njac(5,1,i)
-               lhsb(5,2,i) = tmp1 * 2.0d+00 * njac(5,2,i)
-               lhsb(5,3,i) = tmp1 * 2.0d+00 * njac(5,3,i)
-               lhsb(5,4,i) = tmp1 * 2.0d+00 * njac(5,4,i)
-               lhsb(5,5,i) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(5,5,i)
-     >              + tmp1 * 2.0d+00 * dx5
-
-               lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,i+1)
-     >              - tmp1 * njac(1,1,i+1)
-     >              - tmp1 * dx1
-               lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,i+1)
-     >              - tmp1 * njac(1,2,i+1)
-               lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,i+1)
-     >              - tmp1 * njac(1,3,i+1)
-               lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,i+1)
-     >              - tmp1 * njac(1,4,i+1)
-               lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,i+1)
-     >              - tmp1 * njac(1,5,i+1)
-
-               lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,i+1)
-     >              - tmp1 * njac(2,1,i+1)
-               lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,i+1)
-     >              - tmp1 * njac(2,2,i+1)
-     >              - tmp1 * dx2
-               lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,i+1)
-     >              - tmp1 * njac(2,3,i+1)
-               lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,i+1)
-     >              - tmp1 * njac(2,4,i+1)
-               lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,i+1)
-     >              - tmp1 * njac(2,5,i+1)
-
-               lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,i+1)
-     >              - tmp1 * njac(3,1,i+1)
-               lhsc(3,2,i,j,k,c) =  tmp2 * fjac(3,2,i+1)
-     >              - tmp1 * njac(3,2,i+1)
-               lhsc(3,3,i,j,k,c) =  tmp2 * fjac(3,3,i+1)
-     >              - tmp1 * njac(3,3,i+1)
-     >              - tmp1 * dx3
-               lhsc(3,4,i,j,k,c) =  tmp2 * fjac(3,4,i+1)
-     >              - tmp1 * njac(3,4,i+1)
-               lhsc(3,5,i,j,k,c) =  tmp2 * fjac(3,5,i+1)
-     >              - tmp1 * njac(3,5,i+1)
-
-               lhsc(4,1,i,j,k,c) =  tmp2 * fjac(4,1,i+1)
-     >              - tmp1 * njac(4,1,i+1)
-               lhsc(4,2,i,j,k,c) =  tmp2 * fjac(4,2,i+1)
-     >              - tmp1 * njac(4,2,i+1)
-               lhsc(4,3,i,j,k,c) =  tmp2 * fjac(4,3,i+1)
-     >              - tmp1 * njac(4,3,i+1)
-               lhsc(4,4,i,j,k,c) =  tmp2 * fjac(4,4,i+1)
-     >              - tmp1 * njac(4,4,i+1)
-     >              - tmp1 * dx4
-               lhsc(4,5,i,j,k,c) =  tmp2 * fjac(4,5,i+1)
-     >              - tmp1 * njac(4,5,i+1)
-
-               lhsc(5,1,i,j,k,c) =  tmp2 * fjac(5,1,i+1)
-     >              - tmp1 * njac(5,1,i+1)
-               lhsc(5,2,i,j,k,c) =  tmp2 * fjac(5,2,i+1)
-     >              - tmp1 * njac(5,2,i+1)
-               lhsc(5,3,i,j,k,c) =  tmp2 * fjac(5,3,i+1)
-     >              - tmp1 * njac(5,3,i+1)
-               lhsc(5,4,i,j,k,c) =  tmp2 * fjac(5,4,i+1)
-     >              - tmp1 * njac(5,4,i+1)
-               lhsc(5,5,i,j,k,c) =  tmp2 * fjac(5,5,i+1)
-     >              - tmp1 * njac(5,5,i+1)
-     >              - tmp1 * dx5
-
-            enddo
-
-
-c---------------------------------------------------------------------
-c     outer most do loops - sweeping in i direction
-c---------------------------------------------------------------------
-            if (first .eq. 1) then 
-
-c---------------------------------------------------------------------
-c     multiply c(istart,j,k) by b_inverse and copy back to c
-c     multiply rhs(istart) by b_inverse(istart) and copy to rhs
-c---------------------------------------------------------------------
-               call binvcrhs( lhsb(1,1,istart),
-     >                        lhsc(1,1,istart,j,k,c),
-     >                        rhs(1,istart,j,k,c) )
-
-            endif
-
-c---------------------------------------------------------------------
-c     begin inner most do loop
-c     do all the elements of the cell unless last 
-c---------------------------------------------------------------------
-            do i=istart+first,isize-last
-
-c---------------------------------------------------------------------
-c     rhs(i) = rhs(i) - A*rhs(i-1)
-c---------------------------------------------------------------------
-               call matvec_sub(lhsa(1,1,i),
-     >                         rhs(1,i-1,j,k,c),rhs(1,i,j,k,c))
-
-c---------------------------------------------------------------------
-c     B(i) = B(i) - C(i-1)*A(i)
-c---------------------------------------------------------------------
-               call matmul_sub(lhsa(1,1,i),
-     >                         lhsc(1,1,i-1,j,k,c),
-     >                         lhsb(1,1,i))
-
-
-c---------------------------------------------------------------------
-c     multiply c(i,j,k) by b_inverse and copy back to c
-c     multiply rhs(1,j,k) by b_inverse(1,j,k) and copy to rhs
-c---------------------------------------------------------------------
-               call binvcrhs( lhsb(1,1,i),
-     >                        lhsc(1,1,i,j,k,c),
-     >                        rhs(1,i,j,k,c) )
-
-            enddo
-
-c---------------------------------------------------------------------
-c     Now finish up special cases for last cell
-c---------------------------------------------------------------------
-            if (last .eq. 1) then
-
-c---------------------------------------------------------------------
-c     rhs(isize) = rhs(isize) - A*rhs(isize-1)
-c---------------------------------------------------------------------
-               call matvec_sub(lhsa(1,1,isize),
-     >                         rhs(1,isize-1,j,k,c),rhs(1,isize,j,k,c))
-
-c---------------------------------------------------------------------
-c     B(isize) = B(isize) - C(isize-1)*A(isize)
-c---------------------------------------------------------------------
-               call matmul_sub(lhsa(1,1,isize),
-     >                         lhsc(1,1,isize-1,j,k,c),
-     >                         lhsb(1,1,isize))
-
-c---------------------------------------------------------------------
-c     multiply rhs() by b_inverse() and copy to rhs
-c---------------------------------------------------------------------
-               call binvrhs( lhsb(1,1,isize),
-     >                       rhs(1,isize,j,k,c) )
-
-            endif
-         enddo
-      enddo
-
-
-      return
-      end
-      
diff --git a/examples/smpi/NAS/BT/x_solve_vec.f b/examples/smpi/NAS/BT/x_solve_vec.f
deleted file mode 100644
index 8f1c1371db..0000000000
--- a/examples/smpi/NAS/BT/x_solve_vec.f
+++ /dev/null
@@ -1,789 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine x_solve
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     
-c     Performs line solves in X direction by first factoring
-c     the block-tridiagonal matrix into an upper triangular matrix, 
-c     and then performing back substitution to solve for the unknow
-c     vectors of each line.  
-c     
-c     Make sure we treat elements zero to cell_size in the direction
-c     of the sweep.
-c     
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-      integer  c, istart, stage,
-     >     first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
-     >     isize,jsize,ksize,send_id
-
-      istart = 0
-
-c---------------------------------------------------------------------
-c     in our terminology stage is the number of the cell in the x-direct
-c     i.e. stage = 1 means the start of the line stage=ncells means end
-c---------------------------------------------------------------------
-      do stage = 1,ncells
-         c = slice(1,stage)
-         isize = cell_size(1,c) - 1
-         jsize = cell_size(2,c) - 1
-         ksize = cell_size(3,c) - 1
-         
-c---------------------------------------------------------------------
-c     set last-cell flag
-c---------------------------------------------------------------------
-         if (stage .eq. ncells) then
-            last = 1
-         else
-            last = 0
-         endif
-
-         if (stage .eq. 1) then
-c---------------------------------------------------------------------
-c     This is the first cell, so solve without receiving data
-c---------------------------------------------------------------------
-            first = 1
-c            call lhsx(c)
-            call x_solve_cell(first,last,c)
-         else
-c---------------------------------------------------------------------
-c     Not the first cell of this line, so receive info from
-c     processor working on preceeding cell
-c---------------------------------------------------------------------
-            first = 0
-            call x_receive_solve_info(recv_id,c)
-c---------------------------------------------------------------------
-c     overlap computations and communications
-c---------------------------------------------------------------------
-c            call lhsx(c)
-c---------------------------------------------------------------------
-c     wait for completion
-c---------------------------------------------------------------------
-            call mpi_wait(send_id,r_status,error)
-            call mpi_wait(recv_id,r_status,error)
-c---------------------------------------------------------------------
-c     install C'(istart) and rhs'(istart) to be used in this cell
-c---------------------------------------------------------------------
-            call x_unpack_solve_info(c)
-            call x_solve_cell(first,last,c)
-         endif
-
-         if (last .eq. 0) call x_send_solve_info(send_id,c)
-      enddo
-
-c---------------------------------------------------------------------
-c     now perform backsubstitution in reverse direction
-c---------------------------------------------------------------------
-      do stage = ncells, 1, -1
-         c = slice(1,stage)
-         first = 0
-         last = 0
-         if (stage .eq. 1) first = 1
-         if (stage .eq. ncells) then
-            last = 1
-c---------------------------------------------------------------------
-c     last cell, so perform back substitute without waiting
-c---------------------------------------------------------------------
-            call x_backsubstitute(first, last,c)
-         else
-            call x_receive_backsub_info(recv_id,c)
-            call mpi_wait(send_id,r_status,error)
-            call mpi_wait(recv_id,r_status,error)
-            call x_unpack_backsub_info(c)
-            call x_backsubstitute(first,last,c)
-         endif
-         if (first .eq. 0) call x_send_backsub_info(send_id,c)
-      enddo
-
-
-      return
-      end
-      
-      
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine x_unpack_solve_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     unpack C'(-1) and rhs'(-1) for
-c     all j and k
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      integer j,k,m,n,ptr,c,istart 
-
-      istart = 0
-      ptr = 0
-      do k=0,KMAX-1
-         do j=0,JMAX-1
-            do m=1,BLOCK_SIZE
-               do n=1,BLOCK_SIZE
-                  lhsc(m,n,istart-1,j,k,c) = out_buffer(ptr+n)
-               enddo
-               ptr = ptr+BLOCK_SIZE
-            enddo
-            do n=1,BLOCK_SIZE
-               rhs(n,istart-1,j,k,c) = out_buffer(ptr+n)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      
-      subroutine x_send_solve_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     pack up and send C'(iend) and rhs'(iend) for
-c     all j and k
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer j,k,m,n,isize,ptr,c,jp,kp
-      integer error,send_id,buffer_size 
-
-      isize = cell_size(1,c)-1
-      jp = cell_coord(2,c) - 1
-      kp = cell_coord(3,c) - 1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
-     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
-
-c---------------------------------------------------------------------
-c     pack up buffer
-c---------------------------------------------------------------------
-      ptr = 0
-      do k=0,KMAX-1
-         do j=0,JMAX-1
-            do m=1,BLOCK_SIZE
-               do n=1,BLOCK_SIZE
-                  in_buffer(ptr+n) = lhsc(m,n,isize,j,k,c)
-               enddo
-               ptr = ptr+BLOCK_SIZE
-            enddo
-            do n=1,BLOCK_SIZE
-               in_buffer(ptr+n) = rhs(n,isize,j,k,c)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-c---------------------------------------------------------------------
-c     send buffer 
-c---------------------------------------------------------------------
-      call mpi_isend(in_buffer, buffer_size,
-     >     dp_type, successor(1),
-     >     WEST+jp+kp*NCELLS, comm_solve,
-     >     send_id,error)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine x_send_backsub_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     pack up and send U(istart) for all j and k
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer j,k,n,ptr,c,istart,jp,kp
-      integer error,send_id,buffer_size
-
-c---------------------------------------------------------------------
-c     Send element 0 to previous processor
-c---------------------------------------------------------------------
-      istart = 0
-      jp = cell_coord(2,c)-1
-      kp = cell_coord(3,c)-1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
-      ptr = 0
-      do k=0,KMAX-1
-         do j=0,JMAX-1
-            do n=1,BLOCK_SIZE
-               in_buffer(ptr+n) = rhs(n,istart,j,k,c)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-      call mpi_isend(in_buffer, buffer_size,
-     >     dp_type, predecessor(1), 
-     >     EAST+jp+kp*NCELLS, comm_solve, 
-     >     send_id,error)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine x_unpack_backsub_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     unpack U(isize) for all j and k
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      integer j,k,n,ptr,c
-
-      ptr = 0
-      do k=0,KMAX-1
-         do j=0,JMAX-1
-            do n=1,BLOCK_SIZE
-               backsub_info(n,j,k,c) = out_buffer(ptr+n)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine x_receive_backsub_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     post mpi receives
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer error,recv_id,jp,kp,c,buffer_size
-      jp = cell_coord(2,c) - 1
-      kp = cell_coord(3,c) - 1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
-      call mpi_irecv(out_buffer, buffer_size,
-     >     dp_type, successor(1), 
-     >     EAST+jp+kp*NCELLS, comm_solve, 
-     >     recv_id, error)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine x_receive_solve_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     post mpi receives 
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer jp,kp,recv_id,error,c,buffer_size
-      jp = cell_coord(2,c) - 1
-      kp = cell_coord(3,c) - 1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
-     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
-      call mpi_irecv(out_buffer, buffer_size,
-     >     dp_type, predecessor(1), 
-     >     WEST+jp+kp*NCELLS,  comm_solve, 
-     >     recv_id, error)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      
-      subroutine x_backsubstitute(first, last, c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     back solve: if last cell, then generate U(isize)=rhs(isize)
-c     else assume U(isize) is loaded in un pack backsub_info
-c     so just use it
-c     after call u(istart) will be sent to next cell
-c---------------------------------------------------------------------
-
-      include 'header.h'
-
-      integer first, last, c, i, j, k
-      integer m,n,isize,jsize,ksize,istart
-      
-      istart = 0
-      isize = cell_size(1,c)-1
-      jsize = cell_size(2,c)-end(2,c)-1      
-      ksize = cell_size(3,c)-end(3,c)-1
-      if (last .eq. 0) then
-         do k=start(3,c),ksize
-            do j=start(2,c),jsize
-c---------------------------------------------------------------------
-c     U(isize) uses info from previous cell if not last cell
-c---------------------------------------------------------------------
-               do m=1,BLOCK_SIZE
-                  do n=1,BLOCK_SIZE
-                     rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) 
-     >                    - lhsc(m,n,isize,j,k,c)*
-     >                    backsub_info(n,j,k,c)
-c---------------------------------------------------------------------
-c     rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) 
-c     $                    - lhsc(m,n,isize,j,k,c)*rhs(n,isize+1,j,k,c)
-c---------------------------------------------------------------------
-                  enddo
-               enddo
-            enddo
-         enddo
-      endif
-      do k=start(3,c),ksize
-         do j=start(2,c),jsize
-            do i=isize-1,istart,-1
-               do m=1,BLOCK_SIZE
-                  do n=1,BLOCK_SIZE
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) 
-     >                    - lhsc(m,n,i,j,k,c)*rhs(n,i+1,j,k,c)
-                  enddo
-               enddo
-            enddo
-         enddo
-      enddo
-
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine x_solve_cell(first,last,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     performs guaussian elimination on this cell.
-c     
-c     assumes that unpacking routines for non-first cells 
-c     preload C' and rhs' from previous cell.
-c     
-c     assumed send happens outside this routine, but that
-c     c'(IMAX) and rhs'(IMAX) will be sent to next cell
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'work_lhs_vec.h'
-
-      integer first,last,c
-      integer i,j,k,m,n,isize,ksize,jsize,istart
-
-      istart = 0
-      isize = cell_size(1,c)-1
-      jsize = cell_size(2,c)-end(2,c)-1
-      ksize = cell_size(3,c)-end(3,c)-1
-
-c---------------------------------------------------------------------
-c     zero the left hand side for starters
-c     set diagonal values to 1. This is overkill, but convenient
-c---------------------------------------------------------------------
-      do j = 0, jsize
-         do m = 1, 5
-            do n = 1, 5
-               lhsa(m,n,0,j) = 0.0d0
-               lhsb(m,n,0,j) = 0.0d0
-               lhsa(m,n,isize,j) = 0.0d0
-               lhsb(m,n,isize,j) = 0.0d0
-            enddo
-            lhsb(m,m,0,j) = 1.0d0
-            lhsb(m,m,isize,j) = 1.0d0
-         enddo
-      enddo
-
-      do k=start(3,c),ksize 
-
-c---------------------------------------------------------------------
-c     This function computes the left hand side in the xi-direction
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     determine a (labeled f) and n jacobians for cell c
-c---------------------------------------------------------------------
-         do j=start(2,c),jsize
-            do i = start(1,c)-1, cell_size(1,c) - end(1,c)
-
-               tmp1 = rho_i(i,j,k,c)
-               tmp2 = tmp1 * tmp1
-               tmp3 = tmp1 * tmp2
-c---------------------------------------------------------------------
-c     
-c---------------------------------------------------------------------
-               fjac(1,1,i,j) = 0.0d+00
-               fjac(1,2,i,j) = 1.0d+00
-               fjac(1,3,i,j) = 0.0d+00
-               fjac(1,4,i,j) = 0.0d+00
-               fjac(1,5,i,j) = 0.0d+00
-
-               fjac(2,1,i,j) = -(u(2,i,j,k,c) * tmp2 * 
-     >              u(2,i,j,k,c))
-     >              + c2 * qs(i,j,k,c)
-               fjac(2,2,i,j) = ( 2.0d+00 - c2 )
-     >              * ( u(2,i,j,k,c) * tmp1 )
-               fjac(2,3,i,j) = - c2 * ( u(3,i,j,k,c) * tmp1 )
-               fjac(2,4,i,j) = - c2 * ( u(4,i,j,k,c) * tmp1 )
-               fjac(2,5,i,j) = c2
-
-               fjac(3,1,i,j) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) ) * tmp2
-               fjac(3,2,i,j) = u(3,i,j,k,c) * tmp1
-               fjac(3,3,i,j) = u(2,i,j,k,c) * tmp1
-               fjac(3,4,i,j) = 0.0d+00
-               fjac(3,5,i,j) = 0.0d+00
-
-               fjac(4,1,i,j) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) * tmp2
-               fjac(4,2,i,j) = u(4,i,j,k,c) * tmp1
-               fjac(4,3,i,j) = 0.0d+00
-               fjac(4,4,i,j) = u(2,i,j,k,c) * tmp1
-               fjac(4,5,i,j) = 0.0d+00
-
-               fjac(5,1,i,j) = ( c2 * 2.0d0 * qs(i,j,k,c)
-     >              - c1 * ( u(5,i,j,k,c) * tmp1 ) )
-     >              * ( u(2,i,j,k,c) * tmp1 )
-               fjac(5,2,i,j) = c1 *  u(5,i,j,k,c) * tmp1 
-     >              - c2
-     >              * ( u(2,i,j,k,c)*u(2,i,j,k,c) * tmp2
-     >              + qs(i,j,k,c) )
-               fjac(5,3,i,j) = - c2 * ( u(3,i,j,k,c)*u(2,i,j,k,c) )
-     >              * tmp2
-               fjac(5,4,i,j) = - c2 * ( u(4,i,j,k,c)*u(2,i,j,k,c) )
-     >              * tmp2
-               fjac(5,5,i,j) = c1 * ( u(2,i,j,k,c) * tmp1 )
-
-               njac(1,1,i,j) = 0.0d+00
-               njac(1,2,i,j) = 0.0d+00
-               njac(1,3,i,j) = 0.0d+00
-               njac(1,4,i,j) = 0.0d+00
-               njac(1,5,i,j) = 0.0d+00
-
-               njac(2,1,i,j) = - con43 * c3c4 * tmp2 * u(2,i,j,k,c)
-               njac(2,2,i,j) =   con43 * c3c4 * tmp1
-               njac(2,3,i,j) =   0.0d+00
-               njac(2,4,i,j) =   0.0d+00
-               njac(2,5,i,j) =   0.0d+00
-
-               njac(3,1,i,j) = - c3c4 * tmp2 * u(3,i,j,k,c)
-               njac(3,2,i,j) =   0.0d+00
-               njac(3,3,i,j) =   c3c4 * tmp1
-               njac(3,4,i,j) =   0.0d+00
-               njac(3,5,i,j) =   0.0d+00
-
-               njac(4,1,i,j) = - c3c4 * tmp2 * u(4,i,j,k,c)
-               njac(4,2,i,j) =   0.0d+00 
-               njac(4,3,i,j) =   0.0d+00
-               njac(4,4,i,j) =   c3c4 * tmp1
-               njac(4,5,i,j) =   0.0d+00
-
-               njac(5,1,i,j) = - ( con43 * c3c4
-     >              - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
-     >              - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
-     >              - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
-     >              - c1345 * tmp2 * u(5,i,j,k,c)
-
-               njac(5,2,i,j) = ( con43 * c3c4
-     >              - c1345 ) * tmp2 * u(2,i,j,k,c)
-               njac(5,3,i,j) = ( c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c)
-               njac(5,4,i,j) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c)
-               njac(5,5,i,j) = ( c1345 ) * tmp1
-
-            enddo
-         enddo
-
-c---------------------------------------------------------------------
-c     now jacobians set, so form left hand side in x direction
-c---------------------------------------------------------------------
-         do j=start(2,c),jsize
-            do i = start(1,c), isize - end(1,c)
-
-               tmp1 = dt * tx1
-               tmp2 = dt * tx2
-
-               lhsa(1,1,i,j) = - tmp2 * fjac(1,1,i-1,j)
-     >              - tmp1 * njac(1,1,i-1,j)
-     >              - tmp1 * dx1 
-               lhsa(1,2,i,j) = - tmp2 * fjac(1,2,i-1,j)
-     >              - tmp1 * njac(1,2,i-1,j)
-               lhsa(1,3,i,j) = - tmp2 * fjac(1,3,i-1,j)
-     >              - tmp1 * njac(1,3,i-1,j)
-               lhsa(1,4,i,j) = - tmp2 * fjac(1,4,i-1,j)
-     >              - tmp1 * njac(1,4,i-1,j)
-               lhsa(1,5,i,j) = - tmp2 * fjac(1,5,i-1,j)
-     >              - tmp1 * njac(1,5,i-1,j)
-
-               lhsa(2,1,i,j) = - tmp2 * fjac(2,1,i-1,j)
-     >              - tmp1 * njac(2,1,i-1,j)
-               lhsa(2,2,i,j) = - tmp2 * fjac(2,2,i-1,j)
-     >              - tmp1 * njac(2,2,i-1,j)
-     >              - tmp1 * dx2
-               lhsa(2,3,i,j) = - tmp2 * fjac(2,3,i-1,j)
-     >              - tmp1 * njac(2,3,i-1,j)
-               lhsa(2,4,i,j) = - tmp2 * fjac(2,4,i-1,j)
-     >              - tmp1 * njac(2,4,i-1,j)
-               lhsa(2,5,i,j) = - tmp2 * fjac(2,5,i-1,j)
-     >              - tmp1 * njac(2,5,i-1,j)
-
-               lhsa(3,1,i,j) = - tmp2 * fjac(3,1,i-1,j)
-     >              - tmp1 * njac(3,1,i-1,j)
-               lhsa(3,2,i,j) = - tmp2 * fjac(3,2,i-1,j)
-     >              - tmp1 * njac(3,2,i-1,j)
-               lhsa(3,3,i,j) = - tmp2 * fjac(3,3,i-1,j)
-     >              - tmp1 * njac(3,3,i-1,j)
-     >              - tmp1 * dx3 
-               lhsa(3,4,i,j) = - tmp2 * fjac(3,4,i-1,j)
-     >              - tmp1 * njac(3,4,i-1,j)
-               lhsa(3,5,i,j) = - tmp2 * fjac(3,5,i-1,j)
-     >              - tmp1 * njac(3,5,i-1,j)
-
-               lhsa(4,1,i,j) = - tmp2 * fjac(4,1,i-1,j)
-     >              - tmp1 * njac(4,1,i-1,j)
-               lhsa(4,2,i,j) = - tmp2 * fjac(4,2,i-1,j)
-     >              - tmp1 * njac(4,2,i-1,j)
-               lhsa(4,3,i,j) = - tmp2 * fjac(4,3,i-1,j)
-     >              - tmp1 * njac(4,3,i-1,j)
-               lhsa(4,4,i,j) = - tmp2 * fjac(4,4,i-1,j)
-     >              - tmp1 * njac(4,4,i-1,j)
-     >              - tmp1 * dx4
-               lhsa(4,5,i,j) = - tmp2 * fjac(4,5,i-1,j)
-     >              - tmp1 * njac(4,5,i-1,j)
-
-               lhsa(5,1,i,j) = - tmp2 * fjac(5,1,i-1,j)
-     >              - tmp1 * njac(5,1,i-1,j)
-               lhsa(5,2,i,j) = - tmp2 * fjac(5,2,i-1,j)
-     >              - tmp1 * njac(5,2,i-1,j)
-               lhsa(5,3,i,j) = - tmp2 * fjac(5,3,i-1,j)
-     >              - tmp1 * njac(5,3,i-1,j)
-               lhsa(5,4,i,j) = - tmp2 * fjac(5,4,i-1,j)
-     >              - tmp1 * njac(5,4,i-1,j)
-               lhsa(5,5,i,j) = - tmp2 * fjac(5,5,i-1,j)
-     >              - tmp1 * njac(5,5,i-1,j)
-     >              - tmp1 * dx5
-
-               lhsb(1,1,i,j) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(1,1,i,j)
-     >              + tmp1 * 2.0d+00 * dx1
-               lhsb(1,2,i,j) = tmp1 * 2.0d+00 * njac(1,2,i,j)
-               lhsb(1,3,i,j) = tmp1 * 2.0d+00 * njac(1,3,i,j)
-               lhsb(1,4,i,j) = tmp1 * 2.0d+00 * njac(1,4,i,j)
-               lhsb(1,5,i,j) = tmp1 * 2.0d+00 * njac(1,5,i,j)
-
-               lhsb(2,1,i,j) = tmp1 * 2.0d+00 * njac(2,1,i,j)
-               lhsb(2,2,i,j) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(2,2,i,j)
-     >              + tmp1 * 2.0d+00 * dx2
-               lhsb(2,3,i,j) = tmp1 * 2.0d+00 * njac(2,3,i,j)
-               lhsb(2,4,i,j) = tmp1 * 2.0d+00 * njac(2,4,i,j)
-               lhsb(2,5,i,j) = tmp1 * 2.0d+00 * njac(2,5,i,j)
-
-               lhsb(3,1,i,j) = tmp1 * 2.0d+00 * njac(3,1,i,j)
-               lhsb(3,2,i,j) = tmp1 * 2.0d+00 * njac(3,2,i,j)
-               lhsb(3,3,i,j) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(3,3,i,j)
-     >              + tmp1 * 2.0d+00 * dx3
-               lhsb(3,4,i,j) = tmp1 * 2.0d+00 * njac(3,4,i,j)
-               lhsb(3,5,i,j) = tmp1 * 2.0d+00 * njac(3,5,i,j)
-
-               lhsb(4,1,i,j) = tmp1 * 2.0d+00 * njac(4,1,i,j)
-               lhsb(4,2,i,j) = tmp1 * 2.0d+00 * njac(4,2,i,j)
-               lhsb(4,3,i,j) = tmp1 * 2.0d+00 * njac(4,3,i,j)
-               lhsb(4,4,i,j) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(4,4,i,j)
-     >              + tmp1 * 2.0d+00 * dx4
-               lhsb(4,5,i,j) = tmp1 * 2.0d+00 * njac(4,5,i,j)
-
-               lhsb(5,1,i,j) = tmp1 * 2.0d+00 * njac(5,1,i,j)
-               lhsb(5,2,i,j) = tmp1 * 2.0d+00 * njac(5,2,i,j)
-               lhsb(5,3,i,j) = tmp1 * 2.0d+00 * njac(5,3,i,j)
-               lhsb(5,4,i,j) = tmp1 * 2.0d+00 * njac(5,4,i,j)
-               lhsb(5,5,i,j) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(5,5,i,j)
-     >              + tmp1 * 2.0d+00 * dx5
-
-               lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,i+1,j)
-     >              - tmp1 * njac(1,1,i+1,j)
-     >              - tmp1 * dx1
-               lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,i+1,j)
-     >              - tmp1 * njac(1,2,i+1,j)
-               lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,i+1,j)
-     >              - tmp1 * njac(1,3,i+1,j)
-               lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,i+1,j)
-     >              - tmp1 * njac(1,4,i+1,j)
-               lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,i+1,j)
-     >              - tmp1 * njac(1,5,i+1,j)
-
-               lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,i+1,j)
-     >              - tmp1 * njac(2,1,i+1,j)
-               lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,i+1,j)
-     >              - tmp1 * njac(2,2,i+1,j)
-     >              - tmp1 * dx2
-               lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,i+1,j)
-     >              - tmp1 * njac(2,3,i+1,j)
-               lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,i+1,j)
-     >              - tmp1 * njac(2,4,i+1,j)
-               lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,i+1,j)
-     >              - tmp1 * njac(2,5,i+1,j)
-
-               lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,i+1,j)
-     >              - tmp1 * njac(3,1,i+1,j)
-               lhsc(3,2,i,j,k,c) =  tmp2 * fjac(3,2,i+1,j)
-     >              - tmp1 * njac(3,2,i+1,j)
-               lhsc(3,3,i,j,k,c) =  tmp2 * fjac(3,3,i+1,j)
-     >              - tmp1 * njac(3,3,i+1,j)
-     >              - tmp1 * dx3
-               lhsc(3,4,i,j,k,c) =  tmp2 * fjac(3,4,i+1,j)
-     >              - tmp1 * njac(3,4,i+1,j)
-               lhsc(3,5,i,j,k,c) =  tmp2 * fjac(3,5,i+1,j)
-     >              - tmp1 * njac(3,5,i+1,j)
-
-               lhsc(4,1,i,j,k,c) =  tmp2 * fjac(4,1,i+1,j)
-     >              - tmp1 * njac(4,1,i+1,j)
-               lhsc(4,2,i,j,k,c) =  tmp2 * fjac(4,2,i+1,j)
-     >              - tmp1 * njac(4,2,i+1,j)
-               lhsc(4,3,i,j,k,c) =  tmp2 * fjac(4,3,i+1,j)
-     >              - tmp1 * njac(4,3,i+1,j)
-               lhsc(4,4,i,j,k,c) =  tmp2 * fjac(4,4,i+1,j)
-     >              - tmp1 * njac(4,4,i+1,j)
-     >              - tmp1 * dx4
-               lhsc(4,5,i,j,k,c) =  tmp2 * fjac(4,5,i+1,j)
-     >              - tmp1 * njac(4,5,i+1,j)
-
-               lhsc(5,1,i,j,k,c) =  tmp2 * fjac(5,1,i+1,j)
-     >              - tmp1 * njac(5,1,i+1,j)
-               lhsc(5,2,i,j,k,c) =  tmp2 * fjac(5,2,i+1,j)
-     >              - tmp1 * njac(5,2,i+1,j)
-               lhsc(5,3,i,j,k,c) =  tmp2 * fjac(5,3,i+1,j)
-     >              - tmp1 * njac(5,3,i+1,j)
-               lhsc(5,4,i,j,k,c) =  tmp2 * fjac(5,4,i+1,j)
-     >              - tmp1 * njac(5,4,i+1,j)
-               lhsc(5,5,i,j,k,c) =  tmp2 * fjac(5,5,i+1,j)
-     >              - tmp1 * njac(5,5,i+1,j)
-     >              - tmp1 * dx5
-
-            enddo
-         enddo
-
-
-c---------------------------------------------------------------------
-c     outer most do loops - sweeping in i direction
-c---------------------------------------------------------------------
-         if (first .eq. 1) then 
-
-c---------------------------------------------------------------------
-c     multiply c(istart,j,k) by b_inverse and copy back to c
-c     multiply rhs(istart) by b_inverse(istart) and copy to rhs
-c---------------------------------------------------------------------
-!dir$ ivdep
-            do j=start(2,c),jsize
-               call binvcrhs( lhsb(1,1,istart,j),
-     >                        lhsc(1,1,istart,j,k,c),
-     >                        rhs(1,istart,j,k,c) )
-            enddo
-
-         endif
-
-c---------------------------------------------------------------------
-c     begin inner most do loop
-c     do all the elements of the cell unless last 
-c---------------------------------------------------------------------
-!dir$ ivdep
-!dir$ interchange(i,j)
-         do j=start(2,c),jsize
-            do i=istart+first,isize-last
-
-c---------------------------------------------------------------------
-c     rhs(i) = rhs(i) - A*rhs(i-1)
-c---------------------------------------------------------------------
-               call matvec_sub(lhsa(1,1,i,j),
-     >                         rhs(1,i-1,j,k,c),rhs(1,i,j,k,c))
-
-c---------------------------------------------------------------------
-c     B(i) = B(i) - C(i-1)*A(i)
-c---------------------------------------------------------------------
-               call matmul_sub(lhsa(1,1,i,j),
-     >                         lhsc(1,1,i-1,j,k,c),
-     >                         lhsb(1,1,i,j))
-
-
-c---------------------------------------------------------------------
-c     multiply c(i,j,k) by b_inverse and copy back to c
-c     multiply rhs(1,j,k) by b_inverse(1,j,k) and copy to rhs
-c---------------------------------------------------------------------
-               call binvcrhs( lhsb(1,1,i,j),
-     >                        lhsc(1,1,i,j,k,c),
-     >                        rhs(1,i,j,k,c) )
-
-            enddo
-         enddo
-
-c---------------------------------------------------------------------
-c     Now finish up special cases for last cell
-c---------------------------------------------------------------------
-         if (last .eq. 1) then
-
-!dir$ ivdep
-            do j=start(2,c),jsize
-c---------------------------------------------------------------------
-c     rhs(isize) = rhs(isize) - A*rhs(isize-1)
-c---------------------------------------------------------------------
-               call matvec_sub(lhsa(1,1,isize,j),
-     >                         rhs(1,isize-1,j,k,c),rhs(1,isize,j,k,c))
-
-c---------------------------------------------------------------------
-c     B(isize) = B(isize) - C(isize-1)*A(isize)
-c---------------------------------------------------------------------
-               call matmul_sub(lhsa(1,1,isize,j),
-     >                         lhsc(1,1,isize-1,j,k,c),
-     >                         lhsb(1,1,isize,j))
-
-c---------------------------------------------------------------------
-c     multiply rhs() by b_inverse() and copy to rhs
-c---------------------------------------------------------------------
-               call binvrhs( lhsb(1,1,isize,j),
-     >                       rhs(1,isize,j,k,c) )
-            enddo
-
-         endif
-      enddo
-
-
-      return
-      end
-      
diff --git a/examples/smpi/NAS/BT/y_solve.f b/examples/smpi/NAS/BT/y_solve.f
deleted file mode 100644
index 33e2ebc018..0000000000
--- a/examples/smpi/NAS/BT/y_solve.f
+++ /dev/null
@@ -1,771 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine y_solve
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     Performs line solves in Y direction by first factoring
-c     the block-tridiagonal matrix into an upper triangular matrix, 
-c     and then performing back substitution to solve for the unknow
-c     vectors of each line.  
-c     
-c     Make sure we treat elements zero to cell_size in the direction
-c     of the sweep.
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer 
-     >     c, jstart, stage,
-     >     first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
-     >     isize,jsize,ksize,send_id
-
-      jstart = 0
-
-c---------------------------------------------------------------------
-c     in our terminology stage is the number of the cell in the y-direction
-c     i.e. stage = 1 means the start of the line stage=ncells means end
-c---------------------------------------------------------------------
-      do stage = 1,ncells
-         c = slice(2,stage)
-         isize = cell_size(1,c) - 1
-         jsize = cell_size(2,c) - 1
-         ksize = cell_size(3,c) - 1
-
-c---------------------------------------------------------------------
-c     set last-cell flag
-c---------------------------------------------------------------------
-         if (stage .eq. ncells) then
-            last = 1
-         else
-            last = 0
-         endif
-
-         if (stage .eq. 1) then
-c---------------------------------------------------------------------
-c     This is the first cell, so solve without receiving data
-c---------------------------------------------------------------------
-            first = 1
-c            call lhsy(c)
-            call y_solve_cell(first,last,c)
-         else
-c---------------------------------------------------------------------
-c     Not the first cell of this line, so receive info from
-c     processor working on preceeding cell
-c---------------------------------------------------------------------
-            first = 0
-            call y_receive_solve_info(recv_id,c)
-c---------------------------------------------------------------------
-c     overlap computations and communications
-c---------------------------------------------------------------------
-c            call lhsy(c)
-c---------------------------------------------------------------------
-c     wait for completion
-c---------------------------------------------------------------------
-            call mpi_wait(send_id,r_status,error)
-            call mpi_wait(recv_id,r_status,error)
-c---------------------------------------------------------------------
-c     install C'(jstart+1) and rhs'(jstart+1) to be used in this cell
-c---------------------------------------------------------------------
-            call y_unpack_solve_info(c)
-            call y_solve_cell(first,last,c)
-         endif
-
-         if (last .eq. 0) call y_send_solve_info(send_id,c)
-      enddo
-
-c---------------------------------------------------------------------
-c     now perform backsubstitution in reverse direction
-c---------------------------------------------------------------------
-      do stage = ncells, 1, -1
-         c = slice(2,stage)
-         first = 0
-         last = 0
-         if (stage .eq. 1) first = 1
-         if (stage .eq. ncells) then
-            last = 1
-c---------------------------------------------------------------------
-c     last cell, so perform back substitute without waiting
-c---------------------------------------------------------------------
-            call y_backsubstitute(first, last,c)
-         else
-            call y_receive_backsub_info(recv_id,c)
-            call mpi_wait(send_id,r_status,error)
-            call mpi_wait(recv_id,r_status,error)
-            call y_unpack_backsub_info(c)
-            call y_backsubstitute(first,last,c)
-         endif
-         if (first .eq. 0) call y_send_backsub_info(send_id,c)
-      enddo
-
-
-      return
-      end
-      
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      
-      subroutine y_unpack_solve_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     unpack C'(-1) and rhs'(-1) for
-c     all i and k
-c---------------------------------------------------------------------
-
-      include 'header.h'
-
-      integer i,k,m,n,ptr,c,jstart 
-
-      jstart = 0
-      ptr = 0
-      do k=0,KMAX-1
-         do i=0,IMAX-1
-            do m=1,BLOCK_SIZE
-               do n=1,BLOCK_SIZE
-                  lhsc(m,n,i,jstart-1,k,c) = out_buffer(ptr+n)
-               enddo
-               ptr = ptr+BLOCK_SIZE
-            enddo
-            do n=1,BLOCK_SIZE
-               rhs(n,i,jstart-1,k,c) = out_buffer(ptr+n)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      
-      subroutine y_send_solve_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     pack up and send C'(jend) and rhs'(jend) for
-c     all i and k
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer i,k,m,n,jsize,ptr,c,ip,kp
-      integer error,send_id,buffer_size 
-
-      jsize = cell_size(2,c)-1
-      ip = cell_coord(1,c) - 1
-      kp = cell_coord(3,c) - 1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
-     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
-
-c---------------------------------------------------------------------
-c     pack up buffer
-c---------------------------------------------------------------------
-      ptr = 0
-      do k=0,KMAX-1
-         do i=0,IMAX-1
-            do m=1,BLOCK_SIZE
-               do n=1,BLOCK_SIZE
-                  in_buffer(ptr+n) = lhsc(m,n,i,jsize,k,c)
-               enddo
-               ptr = ptr+BLOCK_SIZE
-            enddo
-            do n=1,BLOCK_SIZE
-               in_buffer(ptr+n) = rhs(n,i,jsize,k,c)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-c---------------------------------------------------------------------
-c     send buffer 
-c---------------------------------------------------------------------
-      call mpi_isend(in_buffer, buffer_size,
-     >     dp_type, successor(2),
-     >     SOUTH+ip+kp*NCELLS, comm_solve,
-     >     send_id,error)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine y_send_backsub_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     pack up and send U(jstart) for all i and k
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer i,k,n,ptr,c,jstart,ip,kp
-      integer error,send_id,buffer_size
-
-c---------------------------------------------------------------------
-c     Send element 0 to previous processor
-c---------------------------------------------------------------------
-      jstart = 0
-      ip = cell_coord(1,c)-1
-      kp = cell_coord(3,c)-1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
-      ptr = 0
-      do k=0,KMAX-1
-         do i=0,IMAX-1
-            do n=1,BLOCK_SIZE
-               in_buffer(ptr+n) = rhs(n,i,jstart,k,c)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-      call mpi_isend(in_buffer, buffer_size,
-     >     dp_type, predecessor(2), 
-     >     NORTH+ip+kp*NCELLS, comm_solve, 
-     >     send_id,error)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine y_unpack_backsub_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     unpack U(jsize) for all i and k
-c---------------------------------------------------------------------
-
-      include 'header.h'
-
-      integer i,k,n,ptr,c 
-
-      ptr = 0
-      do k=0,KMAX-1
-         do i=0,IMAX-1
-            do n=1,BLOCK_SIZE
-               backsub_info(n,i,k,c) = out_buffer(ptr+n)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine y_receive_backsub_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     post mpi receives
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer error,recv_id,ip,kp,c,buffer_size
-      ip = cell_coord(1,c) - 1
-      kp = cell_coord(3,c) - 1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
-      call mpi_irecv(out_buffer, buffer_size,
-     >     dp_type, successor(2), 
-     >     NORTH+ip+kp*NCELLS, comm_solve, 
-     >     recv_id, error)
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine y_receive_solve_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     post mpi receives 
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer ip,kp,recv_id,error,c,buffer_size
-      ip = cell_coord(1,c) - 1
-      kp = cell_coord(3,c) - 1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
-     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
-      call mpi_irecv(out_buffer, buffer_size, 
-     >     dp_type, predecessor(2), 
-     >     SOUTH+ip+kp*NCELLS,  comm_solve, 
-     >     recv_id, error)
-
-      return
-      end
-      
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine y_backsubstitute(first, last, c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     back solve: if last cell, then generate U(jsize)=rhs(jsize)
-c     else assume U(jsize) is loaded in un pack backsub_info
-c     so just use it
-c     after call u(jstart) will be sent to next cell
-c---------------------------------------------------------------------
-
-      include 'header.h'
-
-      integer first, last, c, i, k
-      integer m,n,j,jsize,isize,ksize,jstart
-      
-      jstart = 0
-      isize = cell_size(1,c)-end(1,c)-1      
-      jsize = cell_size(2,c)-1
-      ksize = cell_size(3,c)-end(3,c)-1
-      if (last .eq. 0) then
-         do k=start(3,c),ksize
-            do i=start(1,c),isize
-c---------------------------------------------------------------------
-c     U(jsize) uses info from previous cell if not last cell
-c---------------------------------------------------------------------
-               do m=1,BLOCK_SIZE
-                  do n=1,BLOCK_SIZE
-                     rhs(m,i,jsize,k,c) = rhs(m,i,jsize,k,c) 
-     >                    - lhsc(m,n,i,jsize,k,c)*
-     >                    backsub_info(n,i,k,c)
-                  enddo
-               enddo
-            enddo
-         enddo
-      endif
-      do k=start(3,c),ksize
-         do j=jsize-1,jstart,-1
-            do i=start(1,c),isize
-               do m=1,BLOCK_SIZE
-                  do n=1,BLOCK_SIZE
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) 
-     >                    - lhsc(m,n,i,j,k,c)*rhs(n,i,j+1,k,c)
-                  enddo
-               enddo
-            enddo
-         enddo
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine y_solve_cell(first,last,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     performs guaussian elimination on this cell.
-c     
-c     assumes that unpacking routines for non-first cells 
-c     preload C' and rhs' from previous cell.
-c     
-c     assumed send happens outside this routine, but that
-c     c'(JMAX) and rhs'(JMAX) will be sent to next cell
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'work_lhs.h'
-
-      integer first,last,c
-      integer i,j,k,isize,ksize,jsize,jstart
-      double precision utmp(6,-2:JMAX+1)
-
-      jstart = 0
-      isize = cell_size(1,c)-end(1,c)-1
-      jsize = cell_size(2,c)-1
-      ksize = cell_size(3,c)-end(3,c)-1
-
-      call lhsabinit(lhsa, lhsb, jsize)
-
-      do k=start(3,c),ksize 
-         do i=start(1,c),isize
-
-c---------------------------------------------------------------------
-c     This function computes the left hand side for the three y-factors   
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     Compute the indices for storing the tri-diagonal matrix;
-c     determine a (labeled f) and n jacobians for cell c
-c---------------------------------------------------------------------
-            do j = start(2,c)-1, cell_size(2,c)-end(2,c)
-               utmp(1,j) = 1.0d0 / u(1,i,j,k,c)
-               utmp(2,j) = u(2,i,j,k,c)
-               utmp(3,j) = u(3,i,j,k,c)
-               utmp(4,j) = u(4,i,j,k,c)
-               utmp(5,j) = u(5,i,j,k,c)
-               utmp(6,j) = qs(i,j,k,c)
-            end do
-
-            do j = start(2,c)-1, cell_size(2,c)-end(2,c)
-
-               tmp1 = utmp(1,j)
-               tmp2 = tmp1 * tmp1
-               tmp3 = tmp1 * tmp2
-
-               fjac(1,1,j) = 0.0d+00
-               fjac(1,2,j) = 0.0d+00
-               fjac(1,3,j) = 1.0d+00
-               fjac(1,4,j) = 0.0d+00
-               fjac(1,5,j) = 0.0d+00
-
-               fjac(2,1,j) = - ( utmp(2,j)*utmp(3,j) )
-     >              * tmp2
-               fjac(2,2,j) = utmp(3,j) * tmp1
-               fjac(2,3,j) = utmp(2,j) * tmp1
-               fjac(2,4,j) = 0.0d+00
-               fjac(2,5,j) = 0.0d+00
-
-               fjac(3,1,j) = - ( utmp(3,j)*utmp(3,j)*tmp2)
-     >              + c2 * utmp(6,j)
-               fjac(3,2,j) = - c2 *  utmp(2,j) * tmp1
-               fjac(3,3,j) = ( 2.0d+00 - c2 )
-     >              *  utmp(3,j) * tmp1 
-               fjac(3,4,j) = - c2 * utmp(4,j) * tmp1 
-               fjac(3,5,j) = c2
-
-               fjac(4,1,j) = - ( utmp(3,j)*utmp(4,j) )
-     >              * tmp2
-               fjac(4,2,j) = 0.0d+00
-               fjac(4,3,j) = utmp(4,j) * tmp1
-               fjac(4,4,j) = utmp(3,j) * tmp1
-               fjac(4,5,j) = 0.0d+00
-
-               fjac(5,1,j) = ( c2 * 2.0d0 * utmp(6,j)
-     >              - c1 * utmp(5,j) * tmp1 ) 
-     >              * utmp(3,j) * tmp1 
-               fjac(5,2,j) = - c2 * utmp(2,j)*utmp(3,j) 
-     >              * tmp2
-               fjac(5,3,j) = c1 * utmp(5,j) * tmp1 
-     >              - c2 * ( utmp(6,j)
-     >              + utmp(3,j)*utmp(3,j) * tmp2 )
-               fjac(5,4,j) = - c2 * ( utmp(3,j)*utmp(4,j) )
-     >              * tmp2
-               fjac(5,5,j) = c1 * utmp(3,j) * tmp1 
-
-               njac(1,1,j) = 0.0d+00
-               njac(1,2,j) = 0.0d+00
-               njac(1,3,j) = 0.0d+00
-               njac(1,4,j) = 0.0d+00
-               njac(1,5,j) = 0.0d+00
-
-               njac(2,1,j) = - c3c4 * tmp2 * utmp(2,j)
-               njac(2,2,j) =   c3c4 * tmp1
-               njac(2,3,j) =   0.0d+00
-               njac(2,4,j) =   0.0d+00
-               njac(2,5,j) =   0.0d+00
-
-               njac(3,1,j) = - con43 * c3c4 * tmp2 * utmp(3,j)
-               njac(3,2,j) =   0.0d+00
-               njac(3,3,j) =   con43 * c3c4 * tmp1
-               njac(3,4,j) =   0.0d+00
-               njac(3,5,j) =   0.0d+00
-
-               njac(4,1,j) = - c3c4 * tmp2 * utmp(4,j)
-               njac(4,2,j) =   0.0d+00
-               njac(4,3,j) =   0.0d+00
-               njac(4,4,j) =   c3c4 * tmp1
-               njac(4,5,j) =   0.0d+00
-
-               njac(5,1,j) = - (  c3c4
-     >              - c1345 ) * tmp3 * (utmp(2,j)**2)
-     >              - ( con43 * c3c4
-     >              - c1345 ) * tmp3 * (utmp(3,j)**2)
-     >              - ( c3c4 - c1345 ) * tmp3 * (utmp(4,j)**2)
-     >              - c1345 * tmp2 * utmp(5,j)
-
-               njac(5,2,j) = (  c3c4 - c1345 ) * tmp2 * utmp(2,j)
-               njac(5,3,j) = ( con43 * c3c4
-     >              - c1345 ) * tmp2 * utmp(3,j)
-               njac(5,4,j) = ( c3c4 - c1345 ) * tmp2 * utmp(4,j)
-               njac(5,5,j) = ( c1345 ) * tmp1
-
-            enddo
-
-c---------------------------------------------------------------------
-c     now joacobians set, so form left hand side in y direction
-c---------------------------------------------------------------------
-            do j = start(2,c), jsize-end(2,c)
-
-               tmp1 = dt * ty1
-               tmp2 = dt * ty2
-
-               lhsa(1,1,j) = - tmp2 * fjac(1,1,j-1)
-     >              - tmp1 * njac(1,1,j-1)
-     >              - tmp1 * dy1 
-               lhsa(1,2,j) = - tmp2 * fjac(1,2,j-1)
-     >              - tmp1 * njac(1,2,j-1)
-               lhsa(1,3,j) = - tmp2 * fjac(1,3,j-1)
-     >              - tmp1 * njac(1,3,j-1)
-               lhsa(1,4,j) = - tmp2 * fjac(1,4,j-1)
-     >              - tmp1 * njac(1,4,j-1)
-               lhsa(1,5,j) = - tmp2 * fjac(1,5,j-1)
-     >              - tmp1 * njac(1,5,j-1)
-
-               lhsa(2,1,j) = - tmp2 * fjac(2,1,j-1)
-     >              - tmp1 * njac(2,1,j-1)
-               lhsa(2,2,j) = - tmp2 * fjac(2,2,j-1)
-     >              - tmp1 * njac(2,2,j-1)
-     >              - tmp1 * dy2
-               lhsa(2,3,j) = - tmp2 * fjac(2,3,j-1)
-     >              - tmp1 * njac(2,3,j-1)
-               lhsa(2,4,j) = - tmp2 * fjac(2,4,j-1)
-     >              - tmp1 * njac(2,4,j-1)
-               lhsa(2,5,j) = - tmp2 * fjac(2,5,j-1)
-     >              - tmp1 * njac(2,5,j-1)
-
-               lhsa(3,1,j) = - tmp2 * fjac(3,1,j-1)
-     >              - tmp1 * njac(3,1,j-1)
-               lhsa(3,2,j) = - tmp2 * fjac(3,2,j-1)
-     >              - tmp1 * njac(3,2,j-1)
-               lhsa(3,3,j) = - tmp2 * fjac(3,3,j-1)
-     >              - tmp1 * njac(3,3,j-1)
-     >              - tmp1 * dy3 
-               lhsa(3,4,j) = - tmp2 * fjac(3,4,j-1)
-     >              - tmp1 * njac(3,4,j-1)
-               lhsa(3,5,j) = - tmp2 * fjac(3,5,j-1)
-     >              - tmp1 * njac(3,5,j-1)
-
-               lhsa(4,1,j) = - tmp2 * fjac(4,1,j-1)
-     >              - tmp1 * njac(4,1,j-1)
-               lhsa(4,2,j) = - tmp2 * fjac(4,2,j-1)
-     >              - tmp1 * njac(4,2,j-1)
-               lhsa(4,3,j) = - tmp2 * fjac(4,3,j-1)
-     >              - tmp1 * njac(4,3,j-1)
-               lhsa(4,4,j) = - tmp2 * fjac(4,4,j-1)
-     >              - tmp1 * njac(4,4,j-1)
-     >              - tmp1 * dy4
-               lhsa(4,5,j) = - tmp2 * fjac(4,5,j-1)
-     >              - tmp1 * njac(4,5,j-1)
-
-               lhsa(5,1,j) = - tmp2 * fjac(5,1,j-1)
-     >              - tmp1 * njac(5,1,j-1)
-               lhsa(5,2,j) = - tmp2 * fjac(5,2,j-1)
-     >              - tmp1 * njac(5,2,j-1)
-               lhsa(5,3,j) = - tmp2 * fjac(5,3,j-1)
-     >              - tmp1 * njac(5,3,j-1)
-               lhsa(5,4,j) = - tmp2 * fjac(5,4,j-1)
-     >              - tmp1 * njac(5,4,j-1)
-               lhsa(5,5,j) = - tmp2 * fjac(5,5,j-1)
-     >              - tmp1 * njac(5,5,j-1)
-     >              - tmp1 * dy5
-
-               lhsb(1,1,j) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(1,1,j)
-     >              + tmp1 * 2.0d+00 * dy1
-               lhsb(1,2,j) = tmp1 * 2.0d+00 * njac(1,2,j)
-               lhsb(1,3,j) = tmp1 * 2.0d+00 * njac(1,3,j)
-               lhsb(1,4,j) = tmp1 * 2.0d+00 * njac(1,4,j)
-               lhsb(1,5,j) = tmp1 * 2.0d+00 * njac(1,5,j)
-
-               lhsb(2,1,j) = tmp1 * 2.0d+00 * njac(2,1,j)
-               lhsb(2,2,j) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(2,2,j)
-     >              + tmp1 * 2.0d+00 * dy2
-               lhsb(2,3,j) = tmp1 * 2.0d+00 * njac(2,3,j)
-               lhsb(2,4,j) = tmp1 * 2.0d+00 * njac(2,4,j)
-               lhsb(2,5,j) = tmp1 * 2.0d+00 * njac(2,5,j)
-
-               lhsb(3,1,j) = tmp1 * 2.0d+00 * njac(3,1,j)
-               lhsb(3,2,j) = tmp1 * 2.0d+00 * njac(3,2,j)
-               lhsb(3,3,j) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(3,3,j)
-     >              + tmp1 * 2.0d+00 * dy3
-               lhsb(3,4,j) = tmp1 * 2.0d+00 * njac(3,4,j)
-               lhsb(3,5,j) = tmp1 * 2.0d+00 * njac(3,5,j)
-
-               lhsb(4,1,j) = tmp1 * 2.0d+00 * njac(4,1,j)
-               lhsb(4,2,j) = tmp1 * 2.0d+00 * njac(4,2,j)
-               lhsb(4,3,j) = tmp1 * 2.0d+00 * njac(4,3,j)
-               lhsb(4,4,j) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(4,4,j)
-     >              + tmp1 * 2.0d+00 * dy4
-               lhsb(4,5,j) = tmp1 * 2.0d+00 * njac(4,5,j)
-
-               lhsb(5,1,j) = tmp1 * 2.0d+00 * njac(5,1,j)
-               lhsb(5,2,j) = tmp1 * 2.0d+00 * njac(5,2,j)
-               lhsb(5,3,j) = tmp1 * 2.0d+00 * njac(5,3,j)
-               lhsb(5,4,j) = tmp1 * 2.0d+00 * njac(5,4,j)
-               lhsb(5,5,j) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(5,5,j) 
-     >              + tmp1 * 2.0d+00 * dy5
-
-               lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,j+1)
-     >              - tmp1 * njac(1,1,j+1)
-     >              - tmp1 * dy1
-               lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,j+1)
-     >              - tmp1 * njac(1,2,j+1)
-               lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,j+1)
-     >              - tmp1 * njac(1,3,j+1)
-               lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,j+1)
-     >              - tmp1 * njac(1,4,j+1)
-               lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,j+1)
-     >              - tmp1 * njac(1,5,j+1)
-
-               lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,j+1)
-     >              - tmp1 * njac(2,1,j+1)
-               lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,j+1)
-     >              - tmp1 * njac(2,2,j+1)
-     >              - tmp1 * dy2
-               lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,j+1)
-     >              - tmp1 * njac(2,3,j+1)
-               lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,j+1)
-     >              - tmp1 * njac(2,4,j+1)
-               lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,j+1)
-     >              - tmp1 * njac(2,5,j+1)
-
-               lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,j+1)
-     >              - tmp1 * njac(3,1,j+1)
-               lhsc(3,2,i,j,k,c) =  tmp2 * fjac(3,2,j+1)
-     >              - tmp1 * njac(3,2,j+1)
-               lhsc(3,3,i,j,k,c) =  tmp2 * fjac(3,3,j+1)
-     >              - tmp1 * njac(3,3,j+1)
-     >              - tmp1 * dy3
-               lhsc(3,4,i,j,k,c) =  tmp2 * fjac(3,4,j+1)
-     >              - tmp1 * njac(3,4,j+1)
-               lhsc(3,5,i,j,k,c) =  tmp2 * fjac(3,5,j+1)
-     >              - tmp1 * njac(3,5,j+1)
-
-               lhsc(4,1,i,j,k,c) =  tmp2 * fjac(4,1,j+1)
-     >              - tmp1 * njac(4,1,j+1)
-               lhsc(4,2,i,j,k,c) =  tmp2 * fjac(4,2,j+1)
-     >              - tmp1 * njac(4,2,j+1)
-               lhsc(4,3,i,j,k,c) =  tmp2 * fjac(4,3,j+1)
-     >              - tmp1 * njac(4,3,j+1)
-               lhsc(4,4,i,j,k,c) =  tmp2 * fjac(4,4,j+1)
-     >              - tmp1 * njac(4,4,j+1)
-     >              - tmp1 * dy4
-               lhsc(4,5,i,j,k,c) =  tmp2 * fjac(4,5,j+1)
-     >              - tmp1 * njac(4,5,j+1)
-
-               lhsc(5,1,i,j,k,c) =  tmp2 * fjac(5,1,j+1)
-     >              - tmp1 * njac(5,1,j+1)
-               lhsc(5,2,i,j,k,c) =  tmp2 * fjac(5,2,j+1)
-     >              - tmp1 * njac(5,2,j+1)
-               lhsc(5,3,i,j,k,c) =  tmp2 * fjac(5,3,j+1)
-     >              - tmp1 * njac(5,3,j+1)
-               lhsc(5,4,i,j,k,c) =  tmp2 * fjac(5,4,j+1)
-     >              - tmp1 * njac(5,4,j+1)
-               lhsc(5,5,i,j,k,c) =  tmp2 * fjac(5,5,j+1)
-     >              - tmp1 * njac(5,5,j+1)
-     >              - tmp1 * dy5
-
-            enddo
-
-
-c---------------------------------------------------------------------
-c     outer most do loops - sweeping in i direction
-c---------------------------------------------------------------------
-            if (first .eq. 1) then 
-
-c---------------------------------------------------------------------
-c     multiply c(i,jstart,k) by b_inverse and copy back to c
-c     multiply rhs(jstart) by b_inverse(jstart) and copy to rhs
-c---------------------------------------------------------------------
-               call binvcrhs( lhsb(1,1,jstart),
-     >                        lhsc(1,1,i,jstart,k,c),
-     >                        rhs(1,i,jstart,k,c) )
-
-            endif
-
-c---------------------------------------------------------------------
-c     begin inner most do loop
-c     do all the elements of the cell unless last 
-c---------------------------------------------------------------------
-            do j=jstart+first,jsize-last
-
-c---------------------------------------------------------------------
-c     subtract A*lhs_vector(j-1) from lhs_vector(j)
-c     
-c     rhs(j) = rhs(j) - A*rhs(j-1)
-c---------------------------------------------------------------------
-               call matvec_sub(lhsa(1,1,j),
-     >                         rhs(1,i,j-1,k,c),rhs(1,i,j,k,c))
-
-c---------------------------------------------------------------------
-c     B(j) = B(j) - C(j-1)*A(j)
-c---------------------------------------------------------------------
-               call matmul_sub(lhsa(1,1,j),
-     >                         lhsc(1,1,i,j-1,k,c),
-     >                         lhsb(1,1,j))
-
-c---------------------------------------------------------------------
-c     multiply c(i,j,k) by b_inverse and copy back to c
-c     multiply rhs(i,1,k) by b_inverse(i,1,k) and copy to rhs
-c---------------------------------------------------------------------
-               call binvcrhs( lhsb(1,1,j),
-     >                        lhsc(1,1,i,j,k,c),
-     >                        rhs(1,i,j,k,c) )
-
-            enddo
-
-c---------------------------------------------------------------------
-c     Now finish up special cases for last cell
-c---------------------------------------------------------------------
-            if (last .eq. 1) then
-
-c---------------------------------------------------------------------
-c     rhs(jsize) = rhs(jsize) - A*rhs(jsize-1)
-c---------------------------------------------------------------------
-               call matvec_sub(lhsa(1,1,jsize),
-     >                         rhs(1,i,jsize-1,k,c),rhs(1,i,jsize,k,c))
-
-c---------------------------------------------------------------------
-c     B(jsize) = B(jsize) - C(jsize-1)*A(jsize)
-c     call matmul_sub(aa,i,jsize,k,c,
-c     $              cc,i,jsize-1,k,c,bb,i,jsize,k,c)
-c---------------------------------------------------------------------
-               call matmul_sub(lhsa(1,1,jsize),
-     >                         lhsc(1,1,i,jsize-1,k,c),
-     >                         lhsb(1,1,jsize))
-
-c---------------------------------------------------------------------
-c     multiply rhs(jsize) by b_inverse(jsize) and copy to rhs
-c---------------------------------------------------------------------
-               call binvrhs( lhsb(1,1,jsize),
-     >                       rhs(1,i,jsize,k,c) )
-
-            endif
-         enddo
-      enddo
-
-
-      return
-      end
-      
-
-
diff --git a/examples/smpi/NAS/BT/y_solve_vec.f b/examples/smpi/NAS/BT/y_solve_vec.f
deleted file mode 100644
index e21cfa36ce..0000000000
--- a/examples/smpi/NAS/BT/y_solve_vec.f
+++ /dev/null
@@ -1,788 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine y_solve
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     Performs line solves in Y direction by first factoring
-c     the block-tridiagonal matrix into an upper triangular matrix, 
-c     and then performing back substitution to solve for the unknow
-c     vectors of each line.  
-c     
-c     Make sure we treat elements zero to cell_size in the direction
-c     of the sweep.
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer 
-     >     c, jstart, stage,
-     >     first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
-     >     isize,jsize,ksize,send_id
-
-      jstart = 0
-
-c---------------------------------------------------------------------
-c     in our terminology stage is the number of the cell in the y-direct
-c     i.e. stage = 1 means the start of the line stage=ncells means end
-c---------------------------------------------------------------------
-      do stage = 1,ncells
-         c = slice(2,stage)
-         isize = cell_size(1,c) - 1
-         jsize = cell_size(2,c) - 1
-         ksize = cell_size(3,c) - 1
-
-c---------------------------------------------------------------------
-c     set last-cell flag
-c---------------------------------------------------------------------
-         if (stage .eq. ncells) then
-            last = 1
-         else
-            last = 0
-         endif
-
-         if (stage .eq. 1) then
-c---------------------------------------------------------------------
-c     This is the first cell, so solve without receiving data
-c---------------------------------------------------------------------
-            first = 1
-c            call lhsy(c)
-            call y_solve_cell(first,last,c)
-         else
-c---------------------------------------------------------------------
-c     Not the first cell of this line, so receive info from
-c     processor working on preceeding cell
-c---------------------------------------------------------------------
-            first = 0
-            call y_receive_solve_info(recv_id,c)
-c---------------------------------------------------------------------
-c     overlap computations and communications
-c---------------------------------------------------------------------
-c            call lhsy(c)
-c---------------------------------------------------------------------
-c     wait for completion
-c---------------------------------------------------------------------
-            call mpi_wait(send_id,r_status,error)
-            call mpi_wait(recv_id,r_status,error)
-c---------------------------------------------------------------------
-c     install C'(jstart+1) and rhs'(jstart+1) to be used in this cell
-c---------------------------------------------------------------------
-            call y_unpack_solve_info(c)
-            call y_solve_cell(first,last,c)
-         endif
-
-         if (last .eq. 0) call y_send_solve_info(send_id,c)
-      enddo
-
-c---------------------------------------------------------------------
-c     now perform backsubstitution in reverse direction
-c---------------------------------------------------------------------
-      do stage = ncells, 1, -1
-         c = slice(2,stage)
-         first = 0
-         last = 0
-         if (stage .eq. 1) first = 1
-         if (stage .eq. ncells) then
-            last = 1
-c---------------------------------------------------------------------
-c     last cell, so perform back substitute without waiting
-c---------------------------------------------------------------------
-            call y_backsubstitute(first, last,c)
-         else
-            call y_receive_backsub_info(recv_id,c)
-            call mpi_wait(send_id,r_status,error)
-            call mpi_wait(recv_id,r_status,error)
-            call y_unpack_backsub_info(c)
-            call y_backsubstitute(first,last,c)
-         endif
-         if (first .eq. 0) call y_send_backsub_info(send_id,c)
-      enddo
-
-
-      return
-      end
-      
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      
-      subroutine y_unpack_solve_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     unpack C'(-1) and rhs'(-1) for
-c     all i and k
-c---------------------------------------------------------------------
-
-      include 'header.h'
-
-      integer i,k,m,n,ptr,c,jstart 
-
-      jstart = 0
-      ptr = 0
-      do k=0,KMAX-1
-         do i=0,IMAX-1
-            do m=1,BLOCK_SIZE
-               do n=1,BLOCK_SIZE
-                  lhsc(m,n,i,jstart-1,k,c) = out_buffer(ptr+n)
-               enddo
-               ptr = ptr+BLOCK_SIZE
-            enddo
-            do n=1,BLOCK_SIZE
-               rhs(n,i,jstart-1,k,c) = out_buffer(ptr+n)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      
-      subroutine y_send_solve_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     pack up and send C'(jend) and rhs'(jend) for
-c     all i and k
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer i,k,m,n,jsize,ptr,c,ip,kp
-      integer error,send_id,buffer_size 
-
-      jsize = cell_size(2,c)-1
-      ip = cell_coord(1,c) - 1
-      kp = cell_coord(3,c) - 1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
-     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
-
-c---------------------------------------------------------------------
-c     pack up buffer
-c---------------------------------------------------------------------
-      ptr = 0
-      do k=0,KMAX-1
-         do i=0,IMAX-1
-            do m=1,BLOCK_SIZE
-               do n=1,BLOCK_SIZE
-                  in_buffer(ptr+n) = lhsc(m,n,i,jsize,k,c)
-               enddo
-               ptr = ptr+BLOCK_SIZE
-            enddo
-            do n=1,BLOCK_SIZE
-               in_buffer(ptr+n) = rhs(n,i,jsize,k,c)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-c---------------------------------------------------------------------
-c     send buffer 
-c---------------------------------------------------------------------
-      call mpi_isend(in_buffer, buffer_size,
-     >     dp_type, successor(2),
-     >     SOUTH+ip+kp*NCELLS, comm_solve,
-     >     send_id,error)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine y_send_backsub_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     pack up and send U(jstart) for all i and k
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer i,k,n,ptr,c,jstart,ip,kp
-      integer error,send_id,buffer_size
-
-c---------------------------------------------------------------------
-c     Send element 0 to previous processor
-c---------------------------------------------------------------------
-      jstart = 0
-      ip = cell_coord(1,c)-1
-      kp = cell_coord(3,c)-1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
-      ptr = 0
-      do k=0,KMAX-1
-         do i=0,IMAX-1
-            do n=1,BLOCK_SIZE
-               in_buffer(ptr+n) = rhs(n,i,jstart,k,c)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-      call mpi_isend(in_buffer, buffer_size,
-     >     dp_type, predecessor(2), 
-     >     NORTH+ip+kp*NCELLS, comm_solve, 
-     >     send_id,error)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine y_unpack_backsub_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     unpack U(jsize) for all i and k
-c---------------------------------------------------------------------
-
-      include 'header.h'
-
-      integer i,k,n,ptr,c 
-
-      ptr = 0
-      do k=0,KMAX-1
-         do i=0,IMAX-1
-            do n=1,BLOCK_SIZE
-               backsub_info(n,i,k,c) = out_buffer(ptr+n)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine y_receive_backsub_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     post mpi receives
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer error,recv_id,ip,kp,c,buffer_size
-      ip = cell_coord(1,c) - 1
-      kp = cell_coord(3,c) - 1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
-      call mpi_irecv(out_buffer, buffer_size,
-     >     dp_type, successor(2), 
-     >     NORTH+ip+kp*NCELLS, comm_solve, 
-     >     recv_id, error)
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine y_receive_solve_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     post mpi receives 
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer ip,kp,recv_id,error,c,buffer_size
-      ip = cell_coord(1,c) - 1
-      kp = cell_coord(3,c) - 1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
-     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
-      call mpi_irecv(out_buffer, buffer_size, 
-     >     dp_type, predecessor(2), 
-     >     SOUTH+ip+kp*NCELLS,  comm_solve, 
-     >     recv_id, error)
-
-      return
-      end
-      
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine y_backsubstitute(first, last, c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     back solve: if last cell, then generate U(jsize)=rhs(jsize)
-c     else assume U(jsize) is loaded in un pack backsub_info
-c     so just use it
-c     after call u(jstart) will be sent to next cell
-c---------------------------------------------------------------------
-
-      include 'header.h'
-
-      integer first, last, c, i, k
-      integer m,n,j,jsize,isize,ksize,jstart
-      
-      jstart = 0
-      isize = cell_size(1,c)-end(1,c)-1      
-      jsize = cell_size(2,c)-1
-      ksize = cell_size(3,c)-end(3,c)-1
-      if (last .eq. 0) then
-         do k=start(3,c),ksize
-            do i=start(1,c),isize
-c---------------------------------------------------------------------
-c     U(jsize) uses info from previous cell if not last cell
-c---------------------------------------------------------------------
-               do m=1,BLOCK_SIZE
-                  do n=1,BLOCK_SIZE
-                     rhs(m,i,jsize,k,c) = rhs(m,i,jsize,k,c) 
-     >                    - lhsc(m,n,i,jsize,k,c)*
-     >                    backsub_info(n,i,k,c)
-                  enddo
-               enddo
-            enddo
-         enddo
-      endif
-      do k=start(3,c),ksize
-         do j=jsize-1,jstart,-1
-            do i=start(1,c),isize
-               do m=1,BLOCK_SIZE
-                  do n=1,BLOCK_SIZE
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) 
-     >                    - lhsc(m,n,i,j,k,c)*rhs(n,i,j+1,k,c)
-                  enddo
-               enddo
-            enddo
-         enddo
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine y_solve_cell(first,last,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     performs guaussian elimination on this cell.
-c     
-c     assumes that unpacking routines for non-first cells 
-c     preload C' and rhs' from previous cell.
-c     
-c     assumed send happens outside this routine, but that
-c     c'(JMAX) and rhs'(JMAX) will be sent to next cell
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'work_lhs_vec.h'
-
-      integer first,last,c
-      integer i,j,k,m,n,isize,ksize,jsize,jstart
-
-      jstart = 0
-      isize = cell_size(1,c)-end(1,c)-1
-      jsize = cell_size(2,c)-1
-      ksize = cell_size(3,c)-end(3,c)-1
-
-c---------------------------------------------------------------------
-c     zero the left hand side for starters
-c     set diagonal values to 1. This is overkill, but convenient
-c---------------------------------------------------------------------
-      do i = 0, isize
-         do m = 1, 5
-            do n = 1, 5
-               lhsa(m,n,i,0) = 0.0d0
-               lhsb(m,n,i,0) = 0.0d0
-               lhsa(m,n,i,jsize) = 0.0d0
-               lhsb(m,n,i,jsize) = 0.0d0
-            enddo
-            lhsb(m,m,i,0) = 1.0d0
-            lhsb(m,m,i,jsize) = 1.0d0
-         enddo
-      enddo
-
-      do k=start(3,c),ksize 
-
-c---------------------------------------------------------------------
-c     This function computes the left hand side for the three y-factors 
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     Compute the indices for storing the tri-diagonal matrix;
-c     determine a (labeled f) and n jacobians for cell c
-c---------------------------------------------------------------------
-
-         do j = start(2,c)-1, cell_size(2,c)-end(2,c)
-            do i=start(1,c),isize
-
-               tmp1 = 1.0d0 / u(1,i,j,k,c)
-               tmp2 = tmp1 * tmp1
-               tmp3 = tmp1 * tmp2
-
-               fjac(1,1,i,j) = 0.0d+00
-               fjac(1,2,i,j) = 0.0d+00
-               fjac(1,3,i,j) = 1.0d+00
-               fjac(1,4,i,j) = 0.0d+00
-               fjac(1,5,i,j) = 0.0d+00
-
-               fjac(2,1,i,j) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) )
-     >              * tmp2
-               fjac(2,2,i,j) = u(3,i,j,k,c) * tmp1
-               fjac(2,3,i,j) = u(2,i,j,k,c) * tmp1
-               fjac(2,4,i,j) = 0.0d+00
-               fjac(2,5,i,j) = 0.0d+00
-
-               fjac(3,1,i,j) = - ( u(3,i,j,k,c)*u(3,i,j,k,c)*tmp2)
-     >              + c2 * qs(i,j,k,c)
-               fjac(3,2,i,j) = - c2 *  u(2,i,j,k,c) * tmp1
-               fjac(3,3,i,j) = ( 2.0d+00 - c2 )
-     >              *  u(3,i,j,k,c) * tmp1 
-               fjac(3,4,i,j) = - c2 * u(4,i,j,k,c) * tmp1 
-               fjac(3,5,i,j) = c2
-
-               fjac(4,1,i,j) = - ( u(3,i,j,k,c)*u(4,i,j,k,c) )
-     >              * tmp2
-               fjac(4,2,i,j) = 0.0d+00
-               fjac(4,3,i,j) = u(4,i,j,k,c) * tmp1
-               fjac(4,4,i,j) = u(3,i,j,k,c) * tmp1
-               fjac(4,5,i,j) = 0.0d+00
-
-               fjac(5,1,i,j) = ( c2 * 2.0d0 * qs(i,j,k,c)
-     >              - c1 * u(5,i,j,k,c) * tmp1 ) 
-     >              * u(3,i,j,k,c) * tmp1 
-               fjac(5,2,i,j) = - c2 * u(2,i,j,k,c)*u(3,i,j,k,c) 
-     >              * tmp2
-               fjac(5,3,i,j) = c1 * u(5,i,j,k,c) * tmp1 
-     >              - c2 * ( qs(i,j,k,c)
-     >              + u(3,i,j,k,c)*u(3,i,j,k,c) * tmp2 )
-               fjac(5,4,i,j) = - c2 * ( u(3,i,j,k,c)*u(4,i,j,k,c) )
-     >              * tmp2
-               fjac(5,5,i,j) = c1 * u(3,i,j,k,c) * tmp1 
-
-               njac(1,1,i,j) = 0.0d+00
-               njac(1,2,i,j) = 0.0d+00
-               njac(1,3,i,j) = 0.0d+00
-               njac(1,4,i,j) = 0.0d+00
-               njac(1,5,i,j) = 0.0d+00
-
-               njac(2,1,i,j) = - c3c4 * tmp2 * u(2,i,j,k,c)
-               njac(2,2,i,j) =   c3c4 * tmp1
-               njac(2,3,i,j) =   0.0d+00
-               njac(2,4,i,j) =   0.0d+00
-               njac(2,5,i,j) =   0.0d+00
-
-               njac(3,1,i,j) = - con43 * c3c4 * tmp2 * u(3,i,j,k,c)
-               njac(3,2,i,j) =   0.0d+00
-               njac(3,3,i,j) =   con43 * c3c4 * tmp1
-               njac(3,4,i,j) =   0.0d+00
-               njac(3,5,i,j) =   0.0d+00
-
-               njac(4,1,i,j) = - c3c4 * tmp2 * u(4,i,j,k,c)
-               njac(4,2,i,j) =   0.0d+00
-               njac(4,3,i,j) =   0.0d+00
-               njac(4,4,i,j) =   c3c4 * tmp1
-               njac(4,5,i,j) =   0.0d+00
-
-               njac(5,1,i,j) = - (  c3c4
-     >              - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
-     >              - ( con43 * c3c4
-     >              - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
-     >              - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
-     >              - c1345 * tmp2 * u(5,i,j,k,c)
-
-               njac(5,2,i,j) = (  c3c4 - c1345 ) * tmp2 * u(2,i,j,k,c)
-               njac(5,3,i,j) = ( con43 * c3c4
-     >              - c1345 ) * tmp2 * u(3,i,j,k,c)
-               njac(5,4,i,j) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c)
-               njac(5,5,i,j) = ( c1345 ) * tmp1
-
-            enddo
-         enddo
-
-c---------------------------------------------------------------------
-c     now joacobians set, so form left hand side in y direction
-c---------------------------------------------------------------------
-         do j = start(2,c), jsize-end(2,c)
-            do i=start(1,c),isize
-
-               tmp1 = dt * ty1
-               tmp2 = dt * ty2
-
-               lhsa(1,1,i,j) = - tmp2 * fjac(1,1,i,j-1)
-     >              - tmp1 * njac(1,1,i,j-1)
-     >              - tmp1 * dy1 
-               lhsa(1,2,i,j) = - tmp2 * fjac(1,2,i,j-1)
-     >              - tmp1 * njac(1,2,i,j-1)
-               lhsa(1,3,i,j) = - tmp2 * fjac(1,3,i,j-1)
-     >              - tmp1 * njac(1,3,i,j-1)
-               lhsa(1,4,i,j) = - tmp2 * fjac(1,4,i,j-1)
-     >              - tmp1 * njac(1,4,i,j-1)
-               lhsa(1,5,i,j) = - tmp2 * fjac(1,5,i,j-1)
-     >              - tmp1 * njac(1,5,i,j-1)
-
-               lhsa(2,1,i,j) = - tmp2 * fjac(2,1,i,j-1)
-     >              - tmp1 * njac(2,1,i,j-1)
-               lhsa(2,2,i,j) = - tmp2 * fjac(2,2,i,j-1)
-     >              - tmp1 * njac(2,2,i,j-1)
-     >              - tmp1 * dy2
-               lhsa(2,3,i,j) = - tmp2 * fjac(2,3,i,j-1)
-     >              - tmp1 * njac(2,3,i,j-1)
-               lhsa(2,4,i,j) = - tmp2 * fjac(2,4,i,j-1)
-     >              - tmp1 * njac(2,4,i,j-1)
-               lhsa(2,5,i,j) = - tmp2 * fjac(2,5,i,j-1)
-     >              - tmp1 * njac(2,5,i,j-1)
-
-               lhsa(3,1,i,j) = - tmp2 * fjac(3,1,i,j-1)
-     >              - tmp1 * njac(3,1,i,j-1)
-               lhsa(3,2,i,j) = - tmp2 * fjac(3,2,i,j-1)
-     >              - tmp1 * njac(3,2,i,j-1)
-               lhsa(3,3,i,j) = - tmp2 * fjac(3,3,i,j-1)
-     >              - tmp1 * njac(3,3,i,j-1)
-     >              - tmp1 * dy3 
-               lhsa(3,4,i,j) = - tmp2 * fjac(3,4,i,j-1)
-     >              - tmp1 * njac(3,4,i,j-1)
-               lhsa(3,5,i,j) = - tmp2 * fjac(3,5,i,j-1)
-     >              - tmp1 * njac(3,5,i,j-1)
-
-               lhsa(4,1,i,j) = - tmp2 * fjac(4,1,i,j-1)
-     >              - tmp1 * njac(4,1,i,j-1)
-               lhsa(4,2,i,j) = - tmp2 * fjac(4,2,i,j-1)
-     >              - tmp1 * njac(4,2,i,j-1)
-               lhsa(4,3,i,j) = - tmp2 * fjac(4,3,i,j-1)
-     >              - tmp1 * njac(4,3,i,j-1)
-               lhsa(4,4,i,j) = - tmp2 * fjac(4,4,i,j-1)
-     >              - tmp1 * njac(4,4,i,j-1)
-     >              - tmp1 * dy4
-               lhsa(4,5,i,j) = - tmp2 * fjac(4,5,i,j-1)
-     >              - tmp1 * njac(4,5,i,j-1)
-
-               lhsa(5,1,i,j) = - tmp2 * fjac(5,1,i,j-1)
-     >              - tmp1 * njac(5,1,i,j-1)
-               lhsa(5,2,i,j) = - tmp2 * fjac(5,2,i,j-1)
-     >              - tmp1 * njac(5,2,i,j-1)
-               lhsa(5,3,i,j) = - tmp2 * fjac(5,3,i,j-1)
-     >              - tmp1 * njac(5,3,i,j-1)
-               lhsa(5,4,i,j) = - tmp2 * fjac(5,4,i,j-1)
-     >              - tmp1 * njac(5,4,i,j-1)
-               lhsa(5,5,i,j) = - tmp2 * fjac(5,5,i,j-1)
-     >              - tmp1 * njac(5,5,i,j-1)
-     >              - tmp1 * dy5
-
-               lhsb(1,1,i,j) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(1,1,i,j)
-     >              + tmp1 * 2.0d+00 * dy1
-               lhsb(1,2,i,j) = tmp1 * 2.0d+00 * njac(1,2,i,j)
-               lhsb(1,3,i,j) = tmp1 * 2.0d+00 * njac(1,3,i,j)
-               lhsb(1,4,i,j) = tmp1 * 2.0d+00 * njac(1,4,i,j)
-               lhsb(1,5,i,j) = tmp1 * 2.0d+00 * njac(1,5,i,j)
-
-               lhsb(2,1,i,j) = tmp1 * 2.0d+00 * njac(2,1,i,j)
-               lhsb(2,2,i,j) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(2,2,i,j)
-     >              + tmp1 * 2.0d+00 * dy2
-               lhsb(2,3,i,j) = tmp1 * 2.0d+00 * njac(2,3,i,j)
-               lhsb(2,4,i,j) = tmp1 * 2.0d+00 * njac(2,4,i,j)
-               lhsb(2,5,i,j) = tmp1 * 2.0d+00 * njac(2,5,i,j)
-
-               lhsb(3,1,i,j) = tmp1 * 2.0d+00 * njac(3,1,i,j)
-               lhsb(3,2,i,j) = tmp1 * 2.0d+00 * njac(3,2,i,j)
-               lhsb(3,3,i,j) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(3,3,i,j)
-     >              + tmp1 * 2.0d+00 * dy3
-               lhsb(3,4,i,j) = tmp1 * 2.0d+00 * njac(3,4,i,j)
-               lhsb(3,5,i,j) = tmp1 * 2.0d+00 * njac(3,5,i,j)
-
-               lhsb(4,1,i,j) = tmp1 * 2.0d+00 * njac(4,1,i,j)
-               lhsb(4,2,i,j) = tmp1 * 2.0d+00 * njac(4,2,i,j)
-               lhsb(4,3,i,j) = tmp1 * 2.0d+00 * njac(4,3,i,j)
-               lhsb(4,4,i,j) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(4,4,i,j)
-     >              + tmp1 * 2.0d+00 * dy4
-               lhsb(4,5,i,j) = tmp1 * 2.0d+00 * njac(4,5,i,j)
-
-               lhsb(5,1,i,j) = tmp1 * 2.0d+00 * njac(5,1,i,j)
-               lhsb(5,2,i,j) = tmp1 * 2.0d+00 * njac(5,2,i,j)
-               lhsb(5,3,i,j) = tmp1 * 2.0d+00 * njac(5,3,i,j)
-               lhsb(5,4,i,j) = tmp1 * 2.0d+00 * njac(5,4,i,j)
-               lhsb(5,5,i,j) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(5,5,i,j) 
-     >              + tmp1 * 2.0d+00 * dy5
-
-               lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,i,j+1)
-     >              - tmp1 * njac(1,1,i,j+1)
-     >              - tmp1 * dy1
-               lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,i,j+1)
-     >              - tmp1 * njac(1,2,i,j+1)
-               lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,i,j+1)
-     >              - tmp1 * njac(1,3,i,j+1)
-               lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,i,j+1)
-     >              - tmp1 * njac(1,4,i,j+1)
-               lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,i,j+1)
-     >              - tmp1 * njac(1,5,i,j+1)
-
-               lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,i,j+1)
-     >              - tmp1 * njac(2,1,i,j+1)
-               lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,i,j+1)
-     >              - tmp1 * njac(2,2,i,j+1)
-     >              - tmp1 * dy2
-               lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,i,j+1)
-     >              - tmp1 * njac(2,3,i,j+1)
-               lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,i,j+1)
-     >              - tmp1 * njac(2,4,i,j+1)
-               lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,i,j+1)
-     >              - tmp1 * njac(2,5,i,j+1)
-
-               lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,i,j+1)
-     >              - tmp1 * njac(3,1,i,j+1)
-               lhsc(3,2,i,j,k,c) =  tmp2 * fjac(3,2,i,j+1)
-     >              - tmp1 * njac(3,2,i,j+1)
-               lhsc(3,3,i,j,k,c) =  tmp2 * fjac(3,3,i,j+1)
-     >              - tmp1 * njac(3,3,i,j+1)
-     >              - tmp1 * dy3
-               lhsc(3,4,i,j,k,c) =  tmp2 * fjac(3,4,i,j+1)
-     >              - tmp1 * njac(3,4,i,j+1)
-               lhsc(3,5,i,j,k,c) =  tmp2 * fjac(3,5,i,j+1)
-     >              - tmp1 * njac(3,5,i,j+1)
-
-               lhsc(4,1,i,j,k,c) =  tmp2 * fjac(4,1,i,j+1)
-     >              - tmp1 * njac(4,1,i,j+1)
-               lhsc(4,2,i,j,k,c) =  tmp2 * fjac(4,2,i,j+1)
-     >              - tmp1 * njac(4,2,i,j+1)
-               lhsc(4,3,i,j,k,c) =  tmp2 * fjac(4,3,i,j+1)
-     >              - tmp1 * njac(4,3,i,j+1)
-               lhsc(4,4,i,j,k,c) =  tmp2 * fjac(4,4,i,j+1)
-     >              - tmp1 * njac(4,4,i,j+1)
-     >              - tmp1 * dy4
-               lhsc(4,5,i,j,k,c) =  tmp2 * fjac(4,5,i,j+1)
-     >              - tmp1 * njac(4,5,i,j+1)
-
-               lhsc(5,1,i,j,k,c) =  tmp2 * fjac(5,1,i,j+1)
-     >              - tmp1 * njac(5,1,i,j+1)
-               lhsc(5,2,i,j,k,c) =  tmp2 * fjac(5,2,i,j+1)
-     >              - tmp1 * njac(5,2,i,j+1)
-               lhsc(5,3,i,j,k,c) =  tmp2 * fjac(5,3,i,j+1)
-     >              - tmp1 * njac(5,3,i,j+1)
-               lhsc(5,4,i,j,k,c) =  tmp2 * fjac(5,4,i,j+1)
-     >              - tmp1 * njac(5,4,i,j+1)
-               lhsc(5,5,i,j,k,c) =  tmp2 * fjac(5,5,i,j+1)
-     >              - tmp1 * njac(5,5,i,j+1)
-     >              - tmp1 * dy5
-
-            enddo
-         enddo
-
-
-c---------------------------------------------------------------------
-c     outer most do loops - sweeping in i direction
-c---------------------------------------------------------------------
-         if (first .eq. 1) then 
-
-c---------------------------------------------------------------------
-c     multiply c(i,jstart,k) by b_inverse and copy back to c
-c     multiply rhs(jstart) by b_inverse(jstart) and copy to rhs
-c---------------------------------------------------------------------
-!dir$ ivdep
-            do i=start(1,c),isize
-               call binvcrhs( lhsb(1,1,i,jstart),
-     >                        lhsc(1,1,i,jstart,k,c),
-     >                        rhs(1,i,jstart,k,c) )
-            enddo
-
-         endif
-
-c---------------------------------------------------------------------
-c     begin inner most do loop
-c     do all the elements of the cell unless last 
-c---------------------------------------------------------------------
-         do j=jstart+first,jsize-last
-!dir$ ivdep
-            do i=start(1,c),isize
-
-c---------------------------------------------------------------------
-c     subtract A*lhs_vector(j-1) from lhs_vector(j)
-c     
-c     rhs(j) = rhs(j) - A*rhs(j-1)
-c---------------------------------------------------------------------
-               call matvec_sub(lhsa(1,1,i,j),
-     >                         rhs(1,i,j-1,k,c),rhs(1,i,j,k,c))
-
-c---------------------------------------------------------------------
-c     B(j) = B(j) - C(j-1)*A(j)
-c---------------------------------------------------------------------
-               call matmul_sub(lhsa(1,1,i,j),
-     >                         lhsc(1,1,i,j-1,k,c),
-     >                         lhsb(1,1,i,j))
-
-c---------------------------------------------------------------------
-c     multiply c(i,j,k) by b_inverse and copy back to c
-c     multiply rhs(i,1,k) by b_inverse(i,1,k) and copy to rhs
-c---------------------------------------------------------------------
-               call binvcrhs( lhsb(1,1,i,j),
-     >                        lhsc(1,1,i,j,k,c),
-     >                        rhs(1,i,j,k,c) )
-
-            enddo
-         enddo
-
-c---------------------------------------------------------------------
-c     Now finish up special cases for last cell
-c---------------------------------------------------------------------
-         if (last .eq. 1) then
-
-!dir$ ivdep
-            do i=start(1,c),isize
-c---------------------------------------------------------------------
-c     rhs(jsize) = rhs(jsize) - A*rhs(jsize-1)
-c---------------------------------------------------------------------
-               call matvec_sub(lhsa(1,1,i,jsize),
-     >                         rhs(1,i,jsize-1,k,c),rhs(1,i,jsize,k,c))
-
-c---------------------------------------------------------------------
-c     B(jsize) = B(jsize) - C(jsize-1)*A(jsize)
-c     call matmul_sub(aa,i,jsize,k,c,
-c     $              cc,i,jsize-1,k,c,bb,i,jsize,k,c)
-c---------------------------------------------------------------------
-               call matmul_sub(lhsa(1,1,i,jsize),
-     >                         lhsc(1,1,i,jsize-1,k,c),
-     >                         lhsb(1,1,i,jsize))
-
-c---------------------------------------------------------------------
-c     multiply rhs(jsize) by b_inverse(jsize) and copy to rhs
-c---------------------------------------------------------------------
-               call binvrhs( lhsb(1,1,i,jsize),
-     >                       rhs(1,i,jsize,k,c) )
-            enddo
-
-         endif
-      enddo
-
-
-      return
-      end
-      
-
-
diff --git a/examples/smpi/NAS/BT/z_solve.f b/examples/smpi/NAS/BT/z_solve.f
deleted file mode 100644
index d7a5a2f1ec..0000000000
--- a/examples/smpi/NAS/BT/z_solve.f
+++ /dev/null
@@ -1,776 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine z_solve
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     Performs line solves in Z direction by first factoring
-c     the block-tridiagonal matrix into an upper triangular matrix, 
-c     and then performing back substitution to solve for the unknow
-c     vectors of each line.  
-c     
-c     Make sure we treat elements zero to cell_size in the direction
-c     of the sweep.
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer c, kstart, stage,
-     >     first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
-     >     isize,jsize,ksize,send_id
-
-      kstart = 0
-
-c---------------------------------------------------------------------
-c     in our terminology stage is the number of the cell in the y-direction
-c     i.e. stage = 1 means the start of the line stage=ncells means end
-c---------------------------------------------------------------------
-      do stage = 1,ncells
-         c = slice(3,stage)
-         isize = cell_size(1,c) - 1
-         jsize = cell_size(2,c) - 1
-         ksize = cell_size(3,c) - 1
-c---------------------------------------------------------------------
-c     set last-cell flag
-c---------------------------------------------------------------------
-         if (stage .eq. ncells) then
-            last = 1
-         else
-            last = 0
-         endif
-
-         if (stage .eq. 1) then
-c---------------------------------------------------------------------
-c     This is the first cell, so solve without receiving data
-c---------------------------------------------------------------------
-            first = 1
-c            call lhsz(c)
-            call z_solve_cell(first,last,c)
-         else
-c---------------------------------------------------------------------
-c     Not the first cell of this line, so receive info from
-c     processor working on preceeding cell
-c---------------------------------------------------------------------
-            first = 0
-            call z_receive_solve_info(recv_id,c)
-c---------------------------------------------------------------------
-c     overlap computations and communications
-c---------------------------------------------------------------------
-c            call lhsz(c)
-c---------------------------------------------------------------------
-c     wait for completion
-c---------------------------------------------------------------------
-            call mpi_wait(send_id,r_status,error)
-            call mpi_wait(recv_id,r_status,error)
-c---------------------------------------------------------------------
-c     install C'(kstart+1) and rhs'(kstart+1) to be used in this cell
-c---------------------------------------------------------------------
-            call z_unpack_solve_info(c)
-            call z_solve_cell(first,last,c)
-         endif
-
-         if (last .eq. 0) call z_send_solve_info(send_id,c)
-      enddo
-
-c---------------------------------------------------------------------
-c     now perform backsubstitution in reverse direction
-c---------------------------------------------------------------------
-      do stage = ncells, 1, -1
-         c = slice(3,stage)
-         first = 0
-         last = 0
-         if (stage .eq. 1) first = 1
-         if (stage .eq. ncells) then
-            last = 1
-c---------------------------------------------------------------------
-c     last cell, so perform back substitute without waiting
-c---------------------------------------------------------------------
-            call z_backsubstitute(first, last,c)
-         else
-            call z_receive_backsub_info(recv_id,c)
-            call mpi_wait(send_id,r_status,error)
-            call mpi_wait(recv_id,r_status,error)
-            call z_unpack_backsub_info(c)
-            call z_backsubstitute(first,last,c)
-         endif
-         if (first .eq. 0) call z_send_backsub_info(send_id,c)
-      enddo
-
-
-      return
-      end
-      
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      
-      subroutine z_unpack_solve_info(c)
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     unpack C'(-1) and rhs'(-1) for
-c     all i and j
-c---------------------------------------------------------------------
-
-      include 'header.h'
-
-      integer i,j,m,n,ptr,c,kstart 
-
-      kstart = 0
-      ptr = 0
-      do j=0,JMAX-1
-         do i=0,IMAX-1
-            do m=1,BLOCK_SIZE
-               do n=1,BLOCK_SIZE
-                  lhsc(m,n,i,j,kstart-1,c) = out_buffer(ptr+n)
-               enddo
-               ptr = ptr+BLOCK_SIZE
-            enddo
-            do n=1,BLOCK_SIZE
-               rhs(n,i,j,kstart-1,c) = out_buffer(ptr+n)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      
-      subroutine z_send_solve_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     pack up and send C'(kend) and rhs'(kend) for
-c     all i and j
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer i,j,m,n,ksize,ptr,c,ip,jp
-      integer error,send_id,buffer_size
-
-      ksize = cell_size(3,c)-1
-      ip = cell_coord(1,c) - 1
-      jp = cell_coord(2,c) - 1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
-     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
-
-c---------------------------------------------------------------------
-c     pack up buffer
-c---------------------------------------------------------------------
-      ptr = 0
-      do j=0,JMAX-1
-         do i=0,IMAX-1
-            do m=1,BLOCK_SIZE
-               do n=1,BLOCK_SIZE
-                  in_buffer(ptr+n) = lhsc(m,n,i,j,ksize,c)
-               enddo
-               ptr = ptr+BLOCK_SIZE
-            enddo
-            do n=1,BLOCK_SIZE
-               in_buffer(ptr+n) = rhs(n,i,j,ksize,c)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-c---------------------------------------------------------------------
-c     send buffer 
-c---------------------------------------------------------------------
-      call mpi_isend(in_buffer, buffer_size,
-     >     dp_type, successor(3),
-     >     BOTTOM+ip+jp*NCELLS, comm_solve,
-     >     send_id,error)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine z_send_backsub_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     pack up and send U(jstart) for all i and j
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer i,j,n,ptr,c,kstart,ip,jp
-      integer error,send_id,buffer_size
-
-c---------------------------------------------------------------------
-c     Send element 0 to previous processor
-c---------------------------------------------------------------------
-      kstart = 0
-      ip = cell_coord(1,c)-1
-      jp = cell_coord(2,c)-1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
-      ptr = 0
-      do j=0,JMAX-1
-         do i=0,IMAX-1
-            do n=1,BLOCK_SIZE
-               in_buffer(ptr+n) = rhs(n,i,j,kstart,c)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-      call mpi_isend(in_buffer, buffer_size,
-     >     dp_type, predecessor(3), 
-     >     TOP+ip+jp*NCELLS, comm_solve, 
-     >     send_id,error)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine z_unpack_backsub_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     unpack U(ksize) for all i and j
-c---------------------------------------------------------------------
-
-      include 'header.h'
-
-      integer i,j,n,ptr,c
-
-      ptr = 0
-      do j=0,JMAX-1
-         do i=0,IMAX-1
-            do n=1,BLOCK_SIZE
-               backsub_info(n,i,j,c) = out_buffer(ptr+n)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine z_receive_backsub_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     post mpi receives
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer error,recv_id,ip,jp,c,buffer_size
-      ip = cell_coord(1,c) - 1
-      jp = cell_coord(2,c) - 1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
-      call mpi_irecv(out_buffer, buffer_size,
-     >     dp_type, successor(3), 
-     >     TOP+ip+jp*NCELLS, comm_solve, 
-     >     recv_id, error)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine z_receive_solve_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     post mpi receives 
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer ip,jp,recv_id,error,c,buffer_size
-      ip = cell_coord(1,c) - 1
-      jp = cell_coord(2,c) - 1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
-     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
-      call mpi_irecv(out_buffer, buffer_size,
-     >     dp_type, predecessor(3), 
-     >     BOTTOM+ip+jp*NCELLS, comm_solve,
-     >     recv_id, error)
-
-      return
-      end
-      
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine z_backsubstitute(first, last, c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     back solve: if last cell, then generate U(ksize)=rhs(ksize)
-c     else assume U(ksize) is loaded in un pack backsub_info
-c     so just use it
-c     after call u(kstart) will be sent to next cell
-c---------------------------------------------------------------------
-
-      include 'header.h'
-
-      integer first, last, c, i, k
-      integer m,n,j,jsize,isize,ksize,kstart
-      
-      kstart = 0
-      isize = cell_size(1,c)-end(1,c)-1      
-      jsize = cell_size(2,c)-end(2,c)-1
-      ksize = cell_size(3,c)-1
-      if (last .eq. 0) then
-         do j=start(2,c),jsize
-            do i=start(1,c),isize
-c---------------------------------------------------------------------
-c     U(jsize) uses info from previous cell if not last cell
-c---------------------------------------------------------------------
-               do m=1,BLOCK_SIZE
-                  do n=1,BLOCK_SIZE
-                     rhs(m,i,j,ksize,c) = rhs(m,i,j,ksize,c) 
-     >                    - lhsc(m,n,i,j,ksize,c)*
-     >                    backsub_info(n,i,j,c)
-                  enddo
-               enddo
-            enddo
-         enddo
-      endif
-      do k=ksize-1,kstart,-1
-         do j=start(2,c),jsize
-            do i=start(1,c),isize
-               do m=1,BLOCK_SIZE
-                  do n=1,BLOCK_SIZE
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) 
-     >                    - lhsc(m,n,i,j,k,c)*rhs(n,i,j,k+1,c)
-                  enddo
-               enddo
-            enddo
-         enddo
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine z_solve_cell(first,last,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     performs guaussian elimination on this cell.
-c     
-c     assumes that unpacking routines for non-first cells 
-c     preload C' and rhs' from previous cell.
-c     
-c     assumed send happens outside this routine, but that
-c     c'(KMAX) and rhs'(KMAX) will be sent to next cell.
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'work_lhs.h'
-
-      integer first,last,c
-      integer i,j,k,isize,ksize,jsize,kstart
-      double precision utmp(6,-2:KMAX+1)
-
-      kstart = 0
-      isize = cell_size(1,c)-end(1,c)-1
-      jsize = cell_size(2,c)-end(2,c)-1
-      ksize = cell_size(3,c)-1
-
-      call lhsabinit(lhsa, lhsb, ksize)
-
-      do j=start(2,c),jsize 
-         do i=start(1,c),isize
-
-c---------------------------------------------------------------------
-c     This function computes the left hand side for the three z-factors   
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     Compute the indices for storing the block-diagonal matrix;
-c     determine c (labeled f) and s jacobians for cell c
-c---------------------------------------------------------------------
-            do k = start(3,c)-1, cell_size(3,c)-end(3,c)
-               utmp(1,k) = 1.0d0 / u(1,i,j,k,c)
-               utmp(2,k) = u(2,i,j,k,c)
-               utmp(3,k) = u(3,i,j,k,c)
-               utmp(4,k) = u(4,i,j,k,c)
-               utmp(5,k) = u(5,i,j,k,c)
-               utmp(6,k) = qs(i,j,k,c)
-            end do
-
-            do k = start(3,c)-1, cell_size(3,c)-end(3,c)
-
-               tmp1 = utmp(1,k)
-               tmp2 = tmp1 * tmp1
-               tmp3 = tmp1 * tmp2
-
-               fjac(1,1,k) = 0.0d+00
-               fjac(1,2,k) = 0.0d+00
-               fjac(1,3,k) = 0.0d+00
-               fjac(1,4,k) = 1.0d+00
-               fjac(1,5,k) = 0.0d+00
-
-               fjac(2,1,k) = - ( utmp(2,k)*utmp(4,k) ) 
-     >              * tmp2 
-               fjac(2,2,k) = utmp(4,k) * tmp1
-               fjac(2,3,k) = 0.0d+00
-               fjac(2,4,k) = utmp(2,k) * tmp1
-               fjac(2,5,k) = 0.0d+00
-
-               fjac(3,1,k) = - ( utmp(3,k)*utmp(4,k) )
-     >              * tmp2 
-               fjac(3,2,k) = 0.0d+00
-               fjac(3,3,k) = utmp(4,k) * tmp1
-               fjac(3,4,k) = utmp(3,k) * tmp1
-               fjac(3,5,k) = 0.0d+00
-
-               fjac(4,1,k) = - (utmp(4,k)*utmp(4,k) * tmp2 ) 
-     >              + c2 * utmp(6,k)
-               fjac(4,2,k) = - c2 *  utmp(2,k) * tmp1 
-               fjac(4,3,k) = - c2 *  utmp(3,k) * tmp1
-               fjac(4,4,k) = ( 2.0d+00 - c2 )
-     >              *  utmp(4,k) * tmp1 
-               fjac(4,5,k) = c2
-
-               fjac(5,1,k) = ( c2 * 2.0d0 * utmp(6,k)
-     >              - c1 * ( utmp(5,k) * tmp1 ) )
-     >              * ( utmp(4,k) * tmp1 )
-               fjac(5,2,k) = - c2 * ( utmp(2,k)*utmp(4,k) )
-     >              * tmp2 
-               fjac(5,3,k) = - c2 * ( utmp(3,k)*utmp(4,k) )
-     >              * tmp2
-               fjac(5,4,k) = c1 * ( utmp(5,k) * tmp1 )
-     >              - c2 * ( utmp(6,k)
-     >              + utmp(4,k)*utmp(4,k) * tmp2 )
-               fjac(5,5,k) = c1 * utmp(4,k) * tmp1
-
-               njac(1,1,k) = 0.0d+00
-               njac(1,2,k) = 0.0d+00
-               njac(1,3,k) = 0.0d+00
-               njac(1,4,k) = 0.0d+00
-               njac(1,5,k) = 0.0d+00
-
-               njac(2,1,k) = - c3c4 * tmp2 * utmp(2,k)
-               njac(2,2,k) =   c3c4 * tmp1
-               njac(2,3,k) =   0.0d+00
-               njac(2,4,k) =   0.0d+00
-               njac(2,5,k) =   0.0d+00
-
-               njac(3,1,k) = - c3c4 * tmp2 * utmp(3,k)
-               njac(3,2,k) =   0.0d+00
-               njac(3,3,k) =   c3c4 * tmp1
-               njac(3,4,k) =   0.0d+00
-               njac(3,5,k) =   0.0d+00
-
-               njac(4,1,k) = - con43 * c3c4 * tmp2 * utmp(4,k)
-               njac(4,2,k) =   0.0d+00
-               njac(4,3,k) =   0.0d+00
-               njac(4,4,k) =   con43 * c3 * c4 * tmp1
-               njac(4,5,k) =   0.0d+00
-
-               njac(5,1,k) = - (  c3c4
-     >              - c1345 ) * tmp3 * (utmp(2,k)**2)
-     >              - ( c3c4 - c1345 ) * tmp3 * (utmp(3,k)**2)
-     >              - ( con43 * c3c4
-     >              - c1345 ) * tmp3 * (utmp(4,k)**2)
-     >              - c1345 * tmp2 * utmp(5,k)
-
-               njac(5,2,k) = (  c3c4 - c1345 ) * tmp2 * utmp(2,k)
-               njac(5,3,k) = (  c3c4 - c1345 ) * tmp2 * utmp(3,k)
-               njac(5,4,k) = ( con43 * c3c4
-     >              - c1345 ) * tmp2 * utmp(4,k)
-               njac(5,5,k) = ( c1345 )* tmp1
-
-
-            enddo
-
-c---------------------------------------------------------------------
-c     now joacobians set, so form left hand side in z direction
-c---------------------------------------------------------------------
-            do k = start(3,c), ksize-end(3,c)
-
-               tmp1 = dt * tz1
-               tmp2 = dt * tz2
-
-               lhsa(1,1,k) = - tmp2 * fjac(1,1,k-1)
-     >              - tmp1 * njac(1,1,k-1)
-     >              - tmp1 * dz1 
-               lhsa(1,2,k) = - tmp2 * fjac(1,2,k-1)
-     >              - tmp1 * njac(1,2,k-1)
-               lhsa(1,3,k) = - tmp2 * fjac(1,3,k-1)
-     >              - tmp1 * njac(1,3,k-1)
-               lhsa(1,4,k) = - tmp2 * fjac(1,4,k-1)
-     >              - tmp1 * njac(1,4,k-1)
-               lhsa(1,5,k) = - tmp2 * fjac(1,5,k-1)
-     >              - tmp1 * njac(1,5,k-1)
-
-               lhsa(2,1,k) = - tmp2 * fjac(2,1,k-1)
-     >              - tmp1 * njac(2,1,k-1)
-               lhsa(2,2,k) = - tmp2 * fjac(2,2,k-1)
-     >              - tmp1 * njac(2,2,k-1)
-     >              - tmp1 * dz2
-               lhsa(2,3,k) = - tmp2 * fjac(2,3,k-1)
-     >              - tmp1 * njac(2,3,k-1)
-               lhsa(2,4,k) = - tmp2 * fjac(2,4,k-1)
-     >              - tmp1 * njac(2,4,k-1)
-               lhsa(2,5,k) = - tmp2 * fjac(2,5,k-1)
-     >              - tmp1 * njac(2,5,k-1)
-
-               lhsa(3,1,k) = - tmp2 * fjac(3,1,k-1)
-     >              - tmp1 * njac(3,1,k-1)
-               lhsa(3,2,k) = - tmp2 * fjac(3,2,k-1)
-     >              - tmp1 * njac(3,2,k-1)
-               lhsa(3,3,k) = - tmp2 * fjac(3,3,k-1)
-     >              - tmp1 * njac(3,3,k-1)
-     >              - tmp1 * dz3 
-               lhsa(3,4,k) = - tmp2 * fjac(3,4,k-1)
-     >              - tmp1 * njac(3,4,k-1)
-               lhsa(3,5,k) = - tmp2 * fjac(3,5,k-1)
-     >              - tmp1 * njac(3,5,k-1)
-
-               lhsa(4,1,k) = - tmp2 * fjac(4,1,k-1)
-     >              - tmp1 * njac(4,1,k-1)
-               lhsa(4,2,k) = - tmp2 * fjac(4,2,k-1)
-     >              - tmp1 * njac(4,2,k-1)
-               lhsa(4,3,k) = - tmp2 * fjac(4,3,k-1)
-     >              - tmp1 * njac(4,3,k-1)
-               lhsa(4,4,k) = - tmp2 * fjac(4,4,k-1)
-     >              - tmp1 * njac(4,4,k-1)
-     >              - tmp1 * dz4
-               lhsa(4,5,k) = - tmp2 * fjac(4,5,k-1)
-     >              - tmp1 * njac(4,5,k-1)
-
-               lhsa(5,1,k) = - tmp2 * fjac(5,1,k-1)
-     >              - tmp1 * njac(5,1,k-1)
-               lhsa(5,2,k) = - tmp2 * fjac(5,2,k-1)
-     >              - tmp1 * njac(5,2,k-1)
-               lhsa(5,3,k) = - tmp2 * fjac(5,3,k-1)
-     >              - tmp1 * njac(5,3,k-1)
-               lhsa(5,4,k) = - tmp2 * fjac(5,4,k-1)
-     >              - tmp1 * njac(5,4,k-1)
-               lhsa(5,5,k) = - tmp2 * fjac(5,5,k-1)
-     >              - tmp1 * njac(5,5,k-1)
-     >              - tmp1 * dz5
-
-               lhsb(1,1,k) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(1,1,k)
-     >              + tmp1 * 2.0d+00 * dz1
-               lhsb(1,2,k) = tmp1 * 2.0d+00 * njac(1,2,k)
-               lhsb(1,3,k) = tmp1 * 2.0d+00 * njac(1,3,k)
-               lhsb(1,4,k) = tmp1 * 2.0d+00 * njac(1,4,k)
-               lhsb(1,5,k) = tmp1 * 2.0d+00 * njac(1,5,k)
-
-               lhsb(2,1,k) = tmp1 * 2.0d+00 * njac(2,1,k)
-               lhsb(2,2,k) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(2,2,k)
-     >              + tmp1 * 2.0d+00 * dz2
-               lhsb(2,3,k) = tmp1 * 2.0d+00 * njac(2,3,k)
-               lhsb(2,4,k) = tmp1 * 2.0d+00 * njac(2,4,k)
-               lhsb(2,5,k) = tmp1 * 2.0d+00 * njac(2,5,k)
-
-               lhsb(3,1,k) = tmp1 * 2.0d+00 * njac(3,1,k)
-               lhsb(3,2,k) = tmp1 * 2.0d+00 * njac(3,2,k)
-               lhsb(3,3,k) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(3,3,k)
-     >              + tmp1 * 2.0d+00 * dz3
-               lhsb(3,4,k) = tmp1 * 2.0d+00 * njac(3,4,k)
-               lhsb(3,5,k) = tmp1 * 2.0d+00 * njac(3,5,k)
-
-               lhsb(4,1,k) = tmp1 * 2.0d+00 * njac(4,1,k)
-               lhsb(4,2,k) = tmp1 * 2.0d+00 * njac(4,2,k)
-               lhsb(4,3,k) = tmp1 * 2.0d+00 * njac(4,3,k)
-               lhsb(4,4,k) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(4,4,k)
-     >              + tmp1 * 2.0d+00 * dz4
-               lhsb(4,5,k) = tmp1 * 2.0d+00 * njac(4,5,k)
-
-               lhsb(5,1,k) = tmp1 * 2.0d+00 * njac(5,1,k)
-               lhsb(5,2,k) = tmp1 * 2.0d+00 * njac(5,2,k)
-               lhsb(5,3,k) = tmp1 * 2.0d+00 * njac(5,3,k)
-               lhsb(5,4,k) = tmp1 * 2.0d+00 * njac(5,4,k)
-               lhsb(5,5,k) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(5,5,k) 
-     >              + tmp1 * 2.0d+00 * dz5
-
-               lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,k+1)
-     >              - tmp1 * njac(1,1,k+1)
-     >              - tmp1 * dz1
-               lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,k+1)
-     >              - tmp1 * njac(1,2,k+1)
-               lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,k+1)
-     >              - tmp1 * njac(1,3,k+1)
-               lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,k+1)
-     >              - tmp1 * njac(1,4,k+1)
-               lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,k+1)
-     >              - tmp1 * njac(1,5,k+1)
-
-               lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,k+1)
-     >              - tmp1 * njac(2,1,k+1)
-               lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,k+1)
-     >              - tmp1 * njac(2,2,k+1)
-     >              - tmp1 * dz2
-               lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,k+1)
-     >              - tmp1 * njac(2,3,k+1)
-               lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,k+1)
-     >              - tmp1 * njac(2,4,k+1)
-               lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,k+1)
-     >              - tmp1 * njac(2,5,k+1)
-
-               lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,k+1)
-     >              - tmp1 * njac(3,1,k+1)
-               lhsc(3,2,i,j,k,c) =  tmp2 * fjac(3,2,k+1)
-     >              - tmp1 * njac(3,2,k+1)
-               lhsc(3,3,i,j,k,c) =  tmp2 * fjac(3,3,k+1)
-     >              - tmp1 * njac(3,3,k+1)
-     >              - tmp1 * dz3
-               lhsc(3,4,i,j,k,c) =  tmp2 * fjac(3,4,k+1)
-     >              - tmp1 * njac(3,4,k+1)
-               lhsc(3,5,i,j,k,c) =  tmp2 * fjac(3,5,k+1)
-     >              - tmp1 * njac(3,5,k+1)
-
-               lhsc(4,1,i,j,k,c) =  tmp2 * fjac(4,1,k+1)
-     >              - tmp1 * njac(4,1,k+1)
-               lhsc(4,2,i,j,k,c) =  tmp2 * fjac(4,2,k+1)
-     >              - tmp1 * njac(4,2,k+1)
-               lhsc(4,3,i,j,k,c) =  tmp2 * fjac(4,3,k+1)
-     >              - tmp1 * njac(4,3,k+1)
-               lhsc(4,4,i,j,k,c) =  tmp2 * fjac(4,4,k+1)
-     >              - tmp1 * njac(4,4,k+1)
-     >              - tmp1 * dz4
-               lhsc(4,5,i,j,k,c) =  tmp2 * fjac(4,5,k+1)
-     >              - tmp1 * njac(4,5,k+1)
-
-               lhsc(5,1,i,j,k,c) =  tmp2 * fjac(5,1,k+1)
-     >              - tmp1 * njac(5,1,k+1)
-               lhsc(5,2,i,j,k,c) =  tmp2 * fjac(5,2,k+1)
-     >              - tmp1 * njac(5,2,k+1)
-               lhsc(5,3,i,j,k,c) =  tmp2 * fjac(5,3,k+1)
-     >              - tmp1 * njac(5,3,k+1)
-               lhsc(5,4,i,j,k,c) =  tmp2 * fjac(5,4,k+1)
-     >              - tmp1 * njac(5,4,k+1)
-               lhsc(5,5,i,j,k,c) =  tmp2 * fjac(5,5,k+1)
-     >              - tmp1 * njac(5,5,k+1)
-     >              - tmp1 * dz5
-
-            enddo
-
-
-c---------------------------------------------------------------------
-c     outer most do loops - sweeping in i direction
-c---------------------------------------------------------------------
-            if (first .eq. 1) then 
-
-c---------------------------------------------------------------------
-c     multiply c(i,j,kstart) by b_inverse and copy back to c
-c     multiply rhs(kstart) by b_inverse(kstart) and copy to rhs
-c---------------------------------------------------------------------
-               call binvcrhs( lhsb(1,1,kstart),
-     >                        lhsc(1,1,i,j,kstart,c),
-     >                        rhs(1,i,j,kstart,c) )
-
-            endif
-
-c---------------------------------------------------------------------
-c     begin inner most do loop
-c     do all the elements of the cell unless last 
-c---------------------------------------------------------------------
-            do k=kstart+first,ksize-last
-
-c---------------------------------------------------------------------
-c     subtract A*lhs_vector(k-1) from lhs_vector(k)
-c     
-c     rhs(k) = rhs(k) - A*rhs(k-1)
-c---------------------------------------------------------------------
-               call matvec_sub(lhsa(1,1,k),
-     >                         rhs(1,i,j,k-1,c),rhs(1,i,j,k,c))
-
-c---------------------------------------------------------------------
-c     B(k) = B(k) - C(k-1)*A(k)
-c     call matmul_sub(aa,i,j,k,c,cc,i,j,k-1,c,bb,i,j,k,c)
-c---------------------------------------------------------------------
-               call matmul_sub(lhsa(1,1,k),
-     >                         lhsc(1,1,i,j,k-1,c),
-     >                         lhsb(1,1,k))
-
-c---------------------------------------------------------------------
-c     multiply c(i,j,k) by b_inverse and copy back to c
-c     multiply rhs(i,j,1) by b_inverse(i,j,1) and copy to rhs
-c---------------------------------------------------------------------
-               call binvcrhs( lhsb(1,1,k),
-     >                        lhsc(1,1,i,j,k,c),
-     >                        rhs(1,i,j,k,c) )
-
-            enddo
-
-c---------------------------------------------------------------------
-c     Now finish up special cases for last cell
-c---------------------------------------------------------------------
-            if (last .eq. 1) then
-
-c---------------------------------------------------------------------
-c     rhs(ksize) = rhs(ksize) - A*rhs(ksize-1)
-c---------------------------------------------------------------------
-               call matvec_sub(lhsa(1,1,ksize),
-     >                         rhs(1,i,j,ksize-1,c),rhs(1,i,j,ksize,c))
-
-c---------------------------------------------------------------------
-c     B(ksize) = B(ksize) - C(ksize-1)*A(ksize)
-c     call matmul_sub(aa,i,j,ksize,c,
-c     $              cc,i,j,ksize-1,c,bb,i,j,ksize,c)
-c---------------------------------------------------------------------
-               call matmul_sub(lhsa(1,1,ksize),
-     >                         lhsc(1,1,i,j,ksize-1,c),
-     >                         lhsb(1,1,ksize))
-
-c---------------------------------------------------------------------
-c     multiply rhs(ksize) by b_inverse(ksize) and copy to rhs
-c---------------------------------------------------------------------
-               call binvrhs( lhsb(1,1,ksize),
-     >                       rhs(1,i,j,ksize,c) )
-
-            endif
-         enddo
-      enddo
-
-
-      return
-      end
-      
-
-
-
-
-
diff --git a/examples/smpi/NAS/BT/z_solve_vec.f b/examples/smpi/NAS/BT/z_solve_vec.f
deleted file mode 100644
index 2c27fb00d1..0000000000
--- a/examples/smpi/NAS/BT/z_solve_vec.f
+++ /dev/null
@@ -1,793 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine z_solve
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     Performs line solves in Z direction by first factoring
-c     the block-tridiagonal matrix into an upper triangular matrix, 
-c     and then performing back substitution to solve for the unknow
-c     vectors of each line.  
-c     
-c     Make sure we treat elements zero to cell_size in the direction
-c     of the sweep.
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer c, kstart, stage,
-     >     first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
-     >     isize,jsize,ksize,send_id
-
-      kstart = 0
-
-c---------------------------------------------------------------------
-c     in our terminology stage is the number of the cell in the y-direct
-c     i.e. stage = 1 means the start of the line stage=ncells means end
-c---------------------------------------------------------------------
-      do stage = 1,ncells
-         c = slice(3,stage)
-         isize = cell_size(1,c) - 1
-         jsize = cell_size(2,c) - 1
-         ksize = cell_size(3,c) - 1
-c---------------------------------------------------------------------
-c     set last-cell flag
-c---------------------------------------------------------------------
-         if (stage .eq. ncells) then
-            last = 1
-         else
-            last = 0
-         endif
-
-         if (stage .eq. 1) then
-c---------------------------------------------------------------------
-c     This is the first cell, so solve without receiving data
-c---------------------------------------------------------------------
-            first = 1
-c            call lhsz(c)
-            call z_solve_cell(first,last,c)
-         else
-c---------------------------------------------------------------------
-c     Not the first cell of this line, so receive info from
-c     processor working on preceeding cell
-c---------------------------------------------------------------------
-            first = 0
-            call z_receive_solve_info(recv_id,c)
-c---------------------------------------------------------------------
-c     overlap computations and communications
-c---------------------------------------------------------------------
-c            call lhsz(c)
-c---------------------------------------------------------------------
-c     wait for completion
-c---------------------------------------------------------------------
-            call mpi_wait(send_id,r_status,error)
-            call mpi_wait(recv_id,r_status,error)
-c---------------------------------------------------------------------
-c     install C'(kstart+1) and rhs'(kstart+1) to be used in this cell
-c---------------------------------------------------------------------
-            call z_unpack_solve_info(c)
-            call z_solve_cell(first,last,c)
-         endif
-
-         if (last .eq. 0) call z_send_solve_info(send_id,c)
-      enddo
-
-c---------------------------------------------------------------------
-c     now perform backsubstitution in reverse direction
-c---------------------------------------------------------------------
-      do stage = ncells, 1, -1
-         c = slice(3,stage)
-         first = 0
-         last = 0
-         if (stage .eq. 1) first = 1
-         if (stage .eq. ncells) then
-            last = 1
-c---------------------------------------------------------------------
-c     last cell, so perform back substitute without waiting
-c---------------------------------------------------------------------
-            call z_backsubstitute(first, last,c)
-         else
-            call z_receive_backsub_info(recv_id,c)
-            call mpi_wait(send_id,r_status,error)
-            call mpi_wait(recv_id,r_status,error)
-            call z_unpack_backsub_info(c)
-            call z_backsubstitute(first,last,c)
-         endif
-         if (first .eq. 0) call z_send_backsub_info(send_id,c)
-      enddo
-
-
-      return
-      end
-      
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      
-      subroutine z_unpack_solve_info(c)
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     unpack C'(-1) and rhs'(-1) for
-c     all i and j
-c---------------------------------------------------------------------
-
-      include 'header.h'
-
-      integer i,j,m,n,ptr,c,kstart 
-
-      kstart = 0
-      ptr = 0
-      do j=0,JMAX-1
-         do i=0,IMAX-1
-            do m=1,BLOCK_SIZE
-               do n=1,BLOCK_SIZE
-                  lhsc(m,n,i,j,kstart-1,c) = out_buffer(ptr+n)
-               enddo
-               ptr = ptr+BLOCK_SIZE
-            enddo
-            do n=1,BLOCK_SIZE
-               rhs(n,i,j,kstart-1,c) = out_buffer(ptr+n)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      
-      subroutine z_send_solve_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     pack up and send C'(kend) and rhs'(kend) for
-c     all i and j
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer i,j,m,n,ksize,ptr,c,ip,jp
-      integer error,send_id,buffer_size
-
-      ksize = cell_size(3,c)-1
-      ip = cell_coord(1,c) - 1
-      jp = cell_coord(2,c) - 1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
-     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
-
-c---------------------------------------------------------------------
-c     pack up buffer
-c---------------------------------------------------------------------
-      ptr = 0
-      do j=0,JMAX-1
-         do i=0,IMAX-1
-            do m=1,BLOCK_SIZE
-               do n=1,BLOCK_SIZE
-                  in_buffer(ptr+n) = lhsc(m,n,i,j,ksize,c)
-               enddo
-               ptr = ptr+BLOCK_SIZE
-            enddo
-            do n=1,BLOCK_SIZE
-               in_buffer(ptr+n) = rhs(n,i,j,ksize,c)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-c---------------------------------------------------------------------
-c     send buffer 
-c---------------------------------------------------------------------
-      call mpi_isend(in_buffer, buffer_size,
-     >     dp_type, successor(3),
-     >     BOTTOM+ip+jp*NCELLS, comm_solve,
-     >     send_id,error)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine z_send_backsub_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     pack up and send U(jstart) for all i and j
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer i,j,n,ptr,c,kstart,ip,jp
-      integer error,send_id,buffer_size
-
-c---------------------------------------------------------------------
-c     Send element 0 to previous processor
-c---------------------------------------------------------------------
-      kstart = 0
-      ip = cell_coord(1,c)-1
-      jp = cell_coord(2,c)-1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
-      ptr = 0
-      do j=0,JMAX-1
-         do i=0,IMAX-1
-            do n=1,BLOCK_SIZE
-               in_buffer(ptr+n) = rhs(n,i,j,kstart,c)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-      call mpi_isend(in_buffer, buffer_size,
-     >     dp_type, predecessor(3), 
-     >     TOP+ip+jp*NCELLS, comm_solve, 
-     >     send_id,error)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine z_unpack_backsub_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     unpack U(ksize) for all i and j
-c---------------------------------------------------------------------
-
-      include 'header.h'
-
-      integer i,j,n,ptr,c
-
-      ptr = 0
-      do j=0,JMAX-1
-         do i=0,IMAX-1
-            do n=1,BLOCK_SIZE
-               backsub_info(n,i,j,c) = out_buffer(ptr+n)
-            enddo
-            ptr = ptr+BLOCK_SIZE
-         enddo
-      enddo
-
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine z_receive_backsub_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     post mpi receives
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer error,recv_id,ip,jp,c,buffer_size
-      ip = cell_coord(1,c) - 1
-      jp = cell_coord(2,c) - 1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
-      call mpi_irecv(out_buffer, buffer_size,
-     >     dp_type, successor(3), 
-     >     TOP+ip+jp*NCELLS, comm_solve, 
-     >     recv_id, error)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine z_receive_solve_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     post mpi receives 
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'mpinpb.h'
-
-      integer ip,jp,recv_id,error,c,buffer_size
-      ip = cell_coord(1,c) - 1
-      jp = cell_coord(2,c) - 1
-      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
-     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
-      call mpi_irecv(out_buffer, buffer_size,
-     >     dp_type, predecessor(3), 
-     >     BOTTOM+ip+jp*NCELLS, comm_solve,
-     >     recv_id, error)
-
-      return
-      end
-      
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine z_backsubstitute(first, last, c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     back solve: if last cell, then generate U(ksize)=rhs(ksize)
-c     else assume U(ksize) is loaded in un pack backsub_info
-c     so just use it
-c     after call u(kstart) will be sent to next cell
-c---------------------------------------------------------------------
-
-      include 'header.h'
-
-      integer first, last, c, i, k
-      integer m,n,j,jsize,isize,ksize,kstart
-      
-      kstart = 0
-      isize = cell_size(1,c)-end(1,c)-1      
-      jsize = cell_size(2,c)-end(2,c)-1
-      ksize = cell_size(3,c)-1
-      if (last .eq. 0) then
-         do j=start(2,c),jsize
-            do i=start(1,c),isize
-c---------------------------------------------------------------------
-c     U(jsize) uses info from previous cell if not last cell
-c---------------------------------------------------------------------
-               do m=1,BLOCK_SIZE
-                  do n=1,BLOCK_SIZE
-                     rhs(m,i,j,ksize,c) = rhs(m,i,j,ksize,c) 
-     >                    - lhsc(m,n,i,j,ksize,c)*
-     >                    backsub_info(n,i,j,c)
-                  enddo
-               enddo
-            enddo
-         enddo
-      endif
-      do k=ksize-1,kstart,-1
-         do j=start(2,c),jsize
-            do i=start(1,c),isize
-               do m=1,BLOCK_SIZE
-                  do n=1,BLOCK_SIZE
-                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) 
-     >                    - lhsc(m,n,i,j,k,c)*rhs(n,i,j,k+1,c)
-                  enddo
-               enddo
-            enddo
-         enddo
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine z_solve_cell(first,last,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     performs guaussian elimination on this cell.
-c     
-c     assumes that unpacking routines for non-first cells 
-c     preload C' and rhs' from previous cell.
-c     
-c     assumed send happens outside this routine, but that
-c     c'(KMAX) and rhs'(KMAX) will be sent to next cell.
-c---------------------------------------------------------------------
-
-      include 'header.h'
-      include 'work_lhs_vec.h'
-
-      integer first,last,c
-      integer i,j,k,m,n,isize,ksize,jsize,kstart
-
-      kstart = 0
-      isize = cell_size(1,c)-end(1,c)-1
-      jsize = cell_size(2,c)-end(2,c)-1
-      ksize = cell_size(3,c)-1
-
-c---------------------------------------------------------------------
-c     zero the left hand side for starters
-c     set diagonal values to 1. This is overkill, but convenient
-c---------------------------------------------------------------------
-      do i = 0, isize
-         do m = 1, 5
-            do n = 1, 5
-               lhsa(m,n,i,0) = 0.0d0
-               lhsb(m,n,i,0) = 0.0d0
-               lhsa(m,n,i,ksize) = 0.0d0
-               lhsb(m,n,i,ksize) = 0.0d0
-            enddo
-            lhsb(m,m,i,0) = 1.0d0
-            lhsb(m,m,i,ksize) = 1.0d0
-         enddo
-      enddo
-
-      do j=start(2,c),jsize 
-
-c---------------------------------------------------------------------
-c     This function computes the left hand side for the three z-factors 
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     Compute the indices for storing the block-diagonal matrix;
-c     determine c (labeled f) and s jacobians for cell c
-c---------------------------------------------------------------------
-
-         do k = start(3,c)-1, cell_size(3,c)-end(3,c)
-            do i=start(1,c),isize
-
-               tmp1 = 1.0d0 / u(1,i,j,k,c)
-               tmp2 = tmp1 * tmp1
-               tmp3 = tmp1 * tmp2
-
-               fjac(1,1,i,k) = 0.0d+00
-               fjac(1,2,i,k) = 0.0d+00
-               fjac(1,3,i,k) = 0.0d+00
-               fjac(1,4,i,k) = 1.0d+00
-               fjac(1,5,i,k) = 0.0d+00
-
-               fjac(2,1,i,k) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) 
-     >              * tmp2 
-               fjac(2,2,i,k) = u(4,i,j,k,c) * tmp1
-               fjac(2,3,i,k) = 0.0d+00
-               fjac(2,4,i,k) = u(2,i,j,k,c) * tmp1
-               fjac(2,5,i,k) = 0.0d+00
-
-               fjac(3,1,i,k) = - ( u(3,i,j,k,c)*u(4,i,j,k,c) )
-     >              * tmp2 
-               fjac(3,2,i,k) = 0.0d+00
-               fjac(3,3,i,k) = u(4,i,j,k,c) * tmp1
-               fjac(3,4,i,k) = u(3,i,j,k,c) * tmp1
-               fjac(3,5,i,k) = 0.0d+00
-
-               fjac(4,1,i,k) = - (u(4,i,j,k,c)*u(4,i,j,k,c) * tmp2 ) 
-     >              + c2 * qs(i,j,k,c)
-               fjac(4,2,i,k) = - c2 *  u(2,i,j,k,c) * tmp1 
-               fjac(4,3,i,k) = - c2 *  u(3,i,j,k,c) * tmp1
-               fjac(4,4,i,k) = ( 2.0d+00 - c2 )
-     >              *  u(4,i,j,k,c) * tmp1 
-               fjac(4,5,i,k) = c2
-
-               fjac(5,1,i,k) = ( c2 * 2.0d0 * qs(i,j,k,c)
-     >              - c1 * ( u(5,i,j,k,c) * tmp1 ) )
-     >              * ( u(4,i,j,k,c) * tmp1 )
-               fjac(5,2,i,k) = - c2 * ( u(2,i,j,k,c)*u(4,i,j,k,c) )
-     >              * tmp2 
-               fjac(5,3,i,k) = - c2 * ( u(3,i,j,k,c)*u(4,i,j,k,c) )
-     >              * tmp2
-               fjac(5,4,i,k) = c1 * ( u(5,i,j,k,c) * tmp1 )
-     >              - c2 * ( qs(i,j,k,c)
-     >              + u(4,i,j,k,c)*u(4,i,j,k,c) * tmp2 )
-               fjac(5,5,i,k) = c1 * u(4,i,j,k,c) * tmp1
-
-               njac(1,1,i,k) = 0.0d+00
-               njac(1,2,i,k) = 0.0d+00
-               njac(1,3,i,k) = 0.0d+00
-               njac(1,4,i,k) = 0.0d+00
-               njac(1,5,i,k) = 0.0d+00
-
-               njac(2,1,i,k) = - c3c4 * tmp2 * u(2,i,j,k,c)
-               njac(2,2,i,k) =   c3c4 * tmp1
-               njac(2,3,i,k) =   0.0d+00
-               njac(2,4,i,k) =   0.0d+00
-               njac(2,5,i,k) =   0.0d+00
-
-               njac(3,1,i,k) = - c3c4 * tmp2 * u(3,i,j,k,c)
-               njac(3,2,i,k) =   0.0d+00
-               njac(3,3,i,k) =   c3c4 * tmp1
-               njac(3,4,i,k) =   0.0d+00
-               njac(3,5,i,k) =   0.0d+00
-
-               njac(4,1,i,k) = - con43 * c3c4 * tmp2 * u(4,i,j,k,c)
-               njac(4,2,i,k) =   0.0d+00
-               njac(4,3,i,k) =   0.0d+00
-               njac(4,4,i,k) =   con43 * c3 * c4 * tmp1
-               njac(4,5,i,k) =   0.0d+00
-
-               njac(5,1,i,k) = - (  c3c4
-     >              - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
-     >              - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
-     >              - ( con43 * c3c4
-     >              - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
-     >              - c1345 * tmp2 * u(5,i,j,k,c)
-
-               njac(5,2,i,k) = (  c3c4 - c1345 ) * tmp2 * u(2,i,j,k,c)
-               njac(5,3,i,k) = (  c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c)
-               njac(5,4,i,k) = ( con43 * c3c4
-     >              - c1345 ) * tmp2 * u(4,i,j,k,c)
-               njac(5,5,i,k) = ( c1345 )* tmp1
-
-
-            enddo
-         enddo
-
-c---------------------------------------------------------------------
-c     now joacobians set, so form left hand side in z direction
-c---------------------------------------------------------------------
-         do k = start(3,c), ksize-end(3,c)
-            do i=start(1,c),isize
-
-               tmp1 = dt * tz1
-               tmp2 = dt * tz2
-
-               lhsa(1,1,i,k) = - tmp2 * fjac(1,1,i,k-1)
-     >              - tmp1 * njac(1,1,i,k-1)
-     >              - tmp1 * dz1 
-               lhsa(1,2,i,k) = - tmp2 * fjac(1,2,i,k-1)
-     >              - tmp1 * njac(1,2,i,k-1)
-               lhsa(1,3,i,k) = - tmp2 * fjac(1,3,i,k-1)
-     >              - tmp1 * njac(1,3,i,k-1)
-               lhsa(1,4,i,k) = - tmp2 * fjac(1,4,i,k-1)
-     >              - tmp1 * njac(1,4,i,k-1)
-               lhsa(1,5,i,k) = - tmp2 * fjac(1,5,i,k-1)
-     >              - tmp1 * njac(1,5,i,k-1)
-
-               lhsa(2,1,i,k) = - tmp2 * fjac(2,1,i,k-1)
-     >              - tmp1 * njac(2,1,i,k-1)
-               lhsa(2,2,i,k) = - tmp2 * fjac(2,2,i,k-1)
-     >              - tmp1 * njac(2,2,i,k-1)
-     >              - tmp1 * dz2
-               lhsa(2,3,i,k) = - tmp2 * fjac(2,3,i,k-1)
-     >              - tmp1 * njac(2,3,i,k-1)
-               lhsa(2,4,i,k) = - tmp2 * fjac(2,4,i,k-1)
-     >              - tmp1 * njac(2,4,i,k-1)
-               lhsa(2,5,i,k) = - tmp2 * fjac(2,5,i,k-1)
-     >              - tmp1 * njac(2,5,i,k-1)
-
-               lhsa(3,1,i,k) = - tmp2 * fjac(3,1,i,k-1)
-     >              - tmp1 * njac(3,1,i,k-1)
-               lhsa(3,2,i,k) = - tmp2 * fjac(3,2,i,k-1)
-     >              - tmp1 * njac(3,2,i,k-1)
-               lhsa(3,3,i,k) = - tmp2 * fjac(3,3,i,k-1)
-     >              - tmp1 * njac(3,3,i,k-1)
-     >              - tmp1 * dz3 
-               lhsa(3,4,i,k) = - tmp2 * fjac(3,4,i,k-1)
-     >              - tmp1 * njac(3,4,i,k-1)
-               lhsa(3,5,i,k) = - tmp2 * fjac(3,5,i,k-1)
-     >              - tmp1 * njac(3,5,i,k-1)
-
-               lhsa(4,1,i,k) = - tmp2 * fjac(4,1,i,k-1)
-     >              - tmp1 * njac(4,1,i,k-1)
-               lhsa(4,2,i,k) = - tmp2 * fjac(4,2,i,k-1)
-     >              - tmp1 * njac(4,2,i,k-1)
-               lhsa(4,3,i,k) = - tmp2 * fjac(4,3,i,k-1)
-     >              - tmp1 * njac(4,3,i,k-1)
-               lhsa(4,4,i,k) = - tmp2 * fjac(4,4,i,k-1)
-     >              - tmp1 * njac(4,4,i,k-1)
-     >              - tmp1 * dz4
-               lhsa(4,5,i,k) = - tmp2 * fjac(4,5,i,k-1)
-     >              - tmp1 * njac(4,5,i,k-1)
-
-               lhsa(5,1,i,k) = - tmp2 * fjac(5,1,i,k-1)
-     >              - tmp1 * njac(5,1,i,k-1)
-               lhsa(5,2,i,k) = - tmp2 * fjac(5,2,i,k-1)
-     >              - tmp1 * njac(5,2,i,k-1)
-               lhsa(5,3,i,k) = - tmp2 * fjac(5,3,i,k-1)
-     >              - tmp1 * njac(5,3,i,k-1)
-               lhsa(5,4,i,k) = - tmp2 * fjac(5,4,i,k-1)
-     >              - tmp1 * njac(5,4,i,k-1)
-               lhsa(5,5,i,k) = - tmp2 * fjac(5,5,i,k-1)
-     >              - tmp1 * njac(5,5,i,k-1)
-     >              - tmp1 * dz5
-
-               lhsb(1,1,i,k) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(1,1,i,k)
-     >              + tmp1 * 2.0d+00 * dz1
-               lhsb(1,2,i,k) = tmp1 * 2.0d+00 * njac(1,2,i,k)
-               lhsb(1,3,i,k) = tmp1 * 2.0d+00 * njac(1,3,i,k)
-               lhsb(1,4,i,k) = tmp1 * 2.0d+00 * njac(1,4,i,k)
-               lhsb(1,5,i,k) = tmp1 * 2.0d+00 * njac(1,5,i,k)
-
-               lhsb(2,1,i,k) = tmp1 * 2.0d+00 * njac(2,1,i,k)
-               lhsb(2,2,i,k) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(2,2,i,k)
-     >              + tmp1 * 2.0d+00 * dz2
-               lhsb(2,3,i,k) = tmp1 * 2.0d+00 * njac(2,3,i,k)
-               lhsb(2,4,i,k) = tmp1 * 2.0d+00 * njac(2,4,i,k)
-               lhsb(2,5,i,k) = tmp1 * 2.0d+00 * njac(2,5,i,k)
-
-               lhsb(3,1,i,k) = tmp1 * 2.0d+00 * njac(3,1,i,k)
-               lhsb(3,2,i,k) = tmp1 * 2.0d+00 * njac(3,2,i,k)
-               lhsb(3,3,i,k) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(3,3,i,k)
-     >              + tmp1 * 2.0d+00 * dz3
-               lhsb(3,4,i,k) = tmp1 * 2.0d+00 * njac(3,4,i,k)
-               lhsb(3,5,i,k) = tmp1 * 2.0d+00 * njac(3,5,i,k)
-
-               lhsb(4,1,i,k) = tmp1 * 2.0d+00 * njac(4,1,i,k)
-               lhsb(4,2,i,k) = tmp1 * 2.0d+00 * njac(4,2,i,k)
-               lhsb(4,3,i,k) = tmp1 * 2.0d+00 * njac(4,3,i,k)
-               lhsb(4,4,i,k) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(4,4,i,k)
-     >              + tmp1 * 2.0d+00 * dz4
-               lhsb(4,5,i,k) = tmp1 * 2.0d+00 * njac(4,5,i,k)
-
-               lhsb(5,1,i,k) = tmp1 * 2.0d+00 * njac(5,1,i,k)
-               lhsb(5,2,i,k) = tmp1 * 2.0d+00 * njac(5,2,i,k)
-               lhsb(5,3,i,k) = tmp1 * 2.0d+00 * njac(5,3,i,k)
-               lhsb(5,4,i,k) = tmp1 * 2.0d+00 * njac(5,4,i,k)
-               lhsb(5,5,i,k) = 1.0d+00
-     >              + tmp1 * 2.0d+00 * njac(5,5,i,k) 
-     >              + tmp1 * 2.0d+00 * dz5
-
-               lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,i,k+1)
-     >              - tmp1 * njac(1,1,i,k+1)
-     >              - tmp1 * dz1
-               lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,i,k+1)
-     >              - tmp1 * njac(1,2,i,k+1)
-               lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,i,k+1)
-     >              - tmp1 * njac(1,3,i,k+1)
-               lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,i,k+1)
-     >              - tmp1 * njac(1,4,i,k+1)
-               lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,i,k+1)
-     >              - tmp1 * njac(1,5,i,k+1)
-
-               lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,i,k+1)
-     >              - tmp1 * njac(2,1,i,k+1)
-               lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,i,k+1)
-     >              - tmp1 * njac(2,2,i,k+1)
-     >              - tmp1 * dz2
-               lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,i,k+1)
-     >              - tmp1 * njac(2,3,i,k+1)
-               lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,i,k+1)
-     >              - tmp1 * njac(2,4,i,k+1)
-               lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,i,k+1)
-     >              - tmp1 * njac(2,5,i,k+1)
-
-               lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,i,k+1)
-     >              - tmp1 * njac(3,1,i,k+1)
-               lhsc(3,2,i,j,k,c) =  tmp2 * fjac(3,2,i,k+1)
-     >              - tmp1 * njac(3,2,i,k+1)
-               lhsc(3,3,i,j,k,c) =  tmp2 * fjac(3,3,i,k+1)
-     >              - tmp1 * njac(3,3,i,k+1)
-     >              - tmp1 * dz3
-               lhsc(3,4,i,j,k,c) =  tmp2 * fjac(3,4,i,k+1)
-     >              - tmp1 * njac(3,4,i,k+1)
-               lhsc(3,5,i,j,k,c) =  tmp2 * fjac(3,5,i,k+1)
-     >              - tmp1 * njac(3,5,i,k+1)
-
-               lhsc(4,1,i,j,k,c) =  tmp2 * fjac(4,1,i,k+1)
-     >              - tmp1 * njac(4,1,i,k+1)
-               lhsc(4,2,i,j,k,c) =  tmp2 * fjac(4,2,i,k+1)
-     >              - tmp1 * njac(4,2,i,k+1)
-               lhsc(4,3,i,j,k,c) =  tmp2 * fjac(4,3,i,k+1)
-     >              - tmp1 * njac(4,3,i,k+1)
-               lhsc(4,4,i,j,k,c) =  tmp2 * fjac(4,4,i,k+1)
-     >              - tmp1 * njac(4,4,i,k+1)
-     >              - tmp1 * dz4
-               lhsc(4,5,i,j,k,c) =  tmp2 * fjac(4,5,i,k+1)
-     >              - tmp1 * njac(4,5,i,k+1)
-
-               lhsc(5,1,i,j,k,c) =  tmp2 * fjac(5,1,i,k+1)
-     >              - tmp1 * njac(5,1,i,k+1)
-               lhsc(5,2,i,j,k,c) =  tmp2 * fjac(5,2,i,k+1)
-     >              - tmp1 * njac(5,2,i,k+1)
-               lhsc(5,3,i,j,k,c) =  tmp2 * fjac(5,3,i,k+1)
-     >              - tmp1 * njac(5,3,i,k+1)
-               lhsc(5,4,i,j,k,c) =  tmp2 * fjac(5,4,i,k+1)
-     >              - tmp1 * njac(5,4,i,k+1)
-               lhsc(5,5,i,j,k,c) =  tmp2 * fjac(5,5,i,k+1)
-     >              - tmp1 * njac(5,5,i,k+1)
-     >              - tmp1 * dz5
-
-            enddo
-         enddo
-
-
-c---------------------------------------------------------------------
-c     outer most do loops - sweeping in i direction
-c---------------------------------------------------------------------
-         if (first .eq. 1) then 
-
-c---------------------------------------------------------------------
-c     multiply c(i,j,kstart) by b_inverse and copy back to c
-c     multiply rhs(kstart) by b_inverse(kstart) and copy to rhs
-c---------------------------------------------------------------------
-!dir$ ivdep
-            do i=start(1,c),isize
-               call binvcrhs( lhsb(1,1,i,kstart),
-     >                        lhsc(1,1,i,j,kstart,c),
-     >                        rhs(1,i,j,kstart,c) )
-            enddo
-
-         endif
-
-c---------------------------------------------------------------------
-c     begin inner most do loop
-c     do all the elements of the cell unless last 
-c---------------------------------------------------------------------
-         do k=kstart+first,ksize-last
-!dir$ ivdep
-            do i=start(1,c),isize
-
-c---------------------------------------------------------------------
-c     subtract A*lhs_vector(k-1) from lhs_vector(k)
-c     
-c     rhs(k) = rhs(k) - A*rhs(k-1)
-c---------------------------------------------------------------------
-               call matvec_sub(lhsa(1,1,i,k),
-     >                         rhs(1,i,j,k-1,c),rhs(1,i,j,k,c))
-
-c---------------------------------------------------------------------
-c     B(k) = B(k) - C(k-1)*A(k)
-c     call matmul_sub(aa,i,j,k,c,cc,i,j,k-1,c,bb,i,j,k,c)
-c---------------------------------------------------------------------
-               call matmul_sub(lhsa(1,1,i,k),
-     >                         lhsc(1,1,i,j,k-1,c),
-     >                         lhsb(1,1,i,k))
-
-c---------------------------------------------------------------------
-c     multiply c(i,j,k) by b_inverse and copy back to c
-c     multiply rhs(i,j,1) by b_inverse(i,j,1) and copy to rhs
-c---------------------------------------------------------------------
-               call binvcrhs( lhsb(1,1,i,k),
-     >                        lhsc(1,1,i,j,k,c),
-     >                        rhs(1,i,j,k,c) )
-
-            enddo
-         enddo
-
-c---------------------------------------------------------------------
-c     Now finish up special cases for last cell
-c---------------------------------------------------------------------
-         if (last .eq. 1) then
-
-!dir$ ivdep
-            do i=start(1,c),isize
-c---------------------------------------------------------------------
-c     rhs(ksize) = rhs(ksize) - A*rhs(ksize-1)
-c---------------------------------------------------------------------
-               call matvec_sub(lhsa(1,1,i,ksize),
-     >                         rhs(1,i,j,ksize-1,c),rhs(1,i,j,ksize,c))
-
-c---------------------------------------------------------------------
-c     B(ksize) = B(ksize) - C(ksize-1)*A(ksize)
-c     call matmul_sub(aa,i,j,ksize,c,
-c     $              cc,i,j,ksize-1,c,bb,i,j,ksize,c)
-c---------------------------------------------------------------------
-               call matmul_sub(lhsa(1,1,i,ksize),
-     >                         lhsc(1,1,i,j,ksize-1,c),
-     >                         lhsb(1,1,i,ksize))
-
-c---------------------------------------------------------------------
-c     multiply rhs(ksize) by b_inverse(ksize) and copy to rhs
-c---------------------------------------------------------------------
-               call binvrhs( lhsb(1,1,i,ksize),
-     >                       rhs(1,i,j,ksize,c) )
-            enddo
-
-         endif
-      enddo
-
-
-      return
-      end
-      
-
-
-
-
-
diff --git a/examples/smpi/NAS/CG/Makefile b/examples/smpi/NAS/CG/Makefile
deleted file mode 100644
index 33e52c697b..0000000000
--- a/examples/smpi/NAS/CG/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-SHELL=/bin/sh
-BENCHMARK=cg
-BENCHMARKU=CG
-
-include ../config/make.def
-
-OBJS = cg.o ${COMMON}/print_results.o  \
-       ${COMMON}/${RAND}.o ${COMMON}/timers.o
-
-include ../sys/make.common
-
-${PROGRAM}: config ${OBJS}
-	${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
-
-cg.o:		cg.f  mpinpb.h npbparams.h
-	${FCOMPILE} cg.f
-
-clean:
-	- rm -f *.o *~ 
-	- rm -f npbparams.h core
-
-
-
diff --git a/examples/smpi/NAS/CG/cg.f b/examples/smpi/NAS/CG/cg.f
deleted file mode 100644
index 0d425d78a4..0000000000
--- a/examples/smpi/NAS/CG/cg.f
+++ /dev/null
@@ -1,1787 +0,0 @@
-!-------------------------------------------------------------------------!
-!                                                                         !
-!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
-!                                                                         !
-!                                   C G                                   !
-!                                                                         !
-!-------------------------------------------------------------------------!
-!                                                                         !
-!    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
-!    It is described in NAS Technical Reports 95-020 and 02-007           !
-!                                                                         !
-!    Permission to use, copy, distribute and modify this software         !
-!    for any purpose with or without fee is hereby granted.  We           !
-!    request, however, that all derived work reference the NAS            !
-!    Parallel Benchmarks 3.3. This software is provided "as is"           !
-!    without express or implied warranty.                                 !
-!                                                                         !
-!    Information on NPB 3.3, including the technical report, the          !
-!    original specifications, source code, results and information        !
-!    on how to submit new results, is available at:                       !
-!                                                                         !
-!           http://www.nas.nasa.gov/Software/NPB/                         !
-!                                                                         !
-!    Send comments or suggestions to  npb@nas.nasa.gov                    !
-!                                                                         !
-!          NAS Parallel Benchmarks Group                                  !
-!          NASA Ames Research Center                                      !
-!          Mail Stop: T27A-1                                              !
-!          Moffett Field, CA   94035-1000                                 !
-!                                                                         !
-!          E-mail:  npb@nas.nasa.gov                                      !
-!          Fax:     (650) 604-3957                                        !
-!                                                                         !
-!-------------------------------------------------------------------------!
-
-
-c---------------------------------------------------------------------
-c
-c Authors: M. Yarrow
-c          C. Kuszmaul
-c          R. F. Van der Wijngaart
-c          H. Jin
-c
-c---------------------------------------------------------------------
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      program cg
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-
-      implicit none
-
-      include 'mpinpb.h'
-      integer status(MPI_STATUS_SIZE), request, ierr
-
-      include 'npbparams.h'
-
-c---------------------------------------------------------------------
-c  num_procs must be a power of 2, and num_procs=num_proc_cols*num_proc_rows.
-c  num_proc_cols and num_proc_cols are to be found in npbparams.h.
-c  When num_procs is not square, then num_proc_cols must be = 2*num_proc_rows.
-c---------------------------------------------------------------------
-      integer    num_procs 
-      parameter( num_procs = num_proc_cols * num_proc_rows )
-
-
-
-c---------------------------------------------------------------------
-c  Class specific parameters: 
-c  It appears here for reference only.
-c  These are their values, however, this info is imported in the npbparams.h
-c  include file, which is written by the sys/setparams.c program.
-c---------------------------------------------------------------------
-
-C----------
-C  Class S:
-C----------
-CC       parameter( na=1400, 
-CC      >           nonzer=7, 
-CC      >           shift=10., 
-CC      >           niter=15,
-CC      >           rcond=1.0d-1 )
-C----------
-C  Class W:
-C----------
-CC       parameter( na=7000,
-CC      >           nonzer=8, 
-CC      >           shift=12., 
-CC      >           niter=15,
-CC      >           rcond=1.0d-1 )
-C----------
-C  Class A:
-C----------
-CC       parameter( na=14000,
-CC      >           nonzer=11, 
-CC      >           shift=20., 
-CC      >           niter=15,
-CC      >           rcond=1.0d-1 )
-C----------
-C  Class B:
-C----------
-CC       parameter( na=75000, 
-CC      >           nonzer=13, 
-CC      >           shift=60., 
-CC      >           niter=75,
-CC      >           rcond=1.0d-1 )
-C----------
-C  Class C:
-C----------
-CC       parameter( na=150000, 
-CC      >           nonzer=15, 
-CC      >           shift=110., 
-CC      >           niter=75,
-CC      >           rcond=1.0d-1 )
-C----------
-C  Class D:
-C----------
-CC       parameter( na=1500000, 
-CC      >           nonzer=21, 
-CC      >           shift=500., 
-CC      >           niter=100,
-CC      >           rcond=1.0d-1 )
-C----------
-C  Class E:
-C----------
-CC       parameter( na=9000000, 
-CC      >           nonzer=26, 
-CC      >           shift=1500., 
-CC      >           niter=100,
-CC      >           rcond=1.0d-1 )
-
-
-
-      integer    nz
-      parameter( nz = na*(nonzer+1)/num_procs*(nonzer+1)+nonzer
-     >              + na*(nonzer+2+num_procs/256)/num_proc_cols )
-
-
-
-      common / partit_size  /  naa, nzz, 
-     >                         npcols, nprows,
-     >                         proc_col, proc_row,
-     >                         firstrow, 
-     >                         lastrow, 
-     >                         firstcol, 
-     >                         lastcol,
-     >                         exch_proc,
-     >                         exch_recv_length,
-     >                         send_start,
-     >                         send_len
-      integer                  naa, nzz, 
-     >                         npcols, nprows,
-     >                         proc_col, proc_row,
-     >                         firstrow, 
-     >                         lastrow, 
-     >                         firstcol, 
-     >                         lastcol,
-     >                         exch_proc,
-     >                         exch_recv_length,
-     >                         send_start,
-     >                         send_len
-
-
-      common / main_int_mem /  colidx,     rowstr,
-     >                         iv,         arow,     acol
-      integer                  colidx(nz), rowstr(na+1),
-     >                         iv(2*na+1), arow(nz), acol(nz)
-
-
-      common / main_flt_mem /  v,       aelt,     a,
-     >                         x,
-     >                         z,
-     >                         p,
-     >                         q,
-     >                         r,
-     >                         w
-      double precision         v(na+1), aelt(nz), a(nz),
-     >                         x(na/num_proc_rows+2),
-     >                         z(na/num_proc_rows+2),
-     >                         p(na/num_proc_rows+2),
-     >                         q(na/num_proc_rows+2),
-     >                         r(na/num_proc_rows+2),
-     >                         w(na/num_proc_rows+2)
-
-
-      common /urando/          amult, tran
-      double precision         amult, tran
-
-
-
-      integer            l2npcols
-      integer            reduce_exch_proc(num_proc_cols)
-      integer            reduce_send_starts(num_proc_cols)
-      integer            reduce_send_lengths(num_proc_cols)
-      integer            reduce_recv_starts(num_proc_cols)
-      integer            reduce_recv_lengths(num_proc_cols)
-
-      integer            i, j, k, it
-
-      double precision   zeta, randlc
-      external           randlc
-      double precision   rnorm
-      double precision   norm_temp1(2), norm_temp2(2)
-
-      double precision   t, tmax, mflops
-      external           timer_read
-      double precision   timer_read
-      character          class
-      logical            verified
-      double precision   zeta_verify_value, epsilon, err
-
-
-c---------------------------------------------------------------------
-c  Set up mpi initialization and number of proc testing
-c---------------------------------------------------------------------
-      call initialize_mpi
-
-
-      if( na .eq. 1400 .and. 
-     &    nonzer .eq. 7 .and. 
-     &    niter .eq. 15 .and.
-     &    shift .eq. 10.d0 ) then
-         class = 'S'
-         zeta_verify_value = 8.5971775078648d0
-      else if( na .eq. 7000 .and. 
-     &         nonzer .eq. 8 .and. 
-     &         niter .eq. 15 .and.
-     &         shift .eq. 12.d0 ) then
-         class = 'W'
-         zeta_verify_value = 10.362595087124d0
-      else if( na .eq. 14000 .and. 
-     &         nonzer .eq. 11 .and. 
-     &         niter .eq. 15 .and.
-     &         shift .eq. 20.d0 ) then
-         class = 'A'
-         zeta_verify_value = 17.130235054029d0
-      else if( na .eq. 75000 .and. 
-     &         nonzer .eq. 13 .and. 
-     &         niter .eq. 75 .and.
-     &         shift .eq. 60.d0 ) then
-         class = 'B'
-         zeta_verify_value = 22.712745482631d0
-      else if( na .eq. 150000 .and. 
-     &         nonzer .eq. 15 .and. 
-     &         niter .eq. 75 .and.
-     &         shift .eq. 110.d0 ) then
-         class = 'C'
-         zeta_verify_value = 28.973605592845d0
-      else if( na .eq. 1500000 .and. 
-     &         nonzer .eq. 21 .and. 
-     &         niter .eq. 100 .and.
-     &         shift .eq. 500.d0 ) then
-         class = 'D'
-         zeta_verify_value = 52.514532105794d0
-      else if( na .eq. 9000000 .and. 
-     &         nonzer .eq. 26 .and. 
-     &         niter .eq. 100 .and.
-     &         shift .eq. 1.5d3 ) then
-         class = 'E'
-         zeta_verify_value = 77.522164599383d0
-      else
-         class = 'U'
-      endif
-
-      if( me .eq. root )then
-         write( *,1000 ) 
-         write( *,1001 ) na
-         write( *,1002 ) niter
-         write( *,1003 ) nprocs
-         write( *,1004 ) nonzer
-         write( *,1005 ) shift
- 1000 format(//,' NAS Parallel Benchmarks 3.3 -- CG Benchmark', /)
- 1001 format(' Size: ', i10 )
- 1002 format(' Iterations: ', i5 )
- 1003 format(' Number of active processes: ', i5 )
- 1004 format(' Number of nonzeroes per row: ', i8)
- 1005 format(' Eigenvalue shift: ', e8.3)
-      endif
-
-      if (.not. convertdouble) then
-         dp_type = MPI_DOUBLE_PRECISION
-      else
-         dp_type = MPI_REAL
-      endif
-
-
-      naa = na
-      nzz = nz
-
-
-c---------------------------------------------------------------------
-c  Set up processor info, such as whether sq num of procs, etc
-c---------------------------------------------------------------------
-      call setup_proc_info( num_procs, 
-     >                      num_proc_rows, 
-     >                      num_proc_cols )
-
-
-c---------------------------------------------------------------------
-c  Set up partition's submatrix info: firstcol, lastcol, firstrow, lastrow
-c---------------------------------------------------------------------
-      call setup_submatrix_info( l2npcols,
-     >                           reduce_exch_proc,
-     >                           reduce_send_starts,
-     >                           reduce_send_lengths,
-     >                           reduce_recv_starts,
-     >                           reduce_recv_lengths )
-
-
-
-c---------------------------------------------------------------------
-c  Inialize random number generator
-c---------------------------------------------------------------------
-      tran    = 314159265.0D0
-      amult   = 1220703125.0D0
-      zeta    = randlc( tran, amult )
-
-c---------------------------------------------------------------------
-c  Set up partition's sparse random matrix for given class size
-c---------------------------------------------------------------------
-      call makea(naa, nzz, a, colidx, rowstr, nonzer,
-     >           firstrow, lastrow, firstcol, lastcol, 
-     >           rcond, arow, acol, aelt, v, iv, shift)
-
-
-
-c---------------------------------------------------------------------
-c  Note: as a result of the above call to makea:
-c        values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1
-c        values of colidx which are col indexes go from firstcol --> lastcol
-c        So:
-c        Shift the col index vals from actual (firstcol --> lastcol ) 
-c        to local, i.e., (1 --> lastcol-firstcol+1)
-c---------------------------------------------------------------------
-      do j=1,lastrow-firstrow+1
-         do k=rowstr(j),rowstr(j+1)-1
-            colidx(k) = colidx(k) - firstcol + 1
-         enddo
-      enddo
-
-c---------------------------------------------------------------------
-c  set starting vector to (1, 1, .... 1)
-c---------------------------------------------------------------------
-      do i = 1, na/num_proc_rows+1
-         x(i) = 1.0D0
-      enddo
-
-      zeta  = 0.0d0
-
-c---------------------------------------------------------------------
-c---->
-c  Do one iteration untimed to init all code and data page tables
-c---->                    (then reinit, start timing, to niter its)
-c---------------------------------------------------------------------
-      do it = 1, 1
-
-c---------------------------------------------------------------------
-c  The call to the conjugate gradient routine:
-c---------------------------------------------------------------------
-         call conj_grad ( colidx,
-     >                    rowstr,
-     >                    x,
-     >                    z,
-     >                    a,
-     >                    p,
-     >                    q,
-     >                    r,
-     >                    w,
-     >                    rnorm, 
-     >                    l2npcols,
-     >                    reduce_exch_proc,
-     >                    reduce_send_starts,
-     >                    reduce_send_lengths,
-     >                    reduce_recv_starts,
-     >                    reduce_recv_lengths )
-
-c---------------------------------------------------------------------
-c  zeta = shift + 1/(x.z)
-c  So, first: (x.z)
-c  Also, find norm of z
-c  So, first: (z.z)
-c---------------------------------------------------------------------
-         norm_temp1(1) = 0.0d0
-         norm_temp1(2) = 0.0d0
-         do j=1, lastcol-firstcol+1
-            norm_temp1(1) = norm_temp1(1) + x(j)*z(j)
-            norm_temp1(2) = norm_temp1(2) + z(j)*z(j)
-         enddo
-
-         do i = 1, l2npcols
-            call mpi_irecv( norm_temp2,
-     >                      2, 
-     >                      dp_type,
-     >                      reduce_exch_proc(i),
-     >                      i,
-     >                      mpi_comm_world,
-     >                      request,
-     >                      ierr )
-            call mpi_send(  norm_temp1,
-     >                      2, 
-     >                      dp_type,
-     >                      reduce_exch_proc(i),
-     >                      i,
-     >                      mpi_comm_world,
-     >                      ierr )
-            call mpi_wait( request, status, ierr )
-
-            norm_temp1(1) = norm_temp1(1) + norm_temp2(1)
-            norm_temp1(2) = norm_temp1(2) + norm_temp2(2)
-         enddo
-
-         norm_temp1(2) = 1.0d0 / sqrt( norm_temp1(2) )
-
-
-c---------------------------------------------------------------------
-c  Normalize z to obtain x
-c---------------------------------------------------------------------
-         do j=1, lastcol-firstcol+1      
-            x(j) = norm_temp1(2)*z(j)    
-         enddo                           
-
-
-      enddo                              ! end of do one iteration untimed
-
-
-c---------------------------------------------------------------------
-c  set starting vector to (1, 1, .... 1)
-c---------------------------------------------------------------------
-c
-c  NOTE: a questionable limit on size:  should this be na/num_proc_cols+1 ?
-c
-      do i = 1, na/num_proc_rows+1
-         x(i) = 1.0D0
-      enddo
-
-      zeta  = 0.0d0
-
-c---------------------------------------------------------------------
-c  Synchronize and start timing
-c---------------------------------------------------------------------
-      call mpi_barrier( mpi_comm_world,
-     >                  ierr )
-
-      call timer_clear( 1 )
-      call timer_start( 1 )
-
-c---------------------------------------------------------------------
-c---->
-c  Main Iteration for inverse power method
-c---->
-c---------------------------------------------------------------------
-      do it = 1, niter
-
-c---------------------------------------------------------------------
-c  The call to the conjugate gradient routine:
-c---------------------------------------------------------------------
-         call conj_grad ( colidx,
-     >                    rowstr,
-     >                    x,
-     >                    z,
-     >                    a,
-     >                    p,
-     >                    q,
-     >                    r,
-     >                    w,
-     >                    rnorm, 
-     >                    l2npcols,
-     >                    reduce_exch_proc,
-     >                    reduce_send_starts,
-     >                    reduce_send_lengths,
-     >                    reduce_recv_starts,
-     >                    reduce_recv_lengths )
-
-
-c---------------------------------------------------------------------
-c  zeta = shift + 1/(x.z)
-c  So, first: (x.z)
-c  Also, find norm of z
-c  So, first: (z.z)
-c---------------------------------------------------------------------
-         norm_temp1(1) = 0.0d0
-         norm_temp1(2) = 0.0d0
-         do j=1, lastcol-firstcol+1
-            norm_temp1(1) = norm_temp1(1) + x(j)*z(j)
-            norm_temp1(2) = norm_temp1(2) + z(j)*z(j)
-         enddo
-
-         do i = 1, l2npcols
-            call mpi_irecv( norm_temp2,
-     >                      2, 
-     >                      dp_type,
-     >                      reduce_exch_proc(i),
-     >                      i,
-     >                      mpi_comm_world,
-     >                      request,
-     >                      ierr )
-            call mpi_send(  norm_temp1,
-     >                      2, 
-     >                      dp_type,
-     >                      reduce_exch_proc(i),
-     >                      i,
-     >                      mpi_comm_world,
-     >                      ierr )
-            call mpi_wait( request, status, ierr )
-
-            norm_temp1(1) = norm_temp1(1) + norm_temp2(1)
-            norm_temp1(2) = norm_temp1(2) + norm_temp2(2)
-         enddo
-
-         norm_temp1(2) = 1.0d0 / sqrt( norm_temp1(2) )
-
-
-         if( me .eq. root )then
-            zeta = shift + 1.0d0 / norm_temp1(1)
-            if( it .eq. 1 ) write( *,9000 )
-            write( *,9001 ) it, rnorm, zeta
-         endif
- 9000 format( /,'   iteration           ||r||                 zeta' )
- 9001 format( 4x, i5, 7x, e20.14, f20.13 )
-
-c---------------------------------------------------------------------
-c  Normalize z to obtain x
-c---------------------------------------------------------------------
-         do j=1, lastcol-firstcol+1      
-            x(j) = norm_temp1(2)*z(j)    
-         enddo                           
-
-
-      enddo                              ! end of main iter inv pow meth
-
-      call timer_stop( 1 )
-
-c---------------------------------------------------------------------
-c  End of timed section
-c---------------------------------------------------------------------
-
-      t = timer_read( 1 )
-
-      call mpi_reduce( t,
-     >                 tmax,
-     >                 1, 
-     >                 dp_type,
-     >                 MPI_MAX,
-     >                 root,
-     >                 mpi_comm_world,
-     >                 ierr )
-
-      if( me .eq. root )then
-         write(*,100)
- 100     format(' Benchmark completed ')
-
-         epsilon = 1.d-10
-         if (class .ne. 'U') then
-
-            err = abs( zeta - zeta_verify_value )/zeta_verify_value
-            if( err .le. epsilon ) then
-               verified = .TRUE.
-               write(*, 200)
-               write(*, 201) zeta
-               write(*, 202) err
- 200           format(' VERIFICATION SUCCESSFUL ')
- 201           format(' Zeta is    ', E20.13)
- 202           format(' Error is   ', E20.13)
-            else
-               verified = .FALSE.
-               write(*, 300) 
-               write(*, 301) zeta
-               write(*, 302) zeta_verify_value
- 300           format(' VERIFICATION FAILED')
- 301           format(' Zeta                ', E20.13)
- 302           format(' The correct zeta is ', E20.13)
-            endif
-         else
-            verified = .FALSE.
-            write (*, 400)
-            write (*, 401)
-            write (*, 201) zeta
- 400        format(' Problem size unknown')
- 401        format(' NO VERIFICATION PERFORMED')
-         endif
-
-
-         if( tmax .ne. 0. ) then
-            mflops = float( 2*niter*na )
-     &                  * ( 3.+float( nonzer*(nonzer+1) )
-     &                    + 25.*(5.+float( nonzer*(nonzer+1) ))
-     &                    + 3. ) / tmax / 1000000.0
-         else
-            mflops = 0.0
-         endif
-
-         call print_results('CG', class, na, 0, 0,
-     >                      niter, nnodes_compiled, nprocs, tmax,
-     >                      mflops, '          floating point', 
-     >                      verified, npbversion, compiletime,
-     >                      cs1, cs2, cs3, cs4, cs5, cs6, cs7)
-
-
-      endif
-
-
-      call mpi_finalize(ierr)
-
-
-
-      end                              ! end main
-
-
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      subroutine initialize_mpi
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-
-      integer   ierr
-
-
-      call mpi_init( ierr )
-      call mpi_comm_rank( mpi_comm_world, me, ierr )
-      call mpi_comm_size( mpi_comm_world, nprocs, ierr )
-      root = 0
-
-
-      return
-      end
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      subroutine setup_proc_info( num_procs, 
-     >                            num_proc_rows, 
-     >                            num_proc_cols )
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-
-      common / partit_size  /  naa, nzz, 
-     >                         npcols, nprows,
-     >                         proc_col, proc_row,
-     >                         firstrow, 
-     >                         lastrow, 
-     >                         firstcol, 
-     >                         lastcol,
-     >                         exch_proc,
-     >                         exch_recv_length,
-     >                         send_start,
-     >                         send_len
-      integer                  naa, nzz, 
-     >                         npcols, nprows,
-     >                         proc_col, proc_row,
-     >                         firstrow, 
-     >                         lastrow, 
-     >                         firstcol, 
-     >                         lastcol,
-     >                         exch_proc,
-     >                         exch_recv_length,
-     >                         send_start,
-     >                         send_len
-
-      integer   num_procs, num_proc_cols, num_proc_rows
-      integer   i, ierr
-      integer   log2nprocs
-
-c---------------------------------------------------------------------
-c  num_procs must be a power of 2, and num_procs=num_proc_cols*num_proc_rows
-c  When num_procs is not square, then num_proc_cols = 2*num_proc_rows
-c---------------------------------------------------------------------
-c  First, number of procs must be power of two. 
-c---------------------------------------------------------------------
-      if( nprocs .ne. num_procs )then
-          if( me .eq. root ) write( *,9000 ) nprocs, num_procs
- 9000     format(      /,'Error: ',/,'num of procs allocated   (', 
-     >                 i4, ' )',
-     >                 /,'is not equal to',/,
-     >                 'compiled number of procs (',
-     >                 i4, ' )',/   )
-          call mpi_finalize(ierr)
-          stop
-      endif
-
-
-      i = num_proc_cols
- 100  continue
-          if( i .ne. 1 .and. i/2*2 .ne. i )then
-              if ( me .eq. root ) then  
-                 write( *,* ) 'Error: num_proc_cols is ',
-     >                         num_proc_cols,
-     >                        ' which is not a power of two'
-              endif
-              call mpi_finalize(ierr)
-              stop
-          endif
-          i = i / 2
-          if( i .ne. 0 )then
-              goto 100
-          endif
-      
-      i = num_proc_rows
- 200  continue
-          if( i .ne. 1 .and. i/2*2 .ne. i )then
-              if ( me .eq. root ) then 
-                 write( *,* ) 'Error: num_proc_rows is ',
-     >                         num_proc_rows,
-     >                        ' which is not a power of two'
-              endif
-              call mpi_finalize(ierr)
-              stop
-          endif
-          i = i / 2
-          if( i .ne. 0 )then
-              goto 200
-          endif
-      
-      log2nprocs = 0
-      i = nprocs
- 300  continue
-          if( i .ne. 1 .and. i/2*2 .ne. i )then
-              write( *,* ) 'Error: nprocs is ',
-     >                      nprocs,
-     >                      ' which is not a power of two'
-              call mpi_finalize(ierr)
-              stop
-          endif
-          i = i / 2
-          if( i .ne. 0 )then
-              log2nprocs = log2nprocs + 1
-              goto 300
-          endif
-
-CC       write( *,* ) 'nprocs, log2nprocs: ',nprocs,log2nprocs
-
-      
-      npcols = num_proc_cols
-      nprows = num_proc_rows
-
-
-      return
-      end
-
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      subroutine setup_submatrix_info( l2npcols,
-     >                                 reduce_exch_proc,
-     >                                 reduce_send_starts,
-     >                                 reduce_send_lengths,
-     >                                 reduce_recv_starts,
-     >                                 reduce_recv_lengths )
-     >                                 
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-
-      integer      col_size, row_size
-
-      common / partit_size  /  naa, nzz, 
-     >                         npcols, nprows,
-     >                         proc_col, proc_row,
-     >                         firstrow, 
-     >                         lastrow, 
-     >                         firstcol, 
-     >                         lastcol,
-     >                         exch_proc,
-     >                         exch_recv_length,
-     >                         send_start,
-     >                         send_len
-      integer                  naa, nzz, 
-     >                         npcols, nprows,
-     >                         proc_col, proc_row,
-     >                         firstrow, 
-     >                         lastrow, 
-     >                         firstcol, 
-     >                         lastcol,
-     >                         exch_proc,
-     >                         exch_recv_length,
-     >                         send_start,
-     >                         send_len
-
-      integer   reduce_exch_proc(*)
-      integer   reduce_send_starts(*)
-      integer   reduce_send_lengths(*)
-      integer   reduce_recv_starts(*)
-      integer   reduce_recv_lengths(*)
-
-      integer   i, j
-      integer   div_factor
-      integer   l2npcols
-
-
-      proc_row = me / npcols
-      proc_col = me - proc_row*npcols
-
-
-
-c---------------------------------------------------------------------
-c  If naa evenly divisible by npcols, then it is evenly divisible 
-c  by nprows 
-c---------------------------------------------------------------------
-
-      if( naa/npcols*npcols .eq. naa )then
-          col_size = naa/npcols
-          firstcol = proc_col*col_size + 1
-          lastcol  = firstcol - 1 + col_size
-          row_size = naa/nprows
-          firstrow = proc_row*row_size + 1
-          lastrow  = firstrow - 1 + row_size
-c---------------------------------------------------------------------
-c  If naa not evenly divisible by npcols, then first subdivide for nprows
-c  and then, if npcols not equal to nprows (i.e., not a sq number of procs), 
-c  get col subdivisions by dividing by 2 each row subdivision.
-c---------------------------------------------------------------------
-      else
-          if( proc_row .lt. naa - naa/nprows*nprows)then
-              row_size = naa/nprows+ 1
-              firstrow = proc_row*row_size + 1
-              lastrow  = firstrow - 1 + row_size
-          else
-              row_size = naa/nprows
-              firstrow = (naa - naa/nprows*nprows)*(row_size+1)
-     >                 + (proc_row-(naa-naa/nprows*nprows))
-     >                     *row_size + 1
-              lastrow  = firstrow - 1 + row_size
-          endif
-          if( npcols .eq. nprows )then
-              if( proc_col .lt. naa - naa/npcols*npcols )then
-                  col_size = naa/npcols+ 1
-                  firstcol = proc_col*col_size + 1
-                  lastcol  = firstcol - 1 + col_size
-              else
-                  col_size = naa/npcols
-                  firstcol = (naa - naa/npcols*npcols)*(col_size+1)
-     >                     + (proc_col-(naa-naa/npcols*npcols))
-     >                         *col_size + 1
-                  lastcol  = firstcol - 1 + col_size
-              endif
-          else
-              if( (proc_col/2) .lt. 
-     >                           naa - naa/(npcols/2)*(npcols/2) )then
-                  col_size = naa/(npcols/2) + 1
-                  firstcol = (proc_col/2)*col_size + 1
-                  lastcol  = firstcol - 1 + col_size
-              else
-                  col_size = naa/(npcols/2)
-                  firstcol = (naa - naa/(npcols/2)*(npcols/2))
-     >                                                 *(col_size+1)
-     >               + ((proc_col/2)-(naa-naa/(npcols/2)*(npcols/2)))
-     >                         *col_size + 1
-                  lastcol  = firstcol - 1 + col_size
-              endif
-CC               write( *,* ) col_size,firstcol,lastcol
-              if( mod( me,2 ) .eq. 0 )then
-                  lastcol  = firstcol - 1 + (col_size-1)/2 + 1
-              else
-                  firstcol = firstcol + (col_size-1)/2 + 1
-                  lastcol  = firstcol - 1 + col_size/2
-CC                   write( *,* ) firstcol,lastcol
-              endif
-          endif
-      endif
-
-
-
-      if( npcols .eq. nprows )then
-          send_start = 1
-          send_len   = lastrow - firstrow + 1
-      else
-          if( mod( me,2 ) .eq. 0 )then
-              send_start = 1
-              send_len   = (1 + lastrow-firstrow+1)/2
-          else
-              send_start = (1 + lastrow-firstrow+1)/2 + 1
-              send_len   = (lastrow-firstrow+1)/2
-          endif
-      endif
-          
-
-
-
-c---------------------------------------------------------------------
-c  Transpose exchange processor
-c---------------------------------------------------------------------
-
-      if( npcols .eq. nprows )then
-          exch_proc = mod( me,nprows )*nprows + me/nprows
-      else
-          exch_proc = 2*(mod( me/2,nprows )*nprows + me/2/nprows)
-     >                 + mod( me,2 )
-      endif
-
-
-
-      i = npcols / 2
-      l2npcols = 0
-      do while( i .gt. 0 )
-         l2npcols = l2npcols + 1
-         i = i / 2
-      enddo
-
-
-c---------------------------------------------------------------------
-c  Set up the reduce phase schedules...
-c---------------------------------------------------------------------
-
-      div_factor = npcols
-      do i = 1, l2npcols
-
-         j = mod( proc_col+div_factor/2, div_factor )
-     >     + proc_col / div_factor * div_factor
-         reduce_exch_proc(i) = proc_row*npcols + j
-
-         div_factor = div_factor / 2
-
-      enddo
-
-
-      do i = l2npcols, 1, -1
-
-            if( nprows .eq. npcols )then
-               reduce_send_starts(i)  = send_start
-               reduce_send_lengths(i) = send_len
-               reduce_recv_lengths(i) = lastrow - firstrow + 1
-            else
-               reduce_recv_lengths(i) = send_len
-               if( i .eq. l2npcols )then
-                  reduce_send_lengths(i) = lastrow-firstrow+1 - send_len
-                  if( me/2*2 .eq. me )then
-                     reduce_send_starts(i) = send_start + send_len
-                  else
-                     reduce_send_starts(i) = 1
-                  endif
-               else
-                  reduce_send_lengths(i) = send_len
-                  reduce_send_starts(i)  = send_start
-               endif
-            endif
-            reduce_recv_starts(i) = send_start
-
-      enddo
-
-
-      exch_recv_length = lastcol - firstcol + 1
-
-
-      return
-      end
-
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      subroutine conj_grad ( colidx,
-     >                       rowstr,
-     >                       x,
-     >                       z,
-     >                       a,
-     >                       p,
-     >                       q,
-     >                       r,
-     >                       w,
-     >                       rnorm, 
-     >                       l2npcols,
-     >                       reduce_exch_proc,
-     >                       reduce_send_starts,
-     >                       reduce_send_lengths,
-     >                       reduce_recv_starts,
-     >                       reduce_recv_lengths )
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c  Floaging point arrays here are named as in NPB1 spec discussion of 
-c  CG algorithm
-c---------------------------------------------------------------------
- 
-      implicit none
-
-      include 'mpinpb.h'
-
-      integer status(MPI_STATUS_SIZE ), request
-
-
-      common / partit_size  /  naa, nzz, 
-     >                         npcols, nprows,
-     >                         proc_col, proc_row,
-     >                         firstrow, 
-     >                         lastrow, 
-     >                         firstcol, 
-     >                         lastcol,
-     >                         exch_proc,
-     >                         exch_recv_length,
-     >                         send_start,
-     >                         send_len
-      integer                  naa, nzz, 
-     >                         npcols, nprows,
-     >                         proc_col, proc_row,
-     >                         firstrow, 
-     >                         lastrow, 
-     >                         firstcol, 
-     >                         lastcol,
-     >                         exch_proc,
-     >                         exch_recv_length,
-     >                         send_start,
-     >                         send_len
-
-
-
-      double precision   x(*),
-     >                   z(*),
-     >                   a(nzz)
-      integer            colidx(nzz), rowstr(naa+1)
-
-      double precision   p(*),
-     >                   q(*),
-     >                   r(*),               
-     >                   w(*)                ! used as work temporary
-
-      integer   l2npcols
-      integer   reduce_exch_proc(l2npcols)
-      integer   reduce_send_starts(l2npcols)
-      integer   reduce_send_lengths(l2npcols)
-      integer   reduce_recv_starts(l2npcols)
-      integer   reduce_recv_lengths(l2npcols)
-
-      integer   i, j, k, ierr
-      integer   cgit, cgitmax
-
-      double precision   d, sum, rho, rho0, alpha, beta, rnorm
-
-      external         timer_read
-      double precision timer_read
-
-      data      cgitmax / 25 /
-
-
-c---------------------------------------------------------------------
-c  Initialize the CG algorithm:
-c---------------------------------------------------------------------
-      do j=1,naa/nprows+1
-         q(j) = 0.0d0
-         z(j) = 0.0d0
-         r(j) = x(j)
-         p(j) = r(j)
-         w(j) = 0.0d0                 
-      enddo
-
-
-c---------------------------------------------------------------------
-c  rho = r.r
-c  Now, obtain the norm of r: First, sum squares of r elements locally...
-c---------------------------------------------------------------------
-      sum = 0.0d0
-      do j=1, lastcol-firstcol+1
-         sum = sum + r(j)*r(j)
-      enddo
-
-c---------------------------------------------------------------------
-c  Exchange and sum with procs identified in reduce_exch_proc
-c  (This is equivalent to mpi_allreduce.)
-c  Sum the partial sums of rho, leaving rho on all processors
-c---------------------------------------------------------------------
-      do i = 1, l2npcols
-         call mpi_irecv( rho,
-     >                   1,
-     >                   dp_type,
-     >                   reduce_exch_proc(i),
-     >                   i,
-     >                   mpi_comm_world,
-     >                   request,
-     >                   ierr )
-         call mpi_send(  sum,
-     >                   1,
-     >                   dp_type,
-     >                   reduce_exch_proc(i),
-     >                   i,
-     >                   mpi_comm_world,
-     >                   ierr )
-         call mpi_wait( request, status, ierr )
-
-         sum = sum + rho
-      enddo
-      rho = sum
-
-
-
-c---------------------------------------------------------------------
-c---->
-c  The conj grad iteration loop
-c---->
-c---------------------------------------------------------------------
-      do cgit = 1, cgitmax
-
-
-c---------------------------------------------------------------------
-c  q = A.p
-c  The partition submatrix-vector multiply: use workspace w
-c---------------------------------------------------------------------
-         do j=1,lastrow-firstrow+1
-            sum = 0.d0
-            do k=rowstr(j),rowstr(j+1)-1
-               sum = sum + a(k)*p(colidx(k))
-            enddo
-            w(j) = sum
-         enddo
-
-c---------------------------------------------------------------------
-c  Sum the partition submatrix-vec A.p's across rows
-c  Exchange and sum piece of w with procs identified in reduce_exch_proc
-c---------------------------------------------------------------------
-         do i = l2npcols, 1, -1
-            call mpi_irecv( q(reduce_recv_starts(i)),
-     >                      reduce_recv_lengths(i),
-     >                      dp_type,
-     >                      reduce_exch_proc(i),
-     >                      i,
-     >                      mpi_comm_world,
-     >                      request,
-     >                      ierr )
-            call mpi_send(  w(reduce_send_starts(i)),
-     >                      reduce_send_lengths(i),
-     >                      dp_type,
-     >                      reduce_exch_proc(i),
-     >                      i,
-     >                      mpi_comm_world,
-     >                      ierr )
-            call mpi_wait( request, status, ierr )
-            do j=send_start,send_start + reduce_recv_lengths(i) - 1
-               w(j) = w(j) + q(j)
-            enddo
-         enddo
-      
-
-c---------------------------------------------------------------------
-c  Exchange piece of q with transpose processor:
-c---------------------------------------------------------------------
-         if( l2npcols .ne. 0 )then
-            call mpi_irecv( q,               
-     >                      exch_recv_length,
-     >                      dp_type,
-     >                      exch_proc,
-     >                      1,
-     >                      mpi_comm_world,
-     >                      request,
-     >                      ierr )
-
-            call mpi_send(  w(send_start),   
-     >                      send_len,
-     >                      dp_type,
-     >                      exch_proc,
-     >                      1,
-     >                      mpi_comm_world,
-     >                      ierr )
-            call mpi_wait( request, status, ierr )
-         else
-            do j=1,exch_recv_length
-               q(j) = w(j)
-            enddo
-         endif
-
-
-c---------------------------------------------------------------------
-c  Clear w for reuse...
-c---------------------------------------------------------------------
-         do j=1, max( lastrow-firstrow+1, lastcol-firstcol+1 )
-            w(j) = 0.0d0
-         enddo
-         
-
-c---------------------------------------------------------------------
-c  Obtain p.q
-c---------------------------------------------------------------------
-         sum = 0.0d0
-         do j=1, lastcol-firstcol+1
-            sum = sum + p(j)*q(j)
-         enddo
-
-c---------------------------------------------------------------------
-c  Obtain d with a sum-reduce
-c---------------------------------------------------------------------
-         do i = 1, l2npcols
-            call mpi_irecv( d,
-     >                      1,
-     >                      dp_type,
-     >                      reduce_exch_proc(i),
-     >                      i,
-     >                      mpi_comm_world,
-     >                      request,
-     >                      ierr )
-            call mpi_send(  sum,
-     >                      1,
-     >                      dp_type,
-     >                      reduce_exch_proc(i),
-     >                      i,
-     >                      mpi_comm_world,
-     >                      ierr )
-
-            call mpi_wait( request, status, ierr )
-
-            sum = sum + d
-         enddo
-         d = sum
-
-
-c---------------------------------------------------------------------
-c  Obtain alpha = rho / (p.q)
-c---------------------------------------------------------------------
-         alpha = rho / d
-
-c---------------------------------------------------------------------
-c  Save a temporary of rho
-c---------------------------------------------------------------------
-         rho0 = rho
-
-c---------------------------------------------------------------------
-c  Obtain z = z + alpha*p
-c  and    r = r - alpha*q
-c---------------------------------------------------------------------
-         do j=1, lastcol-firstcol+1
-            z(j) = z(j) + alpha*p(j)
-            r(j) = r(j) - alpha*q(j)
-         enddo
-            
-c---------------------------------------------------------------------
-c  rho = r.r
-c  Now, obtain the norm of r: First, sum squares of r elements locally...
-c---------------------------------------------------------------------
-         sum = 0.0d0
-         do j=1, lastcol-firstcol+1
-            sum = sum + r(j)*r(j)
-         enddo
-
-c---------------------------------------------------------------------
-c  Obtain rho with a sum-reduce
-c---------------------------------------------------------------------
-         do i = 1, l2npcols
-            call mpi_irecv( rho,
-     >                      1,
-     >                      dp_type,
-     >                      reduce_exch_proc(i),
-     >                      i,
-     >                      mpi_comm_world,
-     >                      request,
-     >                      ierr )
-            call mpi_send(  sum,
-     >                      1,
-     >                      dp_type,
-     >                      reduce_exch_proc(i),
-     >                      i,
-     >                      mpi_comm_world,
-     >                      ierr )
-            call mpi_wait( request, status, ierr )
-
-            sum = sum + rho
-         enddo
-         rho = sum
-
-c---------------------------------------------------------------------
-c  Obtain beta:
-c---------------------------------------------------------------------
-         beta = rho / rho0
-
-c---------------------------------------------------------------------
-c  p = r + beta*p
-c---------------------------------------------------------------------
-         do j=1, lastcol-firstcol+1
-            p(j) = r(j) + beta*p(j)
-         enddo
-
-
-
-      enddo                             ! end of do cgit=1,cgitmax
-
-
-
-c---------------------------------------------------------------------
-c  Compute residual norm explicitly:  ||r|| = ||x - A.z||
-c  First, form A.z
-c  The partition submatrix-vector multiply
-c---------------------------------------------------------------------
-      do j=1,lastrow-firstrow+1
-         sum = 0.d0
-         do k=rowstr(j),rowstr(j+1)-1
-            sum = sum + a(k)*z(colidx(k))
-         enddo
-         w(j) = sum
-      enddo
-
-
-
-c---------------------------------------------------------------------
-c  Sum the partition submatrix-vec A.z's across rows
-c---------------------------------------------------------------------
-      do i = l2npcols, 1, -1
-         call mpi_irecv( r(reduce_recv_starts(i)),
-     >                   reduce_recv_lengths(i),
-     >                   dp_type,
-     >                   reduce_exch_proc(i),
-     >                   i,
-     >                   mpi_comm_world,
-     >                   request,
-     >                   ierr )
-         call mpi_send(  w(reduce_send_starts(i)),
-     >                   reduce_send_lengths(i),
-     >                   dp_type,
-     >                   reduce_exch_proc(i),
-     >                   i,
-     >                   mpi_comm_world,
-     >                   ierr )
-         call mpi_wait( request, status, ierr )
-
-         do j=send_start,send_start + reduce_recv_lengths(i) - 1
-            w(j) = w(j) + r(j)
-         enddo
-      enddo
-      
-
-c---------------------------------------------------------------------
-c  Exchange piece of q with transpose processor:
-c---------------------------------------------------------------------
-      if( l2npcols .ne. 0 )then
-         call mpi_irecv( r,               
-     >                   exch_recv_length,
-     >                   dp_type,
-     >                   exch_proc,
-     >                   1,
-     >                   mpi_comm_world,
-     >                   request,
-     >                   ierr )
-   
-         call mpi_send(  w(send_start),   
-     >                   send_len,
-     >                   dp_type,
-     >                   exch_proc,
-     >                   1,
-     >                   mpi_comm_world,
-     >                   ierr )
-         call mpi_wait( request, status, ierr )
-      else
-         do j=1,exch_recv_length
-            r(j) = w(j)
-         enddo
-      endif
-
-
-c---------------------------------------------------------------------
-c  At this point, r contains A.z
-c---------------------------------------------------------------------
-         sum = 0.0d0
-         do j=1, lastcol-firstcol+1
-            d   = x(j) - r(j)         
-            sum = sum + d*d
-         enddo
-         
-c---------------------------------------------------------------------
-c  Obtain d with a sum-reduce
-c---------------------------------------------------------------------
-      do i = 1, l2npcols
-         call mpi_irecv( d,
-     >                   1,
-     >                   dp_type,
-     >                   reduce_exch_proc(i),
-     >                   i,
-     >                   mpi_comm_world,
-     >                   request,
-     >                   ierr )
-         call mpi_send(  sum,
-     >                   1,
-     >                   dp_type,
-     >                   reduce_exch_proc(i),
-     >                   i,
-     >                   mpi_comm_world,
-     >                   ierr )
-         call mpi_wait( request, status, ierr )
-
-         sum = sum + d
-      enddo
-      d = sum
-
-
-      if( me .eq. root ) rnorm = sqrt( d )
-
-
-
-      return
-      end                               ! end of routine conj_grad
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      subroutine makea( n, nz, a, colidx, rowstr, nonzer,
-     >                  firstrow, lastrow, firstcol, lastcol,
-     >                  rcond, arow, acol, aelt, v, iv, shift )
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      integer             n, nz
-      integer             firstrow, lastrow, firstcol, lastcol
-      integer             colidx(nz), rowstr(n+1)
-      integer             iv(2*n+1), arow(nz), acol(nz)
-      double precision    v(n+1), aelt(nz)
-      double precision    rcond, a(nz), shift
-
-c---------------------------------------------------------------------
-c       generate the test problem for benchmark 6
-c       makea generates a sparse matrix with a
-c       prescribed sparsity distribution
-c
-c       parameter    type        usage
-c
-c       input
-c
-c       n            i           number of cols/rows of matrix
-c       nz           i           nonzeros as declared array size
-c       rcond        r*8         condition number
-c       shift        r*8         main diagonal shift
-c
-c       output
-c
-c       a            r*8         array for nonzeros
-c       colidx       i           col indices
-c       rowstr       i           row pointers
-c
-c       workspace
-c
-c       iv, arow, acol i
-c       v, aelt        r*8
-c---------------------------------------------------------------------
-
-      integer i, nnza, iouter, ivelt, ivelt1, irow, nzv, NONZER
-
-c---------------------------------------------------------------------
-c      nonzer is approximately  (int(sqrt(nnza /n)));
-c---------------------------------------------------------------------
-
-      double precision  size, ratio, scale
-      external          sparse, sprnvc, vecset
-
-      size = 1.0D0
-      ratio = rcond ** (1.0D0 / dfloat(n))
-      nnza = 0
-
-c---------------------------------------------------------------------
-c  Initialize iv(n+1 .. 2n) to zero.
-c  Used by sprnvc to mark nonzero positions
-c---------------------------------------------------------------------
-
-      do i = 1, n
-           iv(n+i) = 0
-      enddo
-      do iouter = 1, n
-         nzv = nonzer
-         call sprnvc( n, nzv, v, colidx, iv(1), iv(n+1) )
-         call vecset( n, v, colidx, nzv, iouter, .5D0 )
-         do ivelt = 1, nzv
-              jcol = colidx(ivelt)
-              if (jcol.ge.firstcol .and. jcol.le.lastcol) then
-                 scale = size * v(ivelt)
-                 do ivelt1 = 1, nzv
-                    irow = colidx(ivelt1)
-                    if (irow.ge.firstrow .and. irow.le.lastrow) then
-                       nnza = nnza + 1
-                       if (nnza .gt. nz) goto 9999
-                       acol(nnza) = jcol
-                       arow(nnza) = irow
-                       aelt(nnza) = v(ivelt1) * scale
-                    endif
-                 enddo
-              endif
-         enddo
-         size = size * ratio
-      enddo
-
-
-c---------------------------------------------------------------------
-c       ... add the identity * rcond to the generated matrix to bound
-c           the smallest eigenvalue from below by rcond
-c---------------------------------------------------------------------
-        do i = firstrow, lastrow
-           if (i.ge.firstcol .and. i.le.lastcol) then
-              iouter = n + i
-              nnza = nnza + 1
-              if (nnza .gt. nz) goto 9999
-              acol(nnza) = i
-              arow(nnza) = i
-              aelt(nnza) = rcond - shift
-           endif
-        enddo
-
-
-c---------------------------------------------------------------------
-c       ... make the sparse matrix from list of elements with duplicates
-c           (v and iv are used as  workspace)
-c---------------------------------------------------------------------
-      call sparse( a, colidx, rowstr, n, arow, acol, aelt,
-     >             firstrow, lastrow,
-     >             v, iv(1), iv(n+1), nnza )
-      return
-
- 9999 continue
-      write(*,*) 'Space for matrix elements exceeded in makea'
-      write(*,*) 'nnza, nzmax = ',nnza, nz
-      write(*,*) ' iouter = ',iouter
-
-      stop
-      end
-c-------end   of makea------------------------------
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      subroutine sparse( a, colidx, rowstr, n, arow, acol, aelt,
-     >                   firstrow, lastrow,
-     >                   x, mark, nzloc, nnza )
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit           logical (a-z)
-      integer            colidx(*), rowstr(*)
-      integer            firstrow, lastrow
-      integer            n, arow(*), acol(*), nnza
-      double precision   a(*), aelt(*)
-
-c---------------------------------------------------------------------
-c       rows range from firstrow to lastrow
-c       the rowstr pointers are defined for nrows = lastrow-firstrow+1 values
-c---------------------------------------------------------------------
-      integer            nzloc(n), nrows
-      double precision   x(n)
-      logical            mark(n)
-
-c---------------------------------------------------
-c       generate a sparse matrix from a list of
-c       [col, row, element] tri
-c---------------------------------------------------
-
-      integer            i, j, jajp1, nza, k, nzrow
-      double precision   xi
-
-c---------------------------------------------------------------------
-c    how many rows of result
-c---------------------------------------------------------------------
-      nrows = lastrow - firstrow + 1
-
-c---------------------------------------------------------------------
-c     ...count the number of triples in each row
-c---------------------------------------------------------------------
-      do j = 1, n
-         rowstr(j) = 0
-         mark(j) = .false.
-      enddo
-      rowstr(n+1) = 0
-
-      do nza = 1, nnza
-         j = (arow(nza) - firstrow + 1) + 1
-         rowstr(j) = rowstr(j) + 1
-      enddo
-
-      rowstr(1) = 1
-      do j = 2, nrows+1
-         rowstr(j) = rowstr(j) + rowstr(j-1)
-      enddo
-
-
-c---------------------------------------------------------------------
-c     ... rowstr(j) now is the location of the first nonzero
-c           of row j of a
-c---------------------------------------------------------------------
-
-
-c---------------------------------------------------------------------
-c     ... do a bucket sort of the triples on the row index
-c---------------------------------------------------------------------
-      do nza = 1, nnza
-         j = arow(nza) - firstrow + 1
-         k = rowstr(j)
-         a(k) = aelt(nza)
-         colidx(k) = acol(nza)
-         rowstr(j) = rowstr(j) + 1
-      enddo
-
-
-c---------------------------------------------------------------------
-c       ... rowstr(j) now points to the first element of row j+1
-c---------------------------------------------------------------------
-      do j = nrows, 1, -1
-          rowstr(j+1) = rowstr(j)
-      enddo
-      rowstr(1) = 1
-
-
-c---------------------------------------------------------------------
-c       ... generate the actual output rows by adding elements
-c---------------------------------------------------------------------
-      nza = 0
-      do i = 1, n
-          x(i)    = 0.0
-          mark(i) = .false.
-      enddo
-
-      jajp1 = rowstr(1)
-      do j = 1, nrows
-         nzrow = 0
-
-c---------------------------------------------------------------------
-c          ...loop over the jth row of a
-c---------------------------------------------------------------------
-         do k = jajp1 , rowstr(j+1)-1
-            i = colidx(k)
-            x(i) = x(i) + a(k)
-            if ( (.not. mark(i)) .and. (x(i) .ne. 0.D0)) then
-             mark(i) = .true.
-             nzrow = nzrow + 1
-             nzloc(nzrow) = i
-            endif
-         enddo
-
-c---------------------------------------------------------------------
-c          ... extract the nonzeros of this row
-c---------------------------------------------------------------------
-         do k = 1, nzrow
-            i = nzloc(k)
-            mark(i) = .false.
-            xi = x(i)
-            x(i) = 0.D0
-            if (xi .ne. 0.D0) then
-             nza = nza + 1
-             a(nza) = xi
-             colidx(nza) = i
-            endif
-         enddo
-         jajp1 = rowstr(j+1)
-         rowstr(j+1) = nza + rowstr(1)
-      enddo
-CC       write (*, 11000) nza
-      return
-11000   format ( //,'final nonzero count in sparse ',
-     1            /,'number of nonzeros       = ', i16 )
-      end
-c-------end   of sparse-----------------------------
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      subroutine sprnvc( n, nz, v, iv, nzloc, mark )
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit           logical (a-z)
-      double precision   v(*)
-      integer            n, nz, iv(*), nzloc(n), nn1
-      integer mark(n)
-      common /urando/    amult, tran
-      double precision   amult, tran
-
-
-c---------------------------------------------------------------------
-c       generate a sparse n-vector (v, iv)
-c       having nzv nonzeros
-c
-c       mark(i) is set to 1 if position i is nonzero.
-c       mark is all zero on entry and is reset to all zero before exit
-c       this corrects a performance bug found by John G. Lewis, caused by
-c       reinitialization of mark on every one of the n calls to sprnvc
-c---------------------------------------------------------------------
-
-        integer            nzrow, nzv, ii, i, icnvrt
-
-        external           randlc, icnvrt
-        double precision   randlc, vecelt, vecloc
-
-
-        nzv = 0
-        nzrow = 0
-        nn1 = 1
- 50     continue
-          nn1 = 2 * nn1
-          if (nn1 .lt. n) goto 50
-
-c---------------------------------------------------------------------
-c    nn1 is the smallest power of two not less than n
-c---------------------------------------------------------------------
-
-100     continue
-        if (nzv .ge. nz) goto 110
-         vecelt = randlc( tran, amult )
-
-c---------------------------------------------------------------------
-c   generate an integer between 1 and n in a portable manner
-c---------------------------------------------------------------------
-         vecloc = randlc(tran, amult)
-         i = icnvrt(vecloc, nn1) + 1
-         if (i .gt. n) goto 100
-
-c---------------------------------------------------------------------
-c  was this integer generated already?
-c---------------------------------------------------------------------
-         if (mark(i) .eq. 0) then
-            mark(i) = 1
-            nzrow = nzrow + 1
-            nzloc(nzrow) = i
-            nzv = nzv + 1
-            v(nzv) = vecelt
-            iv(nzv) = i
-         endif
-         goto 100
-110      continue
-      do ii = 1, nzrow
-         i = nzloc(ii)
-         mark(i) = 0
-      enddo
-      return
-      end
-c-------end   of sprnvc-----------------------------
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      function icnvrt(x, ipwr2)
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit           logical (a-z)
-      double precision   x
-      integer            ipwr2, icnvrt
-
-c---------------------------------------------------------------------
-c    scale a double precision number x in (0,1) by a power of 2 and chop it
-c---------------------------------------------------------------------
-      icnvrt = int(ipwr2 * x)
-
-      return
-      end
-c-------end   of icnvrt-----------------------------
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      subroutine vecset(n, v, iv, nzv, i, val)
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit           logical (a-z)
-      integer            n, iv(*), nzv, i, k
-      double precision   v(*), val
-
-c---------------------------------------------------------------------
-c       set ith element of sparse vector (v, iv) with
-c       nzv nonzeros to val
-c---------------------------------------------------------------------
-
-      logical set
-
-      set = .false.
-      do k = 1, nzv
-         if (iv(k) .eq. i) then
-            v(k) = val
-            set  = .true.
-         endif
-      enddo
-      if (.not. set) then
-         nzv     = nzv + 1
-         v(nzv)  = val
-         iv(nzv) = i
-      endif
-      return
-      end
-c-------end   of vecset-----------------------------
-
diff --git a/examples/smpi/NAS/CG/mpinpb.h b/examples/smpi/NAS/CG/mpinpb.h
deleted file mode 100644
index 1f0368c0b7..0000000000
--- a/examples/smpi/NAS/CG/mpinpb.h
+++ /dev/null
@@ -1,9 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include           'mpif.h'
-
-      integer           me, nprocs, root, dp_type
-      common /mpistuff/ me, nprocs, root, dp_type
-
diff --git a/examples/smpi/NAS/FT/Makefile b/examples/smpi/NAS/FT/Makefile
deleted file mode 100644
index 1cc6e1416b..0000000000
--- a/examples/smpi/NAS/FT/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-SHELL=/bin/sh
-BENCHMARK=ft
-BENCHMARKU=FT
-
-include ../config/make.def
-
-include ../sys/make.common
-
-OBJS = ft.o ${COMMON}/${RAND}.o ${COMMON}/print_results.o ${COMMON}/timers.o
-
-${PROGRAM}: config ${OBJS}
-	${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
-
-
-
-.f.o:
-	${FCOMPILE} $<
-
-ft.o:             ft.f  global.h mpinpb.h npbparams.h
-
-clean:
-	- rm -f *.o *~ mputil*
-	- rm -f ft npbparams.h core
diff --git a/examples/smpi/NAS/FT/README b/examples/smpi/NAS/FT/README
deleted file mode 100644
index ab08b363b2..0000000000
--- a/examples/smpi/NAS/FT/README
+++ /dev/null
@@ -1,5 +0,0 @@
-This code implements the time integration of a three-dimensional
-partial differential equation using the Fast Fourier Transform.
-Some of the dimension statements are not F77 conforming and will
-not work using the g77 compiler. All dimension statements,
-however, are legal F90.
\ No newline at end of file
diff --git a/examples/smpi/NAS/FT/ft.f b/examples/smpi/NAS/FT/ft.f
deleted file mode 100644
index 5e3a3b0b40..0000000000
--- a/examples/smpi/NAS/FT/ft.f
+++ /dev/null
@@ -1,1998 +0,0 @@
-!-------------------------------------------------------------------------!
-!                                                                         !
-!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
-!                                                                         !
-!                                   F T                                   !
-!                                                                         !
-!-------------------------------------------------------------------------!
-!                                                                         !
-!    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
-!    It is described in NAS Technical Reports 95-020 and 02-007           !
-!                                                                         !
-!    Permission to use, copy, distribute and modify this software         !
-!    for any purpose with or without fee is hereby granted.  We           !
-!    request, however, that all derived work reference the NAS            !
-!    Parallel Benchmarks 3.3. This software is provided "as is"           !
-!    without express or implied warranty.                                 !
-!                                                                         !
-!    Information on NPB 3.3, including the technical report, the          !
-!    original specifications, source code, results and information        !
-!    on how to submit new results, is available at:                       !
-!                                                                         !
-!           http://www.nas.nasa.gov/Software/NPB/                         !
-!                                                                         !
-!    Send comments or suggestions to  npb@nas.nasa.gov                    !
-!                                                                         !
-!          NAS Parallel Benchmarks Group                                  !
-!          NASA Ames Research Center                                      !
-!          Mail Stop: T27A-1                                              !
-!          Moffett Field, CA   94035-1000                                 !
-!                                                                         !
-!          E-mail:  npb@nas.nasa.gov                                      !
-!          Fax:     (650) 604-3957                                        !
-!                                                                         !
-!-------------------------------------------------------------------------!
-
-!TO REDUCE THE AMOUNT OF MEMORY REQUIRED BY THE BENCHMARK WE NO LONGER
-!STORE THE ENTIRE TIME EVOLUTION ARRAY "EX" FOR ALL TIME STEPS, BUT
-!JUST FOR THE FIRST. ALSO, IT IS STORED ONLY FOR THE PART OF THE GRID
-!FOR WHICH THE CALLING PROCESSOR IS RESPONSIBLE, SO THAT THE MEMORY 
-!USAGE BECOMES SCALABLE. THIS NEW ARRAY IS CALLED "TWIDDLE" (SEE
-!NPB3.0-SER)
-
-!TO AVOID PROBLEMS WITH VERY LARGE ARRAY SIZES THAT ARE COMPUTED BY
-!MULTIPLYING GRID DIMENSIONS (CAUSING INTEGER OVERFLOW IN THE VARIABLE
-!NTOTAL) AND SUBSEQUENTLY DIVIDING BY THE NUMBER OF PROCESSORS, WE
-!COMPUTE THE SIZE OF ARRAY PARTITIONS MORE CONSERVATIVELY AS
-!((NX*NY)/NP)*NZ, WHERE NX, NY, AND NZ ARE GRID DIMENSIONS AND NP IS
-!THE NUMBER OF PROCESSORS, THE RESULT IS STORED IN "NTDIVNP". FOR THE 
-!PERFORMANCE CALCULATION WE STORE THE TOTAL NUMBER OF GRID POINTS IN A 
-!FLOATING POINT NUMBER "NTOTAL_F" INSTEAD OF AN INTEGER.
-!THIS FIX WILL FAIL IF THE NUMBER OF PROCESSORS IS SMALL.
-
-!UGLY HACK OF SUBROUTINE IPOW46: FOR VERY LARGE GRIDS THE SINGLE EXPONENT
-!FROM NPB2.3 MAY NOT FIT IN A 32-BIT INTEGER. HOWEVER, WE KNOW THAT THE
-!"EXPONENT" ARGUMENT OF THIS ROUTINE CAN ALWAYS BE FACTORED INTO A TERM 
-!DIVISIBLE BY NX (EXP_1) AND ANOTHER TERM (EXP_2). NX IS USUALLY A POWER
-!OF TWO, SO WE CAN KEEP HALVING IT UNTIL THE PRODUCT OF EXP_1
-!AND EXP_2 IS SMALL ENOUGH (NAMELY EXP_2 ITSELF). THIS UPDATED VERSION
-!OF IPWO46, WHICH NOW TAKES THE TWO FACTORS OF "EXPONENT" AS SEPARATE
-!ARGUMENTS, MAY BREAK DOWN IF EXP_1 DOES NOT CONTAIN A LARGE POWER OF TWO.
-
-c---------------------------------------------------------------------
-c
-c Authors: D. Bailey
-c          W. Saphir
-c          R. F. Van der Wijngaart
-c
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c FT benchmark
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      program ft
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpif.h'
-      include 'global.h'
-      integer i, ierr
-      
-c---------------------------------------------------------------------
-c u0, u1, u2 are the main arrays in the problem. 
-c Depending on the decomposition, these arrays will have different 
-c dimensions. To accomodate all possibilities, we allocate them as 
-c one-dimensional arrays and pass them to subroutines for different 
-c views
-c  - u0 contains the initial (transformed) initial condition
-c  - u1 and u2 are working arrays
-c---------------------------------------------------------------------
-
-      double complex   u0(ntdivnp), 
-     >                 u1(ntdivnp), 
-     >                 u2(ntdivnp)
-      double precision twiddle(ntdivnp)
-c---------------------------------------------------------------------
-c Large arrays are in common so that they are allocated on the
-c heap rather than the stack. This common block is not
-c referenced directly anywhere else. Padding is to avoid accidental 
-c cache problems, since all array sizes are powers of two.
-c---------------------------------------------------------------------
-
-      double complex pad1(3), pad2(3), pad3(3)
-      common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle
-
-      integer iter
-      double precision total_time, mflops
-      logical verified
-      character class
-
-      call MPI_Init(ierr)
-
-c---------------------------------------------------------------------
-c Run the entire problem once to make sure all data is touched. 
-c This reduces variable startup costs, which is important for such a 
-c short benchmark. The other NPB 2 implementations are similar. 
-c---------------------------------------------------------------------
-      do i = 1, t_max
-         call timer_clear(i)
-      end do
-
-      call setup()
-      call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3))
-      call compute_initial_conditions(u1, dims(1,1), dims(2,1), 
-     >                                dims(3,1))
-      call fft_init (dims(1,1))
-      call fft(1, u1, u0)
-
-c---------------------------------------------------------------------
-c Start over from the beginning. Note that all operations must
-c be timed, in contrast to other benchmarks. 
-c---------------------------------------------------------------------
-      do i = 1, t_max
-         call timer_clear(i)
-      end do
-      call MPI_Barrier(MPI_COMM_WORLD, ierr)
-
-      call timer_start(T_total)
-      if (timers_enabled) call timer_start(T_setup)
-
-      call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3))
-      call compute_initial_conditions(u1, dims(1,1), dims(2,1), 
-     >                                dims(3,1))
-      call fft_init (dims(1,1))
-
-      if (timers_enabled) call synchup()
-      if (timers_enabled) call timer_stop(T_setup)
-
-      if (timers_enabled) call timer_start(T_fft)
-      call fft(1, u1, u0)
-      if (timers_enabled) call timer_stop(T_fft)
-
-      do iter = 1, niter
-         if (timers_enabled) call timer_start(T_evolve)
-         call evolve(u0, u1, twiddle, dims(1,1), dims(2,1), dims(3,1))
-         if (timers_enabled) call timer_stop(T_evolve)
-         if (timers_enabled) call timer_start(T_fft)
-         call fft(-1, u1, u2)
-         if (timers_enabled) call timer_stop(T_fft)
-         if (timers_enabled) call synchup()
-         if (timers_enabled) call timer_start(T_checksum)
-         call checksum(iter, u2, dims(1,1), dims(2,1), dims(3,1))
-         if (timers_enabled) call timer_stop(T_checksum)
-      end do
-
-      call verify(nx, ny, nz, niter, verified, class)
-      call timer_stop(t_total)
-      if (np .ne. np_min) verified = .false.
-      total_time = timer_read(t_total)
-
-      if( total_time .ne. 0. ) then
-         mflops = 1.0d-6*ntotal_f *
-     >             (14.8157+7.19641*log(ntotal_f)
-     >          +  (5.23518+7.21113*log(ntotal_f))*niter)
-     >                 /total_time
-      else
-         mflops = 0.0
-      endif
-      if (me .eq. 0) then
-         call print_results('FT', class, nx, ny, nz, niter, np_min, np,
-     >     total_time, mflops, '          floating point', verified, 
-     >     npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7)
-      endif
-      if (timers_enabled) call print_timers()
-      call MPI_Finalize(ierr)
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine evolve(u0, u1, twiddle, d1, d2, d3)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c evolve u0 -> u1 (t time steps) in fourier space
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'global.h'
-      integer d1, d2, d3
-      double precision exi
-      double complex u0(d1,d2,d3)
-      double complex u1(d1,d2,d3)
-      double precision twiddle(d1,d2,d3)
-      integer i, j, k
-
-      do k = 1, d3
-         do j = 1, d2
-            do i = 1, d1
-               u0(i,j,k) = u0(i,j,k)*(twiddle(i,j,k))
-               u1(i,j,k) = u0(i,j,k)
-            end do
-         end do
-      end do
-
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine compute_initial_conditions(u0, d1, d2, d3)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c Fill in array u0 with initial conditions from 
-c random number generator 
-c---------------------------------------------------------------------
-      implicit none
-      include 'global.h'
-      integer d1, d2, d3
-      double complex u0(d1, d2, d3)
-      integer k
-      double precision x0, start, an, dummy
-      
-c---------------------------------------------------------------------
-c 0-D and 1-D layouts are easy because each processor gets a contiguous
-c chunk of the array, in the Fortran ordering sense. 
-c For a 2-D layout, it's a bit more complicated. We always
-c have entire x-lines (contiguous) in processor. 
-c We can do ny/np1 of them at a time since we have
-c ny/np1 contiguous in y-direction. But then we jump
-c by z-planes (nz/np2 of them, total). 
-c For the 0-D and 1-D layouts we could do larger chunks, but
-c this turns out to have no measurable impact on performance. 
-c---------------------------------------------------------------------
-
-
-      start = seed                                    
-c---------------------------------------------------------------------
-c Jump to the starting element for our first plane.
-c---------------------------------------------------------------------
-      call ipow46(a, 2*nx, (zstart(1)-1)*ny + (ystart(1)-1), an)
-      dummy = randlc(start, an)
-      call ipow46(a, 2*nx, ny, an)
-      
-c---------------------------------------------------------------------
-c Go through by z planes filling in one square at a time.
-c---------------------------------------------------------------------
-      do k = 1, dims(3, 1) ! nz/np2
-         x0 = start
-         call vranlc(2*nx*dims(2, 1), x0, a, u0(1, 1, k))
-         if (k .ne. dims(3, 1)) dummy = randlc(start, an)
-      end do
-
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine ipow46(a, exp_1, exp_2, result)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c compute a^exponent mod 2^46
-c---------------------------------------------------------------------
-
-      implicit none
-      double precision a, result, dummy, q, r
-      integer exp_1, exp_2, n, n2, ierr
-      external randlc
-      double precision randlc
-      logical  two_pow
-c---------------------------------------------------------------------
-c Use
-c   a^n = a^(n/2)*a^(n/2) if n even else
-c   a^n = a*a^(n-1)       if n odd
-c---------------------------------------------------------------------
-      result = 1
-      if (exp_2 .eq. 0 .or. exp_1 .eq. 0) return
-      q = a
-      r = 1
-      n = exp_1
-      two_pow = .true.
-
-      do while (two_pow)
-         n2 = n/2
-         if (n2 * 2 .eq. n) then
-            dummy = randlc(q, q)
-            n = n2
-         else
-            n = n * exp_2
-            two_pow = .false.
-         endif
-      end do
-
-      do while (n .gt. 1)
-         n2 = n/2
-         if (n2 * 2 .eq. n) then
-            dummy = randlc(q, q) 
-            n = n2
-         else
-            dummy = randlc(r, q)
-            n = n-1
-         endif
-      end do
-      dummy = randlc(r, q)
-      result = r
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine setup
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'mpinpb.h'
-      include 'global.h'
-
-      integer ierr, i, j, fstatus
-      debug = .FALSE.
-      
-      call MPI_Comm_size(MPI_COMM_WORLD, np, ierr)
-      call MPI_Comm_rank(MPI_COMM_WORLD, me, ierr)
-
-      if (.not. convertdouble) then
-         dc_type = MPI_DOUBLE_COMPLEX
-      else
-         dc_type = MPI_COMPLEX
-      endif
-
-
-      if (me .eq. 0) then
-         write(*, 1000)
-         open (unit=2,file='inputft.data',status='old', iostat=fstatus)
-
-         if (fstatus .eq. 0) then
-            write(*,233) 
- 233        format(' Reading from input file inputft.data')
-            read (2,*) niter
-            read (2,*) layout_type
-            read (2,*) np1, np2
-            close(2)
-
-c---------------------------------------------------------------------
-c check to make sure input data is consistent
-c---------------------------------------------------------------------
-
-    
-c---------------------------------------------------------------------
-c 1. product of processor grid dims must equal number of processors
-c---------------------------------------------------------------------
-
-            if (np1 * np2 .ne. np) then
-               write(*, 238)
- 238           format(' np1 and np2 given in input file are not valid.')
-               write(*, 239) np1*np2, np
- 239           format(' Product is ', i5, ' and should be ', i5)
-               call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
-               stop
-            endif
-
-c---------------------------------------------------------------------
-c 2. layout type must be valid
-c---------------------------------------------------------------------
-
-            if (layout_type .ne. layout_0D .and.
-     >          layout_type .ne. layout_1D .and.
-     >          layout_type .ne. layout_2D) then
-               write(*, 240)
- 240           format(' Layout type specified in inputft.data is 
-     >                  invalid ')
-               call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
-               stop
-            endif
-
-c---------------------------------------------------------------------
-c 3. 0D layout must be 1x1 grid
-c---------------------------------------------------------------------
-
-            if (layout_type .eq. layout_0D .and.
-     >            (np1 .ne.1 .or. np2 .ne. 1)) then
-               write(*, 241)
- 241           format(' For 0D layout, both np1 and np2 must be 1 ')
-               call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
-               stop
-            endif
-c---------------------------------------------------------------------
-c 4. 1D layout must be 1xN grid
-c---------------------------------------------------------------------
-
-            if (layout_type .eq. layout_1D .and. np1 .ne. 1) then
-               write(*, 242)
- 242           format(' For 1D layout, np1 must be 1 ')
-               call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
-               stop
-            endif
-
-         else
-            write(*,234) 
-            niter = niter_default
-            if (np .eq. 1) then
-               np1 = 1
-               np2 = 1
-               layout_type = layout_0D
-            else if (np .le. nz) then
-               np1 = 1
-               np2 = np
-               layout_type = layout_1D
-            else
-               np1 = nz
-               np2 = np/nz
-               layout_type = layout_2D
-            endif
-         endif
-
-         if (np .lt. np_min) then
-            write(*, 10) np_min
- 10         format(' Error: Compiled for ', I5, ' processors. ')
-            write(*, 11) np
- 11         format(' Only ',  i5, ' processors found ')
-            call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
-            stop
-         endif
-
- 234     format(' No input file inputft.data. Using compiled defaults')
-         write(*, 1001) nx, ny, nz
-         write(*, 1002) niter
-         write(*, 1004) np
-         write(*, 1005) np1, np2
-         if (np .ne. np_min) write(*, 1006) np_min
-
-         if (layout_type .eq. layout_0D) then
-            write(*, 1010) '0D'
-         else if (layout_type .eq. layout_1D) then
-            write(*, 1010) '1D'
-         else
-            write(*, 1010) '2D'
-         endif
-
- 1000 format(//,' NAS Parallel Benchmarks 3.3 -- FT Benchmark',/)
- 1001    format(' Size                : ', i4, 'x', i4, 'x', i4)
- 1002    format(' Iterations          : ', 7x, i7)
- 1004    format(' Number of processes : ', 7x, i7)
- 1005    format(' Processor array     : ', 5x, i4, 'x', i4)
- 1006    format(' WARNING: compiled for ', i5, ' processes. ',
-     >          ' Will not verify. ')
- 1010    format(' Layout type         : ', 9x, A5)
-      endif
-
-
-c---------------------------------------------------------------------
-c Since np1, np2 and layout_type are in a common block, 
-c this sends all three. 
-c---------------------------------------------------------------------
-      call MPI_BCAST(np1, 3, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
-      call MPI_BCAST(niter, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
-
-      if (np1 .eq. 1 .and. np2 .eq. 1) then
-        layout_type = layout_0D
-      else if (np1 .eq. 1) then
-         layout_type = layout_1D
-      else
-         layout_type = layout_2D
-      endif
-
-      if (layout_type .eq. layout_0D) then
-         do i = 1, 3
-            dims(1, i) = nx
-            dims(2, i) = ny
-            dims(3, i) = nz
-         end do
-      else if (layout_type .eq. layout_1D) then
-         dims(1, 1) = nx
-         dims(2, 1) = ny
-         dims(3, 1) = nz
-
-         dims(1, 2) = nx
-         dims(2, 2) = ny
-         dims(3, 2) = nz
-
-         dims(1, 3) = nz
-         dims(2, 3) = nx
-         dims(3, 3) = ny
-      else if (layout_type .eq. layout_2D) then
-         dims(1, 1) = nx
-         dims(2, 1) = ny
-         dims(3, 1) = nz
-
-         dims(1, 2) = ny
-         dims(2, 2) = nx
-         dims(3, 2) = nz
-
-         dims(1, 3) = nz
-         dims(2, 3) = nx
-         dims(3, 3) = ny
-
-      endif
-      do i = 1, 3
-         dims(2, i) = dims(2, i) / np1
-         dims(3, i) = dims(3, i) / np2
-      end do
-
-
-c---------------------------------------------------------------------
-c Determine processor coordinates of this processor
-c Processor grid is np1xnp2. 
-c Arrays are always (n1, n2/np1, n3/np2)
-c Processor coords are zero-based. 
-c---------------------------------------------------------------------
-      me2 = mod(me, np2)  ! goes from 0...np2-1
-      me1 = me/np2        ! goes from 0...np1-1
-c---------------------------------------------------------------------
-c Communicators for rows/columns of processor grid. 
-c commslice1 is communicator of all procs with same me1, ranked as me2
-c commslice2 is communicator of all procs with same me2, ranked as me1
-c mpi_comm_split(comm, color, key, ...)
-c---------------------------------------------------------------------
-      call MPI_Comm_split(MPI_COMM_WORLD, me1, me2, commslice1, ierr)
-      call MPI_Comm_split(MPI_COMM_WORLD, me2, me1, commslice2, ierr)
-      if (timers_enabled) call synchup()
-
-      if (debug) print *, 'proc coords: ', me, me1, me2
-
-c---------------------------------------------------------------------
-c Determine which section of the grid is owned by this
-c processor. 
-c---------------------------------------------------------------------
-      if (layout_type .eq. layout_0d) then
-
-         do i = 1, 3
-            xstart(i) = 1
-            xend(i)   = nx
-            ystart(i) = 1
-            yend(i)   = ny
-            zstart(i) = 1
-            zend(i)   = nz
-         end do
-
-      else if (layout_type .eq. layout_1d) then
-
-         xstart(1) = 1
-         xend(1)   = nx
-         ystart(1) = 1
-         yend(1)   = ny
-         zstart(1) = 1 + me2 * nz/np2
-         zend(1)   = (me2+1) * nz/np2
-
-         xstart(2) = 1
-         xend(2)   = nx
-         ystart(2) = 1
-         yend(2)   = ny
-         zstart(2) = 1 + me2 * nz/np2
-         zend(2)   = (me2+1) * nz/np2
-
-         xstart(3) = 1
-         xend(3)   = nx
-         ystart(3) = 1 + me2 * ny/np2
-         yend(3)   = (me2+1) * ny/np2
-         zstart(3) = 1
-         zend(3)   = nz
-
-      else if (layout_type .eq. layout_2d) then
-
-         xstart(1) = 1
-         xend(1)   = nx
-         ystart(1) = 1 + me1 * ny/np1
-         yend(1)   = (me1+1) * ny/np1
-         zstart(1) = 1 + me2 * nz/np2
-         zend(1)   = (me2+1) * nz/np2
-
-         xstart(2) = 1 + me1 * nx/np1
-         xend(2)   = (me1+1)*nx/np1
-         ystart(2) = 1
-         yend(2)   = ny
-         zstart(2) = zstart(1)
-         zend(2)   = zend(1)
-
-         xstart(3) = xstart(2)
-         xend(3)   = xend(2)
-         ystart(3) = 1 + me2 *ny/np2
-         yend(3)   = (me2+1)*ny/np2
-         zstart(3) = 1
-         zend(3)   = nz
-      endif
-
-c---------------------------------------------------------------------
-c Set up info for blocking of ffts and transposes.  This improves
-c performance on cache-based systems. Blocking involves
-c working on a chunk of the problem at a time, taking chunks
-c along the first, second, or third dimension. 
-c
-c - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
-c - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
-
-c Since 1st dim is always in processor, we'll assume it's long enough 
-c (default blocking factor is 16 so min size for 1st dim is 16)
-c The only case we have to worry about is cffts1 in a 2d decomposition. 
-c so the blocking factor should not be larger than the 2nd dimension. 
-c---------------------------------------------------------------------
-
-      fftblock = fftblock_default
-      fftblockpad = fftblockpad_default
-
-      if (layout_type .eq. layout_2d) then
-         if (dims(2, 1) .lt. fftblock) fftblock = dims(2, 1)
-         if (dims(2, 2) .lt. fftblock) fftblock = dims(2, 2)
-         if (dims(2, 3) .lt. fftblock) fftblock = dims(2, 3)
-      endif
-      
-      if (fftblock .ne. fftblock_default) fftblockpad = fftblock+3
-
-      return
-      end
-
-      
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine compute_indexmap(twiddle, d1, d2, d3)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 
-c for time evolution exponent. 
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'mpinpb.h'
-      include 'global.h'
-      integer d1, d2, d3
-      integer i, j, k, ii, ii2, jj, ij2, kk
-      double precision ap, twiddle(d1, d2, d3)
-
-c---------------------------------------------------------------------
-c this function is very different depending on whether 
-c we are in the 0d, 1d or 2d layout. Compute separately. 
-c basically we want to convert the fortran indices 
-c   1 2 3 4 5 6 7 8 
-c to 
-c   0 1 2 3 -4 -3 -2 -1
-c The following magic formula does the trick:
-c mod(i-1+n/2, n) - n/2
-c---------------------------------------------------------------------
-
-      ap = - 4.d0 * alpha * pi *pi
-
-      if (layout_type .eq. layout_0d) then ! xyz layout
-         do i = 1, dims(1,3)
-            ii =  mod(i+xstart(3)-2+nx/2, nx) - nx/2
-            ii2 = ii*ii
-            do j = 1, dims(2,3)
-               jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
-               ij2 = jj*jj+ii2
-               do k = 1, dims(3,3)
-                  kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
-                  twiddle(i,j,k) = dexp(ap*dfloat(kk*kk+ij2))
-               end do
-            end do
-         end do
-      else if (layout_type .eq. layout_1d) then ! zxy layout 
-         do i = 1,dims(2,3)
-            ii =  mod(i+xstart(3)-2+nx/2, nx) - nx/2
-            ii2 = ii*ii
-            do j = 1,dims(3,3)
-               jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
-               ij2 = jj*jj+ii2
-               do k = 1,dims(1,3)
-                  kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
-                  twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2))
-               end do
-            end do
-         end do
-      else if (layout_type .eq. layout_2d) then ! zxy layout
-         do i = 1,dims(2,3)
-            ii =  mod(i+xstart(3)-2+nx/2, nx) - nx/2
-            ii2 = ii*ii
-            do j = 1, dims(3,3)
-               jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
-               ij2 = jj*jj+ii2
-               do k =1,dims(1,3)
-                  kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
-                  twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2))
-               end do
-            end do
-         end do
-      else
-         print *, ' Unknown layout type ', layout_type
-         stop
-      endif
-
-      return
-      end
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine print_timers()
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      integer i
-      include 'global.h'
-      character*25 tstrings(T_max)
-      data tstrings / '          total ', 
-     >                '          setup ', 
-     >                '            fft ', 
-     >                '         evolve ', 
-     >                '       checksum ', 
-     >                '         fftlow ', 
-     >                '        fftcopy ', 
-     >                '      transpose ', 
-     >                ' transpose1_loc ', 
-     >                ' transpose1_glo ', 
-     >                ' transpose1_fin ', 
-     >                ' transpose2_loc ', 
-     >                ' transpose2_glo ', 
-     >                ' transpose2_fin ', 
-     >                '           sync ' /
-
-      if (me .ne. 0) return
-      do i = 1, t_max
-         if (timer_read(i) .ne. 0.0d0) then
-            write(*, 100) i, tstrings(i), timer_read(i)
-         endif
-      end do
- 100  format(' timer ', i2, '(', A16,  ') :', F10.6)
-      return
-      end
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine fft(dir, x1, x2)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'global.h'
-      integer dir
-      double complex x1(ntdivnp), x2(ntdivnp)
-
-      double complex scratch(fftblockpad_default*maxdim*2)
-
-c---------------------------------------------------------------------
-c note: args x1, x2 must be different arrays
-c note: args for cfftsx are (direction, layout, xin, xout, scratch)
-c       xin/xout may be the same and it can be somewhat faster
-c       if they are
-c note: args for transpose are (layout1, layout2, xin, xout)
-c       xin/xout must be different
-c---------------------------------------------------------------------
-
-      if (dir .eq. 1) then
-         if (layout_type .eq. layout_0d) then
-            call cffts1(1, dims(1,1), dims(2,1), dims(3,1), 
-     >                  x1, x1, scratch)
-            call cffts2(1, dims(1,2), dims(2,2), dims(3,2), 
-     >                  x1, x1, scratch)
-            call cffts3(1, dims(1,3), dims(2,3), dims(3,3), 
-     >                  x1, x2, scratch)
-         else if (layout_type .eq. layout_1d) then
-            call cffts1(1, dims(1,1), dims(2,1), dims(3,1), 
-     >                  x1, x1, scratch)
-            call cffts2(1, dims(1,2), dims(2,2), dims(3,2), 
-     >                  x1, x1, scratch)
-            if (timers_enabled) call timer_start(T_transpose)
-            call transpose_xy_z(2, 3, x1, x2)
-            if (timers_enabled) call timer_stop(T_transpose)
-            call cffts1(1, dims(1,3), dims(2,3), dims(3,3), 
-     >                  x2, x2, scratch)
-         else if (layout_type .eq. layout_2d) then
-            call cffts1(1, dims(1,1), dims(2,1), dims(3,1), 
-     >                  x1, x1, scratch)
-            if (timers_enabled) call timer_start(T_transpose)
-            call transpose_x_y(1, 2, x1, x2)
-            if (timers_enabled) call timer_stop(T_transpose)
-            call cffts1(1, dims(1,2), dims(2,2), dims(3,2), 
-     >                  x2, x2, scratch)
-            if (timers_enabled) call timer_start(T_transpose)
-            call transpose_x_z(2, 3, x2, x1)
-            if (timers_enabled) call timer_stop(T_transpose)
-            call cffts1(1, dims(1,3), dims(2,3), dims(3,3), 
-     >                  x1, x2, scratch)
-         endif
-      else
-         if (layout_type .eq. layout_0d) then
-            call cffts3(-1, dims(1,3), dims(2,3), dims(3,3), 
-     >                  x1, x1, scratch)
-            call cffts2(-1, dims(1,2), dims(2,2), dims(3,2), 
-     >                  x1, x1, scratch)
-            call cffts1(-1, dims(1,1), dims(2,1), dims(3,1), 
-     >                  x1, x2, scratch)
-         else if (layout_type .eq. layout_1d) then
-            call cffts1(-1, dims(1,3), dims(2,3), dims(3,3), 
-     >                  x1, x1, scratch)
-            if (timers_enabled) call timer_start(T_transpose)
-            call transpose_x_yz(3, 2, x1, x2)
-            if (timers_enabled) call timer_stop(T_transpose)
-            call cffts2(-1, dims(1,2), dims(2,2), dims(3,2), 
-     >                  x2, x2, scratch)
-            call cffts1(-1, dims(1,1), dims(2,1), dims(3,1), 
-     >                  x2, x2, scratch)
-         else if (layout_type .eq. layout_2d) then
-            call cffts1(-1, dims(1,3), dims(2,3), dims(3,3), 
-     >                  x1, x1, scratch)
-            if (timers_enabled) call timer_start(T_transpose)
-            call transpose_x_z(3, 2, x1, x2)
-            if (timers_enabled) call timer_stop(T_transpose)
-            call cffts1(-1, dims(1,2), dims(2,2), dims(3,2), 
-     >                  x2, x2, scratch)
-            if (timers_enabled) call timer_start(T_transpose)
-            call transpose_x_y(2, 1, x2, x1)
-            if (timers_enabled) call timer_stop(T_transpose)
-            call cffts1(-1, dims(1,1), dims(2,1), dims(3,1), 
-     >                  x1, x2, scratch)
-         endif
-      endif
-      return
-      end
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine cffts1(is, d1, d2, d3, x, xout, y)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'global.h'
-      integer is, d1, d2, d3, logd1
-      double complex x(d1,d2,d3)
-      double complex xout(d1,d2,d3)
-      double complex y(fftblockpad, d1, 2) 
-      integer i, j, k, jj
-
-      logd1 = ilog2(d1)
-
-      do k = 1, d3
-         do jj = 0, d2 - fftblock, fftblock
-            if (timers_enabled) call timer_start(T_fftcopy)
-            do j = 1, fftblock
-               do i = 1, d1
-                  y(j,i,1) = x(i,j+jj,k)
-               enddo
-            enddo
-            if (timers_enabled) call timer_stop(T_fftcopy)
-            
-            if (timers_enabled) call timer_start(T_fftlow)
-            call cfftz (is, logd1, d1, y, y(1,1,2))
-            if (timers_enabled) call timer_stop(T_fftlow)
-
-            if (timers_enabled) call timer_start(T_fftcopy)
-            do j = 1, fftblock
-               do i = 1, d1
-                  xout(i,j+jj,k) = y(j,i,1)
-               enddo
-            enddo
-            if (timers_enabled) call timer_stop(T_fftcopy)
-         enddo
-      enddo
-
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine cffts2(is, d1, d2, d3, x, xout, y)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'global.h'
-      integer is, d1, d2, d3, logd2
-      double complex x(d1,d2,d3)
-      double complex xout(d1,d2,d3)
-      double complex y(fftblockpad, d2, 2) 
-      integer i, j, k, ii
-
-      logd2 = ilog2(d2)
-
-      do k = 1, d3
-        do ii = 0, d1 - fftblock, fftblock
-           if (timers_enabled) call timer_start(T_fftcopy)
-           do j = 1, d2
-              do i = 1, fftblock
-                 y(i,j,1) = x(i+ii,j,k)
-              enddo
-           enddo
-           if (timers_enabled) call timer_stop(T_fftcopy)
-
-           if (timers_enabled) call timer_start(T_fftlow)
-           call cfftz (is, logd2, d2, y, y(1, 1, 2))
-           if (timers_enabled) call timer_stop(T_fftlow)
-
-           if (timers_enabled) call timer_start(T_fftcopy)
-           do j = 1, d2
-              do i = 1, fftblock
-                 xout(i+ii,j,k) = y(i,j,1)
-              enddo
-           enddo
-           if (timers_enabled) call timer_stop(T_fftcopy)
-        enddo
-      enddo
-
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine cffts3(is, d1, d2, d3, x, xout, y)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'global.h'
-      integer is, d1, d2, d3, logd3
-      double complex x(d1,d2,d3)
-      double complex xout(d1,d2,d3)
-      double complex y(fftblockpad, d3, 2) 
-      integer i, j, k, ii
-
-      logd3 = ilog2(d3)
-
-      do j = 1, d2
-        do ii = 0, d1 - fftblock, fftblock
-           if (timers_enabled) call timer_start(T_fftcopy)
-           do k = 1, d3
-              do i = 1, fftblock
-                 y(i,k,1) = x(i+ii,j,k)
-              enddo
-           enddo
-           if (timers_enabled) call timer_stop(T_fftcopy)
-
-           if (timers_enabled) call timer_start(T_fftlow)
-           call cfftz (is, logd3, d3, y, y(1, 1, 2))
-           if (timers_enabled) call timer_stop(T_fftlow)
-
-           if (timers_enabled) call timer_start(T_fftcopy)
-           do k = 1, d3
-              do i = 1, fftblock
-                 xout(i+ii,j,k) = y(i,k,1)
-              enddo
-           enddo
-           if (timers_enabled) call timer_stop(T_fftcopy)
-        enddo
-      enddo
-
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine fft_init (n)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c compute the roots-of-unity array that will be used for subsequent FFTs. 
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'global.h'
-
-      integer m,n,nu,ku,i,j,ln
-      double precision t, ti
-
-
-c---------------------------------------------------------------------
-c   Initialize the U array with sines and cosines in a manner that permits
-c   stride one access at each FFT iteration.
-c---------------------------------------------------------------------
-      nu = n
-      m = ilog2(n)
-      u(1) = m
-      ku = 2
-      ln = 1
-
-      do j = 1, m
-         t = pi / ln
-         
-         do i = 0, ln - 1
-            ti = i * t
-            u(i+ku) = dcmplx (cos (ti), sin(ti))
-         enddo
-         
-         ku = ku + ln
-         ln = 2 * ln
-      enddo
-      
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine cfftz (is, m, n, x, y)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   Computes NY N-point complex-to-complex FFTs of X using an algorithm due
-c   to Swarztrauber.  X is both the input and the output array, while Y is a 
-c   scratch array.  It is assumed that N = 2^M.  Before calling CFFTZ to 
-c   perform FFTs, the array U must be initialized by calling CFFTZ with IS 
-c   set to 0 and M set to MX, where MX is the maximum value of M for any 
-c   subsequent call.
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'global.h'
-
-      integer is,m,n,i,j,l,mx
-      double complex x, y
-
-      dimension x(fftblockpad,n), y(fftblockpad,n)
-
-c---------------------------------------------------------------------
-c   Check if input parameters are invalid.
-c---------------------------------------------------------------------
-      mx = u(1)
-      if ((is .ne. 1 .and. is .ne. -1) .or. m .lt. 1 .or. m .gt. mx)    
-     >  then
-        write (*, 1)  is, m, mx
- 1      format ('CFFTZ: Either U has not been initialized, or else'/    
-     >    'one of the input parameters is invalid', 3I5)
-        stop
-      endif
-
-c---------------------------------------------------------------------
-c   Perform one variant of the Stockham FFT.
-c---------------------------------------------------------------------
-      do l = 1, m, 2
-        call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y)
-        if (l .eq. m) goto 160
-        call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x)
-      enddo
-
-      goto 180
-
-c---------------------------------------------------------------------
-c   Copy Y to X.
-c---------------------------------------------------------------------
- 160  do j = 1, n
-        do i = 1, fftblock
-          x(i,j) = y(i,j)
-        enddo
-      enddo
-
- 180  continue
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   Performs the L-th iteration of the second variant of the Stockham FFT.
-c---------------------------------------------------------------------
-
-      implicit none
-
-      integer is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22
-      double complex u,x,y,u1,x11,x21
-      dimension u(n), x(ny1,n), y(ny1,n)
-
-
-c---------------------------------------------------------------------
-c   Set initial parameters.
-c---------------------------------------------------------------------
-
-      n1 = n / 2
-      lk = 2 ** (l - 1)
-      li = 2 ** (m - l)
-      lj = 2 * lk
-      ku = li + 1
-
-      do i = 0, li - 1
-        i11 = i * lk + 1
-        i12 = i11 + n1
-        i21 = i * lj + 1
-        i22 = i21 + lk
-        if (is .ge. 1) then
-          u1 = u(ku+i)
-        else
-          u1 = dconjg (u(ku+i))
-        endif
-
-c---------------------------------------------------------------------
-c   This loop is vectorizable.
-c---------------------------------------------------------------------
-        do k = 0, lk - 1
-          do j = 1, ny
-            x11 = x(j,i11+k)
-            x21 = x(j,i12+k)
-            y(j,i21+k) = x11 + x21
-            y(j,i22+k) = u1 * (x11 - x21)
-          enddo
-        enddo
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      integer function ilog2(n)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      integer n, nn, lg
-      if (n .eq. 1) then
-         ilog2=0
-         return
-      endif
-      lg = 1
-      nn = 2
-      do while (nn .lt. n)
-         nn = nn*2
-         lg = lg+1
-      end do
-      ilog2 = lg
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine transpose_x_yz(l1, l2, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'global.h'
-      integer l1, l2
-      double complex xin(ntdivnp), xout(ntdivnp)
-
-      call transpose2_local(dims(1,l1),dims(2, l1)*dims(3, l1),
-     >                          xin, xout)
-
-      call transpose2_global(xout, xin)
-
-      call transpose2_finish(dims(1,l1),dims(2, l1)*dims(3, l1),
-     >                          xin, xout)
-
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine transpose_xy_z(l1, l2, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'global.h'
-      integer l1, l2
-      double complex xin(ntdivnp), xout(ntdivnp)
-
-      call transpose2_local(dims(1,l1)*dims(2, l1),dims(3, l1),
-     >                          xin, xout)
-      call transpose2_global(xout, xin)
-      call transpose2_finish(dims(1,l1)*dims(2, l1),dims(3, l1),
-     >                          xin, xout)
-
-      return
-      end
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine transpose2_local(n1, n2, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'mpinpb.h'
-      include 'global.h'
-      integer n1, n2
-      double complex xin(n1, n2), xout(n2, n1)
-      
-      double complex z(transblockpad, transblock)
-
-      integer i, j, ii, jj
-
-      if (timers_enabled) call timer_start(T_transxzloc)
-
-c---------------------------------------------------------------------
-c If possible, block the transpose for cache memory systems. 
-c How much does this help? Example: R8000 Power Challenge (90 MHz)
-c Blocked version decreases time spend in this routine 
-c from 14 seconds to 5.2 seconds on 8 nodes class A.
-c---------------------------------------------------------------------
-
-      if (n1 .lt. transblock .or. n2 .lt. transblock) then
-         if (n1 .ge. n2) then 
-            do j = 1, n2
-               do i = 1, n1
-                  xout(j, i) = xin(i, j)
-               end do
-            end do
-         else
-            do i = 1, n1
-               do j = 1, n2
-                  xout(j, i) = xin(i, j)
-               end do
-            end do
-         endif
-      else
-         do j = 0, n2-1, transblock
-            do i = 0, n1-1, transblock
-               
-c---------------------------------------------------------------------
-c Note: compiler should be able to take j+jj out of inner loop
-c---------------------------------------------------------------------
-               do jj = 1, transblock
-                  do ii = 1, transblock
-                     z(jj,ii) = xin(i+ii, j+jj)
-                  end do
-               end do
-               
-               do ii = 1, transblock
-                  do jj = 1, transblock
-                     xout(j+jj, i+ii) = z(jj,ii)
-                  end do
-               end do
-               
-            end do
-         end do
-      endif
-      if (timers_enabled) call timer_stop(T_transxzloc)
-
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine transpose2_global(xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'global.h'
-      include 'mpinpb.h'
-      double complex xin(ntdivnp)
-      double complex xout(ntdivnp) 
-      integer ierr
-
-      if (timers_enabled) call synchup()
-
-      if (timers_enabled) call timer_start(T_transxzglo)
-      call mpi_alltoall(xin, ntdivnp/np, dc_type,
-     >                  xout, ntdivnp/np, dc_type,
-     >                  commslice1, ierr)
-      if (timers_enabled) call timer_stop(T_transxzglo)
-
-      return
-      end
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine transpose2_finish(n1, n2, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'global.h'
-      integer n1, n2, ioff
-      double complex xin(n2, n1/np2, 0:np2-1), xout(n2*np2, n1/np2)
-      
-      integer i, j, p
-
-      if (timers_enabled) call timer_start(T_transxzfin)
-      do p = 0, np2-1
-         ioff = p*n2
-         do j = 1, n1/np2
-            do i = 1, n2
-               xout(i+ioff, j) = xin(i, j, p)
-            end do
-         end do
-      end do
-      if (timers_enabled) call timer_stop(T_transxzfin)
-
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine transpose_x_z(l1, l2, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'global.h'
-      integer l1, l2
-      double complex xin(ntdivnp), xout(ntdivnp)
-
-      call transpose_x_z_local(dims(1,l1),dims(2,l1),dims(3,l1),
-     >                         xin, xout)
-      call transpose_x_z_global(dims(1,l1),dims(2,l1),dims(3,l1), 
-     >                          xout, xin)
-      call transpose_x_z_finish(dims(1,l2),dims(2,l2),dims(3,l2), 
-     >                          xin, xout)
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine transpose_x_z_local(d1, d2, d3, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'global.h'
-      integer d1, d2, d3
-      double complex xin(d1,d2,d3)
-      double complex xout(d3,d2,d1)
-      integer block1, block3
-      integer i, j, k, kk, ii, i1, k1
-
-      double complex buf(transblockpad, maxdim)
-      if (timers_enabled) call timer_start(T_transxzloc)
-      if (d1 .lt. 32) goto 100
-      block3 = d3
-      if (block3 .eq. 1)  goto 100
-      if (block3 .gt. transblock) block3 = transblock
-      block1 = d1
-      if (block1*block3 .gt. transblock*transblock) 
-     >          block1 = transblock*transblock/block3
-c---------------------------------------------------------------------
-c blocked transpose
-c---------------------------------------------------------------------
-      do j = 1, d2
-         do kk = 0, d3-block3, block3
-            do ii = 0, d1-block1, block1
-               
-               do k = 1, block3
-                  k1 = k + kk
-                  do i = 1, block1
-                     buf(k, i) = xin(i+ii, j, k1)
-                  end do
-               end do
-
-               do i = 1, block1
-                  i1 = i + ii
-                  do k = 1, block3
-                     xout(k+kk, j, i1) = buf(k, i)
-                  end do
-               end do
-
-            end do
-         end do
-      end do
-      goto 200
-      
-
-c---------------------------------------------------------------------
-c basic transpose
-c---------------------------------------------------------------------
- 100  continue
-      
-      do j = 1, d2
-         do k = 1, d3
-            do i = 1, d1
-               xout(k, j, i) = xin(i, j, k)
-            end do
-         end do
-      end do
-
-c---------------------------------------------------------------------
-c all done
-c---------------------------------------------------------------------
- 200  continue
-
-      if (timers_enabled) call timer_stop(T_transxzloc)
-      return 
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine transpose_x_z_global(d1, d2, d3, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'global.h'
-      include 'mpinpb.h'
-      integer d1, d2, d3
-      double complex xin(d3,d2,d1)
-      double complex xout(d3,d2,d1) ! not real layout, but right size
-      integer ierr
-
-      if (timers_enabled) call synchup()
-
-c---------------------------------------------------------------------
-c do transpose among all  processes with same 1-coord (me1)
-c---------------------------------------------------------------------
-      if (timers_enabled)call timer_start(T_transxzglo)
-      call mpi_alltoall(xin, d1*d2*d3/np2, dc_type,
-     >                  xout, d1*d2*d3/np2, dc_type,
-     >                  commslice1, ierr)
-      if (timers_enabled) call timer_stop(T_transxzglo)
-      return
-      end
-      
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine transpose_x_z_finish(d1, d2, d3, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'global.h'
-      integer d1, d2, d3
-      double complex xin(d1/np2, d2, d3, 0:np2-1)
-      double complex xout(d1,d2,d3)
-      integer i, j, k, p, ioff
-      if (timers_enabled) call timer_start(T_transxzfin)
-c---------------------------------------------------------------------
-c this is the most straightforward way of doing it. the
-c calculation in the inner loop doesn't help. 
-c      do i = 1, d1/np2
-c         do j = 1, d2
-c            do k = 1, d3
-c               do p = 0, np2-1
-c                  ii = i + p*d1/np2
-c                  xout(ii, j, k) = xin(i, j, k, p)
-c               end do
-c            end do
-c         end do
-c      end do
-c---------------------------------------------------------------------
-
-      do p = 0, np2-1
-         ioff = p*d1/np2
-         do k = 1, d3
-            do j = 1, d2
-               do i = 1, d1/np2
-                  xout(i+ioff, j, k) = xin(i, j, k, p)
-               end do
-            end do
-         end do
-      end do
-      if (timers_enabled) call timer_stop(T_transxzfin)
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine transpose_x_y(l1, l2, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'global.h'
-      integer l1, l2
-      double complex xin(ntdivnp), xout(ntdivnp)
-
-c---------------------------------------------------------------------
-c xy transpose is a little tricky, since we don't want
-c to touch 3rd axis. But alltoall must involve 3rd axis (most 
-c slowly varying) to be efficient. So we do
-c (nx, ny/np1, nz/np2) -> (ny/np1, nz/np2, nx) (local)
-c (ny/np1, nz/np2, nx) -> ((ny/np1*nz/np2)*np1, nx/np1) (global)
-c then local finish. 
-c---------------------------------------------------------------------
-
-
-      call transpose_x_y_local(dims(1,l1),dims(2,l1),dims(3,l1),
-     >                         xin, xout)
-      call transpose_x_y_global(dims(1,l1),dims(2,l1),dims(3,l1), 
-     >                          xout, xin)
-      call transpose_x_y_finish(dims(1,l2),dims(2,l2),dims(3,l2), 
-     >                          xin, xout)
-
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine transpose_x_y_local(d1, d2, d3, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'global.h'
-      integer d1, d2, d3
-      double complex xin(d1, d2, d3)
-      double complex xout(d2, d3, d1)
-      integer i, j, k
-      if (timers_enabled) call timer_start(T_transxyloc)
-
-      do k = 1, d3
-         do i = 1, d1
-            do j = 1, d2
-               xout(j,k,i)=xin(i,j,k)
-            end do
-         end do
-      end do
-      if (timers_enabled) call timer_stop(T_transxyloc)
-      return 
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine transpose_x_y_global(d1, d2, d3, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'global.h'
-      include 'mpinpb.h'
-      integer d1, d2, d3
-c---------------------------------------------------------------------
-c array is in form (ny/np1, nz/np2, nx)
-c---------------------------------------------------------------------
-      double complex xin(d2,d3,d1)
-      double complex xout(d2,d3,d1) ! not real layout but right size
-      integer ierr
-
-      if (timers_enabled) call synchup()
-
-c---------------------------------------------------------------------
-c do transpose among all processes with same 1-coord (me1)
-c---------------------------------------------------------------------
-      if (timers_enabled) call timer_start(T_transxyglo)
-      call mpi_alltoall(xin, d1*d2*d3/np1, dc_type,
-     >                  xout, d1*d2*d3/np1, dc_type,
-     >                  commslice2, ierr)
-      if (timers_enabled) call timer_stop(T_transxyglo)
-
-      return
-      end
-      
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine transpose_x_y_finish(d1, d2, d3, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'global.h'
-      integer d1, d2, d3
-      double complex xin(d1/np1, d3, d2, 0:np1-1)
-      double complex xout(d1,d2,d3)
-      integer i, j, k, p, ioff
-      if (timers_enabled) call timer_start(T_transxyfin)
-c---------------------------------------------------------------------
-c this is the most straightforward way of doing it. the
-c calculation in the inner loop doesn't help. 
-c      do i = 1, d1/np1
-c         do j = 1, d2
-c            do k = 1, d3
-c               do p = 0, np1-1
-c                  ii = i + p*d1/np1
-c note order is screwy bcz we have (ny/np1, nz/np2, nx) -> (ny, nx/np1, nz/np2)
-c                  xout(ii, j, k) = xin(i, k, j, p)
-c               end do
-c            end do
-c         end do
-c      end do
-c---------------------------------------------------------------------
-
-      do p = 0, np1-1
-         ioff = p*d1/np1
-         do k = 1, d3
-            do j = 1, d2
-               do i = 1, d1/np1
-                  xout(i+ioff, j, k) = xin(i, k, j, p)
-               end do
-            end do
-         end do
-      end do
-      if (timers_enabled) call timer_stop(T_transxyfin)
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine checksum(i, u1, d1, d2, d3)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'global.h'
-      include 'mpinpb.h'
-      integer i, d1, d2, d3
-      double complex u1(d1, d2, d3)
-      integer j, q,r,s, ierr
-      double complex chk,allchk
-      chk = (0.0,0.0)
-
-      do j=1,1024
-         q = mod(j, nx)+1
-         if (q .ge. xstart(1) .and. q .le. xend(1)) then
-            r = mod(3*j,ny)+1
-            if (r .ge. ystart(1) .and. r .le. yend(1)) then
-               s = mod(5*j,nz)+1
-               if (s .ge. zstart(1) .and. s .le. zend(1)) then
-                  chk=chk+u1(q-xstart(1)+1,r-ystart(1)+1,s-zstart(1)+1)
-               end if
-            end if
-         end if
-      end do
-      chk = chk/ntotal_f
-      
-      call MPI_Reduce(chk, allchk, 1, dc_type, MPI_SUM, 
-     >                0, MPI_COMM_WORLD, ierr)      
-      if (me .eq. 0) then
-            write (*, 30) i, allchk
- 30         format (' T =',I5,5X,'Checksum =',1P2D22.12)
-      endif
-
-c      sums(i) = allchk
-c     If we compute the checksum for diagnostic purposes, we let i be
-c     negative, so the result will not be stored in an array
-      if (i .gt. 0) sums(i) = allchk
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine synchup
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'global.h'
-      include 'mpinpb.h'
-      integer ierr
-      call timer_start(T_synch)
-      call mpi_barrier(MPI_COMM_WORLD, ierr)
-      call timer_stop(T_synch)
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine verify (d1, d2, d3, nt, verified, class)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'global.h'
-      include 'mpinpb.h'
-      integer d1, d2, d3, nt
-      character class
-      logical verified
-      integer ierr, size, i
-      double precision err, epsilon
-
-c---------------------------------------------------------------------
-c   Reference checksums
-c---------------------------------------------------------------------
-      double complex csum_ref(25)
-
-
-      class = 'U'
-
-      if (me .ne. 0) return
-
-      epsilon = 1.0d-12
-      verified = .FALSE.
-
-      if (d1 .eq. 64 .and.
-     >    d2 .eq. 64 .and.
-     >    d3 .eq. 64 .and.
-     >    nt .eq. 6) then
-c---------------------------------------------------------------------
-c   Sample size reference checksums
-c---------------------------------------------------------------------
-         class = 'S'
-         csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02)
-         csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02)
-         csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02)
-         csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02)
-         csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02)
-         csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02)
-
-      else if (d1 .eq. 128 .and.
-     >    d2 .eq. 128 .and.
-     >    d3 .eq. 32 .and.
-     >    nt .eq. 6) then
-c---------------------------------------------------------------------
-c   Class W size reference checksums
-c---------------------------------------------------------------------
-         class = 'W'
-         csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02)
-         csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02)
-         csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02)
-         csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02)
-         csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02)
-         csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02)
-
-      else if (d1 .eq. 256 .and.
-     >    d2 .eq. 256 .and.
-     >    d3 .eq. 128 .and.
-     >    nt .eq. 6) then
-c---------------------------------------------------------------------
-c   Class A size reference checksums
-c---------------------------------------------------------------------
-         class = 'A'
-         csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02)
-         csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02)
-         csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02)
-         csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02)
-         csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02)
-         csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02)
-      
-      else if (d1 .eq. 512 .and.
-     >    d2 .eq. 256 .and.
-     >    d3 .eq. 256 .and.
-     >    nt .eq. 20) then
-c---------------------------------------------------------------------
-c   Class B size reference checksums
-c---------------------------------------------------------------------
-         class = 'B'
-         csum_ref(1)  = dcmplx(5.177643571579D+02, 5.077803458597D+02)
-         csum_ref(2)  = dcmplx(5.154521291263D+02, 5.088249431599D+02)
-         csum_ref(3)  = dcmplx(5.146409228649D+02, 5.096208912659D+02)
-         csum_ref(4)  = dcmplx(5.142378756213D+02, 5.101023387619D+02)
-         csum_ref(5)  = dcmplx(5.139626667737D+02, 5.103976610617D+02)
-         csum_ref(6)  = dcmplx(5.137423460082D+02, 5.105948019802D+02)
-         csum_ref(7)  = dcmplx(5.135547056878D+02, 5.107404165783D+02)
-         csum_ref(8)  = dcmplx(5.133910925466D+02, 5.108576573661D+02)
-         csum_ref(9)  = dcmplx(5.132470705390D+02, 5.109577278523D+02)
-         csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02)
-         csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02)
-         csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02)
-         csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02)
-         csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02)
-         csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02)
-         csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02)
-         csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02)
-         csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02)
-         csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02)
-         csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02)
-
-      else if (d1 .eq. 512 .and.
-     >    d2 .eq. 512 .and.
-     >    d3 .eq. 512 .and.
-     >    nt .eq. 20) then
-c---------------------------------------------------------------------
-c   Class C size reference checksums
-c---------------------------------------------------------------------
-         class = 'C'
-         csum_ref(1)  = dcmplx(5.195078707457D+02, 5.149019699238D+02)
-         csum_ref(2)  = dcmplx(5.155422171134D+02, 5.127578201997D+02)
-         csum_ref(3)  = dcmplx(5.144678022222D+02, 5.122251847514D+02)
-         csum_ref(4)  = dcmplx(5.140150594328D+02, 5.121090289018D+02)
-         csum_ref(5)  = dcmplx(5.137550426810D+02, 5.121143685824D+02)
-         csum_ref(6)  = dcmplx(5.135811056728D+02, 5.121496764568D+02)
-         csum_ref(7)  = dcmplx(5.134569343165D+02, 5.121870921893D+02)
-         csum_ref(8)  = dcmplx(5.133651975661D+02, 5.122193250322D+02)
-         csum_ref(9)  = dcmplx(5.132955192805D+02, 5.122454735794D+02)
-         csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02)
-         csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02)
-         csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02)
-         csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02)
-         csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02)
-         csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02)
-         csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02)
-         csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02)
-         csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02)
-         csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02)
-         csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02)
-
-      else if (d1 .eq. 2048 .and.
-     >    d2 .eq. 1024 .and.
-     >    d3 .eq. 1024 .and.
-     >    nt .eq. 25) then
-c---------------------------------------------------------------------
-c   Class D size reference checksums
-c---------------------------------------------------------------------
-         class = 'D'
-         csum_ref(1)  = dcmplx(5.122230065252D+02, 5.118534037109D+02)
-         csum_ref(2)  = dcmplx(5.120463975765D+02, 5.117061181082D+02)
-         csum_ref(3)  = dcmplx(5.119865766760D+02, 5.117096364601D+02)
-         csum_ref(4)  = dcmplx(5.119518799488D+02, 5.117373863950D+02)
-         csum_ref(5)  = dcmplx(5.119269088223D+02, 5.117680347632D+02)
-         csum_ref(6)  = dcmplx(5.119082416858D+02, 5.117967875532D+02)
-         csum_ref(7)  = dcmplx(5.118943814638D+02, 5.118225281841D+02)
-         csum_ref(8)  = dcmplx(5.118842385057D+02, 5.118451629348D+02)
-         csum_ref(9)  = dcmplx(5.118769435632D+02, 5.118649119387D+02)
-         csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02)
-         csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02)
-         csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02)
-         csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02)
-         csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02)
-         csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02)
-         csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02)
-         csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02)
-         csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02)
-         csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02)
-         csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02)
-         csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02)
-         csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02)
-         csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02)
-         csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02)
-         csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02)
-
-      else if (d1 .eq. 4096 .and.
-     >    d2 .eq. 2048 .and.
-     >    d3 .eq. 2048 .and.
-     >    nt .eq. 25) then
-c---------------------------------------------------------------------
-c   Class E size reference checksums
-c---------------------------------------------------------------------
-         class = 'E'
-         csum_ref(1)  = dcmplx(5.121601045346D+02, 5.117395998266D+02)
-         csum_ref(2)  = dcmplx(5.120905403678D+02, 5.118614716182D+02)
-         csum_ref(3)  = dcmplx(5.120623229306D+02, 5.119074203747D+02)
-         csum_ref(4)  = dcmplx(5.120438418997D+02, 5.119345900733D+02)
-         csum_ref(5)  = dcmplx(5.120311521872D+02, 5.119551325550D+02)
-         csum_ref(6)  = dcmplx(5.120226088809D+02, 5.119720179919D+02)
-         csum_ref(7)  = dcmplx(5.120169296534D+02, 5.119861371665D+02)
-         csum_ref(8)  = dcmplx(5.120131225172D+02, 5.119979364402D+02)
-         csum_ref(9)  = dcmplx(5.120104767108D+02, 5.120077674092D+02)
-         csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02)
-         csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02)
-         csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02)
-         csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02)
-         csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02)
-         csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02)
-         csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02)
-         csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02)
-         csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02)
-         csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02)
-         csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02)
-         csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02)
-         csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02)
-         csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02)
-         csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02)
-         csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02)
-
-      endif
-
-
-      if (class .ne. 'U') then
-
-         do i = 1, nt
-            err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) )
-            if (.not.(err .le. epsilon)) goto 100
-         end do
-         verified = .TRUE.
- 100     continue
-
-      endif
-
-      call MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)
-      if (size .ne. np) then
-         write(*, 4010) np
-         write(*, 4011)
-         write(*, 4012)
-c---------------------------------------------------------------------
-c multiple statements because some Fortran compilers have
-c problems with long strings. 
-c---------------------------------------------------------------------
- 4010    format( ' Warning: benchmark was compiled for ', i5, 
-     >           'processors')
- 4011    format( ' Must be run on this many processors for official',
-     >           ' verification')
- 4012    format( ' so memory access is repeatable')
-         verified = .false.
-      endif
-         
-      if (class .ne. 'U') then
-         if (verified) then
-            write(*,2000)
- 2000       format(' Result verification successful')
-         else
-            write(*,2001)
- 2001       format(' Result verification failed')
-         endif
-      endif
-      print *, 'class = ', class
-
-      return
-      end
-
-
diff --git a/examples/smpi/NAS/FT/global.h b/examples/smpi/NAS/FT/global.h
deleted file mode 100644
index 3e534bb48d..0000000000
--- a/examples/smpi/NAS/FT/global.h
+++ /dev/null
@@ -1,134 +0,0 @@
-      include 'npbparams.h'
-
-c 2D processor array -> 2D grid decomposition (by pencils)
-c If processor array is 1xN or -> 1D grid decomposition (by planes)
-c If processor array is 1x1 -> 0D grid decomposition
-c For simplicity, do not treat Nx1 (np2 = 1) specially
-      integer np1, np2, np
-
-c basic decomposition strategy
-      integer layout_type
-      integer layout_0D, layout_1D, layout_2D
-      parameter (layout_0D = 0, layout_1D = 1, layout_2D = 2)
-
-      common /procgrid/ np1, np2, layout_type, np
-
-
-c Cache blocking params. These values are good for most
-c RISC processors.  
-c FFT parameters:
-c  fftblock controls how many ffts are done at a time. 
-c  The default is appropriate for most cache-based machines
-c  On vector machines, the FFT can be vectorized with vector
-c  length equal to the block size, so the block size should
-c  be as large as possible. This is the size of the smallest
-c  dimension of the problem: 128 for class A, 256 for class B and
-c  512 for class C.
-c Transpose parameters:
-c  transblock is the blocking factor for the transposes when there
-c  is a 1-D layout. On vector machines it should probably be
-c  large (largest dimension of the problem).
-
-
-      integer fftblock_default, fftblockpad_default
-      parameter (fftblock_default=16, fftblockpad_default=18)
-      integer transblock, transblockpad
-      parameter(transblock=32, transblockpad=34)
-      
-      integer fftblock, fftblockpad
-      common /blockinfo/ fftblock, fftblockpad
-
-c we need a bunch of logic to keep track of how
-c arrays are laid out. 
-c coords of this processor
-      integer me, me1, me2
-      common /coords/ me, me1, me2
-c need a communicator for row/col in processor grid
-      integer commslice1, commslice2
-      common /comms/ commslice1, commslice2
-
-
-
-c There are basically three stages
-c 1: x-y-z layout
-c 2: after x-transform (before y)
-c 3: after y-transform (before z)
-c The computation proceeds logically as
-
-c set up initial conditions
-c fftx(1)
-c transpose (1->2)
-c ffty(2)
-c transpose (2->3)
-c fftz(3)
-c time evolution
-c fftz(3)
-c transpose (3->2)
-c ffty(2)
-c transpose (2->1)
-c fftx(1)
-c compute residual(1)
-
-c for the 0D, 1D, 2D strategies, the layouts look like xxx
-c        
-c            0D        1D        2D
-c 1:        xyz       xyz       xyz
-c 2:        xyz       xyz       yxz
-c 3:        xyz       zyx       zxy
-
-c the array dimensions are stored in dims(coord, phase)
-      integer dims(3, 3)
-      integer xstart(3), ystart(3), zstart(3)
-      integer xend(3), yend(3), zend(3)
-      common /layout/ dims,
-     >                xstart, ystart, zstart, 
-     >                xend, yend, zend
-
-      integer T_total, T_setup, T_fft, T_evolve, T_checksum, 
-     >        T_fftlow, T_fftcopy, T_transpose, 
-     >        T_transxzloc, T_transxzglo, T_transxzfin, 
-     >        T_transxyloc, T_transxyglo, T_transxyfin, 
-     >        T_synch, T_max
-      parameter (T_total = 1, T_setup = 2, T_fft = 3, 
-     >           T_evolve = 4, T_checksum = 5, 
-     >           T_fftlow = 6, T_fftcopy = 7, T_transpose = 8,
-     >           T_transxzloc = 9, T_transxzglo = 10, T_transxzfin = 11, 
-     >           T_transxyloc = 12, T_transxyglo = 13, 
-     >           T_transxyfin = 14,  T_synch = 15, T_max = 15)
-
-
-
-      logical timers_enabled
-      parameter (timers_enabled = .false.)
-
-
-      external timer_read
-      double precision timer_read
-      external ilog2
-      integer ilog2
-
-      external randlc
-      double precision randlc
-
-
-c other stuff
-      logical debug, debugsynch
-      common /dbg/ debug, debugsynch
-
-      double precision seed, a, pi, alpha
-      parameter (seed = 314159265.d0, a = 1220703125.d0, 
-     >  pi = 3.141592653589793238d0, alpha=1.0d-6)
-
-c roots of unity array
-c relies on x being largest dimension?
-      double complex u(nx)
-      common /ucomm/ u
-
-
-c for checksum data
-      double complex sums(0:niter_default)
-      common /sumcomm/ sums
-
-c number of iterations
-      integer niter
-      common /iter/ niter
diff --git a/examples/smpi/NAS/FT/inputft.data.sample b/examples/smpi/NAS/FT/inputft.data.sample
deleted file mode 100644
index 448ac42bc0..0000000000
--- a/examples/smpi/NAS/FT/inputft.data.sample
+++ /dev/null
@@ -1,3 +0,0 @@
-6   ! number of iterations
-2   ! layout type. 0 = 0d, 1 = 1d, 2 = 2d
-2 4 ! processor layout. 0d must be "1 1"; 1d must be "1 N"
diff --git a/examples/smpi/NAS/FT/mpinpb.h b/examples/smpi/NAS/FT/mpinpb.h
deleted file mode 100644
index e43e552a84..0000000000
--- a/examples/smpi/NAS/FT/mpinpb.h
+++ /dev/null
@@ -1,4 +0,0 @@
-      include 'mpif.h'
-c mpi data types
-      integer dc_type
-      common /mpistuff/ dc_type
diff --git a/examples/smpi/NAS/LU/Makefile b/examples/smpi/NAS/LU/Makefile
deleted file mode 100644
index a05c94dc4a..0000000000
--- a/examples/smpi/NAS/LU/Makefile
+++ /dev/null
@@ -1,74 +0,0 @@
-SHELL=/bin/sh
-BENCHMARK=lu
-BENCHMARKU=LU
-VEC=
-
-include ../config/make.def
-
-OBJS = lu.o init_comm.o read_input.o bcast_inputs.o proc_grid.o neighbors.o \
-       nodedim.o subdomain.o setcoeff.o sethyper.o setbv.o exact.o setiv.o \
-       erhs.o ssor.o exchange_1.o exchange_3.o exchange_4.o exchange_5.o \
-       exchange_6.o rhs.o l2norm.o jacld.o blts$(VEC).o jacu.o buts$(VEC).o \
-       error.o pintgr.o verify.o ${COMMON}/print_results.o ${COMMON}/timers.o
-
-include ../sys/make.common
-
-
-# npbparams.h is included by applu.incl
-# The following rule should do the trick but many make programs (not gmake)
-# will do the wrong thing and rebuild the world every time (because the
-# mod time on header.h is not changed. One solution would be to 
-# touch header.h but this might cause confusion if someone has
-# accidentally deleted it. Instead, make the dependency on npbparams.h
-# explicit in all the lines below (even though dependence is indirect). 
-
-# applu.incl: npbparams.h
-
-${PROGRAM}: config
-	@if [ x$(VERSION) = xvec ] ; then	\
-		${MAKE} VEC=_vec exec;		\
-	elif [ x$(VERSION) = xVEC ] ; then	\
-		${MAKE} VEC=_vec exec;		\
-	else					\
-		${MAKE} exec;			\
-	fi
-
-exec: $(OBJS)
-	${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
-
-.f.o :
-	${FCOMPILE} $<
-
-lu.o:		lu.f applu.incl npbparams.h
-bcast_inputs.o:	bcast_inputs.f applu.incl npbparams.h mpinpb.h
-blts$(VEC).o:	blts$(VEC).f
-buts$(VEC).o:	buts$(VEC).f
-erhs.o:		erhs.f applu.incl npbparams.h
-error.o:	error.f applu.incl npbparams.h mpinpb.h
-exact.o:	exact.f applu.incl npbparams.h
-exchange_1.o:	exchange_1.f applu.incl npbparams.h mpinpb.h
-exchange_3.o:	exchange_3.f applu.incl npbparams.h mpinpb.h
-exchange_4.o:	exchange_4.f applu.incl npbparams.h mpinpb.h
-exchange_5.o:	exchange_5.f applu.incl npbparams.h mpinpb.h
-exchange_6.o:	exchange_6.f applu.incl npbparams.h mpinpb.h
-init_comm.o:	init_comm.f applu.incl npbparams.h mpinpb.h 
-jacld.o:	jacld.f applu.incl npbparams.h
-jacu.o:		jacu.f applu.incl npbparams.h
-l2norm.o:	l2norm.f mpinpb.h
-neighbors.o:	neighbors.f applu.incl npbparams.h
-nodedim.o:	nodedim.f
-pintgr.o:	pintgr.f applu.incl npbparams.h mpinpb.h
-proc_grid.o:	proc_grid.f applu.incl npbparams.h
-read_input.o:	read_input.f applu.incl npbparams.h mpinpb.h
-rhs.o:		rhs.f applu.incl npbparams.h
-setbv.o:	setbv.f applu.incl npbparams.h
-setiv.o:	setiv.f applu.incl npbparams.h
-setcoeff.o:	setcoeff.f applu.incl npbparams.h
-sethyper.o:	sethyper.f applu.incl npbparams.h
-ssor.o:		ssor.f applu.incl npbparams.h mpinpb.h
-subdomain.o:	subdomain.f applu.incl npbparams.h mpinpb.h
-verify.o:	verify.f applu.incl npbparams.h
-
-clean:
-	- /bin/rm -f npbparams.h
-	- /bin/rm -f *.o *~
diff --git a/examples/smpi/NAS/LU/applu.incl b/examples/smpi/NAS/LU/applu.incl
deleted file mode 100644
index 413fc834e3..0000000000
--- a/examples/smpi/NAS/LU/applu.incl
+++ /dev/null
@@ -1,153 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-c---  applu.incl   
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   npbparams.h defines parameters that depend on the class and 
-c   number of nodes
-c---------------------------------------------------------------------
-
-      include 'npbparams.h'
-
-c---------------------------------------------------------------------
-c   parameters which can be overridden in runtime config file
-c   (in addition to size of problem - isiz01,02,03 give the maximum size)
-c   ipr = 1 to print out verbose information
-c   omega = 2.0 is correct for all classes
-c   tolrsd is tolerance levels for steady state residuals
-c---------------------------------------------------------------------
-      integer ipr_default
-      parameter (ipr_default = 1)
-      double precision omega_default
-      parameter (omega_default = 1.2d0)
-      double precision tolrsd1_def, tolrsd2_def, tolrsd3_def, 
-     >                 tolrsd4_def, tolrsd5_def
-      parameter (tolrsd1_def=1.0e-08, 
-     >          tolrsd2_def=1.0e-08, tolrsd3_def=1.0e-08, 
-     >          tolrsd4_def=1.0e-08, tolrsd5_def=1.0e-08)
-
-      double precision c1, c2, c3, c4, c5
-      parameter( c1 = 1.40d+00, c2 = 0.40d+00,
-     >           c3 = 1.00d-01, c4 = 1.00d+00,
-     >           c5 = 1.40d+00 )
-
-c---------------------------------------------------------------------
-c   grid
-c---------------------------------------------------------------------
-      integer nx, ny, nz
-      integer nx0, ny0, nz0
-      integer ipt, ist, iend
-      integer jpt, jst, jend
-      integer ii1, ii2
-      integer ji1, ji2
-      integer ki1, ki2
-      double precision  dxi, deta, dzeta
-      double precision  tx1, tx2, tx3
-      double precision  ty1, ty2, ty3
-      double precision  tz1, tz2, tz3
-
-      common/cgcon/ dxi, deta, dzeta,
-     >              tx1, tx2, tx3,
-     >              ty1, ty2, ty3,
-     >              tz1, tz2, tz3,
-     >              nx, ny, nz, 
-     >              nx0, ny0, nz0,
-     >              ipt, ist, iend,
-     >              jpt, jst, jend,
-     >              ii1, ii2, 
-     >              ji1, ji2, 
-     >              ki1, ki2
-
-c---------------------------------------------------------------------
-c   dissipation
-c---------------------------------------------------------------------
-      double precision dx1, dx2, dx3, dx4, dx5
-      double precision dy1, dy2, dy3, dy4, dy5
-      double precision dz1, dz2, dz3, dz4, dz5
-      double precision dssp
-
-      common/disp/ dx1,dx2,dx3,dx4,dx5,
-     >             dy1,dy2,dy3,dy4,dy5,
-     >             dz1,dz2,dz3,dz4,dz5,
-     >             dssp
-
-c---------------------------------------------------------------------
-c   field variables and residuals
-c---------------------------------------------------------------------
-      double precision u(5,-1:isiz1+2,-1:isiz2+2,isiz3),
-     >       rsd(5,-1:isiz1+2,-1:isiz2+2,isiz3),
-     >       frct(5,-1:isiz1+2,-1:isiz2+2,isiz3),
-     >       flux(5,0:isiz1+1,0:isiz2+1,isiz3)
-
-      common/cvar/ u,
-     >             rsd,
-     >             frct,
-     >             flux
-
-
-c---------------------------------------------------------------------
-c   output control parameters
-c---------------------------------------------------------------------
-      integer ipr, inorm
-
-      common/cprcon/ ipr, inorm
-
-c---------------------------------------------------------------------
-c   newton-raphson iteration control parameters
-c---------------------------------------------------------------------
-      integer itmax, invert
-      double precision  dt, omega, tolrsd(5),
-     >        rsdnm(5), errnm(5), frc, ttotal
-
-      common/ctscon/ dt, omega, tolrsd,
-     >               rsdnm, errnm, frc, ttotal,
-     >               itmax, invert
-
-      double precision a(5,5,isiz1,isiz2),
-     >       b(5,5,isiz1,isiz2),
-     >       c(5,5,isiz1,isiz2),
-     >       d(5,5,isiz1,isiz2)
-
-      common/cjac/ a, b, c, d
-
-c---------------------------------------------------------------------
-c   coefficients of the exact solution
-c---------------------------------------------------------------------
-      double precision ce(5,13)
-
-      common/cexact/ ce
-
-c---------------------------------------------------------------------
-c   multi-processor common blocks
-c---------------------------------------------------------------------
-      integer id, ndim, num, xdim, ydim, row, col
-      common/dim/ id,ndim,num,xdim,ydim,row,col
-
-      integer north,south,east,west
-      common/neigh/ north,south,east, west
-
-      integer from_s,from_n,from_e,from_w
-      parameter (from_s=1,from_n=2,from_e=3,from_w=4)
-
-      integer npmax
-      parameter (npmax=isiz01+isiz02)
-
-      logical icommn(npmax+1),icomms(npmax+1),
-     >        icomme(npmax+1),icommw(npmax+1)
-      double precision  buf(5,2*isiz2*isiz3),
-     >                  buf1(5,2*isiz2*isiz3)
-
-      common/comm/ buf, buf1,
-     >             icommn,icomms,
-     >             icomme,icommw
-
-      double precision maxtime
-      common/timer/maxtime
-
-
-c---------------------------------------------------------------------
-c   end of include file
-c---------------------------------------------------------------------
diff --git a/examples/smpi/NAS/LU/bcast_inputs.f b/examples/smpi/NAS/LU/bcast_inputs.f
deleted file mode 100644
index c606724bec..0000000000
--- a/examples/smpi/NAS/LU/bcast_inputs.f
+++ /dev/null
@@ -1,41 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine bcast_inputs
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer ierr
-
-c---------------------------------------------------------------------
-c   root broadcasts the data
-c   The data isn't contiguous or of the same type, so it's not
-c   clear how to send it in the "MPI" way. 
-c   We could pack the info into a buffer or we could create
-c   an obscene datatype to handle it all at once. Since we only
-c   broadcast the data once, just use a separate broadcast for
-c   each piece. 
-c---------------------------------------------------------------------
-      call MPI_BCAST(ipr, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
-      call MPI_BCAST(inorm, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
-      call MPI_BCAST(itmax, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
-      call MPI_BCAST(dt, 1, dp_type, root, MPI_COMM_WORLD, ierr)
-      call MPI_BCAST(omega, 1, dp_type, root, MPI_COMM_WORLD, ierr)
-      call MPI_BCAST(tolrsd, 5, dp_type, root, MPI_COMM_WORLD, ierr)
-      call MPI_BCAST(nx0, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
-      call MPI_BCAST(ny0, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
-      call MPI_BCAST(nz0, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
-
-      return
-      end
-
-
diff --git a/examples/smpi/NAS/LU/blts.f b/examples/smpi/NAS/LU/blts.f
deleted file mode 100644
index 9861261b03..0000000000
--- a/examples/smpi/NAS/LU/blts.f
+++ /dev/null
@@ -1,261 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine blts ( ldmx, ldmy, ldmz,
-     >                  nx, ny, nz, k,
-     >                  omega,
-     >                  v,
-     >                  ldz, ldy, ldx, d,
-     >                  ist, iend, jst, jend,
-     >                  nx0, ny0, ipt, jpt)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c   compute the regular-sparse, block lower triangular solution:
-c
-c                     v <-- ( L-inv ) * v
-c
-c---------------------------------------------------------------------
-
-      implicit none
-
-c---------------------------------------------------------------------
-c  input parameters
-c---------------------------------------------------------------------
-      integer ldmx, ldmy, ldmz
-      integer nx, ny, nz
-      integer k
-      double precision  omega
-      double precision  v( 5, -1:ldmx+2, -1:ldmy+2, *),
-     >        ldz( 5, 5, ldmx, ldmy),
-     >        ldy( 5, 5, ldmx, ldmy),
-     >        ldx( 5, 5, ldmx, ldmy),
-     >        d( 5, 5, ldmx, ldmy)
-      integer ist, iend
-      integer jst, jend
-      integer nx0, ny0
-      integer ipt, jpt
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer i, j, m
-      integer iex
-      double precision  tmp, tmp1
-      double precision  tmat(5,5)
-
-
-c---------------------------------------------------------------------
-c   receive data from north and west
-c---------------------------------------------------------------------
-      iex = 0
-      call exchange_1( v,k,iex )
-
-
-      do j = jst, jend
-         do i = ist, iend
-            do m = 1, 5
-
-                  v( m, i, j, k ) =  v( m, i, j, k )
-     >    - omega * (  ldz( m, 1, i, j ) * v( 1, i, j, k-1 )
-     >               + ldz( m, 2, i, j ) * v( 2, i, j, k-1 )
-     >               + ldz( m, 3, i, j ) * v( 3, i, j, k-1 )
-     >               + ldz( m, 4, i, j ) * v( 4, i, j, k-1 )
-     >               + ldz( m, 5, i, j ) * v( 5, i, j, k-1 )  )
-
-            end do
-         end do
-      end do
-
-
-      do j=jst,jend
-        do i = ist, iend
-
-            do m = 1, 5
-
-                  v( m, i, j, k ) =  v( m, i, j, k )
-     > - omega * ( ldy( m, 1, i, j ) * v( 1, i, j-1, k )
-     >           + ldx( m, 1, i, j ) * v( 1, i-1, j, k )
-     >           + ldy( m, 2, i, j ) * v( 2, i, j-1, k )
-     >           + ldx( m, 2, i, j ) * v( 2, i-1, j, k )
-     >           + ldy( m, 3, i, j ) * v( 3, i, j-1, k )
-     >           + ldx( m, 3, i, j ) * v( 3, i-1, j, k )
-     >           + ldy( m, 4, i, j ) * v( 4, i, j-1, k )
-     >           + ldx( m, 4, i, j ) * v( 4, i-1, j, k )
-     >           + ldy( m, 5, i, j ) * v( 5, i, j-1, k )
-     >           + ldx( m, 5, i, j ) * v( 5, i-1, j, k ) )
-
-            end do
-       
-c---------------------------------------------------------------------
-c   diagonal block inversion
-c
-c   forward elimination
-c---------------------------------------------------------------------
-            do m = 1, 5
-               tmat( m, 1 ) = d( m, 1, i, j )
-               tmat( m, 2 ) = d( m, 2, i, j )
-               tmat( m, 3 ) = d( m, 3, i, j )
-               tmat( m, 4 ) = d( m, 4, i, j )
-               tmat( m, 5 ) = d( m, 5, i, j )
-            end do
-
-            tmp1 = 1.0d+00 / tmat( 1, 1 )
-            tmp = tmp1 * tmat( 2, 1 )
-            tmat( 2, 2 ) =  tmat( 2, 2 )
-     >           - tmp * tmat( 1, 2 )
-            tmat( 2, 3 ) =  tmat( 2, 3 )
-     >           - tmp * tmat( 1, 3 )
-            tmat( 2, 4 ) =  tmat( 2, 4 )
-     >           - tmp * tmat( 1, 4 )
-            tmat( 2, 5 ) =  tmat( 2, 5 )
-     >           - tmp * tmat( 1, 5 )
-            v( 2, i, j, k ) = v( 2, i, j, k )
-     >        - v( 1, i, j, k ) * tmp
-
-            tmp = tmp1 * tmat( 3, 1 )
-            tmat( 3, 2 ) =  tmat( 3, 2 )
-     >           - tmp * tmat( 1, 2 )
-            tmat( 3, 3 ) =  tmat( 3, 3 )
-     >           - tmp * tmat( 1, 3 )
-            tmat( 3, 4 ) =  tmat( 3, 4 )
-     >           - tmp * tmat( 1, 4 )
-            tmat( 3, 5 ) =  tmat( 3, 5 )
-     >           - tmp * tmat( 1, 5 )
-            v( 3, i, j, k ) = v( 3, i, j, k )
-     >        - v( 1, i, j, k ) * tmp
-
-            tmp = tmp1 * tmat( 4, 1 )
-            tmat( 4, 2 ) =  tmat( 4, 2 )
-     >           - tmp * tmat( 1, 2 )
-            tmat( 4, 3 ) =  tmat( 4, 3 )
-     >           - tmp * tmat( 1, 3 )
-            tmat( 4, 4 ) =  tmat( 4, 4 )
-     >           - tmp * tmat( 1, 4 )
-            tmat( 4, 5 ) =  tmat( 4, 5 )
-     >           - tmp * tmat( 1, 5 )
-            v( 4, i, j, k ) = v( 4, i, j, k )
-     >        - v( 1, i, j, k ) * tmp
-
-            tmp = tmp1 * tmat( 5, 1 )
-            tmat( 5, 2 ) =  tmat( 5, 2 )
-     >           - tmp * tmat( 1, 2 )
-            tmat( 5, 3 ) =  tmat( 5, 3 )
-     >           - tmp * tmat( 1, 3 )
-            tmat( 5, 4 ) =  tmat( 5, 4 )
-     >           - tmp * tmat( 1, 4 )
-            tmat( 5, 5 ) =  tmat( 5, 5 )
-     >           - tmp * tmat( 1, 5 )
-            v( 5, i, j, k ) = v( 5, i, j, k )
-     >        - v( 1, i, j, k ) * tmp
-
-
-
-            tmp1 = 1.0d+00 / tmat( 2, 2 )
-            tmp = tmp1 * tmat( 3, 2 )
-            tmat( 3, 3 ) =  tmat( 3, 3 )
-     >           - tmp * tmat( 2, 3 )
-            tmat( 3, 4 ) =  tmat( 3, 4 )
-     >           - tmp * tmat( 2, 4 )
-            tmat( 3, 5 ) =  tmat( 3, 5 )
-     >           - tmp * tmat( 2, 5 )
-            v( 3, i, j, k ) = v( 3, i, j, k )
-     >        - v( 2, i, j, k ) * tmp
-
-            tmp = tmp1 * tmat( 4, 2 )
-            tmat( 4, 3 ) =  tmat( 4, 3 )
-     >           - tmp * tmat( 2, 3 )
-            tmat( 4, 4 ) =  tmat( 4, 4 )
-     >           - tmp * tmat( 2, 4 )
-            tmat( 4, 5 ) =  tmat( 4, 5 )
-     >           - tmp * tmat( 2, 5 )
-            v( 4, i, j, k ) = v( 4, i, j, k )
-     >        - v( 2, i, j, k ) * tmp
-
-            tmp = tmp1 * tmat( 5, 2 )
-            tmat( 5, 3 ) =  tmat( 5, 3 )
-     >           - tmp * tmat( 2, 3 )
-            tmat( 5, 4 ) =  tmat( 5, 4 )
-     >           - tmp * tmat( 2, 4 )
-            tmat( 5, 5 ) =  tmat( 5, 5 )
-     >           - tmp * tmat( 2, 5 )
-            v( 5, i, j, k ) = v( 5, i, j, k )
-     >        - v( 2, i, j, k ) * tmp
-
-
-
-            tmp1 = 1.0d+00 / tmat( 3, 3 )
-            tmp = tmp1 * tmat( 4, 3 )
-            tmat( 4, 4 ) =  tmat( 4, 4 )
-     >           - tmp * tmat( 3, 4 )
-            tmat( 4, 5 ) =  tmat( 4, 5 )
-     >           - tmp * tmat( 3, 5 )
-            v( 4, i, j, k ) = v( 4, i, j, k )
-     >        - v( 3, i, j, k ) * tmp
-
-            tmp = tmp1 * tmat( 5, 3 )
-            tmat( 5, 4 ) =  tmat( 5, 4 )
-     >           - tmp * tmat( 3, 4 )
-            tmat( 5, 5 ) =  tmat( 5, 5 )
-     >           - tmp * tmat( 3, 5 )
-            v( 5, i, j, k ) = v( 5, i, j, k )
-     >        - v( 3, i, j, k ) * tmp
-
-
-
-            tmp1 = 1.0d+00 / tmat( 4, 4 )
-            tmp = tmp1 * tmat( 5, 4 )
-            tmat( 5, 5 ) =  tmat( 5, 5 )
-     >           - tmp * tmat( 4, 5 )
-            v( 5, i, j, k ) = v( 5, i, j, k )
-     >        - v( 4, i, j, k ) * tmp
-
-c---------------------------------------------------------------------
-c   back substitution
-c---------------------------------------------------------------------
-            v( 5, i, j, k ) = v( 5, i, j, k )
-     >                      / tmat( 5, 5 )
-
-            v( 4, i, j, k ) = v( 4, i, j, k )
-     >           - tmat( 4, 5 ) * v( 5, i, j, k )
-            v( 4, i, j, k ) = v( 4, i, j, k )
-     >                      / tmat( 4, 4 )
-
-            v( 3, i, j, k ) = v( 3, i, j, k )
-     >           - tmat( 3, 4 ) * v( 4, i, j, k )
-     >           - tmat( 3, 5 ) * v( 5, i, j, k )
-            v( 3, i, j, k ) = v( 3, i, j, k )
-     >                      / tmat( 3, 3 )
-
-            v( 2, i, j, k ) = v( 2, i, j, k )
-     >           - tmat( 2, 3 ) * v( 3, i, j, k )
-     >           - tmat( 2, 4 ) * v( 4, i, j, k )
-     >           - tmat( 2, 5 ) * v( 5, i, j, k )
-            v( 2, i, j, k ) = v( 2, i, j, k )
-     >                      / tmat( 2, 2 )
-
-            v( 1, i, j, k ) = v( 1, i, j, k )
-     >           - tmat( 1, 2 ) * v( 2, i, j, k )
-     >           - tmat( 1, 3 ) * v( 3, i, j, k )
-     >           - tmat( 1, 4 ) * v( 4, i, j, k )
-     >           - tmat( 1, 5 ) * v( 5, i, j, k )
-            v( 1, i, j, k ) = v( 1, i, j, k )
-     >                      / tmat( 1, 1 )
-
-
-        enddo
-      enddo
-
-c---------------------------------------------------------------------
-c   send data to east and south
-c---------------------------------------------------------------------
-      iex = 2
-      call exchange_1( v,k,iex )
-
-      return
-      end
-
-
diff --git a/examples/smpi/NAS/LU/blts_vec.f b/examples/smpi/NAS/LU/blts_vec.f
deleted file mode 100644
index f90ea84560..0000000000
--- a/examples/smpi/NAS/LU/blts_vec.f
+++ /dev/null
@@ -1,334 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine blts ( ldmx, ldmy, ldmz,
-     >                  nx, ny, nz, k,
-     >                  omega,
-     >                  v,
-     >                  ldz, ldy, ldx, d,
-     >                  ist, iend, jst, jend,
-     >                  nx0, ny0, ipt, jpt)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c   compute the regular-sparse, block lower triangular solution:
-c
-c                     v <-- ( L-inv ) * v
-c
-c---------------------------------------------------------------------
-
-      implicit none
-
-c---------------------------------------------------------------------
-c  input parameters
-c---------------------------------------------------------------------
-      integer ldmx, ldmy, ldmz
-      integer nx, ny, nz
-      integer k
-      double precision  omega
-      double precision  v( 5, -1:ldmx+2, -1:ldmy+2, *),
-     >        ldz( 5, 5, ldmx, ldmy),
-     >        ldy( 5, 5, ldmx, ldmy),
-     >        ldx( 5, 5, ldmx, ldmy),
-     >        d( 5, 5, ldmx, ldmy)
-      integer ist, iend
-      integer jst, jend
-      integer nx0, ny0
-      integer ipt, jpt
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer i, j, m, l, istp, iendp
-      integer iex
-      double precision  tmp, tmp1
-      double precision  tmat(5,5)
-
-
-c---------------------------------------------------------------------
-c   receive data from north and west
-c---------------------------------------------------------------------
-      iex = 0
-      call exchange_1( v,k,iex )
-
-
-      do j = jst, jend
-         do i = ist, iend
-            do m = 1, 5
-
-                  v( m, i, j, k ) =  v( m, i, j, k )
-     >    - omega * (  ldz( m, 1, i, j ) * v( 1, i, j, k-1 )
-     >               + ldz( m, 2, i, j ) * v( 2, i, j, k-1 )
-     >               + ldz( m, 3, i, j ) * v( 3, i, j, k-1 )
-     >               + ldz( m, 4, i, j ) * v( 4, i, j, k-1 )
-     >               + ldz( m, 5, i, j ) * v( 5, i, j, k-1 )  )
-
-            end do
-         end do
-      end do
-
-
-      do l = ist+jst, iend+jend
-         istp  = max(l - jend, ist)
-         iendp = min(l - jst, iend)
-
-!dir$ ivdep
-         do i = istp, iendp
-            j = l - i
-
-!!dir$ unroll 5
-!   manually unroll the loop
-!            do m = 1, 5
-
-                  v( 1, i, j, k ) =  v( 1, i, j, k )
-     > - omega * ( ldy( 1, 1, i, j ) * v( 1, i, j-1, k )
-     >           + ldx( 1, 1, i, j ) * v( 1, i-1, j, k )
-     >           + ldy( 1, 2, i, j ) * v( 2, i, j-1, k )
-     >           + ldx( 1, 2, i, j ) * v( 2, i-1, j, k )
-     >           + ldy( 1, 3, i, j ) * v( 3, i, j-1, k )
-     >           + ldx( 1, 3, i, j ) * v( 3, i-1, j, k )
-     >           + ldy( 1, 4, i, j ) * v( 4, i, j-1, k )
-     >           + ldx( 1, 4, i, j ) * v( 4, i-1, j, k )
-     >           + ldy( 1, 5, i, j ) * v( 5, i, j-1, k )
-     >           + ldx( 1, 5, i, j ) * v( 5, i-1, j, k ) )
-                  v( 2, i, j, k ) =  v( 2, i, j, k )
-     > - omega * ( ldy( 2, 1, i, j ) * v( 1, i, j-1, k )
-     >           + ldx( 2, 1, i, j ) * v( 1, i-1, j, k )
-     >           + ldy( 2, 2, i, j ) * v( 2, i, j-1, k )
-     >           + ldx( 2, 2, i, j ) * v( 2, i-1, j, k )
-     >           + ldy( 2, 3, i, j ) * v( 3, i, j-1, k )
-     >           + ldx( 2, 3, i, j ) * v( 3, i-1, j, k )
-     >           + ldy( 2, 4, i, j ) * v( 4, i, j-1, k )
-     >           + ldx( 2, 4, i, j ) * v( 4, i-1, j, k )
-     >           + ldy( 2, 5, i, j ) * v( 5, i, j-1, k )
-     >           + ldx( 2, 5, i, j ) * v( 5, i-1, j, k ) )
-                  v( 3, i, j, k ) =  v( 3, i, j, k )
-     > - omega * ( ldy( 3, 1, i, j ) * v( 1, i, j-1, k )
-     >           + ldx( 3, 1, i, j ) * v( 1, i-1, j, k )
-     >           + ldy( 3, 2, i, j ) * v( 2, i, j-1, k )
-     >           + ldx( 3, 2, i, j ) * v( 2, i-1, j, k )
-     >           + ldy( 3, 3, i, j ) * v( 3, i, j-1, k )
-     >           + ldx( 3, 3, i, j ) * v( 3, i-1, j, k )
-     >           + ldy( 3, 4, i, j ) * v( 4, i, j-1, k )
-     >           + ldx( 3, 4, i, j ) * v( 4, i-1, j, k )
-     >           + ldy( 3, 5, i, j ) * v( 5, i, j-1, k )
-     >           + ldx( 3, 5, i, j ) * v( 5, i-1, j, k ) )
-                  v( 4, i, j, k ) =  v( 4, i, j, k )
-     > - omega * ( ldy( 4, 1, i, j ) * v( 1, i, j-1, k )
-     >           + ldx( 4, 1, i, j ) * v( 1, i-1, j, k )
-     >           + ldy( 4, 2, i, j ) * v( 2, i, j-1, k )
-     >           + ldx( 4, 2, i, j ) * v( 2, i-1, j, k )
-     >           + ldy( 4, 3, i, j ) * v( 3, i, j-1, k )
-     >           + ldx( 4, 3, i, j ) * v( 3, i-1, j, k )
-     >           + ldy( 4, 4, i, j ) * v( 4, i, j-1, k )
-     >           + ldx( 4, 4, i, j ) * v( 4, i-1, j, k )
-     >           + ldy( 4, 5, i, j ) * v( 5, i, j-1, k )
-     >           + ldx( 4, 5, i, j ) * v( 5, i-1, j, k ) )
-                  v( 5, i, j, k ) =  v( 5, i, j, k )
-     > - omega * ( ldy( 5, 1, i, j ) * v( 1, i, j-1, k )
-     >           + ldx( 5, 1, i, j ) * v( 1, i-1, j, k )
-     >           + ldy( 5, 2, i, j ) * v( 2, i, j-1, k )
-     >           + ldx( 5, 2, i, j ) * v( 2, i-1, j, k )
-     >           + ldy( 5, 3, i, j ) * v( 3, i, j-1, k )
-     >           + ldx( 5, 3, i, j ) * v( 3, i-1, j, k )
-     >           + ldy( 5, 4, i, j ) * v( 4, i, j-1, k )
-     >           + ldx( 5, 4, i, j ) * v( 4, i-1, j, k )
-     >           + ldy( 5, 5, i, j ) * v( 5, i, j-1, k )
-     >           + ldx( 5, 5, i, j ) * v( 5, i-1, j, k ) )
-
-!            end do
-       
-c---------------------------------------------------------------------
-c   diagonal block inversion
-c
-c   forward elimination
-c---------------------------------------------------------------------
-!!dir$ unroll 5
-!   manually unroll the loop
-!            do m = 1, 5
-               tmat( 1, 1 ) = d( 1, 1, i, j )
-               tmat( 1, 2 ) = d( 1, 2, i, j )
-               tmat( 1, 3 ) = d( 1, 3, i, j )
-               tmat( 1, 4 ) = d( 1, 4, i, j )
-               tmat( 1, 5 ) = d( 1, 5, i, j )
-               tmat( 2, 1 ) = d( 2, 1, i, j )
-               tmat( 2, 2 ) = d( 2, 2, i, j )
-               tmat( 2, 3 ) = d( 2, 3, i, j )
-               tmat( 2, 4 ) = d( 2, 4, i, j )
-               tmat( 2, 5 ) = d( 2, 5, i, j )
-               tmat( 3, 1 ) = d( 3, 1, i, j )
-               tmat( 3, 2 ) = d( 3, 2, i, j )
-               tmat( 3, 3 ) = d( 3, 3, i, j )
-               tmat( 3, 4 ) = d( 3, 4, i, j )
-               tmat( 3, 5 ) = d( 3, 5, i, j )
-               tmat( 4, 1 ) = d( 4, 1, i, j )
-               tmat( 4, 2 ) = d( 4, 2, i, j )
-               tmat( 4, 3 ) = d( 4, 3, i, j )
-               tmat( 4, 4 ) = d( 4, 4, i, j )
-               tmat( 4, 5 ) = d( 4, 5, i, j )
-               tmat( 5, 1 ) = d( 5, 1, i, j )
-               tmat( 5, 2 ) = d( 5, 2, i, j )
-               tmat( 5, 3 ) = d( 5, 3, i, j )
-               tmat( 5, 4 ) = d( 5, 4, i, j )
-               tmat( 5, 5 ) = d( 5, 5, i, j )
-!            end do
-
-            tmp1 = 1.0d+00 / tmat( 1, 1 )
-            tmp = tmp1 * tmat( 2, 1 )
-            tmat( 2, 2 ) =  tmat( 2, 2 )
-     >           - tmp * tmat( 1, 2 )
-            tmat( 2, 3 ) =  tmat( 2, 3 )
-     >           - tmp * tmat( 1, 3 )
-            tmat( 2, 4 ) =  tmat( 2, 4 )
-     >           - tmp * tmat( 1, 4 )
-            tmat( 2, 5 ) =  tmat( 2, 5 )
-     >           - tmp * tmat( 1, 5 )
-            v( 2, i, j, k ) = v( 2, i, j, k )
-     >        - v( 1, i, j, k ) * tmp
-
-            tmp = tmp1 * tmat( 3, 1 )
-            tmat( 3, 2 ) =  tmat( 3, 2 )
-     >           - tmp * tmat( 1, 2 )
-            tmat( 3, 3 ) =  tmat( 3, 3 )
-     >           - tmp * tmat( 1, 3 )
-            tmat( 3, 4 ) =  tmat( 3, 4 )
-     >           - tmp * tmat( 1, 4 )
-            tmat( 3, 5 ) =  tmat( 3, 5 )
-     >           - tmp * tmat( 1, 5 )
-            v( 3, i, j, k ) = v( 3, i, j, k )
-     >        - v( 1, i, j, k ) * tmp
-
-            tmp = tmp1 * tmat( 4, 1 )
-            tmat( 4, 2 ) =  tmat( 4, 2 )
-     >           - tmp * tmat( 1, 2 )
-            tmat( 4, 3 ) =  tmat( 4, 3 )
-     >           - tmp * tmat( 1, 3 )
-            tmat( 4, 4 ) =  tmat( 4, 4 )
-     >           - tmp * tmat( 1, 4 )
-            tmat( 4, 5 ) =  tmat( 4, 5 )
-     >           - tmp * tmat( 1, 5 )
-            v( 4, i, j, k ) = v( 4, i, j, k )
-     >        - v( 1, i, j, k ) * tmp
-
-            tmp = tmp1 * tmat( 5, 1 )
-            tmat( 5, 2 ) =  tmat( 5, 2 )
-     >           - tmp * tmat( 1, 2 )
-            tmat( 5, 3 ) =  tmat( 5, 3 )
-     >           - tmp * tmat( 1, 3 )
-            tmat( 5, 4 ) =  tmat( 5, 4 )
-     >           - tmp * tmat( 1, 4 )
-            tmat( 5, 5 ) =  tmat( 5, 5 )
-     >           - tmp * tmat( 1, 5 )
-            v( 5, i, j, k ) = v( 5, i, j, k )
-     >        - v( 1, i, j, k ) * tmp
-
-
-
-            tmp1 = 1.0d+00 / tmat( 2, 2 )
-            tmp = tmp1 * tmat( 3, 2 )
-            tmat( 3, 3 ) =  tmat( 3, 3 )
-     >           - tmp * tmat( 2, 3 )
-            tmat( 3, 4 ) =  tmat( 3, 4 )
-     >           - tmp * tmat( 2, 4 )
-            tmat( 3, 5 ) =  tmat( 3, 5 )
-     >           - tmp * tmat( 2, 5 )
-            v( 3, i, j, k ) = v( 3, i, j, k )
-     >        - v( 2, i, j, k ) * tmp
-
-            tmp = tmp1 * tmat( 4, 2 )
-            tmat( 4, 3 ) =  tmat( 4, 3 )
-     >           - tmp * tmat( 2, 3 )
-            tmat( 4, 4 ) =  tmat( 4, 4 )
-     >           - tmp * tmat( 2, 4 )
-            tmat( 4, 5 ) =  tmat( 4, 5 )
-     >           - tmp * tmat( 2, 5 )
-            v( 4, i, j, k ) = v( 4, i, j, k )
-     >        - v( 2, i, j, k ) * tmp
-
-            tmp = tmp1 * tmat( 5, 2 )
-            tmat( 5, 3 ) =  tmat( 5, 3 )
-     >           - tmp * tmat( 2, 3 )
-            tmat( 5, 4 ) =  tmat( 5, 4 )
-     >           - tmp * tmat( 2, 4 )
-            tmat( 5, 5 ) =  tmat( 5, 5 )
-     >           - tmp * tmat( 2, 5 )
-            v( 5, i, j, k ) = v( 5, i, j, k )
-     >        - v( 2, i, j, k ) * tmp
-
-
-
-            tmp1 = 1.0d+00 / tmat( 3, 3 )
-            tmp = tmp1 * tmat( 4, 3 )
-            tmat( 4, 4 ) =  tmat( 4, 4 )
-     >           - tmp * tmat( 3, 4 )
-            tmat( 4, 5 ) =  tmat( 4, 5 )
-     >           - tmp * tmat( 3, 5 )
-            v( 4, i, j, k ) = v( 4, i, j, k )
-     >        - v( 3, i, j, k ) * tmp
-
-            tmp = tmp1 * tmat( 5, 3 )
-            tmat( 5, 4 ) =  tmat( 5, 4 )
-     >           - tmp * tmat( 3, 4 )
-            tmat( 5, 5 ) =  tmat( 5, 5 )
-     >           - tmp * tmat( 3, 5 )
-            v( 5, i, j, k ) = v( 5, i, j, k )
-     >        - v( 3, i, j, k ) * tmp
-
-
-
-            tmp1 = 1.0d+00 / tmat( 4, 4 )
-            tmp = tmp1 * tmat( 5, 4 )
-            tmat( 5, 5 ) =  tmat( 5, 5 )
-     >           - tmp * tmat( 4, 5 )
-            v( 5, i, j, k ) = v( 5, i, j, k )
-     >        - v( 4, i, j, k ) * tmp
-
-c---------------------------------------------------------------------
-c   back substitution
-c---------------------------------------------------------------------
-            v( 5, i, j, k ) = v( 5, i, j, k )
-     >                      / tmat( 5, 5 )
-
-            v( 4, i, j, k ) = v( 4, i, j, k )
-     >           - tmat( 4, 5 ) * v( 5, i, j, k )
-            v( 4, i, j, k ) = v( 4, i, j, k )
-     >                      / tmat( 4, 4 )
-
-            v( 3, i, j, k ) = v( 3, i, j, k )
-     >           - tmat( 3, 4 ) * v( 4, i, j, k )
-     >           - tmat( 3, 5 ) * v( 5, i, j, k )
-            v( 3, i, j, k ) = v( 3, i, j, k )
-     >                      / tmat( 3, 3 )
-
-            v( 2, i, j, k ) = v( 2, i, j, k )
-     >           - tmat( 2, 3 ) * v( 3, i, j, k )
-     >           - tmat( 2, 4 ) * v( 4, i, j, k )
-     >           - tmat( 2, 5 ) * v( 5, i, j, k )
-            v( 2, i, j, k ) = v( 2, i, j, k )
-     >                      / tmat( 2, 2 )
-
-            v( 1, i, j, k ) = v( 1, i, j, k )
-     >           - tmat( 1, 2 ) * v( 2, i, j, k )
-     >           - tmat( 1, 3 ) * v( 3, i, j, k )
-     >           - tmat( 1, 4 ) * v( 4, i, j, k )
-     >           - tmat( 1, 5 ) * v( 5, i, j, k )
-            v( 1, i, j, k ) = v( 1, i, j, k )
-     >                      / tmat( 1, 1 )
-
-
-        enddo
-      enddo
-
-c---------------------------------------------------------------------
-c   send data to east and south
-c---------------------------------------------------------------------
-      iex = 2
-      call exchange_1( v,k,iex )
-
-      return
-      end
-
-
diff --git a/examples/smpi/NAS/LU/buts.f b/examples/smpi/NAS/LU/buts.f
deleted file mode 100644
index a6fc3d6217..0000000000
--- a/examples/smpi/NAS/LU/buts.f
+++ /dev/null
@@ -1,259 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine buts( ldmx, ldmy, ldmz,
-     >                 nx, ny, nz, k,
-     >                 omega,
-     >                 v, tv,
-     >                 d, udx, udy, udz,
-     >                 ist, iend, jst, jend,
-     >                 nx0, ny0, ipt, jpt )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c   compute the regular-sparse, block upper triangular solution:
-c
-c                     v <-- ( U-inv ) * v
-c
-c---------------------------------------------------------------------
-
-      implicit none
-
-c---------------------------------------------------------------------
-c  input parameters
-c---------------------------------------------------------------------
-      integer ldmx, ldmy, ldmz
-      integer nx, ny, nz
-      integer k
-      double precision  omega
-      double precision  v( 5, -1:ldmx+2, -1:ldmy+2, *), 
-     >        tv(5, ldmx, ldmy),
-     >        d( 5, 5, ldmx, ldmy),
-     >        udx( 5, 5, ldmx, ldmy),
-     >        udy( 5, 5, ldmx, ldmy),
-     >        udz( 5, 5, ldmx, ldmy )
-      integer ist, iend
-      integer jst, jend
-      integer nx0, ny0
-      integer ipt, jpt
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer i, j, m
-      integer iex
-      double precision  tmp, tmp1
-      double precision  tmat(5,5)
-
-
-c---------------------------------------------------------------------
-c   receive data from south and east
-c---------------------------------------------------------------------
-      iex = 1
-      call exchange_1( v,k,iex )
-
-      do j = jend, jst, -1
-         do i = iend, ist, -1
-            do m = 1, 5
-                  tv( m, i, j ) = 
-     >      omega * (  udz( m, 1, i, j ) * v( 1, i, j, k+1 )
-     >               + udz( m, 2, i, j ) * v( 2, i, j, k+1 )
-     >               + udz( m, 3, i, j ) * v( 3, i, j, k+1 )
-     >               + udz( m, 4, i, j ) * v( 4, i, j, k+1 )
-     >               + udz( m, 5, i, j ) * v( 5, i, j, k+1 ) )
-            end do
-         end do
-      end do
-
-
-      do j = jend,jst,-1
-        do i = iend,ist,-1
-
-            do m = 1, 5
-                  tv( m, i, j ) = tv( m, i, j )
-     > + omega * ( udy( m, 1, i, j ) * v( 1, i, j+1, k )
-     >           + udx( m, 1, i, j ) * v( 1, i+1, j, k )
-     >           + udy( m, 2, i, j ) * v( 2, i, j+1, k )
-     >           + udx( m, 2, i, j ) * v( 2, i+1, j, k )
-     >           + udy( m, 3, i, j ) * v( 3, i, j+1, k )
-     >           + udx( m, 3, i, j ) * v( 3, i+1, j, k )
-     >           + udy( m, 4, i, j ) * v( 4, i, j+1, k )
-     >           + udx( m, 4, i, j ) * v( 4, i+1, j, k )
-     >           + udy( m, 5, i, j ) * v( 5, i, j+1, k )
-     >           + udx( m, 5, i, j ) * v( 5, i+1, j, k ) )
-            end do
-
-c---------------------------------------------------------------------
-c   diagonal block inversion
-c---------------------------------------------------------------------
-            do m = 1, 5
-               tmat( m, 1 ) = d( m, 1, i, j )
-               tmat( m, 2 ) = d( m, 2, i, j )
-               tmat( m, 3 ) = d( m, 3, i, j )
-               tmat( m, 4 ) = d( m, 4, i, j )
-               tmat( m, 5 ) = d( m, 5, i, j )
-            end do
-
-            tmp1 = 1.0d+00 / tmat( 1, 1 )
-            tmp = tmp1 * tmat( 2, 1 )
-            tmat( 2, 2 ) =  tmat( 2, 2 )
-     >           - tmp * tmat( 1, 2 )
-            tmat( 2, 3 ) =  tmat( 2, 3 )
-     >           - tmp * tmat( 1, 3 )
-            tmat( 2, 4 ) =  tmat( 2, 4 )
-     >           - tmp * tmat( 1, 4 )
-            tmat( 2, 5 ) =  tmat( 2, 5 )
-     >           - tmp * tmat( 1, 5 )
-            tv( 2, i, j ) = tv( 2, i, j )
-     >        - tv( 1, i, j ) * tmp
-
-            tmp = tmp1 * tmat( 3, 1 )
-            tmat( 3, 2 ) =  tmat( 3, 2 )
-     >           - tmp * tmat( 1, 2 )
-            tmat( 3, 3 ) =  tmat( 3, 3 )
-     >           - tmp * tmat( 1, 3 )
-            tmat( 3, 4 ) =  tmat( 3, 4 )
-     >           - tmp * tmat( 1, 4 )
-            tmat( 3, 5 ) =  tmat( 3, 5 )
-     >           - tmp * tmat( 1, 5 )
-            tv( 3, i, j ) = tv( 3, i, j )
-     >        - tv( 1, i, j ) * tmp
-
-            tmp = tmp1 * tmat( 4, 1 )
-            tmat( 4, 2 ) =  tmat( 4, 2 )
-     >           - tmp * tmat( 1, 2 )
-            tmat( 4, 3 ) =  tmat( 4, 3 )
-     >           - tmp * tmat( 1, 3 )
-            tmat( 4, 4 ) =  tmat( 4, 4 )
-     >           - tmp * tmat( 1, 4 )
-            tmat( 4, 5 ) =  tmat( 4, 5 )
-     >           - tmp * tmat( 1, 5 )
-            tv( 4, i, j ) = tv( 4, i, j )
-     >        - tv( 1, i, j ) * tmp
-
-            tmp = tmp1 * tmat( 5, 1 )
-            tmat( 5, 2 ) =  tmat( 5, 2 )
-     >           - tmp * tmat( 1, 2 )
-            tmat( 5, 3 ) =  tmat( 5, 3 )
-     >           - tmp * tmat( 1, 3 )
-            tmat( 5, 4 ) =  tmat( 5, 4 )
-     >           - tmp * tmat( 1, 4 )
-            tmat( 5, 5 ) =  tmat( 5, 5 )
-     >           - tmp * tmat( 1, 5 )
-            tv( 5, i, j ) = tv( 5, i, j )
-     >        - tv( 1, i, j ) * tmp
-
-
-
-            tmp1 = 1.0d+00 / tmat( 2, 2 )
-            tmp = tmp1 * tmat( 3, 2 )
-            tmat( 3, 3 ) =  tmat( 3, 3 )
-     >           - tmp * tmat( 2, 3 )
-            tmat( 3, 4 ) =  tmat( 3, 4 )
-     >           - tmp * tmat( 2, 4 )
-            tmat( 3, 5 ) =  tmat( 3, 5 )
-     >           - tmp * tmat( 2, 5 )
-            tv( 3, i, j ) = tv( 3, i, j )
-     >        - tv( 2, i, j ) * tmp
-
-            tmp = tmp1 * tmat( 4, 2 )
-            tmat( 4, 3 ) =  tmat( 4, 3 )
-     >           - tmp * tmat( 2, 3 )
-            tmat( 4, 4 ) =  tmat( 4, 4 )
-     >           - tmp * tmat( 2, 4 )
-            tmat( 4, 5 ) =  tmat( 4, 5 )
-     >           - tmp * tmat( 2, 5 )
-            tv( 4, i, j ) = tv( 4, i, j )
-     >        - tv( 2, i, j ) * tmp
-
-            tmp = tmp1 * tmat( 5, 2 )
-            tmat( 5, 3 ) =  tmat( 5, 3 )
-     >           - tmp * tmat( 2, 3 )
-            tmat( 5, 4 ) =  tmat( 5, 4 )
-     >           - tmp * tmat( 2, 4 )
-            tmat( 5, 5 ) =  tmat( 5, 5 )
-     >           - tmp * tmat( 2, 5 )
-            tv( 5, i, j ) = tv( 5, i, j )
-     >        - tv( 2, i, j ) * tmp
-
-
-
-            tmp1 = 1.0d+00 / tmat( 3, 3 )
-            tmp = tmp1 * tmat( 4, 3 )
-            tmat( 4, 4 ) =  tmat( 4, 4 )
-     >           - tmp * tmat( 3, 4 )
-            tmat( 4, 5 ) =  tmat( 4, 5 )
-     >           - tmp * tmat( 3, 5 )
-            tv( 4, i, j ) = tv( 4, i, j )
-     >        - tv( 3, i, j ) * tmp
-
-            tmp = tmp1 * tmat( 5, 3 )
-            tmat( 5, 4 ) =  tmat( 5, 4 )
-     >           - tmp * tmat( 3, 4 )
-            tmat( 5, 5 ) =  tmat( 5, 5 )
-     >           - tmp * tmat( 3, 5 )
-            tv( 5, i, j ) = tv( 5, i, j )
-     >        - tv( 3, i, j ) * tmp
-
-
-
-            tmp1 = 1.0d+00 / tmat( 4, 4 )
-            tmp = tmp1 * tmat( 5, 4 )
-            tmat( 5, 5 ) =  tmat( 5, 5 )
-     >           - tmp * tmat( 4, 5 )
-            tv( 5, i, j ) = tv( 5, i, j )
-     >        - tv( 4, i, j ) * tmp
-
-c---------------------------------------------------------------------
-c   back substitution
-c---------------------------------------------------------------------
-            tv( 5, i, j ) = tv( 5, i, j )
-     >                      / tmat( 5, 5 )
-
-            tv( 4, i, j ) = tv( 4, i, j )
-     >           - tmat( 4, 5 ) * tv( 5, i, j )
-            tv( 4, i, j ) = tv( 4, i, j )
-     >                      / tmat( 4, 4 )
-
-            tv( 3, i, j ) = tv( 3, i, j )
-     >           - tmat( 3, 4 ) * tv( 4, i, j )
-     >           - tmat( 3, 5 ) * tv( 5, i, j )
-            tv( 3, i, j ) = tv( 3, i, j )
-     >                      / tmat( 3, 3 )
-
-            tv( 2, i, j ) = tv( 2, i, j )
-     >           - tmat( 2, 3 ) * tv( 3, i, j )
-     >           - tmat( 2, 4 ) * tv( 4, i, j )
-     >           - tmat( 2, 5 ) * tv( 5, i, j )
-            tv( 2, i, j ) = tv( 2, i, j )
-     >                      / tmat( 2, 2 )
-
-            tv( 1, i, j ) = tv( 1, i, j )
-     >           - tmat( 1, 2 ) * tv( 2, i, j )
-     >           - tmat( 1, 3 ) * tv( 3, i, j )
-     >           - tmat( 1, 4 ) * tv( 4, i, j )
-     >           - tmat( 1, 5 ) * tv( 5, i, j )
-            tv( 1, i, j ) = tv( 1, i, j )
-     >                      / tmat( 1, 1 )
-
-            v( 1, i, j, k ) = v( 1, i, j, k ) - tv( 1, i, j )
-            v( 2, i, j, k ) = v( 2, i, j, k ) - tv( 2, i, j )
-            v( 3, i, j, k ) = v( 3, i, j, k ) - tv( 3, i, j )
-            v( 4, i, j, k ) = v( 4, i, j, k ) - tv( 4, i, j )
-            v( 5, i, j, k ) = v( 5, i, j, k ) - tv( 5, i, j )
-
-
-        enddo
-      end do
-
-c---------------------------------------------------------------------
-c   send data to north and west
-c---------------------------------------------------------------------
-      iex = 3
-      call exchange_1( v,k,iex )
- 
-      return
-      end
diff --git a/examples/smpi/NAS/LU/buts_vec.f b/examples/smpi/NAS/LU/buts_vec.f
deleted file mode 100644
index 813105d270..0000000000
--- a/examples/smpi/NAS/LU/buts_vec.f
+++ /dev/null
@@ -1,332 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine buts( ldmx, ldmy, ldmz,
-     >                 nx, ny, nz, k,
-     >                 omega,
-     >                 v, tv,
-     >                 d, udx, udy, udz,
-     >                 ist, iend, jst, jend,
-     >                 nx0, ny0, ipt, jpt )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c   compute the regular-sparse, block upper triangular solution:
-c
-c                     v <-- ( U-inv ) * v
-c
-c---------------------------------------------------------------------
-
-      implicit none
-
-c---------------------------------------------------------------------
-c  input parameters
-c---------------------------------------------------------------------
-      integer ldmx, ldmy, ldmz
-      integer nx, ny, nz
-      integer k
-      double precision  omega
-      double precision  v( 5, -1:ldmx+2, -1:ldmy+2, *), 
-     >        tv(5, ldmx, ldmy),
-     >        d( 5, 5, ldmx, ldmy),
-     >        udx( 5, 5, ldmx, ldmy),
-     >        udy( 5, 5, ldmx, ldmy),
-     >        udz( 5, 5, ldmx, ldmy )
-      integer ist, iend
-      integer jst, jend
-      integer nx0, ny0
-      integer ipt, jpt
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer i, j, m, l, istp, iendp
-      integer iex
-      double precision  tmp, tmp1
-      double precision  tmat(5,5)
-
-
-c---------------------------------------------------------------------
-c   receive data from south and east
-c---------------------------------------------------------------------
-      iex = 1
-      call exchange_1( v,k,iex )
-
-      do j = jend, jst, -1
-         do i = iend, ist, -1
-            do m = 1, 5
-                  tv( m, i, j ) = 
-     >      omega * (  udz( m, 1, i, j ) * v( 1, i, j, k+1 )
-     >               + udz( m, 2, i, j ) * v( 2, i, j, k+1 )
-     >               + udz( m, 3, i, j ) * v( 3, i, j, k+1 )
-     >               + udz( m, 4, i, j ) * v( 4, i, j, k+1 )
-     >               + udz( m, 5, i, j ) * v( 5, i, j, k+1 ) )
-            end do
-         end do
-      end do
-
-
-      do l = iend+jend, ist+jst, -1
-         istp  = max(l - jend, ist)
-         iendp = min(l - jst, iend)
-
-!dir$ ivdep
-         do i = istp, iendp
-            j = l - i
-
-!!dir$ unroll 5
-!   manually unroll the loop
-!            do m = 1, 5
-                  tv( 1, i, j ) = tv( 1, i, j )
-     > + omega * ( udy( 1, 1, i, j ) * v( 1, i, j+1, k )
-     >           + udx( 1, 1, i, j ) * v( 1, i+1, j, k )
-     >           + udy( 1, 2, i, j ) * v( 2, i, j+1, k )
-     >           + udx( 1, 2, i, j ) * v( 2, i+1, j, k )
-     >           + udy( 1, 3, i, j ) * v( 3, i, j+1, k )
-     >           + udx( 1, 3, i, j ) * v( 3, i+1, j, k )
-     >           + udy( 1, 4, i, j ) * v( 4, i, j+1, k )
-     >           + udx( 1, 4, i, j ) * v( 4, i+1, j, k )
-     >           + udy( 1, 5, i, j ) * v( 5, i, j+1, k )
-     >           + udx( 1, 5, i, j ) * v( 5, i+1, j, k ) )
-                  tv( 2, i, j ) = tv( 2, i, j )
-     > + omega * ( udy( 2, 1, i, j ) * v( 1, i, j+1, k )
-     >           + udx( 2, 1, i, j ) * v( 1, i+1, j, k )
-     >           + udy( 2, 2, i, j ) * v( 2, i, j+1, k )
-     >           + udx( 2, 2, i, j ) * v( 2, i+1, j, k )
-     >           + udy( 2, 3, i, j ) * v( 3, i, j+1, k )
-     >           + udx( 2, 3, i, j ) * v( 3, i+1, j, k )
-     >           + udy( 2, 4, i, j ) * v( 4, i, j+1, k )
-     >           + udx( 2, 4, i, j ) * v( 4, i+1, j, k )
-     >           + udy( 2, 5, i, j ) * v( 5, i, j+1, k )
-     >           + udx( 2, 5, i, j ) * v( 5, i+1, j, k ) )
-                  tv( 3, i, j ) = tv( 3, i, j )
-     > + omega * ( udy( 3, 1, i, j ) * v( 1, i, j+1, k )
-     >           + udx( 3, 1, i, j ) * v( 1, i+1, j, k )
-     >           + udy( 3, 2, i, j ) * v( 2, i, j+1, k )
-     >           + udx( 3, 2, i, j ) * v( 2, i+1, j, k )
-     >           + udy( 3, 3, i, j ) * v( 3, i, j+1, k )
-     >           + udx( 3, 3, i, j ) * v( 3, i+1, j, k )
-     >           + udy( 3, 4, i, j ) * v( 4, i, j+1, k )
-     >           + udx( 3, 4, i, j ) * v( 4, i+1, j, k )
-     >           + udy( 3, 5, i, j ) * v( 5, i, j+1, k )
-     >           + udx( 3, 5, i, j ) * v( 5, i+1, j, k ) )
-                  tv( 4, i, j ) = tv( 4, i, j )
-     > + omega * ( udy( 4, 1, i, j ) * v( 1, i, j+1, k )
-     >           + udx( 4, 1, i, j ) * v( 1, i+1, j, k )
-     >           + udy( 4, 2, i, j ) * v( 2, i, j+1, k )
-     >           + udx( 4, 2, i, j ) * v( 2, i+1, j, k )
-     >           + udy( 4, 3, i, j ) * v( 3, i, j+1, k )
-     >           + udx( 4, 3, i, j ) * v( 3, i+1, j, k )
-     >           + udy( 4, 4, i, j ) * v( 4, i, j+1, k )
-     >           + udx( 4, 4, i, j ) * v( 4, i+1, j, k )
-     >           + udy( 4, 5, i, j ) * v( 5, i, j+1, k )
-     >           + udx( 4, 5, i, j ) * v( 5, i+1, j, k ) )
-                  tv( 5, i, j ) = tv( 5, i, j )
-     > + omega * ( udy( 5, 1, i, j ) * v( 1, i, j+1, k )
-     >           + udx( 5, 1, i, j ) * v( 1, i+1, j, k )
-     >           + udy( 5, 2, i, j ) * v( 2, i, j+1, k )
-     >           + udx( 5, 2, i, j ) * v( 2, i+1, j, k )
-     >           + udy( 5, 3, i, j ) * v( 3, i, j+1, k )
-     >           + udx( 5, 3, i, j ) * v( 3, i+1, j, k )
-     >           + udy( 5, 4, i, j ) * v( 4, i, j+1, k )
-     >           + udx( 5, 4, i, j ) * v( 4, i+1, j, k )
-     >           + udy( 5, 5, i, j ) * v( 5, i, j+1, k )
-     >           + udx( 5, 5, i, j ) * v( 5, i+1, j, k ) )
-!            end do
-
-c---------------------------------------------------------------------
-c   diagonal block inversion
-c---------------------------------------------------------------------
-!!dir$ unroll 5
-!   manually unroll the loop
-!            do m = 1, 5
-               tmat( 1, 1 ) = d( 1, 1, i, j )
-               tmat( 1, 2 ) = d( 1, 2, i, j )
-               tmat( 1, 3 ) = d( 1, 3, i, j )
-               tmat( 1, 4 ) = d( 1, 4, i, j )
-               tmat( 1, 5 ) = d( 1, 5, i, j )
-               tmat( 2, 1 ) = d( 2, 1, i, j )
-               tmat( 2, 2 ) = d( 2, 2, i, j )
-               tmat( 2, 3 ) = d( 2, 3, i, j )
-               tmat( 2, 4 ) = d( 2, 4, i, j )
-               tmat( 2, 5 ) = d( 2, 5, i, j )
-               tmat( 3, 1 ) = d( 3, 1, i, j )
-               tmat( 3, 2 ) = d( 3, 2, i, j )
-               tmat( 3, 3 ) = d( 3, 3, i, j )
-               tmat( 3, 4 ) = d( 3, 4, i, j )
-               tmat( 3, 5 ) = d( 3, 5, i, j )
-               tmat( 4, 1 ) = d( 4, 1, i, j )
-               tmat( 4, 2 ) = d( 4, 2, i, j )
-               tmat( 4, 3 ) = d( 4, 3, i, j )
-               tmat( 4, 4 ) = d( 4, 4, i, j )
-               tmat( 4, 5 ) = d( 4, 5, i, j )
-               tmat( 5, 1 ) = d( 5, 1, i, j )
-               tmat( 5, 2 ) = d( 5, 2, i, j )
-               tmat( 5, 3 ) = d( 5, 3, i, j )
-               tmat( 5, 4 ) = d( 5, 4, i, j )
-               tmat( 5, 5 ) = d( 5, 5, i, j )
-!            end do
-
-            tmp1 = 1.0d+00 / tmat( 1, 1 )
-            tmp = tmp1 * tmat( 2, 1 )
-            tmat( 2, 2 ) =  tmat( 2, 2 )
-     >           - tmp * tmat( 1, 2 )
-            tmat( 2, 3 ) =  tmat( 2, 3 )
-     >           - tmp * tmat( 1, 3 )
-            tmat( 2, 4 ) =  tmat( 2, 4 )
-     >           - tmp * tmat( 1, 4 )
-            tmat( 2, 5 ) =  tmat( 2, 5 )
-     >           - tmp * tmat( 1, 5 )
-            tv( 2, i, j ) = tv( 2, i, j )
-     >        - tv( 1, i, j ) * tmp
-
-            tmp = tmp1 * tmat( 3, 1 )
-            tmat( 3, 2 ) =  tmat( 3, 2 )
-     >           - tmp * tmat( 1, 2 )
-            tmat( 3, 3 ) =  tmat( 3, 3 )
-     >           - tmp * tmat( 1, 3 )
-            tmat( 3, 4 ) =  tmat( 3, 4 )
-     >           - tmp * tmat( 1, 4 )
-            tmat( 3, 5 ) =  tmat( 3, 5 )
-     >           - tmp * tmat( 1, 5 )
-            tv( 3, i, j ) = tv( 3, i, j )
-     >        - tv( 1, i, j ) * tmp
-
-            tmp = tmp1 * tmat( 4, 1 )
-            tmat( 4, 2 ) =  tmat( 4, 2 )
-     >           - tmp * tmat( 1, 2 )
-            tmat( 4, 3 ) =  tmat( 4, 3 )
-     >           - tmp * tmat( 1, 3 )
-            tmat( 4, 4 ) =  tmat( 4, 4 )
-     >           - tmp * tmat( 1, 4 )
-            tmat( 4, 5 ) =  tmat( 4, 5 )
-     >           - tmp * tmat( 1, 5 )
-            tv( 4, i, j ) = tv( 4, i, j )
-     >        - tv( 1, i, j ) * tmp
-
-            tmp = tmp1 * tmat( 5, 1 )
-            tmat( 5, 2 ) =  tmat( 5, 2 )
-     >           - tmp * tmat( 1, 2 )
-            tmat( 5, 3 ) =  tmat( 5, 3 )
-     >           - tmp * tmat( 1, 3 )
-            tmat( 5, 4 ) =  tmat( 5, 4 )
-     >           - tmp * tmat( 1, 4 )
-            tmat( 5, 5 ) =  tmat( 5, 5 )
-     >           - tmp * tmat( 1, 5 )
-            tv( 5, i, j ) = tv( 5, i, j )
-     >        - tv( 1, i, j ) * tmp
-
-
-
-            tmp1 = 1.0d+00 / tmat( 2, 2 )
-            tmp = tmp1 * tmat( 3, 2 )
-            tmat( 3, 3 ) =  tmat( 3, 3 )
-     >           - tmp * tmat( 2, 3 )
-            tmat( 3, 4 ) =  tmat( 3, 4 )
-     >           - tmp * tmat( 2, 4 )
-            tmat( 3, 5 ) =  tmat( 3, 5 )
-     >           - tmp * tmat( 2, 5 )
-            tv( 3, i, j ) = tv( 3, i, j )
-     >        - tv( 2, i, j ) * tmp
-
-            tmp = tmp1 * tmat( 4, 2 )
-            tmat( 4, 3 ) =  tmat( 4, 3 )
-     >           - tmp * tmat( 2, 3 )
-            tmat( 4, 4 ) =  tmat( 4, 4 )
-     >           - tmp * tmat( 2, 4 )
-            tmat( 4, 5 ) =  tmat( 4, 5 )
-     >           - tmp * tmat( 2, 5 )
-            tv( 4, i, j ) = tv( 4, i, j )
-     >        - tv( 2, i, j ) * tmp
-
-            tmp = tmp1 * tmat( 5, 2 )
-            tmat( 5, 3 ) =  tmat( 5, 3 )
-     >           - tmp * tmat( 2, 3 )
-            tmat( 5, 4 ) =  tmat( 5, 4 )
-     >           - tmp * tmat( 2, 4 )
-            tmat( 5, 5 ) =  tmat( 5, 5 )
-     >           - tmp * tmat( 2, 5 )
-            tv( 5, i, j ) = tv( 5, i, j )
-     >        - tv( 2, i, j ) * tmp
-
-
-
-            tmp1 = 1.0d+00 / tmat( 3, 3 )
-            tmp = tmp1 * tmat( 4, 3 )
-            tmat( 4, 4 ) =  tmat( 4, 4 )
-     >           - tmp * tmat( 3, 4 )
-            tmat( 4, 5 ) =  tmat( 4, 5 )
-     >           - tmp * tmat( 3, 5 )
-            tv( 4, i, j ) = tv( 4, i, j )
-     >        - tv( 3, i, j ) * tmp
-
-            tmp = tmp1 * tmat( 5, 3 )
-            tmat( 5, 4 ) =  tmat( 5, 4 )
-     >           - tmp * tmat( 3, 4 )
-            tmat( 5, 5 ) =  tmat( 5, 5 )
-     >           - tmp * tmat( 3, 5 )
-            tv( 5, i, j ) = tv( 5, i, j )
-     >        - tv( 3, i, j ) * tmp
-
-
-
-            tmp1 = 1.0d+00 / tmat( 4, 4 )
-            tmp = tmp1 * tmat( 5, 4 )
-            tmat( 5, 5 ) =  tmat( 5, 5 )
-     >           - tmp * tmat( 4, 5 )
-            tv( 5, i, j ) = tv( 5, i, j )
-     >        - tv( 4, i, j ) * tmp
-
-c---------------------------------------------------------------------
-c   back substitution
-c---------------------------------------------------------------------
-            tv( 5, i, j ) = tv( 5, i, j )
-     >                      / tmat( 5, 5 )
-
-            tv( 4, i, j ) = tv( 4, i, j )
-     >           - tmat( 4, 5 ) * tv( 5, i, j )
-            tv( 4, i, j ) = tv( 4, i, j )
-     >                      / tmat( 4, 4 )
-
-            tv( 3, i, j ) = tv( 3, i, j )
-     >           - tmat( 3, 4 ) * tv( 4, i, j )
-     >           - tmat( 3, 5 ) * tv( 5, i, j )
-            tv( 3, i, j ) = tv( 3, i, j )
-     >                      / tmat( 3, 3 )
-
-            tv( 2, i, j ) = tv( 2, i, j )
-     >           - tmat( 2, 3 ) * tv( 3, i, j )
-     >           - tmat( 2, 4 ) * tv( 4, i, j )
-     >           - tmat( 2, 5 ) * tv( 5, i, j )
-            tv( 2, i, j ) = tv( 2, i, j )
-     >                      / tmat( 2, 2 )
-
-            tv( 1, i, j ) = tv( 1, i, j )
-     >           - tmat( 1, 2 ) * tv( 2, i, j )
-     >           - tmat( 1, 3 ) * tv( 3, i, j )
-     >           - tmat( 1, 4 ) * tv( 4, i, j )
-     >           - tmat( 1, 5 ) * tv( 5, i, j )
-            tv( 1, i, j ) = tv( 1, i, j )
-     >                      / tmat( 1, 1 )
-
-            v( 1, i, j, k ) = v( 1, i, j, k ) - tv( 1, i, j )
-            v( 2, i, j, k ) = v( 2, i, j, k ) - tv( 2, i, j )
-            v( 3, i, j, k ) = v( 3, i, j, k ) - tv( 3, i, j )
-            v( 4, i, j, k ) = v( 4, i, j, k ) - tv( 4, i, j )
-            v( 5, i, j, k ) = v( 5, i, j, k ) - tv( 5, i, j )
-
-
-        enddo
-      end do
-
-c---------------------------------------------------------------------
-c   send data to north and west
-c---------------------------------------------------------------------
-      iex = 3
-      call exchange_1( v,k,iex )
- 
-      return
-      end
diff --git a/examples/smpi/NAS/LU/erhs.f b/examples/smpi/NAS/LU/erhs.f
deleted file mode 100644
index 928e2a9f50..0000000000
--- a/examples/smpi/NAS/LU/erhs.f
+++ /dev/null
@@ -1,536 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine erhs
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c   compute the right hand side based on exact solution
-c
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer i, j, k, m
-      integer iglob, jglob
-      integer iex
-      integer L1, L2
-      integer ist1, iend1
-      integer jst1, jend1
-      double precision  dsspm
-      double precision  xi, eta, zeta
-      double precision  q
-      double precision  u21, u31, u41
-      double precision  tmp
-      double precision  u21i, u31i, u41i, u51i
-      double precision  u21j, u31j, u41j, u51j
-      double precision  u21k, u31k, u41k, u51k
-      double precision  u21im1, u31im1, u41im1, u51im1
-      double precision  u21jm1, u31jm1, u41jm1, u51jm1
-      double precision  u21km1, u31km1, u41km1, u51km1
-
-      dsspm = dssp
-
-
-      do k = 1, nz
-         do j = 1, ny
-            do i = 1, nx
-               do m = 1, 5
-                  frct( m, i, j, k ) = 0.0d+00
-               end do
-            end do
-         end do
-      end do
-
-      do k = 1, nz
-         zeta = ( dble(k-1) ) / ( nz - 1 )
-         do j = 1, ny
-            jglob = jpt + j
-            eta = ( dble(jglob-1) ) / ( ny0 - 1 )
-            do i = 1, nx
-               iglob = ipt + i
-               xi = ( dble(iglob-1) ) / ( nx0 - 1 )
-               do m = 1, 5
-                  rsd(m,i,j,k) =  ce(m,1)
-     >                 + ce(m,2) * xi
-     >                 + ce(m,3) * eta
-     >                 + ce(m,4) * zeta
-     >                 + ce(m,5) * xi * xi
-     >                 + ce(m,6) * eta * eta
-     >                 + ce(m,7) * zeta * zeta
-     >                 + ce(m,8) * xi * xi * xi
-     >                 + ce(m,9) * eta * eta * eta
-     >                 + ce(m,10) * zeta * zeta * zeta
-     >                 + ce(m,11) * xi * xi * xi * xi
-     >                 + ce(m,12) * eta * eta * eta * eta
-     >                 + ce(m,13) * zeta * zeta * zeta * zeta
-               end do
-            end do
-         end do
-      end do
-
-c---------------------------------------------------------------------
-c   xi-direction flux differences
-c---------------------------------------------------------------------
-c
-c   iex = flag : iex = 0  north/south communication
-c              : iex = 1  east/west communication
-c
-c---------------------------------------------------------------------
-      iex   = 0
-
-c---------------------------------------------------------------------
-c   communicate and receive/send two rows of data
-c---------------------------------------------------------------------
-      call exchange_3 (rsd,iex)
-
-      L1 = 0
-      if (north.eq.-1) L1 = 1
-      L2 = nx + 1
-      if (south.eq.-1) L2 = nx
-
-      ist1 = 1
-      iend1 = nx
-      if (north.eq.-1) ist1 = 4
-      if (south.eq.-1) iend1 = nx - 3
-
-      do k = 2, nz - 1
-         do j = jst, jend
-            do i = L1, L2
-               flux(1,i,j,k) = rsd(2,i,j,k)
-               u21 = rsd(2,i,j,k) / rsd(1,i,j,k)
-               q = 0.50d+00 * (  rsd(2,i,j,k) * rsd(2,i,j,k)
-     >                         + rsd(3,i,j,k) * rsd(3,i,j,k)
-     >                         + rsd(4,i,j,k) * rsd(4,i,j,k) )
-     >                      / rsd(1,i,j,k)
-               flux(2,i,j,k) = rsd(2,i,j,k) * u21 + c2 * 
-     >                         ( rsd(5,i,j,k) - q )
-               flux(3,i,j,k) = rsd(3,i,j,k) * u21
-               flux(4,i,j,k) = rsd(4,i,j,k) * u21
-               flux(5,i,j,k) = ( c1 * rsd(5,i,j,k) - c2 * q ) * u21
-            end do
-         end do
-      end do 
-
-      do k = 2, nz - 1
-         do j = jst, jend
-            do i = ist, iend
-               do m = 1, 5
-                  frct(m,i,j,k) =  frct(m,i,j,k)
-     >                   - tx2 * ( flux(m,i+1,j,k) - flux(m,i-1,j,k) )
-               end do
-            end do
-            do i = ist, L2
-               tmp = 1.0d+00 / rsd(1,i,j,k)
-
-               u21i = tmp * rsd(2,i,j,k)
-               u31i = tmp * rsd(3,i,j,k)
-               u41i = tmp * rsd(4,i,j,k)
-               u51i = tmp * rsd(5,i,j,k)
-
-               tmp = 1.0d+00 / rsd(1,i-1,j,k)
-
-               u21im1 = tmp * rsd(2,i-1,j,k)
-               u31im1 = tmp * rsd(3,i-1,j,k)
-               u41im1 = tmp * rsd(4,i-1,j,k)
-               u51im1 = tmp * rsd(5,i-1,j,k)
-
-               flux(2,i,j,k) = (4.0d+00/3.0d+00) * tx3 * 
-     >                        ( u21i - u21im1 )
-               flux(3,i,j,k) = tx3 * ( u31i - u31im1 )
-               flux(4,i,j,k) = tx3 * ( u41i - u41im1 )
-               flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
-     >              * tx3 * ( ( u21i  **2 + u31i  **2 + u41i  **2 )
-     >                      - ( u21im1**2 + u31im1**2 + u41im1**2 ) )
-     >              + (1.0d+00/6.0d+00)
-     >              * tx3 * ( u21i**2 - u21im1**2 )
-     >              + c1 * c5 * tx3 * ( u51i - u51im1 )
-            end do
-
-            do i = ist, iend
-               frct(1,i,j,k) = frct(1,i,j,k)
-     >              + dx1 * tx1 * (            rsd(1,i-1,j,k)
-     >                             - 2.0d+00 * rsd(1,i,j,k)
-     >                             +           rsd(1,i+1,j,k) )
-               frct(2,i,j,k) = frct(2,i,j,k)
-     >           + tx3 * c3 * c4 * ( flux(2,i+1,j,k) - flux(2,i,j,k) )
-     >              + dx2 * tx1 * (            rsd(2,i-1,j,k)
-     >                             - 2.0d+00 * rsd(2,i,j,k)
-     >                             +           rsd(2,i+1,j,k) )
-               frct(3,i,j,k) = frct(3,i,j,k)
-     >           + tx3 * c3 * c4 * ( flux(3,i+1,j,k) - flux(3,i,j,k) )
-     >              + dx3 * tx1 * (            rsd(3,i-1,j,k)
-     >                             - 2.0d+00 * rsd(3,i,j,k)
-     >                             +           rsd(3,i+1,j,k) )
-               frct(4,i,j,k) = frct(4,i,j,k)
-     >            + tx3 * c3 * c4 * ( flux(4,i+1,j,k) - flux(4,i,j,k) )
-     >              + dx4 * tx1 * (            rsd(4,i-1,j,k)
-     >                             - 2.0d+00 * rsd(4,i,j,k)
-     >                             +           rsd(4,i+1,j,k) )
-               frct(5,i,j,k) = frct(5,i,j,k)
-     >           + tx3 * c3 * c4 * ( flux(5,i+1,j,k) - flux(5,i,j,k) )
-     >              + dx5 * tx1 * (            rsd(5,i-1,j,k)
-     >                             - 2.0d+00 * rsd(5,i,j,k)
-     >                             +           rsd(5,i+1,j,k) )
-            end do
-
-c---------------------------------------------------------------------
-c   Fourth-order dissipation
-c---------------------------------------------------------------------
-            IF (north.eq.-1) then
-             do m = 1, 5
-               frct(m,2,j,k) = frct(m,2,j,k)
-     >           - dsspm * ( + 5.0d+00 * rsd(m,2,j,k)
-     >                       - 4.0d+00 * rsd(m,3,j,k)
-     >                       +           rsd(m,4,j,k) )
-               frct(m,3,j,k) = frct(m,3,j,k)
-     >           - dsspm * ( - 4.0d+00 * rsd(m,2,j,k)
-     >                       + 6.0d+00 * rsd(m,3,j,k)
-     >                       - 4.0d+00 * rsd(m,4,j,k)
-     >                       +           rsd(m,5,j,k) )
-             end do
-            END IF
-
-            do i = ist1,iend1
-               do m = 1, 5
-                  frct(m,i,j,k) = frct(m,i,j,k)
-     >              - dsspm * (            rsd(m,i-2,j,k)
-     >                         - 4.0d+00 * rsd(m,i-1,j,k)
-     >                         + 6.0d+00 * rsd(m,i,j,k)
-     >                         - 4.0d+00 * rsd(m,i+1,j,k)
-     >                         +           rsd(m,i+2,j,k) )
-               end do
-            end do
-
-            IF (south.eq.-1) then
-             do m = 1, 5
-               frct(m,nx-2,j,k) = frct(m,nx-2,j,k)
-     >           - dsspm * (             rsd(m,nx-4,j,k)
-     >                       - 4.0d+00 * rsd(m,nx-3,j,k)
-     >                       + 6.0d+00 * rsd(m,nx-2,j,k)
-     >                       - 4.0d+00 * rsd(m,nx-1,j,k)  )
-               frct(m,nx-1,j,k) = frct(m,nx-1,j,k)
-     >           - dsspm * (             rsd(m,nx-3,j,k)
-     >                       - 4.0d+00 * rsd(m,nx-2,j,k)
-     >                       + 5.0d+00 * rsd(m,nx-1,j,k) )
-             end do
-            END IF
-
-         end do
-      end do
-
-c---------------------------------------------------------------------
-c   eta-direction flux differences
-c---------------------------------------------------------------------
-c
-c   iex = flag : iex = 0  north/south communication
-c              : iex = 1  east/west communication
-c
-c---------------------------------------------------------------------
-      iex   = 1
-
-c---------------------------------------------------------------------
-c   communicate and receive/send two rows of data
-c---------------------------------------------------------------------
-      call exchange_3 (rsd,iex)
-
-      L1 = 0
-      if (west.eq.-1) L1 = 1
-      L2 = ny + 1
-      if (east.eq.-1) L2 = ny
-
-      jst1 = 1
-      jend1 = ny
-      if (west.eq.-1) jst1 = 4
-      if (east.eq.-1) jend1 = ny - 3
-
-      do k = 2, nz - 1
-         do j = L1, L2
-            do i = ist, iend
-               flux(1,i,j,k) = rsd(3,i,j,k)
-               u31 = rsd(3,i,j,k) / rsd(1,i,j,k)
-               q = 0.50d+00 * (  rsd(2,i,j,k) * rsd(2,i,j,k)
-     >                         + rsd(3,i,j,k) * rsd(3,i,j,k)
-     >                         + rsd(4,i,j,k) * rsd(4,i,j,k) )
-     >                      / rsd(1,i,j,k)
-               flux(2,i,j,k) = rsd(2,i,j,k) * u31 
-               flux(3,i,j,k) = rsd(3,i,j,k) * u31 + c2 * 
-     >                       ( rsd(5,i,j,k) - q )
-               flux(4,i,j,k) = rsd(4,i,j,k) * u31
-               flux(5,i,j,k) = ( c1 * rsd(5,i,j,k) - c2 * q ) * u31
-            end do
-         end do
-      end do
-
-      do k = 2, nz - 1
-         do i = ist, iend
-            do j = jst, jend
-               do m = 1, 5
-                  frct(m,i,j,k) =  frct(m,i,j,k)
-     >                 - ty2 * ( flux(m,i,j+1,k) - flux(m,i,j-1,k) )
-               end do
-            end do
-         end do
-
-         do j = jst, L2
-            do i = ist, iend
-               tmp = 1.0d+00 / rsd(1,i,j,k)
-
-               u21j = tmp * rsd(2,i,j,k)
-               u31j = tmp * rsd(3,i,j,k)
-               u41j = tmp * rsd(4,i,j,k)
-               u51j = tmp * rsd(5,i,j,k)
-
-               tmp = 1.0d+00 / rsd(1,i,j-1,k)
-
-               u21jm1 = tmp * rsd(2,i,j-1,k)
-               u31jm1 = tmp * rsd(3,i,j-1,k)
-               u41jm1 = tmp * rsd(4,i,j-1,k)
-               u51jm1 = tmp * rsd(5,i,j-1,k)
-
-               flux(2,i,j,k) = ty3 * ( u21j - u21jm1 )
-               flux(3,i,j,k) = (4.0d+00/3.0d+00) * ty3 * 
-     >                       ( u31j - u31jm1 )
-               flux(4,i,j,k) = ty3 * ( u41j - u41jm1 )
-               flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
-     >              * ty3 * ( ( u21j  **2 + u31j  **2 + u41j  **2 )
-     >                      - ( u21jm1**2 + u31jm1**2 + u41jm1**2 ) )
-     >              + (1.0d+00/6.0d+00)
-     >              * ty3 * ( u31j**2 - u31jm1**2 )
-     >              + c1 * c5 * ty3 * ( u51j - u51jm1 )
-            end do
-         end do
-
-         do j = jst, jend
-            do i = ist, iend
-               frct(1,i,j,k) = frct(1,i,j,k)
-     >              + dy1 * ty1 * (            rsd(1,i,j-1,k)
-     >                             - 2.0d+00 * rsd(1,i,j,k)
-     >                             +           rsd(1,i,j+1,k) )
-               frct(2,i,j,k) = frct(2,i,j,k)
-     >          + ty3 * c3 * c4 * ( flux(2,i,j+1,k) - flux(2,i,j,k) )
-     >              + dy2 * ty1 * (            rsd(2,i,j-1,k)
-     >                             - 2.0d+00 * rsd(2,i,j,k)
-     >                             +           rsd(2,i,j+1,k) )
-               frct(3,i,j,k) = frct(3,i,j,k)
-     >          + ty3 * c3 * c4 * ( flux(3,i,j+1,k) - flux(3,i,j,k) )
-     >              + dy3 * ty1 * (            rsd(3,i,j-1,k)
-     >                             - 2.0d+00 * rsd(3,i,j,k)
-     >                             +           rsd(3,i,j+1,k) )
-               frct(4,i,j,k) = frct(4,i,j,k)
-     >          + ty3 * c3 * c4 * ( flux(4,i,j+1,k) - flux(4,i,j,k) )
-     >              + dy4 * ty1 * (            rsd(4,i,j-1,k)
-     >                             - 2.0d+00 * rsd(4,i,j,k)
-     >                             +           rsd(4,i,j+1,k) )
-               frct(5,i,j,k) = frct(5,i,j,k)
-     >          + ty3 * c3 * c4 * ( flux(5,i,j+1,k) - flux(5,i,j,k) )
-     >              + dy5 * ty1 * (            rsd(5,i,j-1,k)
-     >                             - 2.0d+00 * rsd(5,i,j,k)
-     >                             +           rsd(5,i,j+1,k) )
-            end do
-         end do
-
-c---------------------------------------------------------------------
-c   fourth-order dissipation
-c---------------------------------------------------------------------
-         IF (west.eq.-1) then
-            do i = ist, iend
-             do m = 1, 5
-               frct(m,i,2,k) = frct(m,i,2,k)
-     >           - dsspm * ( + 5.0d+00 * rsd(m,i,2,k)
-     >                       - 4.0d+00 * rsd(m,i,3,k)
-     >                       +           rsd(m,i,4,k) )
-               frct(m,i,3,k) = frct(m,i,3,k)
-     >           - dsspm * ( - 4.0d+00 * rsd(m,i,2,k)
-     >                       + 6.0d+00 * rsd(m,i,3,k)
-     >                       - 4.0d+00 * rsd(m,i,4,k)
-     >                       +           rsd(m,i,5,k) )
-             end do
-            end do
-         END IF
-
-         do j = jst1, jend1
-            do i = ist, iend
-               do m = 1, 5
-                  frct(m,i,j,k) = frct(m,i,j,k)
-     >              - dsspm * (            rsd(m,i,j-2,k)
-     >                        - 4.0d+00 * rsd(m,i,j-1,k)
-     >                        + 6.0d+00 * rsd(m,i,j,k)
-     >                        - 4.0d+00 * rsd(m,i,j+1,k)
-     >                        +           rsd(m,i,j+2,k) )
-               end do
-            end do
-         end do
-
-         IF (east.eq.-1) then
-            do i = ist, iend
-             do m = 1, 5
-               frct(m,i,ny-2,k) = frct(m,i,ny-2,k)
-     >           - dsspm * (             rsd(m,i,ny-4,k)
-     >                       - 4.0d+00 * rsd(m,i,ny-3,k)
-     >                       + 6.0d+00 * rsd(m,i,ny-2,k)
-     >                       - 4.0d+00 * rsd(m,i,ny-1,k)  )
-               frct(m,i,ny-1,k) = frct(m,i,ny-1,k)
-     >           - dsspm * (             rsd(m,i,ny-3,k)
-     >                       - 4.0d+00 * rsd(m,i,ny-2,k)
-     >                       + 5.0d+00 * rsd(m,i,ny-1,k)  )
-             end do
-            end do
-         END IF
-
-      end do
-
-c---------------------------------------------------------------------
-c   zeta-direction flux differences
-c---------------------------------------------------------------------
-      do k = 1, nz
-         do j = jst, jend
-            do i = ist, iend
-               flux(1,i,j,k) = rsd(4,i,j,k)
-               u41 = rsd(4,i,j,k) / rsd(1,i,j,k)
-               q = 0.50d+00 * (  rsd(2,i,j,k) * rsd(2,i,j,k)
-     >                         + rsd(3,i,j,k) * rsd(3,i,j,k)
-     >                         + rsd(4,i,j,k) * rsd(4,i,j,k) )
-     >                      / rsd(1,i,j,k)
-               flux(2,i,j,k) = rsd(2,i,j,k) * u41 
-               flux(3,i,j,k) = rsd(3,i,j,k) * u41 
-               flux(4,i,j,k) = rsd(4,i,j,k) * u41 + c2 * 
-     >                         ( rsd(5,i,j,k) - q )
-               flux(5,i,j,k) = ( c1 * rsd(5,i,j,k) - c2 * q ) * u41
-            end do
-         end do
-      end do
-
-      do k = 2, nz - 1
-         do j = jst, jend
-            do i = ist, iend
-               do m = 1, 5
-                  frct(m,i,j,k) =  frct(m,i,j,k)
-     >                  - tz2 * ( flux(m,i,j,k+1) - flux(m,i,j,k-1) )
-               end do
-            end do
-         end do
-      end do
-
-      do k = 2, nz
-         do j = jst, jend
-            do i = ist, iend
-               tmp = 1.0d+00 / rsd(1,i,j,k)
-
-               u21k = tmp * rsd(2,i,j,k)
-               u31k = tmp * rsd(3,i,j,k)
-               u41k = tmp * rsd(4,i,j,k)
-               u51k = tmp * rsd(5,i,j,k)
-
-               tmp = 1.0d+00 / rsd(1,i,j,k-1)
-
-               u21km1 = tmp * rsd(2,i,j,k-1)
-               u31km1 = tmp * rsd(3,i,j,k-1)
-               u41km1 = tmp * rsd(4,i,j,k-1)
-               u51km1 = tmp * rsd(5,i,j,k-1)
-
-               flux(2,i,j,k) = tz3 * ( u21k - u21km1 )
-               flux(3,i,j,k) = tz3 * ( u31k - u31km1 )
-               flux(4,i,j,k) = (4.0d+00/3.0d+00) * tz3 * ( u41k 
-     >                       - u41km1 )
-               flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
-     >              * tz3 * ( ( u21k  **2 + u31k  **2 + u41k  **2 )
-     >                      - ( u21km1**2 + u31km1**2 + u41km1**2 ) )
-     >              + (1.0d+00/6.0d+00)
-     >              * tz3 * ( u41k**2 - u41km1**2 )
-     >              + c1 * c5 * tz3 * ( u51k - u51km1 )
-            end do
-         end do
-      end do
-
-      do k = 2, nz - 1
-         do j = jst, jend
-            do i = ist, iend
-               frct(1,i,j,k) = frct(1,i,j,k)
-     >              + dz1 * tz1 * (            rsd(1,i,j,k+1)
-     >                             - 2.0d+00 * rsd(1,i,j,k)
-     >                             +           rsd(1,i,j,k-1) )
-               frct(2,i,j,k) = frct(2,i,j,k)
-     >          + tz3 * c3 * c4 * ( flux(2,i,j,k+1) - flux(2,i,j,k) )
-     >              + dz2 * tz1 * (            rsd(2,i,j,k+1)
-     >                             - 2.0d+00 * rsd(2,i,j,k)
-     >                             +           rsd(2,i,j,k-1) )
-               frct(3,i,j,k) = frct(3,i,j,k)
-     >          + tz3 * c3 * c4 * ( flux(3,i,j,k+1) - flux(3,i,j,k) )
-     >              + dz3 * tz1 * (            rsd(3,i,j,k+1)
-     >                             - 2.0d+00 * rsd(3,i,j,k)
-     >                             +           rsd(3,i,j,k-1) )
-               frct(4,i,j,k) = frct(4,i,j,k)
-     >          + tz3 * c3 * c4 * ( flux(4,i,j,k+1) - flux(4,i,j,k) )
-     >              + dz4 * tz1 * (            rsd(4,i,j,k+1)
-     >                             - 2.0d+00 * rsd(4,i,j,k)
-     >                             +           rsd(4,i,j,k-1) )
-               frct(5,i,j,k) = frct(5,i,j,k)
-     >          + tz3 * c3 * c4 * ( flux(5,i,j,k+1) - flux(5,i,j,k) )
-     >              + dz5 * tz1 * (            rsd(5,i,j,k+1)
-     >                             - 2.0d+00 * rsd(5,i,j,k)
-     >                             +           rsd(5,i,j,k-1) )
-            end do
-         end do
-      end do
-
-c---------------------------------------------------------------------
-c   fourth-order dissipation
-c---------------------------------------------------------------------
-      do j = jst, jend
-         do i = ist, iend
-            do m = 1, 5
-               frct(m,i,j,2) = frct(m,i,j,2)
-     >           - dsspm * ( + 5.0d+00 * rsd(m,i,j,2)
-     >                       - 4.0d+00 * rsd(m,i,j,3)
-     >                       +           rsd(m,i,j,4) )
-               frct(m,i,j,3) = frct(m,i,j,3)
-     >           - dsspm * (- 4.0d+00 * rsd(m,i,j,2)
-     >                      + 6.0d+00 * rsd(m,i,j,3)
-     >                      - 4.0d+00 * rsd(m,i,j,4)
-     >                      +           rsd(m,i,j,5) )
-            end do
-         end do
-      end do
-
-      do k = 4, nz - 3
-         do j = jst, jend
-            do i = ist, iend
-               do m = 1, 5
-                  frct(m,i,j,k) = frct(m,i,j,k)
-     >              - dsspm * (           rsd(m,i,j,k-2)
-     >                        - 4.0d+00 * rsd(m,i,j,k-1)
-     >                        + 6.0d+00 * rsd(m,i,j,k)
-     >                        - 4.0d+00 * rsd(m,i,j,k+1)
-     >                        +           rsd(m,i,j,k+2) )
-               end do
-            end do
-         end do
-      end do
-
-      do j = jst, jend
-         do i = ist, iend
-            do m = 1, 5
-               frct(m,i,j,nz-2) = frct(m,i,j,nz-2)
-     >           - dsspm * (            rsd(m,i,j,nz-4)
-     >                      - 4.0d+00 * rsd(m,i,j,nz-3)
-     >                      + 6.0d+00 * rsd(m,i,j,nz-2)
-     >                      - 4.0d+00 * rsd(m,i,j,nz-1)  )
-               frct(m,i,j,nz-1) = frct(m,i,j,nz-1)
-     >           - dsspm * (             rsd(m,i,j,nz-3)
-     >                       - 4.0d+00 * rsd(m,i,j,nz-2)
-     >                       + 5.0d+00 * rsd(m,i,j,nz-1)  )
-            end do
-         end do
-      end do
-
-      return
-      end
diff --git a/examples/smpi/NAS/LU/error.f b/examples/smpi/NAS/LU/error.f
deleted file mode 100644
index e83f74912f..0000000000
--- a/examples/smpi/NAS/LU/error.f
+++ /dev/null
@@ -1,81 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine error
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c   compute the solution error
-c
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer i, j, k, m
-      integer iglob, jglob
-      double precision  tmp
-      double precision  u000ijk(5), dummy(5)
-
-      integer IERROR
-
-
-      do m = 1, 5
-         errnm(m) = 0.0d+00
-         dummy(m) = 0.0d+00
-      end do
-
-      do k = 2, nz-1
-         do j = jst, jend
-            jglob = jpt + j
-            do i = ist, iend
-               iglob = ipt + i
-               call exact( iglob, jglob, k, u000ijk )
-               do m = 1, 5
-                  tmp = ( u000ijk(m) - u(m,i,j,k) )
-                  dummy(m) = dummy(m) + tmp ** 2
-               end do
-            end do
-         end do
-      end do
-
-c---------------------------------------------------------------------
-c   compute the global sum of individual contributions to dot product.
-c---------------------------------------------------------------------
-      call MPI_ALLREDUCE( dummy,
-     >                    errnm,
-     >                    5,
-     >                    dp_type,
-     >                    MPI_SUM,
-     >                    MPI_COMM_WORLD,
-     >                    IERROR )
-
-      do m = 1, 5
-         errnm(m) = sqrt ( errnm(m) / ( (nx0-2)*(ny0-2)*(nz0-2) ) )
-      end do
-
-c      if (id.eq.0) then
-c        write (*,1002) ( errnm(m), m = 1, 5 )
-c      end if
-
- 1002 format (1x/1x,'RMS-norm of error in soln. to ',
-     > 'first pde  = ',1pe12.5/,
-     > 1x,'RMS-norm of error in soln. to ',
-     > 'second pde = ',1pe12.5/,
-     > 1x,'RMS-norm of error in soln. to ',
-     > 'third pde  = ',1pe12.5/,
-     > 1x,'RMS-norm of error in soln. to ',
-     > 'fourth pde = ',1pe12.5/,
-     > 1x,'RMS-norm of error in soln. to ',
-     > 'fifth pde  = ',1pe12.5)
-
-      return
-      end
diff --git a/examples/smpi/NAS/LU/exact.f b/examples/smpi/NAS/LU/exact.f
deleted file mode 100644
index 19e14c3d5f..0000000000
--- a/examples/smpi/NAS/LU/exact.f
+++ /dev/null
@@ -1,53 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine exact( i, j, k, u000ijk )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c   compute the exact solution at (i,j,k)
-c
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c  input parameters
-c---------------------------------------------------------------------
-      integer i, j, k
-      double precision u000ijk(*)
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer m
-      double precision xi, eta, zeta
-
-      xi  = ( dble ( i - 1 ) ) / ( nx0 - 1 )
-      eta  = ( dble ( j - 1 ) ) / ( ny0 - 1 )
-      zeta = ( dble ( k - 1 ) ) / ( nz - 1 )
-
-
-      do m = 1, 5
-         u000ijk(m) =  ce(m,1)
-     >        + ce(m,2) * xi
-     >        + ce(m,3) * eta
-     >        + ce(m,4) * zeta
-     >        + ce(m,5) * xi * xi
-     >        + ce(m,6) * eta * eta
-     >        + ce(m,7) * zeta * zeta
-     >        + ce(m,8) * xi * xi * xi
-     >        + ce(m,9) * eta * eta * eta
-     >        + ce(m,10) * zeta * zeta * zeta
-     >        + ce(m,11) * xi * xi * xi * xi
-     >        + ce(m,12) * eta * eta * eta * eta
-     >        + ce(m,13) * zeta * zeta * zeta * zeta
-      end do
-
-      return
-      end
diff --git a/examples/smpi/NAS/LU/exchange_1.f b/examples/smpi/NAS/LU/exchange_1.f
deleted file mode 100644
index 2bf7d28b94..0000000000
--- a/examples/smpi/NAS/LU/exchange_1.f
+++ /dev/null
@@ -1,180 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine exchange_1( g,k,iex )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-
-      implicit none
-
-      include 'mpinpb.h'
-      include 'applu.incl'
-
-      double precision  g(5,-1:isiz1+2,-1:isiz2+2,isiz3)
-      integer k
-      integer iex
-      integer i, j
-      double precision dum(5,isiz1+isiz2), dum1(5,isiz1+isiz2)
-
-      integer STATUS(MPI_STATUS_SIZE)
-      integer IERROR
-
-
-
-      if( iex .eq. 0 ) then
-
-          if( north .ne. -1 ) then
-              call MPI_RECV( dum1(1,jst),
-     >                       5*(jend-jst+1),
-     >                       dp_type,
-     >                       north,
-     >                       from_n,
-     >                       MPI_COMM_WORLD,
-     >                       status,
-     >                       IERROR )
-              do j=jst,jend
-                  g(1,0,j,k) = dum1(1,j)
-                  g(2,0,j,k) = dum1(2,j)
-                  g(3,0,j,k) = dum1(3,j)
-                  g(4,0,j,k) = dum1(4,j)
-                  g(5,0,j,k) = dum1(5,j)
-              enddo
-          endif
-
-          if( west .ne. -1 ) then
-              call MPI_RECV( dum1(1,ist),
-     >                       5*(iend-ist+1),
-     >                       dp_type,
-     >                       west,
-     >                       from_w,
-     >                       MPI_COMM_WORLD,
-     >                       status,
-     >                       IERROR )
-              do i=ist,iend
-                  g(1,i,0,k) = dum1(1,i)
-                  g(2,i,0,k) = dum1(2,i)
-                  g(3,i,0,k) = dum1(3,i)
-                  g(4,i,0,k) = dum1(4,i)
-                  g(5,i,0,k) = dum1(5,i)
-              enddo
-          endif
-
-      else if( iex .eq. 1 ) then
-
-          if( south .ne. -1 ) then
-              call MPI_RECV( dum1(1,jst),
-     >                       5*(jend-jst+1),
-     >                       dp_type,
-     >                       south,
-     >                       from_s,
-     >                       MPI_COMM_WORLD,
-     >                       status,
-     >                       IERROR )
-              do j=jst,jend
-                  g(1,nx+1,j,k) = dum1(1,j)
-                  g(2,nx+1,j,k) = dum1(2,j)
-                  g(3,nx+1,j,k) = dum1(3,j)
-                  g(4,nx+1,j,k) = dum1(4,j)
-                  g(5,nx+1,j,k) = dum1(5,j)
-              enddo
-          endif
-
-          if( east .ne. -1 ) then
-              call MPI_RECV( dum1(1,ist),
-     >                       5*(iend-ist+1),
-     >                       dp_type,
-     >                       east,
-     >                       from_e,
-     >                       MPI_COMM_WORLD,
-     >                       status,
-     >                       IERROR )
-              do i=ist,iend
-                  g(1,i,ny+1,k) = dum1(1,i)
-                  g(2,i,ny+1,k) = dum1(2,i)
-                  g(3,i,ny+1,k) = dum1(3,i)
-                  g(4,i,ny+1,k) = dum1(4,i)
-                  g(5,i,ny+1,k) = dum1(5,i)
-              enddo
-          endif
-
-      else if( iex .eq. 2 ) then
-
-          if( south .ne. -1 ) then
-              do j=jst,jend
-                  dum(1,j) = g(1,nx,j,k) 
-                  dum(2,j) = g(2,nx,j,k) 
-                  dum(3,j) = g(3,nx,j,k) 
-                  dum(4,j) = g(4,nx,j,k) 
-                  dum(5,j) = g(5,nx,j,k) 
-              enddo
-              call MPI_SEND( dum(1,jst), 
-     >                       5*(jend-jst+1), 
-     >                       dp_type, 
-     >                       south, 
-     >                       from_n, 
-     >                       MPI_COMM_WORLD, 
-     >                       IERROR )
-          endif
-
-          if( east .ne. -1 ) then
-              do i=ist,iend
-                  dum(1,i) = g(1,i,ny,k)
-                  dum(2,i) = g(2,i,ny,k)
-                  dum(3,i) = g(3,i,ny,k)
-                  dum(4,i) = g(4,i,ny,k)
-                  dum(5,i) = g(5,i,ny,k)
-              enddo
-              call MPI_SEND( dum(1,ist), 
-     >                       5*(iend-ist+1), 
-     >                       dp_type, 
-     >                       east, 
-     >                       from_w, 
-     >                       MPI_COMM_WORLD, 
-     >                       IERROR )
-          endif
-
-      else
-
-          if( north .ne. -1 ) then
-              do j=jst,jend
-                  dum(1,j) = g(1,1,j,k)
-                  dum(2,j) = g(2,1,j,k)
-                  dum(3,j) = g(3,1,j,k)
-                  dum(4,j) = g(4,1,j,k)
-                  dum(5,j) = g(5,1,j,k)
-              enddo
-              call MPI_SEND( dum(1,jst), 
-     >                       5*(jend-jst+1), 
-     >                       dp_type, 
-     >                       north, 
-     >                       from_s, 
-     >                       MPI_COMM_WORLD, 
-     >                       IERROR )
-          endif
-
-          if( west .ne. -1 ) then
-              do i=ist,iend
-                  dum(1,i) = g(1,i,1,k)
-                  dum(2,i) = g(2,i,1,k)
-                  dum(3,i) = g(3,i,1,k)
-                  dum(4,i) = g(4,i,1,k)
-                  dum(5,i) = g(5,i,1,k)
-              enddo
-              call MPI_SEND( dum(1,ist), 
-     >                       5*(iend-ist+1), 
-     >                       dp_type, 
-     >                       west, 
-     >                       from_e, 
-     >                       MPI_COMM_WORLD, 
-     >                       IERROR )
-          endif
-
-      endif
-
-      end
-
-
-
diff --git a/examples/smpi/NAS/LU/exchange_3.f b/examples/smpi/NAS/LU/exchange_3.f
deleted file mode 100644
index d52ae7ef4d..0000000000
--- a/examples/smpi/NAS/LU/exchange_3.f
+++ /dev/null
@@ -1,312 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine exchange_3(g,iex)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   compute the right hand side based on exact solution
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c  input parameters
-c---------------------------------------------------------------------
-      double precision  g(5,-1:isiz1+2,-1:isiz2+2,isiz3)
-      integer iex
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer i, j, k
-      integer ipos1, ipos2
-
-      integer mid
-      integer STATUS(MPI_STATUS_SIZE)
-      integer IERROR
-
-
-
-      if (iex.eq.0) then
-c---------------------------------------------------------------------
-c   communicate in the south and north directions
-c---------------------------------------------------------------------
-      if (north.ne.-1) then
-          call MPI_IRECV( buf1,
-     >                    10*ny*nz,
-     >                    dp_type,
-     >                    MPI_ANY_SOURCE,
-     >                    from_n,
-     >                    MPI_COMM_WORLD,
-     >                    mid,
-     >                    IERROR )
-      end if
-
-c---------------------------------------------------------------------
-c   send south
-c---------------------------------------------------------------------
-      if (south.ne.-1) then
-          do k = 1,nz
-            do j = 1,ny
-              ipos1 = (k-1)*ny + j
-              ipos2 = ipos1 + ny*nz
-              buf(1,ipos1) = g(1,nx-1,j,k) 
-              buf(2,ipos1) = g(2,nx-1,j,k) 
-              buf(3,ipos1) = g(3,nx-1,j,k) 
-              buf(4,ipos1) = g(4,nx-1,j,k) 
-              buf(5,ipos1) = g(5,nx-1,j,k) 
-              buf(1,ipos2) = g(1,nx,j,k)
-              buf(2,ipos2) = g(2,nx,j,k)
-              buf(3,ipos2) = g(3,nx,j,k)
-              buf(4,ipos2) = g(4,nx,j,k)
-              buf(5,ipos2) = g(5,nx,j,k)
-            end do
-          end do
-
-          call MPI_SEND( buf,
-     >                   10*ny*nz,
-     >                   dp_type,
-     >                   south,
-     >                   from_n,
-     >                   MPI_COMM_WORLD,
-     >                   IERROR )
-        end if
-
-c---------------------------------------------------------------------
-c   receive from north
-c---------------------------------------------------------------------
-        if (north.ne.-1) then
-          call MPI_WAIT( mid, STATUS, IERROR )
-
-          do k = 1,nz
-            do j = 1,ny
-              ipos1 = (k-1)*ny + j
-              ipos2 = ipos1 + ny*nz
-              g(1,-1,j,k) = buf1(1,ipos1)
-              g(2,-1,j,k) = buf1(2,ipos1)
-              g(3,-1,j,k) = buf1(3,ipos1)
-              g(4,-1,j,k) = buf1(4,ipos1)
-              g(5,-1,j,k) = buf1(5,ipos1)
-              g(1,0,j,k) = buf1(1,ipos2)
-              g(2,0,j,k) = buf1(2,ipos2)
-              g(3,0,j,k) = buf1(3,ipos2)
-              g(4,0,j,k) = buf1(4,ipos2)
-              g(5,0,j,k) = buf1(5,ipos2)
-            end do
-          end do
-
-        end if
-
-      if (south.ne.-1) then
-          call MPI_IRECV( buf1,
-     >                    10*ny*nz,
-     >                    dp_type,
-     >                    MPI_ANY_SOURCE,
-     >                    from_s,
-     >                    MPI_COMM_WORLD,
-     >                    mid,
-     >                    IERROR )
-      end if
-
-c---------------------------------------------------------------------
-c   send north
-c---------------------------------------------------------------------
-        if (north.ne.-1) then
-          do k = 1,nz
-            do j = 1,ny
-              ipos1 = (k-1)*ny + j
-              ipos2 = ipos1 + ny*nz
-              buf(1,ipos1) = g(1,2,j,k)
-              buf(2,ipos1) = g(2,2,j,k)
-              buf(3,ipos1) = g(3,2,j,k)
-              buf(4,ipos1) = g(4,2,j,k)
-              buf(5,ipos1) = g(5,2,j,k)
-              buf(1,ipos2) = g(1,1,j,k)
-              buf(2,ipos2) = g(2,1,j,k)
-              buf(3,ipos2) = g(3,1,j,k)
-              buf(4,ipos2) = g(4,1,j,k)
-              buf(5,ipos2) = g(5,1,j,k)
-            end do
-          end do
-
-          call MPI_SEND( buf,
-     >                   10*ny*nz,
-     >                   dp_type,
-     >                   north,
-     >                   from_s,
-     >                   MPI_COMM_WORLD,
-     >                   IERROR )
-        end if
-
-c---------------------------------------------------------------------
-c   receive from south
-c---------------------------------------------------------------------
-        if (south.ne.-1) then
-          call MPI_WAIT( mid, STATUS, IERROR )
-
-          do k = 1,nz
-            do j = 1,ny
-              ipos1 = (k-1)*ny + j
-              ipos2 = ipos1 + ny*nz
-              g(1,nx+2,j,k)  = buf1(1,ipos1)
-              g(2,nx+2,j,k)  = buf1(2,ipos1)
-              g(3,nx+2,j,k)  = buf1(3,ipos1)
-              g(4,nx+2,j,k)  = buf1(4,ipos1)
-              g(5,nx+2,j,k)  = buf1(5,ipos1)
-              g(1,nx+1,j,k) = buf1(1,ipos2)
-              g(2,nx+1,j,k) = buf1(2,ipos2)
-              g(3,nx+1,j,k) = buf1(3,ipos2)
-              g(4,nx+1,j,k) = buf1(4,ipos2)
-              g(5,nx+1,j,k) = buf1(5,ipos2)
-            end do
-          end do
-        end if
-
-      else
-
-c---------------------------------------------------------------------
-c   communicate in the east and west directions
-c---------------------------------------------------------------------
-      if (west.ne.-1) then
-          call MPI_IRECV( buf1,
-     >                    10*nx*nz,
-     >                    dp_type,
-     >                    MPI_ANY_SOURCE,
-     >                    from_w,
-     >                    MPI_COMM_WORLD,
-     >                    mid,
-     >                    IERROR )
-      end if
-
-c---------------------------------------------------------------------
-c   send east
-c---------------------------------------------------------------------
-        if (east.ne.-1) then
-          do k = 1,nz
-            do i = 1,nx
-              ipos1 = (k-1)*nx + i
-              ipos2 = ipos1 + nx*nz
-              buf(1,ipos1) = g(1,i,ny-1,k)
-              buf(2,ipos1) = g(2,i,ny-1,k)
-              buf(3,ipos1) = g(3,i,ny-1,k)
-              buf(4,ipos1) = g(4,i,ny-1,k)
-              buf(5,ipos1) = g(5,i,ny-1,k)
-              buf(1,ipos2) = g(1,i,ny,k)
-              buf(2,ipos2) = g(2,i,ny,k)
-              buf(3,ipos2) = g(3,i,ny,k)
-              buf(4,ipos2) = g(4,i,ny,k)
-              buf(5,ipos2) = g(5,i,ny,k)
-            end do
-          end do
-
-          call MPI_SEND( buf,
-     >                   10*nx*nz,
-     >                   dp_type,
-     >                   east,
-     >                   from_w,
-     >                   MPI_COMM_WORLD,
-     >                   IERROR )
-        end if
-
-c---------------------------------------------------------------------
-c   receive from west
-c---------------------------------------------------------------------
-        if (west.ne.-1) then
-          call MPI_WAIT( mid, STATUS, IERROR )
-
-          do k = 1,nz
-            do i = 1,nx
-              ipos1 = (k-1)*nx + i
-              ipos2 = ipos1 + nx*nz
-              g(1,i,-1,k) = buf1(1,ipos1)
-              g(2,i,-1,k) = buf1(2,ipos1)
-              g(3,i,-1,k) = buf1(3,ipos1)
-              g(4,i,-1,k) = buf1(4,ipos1)
-              g(5,i,-1,k) = buf1(5,ipos1)
-              g(1,i,0,k) = buf1(1,ipos2)
-              g(2,i,0,k) = buf1(2,ipos2)
-              g(3,i,0,k) = buf1(3,ipos2)
-              g(4,i,0,k) = buf1(4,ipos2)
-              g(5,i,0,k) = buf1(5,ipos2)
-            end do
-          end do
-
-        end if
-
-      if (east.ne.-1) then
-          call MPI_IRECV( buf1,
-     >                    10*nx*nz,
-     >                    dp_type,
-     >                    MPI_ANY_SOURCE,
-     >                    from_e,
-     >                    MPI_COMM_WORLD,
-     >                    mid,
-     >                    IERROR )
-      end if
-
-c---------------------------------------------------------------------
-c   send west
-c---------------------------------------------------------------------
-      if (west.ne.-1) then
-          do k = 1,nz
-            do i = 1,nx
-              ipos1 = (k-1)*nx + i
-              ipos2 = ipos1 + nx*nz
-              buf(1,ipos1) = g(1,i,2,k)
-              buf(2,ipos1) = g(2,i,2,k)
-              buf(3,ipos1) = g(3,i,2,k)
-              buf(4,ipos1) = g(4,i,2,k)
-              buf(5,ipos1) = g(5,i,2,k)
-              buf(1,ipos2) = g(1,i,1,k)
-              buf(2,ipos2) = g(2,i,1,k)
-              buf(3,ipos2) = g(3,i,1,k)
-              buf(4,ipos2) = g(4,i,1,k)
-              buf(5,ipos2) = g(5,i,1,k)
-            end do
-          end do
-
-          call MPI_SEND( buf,
-     >                   10*nx*nz,
-     >                   dp_type,
-     >                   west,
-     >                   from_e,
-     >                   MPI_COMM_WORLD,
-     >                   IERROR )
-        end if
-
-c---------------------------------------------------------------------
-c   receive from east
-c---------------------------------------------------------------------
-        if (east.ne.-1) then
-          call MPI_WAIT( mid, STATUS, IERROR )
-
-          do k = 1,nz
-            do i = 1,nx
-              ipos1 = (k-1)*nx + i
-              ipos2 = ipos1 + nx*nz
-              g(1,i,ny+2,k)  = buf1(1,ipos1)
-              g(2,i,ny+2,k)  = buf1(2,ipos1)
-              g(3,i,ny+2,k)  = buf1(3,ipos1)
-              g(4,i,ny+2,k)  = buf1(4,ipos1)
-              g(5,i,ny+2,k)  = buf1(5,ipos1)
-              g(1,i,ny+1,k) = buf1(1,ipos2)
-              g(2,i,ny+1,k) = buf1(2,ipos2)
-              g(3,i,ny+1,k) = buf1(3,ipos2)
-              g(4,i,ny+1,k) = buf1(4,ipos2)
-              g(5,i,ny+1,k) = buf1(5,ipos2)
-            end do
-          end do
-
-        end if
-
-      end if
-
-      return
-      end     
diff --git a/examples/smpi/NAS/LU/exchange_4.f b/examples/smpi/NAS/LU/exchange_4.f
deleted file mode 100644
index 1c4c38e218..0000000000
--- a/examples/smpi/NAS/LU/exchange_4.f
+++ /dev/null
@@ -1,133 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine exchange_4(g,h,ibeg,ifin1,jbeg,jfin1)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   compute the right hand side based on exact solution
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c  input parameters
-c---------------------------------------------------------------------
-      double precision  g(0:isiz2+1,0:isiz3+1), 
-     >        h(0:isiz2+1,0:isiz3+1)
-      integer ibeg, ifin1
-      integer jbeg, jfin1
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer i, j
-      integer ny2
-      double precision  dum(1024)
-
-      integer msgid1, msgid3
-      integer STATUS(MPI_STATUS_SIZE)
-      integer IERROR
-
-
-
-      ny2 = ny + 2
-
-c---------------------------------------------------------------------
-c   communicate in the east and west directions
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   receive from east
-c---------------------------------------------------------------------
-      if (jfin1.eq.ny) then
-        call MPI_IRECV( dum,
-     >                  2*nx,
-     >                  dp_type,
-     >                  MPI_ANY_SOURCE,
-     >                  from_e,
-     >                  MPI_COMM_WORLD,
-     >                  msgid3,
-     >                  IERROR )
-
-        call MPI_WAIT( msgid3, STATUS, IERROR )
-
-        do i = 1,nx
-          g(i,ny+1) = dum(i)
-          h(i,ny+1) = dum(i+nx)
-        end do
-
-      end if
-
-c---------------------------------------------------------------------
-c   send west
-c---------------------------------------------------------------------
-      if (jbeg.eq.1) then
-        do i = 1,nx
-          dum(i) = g(i,1)
-          dum(i+nx) = h(i,1)
-        end do
-
-        call MPI_SEND( dum,
-     >                 2*nx,
-     >                 dp_type,
-     >                 west,
-     >                 from_e,
-     >                 MPI_COMM_WORLD,
-     >                 IERROR )
-
-      end if
-
-c---------------------------------------------------------------------
-c   communicate in the south and north directions
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   receive from south
-c---------------------------------------------------------------------
-      if (ifin1.eq.nx) then
-        call MPI_IRECV( dum,
-     >                  2*ny2,
-     >                  dp_type,
-     >                  MPI_ANY_SOURCE,
-     >                  from_s,
-     >                  MPI_COMM_WORLD,
-     >                  msgid1,
-     >                  IERROR )
-
-        call MPI_WAIT( msgid1, STATUS, IERROR )
-
-        do j = 0,ny+1
-          g(nx+1,j) = dum(j+1)
-          h(nx+1,j) = dum(j+ny2+1)
-        end do
-
-      end if
-
-c---------------------------------------------------------------------
-c   send north
-c---------------------------------------------------------------------
-      if (ibeg.eq.1) then
-        do j = 0,ny+1
-          dum(j+1) = g(1,j)
-          dum(j+ny2+1) = h(1,j)
-        end do
-
-        call MPI_SEND( dum,
-     >                 2*ny2,
-     >                 dp_type,
-     >                 north,
-     >                 from_s,
-     >                 MPI_COMM_WORLD,
-     >                 IERROR )
-
-      end if
-
-      return
-      end     
diff --git a/examples/smpi/NAS/LU/exchange_5.f b/examples/smpi/NAS/LU/exchange_5.f
deleted file mode 100644
index e4cc66f5bf..0000000000
--- a/examples/smpi/NAS/LU/exchange_5.f
+++ /dev/null
@@ -1,81 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine exchange_5(g,ibeg,ifin1)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   compute the right hand side based on exact solution
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c  input parameters
-c---------------------------------------------------------------------
-      double precision  g(0:isiz2+1,0:isiz3+1)
-      integer ibeg, ifin1
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer k
-      double precision  dum(1024)
-
-      integer msgid1
-      integer STATUS(MPI_STATUS_SIZE)
-      integer IERROR
-
-
-
-c---------------------------------------------------------------------
-c   communicate in the south and north directions
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   receive from south
-c---------------------------------------------------------------------
-      if (ifin1.eq.nx) then
-        call MPI_IRECV( dum,
-     >                  nz,
-     >                  dp_type,
-     >                  MPI_ANY_SOURCE,
-     >                  from_s,
-     >                  MPI_COMM_WORLD,
-     >                  msgid1,
-     >                  IERROR )
-
-        call MPI_WAIT( msgid1, STATUS, IERROR )
-
-        do k = 1,nz
-          g(nx+1,k) = dum(k)
-        end do
-
-      end if
-
-c---------------------------------------------------------------------
-c   send north
-c---------------------------------------------------------------------
-      if (ibeg.eq.1) then
-        do k = 1,nz
-          dum(k) = g(1,k)
-        end do
-
-        call MPI_SEND( dum,
-     >                 nz,
-     >                 dp_type,
-     >                 north,
-     >                 from_s,
-     >                 MPI_COMM_WORLD,
-     >                 IERROR )
-
-      end if
-
-      return
-      end     
diff --git a/examples/smpi/NAS/LU/exchange_6.f b/examples/smpi/NAS/LU/exchange_6.f
deleted file mode 100644
index 0626609547..0000000000
--- a/examples/smpi/NAS/LU/exchange_6.f
+++ /dev/null
@@ -1,81 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine exchange_6(g,jbeg,jfin1)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   compute the right hand side based on exact solution
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c  input parameters
-c---------------------------------------------------------------------
-      double precision  g(0:isiz2+1,0:isiz3+1)
-      integer jbeg, jfin1
-
-c---------------------------------------------------------------------
-c  local parameters
-c---------------------------------------------------------------------
-      integer k
-      double precision  dum(1024)
-
-      integer msgid3
-      integer STATUS(MPI_STATUS_SIZE)
-      integer IERROR
-
-
-
-c---------------------------------------------------------------------
-c   communicate in the east and west directions
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   receive from east
-c---------------------------------------------------------------------
-      if (jfin1.eq.ny) then
-        call MPI_IRECV( dum,
-     >                  nz,
-     >                  dp_type,
-     >                  MPI_ANY_SOURCE,
-     >                  from_e,
-     >                  MPI_COMM_WORLD,
-     >                  msgid3,
-     >                  IERROR )
-
-        call MPI_WAIT( msgid3, STATUS, IERROR )
-
-        do k = 1,nz
-          g(ny+1,k) = dum(k)
-        end do
-
-      end if
-
-c---------------------------------------------------------------------
-c   send west
-c---------------------------------------------------------------------
-      if (jbeg.eq.1) then
-        do k = 1,nz
-          dum(k) = g(1,k)
-        end do
-
-        call MPI_SEND( dum,
-     >                 nz,
-     >                 dp_type,
-     >                 west,
-     >                 from_e,
-     >                 MPI_COMM_WORLD,
-     >                 IERROR )
-
-      end if
-
-      return
-      end     
diff --git a/examples/smpi/NAS/LU/init_comm.f b/examples/smpi/NAS/LU/init_comm.f
deleted file mode 100644
index 72ece00ac4..0000000000
--- a/examples/smpi/NAS/LU/init_comm.f
+++ /dev/null
@@ -1,57 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine init_comm 
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c   initialize MPI and establish rank and size
-c
-c This is a module in the MPI implementation of LUSSOR
-c pseudo application from the NAS Parallel Benchmarks. 
-c
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-      include 'applu.incl'
-
-      integer nodedim
-      integer IERROR
-
-
-c---------------------------------------------------------------------
-c    initialize MPI communication
-c---------------------------------------------------------------------
-      call MPI_INIT( IERROR )
-
-c---------------------------------------------------------------------
-c   establish the global rank of this process
-c---------------------------------------------------------------------
-      call MPI_COMM_RANK( MPI_COMM_WORLD,
-     >                     id,
-     >                     IERROR )
-
-c---------------------------------------------------------------------
-c   establish the size of the global group
-c---------------------------------------------------------------------
-      call MPI_COMM_SIZE( MPI_COMM_WORLD,
-     >                     num,
-     >                     IERROR )
-
-      ndim   = nodedim(num)
-
-      if (.not. convertdouble) then
-         dp_type = MPI_DOUBLE_PRECISION
-      else
-         dp_type = MPI_REAL
-      endif
-
-
-      return
-      end
diff --git a/examples/smpi/NAS/LU/inputlu.data.sample b/examples/smpi/NAS/LU/inputlu.data.sample
deleted file mode 100644
index 9ef5a7be00..0000000000
--- a/examples/smpi/NAS/LU/inputlu.data.sample
+++ /dev/null
@@ -1,24 +0,0 @@
-c
-c***controls printing of the progress of iterations: ipr    inorm
-                                                      1      250
-c
-c***the maximum no. of pseudo-time steps to be performed: nitmax
-                                                             250
-c
-c***magnitude of the time step: dt 
-                               2.0e+00
-c
-c***relaxation factor for SSOR iterations: omega
-                                            1.2
-c
-c***tolerance levels for steady-state residuals: tolnwt(m),m=1,5
-                             1.0e-08   1.0e-08   1.0e-08  1.0e-08  1.0e-08 
-c
-c***number of grid points in xi and eta and zeta directions: nx   ny   nz
-                                                            64  64  64
-c
-
-
-
-
-
diff --git a/examples/smpi/NAS/LU/jacld.f b/examples/smpi/NAS/LU/jacld.f
deleted file mode 100644
index 9580d080ab..0000000000
--- a/examples/smpi/NAS/LU/jacld.f
+++ /dev/null
@@ -1,384 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine jacld(k)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-
-c---------------------------------------------------------------------
-c   compute the lower triangular part of the jacobian matrix
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c  input parameters
-c---------------------------------------------------------------------
-      integer k
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer i, j
-      double precision  r43
-      double precision  c1345
-      double precision  c34
-      double precision  tmp1, tmp2, tmp3
-
-
-
-      r43 = ( 4.0d+00 / 3.0d+00 )
-      c1345 = c1 * c3 * c4 * c5
-      c34 = c3 * c4
-
-         do j = jst, jend
-            do i = ist, iend
-
-c---------------------------------------------------------------------
-c   form the block daigonal
-c---------------------------------------------------------------------
-               tmp1 = 1.0d+00 / u(1,i,j,k)
-               tmp2 = tmp1 * tmp1
-               tmp3 = tmp1 * tmp2
-
-               d(1,1,i,j) =  1.0d+00
-     >                       + dt * 2.0d+00 * (   tx1 * dx1
-     >                                          + ty1 * dy1
-     >                                          + tz1 * dz1 )
-               d(1,2,i,j) =  0.0d+00
-               d(1,3,i,j) =  0.0d+00
-               d(1,4,i,j) =  0.0d+00
-               d(1,5,i,j) =  0.0d+00
-
-               d(2,1,i,j) =  dt * 2.0d+00
-     >          * (  tx1 * ( - r43 * c34 * tmp2 * u(2,i,j,k) )
-     >             + ty1 * ( -       c34 * tmp2 * u(2,i,j,k) )
-     >             + tz1 * ( -       c34 * tmp2 * u(2,i,j,k) ) )
-               d(2,2,i,j) =  1.0d+00
-     >          + dt * 2.0d+00 
-     >          * (  tx1 * r43 * c34 * tmp1
-     >             + ty1 *       c34 * tmp1
-     >             + tz1 *       c34 * tmp1 )
-     >          + dt * 2.0d+00 * (   tx1 * dx2
-     >                             + ty1 * dy2
-     >                             + tz1 * dz2  )
-               d(2,3,i,j) = 0.0d+00
-               d(2,4,i,j) = 0.0d+00
-               d(2,5,i,j) = 0.0d+00
-
-               d(3,1,i,j) = dt * 2.0d+00
-     >      * (  tx1 * ( -       c34 * tmp2 * u(3,i,j,k) )
-     >         + ty1 * ( - r43 * c34 * tmp2 * u(3,i,j,k) )
-     >         + tz1 * ( -       c34 * tmp2 * u(3,i,j,k) ) )
-               d(3,2,i,j) = 0.0d+00
-               d(3,3,i,j) = 1.0d+00
-     >         + dt * 2.0d+00
-     >              * (  tx1 *       c34 * tmp1
-     >                 + ty1 * r43 * c34 * tmp1
-     >                 + tz1 *       c34 * tmp1 )
-     >         + dt * 2.0d+00 * (  tx1 * dx3
-     >                           + ty1 * dy3
-     >                           + tz1 * dz3 )
-               d(3,4,i,j) = 0.0d+00
-               d(3,5,i,j) = 0.0d+00
-
-               d(4,1,i,j) = dt * 2.0d+00
-     >      * (  tx1 * ( -       c34 * tmp2 * u(4,i,j,k) )
-     >         + ty1 * ( -       c34 * tmp2 * u(4,i,j,k) )
-     >         + tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k) ) )
-               d(4,2,i,j) = 0.0d+00
-               d(4,3,i,j) = 0.0d+00
-               d(4,4,i,j) = 1.0d+00
-     >         + dt * 2.0d+00
-     >              * (  tx1 *       c34 * tmp1
-     >                 + ty1 *       c34 * tmp1
-     >                 + tz1 * r43 * c34 * tmp1 )
-     >         + dt * 2.0d+00 * (  tx1 * dx4
-     >                           + ty1 * dy4
-     >                           + tz1 * dz4 )
-               d(4,5,i,j) = 0.0d+00
-
-               d(5,1,i,j) = dt * 2.0d+00
-     > * ( tx1 * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
-     >             - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
-     >             - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
-     >             - ( c1345 ) * tmp2 * u(5,i,j,k) )
-     >   + ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
-     >             - ( r43*c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
-     >             - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
-     >             - ( c1345 ) * tmp2 * u(5,i,j,k) )
-     >   + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
-     >             - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
-     >             - ( r43*c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
-     >             - ( c1345 ) * tmp2 * u(5,i,j,k) ) )
-               d(5,2,i,j) = dt * 2.0d+00
-     > * ( tx1 * ( r43*c34 - c1345 ) * tmp2 * u(2,i,j,k)
-     >   + ty1 * (     c34 - c1345 ) * tmp2 * u(2,i,j,k)
-     >   + tz1 * (     c34 - c1345 ) * tmp2 * u(2,i,j,k) )
-               d(5,3,i,j) = dt * 2.0d+00
-     > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k)
-     >   + ty1 * ( r43*c34 -c1345 ) * tmp2 * u(3,i,j,k)
-     >   + tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k) )
-               d(5,4,i,j) = dt * 2.0d+00
-     > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k)
-     >   + ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k)
-     >   + tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k) )
-               d(5,5,i,j) = 1.0d+00
-     >   + dt * 2.0d+00 * ( tx1 * c1345 * tmp1
-     >                    + ty1 * c1345 * tmp1
-     >                    + tz1 * c1345 * tmp1 )
-     >   + dt * 2.0d+00 * (  tx1 * dx5
-     >                    +  ty1 * dy5
-     >                    +  tz1 * dz5 )
-
-c---------------------------------------------------------------------
-c   form the first block sub-diagonal
-c---------------------------------------------------------------------
-               tmp1 = 1.0d+00 / u(1,i,j,k-1)
-               tmp2 = tmp1 * tmp1
-               tmp3 = tmp1 * tmp2
-
-               a(1,1,i,j) = - dt * tz1 * dz1
-               a(1,2,i,j) =   0.0d+00
-               a(1,3,i,j) =   0.0d+00
-               a(1,4,i,j) = - dt * tz2
-               a(1,5,i,j) =   0.0d+00
-
-               a(2,1,i,j) = - dt * tz2
-     >           * ( - ( u(2,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 )
-     >           - dt * tz1 * ( - c34 * tmp2 * u(2,i,j,k-1) )
-               a(2,2,i,j) = - dt * tz2 * ( u(4,i,j,k-1) * tmp1 )
-     >           - dt * tz1 * c34 * tmp1
-     >           - dt * tz1 * dz2 
-               a(2,3,i,j) = 0.0d+00
-               a(2,4,i,j) = - dt * tz2 * ( u(2,i,j,k-1) * tmp1 )
-               a(2,5,i,j) = 0.0d+00
-
-               a(3,1,i,j) = - dt * tz2
-     >           * ( - ( u(3,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 )
-     >           - dt * tz1 * ( - c34 * tmp2 * u(3,i,j,k-1) )
-               a(3,2,i,j) = 0.0d+00
-               a(3,3,i,j) = - dt * tz2 * ( u(4,i,j,k-1) * tmp1 )
-     >           - dt * tz1 * ( c34 * tmp1 )
-     >           - dt * tz1 * dz3
-               a(3,4,i,j) = - dt * tz2 * ( u(3,i,j,k-1) * tmp1 )
-               a(3,5,i,j) = 0.0d+00
-
-               a(4,1,i,j) = - dt * tz2
-     >        * ( - ( u(4,i,j,k-1) * tmp1 ) ** 2
-     >            + 0.50d+00 * c2
-     >            * ( ( u(2,i,j,k-1) * u(2,i,j,k-1)
-     >                + u(3,i,j,k-1) * u(3,i,j,k-1)
-     >                + u(4,i,j,k-1) * u(4,i,j,k-1) ) * tmp2 ) )
-     >        - dt * tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k-1) )
-               a(4,2,i,j) = - dt * tz2
-     >             * ( - c2 * ( u(2,i,j,k-1) * tmp1 ) )
-               a(4,3,i,j) = - dt * tz2
-     >             * ( - c2 * ( u(3,i,j,k-1) * tmp1 ) )
-               a(4,4,i,j) = - dt * tz2 * ( 2.0d+00 - c2 )
-     >             * ( u(4,i,j,k-1) * tmp1 )
-     >             - dt * tz1 * ( r43 * c34 * tmp1 )
-     >             - dt * tz1 * dz4
-               a(4,5,i,j) = - dt * tz2 * c2
-
-               a(5,1,i,j) = - dt * tz2
-     >     * ( ( c2 * (  u(2,i,j,k-1) * u(2,i,j,k-1)
-     >                 + u(3,i,j,k-1) * u(3,i,j,k-1)
-     >                 + u(4,i,j,k-1) * u(4,i,j,k-1) ) * tmp2
-     >       - c1 * ( u(5,i,j,k-1) * tmp1 ) )
-     >            * ( u(4,i,j,k-1) * tmp1 ) )
-     >       - dt * tz1
-     >       * ( - ( c34 - c1345 ) * tmp3 * (u(2,i,j,k-1)**2)
-     >           - ( c34 - c1345 ) * tmp3 * (u(3,i,j,k-1)**2)
-     >           - ( r43*c34 - c1345 )* tmp3 * (u(4,i,j,k-1)**2)
-     >          - c1345 * tmp2 * u(5,i,j,k-1) )
-               a(5,2,i,j) = - dt * tz2
-     >       * ( - c2 * ( u(2,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 )
-     >       - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k-1)
-               a(5,3,i,j) = - dt * tz2
-     >       * ( - c2 * ( u(3,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 )
-     >       - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k-1)
-               a(5,4,i,j) = - dt * tz2
-     >       * ( c1 * ( u(5,i,j,k-1) * tmp1 )
-     >       - 0.50d+00 * c2
-     >       * ( (  u(2,i,j,k-1)*u(2,i,j,k-1)
-     >            + u(3,i,j,k-1)*u(3,i,j,k-1)
-     >            + 3.0d+00*u(4,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 ) )
-     >       - dt * tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k-1)
-               a(5,5,i,j) = - dt * tz2
-     >       * ( c1 * ( u(4,i,j,k-1) * tmp1 ) )
-     >       - dt * tz1 * c1345 * tmp1
-     >       - dt * tz1 * dz5
-
-c---------------------------------------------------------------------
-c   form the second block sub-diagonal
-c---------------------------------------------------------------------
-               tmp1 = 1.0d+00 / u(1,i,j-1,k)
-               tmp2 = tmp1 * tmp1
-               tmp3 = tmp1 * tmp2
-
-               b(1,1,i,j) = - dt * ty1 * dy1
-               b(1,2,i,j) =   0.0d+00
-               b(1,3,i,j) = - dt * ty2
-               b(1,4,i,j) =   0.0d+00
-               b(1,5,i,j) =   0.0d+00
-
-               b(2,1,i,j) = - dt * ty2
-     >           * ( - ( u(2,i,j-1,k)*u(3,i,j-1,k) ) * tmp2 )
-     >           - dt * ty1 * ( - c34 * tmp2 * u(2,i,j-1,k) )
-               b(2,2,i,j) = - dt * ty2 * ( u(3,i,j-1,k) * tmp1 )
-     >          - dt * ty1 * ( c34 * tmp1 )
-     >          - dt * ty1 * dy2
-               b(2,3,i,j) = - dt * ty2 * ( u(2,i,j-1,k) * tmp1 )
-               b(2,4,i,j) = 0.0d+00
-               b(2,5,i,j) = 0.0d+00
-
-               b(3,1,i,j) = - dt * ty2
-     >           * ( - ( u(3,i,j-1,k) * tmp1 ) ** 2
-     >      + 0.50d+00 * c2 * ( (  u(2,i,j-1,k) * u(2,i,j-1,k)
-     >                           + u(3,i,j-1,k) * u(3,i,j-1,k)
-     >                           + u(4,i,j-1,k) * u(4,i,j-1,k) )
-     >                          * tmp2 ) )
-     >       - dt * ty1 * ( - r43 * c34 * tmp2 * u(3,i,j-1,k) )
-               b(3,2,i,j) = - dt * ty2
-     >                   * ( - c2 * ( u(2,i,j-1,k) * tmp1 ) )
-               b(3,3,i,j) = - dt * ty2 * ( ( 2.0d+00 - c2 )
-     >                   * ( u(3,i,j-1,k) * tmp1 ) )
-     >       - dt * ty1 * ( r43 * c34 * tmp1 )
-     >       - dt * ty1 * dy3
-               b(3,4,i,j) = - dt * ty2
-     >                   * ( - c2 * ( u(4,i,j-1,k) * tmp1 ) )
-               b(3,5,i,j) = - dt * ty2 * c2
-
-               b(4,1,i,j) = - dt * ty2
-     >              * ( - ( u(3,i,j-1,k)*u(4,i,j-1,k) ) * tmp2 )
-     >       - dt * ty1 * ( - c34 * tmp2 * u(4,i,j-1,k) )
-               b(4,2,i,j) = 0.0d+00
-               b(4,3,i,j) = - dt * ty2 * ( u(4,i,j-1,k) * tmp1 )
-               b(4,4,i,j) = - dt * ty2 * ( u(3,i,j-1,k) * tmp1 )
-     >                        - dt * ty1 * ( c34 * tmp1 )
-     >                        - dt * ty1 * dy4
-               b(4,5,i,j) = 0.0d+00
-
-               b(5,1,i,j) = - dt * ty2
-     >          * ( ( c2 * (  u(2,i,j-1,k) * u(2,i,j-1,k)
-     >                      + u(3,i,j-1,k) * u(3,i,j-1,k)
-     >                      + u(4,i,j-1,k) * u(4,i,j-1,k) ) * tmp2
-     >               - c1 * ( u(5,i,j-1,k) * tmp1 ) )
-     >          * ( u(3,i,j-1,k) * tmp1 ) )
-     >          - dt * ty1
-     >          * ( - (     c34 - c1345 )*tmp3*(u(2,i,j-1,k)**2)
-     >              - ( r43*c34 - c1345 )*tmp3*(u(3,i,j-1,k)**2)
-     >              - (     c34 - c1345 )*tmp3*(u(4,i,j-1,k)**2)
-     >              - c1345*tmp2*u(5,i,j-1,k) )
-               b(5,2,i,j) = - dt * ty2
-     >          * ( - c2 * ( u(2,i,j-1,k)*u(3,i,j-1,k) ) * tmp2 )
-     >          - dt * ty1
-     >          * ( c34 - c1345 ) * tmp2 * u(2,i,j-1,k)
-               b(5,3,i,j) = - dt * ty2
-     >          * ( c1 * ( u(5,i,j-1,k) * tmp1 )
-     >          - 0.50d+00 * c2 
-     >          * ( (  u(2,i,j-1,k)*u(2,i,j-1,k)
-     >               + 3.0d+00 * u(3,i,j-1,k)*u(3,i,j-1,k)
-     >               + u(4,i,j-1,k)*u(4,i,j-1,k) ) * tmp2 ) )
-     >          - dt * ty1
-     >          * ( r43*c34 - c1345 ) * tmp2 * u(3,i,j-1,k)
-               b(5,4,i,j) = - dt * ty2
-     >          * ( - c2 * ( u(3,i,j-1,k)*u(4,i,j-1,k) ) * tmp2 )
-     >          - dt * ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j-1,k)
-               b(5,5,i,j) = - dt * ty2
-     >          * ( c1 * ( u(3,i,j-1,k) * tmp1 ) )
-     >          - dt * ty1 * c1345 * tmp1
-     >          - dt * ty1 * dy5
-
-c---------------------------------------------------------------------
-c   form the third block sub-diagonal
-c---------------------------------------------------------------------
-               tmp1 = 1.0d+00 / u(1,i-1,j,k)
-               tmp2 = tmp1 * tmp1
-               tmp3 = tmp1 * tmp2
-
-               c(1,1,i,j) = - dt * tx1 * dx1
-               c(1,2,i,j) = - dt * tx2
-               c(1,3,i,j) =   0.0d+00
-               c(1,4,i,j) =   0.0d+00
-               c(1,5,i,j) =   0.0d+00
-
-               c(2,1,i,j) = - dt * tx2
-     >          * ( - ( u(2,i-1,j,k) * tmp1 ) ** 2
-     >     + c2 * 0.50d+00 * (  u(2,i-1,j,k) * u(2,i-1,j,k)
-     >                        + u(3,i-1,j,k) * u(3,i-1,j,k)
-     >                        + u(4,i-1,j,k) * u(4,i-1,j,k) ) * tmp2 )
-     >          - dt * tx1 * ( - r43 * c34 * tmp2 * u(2,i-1,j,k) )
-               c(2,2,i,j) = - dt * tx2
-     >          * ( ( 2.0d+00 - c2 ) * ( u(2,i-1,j,k) * tmp1 ) )
-     >          - dt * tx1 * ( r43 * c34 * tmp1 )
-     >          - dt * tx1 * dx2
-               c(2,3,i,j) = - dt * tx2
-     >              * ( - c2 * ( u(3,i-1,j,k) * tmp1 ) )
-               c(2,4,i,j) = - dt * tx2
-     >              * ( - c2 * ( u(4,i-1,j,k) * tmp1 ) )
-               c(2,5,i,j) = - dt * tx2 * c2 
-
-               c(3,1,i,j) = - dt * tx2
-     >              * ( - ( u(2,i-1,j,k) * u(3,i-1,j,k) ) * tmp2 )
-     >         - dt * tx1 * ( - c34 * tmp2 * u(3,i-1,j,k) )
-               c(3,2,i,j) = - dt * tx2 * ( u(3,i-1,j,k) * tmp1 )
-               c(3,3,i,j) = - dt * tx2 * ( u(2,i-1,j,k) * tmp1 )
-     >          - dt * tx1 * ( c34 * tmp1 )
-     >          - dt * tx1 * dx3
-               c(3,4,i,j) = 0.0d+00
-               c(3,5,i,j) = 0.0d+00
-
-               c(4,1,i,j) = - dt * tx2
-     >          * ( - ( u(2,i-1,j,k)*u(4,i-1,j,k) ) * tmp2 )
-     >          - dt * tx1 * ( - c34 * tmp2 * u(4,i-1,j,k) )
-               c(4,2,i,j) = - dt * tx2 * ( u(4,i-1,j,k) * tmp1 )
-               c(4,3,i,j) = 0.0d+00
-               c(4,4,i,j) = - dt * tx2 * ( u(2,i-1,j,k) * tmp1 )
-     >          - dt * tx1 * ( c34 * tmp1 )
-     >          - dt * tx1 * dx4
-               c(4,5,i,j) = 0.0d+00
-
-               c(5,1,i,j) = - dt * tx2
-     >          * ( ( c2 * (  u(2,i-1,j,k) * u(2,i-1,j,k)
-     >                      + u(3,i-1,j,k) * u(3,i-1,j,k)
-     >                      + u(4,i-1,j,k) * u(4,i-1,j,k) ) * tmp2
-     >              - c1 * ( u(5,i-1,j,k) * tmp1 ) )
-     >          * ( u(2,i-1,j,k) * tmp1 ) )
-     >          - dt * tx1
-     >          * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i-1,j,k)**2 )
-     >              - (     c34 - c1345 ) * tmp3 * ( u(3,i-1,j,k)**2 )
-     >              - (     c34 - c1345 ) * tmp3 * ( u(4,i-1,j,k)**2 )
-     >              - c1345 * tmp2 * u(5,i-1,j,k) )
-               c(5,2,i,j) = - dt * tx2
-     >          * ( c1 * ( u(5,i-1,j,k) * tmp1 )
-     >             - 0.50d+00 * c2
-     >             * ( (  3.0d+00*u(2,i-1,j,k)*u(2,i-1,j,k)
-     >                  + u(3,i-1,j,k)*u(3,i-1,j,k)
-     >                  + u(4,i-1,j,k)*u(4,i-1,j,k) ) * tmp2 ) )
-     >           - dt * tx1
-     >           * ( r43*c34 - c1345 ) * tmp2 * u(2,i-1,j,k)
-               c(5,3,i,j) = - dt * tx2
-     >           * ( - c2 * ( u(3,i-1,j,k)*u(2,i-1,j,k) ) * tmp2 )
-     >           - dt * tx1
-     >           * (  c34 - c1345 ) * tmp2 * u(3,i-1,j,k)
-               c(5,4,i,j) = - dt * tx2
-     >           * ( - c2 * ( u(4,i-1,j,k)*u(2,i-1,j,k) ) * tmp2 )
-     >           - dt * tx1
-     >           * (  c34 - c1345 ) * tmp2 * u(4,i-1,j,k)
-               c(5,5,i,j) = - dt * tx2
-     >           * ( c1 * ( u(2,i-1,j,k) * tmp1 ) )
-     >           - dt * tx1 * c1345 * tmp1
-     >           - dt * tx1 * dx5
-
-            end do
-         end do
-
-      return
-      end
diff --git a/examples/smpi/NAS/LU/jacu.f b/examples/smpi/NAS/LU/jacu.f
deleted file mode 100644
index 6a3c5b8ddb..0000000000
--- a/examples/smpi/NAS/LU/jacu.f
+++ /dev/null
@@ -1,384 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine jacu(k)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   compute the upper triangular part of the jacobian matrix
-c---------------------------------------------------------------------
-
-
-      implicit none
-
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c  input parameters
-c---------------------------------------------------------------------
-      integer k
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer i, j
-      double precision  r43
-      double precision  c1345
-      double precision  c34
-      double precision  tmp1, tmp2, tmp3
-
-
-
-      r43 = ( 4.0d+00 / 3.0d+00 )
-      c1345 = c1 * c3 * c4 * c5
-      c34 = c3 * c4
-
-         do j = jst, jend
-            do i = ist, iend
-
-c---------------------------------------------------------------------
-c   form the block daigonal
-c---------------------------------------------------------------------
-               tmp1 = 1.0d+00 / u(1,i,j,k)
-               tmp2 = tmp1 * tmp1
-               tmp3 = tmp1 * tmp2
-
-               d(1,1,i,j) =  1.0d+00
-     >                       + dt * 2.0d+00 * (   tx1 * dx1
-     >                                          + ty1 * dy1
-     >                                          + tz1 * dz1 )
-               d(1,2,i,j) =  0.0d+00
-               d(1,3,i,j) =  0.0d+00
-               d(1,4,i,j) =  0.0d+00
-               d(1,5,i,j) =  0.0d+00
-
-               d(2,1,i,j) =  dt * 2.0d+00
-     >          * (  tx1 * ( - r43 * c34 * tmp2 * u(2,i,j,k) )
-     >             + ty1 * ( -       c34 * tmp2 * u(2,i,j,k) )
-     >             + tz1 * ( -       c34 * tmp2 * u(2,i,j,k) ) )
-               d(2,2,i,j) =  1.0d+00
-     >          + dt * 2.0d+00 
-     >          * (  tx1 * r43 * c34 * tmp1
-     >             + ty1 *       c34 * tmp1
-     >             + tz1 *       c34 * tmp1 )
-     >          + dt * 2.0d+00 * (   tx1 * dx2
-     >                             + ty1 * dy2
-     >                             + tz1 * dz2  )
-               d(2,3,i,j) = 0.0d+00
-               d(2,4,i,j) = 0.0d+00
-               d(2,5,i,j) = 0.0d+00
-
-               d(3,1,i,j) = dt * 2.0d+00
-     >      * (  tx1 * ( -       c34 * tmp2 * u(3,i,j,k) )
-     >         + ty1 * ( - r43 * c34 * tmp2 * u(3,i,j,k) )
-     >         + tz1 * ( -       c34 * tmp2 * u(3,i,j,k) ) )
-               d(3,2,i,j) = 0.0d+00
-               d(3,3,i,j) = 1.0d+00
-     >         + dt * 2.0d+00
-     >              * (  tx1 *       c34 * tmp1
-     >                 + ty1 * r43 * c34 * tmp1
-     >                 + tz1 *       c34 * tmp1 )
-     >         + dt * 2.0d+00 * (  tx1 * dx3
-     >                           + ty1 * dy3
-     >                           + tz1 * dz3 )
-               d(3,4,i,j) = 0.0d+00
-               d(3,5,i,j) = 0.0d+00
-
-               d(4,1,i,j) = dt * 2.0d+00
-     >      * (  tx1 * ( -       c34 * tmp2 * u(4,i,j,k) )
-     >         + ty1 * ( -       c34 * tmp2 * u(4,i,j,k) )
-     >         + tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k) ) )
-               d(4,2,i,j) = 0.0d+00
-               d(4,3,i,j) = 0.0d+00
-               d(4,4,i,j) = 1.0d+00
-     >         + dt * 2.0d+00
-     >              * (  tx1 *       c34 * tmp1
-     >                 + ty1 *       c34 * tmp1
-     >                 + tz1 * r43 * c34 * tmp1 )
-     >         + dt * 2.0d+00 * (  tx1 * dx4
-     >                           + ty1 * dy4
-     >                           + tz1 * dz4 )
-               d(4,5,i,j) = 0.0d+00
-
-               d(5,1,i,j) = dt * 2.0d+00
-     > * ( tx1 * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
-     >             - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
-     >             - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
-     >             - ( c1345 ) * tmp2 * u(5,i,j,k) )
-     >   + ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
-     >             - ( r43*c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
-     >             - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
-     >             - ( c1345 ) * tmp2 * u(5,i,j,k) )
-     >   + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
-     >             - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
-     >             - ( r43*c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
-     >             - ( c1345 ) * tmp2 * u(5,i,j,k) ) )
-               d(5,2,i,j) = dt * 2.0d+00
-     > * ( tx1 * ( r43*c34 - c1345 ) * tmp2 * u(2,i,j,k)
-     >   + ty1 * (     c34 - c1345 ) * tmp2 * u(2,i,j,k)
-     >   + tz1 * (     c34 - c1345 ) * tmp2 * u(2,i,j,k) )
-               d(5,3,i,j) = dt * 2.0d+00
-     > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k)
-     >   + ty1 * ( r43*c34 -c1345 ) * tmp2 * u(3,i,j,k)
-     >   + tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k) )
-               d(5,4,i,j) = dt * 2.0d+00
-     > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k)
-     >   + ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k)
-     >   + tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k) )
-               d(5,5,i,j) = 1.0d+00
-     >   + dt * 2.0d+00 * ( tx1 * c1345 * tmp1
-     >                    + ty1 * c1345 * tmp1
-     >                    + tz1 * c1345 * tmp1 )
-     >   + dt * 2.0d+00 * (  tx1 * dx5
-     >                    +  ty1 * dy5
-     >                    +  tz1 * dz5 )
-
-c---------------------------------------------------------------------
-c   form the first block sub-diagonal
-c---------------------------------------------------------------------
-               tmp1 = 1.0d+00 / u(1,i+1,j,k)
-               tmp2 = tmp1 * tmp1
-               tmp3 = tmp1 * tmp2
-
-               a(1,1,i,j) = - dt * tx1 * dx1
-               a(1,2,i,j) =   dt * tx2
-               a(1,3,i,j) =   0.0d+00
-               a(1,4,i,j) =   0.0d+00
-               a(1,5,i,j) =   0.0d+00
-
-               a(2,1,i,j) =  dt * tx2
-     >          * ( - ( u(2,i+1,j,k) * tmp1 ) ** 2
-     >     + c2 * 0.50d+00 * (  u(2,i+1,j,k) * u(2,i+1,j,k)
-     >                        + u(3,i+1,j,k) * u(3,i+1,j,k)
-     >                        + u(4,i+1,j,k) * u(4,i+1,j,k) ) * tmp2 )
-     >          - dt * tx1 * ( - r43 * c34 * tmp2 * u(2,i+1,j,k) )
-               a(2,2,i,j) =  dt * tx2
-     >          * ( ( 2.0d+00 - c2 ) * ( u(2,i+1,j,k) * tmp1 ) )
-     >          - dt * tx1 * ( r43 * c34 * tmp1 )
-     >          - dt * tx1 * dx2
-               a(2,3,i,j) =  dt * tx2
-     >              * ( - c2 * ( u(3,i+1,j,k) * tmp1 ) )
-               a(2,4,i,j) =  dt * tx2
-     >              * ( - c2 * ( u(4,i+1,j,k) * tmp1 ) )
-               a(2,5,i,j) =  dt * tx2 * c2 
-
-               a(3,1,i,j) =  dt * tx2
-     >              * ( - ( u(2,i+1,j,k) * u(3,i+1,j,k) ) * tmp2 )
-     >         - dt * tx1 * ( - c34 * tmp2 * u(3,i+1,j,k) )
-               a(3,2,i,j) =  dt * tx2 * ( u(3,i+1,j,k) * tmp1 )
-               a(3,3,i,j) =  dt * tx2 * ( u(2,i+1,j,k) * tmp1 )
-     >          - dt * tx1 * ( c34 * tmp1 )
-     >          - dt * tx1 * dx3
-               a(3,4,i,j) = 0.0d+00
-               a(3,5,i,j) = 0.0d+00
-
-               a(4,1,i,j) = dt * tx2
-     >          * ( - ( u(2,i+1,j,k)*u(4,i+1,j,k) ) * tmp2 )
-     >          - dt * tx1 * ( - c34 * tmp2 * u(4,i+1,j,k) )
-               a(4,2,i,j) = dt * tx2 * ( u(4,i+1,j,k) * tmp1 )
-               a(4,3,i,j) = 0.0d+00
-               a(4,4,i,j) = dt * tx2 * ( u(2,i+1,j,k) * tmp1 )
-     >          - dt * tx1 * ( c34 * tmp1 )
-     >          - dt * tx1 * dx4
-               a(4,5,i,j) = 0.0d+00
-
-               a(5,1,i,j) = dt * tx2
-     >          * ( ( c2 * (  u(2,i+1,j,k) * u(2,i+1,j,k)
-     >                      + u(3,i+1,j,k) * u(3,i+1,j,k)
-     >                      + u(4,i+1,j,k) * u(4,i+1,j,k) ) * tmp2
-     >              - c1 * ( u(5,i+1,j,k) * tmp1 ) )
-     >          * ( u(2,i+1,j,k) * tmp1 ) )
-     >          - dt * tx1
-     >          * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i+1,j,k)**2 )
-     >              - (     c34 - c1345 ) * tmp3 * ( u(3,i+1,j,k)**2 )
-     >              - (     c34 - c1345 ) * tmp3 * ( u(4,i+1,j,k)**2 )
-     >              - c1345 * tmp2 * u(5,i+1,j,k) )
-               a(5,2,i,j) = dt * tx2
-     >          * ( c1 * ( u(5,i+1,j,k) * tmp1 )
-     >             - 0.50d+00 * c2
-     >             * ( (  3.0d+00*u(2,i+1,j,k)*u(2,i+1,j,k)
-     >                  + u(3,i+1,j,k)*u(3,i+1,j,k)
-     >                  + u(4,i+1,j,k)*u(4,i+1,j,k) ) * tmp2 ) )
-     >           - dt * tx1
-     >           * ( r43*c34 - c1345 ) * tmp2 * u(2,i+1,j,k)
-               a(5,3,i,j) = dt * tx2
-     >           * ( - c2 * ( u(3,i+1,j,k)*u(2,i+1,j,k) ) * tmp2 )
-     >           - dt * tx1
-     >           * (  c34 - c1345 ) * tmp2 * u(3,i+1,j,k)
-               a(5,4,i,j) = dt * tx2
-     >           * ( - c2 * ( u(4,i+1,j,k)*u(2,i+1,j,k) ) * tmp2 )
-     >           - dt * tx1
-     >           * (  c34 - c1345 ) * tmp2 * u(4,i+1,j,k)
-               a(5,5,i,j) = dt * tx2
-     >           * ( c1 * ( u(2,i+1,j,k) * tmp1 ) )
-     >           - dt * tx1 * c1345 * tmp1
-     >           - dt * tx1 * dx5
-
-c---------------------------------------------------------------------
-c   form the second block sub-diagonal
-c---------------------------------------------------------------------
-               tmp1 = 1.0d+00 / u(1,i,j+1,k)
-               tmp2 = tmp1 * tmp1
-               tmp3 = tmp1 * tmp2
-
-               b(1,1,i,j) = - dt * ty1 * dy1
-               b(1,2,i,j) =   0.0d+00
-               b(1,3,i,j) =  dt * ty2
-               b(1,4,i,j) =   0.0d+00
-               b(1,5,i,j) =   0.0d+00
-
-               b(2,1,i,j) =  dt * ty2
-     >           * ( - ( u(2,i,j+1,k)*u(3,i,j+1,k) ) * tmp2 )
-     >           - dt * ty1 * ( - c34 * tmp2 * u(2,i,j+1,k) )
-               b(2,2,i,j) =  dt * ty2 * ( u(3,i,j+1,k) * tmp1 )
-     >          - dt * ty1 * ( c34 * tmp1 )
-     >          - dt * ty1 * dy2
-               b(2,3,i,j) =  dt * ty2 * ( u(2,i,j+1,k) * tmp1 )
-               b(2,4,i,j) = 0.0d+00
-               b(2,5,i,j) = 0.0d+00
-
-               b(3,1,i,j) =  dt * ty2
-     >           * ( - ( u(3,i,j+1,k) * tmp1 ) ** 2
-     >      + 0.50d+00 * c2 * ( (  u(2,i,j+1,k) * u(2,i,j+1,k)
-     >                           + u(3,i,j+1,k) * u(3,i,j+1,k)
-     >                           + u(4,i,j+1,k) * u(4,i,j+1,k) )
-     >                          * tmp2 ) )
-     >       - dt * ty1 * ( - r43 * c34 * tmp2 * u(3,i,j+1,k) )
-               b(3,2,i,j) =  dt * ty2
-     >                   * ( - c2 * ( u(2,i,j+1,k) * tmp1 ) )
-               b(3,3,i,j) =  dt * ty2 * ( ( 2.0d+00 - c2 )
-     >                   * ( u(3,i,j+1,k) * tmp1 ) )
-     >       - dt * ty1 * ( r43 * c34 * tmp1 )
-     >       - dt * ty1 * dy3
-               b(3,4,i,j) =  dt * ty2
-     >                   * ( - c2 * ( u(4,i,j+1,k) * tmp1 ) )
-               b(3,5,i,j) =  dt * ty2 * c2
-
-               b(4,1,i,j) =  dt * ty2
-     >              * ( - ( u(3,i,j+1,k)*u(4,i,j+1,k) ) * tmp2 )
-     >       - dt * ty1 * ( - c34 * tmp2 * u(4,i,j+1,k) )
-               b(4,2,i,j) = 0.0d+00
-               b(4,3,i,j) =  dt * ty2 * ( u(4,i,j+1,k) * tmp1 )
-               b(4,4,i,j) =  dt * ty2 * ( u(3,i,j+1,k) * tmp1 )
-     >                        - dt * ty1 * ( c34 * tmp1 )
-     >                        - dt * ty1 * dy4
-               b(4,5,i,j) = 0.0d+00
-
-               b(5,1,i,j) =  dt * ty2
-     >          * ( ( c2 * (  u(2,i,j+1,k) * u(2,i,j+1,k)
-     >                      + u(3,i,j+1,k) * u(3,i,j+1,k)
-     >                      + u(4,i,j+1,k) * u(4,i,j+1,k) ) * tmp2
-     >               - c1 * ( u(5,i,j+1,k) * tmp1 ) )
-     >          * ( u(3,i,j+1,k) * tmp1 ) )
-     >          - dt * ty1
-     >          * ( - (     c34 - c1345 )*tmp3*(u(2,i,j+1,k)**2)
-     >              - ( r43*c34 - c1345 )*tmp3*(u(3,i,j+1,k)**2)
-     >              - (     c34 - c1345 )*tmp3*(u(4,i,j+1,k)**2)
-     >              - c1345*tmp2*u(5,i,j+1,k) )
-               b(5,2,i,j) =  dt * ty2
-     >          * ( - c2 * ( u(2,i,j+1,k)*u(3,i,j+1,k) ) * tmp2 )
-     >          - dt * ty1
-     >          * ( c34 - c1345 ) * tmp2 * u(2,i,j+1,k)
-               b(5,3,i,j) =  dt * ty2
-     >          * ( c1 * ( u(5,i,j+1,k) * tmp1 )
-     >          - 0.50d+00 * c2 
-     >          * ( (  u(2,i,j+1,k)*u(2,i,j+1,k)
-     >               + 3.0d+00 * u(3,i,j+1,k)*u(3,i,j+1,k)
-     >               + u(4,i,j+1,k)*u(4,i,j+1,k) ) * tmp2 ) )
-     >          - dt * ty1
-     >          * ( r43*c34 - c1345 ) * tmp2 * u(3,i,j+1,k)
-               b(5,4,i,j) =  dt * ty2
-     >          * ( - c2 * ( u(3,i,j+1,k)*u(4,i,j+1,k) ) * tmp2 )
-     >          - dt * ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j+1,k)
-               b(5,5,i,j) =  dt * ty2
-     >          * ( c1 * ( u(3,i,j+1,k) * tmp1 ) )
-     >          - dt * ty1 * c1345 * tmp1
-     >          - dt * ty1 * dy5
-
-c---------------------------------------------------------------------
-c   form the third block sub-diagonal
-c---------------------------------------------------------------------
-               tmp1 = 1.0d+00 / u(1,i,j,k+1)
-               tmp2 = tmp1 * tmp1
-               tmp3 = tmp1 * tmp2
-
-               c(1,1,i,j) = - dt * tz1 * dz1
-               c(1,2,i,j) =   0.0d+00
-               c(1,3,i,j) =   0.0d+00
-               c(1,4,i,j) = dt * tz2
-               c(1,5,i,j) =   0.0d+00
-
-               c(2,1,i,j) = dt * tz2
-     >           * ( - ( u(2,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 )
-     >           - dt * tz1 * ( - c34 * tmp2 * u(2,i,j,k+1) )
-               c(2,2,i,j) = dt * tz2 * ( u(4,i,j,k+1) * tmp1 )
-     >           - dt * tz1 * c34 * tmp1
-     >           - dt * tz1 * dz2 
-               c(2,3,i,j) = 0.0d+00
-               c(2,4,i,j) = dt * tz2 * ( u(2,i,j,k+1) * tmp1 )
-               c(2,5,i,j) = 0.0d+00
-
-               c(3,1,i,j) = dt * tz2
-     >           * ( - ( u(3,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 )
-     >           - dt * tz1 * ( - c34 * tmp2 * u(3,i,j,k+1) )
-               c(3,2,i,j) = 0.0d+00
-               c(3,3,i,j) = dt * tz2 * ( u(4,i,j,k+1) * tmp1 )
-     >           - dt * tz1 * ( c34 * tmp1 )
-     >           - dt * tz1 * dz3
-               c(3,4,i,j) = dt * tz2 * ( u(3,i,j,k+1) * tmp1 )
-               c(3,5,i,j) = 0.0d+00
-
-               c(4,1,i,j) = dt * tz2
-     >        * ( - ( u(4,i,j,k+1) * tmp1 ) ** 2
-     >            + 0.50d+00 * c2
-     >            * ( ( u(2,i,j,k+1) * u(2,i,j,k+1)
-     >                + u(3,i,j,k+1) * u(3,i,j,k+1)
-     >                + u(4,i,j,k+1) * u(4,i,j,k+1) ) * tmp2 ) )
-     >        - dt * tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k+1) )
-               c(4,2,i,j) = dt * tz2
-     >             * ( - c2 * ( u(2,i,j,k+1) * tmp1 ) )
-               c(4,3,i,j) = dt * tz2
-     >             * ( - c2 * ( u(3,i,j,k+1) * tmp1 ) )
-               c(4,4,i,j) = dt * tz2 * ( 2.0d+00 - c2 )
-     >             * ( u(4,i,j,k+1) * tmp1 )
-     >             - dt * tz1 * ( r43 * c34 * tmp1 )
-     >             - dt * tz1 * dz4
-               c(4,5,i,j) = dt * tz2 * c2
-
-               c(5,1,i,j) = dt * tz2
-     >     * ( ( c2 * (  u(2,i,j,k+1) * u(2,i,j,k+1)
-     >                 + u(3,i,j,k+1) * u(3,i,j,k+1)
-     >                 + u(4,i,j,k+1) * u(4,i,j,k+1) ) * tmp2
-     >       - c1 * ( u(5,i,j,k+1) * tmp1 ) )
-     >            * ( u(4,i,j,k+1) * tmp1 ) )
-     >       - dt * tz1
-     >       * ( - ( c34 - c1345 ) * tmp3 * (u(2,i,j,k+1)**2)
-     >           - ( c34 - c1345 ) * tmp3 * (u(3,i,j,k+1)**2)
-     >           - ( r43*c34 - c1345 )* tmp3 * (u(4,i,j,k+1)**2)
-     >          - c1345 * tmp2 * u(5,i,j,k+1) )
-               c(5,2,i,j) = dt * tz2
-     >       * ( - c2 * ( u(2,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 )
-     >       - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k+1)
-               c(5,3,i,j) = dt * tz2
-     >       * ( - c2 * ( u(3,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 )
-     >       - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k+1)
-               c(5,4,i,j) = dt * tz2
-     >       * ( c1 * ( u(5,i,j,k+1) * tmp1 )
-     >       - 0.50d+00 * c2
-     >       * ( (  u(2,i,j,k+1)*u(2,i,j,k+1)
-     >            + u(3,i,j,k+1)*u(3,i,j,k+1)
-     >            + 3.0d+00*u(4,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 ) )
-     >       - dt * tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k+1)
-               c(5,5,i,j) = dt * tz2
-     >       * ( c1 * ( u(4,i,j,k+1) * tmp1 ) )
-     >       - dt * tz1 * c1345 * tmp1
-     >       - dt * tz1 * dz5
-
-            end do
-         end do
-
-      return
-      end
diff --git a/examples/smpi/NAS/LU/l2norm.f b/examples/smpi/NAS/LU/l2norm.f
deleted file mode 100644
index 147b21deb4..0000000000
--- a/examples/smpi/NAS/LU/l2norm.f
+++ /dev/null
@@ -1,68 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      subroutine l2norm ( ldx, ldy, ldz, 
-     >                    nx0, ny0, nz0,
-     >                    ist, iend, 
-     >                    jst, jend,
-     >                    v, sum )
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   to compute the l2-norm of vector v.
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-
-c---------------------------------------------------------------------
-c  input parameters
-c---------------------------------------------------------------------
-      integer ldx, ldy, ldz
-      integer nx0, ny0, nz0
-      integer ist, iend
-      integer jst, jend
-      double precision  v(5,-1:ldx+2,-1:ldy+2,*), sum(5)
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer i, j, k, m
-      double precision  dummy(5)
-
-      integer IERROR
-
-
-      do m = 1, 5
-         dummy(m) = 0.0d+00
-      end do
-
-      do k = 2, nz0-1
-         do j = jst, jend
-            do i = ist, iend
-               do m = 1, 5
-                  dummy(m) = dummy(m) + v(m,i,j,k) * v(m,i,j,k)
-               end do
-            end do
-         end do
-      end do
-
-c---------------------------------------------------------------------
-c   compute the global sum of individual contributions to dot product.
-c---------------------------------------------------------------------
-      call MPI_ALLREDUCE( dummy,
-     >                    sum,
-     >                    5,
-     >                    dp_type,
-     >                    MPI_SUM,
-     >                    MPI_COMM_WORLD,
-     >                    IERROR )
-
-      do m = 1, 5
-         sum(m) = sqrt ( sum(m) / ( (nx0-2)*(ny0-2)*(nz0-2) ) )
-      end do
-
-      return
-      end
diff --git a/examples/smpi/NAS/LU/lu.f b/examples/smpi/NAS/LU/lu.f
deleted file mode 100644
index 543463a07b..0000000000
--- a/examples/smpi/NAS/LU/lu.f
+++ /dev/null
@@ -1,164 +0,0 @@
-!-------------------------------------------------------------------------!
-!                                                                         !
-!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
-!                                                                         !
-!                                   L U                                   !
-!                                                                         !
-!-------------------------------------------------------------------------!
-!                                                                         !
-!    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
-!    It is described in NAS Technical Reports 95-020 and 02-007           !
-!                                                                         !
-!    Permission to use, copy, distribute and modify this software         !
-!    for any purpose with or without fee is hereby granted.  We           !
-!    request, however, that all derived work reference the NAS            !
-!    Parallel Benchmarks 3.3. This software is provided "as is"           !
-!    without express or implied warranty.                                 !
-!                                                                         !
-!    Information on NPB 3.3, including the technical report, the          !
-!    original specifications, source code, results and information        !
-!    on how to submit new results, is available at:                       !
-!                                                                         !
-!           http://www.nas.nasa.gov/Software/NPB/                         !
-!                                                                         !
-!    Send comments or suggestions to  npb@nas.nasa.gov                    !
-!                                                                         !
-!          NAS Parallel Benchmarks Group                                  !
-!          NASA Ames Research Center                                      !
-!          Mail Stop: T27A-1                                              !
-!          Moffett Field, CA   94035-1000                                 !
-!                                                                         !
-!          E-mail:  npb@nas.nasa.gov                                      !
-!          Fax:     (650) 604-3957                                        !
-!                                                                         !
-!-------------------------------------------------------------------------!
-
-c---------------------------------------------------------------------
-c
-c Authors: S. Weeratunga
-c          V. Venkatakrishnan
-c          E. Barszcz
-c          M. Yarrow
-c
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-      program applu
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c   driver for the performance evaluation of the solver for
-c   five coupled parabolic/elliptic partial differential equations.
-c
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'applu.incl'
-      character class
-      logical verified
-      double precision mflops
-      integer ierr
-
-c---------------------------------------------------------------------
-c   initialize communications
-c---------------------------------------------------------------------
-      call init_comm()
-
-c---------------------------------------------------------------------
-c   read input data
-c---------------------------------------------------------------------
-      call read_input()
-
-c---------------------------------------------------------------------
-c   set up processor grid
-c---------------------------------------------------------------------
-      call proc_grid()
-
-c---------------------------------------------------------------------
-c   determine the neighbors
-c---------------------------------------------------------------------
-      call neighbors()
-
-c---------------------------------------------------------------------
-c   set up sub-domain sizes
-c---------------------------------------------------------------------
-      call subdomain()
-
-c---------------------------------------------------------------------
-c   set up coefficients
-c---------------------------------------------------------------------
-      call setcoeff()
-
-c---------------------------------------------------------------------
-c   set the masks required for comm
-c---------------------------------------------------------------------
-      call sethyper()
-
-c---------------------------------------------------------------------
-c   set the boundary values for dependent variables
-c---------------------------------------------------------------------
-      call setbv()
-
-c---------------------------------------------------------------------
-c   set the initial values for dependent variables
-c---------------------------------------------------------------------
-      call setiv()
-
-c---------------------------------------------------------------------
-c   compute the forcing term based on prescribed exact solution
-c---------------------------------------------------------------------
-      call erhs()
-
-c---------------------------------------------------------------------
-c   perform one SSOR iteration to touch all data and program pages 
-c---------------------------------------------------------------------
-      call ssor(1)
-
-c---------------------------------------------------------------------
-c   reset the boundary and initial values
-c---------------------------------------------------------------------
-      call setbv()
-      call setiv()
-
-c---------------------------------------------------------------------
-c   perform the SSOR iterations
-c---------------------------------------------------------------------
-      call ssor(itmax)
-
-c---------------------------------------------------------------------
-c   compute the solution error
-c---------------------------------------------------------------------
-      call error()
-
-c---------------------------------------------------------------------
-c   compute the surface integral
-c---------------------------------------------------------------------
-      call pintgr()
-
-c---------------------------------------------------------------------
-c   verification test
-c---------------------------------------------------------------------
-      IF (id.eq.0) THEN
-         call verify ( rsdnm, errnm, frc, class, verified )
-         mflops = float(itmax)*(1984.77*float( nx0 )
-     >        *float( ny0 )
-     >        *float( nz0 )
-     >        -10923.3*(float( nx0+ny0+nz0 )/3.)**2 
-     >        +27770.9* float( nx0+ny0+nz0 )/3.
-     >        -144010.)
-     >        / (maxtime*1000000.)
-
-         call print_results('LU', class, nx0,
-     >     ny0, nz0, itmax, nnodes_compiled,
-     >     num, maxtime, mflops, '          floating point', verified, 
-     >     npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, 
-     >     '(none)')
-
-      END IF
-
-      call mpi_finalize(ierr)
-      end
-
-
diff --git a/examples/smpi/NAS/LU/mpinpb.h b/examples/smpi/NAS/LU/mpinpb.h
deleted file mode 100644
index ddbf1515f0..0000000000
--- a/examples/smpi/NAS/LU/mpinpb.h
+++ /dev/null
@@ -1,11 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include 'mpif.h'
-
-      integer           node, no_nodes, root, comm_setup, 
-     >                  comm_solve, comm_rhs, dp_type
-      common /mpistuff/ node, no_nodes, root, comm_setup, 
-     >                  comm_solve, comm_rhs, dp_type
-
diff --git a/examples/smpi/NAS/LU/neighbors.f b/examples/smpi/NAS/LU/neighbors.f
deleted file mode 100644
index ed8a3126af..0000000000
--- a/examples/smpi/NAS/LU/neighbors.f
+++ /dev/null
@@ -1,48 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine neighbors ()
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c     figure out the neighbors and their wrap numbers for each processor
-c---------------------------------------------------------------------
-
-        south = -1
-        east  = -1
-        north = -1
-        west  = -1
-
-      if (row.gt.1) then
-              north = id -1
-      else
-              north = -1
-      end if
-
-      if (row.lt.xdim) then
-              south = id + 1
-      else
-              south = -1
-      end if
-
-      if (col.gt.1) then
-              west = id- xdim
-      else
-              west = -1
-      end if
-
-      if (col.lt.ydim) then
-              east = id + xdim
-      else 
-              east = -1
-      end if
-
-      return
-      end
diff --git a/examples/smpi/NAS/LU/nodedim.f b/examples/smpi/NAS/LU/nodedim.f
deleted file mode 100644
index f4def3a01e..0000000000
--- a/examples/smpi/NAS/LU/nodedim.f
+++ /dev/null
@@ -1,36 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      integer function nodedim(num)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c  compute the exponent where num = 2**nodedim
-c  NOTE: assumes a power-of-two number of nodes
-c
-c---------------------------------------------------------------------
-
-      implicit none
-
-c---------------------------------------------------------------------
-c  input parameters
-c---------------------------------------------------------------------
-      integer num
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      double precision fnum
-
-
-      fnum = dble(num)
-      nodedim = log(fnum)/log(2.0d+0) + 0.00001
-
-      return
-      end
-
-
diff --git a/examples/smpi/NAS/LU/pintgr.f b/examples/smpi/NAS/LU/pintgr.f
deleted file mode 100644
index de514cccd2..0000000000
--- a/examples/smpi/NAS/LU/pintgr.f
+++ /dev/null
@@ -1,288 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine pintgr
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer i, j, k
-      integer ibeg, ifin, ifin1
-      integer jbeg, jfin, jfin1
-      integer iglob, iglob1, iglob2
-      integer jglob, jglob1, jglob2
-      integer ind1, ind2
-      double precision  phi1(0:isiz2+1,0:isiz3+1),
-     >                  phi2(0:isiz2+1,0:isiz3+1)
-      double precision  frc1, frc2, frc3
-      double precision  dummy
-
-      integer IERROR
-
-
-c---------------------------------------------------------------------
-c   set up the sub-domains for integeration in each processor
-c---------------------------------------------------------------------
-      ibeg = nx + 1
-      ifin = 0
-      iglob1 = ipt + 1
-      iglob2 = ipt + nx
-      if (iglob1.ge.ii1.and.iglob2.lt.ii2+nx) ibeg = 1
-      if (iglob1.gt.ii1-nx.and.iglob2.le.ii2) ifin = nx
-      if (ii1.ge.iglob1.and.ii1.le.iglob2) ibeg = ii1 - ipt
-      if (ii2.ge.iglob1.and.ii2.le.iglob2) ifin = ii2 - ipt
-      jbeg = ny + 1
-      jfin = 0
-      jglob1 = jpt + 1
-      jglob2 = jpt + ny
-      if (jglob1.ge.ji1.and.jglob2.lt.ji2+ny) jbeg = 1
-      if (jglob1.gt.ji1-ny.and.jglob2.le.ji2) jfin = ny
-      if (ji1.ge.jglob1.and.ji1.le.jglob2) jbeg = ji1 - jpt
-      if (ji2.ge.jglob1.and.ji2.le.jglob2) jfin = ji2 - jpt
-      ifin1 = ifin
-      jfin1 = jfin
-      if (ipt + ifin1.eq.ii2) ifin1 = ifin -1
-      if (jpt + jfin1.eq.ji2) jfin1 = jfin -1
-
-c---------------------------------------------------------------------
-c   initialize
-c---------------------------------------------------------------------
-      do i = 0,isiz2+1
-        do k = 0,isiz3+1
-          phi1(i,k) = 0.
-          phi2(i,k) = 0.
-        end do
-      end do
-
-      do j = jbeg,jfin
-         jglob = jpt + j
-         do i = ibeg,ifin
-            iglob = ipt + i
-
-            k = ki1
-
-            phi1(i,j) = c2*(  u(5,i,j,k)
-     >           - 0.50d+00 * (  u(2,i,j,k) ** 2
-     >                         + u(3,i,j,k) ** 2
-     >                         + u(4,i,j,k) ** 2 )
-     >                        / u(1,i,j,k) )
-
-            k = ki2
-
-            phi2(i,j) = c2*(  u(5,i,j,k)
-     >           - 0.50d+00 * (  u(2,i,j,k) ** 2
-     >                         + u(3,i,j,k) ** 2
-     >                         + u(4,i,j,k) ** 2 )
-     >                        / u(1,i,j,k) )
-         end do
-      end do
-
-c---------------------------------------------------------------------
-c  communicate in i and j directions
-c---------------------------------------------------------------------
-      call exchange_4(phi1,phi2,ibeg,ifin1,jbeg,jfin1)
-
-      frc1 = 0.0d+00
-
-      do j = jbeg,jfin1
-         do i = ibeg, ifin1
-            frc1 = frc1 + (  phi1(i,j)
-     >                     + phi1(i+1,j)
-     >                     + phi1(i,j+1)
-     >                     + phi1(i+1,j+1)
-     >                     + phi2(i,j)
-     >                     + phi2(i+1,j)
-     >                     + phi2(i,j+1)
-     >                     + phi2(i+1,j+1) )
-         end do
-      end do
-
-c---------------------------------------------------------------------
-c  compute the global sum of individual contributions to frc1
-c---------------------------------------------------------------------
-      dummy = frc1
-      call MPI_ALLREDUCE( dummy,
-     >                    frc1,
-     >                    1,
-     >                    dp_type,
-     >                    MPI_SUM,
-     >                    MPI_COMM_WORLD,
-     >                    IERROR )
-
-      frc1 = dxi * deta * frc1
-
-c---------------------------------------------------------------------
-c   initialize
-c---------------------------------------------------------------------
-      do i = 0,isiz2+1
-        do k = 0,isiz3+1
-          phi1(i,k) = 0.
-          phi2(i,k) = 0.
-        end do
-      end do
-      jglob = jpt + jbeg
-      ind1 = 0
-      if (jglob.eq.ji1) then
-        ind1 = 1
-        do k = ki1, ki2
-           do i = ibeg, ifin
-              iglob = ipt + i
-              phi1(i,k) = c2*(  u(5,i,jbeg,k)
-     >             - 0.50d+00 * (  u(2,i,jbeg,k) ** 2
-     >                           + u(3,i,jbeg,k) ** 2
-     >                           + u(4,i,jbeg,k) ** 2 )
-     >                          / u(1,i,jbeg,k) )
-           end do
-        end do
-      end if
-
-      jglob = jpt + jfin
-      ind2 = 0
-      if (jglob.eq.ji2) then
-        ind2 = 1
-        do k = ki1, ki2
-           do i = ibeg, ifin
-              iglob = ipt + i
-              phi2(i,k) = c2*(  u(5,i,jfin,k)
-     >             - 0.50d+00 * (  u(2,i,jfin,k) ** 2
-     >                           + u(3,i,jfin,k) ** 2
-     >                           + u(4,i,jfin,k) ** 2 )
-     >                          / u(1,i,jfin,k) )
-           end do
-        end do
-      end if
-
-c---------------------------------------------------------------------
-c  communicate in i direction
-c---------------------------------------------------------------------
-      if (ind1.eq.1) then
-        call exchange_5(phi1,ibeg,ifin1)
-      end if
-      if (ind2.eq.1) then
-        call exchange_5 (phi2,ibeg,ifin1)
-      end if
-
-      frc2 = 0.0d+00
-      do k = ki1, ki2-1
-         do i = ibeg, ifin1
-            frc2 = frc2 + (  phi1(i,k)
-     >                     + phi1(i+1,k)
-     >                     + phi1(i,k+1)
-     >                     + phi1(i+1,k+1)
-     >                     + phi2(i,k)
-     >                     + phi2(i+1,k)
-     >                     + phi2(i,k+1)
-     >                     + phi2(i+1,k+1) )
-         end do
-      end do
-
-c---------------------------------------------------------------------
-c  compute the global sum of individual contributions to frc2
-c---------------------------------------------------------------------
-      dummy = frc2
-      call MPI_ALLREDUCE( dummy,
-     >                    frc2,
-     >                    1,
-     >                    dp_type,
-     >                    MPI_SUM,
-     >                    MPI_COMM_WORLD,
-     >                    IERROR )
-
-      frc2 = dxi * dzeta * frc2
-
-c---------------------------------------------------------------------
-c   initialize
-c---------------------------------------------------------------------
-      do i = 0,isiz2+1
-        do k = 0,isiz3+1
-          phi1(i,k) = 0.
-          phi2(i,k) = 0.
-        end do
-      end do
-      iglob = ipt + ibeg
-      ind1 = 0
-      if (iglob.eq.ii1) then
-        ind1 = 1
-        do k = ki1, ki2
-           do j = jbeg, jfin
-              jglob = jpt + j
-              phi1(j,k) = c2*(  u(5,ibeg,j,k)
-     >             - 0.50d+00 * (  u(2,ibeg,j,k) ** 2
-     >                           + u(3,ibeg,j,k) ** 2
-     >                           + u(4,ibeg,j,k) ** 2 )
-     >                          / u(1,ibeg,j,k) )
-           end do
-        end do
-      end if
-
-      iglob = ipt + ifin
-      ind2 = 0
-      if (iglob.eq.ii2) then
-        ind2 = 1
-        do k = ki1, ki2
-           do j = jbeg, jfin
-              jglob = jpt + j
-              phi2(j,k) = c2*(  u(5,ifin,j,k)
-     >             - 0.50d+00 * (  u(2,ifin,j,k) ** 2
-     >                           + u(3,ifin,j,k) ** 2
-     >                           + u(4,ifin,j,k) ** 2 )
-     >                          / u(1,ifin,j,k) )
-           end do
-        end do
-      end if
-
-c---------------------------------------------------------------------
-c  communicate in j direction
-c---------------------------------------------------------------------
-      if (ind1.eq.1) then
-        call exchange_6(phi1,jbeg,jfin1)
-      end if
-      if (ind2.eq.1) then
-        call exchange_6(phi2,jbeg,jfin1)
-      end if
-
-      frc3 = 0.0d+00
-
-      do k = ki1, ki2-1
-         do j = jbeg, jfin1
-            frc3 = frc3 + (  phi1(j,k)
-     >                     + phi1(j+1,k)
-     >                     + phi1(j,k+1)
-     >                     + phi1(j+1,k+1)
-     >                     + phi2(j,k)
-     >                     + phi2(j+1,k)
-     >                     + phi2(j,k+1)
-     >                     + phi2(j+1,k+1) )
-         end do
-      end do
-
-c---------------------------------------------------------------------
-c  compute the global sum of individual contributions to frc3
-c---------------------------------------------------------------------
-      dummy = frc3
-      call MPI_ALLREDUCE( dummy,
-     >                    frc3,
-     >                    1,
-     >                    dp_type,
-     >                    MPI_SUM,
-     >                    MPI_COMM_WORLD,
-     >                    IERROR )
-
-      frc3 = deta * dzeta * frc3
-      frc = 0.25d+00 * ( frc1 + frc2 + frc3 )
-c      if (id.eq.0) write (*,1001) frc
-
-      return
-
- 1001 format (//5x,'surface integral = ',1pe12.5//)
-
-      end
diff --git a/examples/smpi/NAS/LU/proc_grid.f b/examples/smpi/NAS/LU/proc_grid.f
deleted file mode 100644
index 40271c135e..0000000000
--- a/examples/smpi/NAS/LU/proc_grid.f
+++ /dev/null
@@ -1,36 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine proc_grid
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c   set up a two-d grid for processors: column-major ordering of unknowns
-c   NOTE: assumes a power-of-two number of processors
-c
-c---------------------------------------------------------------------
-
-      xdim   = 2**(ndim/2)
-      if (mod(ndim,2).eq.1) xdim = xdim + xdim
-      ydim   = num/xdim
-
-      row    = mod(id,xdim) + 1
-      col    = id/xdim + 1
-
-
-      return
-      end
-
-
diff --git a/examples/smpi/NAS/LU/read_input.f b/examples/smpi/NAS/LU/read_input.f
deleted file mode 100644
index b2e5ff1e79..0000000000
--- a/examples/smpi/NAS/LU/read_input.f
+++ /dev/null
@@ -1,127 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine read_input
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-      include 'applu.incl'
-
-      integer IERROR, fstatus, nnodes
-
-
-c---------------------------------------------------------------------
-c    only root reads the input file
-c    if input file does not exist, it uses defaults
-c       ipr = 1 for detailed progress output
-c       inorm = how often the norm is printed (once every inorm iterations)
-c       itmax = number of pseudo time steps
-c       dt = time step
-c       omega 1 over-relaxation factor for SSOR
-c       tolrsd = steady state residual tolerance levels
-c       nx, ny, nz = number of grid points in x, y, z directions
-c---------------------------------------------------------------------
-      ROOT = 0
-      if (id .eq. ROOT) then
-
-         write(*, 1000)
-
-         open (unit=3,file='inputlu.data',status='old',
-     >         access='sequential',form='formatted', iostat=fstatus)
-         if (fstatus .eq. 0) then
-
-            write(*, *) 'Reading from input file inputlu.data'
-
-            read (3,*)
-            read (3,*)
-            read (3,*) ipr, inorm
-            read (3,*)
-            read (3,*)
-            read (3,*) itmax
-            read (3,*)
-            read (3,*)
-            read (3,*) dt
-            read (3,*)
-            read (3,*)
-            read (3,*) omega
-            read (3,*)
-            read (3,*)
-            read (3,*) tolrsd(1),tolrsd(2),tolrsd(3),tolrsd(4),tolrsd(5)
-            read (3,*)
-            read (3,*)
-            read (3,*) nx0, ny0, nz0
-            close(3)
-         else
-            ipr = ipr_default
-            inorm = inorm_default
-            itmax = itmax_default
-            dt = dt_default
-            omega = omega_default
-            tolrsd(1) = tolrsd1_def
-            tolrsd(2) = tolrsd2_def
-            tolrsd(3) = tolrsd3_def
-            tolrsd(4) = tolrsd4_def
-            tolrsd(5) = tolrsd5_def
-            nx0 = isiz01
-            ny0 = isiz02
-            nz0 = isiz03
-         endif
-
-c---------------------------------------------------------------------
-c   check problem size
-c---------------------------------------------------------------------
-         call MPI_COMM_SIZE(MPI_COMM_WORLD, nnodes, ierror)
-         if (nnodes .ne. nnodes_compiled) then
-            write (*, 2000) nnodes, nnodes_compiled
- 2000       format (5x,'Warning: program is running on',i3,' processors'
-     >             /5x,'but was compiled for ', i3)
-         endif
-
-         if ( ( nx0 .lt. 4 ) .or.
-     >        ( ny0 .lt. 4 ) .or.
-     >        ( nz0 .lt. 4 ) ) then
-
-            write (*,2001)
- 2001       format (5x,'PROBLEM SIZE IS TOO SMALL - ',
-     >           /5x,'SET EACH OF NX, NY AND NZ AT LEAST EQUAL TO 5')
-            CALL MPI_ABORT( MPI_COMM_WORLD, MPI_ERR_OTHER, IERROR )
-
-         end if
-
-         if ( ( nx0 .gt. isiz01 ) .or.
-     >        ( ny0 .gt. isiz02 ) .or.
-     >        ( nz0 .gt. isiz03 ) ) then
-
-            write (*,2002)
- 2002       format (5x,'PROBLEM SIZE IS TOO LARGE - ',
-     >           /5x,'NX, NY AND NZ SHOULD BE LESS THAN OR EQUAL TO ',
-     >           /5x,'ISIZ01, ISIZ02 AND ISIZ03 RESPECTIVELY')
-            CALL MPI_ABORT( MPI_COMM_WORLD, MPI_ERR_OTHER, IERROR )
-
-         end if
-
-
-         write(*, 1001) nx0, ny0, nz0
-         write(*, 1002) itmax
-         write(*, 1003) nnodes
-
- 1000 format(//, ' NAS Parallel Benchmarks 3.3 -- LU Benchmark',/)
- 1001    format(' Size: ', i4, 'x', i4, 'x', i4)
- 1002    format(' Iterations: ', i4)
- 1003    format(' Number of processes: ', i5, /)
-         
-
-
-      end if
-
-      call bcast_inputs
-
-      return
-      end
-
-
diff --git a/examples/smpi/NAS/LU/rhs.f b/examples/smpi/NAS/LU/rhs.f
deleted file mode 100644
index 3da39117a4..0000000000
--- a/examples/smpi/NAS/LU/rhs.f
+++ /dev/null
@@ -1,504 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine rhs
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   compute the right hand sides
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer i, j, k, m
-      integer iex
-      integer L1, L2
-      integer ist1, iend1
-      integer jst1, jend1
-      double precision  q
-      double precision  u21, u31, u41
-      double precision  tmp
-      double precision  u21i, u31i, u41i, u51i
-      double precision  u21j, u31j, u41j, u51j
-      double precision  u21k, u31k, u41k, u51k
-      double precision  u21im1, u31im1, u41im1, u51im1
-      double precision  u21jm1, u31jm1, u41jm1, u51jm1
-      double precision  u21km1, u31km1, u41km1, u51km1
-
-
-
-      do k = 1, nz
-         do j = 1, ny
-            do i = 1, nx
-               do m = 1, 5
-                  rsd(m,i,j,k) = - frct(m,i,j,k)
-               end do
-            end do
-         end do
-      end do
-
-c---------------------------------------------------------------------
-c   xi-direction flux differences
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   iex = flag : iex = 0  north/south communication
-c              : iex = 1  east/west communication
-c---------------------------------------------------------------------
-      iex   = 0
-
-c---------------------------------------------------------------------
-c   communicate and receive/send two rows of data
-c---------------------------------------------------------------------
-      call exchange_3(u,iex)
-
-      L1 = 0
-      if (north.eq.-1) L1 = 1
-      L2 = nx + 1
-      if (south.eq.-1) L2 = nx
-
-      ist1 = 1
-      iend1 = nx
-      if (north.eq.-1) ist1 = 4
-      if (south.eq.-1) iend1 = nx - 3
-
-      do k = 2, nz - 1
-         do j = jst, jend
-            do i = L1, L2
-               flux(1,i,j,k) = u(2,i,j,k)
-               u21 = u(2,i,j,k) / u(1,i,j,k)
-
-               q = 0.50d+00 * (  u(2,i,j,k) * u(2,i,j,k)
-     >                         + u(3,i,j,k) * u(3,i,j,k)
-     >                         + u(4,i,j,k) * u(4,i,j,k) )
-     >                      / u(1,i,j,k)
-
-               flux(2,i,j,k) = u(2,i,j,k) * u21 + c2 * 
-     >                        ( u(5,i,j,k) - q )
-               flux(3,i,j,k) = u(3,i,j,k) * u21
-               flux(4,i,j,k) = u(4,i,j,k) * u21
-               flux(5,i,j,k) = ( c1 * u(5,i,j,k) - c2 * q ) * u21
-            end do
-
-            do i = ist, iend
-               do m = 1, 5
-                  rsd(m,i,j,k) =  rsd(m,i,j,k)
-     >                 - tx2 * ( flux(m,i+1,j,k) - flux(m,i-1,j,k) )
-               end do
-            end do
-
-            do i = ist, L2
-               tmp = 1.0d+00 / u(1,i,j,k)
-
-               u21i = tmp * u(2,i,j,k)
-               u31i = tmp * u(3,i,j,k)
-               u41i = tmp * u(4,i,j,k)
-               u51i = tmp * u(5,i,j,k)
-
-               tmp = 1.0d+00 / u(1,i-1,j,k)
-
-               u21im1 = tmp * u(2,i-1,j,k)
-               u31im1 = tmp * u(3,i-1,j,k)
-               u41im1 = tmp * u(4,i-1,j,k)
-               u51im1 = tmp * u(5,i-1,j,k)
-
-               flux(2,i,j,k) = (4.0d+00/3.0d+00) * tx3 * (u21i-u21im1)
-               flux(3,i,j,k) = tx3 * ( u31i - u31im1 )
-               flux(4,i,j,k) = tx3 * ( u41i - u41im1 )
-               flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
-     >              * tx3 * ( ( u21i  **2 + u31i  **2 + u41i  **2 )
-     >                      - ( u21im1**2 + u31im1**2 + u41im1**2 ) )
-     >              + (1.0d+00/6.0d+00)
-     >              * tx3 * ( u21i**2 - u21im1**2 )
-     >              + c1 * c5 * tx3 * ( u51i - u51im1 )
-            end do
-
-            do i = ist, iend
-               rsd(1,i,j,k) = rsd(1,i,j,k)
-     >              + dx1 * tx1 * (            u(1,i-1,j,k)
-     >                             - 2.0d+00 * u(1,i,j,k)
-     >                             +           u(1,i+1,j,k) )
-               rsd(2,i,j,k) = rsd(2,i,j,k)
-     >          + tx3 * c3 * c4 * ( flux(2,i+1,j,k) - flux(2,i,j,k) )
-     >              + dx2 * tx1 * (            u(2,i-1,j,k)
-     >                             - 2.0d+00 * u(2,i,j,k)
-     >                             +           u(2,i+1,j,k) )
-               rsd(3,i,j,k) = rsd(3,i,j,k)
-     >          + tx3 * c3 * c4 * ( flux(3,i+1,j,k) - flux(3,i,j,k) )
-     >              + dx3 * tx1 * (            u(3,i-1,j,k)
-     >                             - 2.0d+00 * u(3,i,j,k)
-     >                             +           u(3,i+1,j,k) )
-               rsd(4,i,j,k) = rsd(4,i,j,k)
-     >          + tx3 * c3 * c4 * ( flux(4,i+1,j,k) - flux(4,i,j,k) )
-     >              + dx4 * tx1 * (            u(4,i-1,j,k)
-     >                             - 2.0d+00 * u(4,i,j,k)
-     >                             +           u(4,i+1,j,k) )
-               rsd(5,i,j,k) = rsd(5,i,j,k)
-     >          + tx3 * c3 * c4 * ( flux(5,i+1,j,k) - flux(5,i,j,k) )
-     >              + dx5 * tx1 * (            u(5,i-1,j,k)
-     >                             - 2.0d+00 * u(5,i,j,k)
-     >                             +           u(5,i+1,j,k) )
-            end do
-
-c---------------------------------------------------------------------
-c   Fourth-order dissipation
-c---------------------------------------------------------------------
-            IF (north.eq.-1) then
-             do m = 1, 5
-               rsd(m,2,j,k) = rsd(m,2,j,k)
-     >           - dssp * ( + 5.0d+00 * u(m,2,j,k)
-     >                      - 4.0d+00 * u(m,3,j,k)
-     >                      +           u(m,4,j,k) )
-               rsd(m,3,j,k) = rsd(m,3,j,k)
-     >           - dssp * ( - 4.0d+00 * u(m,2,j,k)
-     >                      + 6.0d+00 * u(m,3,j,k)
-     >                      - 4.0d+00 * u(m,4,j,k)
-     >                      +           u(m,5,j,k) )
-             end do
-            END IF
-
-            do i = ist1,iend1
-               do m = 1, 5
-                  rsd(m,i,j,k) = rsd(m,i,j,k)
-     >              - dssp * (            u(m,i-2,j,k)
-     >                        - 4.0d+00 * u(m,i-1,j,k)
-     >                        + 6.0d+00 * u(m,i,j,k)
-     >                        - 4.0d+00 * u(m,i+1,j,k)
-     >                        +           u(m,i+2,j,k) )
-               end do
-            end do
-
-            IF (south.eq.-1) then
-             do m = 1, 5
-               rsd(m,nx-2,j,k) = rsd(m,nx-2,j,k)
-     >           - dssp * (             u(m,nx-4,j,k)
-     >                      - 4.0d+00 * u(m,nx-3,j,k)
-     >                      + 6.0d+00 * u(m,nx-2,j,k)
-     >                      - 4.0d+00 * u(m,nx-1,j,k)  )
-               rsd(m,nx-1,j,k) = rsd(m,nx-1,j,k)
-     >           - dssp * (             u(m,nx-3,j,k)
-     >                      - 4.0d+00 * u(m,nx-2,j,k)
-     >                      + 5.0d+00 * u(m,nx-1,j,k) )
-             end do
-            END IF
-
-         end do
-      end do 
-
-c---------------------------------------------------------------------
-c   eta-direction flux differences
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   iex = flag : iex = 0  north/south communication
-c---------------------------------------------------------------------
-      iex   = 1
-
-c---------------------------------------------------------------------
-c   communicate and receive/send two rows of data
-c---------------------------------------------------------------------
-      call exchange_3(u,iex)
-
-      L1 = 0
-      if (west.eq.-1) L1 = 1
-      L2 = ny + 1
-      if (east.eq.-1) L2 = ny
-
-      jst1 = 1
-      jend1 = ny
-      if (west.eq.-1) jst1 = 4
-      if (east.eq.-1) jend1 = ny - 3
-
-      do k = 2, nz - 1
-         do j = L1, L2
-            do i = ist, iend
-               flux(1,i,j,k) = u(3,i,j,k)
-               u31 = u(3,i,j,k) / u(1,i,j,k)
-
-               q = 0.50d+00 * (  u(2,i,j,k) * u(2,i,j,k)
-     >                         + u(3,i,j,k) * u(3,i,j,k)
-     >                         + u(4,i,j,k) * u(4,i,j,k) )
-     >                      / u(1,i,j,k)
-
-               flux(2,i,j,k) = u(2,i,j,k) * u31 
-               flux(3,i,j,k) = u(3,i,j,k) * u31 + c2 * (u(5,i,j,k)-q)
-               flux(4,i,j,k) = u(4,i,j,k) * u31
-               flux(5,i,j,k) = ( c1 * u(5,i,j,k) - c2 * q ) * u31
-            end do
-         end do
-
-         do j = jst, jend
-            do i = ist, iend
-               do m = 1, 5
-                  rsd(m,i,j,k) =  rsd(m,i,j,k)
-     >                   - ty2 * ( flux(m,i,j+1,k) - flux(m,i,j-1,k) )
-               end do
-            end do
-         end do
-
-         do j = jst, L2
-            do i = ist, iend
-               tmp = 1.0d+00 / u(1,i,j,k)
-
-               u21j = tmp * u(2,i,j,k)
-               u31j = tmp * u(3,i,j,k)
-               u41j = tmp * u(4,i,j,k)
-               u51j = tmp * u(5,i,j,k)
-
-               tmp = 1.0d+00 / u(1,i,j-1,k)
-               u21jm1 = tmp * u(2,i,j-1,k)
-               u31jm1 = tmp * u(3,i,j-1,k)
-               u41jm1 = tmp * u(4,i,j-1,k)
-               u51jm1 = tmp * u(5,i,j-1,k)
-
-               flux(2,i,j,k) = ty3 * ( u21j - u21jm1 )
-               flux(3,i,j,k) = (4.0d+00/3.0d+00) * ty3 * (u31j-u31jm1)
-               flux(4,i,j,k) = ty3 * ( u41j - u41jm1 )
-               flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
-     >              * ty3 * ( ( u21j  **2 + u31j  **2 + u41j  **2 )
-     >                      - ( u21jm1**2 + u31jm1**2 + u41jm1**2 ) )
-     >              + (1.0d+00/6.0d+00)
-     >              * ty3 * ( u31j**2 - u31jm1**2 )
-     >              + c1 * c5 * ty3 * ( u51j - u51jm1 )
-            end do
-         end do
-
-         do j = jst, jend
-            do i = ist, iend
-
-               rsd(1,i,j,k) = rsd(1,i,j,k)
-     >              + dy1 * ty1 * (            u(1,i,j-1,k)
-     >                             - 2.0d+00 * u(1,i,j,k)
-     >                             +           u(1,i,j+1,k) )
-
-               rsd(2,i,j,k) = rsd(2,i,j,k)
-     >          + ty3 * c3 * c4 * ( flux(2,i,j+1,k) - flux(2,i,j,k) )
-     >              + dy2 * ty1 * (            u(2,i,j-1,k)
-     >                             - 2.0d+00 * u(2,i,j,k)
-     >                             +           u(2,i,j+1,k) )
-
-               rsd(3,i,j,k) = rsd(3,i,j,k)
-     >          + ty3 * c3 * c4 * ( flux(3,i,j+1,k) - flux(3,i,j,k) )
-     >              + dy3 * ty1 * (            u(3,i,j-1,k)
-     >                             - 2.0d+00 * u(3,i,j,k)
-     >                             +           u(3,i,j+1,k) )
-
-               rsd(4,i,j,k) = rsd(4,i,j,k)
-     >          + ty3 * c3 * c4 * ( flux(4,i,j+1,k) - flux(4,i,j,k) )
-     >              + dy4 * ty1 * (            u(4,i,j-1,k)
-     >                             - 2.0d+00 * u(4,i,j,k)
-     >                             +           u(4,i,j+1,k) )
-
-               rsd(5,i,j,k) = rsd(5,i,j,k)
-     >          + ty3 * c3 * c4 * ( flux(5,i,j+1,k) - flux(5,i,j,k) )
-     >              + dy5 * ty1 * (            u(5,i,j-1,k)
-     >                             - 2.0d+00 * u(5,i,j,k)
-     >                             +           u(5,i,j+1,k) )
-
-            end do
-         end do
-
-c---------------------------------------------------------------------
-c   fourth-order dissipation
-c---------------------------------------------------------------------
-         IF (west.eq.-1) then
-            do i = ist, iend
-             do m = 1, 5
-               rsd(m,i,2,k) = rsd(m,i,2,k)
-     >           - dssp * ( + 5.0d+00 * u(m,i,2,k)
-     >                      - 4.0d+00 * u(m,i,3,k)
-     >                      +           u(m,i,4,k) )
-               rsd(m,i,3,k) = rsd(m,i,3,k)
-     >           - dssp * ( - 4.0d+00 * u(m,i,2,k)
-     >                      + 6.0d+00 * u(m,i,3,k)
-     >                      - 4.0d+00 * u(m,i,4,k)
-     >                      +           u(m,i,5,k) )
-             end do
-            end do
-         END IF
-
-         do j = jst1, jend1
-            do i = ist, iend
-               do m = 1, 5
-                  rsd(m,i,j,k) = rsd(m,i,j,k)
-     >              - dssp * (            u(m,i,j-2,k)
-     >                        - 4.0d+00 * u(m,i,j-1,k)
-     >                        + 6.0d+00 * u(m,i,j,k)
-     >                        - 4.0d+00 * u(m,i,j+1,k)
-     >                        +           u(m,i,j+2,k) )
-               end do
-            end do
-         end do
-
-         IF (east.eq.-1) then
-            do i = ist, iend
-             do m = 1, 5
-               rsd(m,i,ny-2,k) = rsd(m,i,ny-2,k)
-     >           - dssp * (             u(m,i,ny-4,k)
-     >                      - 4.0d+00 * u(m,i,ny-3,k)
-     >                      + 6.0d+00 * u(m,i,ny-2,k)
-     >                      - 4.0d+00 * u(m,i,ny-1,k)  )
-               rsd(m,i,ny-1,k) = rsd(m,i,ny-1,k)
-     >           - dssp * (             u(m,i,ny-3,k)
-     >                      - 4.0d+00 * u(m,i,ny-2,k)
-     >                      + 5.0d+00 * u(m,i,ny-1,k) )
-             end do
-            end do
-         END IF
-
-      end do
-
-c---------------------------------------------------------------------
-c   zeta-direction flux differences
-c---------------------------------------------------------------------
-      do k = 1, nz
-         do j = jst, jend
-            do i = ist, iend
-               flux(1,i,j,k) = u(4,i,j,k)
-               u41 = u(4,i,j,k) / u(1,i,j,k)
-
-               q = 0.50d+00 * (  u(2,i,j,k) * u(2,i,j,k)
-     >                         + u(3,i,j,k) * u(3,i,j,k)
-     >                         + u(4,i,j,k) * u(4,i,j,k) )
-     >                      / u(1,i,j,k)
-
-               flux(2,i,j,k) = u(2,i,j,k) * u41 
-               flux(3,i,j,k) = u(3,i,j,k) * u41 
-               flux(4,i,j,k) = u(4,i,j,k) * u41 + c2 * (u(5,i,j,k)-q)
-               flux(5,i,j,k) = ( c1 * u(5,i,j,k) - c2 * q ) * u41
-            end do
-         end do
-      end do
-
-      do k = 2, nz - 1
-         do j = jst, jend
-            do i = ist, iend
-               do m = 1, 5
-                  rsd(m,i,j,k) =  rsd(m,i,j,k)
-     >                - tz2 * ( flux(m,i,j,k+1) - flux(m,i,j,k-1) )
-               end do
-            end do
-         end do
-      end do
-
-      do k = 2, nz
-         do j = jst, jend
-            do i = ist, iend
-               tmp = 1.0d+00 / u(1,i,j,k)
-
-               u21k = tmp * u(2,i,j,k)
-               u31k = tmp * u(3,i,j,k)
-               u41k = tmp * u(4,i,j,k)
-               u51k = tmp * u(5,i,j,k)
-
-               tmp = 1.0d+00 / u(1,i,j,k-1)
-
-               u21km1 = tmp * u(2,i,j,k-1)
-               u31km1 = tmp * u(3,i,j,k-1)
-               u41km1 = tmp * u(4,i,j,k-1)
-               u51km1 = tmp * u(5,i,j,k-1)
-
-               flux(2,i,j,k) = tz3 * ( u21k - u21km1 )
-               flux(3,i,j,k) = tz3 * ( u31k - u31km1 )
-               flux(4,i,j,k) = (4.0d+00/3.0d+00) * tz3 * (u41k-u41km1)
-               flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
-     >              * tz3 * ( ( u21k  **2 + u31k  **2 + u41k  **2 )
-     >                      - ( u21km1**2 + u31km1**2 + u41km1**2 ) )
-     >              + (1.0d+00/6.0d+00)
-     >              * tz3 * ( u41k**2 - u41km1**2 )
-     >              + c1 * c5 * tz3 * ( u51k - u51km1 )
-            end do
-         end do
-      end do
-
-      do k = 2, nz - 1
-         do j = jst, jend
-            do i = ist, iend
-               rsd(1,i,j,k) = rsd(1,i,j,k)
-     >              + dz1 * tz1 * (            u(1,i,j,k-1)
-     >                             - 2.0d+00 * u(1,i,j,k)
-     >                             +           u(1,i,j,k+1) )
-               rsd(2,i,j,k) = rsd(2,i,j,k)
-     >          + tz3 * c3 * c4 * ( flux(2,i,j,k+1) - flux(2,i,j,k) )
-     >              + dz2 * tz1 * (            u(2,i,j,k-1)
-     >                             - 2.0d+00 * u(2,i,j,k)
-     >                             +           u(2,i,j,k+1) )
-               rsd(3,i,j,k) = rsd(3,i,j,k)
-     >          + tz3 * c3 * c4 * ( flux(3,i,j,k+1) - flux(3,i,j,k) )
-     >              + dz3 * tz1 * (            u(3,i,j,k-1)
-     >                             - 2.0d+00 * u(3,i,j,k)
-     >                             +           u(3,i,j,k+1) )
-               rsd(4,i,j,k) = rsd(4,i,j,k)
-     >          + tz3 * c3 * c4 * ( flux(4,i,j,k+1) - flux(4,i,j,k) )
-     >              + dz4 * tz1 * (            u(4,i,j,k-1)
-     >                             - 2.0d+00 * u(4,i,j,k)
-     >                             +           u(4,i,j,k+1) )
-               rsd(5,i,j,k) = rsd(5,i,j,k)
-     >          + tz3 * c3 * c4 * ( flux(5,i,j,k+1) - flux(5,i,j,k) )
-     >              + dz5 * tz1 * (            u(5,i,j,k-1)
-     >                             - 2.0d+00 * u(5,i,j,k)
-     >                             +           u(5,i,j,k+1) )
-            end do
-         end do
-      end do
-
-c---------------------------------------------------------------------
-c   fourth-order dissipation
-c---------------------------------------------------------------------
-      do j = jst, jend
-         do i = ist, iend
-            do m = 1, 5
-               rsd(m,i,j,2) = rsd(m,i,j,2)
-     >           - dssp * ( + 5.0d+00 * u(m,i,j,2)
-     >                      - 4.0d+00 * u(m,i,j,3)
-     >                      +           u(m,i,j,4) )
-               rsd(m,i,j,3) = rsd(m,i,j,3)
-     >           - dssp * ( - 4.0d+00 * u(m,i,j,2)
-     >                      + 6.0d+00 * u(m,i,j,3)
-     >                      - 4.0d+00 * u(m,i,j,4)
-     >                      +           u(m,i,j,5) )
-            end do
-         end do
-      end do
-
-      do k = 4, nz - 3
-         do j = jst, jend
-            do i = ist, iend
-               do m = 1, 5
-                  rsd(m,i,j,k) = rsd(m,i,j,k)
-     >              - dssp * (            u(m,i,j,k-2)
-     >                        - 4.0d+00 * u(m,i,j,k-1)
-     >                        + 6.0d+00 * u(m,i,j,k)
-     >                        - 4.0d+00 * u(m,i,j,k+1)
-     >                        +           u(m,i,j,k+2) )
-               end do
-            end do
-         end do
-      end do
-
-      do j = jst, jend
-         do i = ist, iend
-            do m = 1, 5
-               rsd(m,i,j,nz-2) = rsd(m,i,j,nz-2)
-     >           - dssp * (             u(m,i,j,nz-4)
-     >                      - 4.0d+00 * u(m,i,j,nz-3)
-     >                      + 6.0d+00 * u(m,i,j,nz-2)
-     >                      - 4.0d+00 * u(m,i,j,nz-1)  )
-               rsd(m,i,j,nz-1) = rsd(m,i,j,nz-1)
-     >           - dssp * (             u(m,i,j,nz-3)
-     >                      - 4.0d+00 * u(m,i,j,nz-2)
-     >                      + 5.0d+00 * u(m,i,j,nz-1) )
-            end do
-         end do
-      end do
-
-      return
-      end
diff --git a/examples/smpi/NAS/LU/setbv.f b/examples/smpi/NAS/LU/setbv.f
deleted file mode 100644
index 56b0edf967..0000000000
--- a/examples/smpi/NAS/LU/setbv.f
+++ /dev/null
@@ -1,79 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine setbv
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   set the boundary values of dependent variables
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c   local variables
-c---------------------------------------------------------------------
-      integer i, j, k
-      integer iglob, jglob
-
-c---------------------------------------------------------------------
-c   set the dependent variable values along the top and bottom faces
-c---------------------------------------------------------------------
-      do j = 1, ny
-         jglob = jpt + j
-         do i = 1, nx
-           iglob = ipt + i
-            call exact( iglob, jglob, 1, u( 1, i, j, 1 ) )
-            call exact( iglob, jglob, nz, u( 1, i, j, nz ) )
-         end do
-      end do
-
-c---------------------------------------------------------------------
-c   set the dependent variable values along north and south faces
-c---------------------------------------------------------------------
-      IF (west.eq.-1) then
-         do k = 1, nz
-            do i = 1, nx
-               iglob = ipt + i
-               call exact( iglob, 1, k, u( 1, i, 1, k ) )
-            end do
-         end do
-      END IF
-
-      IF (east.eq.-1) then
-          do k = 1, nz
-             do i = 1, nx
-                iglob = ipt + i
-                call exact( iglob, ny0, k, u( 1, i, ny, k ) )
-             end do
-          end do
-      END IF
-
-c---------------------------------------------------------------------
-c   set the dependent variable values along east and west faces
-c---------------------------------------------------------------------
-      IF (north.eq.-1) then
-         do k = 1, nz
-            do j = 1, ny
-               jglob = jpt + j
-               call exact( 1, jglob, k, u( 1, 1, j, k ) )
-            end do
-         end do
-      END IF
-
-      IF (south.eq.-1) then
-         do k = 1, nz
-            do j = 1, ny
-                  jglob = jpt + j
-            call exact( nx0, jglob, k, u( 1, nx, j, k ) )
-            end do
-         end do
-      END IF
-
-      return
-      end
diff --git a/examples/smpi/NAS/LU/setcoeff.f b/examples/smpi/NAS/LU/setcoeff.f
deleted file mode 100644
index 8fc5c1840c..0000000000
--- a/examples/smpi/NAS/LU/setcoeff.f
+++ /dev/null
@@ -1,159 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine setcoeff
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-
-
-c---------------------------------------------------------------------
-c   set up coefficients
-c---------------------------------------------------------------------
-      dxi = 1.0d+00 / ( nx0 - 1 )
-      deta = 1.0d+00 / ( ny0 - 1 )
-      dzeta = 1.0d+00 / ( nz0 - 1 )
-
-      tx1 = 1.0d+00 / ( dxi * dxi )
-      tx2 = 1.0d+00 / ( 2.0d+00 * dxi )
-      tx3 = 1.0d+00 / dxi
-
-      ty1 = 1.0d+00 / ( deta * deta )
-      ty2 = 1.0d+00 / ( 2.0d+00 * deta )
-      ty3 = 1.0d+00 / deta
-
-      tz1 = 1.0d+00 / ( dzeta * dzeta )
-      tz2 = 1.0d+00 / ( 2.0d+00 * dzeta )
-      tz3 = 1.0d+00 / dzeta
-
-      ii1 = 2
-      ii2 = nx0 - 1
-      ji1 = 2
-      ji2 = ny0 - 2
-      ki1 = 3
-      ki2 = nz0 - 1
-
-c---------------------------------------------------------------------
-c   diffusion coefficients
-c---------------------------------------------------------------------
-      dx1 = 0.75d+00
-      dx2 = dx1
-      dx3 = dx1
-      dx4 = dx1
-      dx5 = dx1
-
-      dy1 = 0.75d+00
-      dy2 = dy1
-      dy3 = dy1
-      dy4 = dy1
-      dy5 = dy1
-
-      dz1 = 1.00d+00
-      dz2 = dz1
-      dz3 = dz1
-      dz4 = dz1
-      dz5 = dz1
-
-c---------------------------------------------------------------------
-c   fourth difference dissipation
-c---------------------------------------------------------------------
-      dssp = ( max (dx1, dy1, dz1 ) ) / 4.0d+00
-
-c---------------------------------------------------------------------
-c   coefficients of the exact solution to the first pde
-c---------------------------------------------------------------------
-      ce(1,1) = 2.0d+00
-      ce(1,2) = 0.0d+00
-      ce(1,3) = 0.0d+00
-      ce(1,4) = 4.0d+00
-      ce(1,5) = 5.0d+00
-      ce(1,6) = 3.0d+00
-      ce(1,7) = 5.0d-01
-      ce(1,8) = 2.0d-02
-      ce(1,9) = 1.0d-02
-      ce(1,10) = 3.0d-02
-      ce(1,11) = 5.0d-01
-      ce(1,12) = 4.0d-01
-      ce(1,13) = 3.0d-01
-
-c---------------------------------------------------------------------
-c   coefficients of the exact solution to the second pde
-c---------------------------------------------------------------------
-      ce(2,1) = 1.0d+00
-      ce(2,2) = 0.0d+00
-      ce(2,3) = 0.0d+00
-      ce(2,4) = 0.0d+00
-      ce(2,5) = 1.0d+00
-      ce(2,6) = 2.0d+00
-      ce(2,7) = 3.0d+00
-      ce(2,8) = 1.0d-02
-      ce(2,9) = 3.0d-02
-      ce(2,10) = 2.0d-02
-      ce(2,11) = 4.0d-01
-      ce(2,12) = 3.0d-01
-      ce(2,13) = 5.0d-01
-
-c---------------------------------------------------------------------
-c   coefficients of the exact solution to the third pde
-c---------------------------------------------------------------------
-      ce(3,1) = 2.0d+00
-      ce(3,2) = 2.0d+00
-      ce(3,3) = 0.0d+00
-      ce(3,4) = 0.0d+00
-      ce(3,5) = 0.0d+00
-      ce(3,6) = 2.0d+00
-      ce(3,7) = 3.0d+00
-      ce(3,8) = 4.0d-02
-      ce(3,9) = 3.0d-02
-      ce(3,10) = 5.0d-02
-      ce(3,11) = 3.0d-01
-      ce(3,12) = 5.0d-01
-      ce(3,13) = 4.0d-01
-
-c---------------------------------------------------------------------
-c   coefficients of the exact solution to the fourth pde
-c---------------------------------------------------------------------
-      ce(4,1) = 2.0d+00
-      ce(4,2) = 2.0d+00
-      ce(4,3) = 0.0d+00
-      ce(4,4) = 0.0d+00
-      ce(4,5) = 0.0d+00
-      ce(4,6) = 2.0d+00
-      ce(4,7) = 3.0d+00
-      ce(4,8) = 3.0d-02
-      ce(4,9) = 5.0d-02
-      ce(4,10) = 4.0d-02
-      ce(4,11) = 2.0d-01
-      ce(4,12) = 1.0d-01
-      ce(4,13) = 3.0d-01
-
-c---------------------------------------------------------------------
-c   coefficients of the exact solution to the fifth pde
-c---------------------------------------------------------------------
-      ce(5,1) = 5.0d+00
-      ce(5,2) = 4.0d+00
-      ce(5,3) = 3.0d+00
-      ce(5,4) = 2.0d+00
-      ce(5,5) = 1.0d-01
-      ce(5,6) = 4.0d-01
-      ce(5,7) = 3.0d-01
-      ce(5,8) = 5.0d-02
-      ce(5,9) = 4.0d-02
-      ce(5,10) = 3.0d-02
-      ce(5,11) = 1.0d-01
-      ce(5,12) = 3.0d-01
-      ce(5,13) = 2.0d-01
-
-      return
-      end
-
-
diff --git a/examples/smpi/NAS/LU/sethyper.f b/examples/smpi/NAS/LU/sethyper.f
deleted file mode 100644
index 15245d4c21..0000000000
--- a/examples/smpi/NAS/LU/sethyper.f
+++ /dev/null
@@ -1,94 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine sethyper
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c    for each column in a hyperplane, istart = first row,
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer i, j
-      integer iglob, jglob
-      integer kp
-
-c---------------------------------------------------------------------
-c compute the pointers for hyperplanes
-c---------------------------------------------------------------------
-        do kp = 2,nx0+ny0
-          icomms(kp) = .false.
-          icommn(kp) = .false.
-          icomme(kp) = .false.
-          icommw(kp) = .false.
-
-c---------------------------------------------------------------------
-c  check to see if comm. to south is required
-c---------------------------------------------------------------------
-          if (south.ne.-1) then
-            i     = iend
-            iglob = ipt + i
-            jglob = kp - iglob
-            j     = jglob - jpt
-            if (jglob.ge.2.and.jglob.le.ny0-1.and.j.ge.jst.and.
-     >         j.le.jend) icomms(kp) = .true.
-          end if
-
-c---------------------------------------------------------------------
-c  check to see if comm. to north is required
-c---------------------------------------------------------------------
-          if (north.ne.-1) then
-            i     = ist
-            iglob = ipt + i
-            jglob = kp - iglob
-            j     = jglob - jpt
-            if (jglob.ge.2.and.jglob.le.ny0-1.and.j.ge.jst.and.
-     >         j.le.jend) icommn(kp) = .true.
-          end if
-
-c---------------------------------------------------------------------
-c  check to see if comm. to east is required
-c---------------------------------------------------------------------
-          if (east.ne.-1) then
-            j     = jend
-            jglob = jpt + j
-            iglob = kp - jglob
-            i     = iglob - ipt
-            if (iglob.ge.2.and.iglob.le.nx0-1.and.i.ge.ist.and.
-     >         i.le.iend) icomme(kp) = .true.
-          end if
-
-c---------------------------------------------------------------------
-c  check to see if comm. to west is required
-c---------------------------------------------------------------------
-          if (west.ne.-1) then
-            j = jst
-            jglob = jpt + j
-            iglob = kp - jglob
-            i     = iglob - ipt
-            if (iglob.ge.2.and.iglob.le.nx0-1.and.i.ge.ist.and.
-     >         i.le.iend) icommw(kp) = .true.
-          end if
-
-        end do
-
-        icomms(1) = .false.
-        icommn(1) = .false.
-        icomme(1) = .false.
-        icommw(1) = .false.
-        icomms(nx0+ny0+1) = .false.
-        icommn(nx0+ny0+1) = .false.
-        icomme(nx0+ny0+1) = .false.
-        icommw(nx0+ny0+1) = .false.
-
-      return
-      end
diff --git a/examples/smpi/NAS/LU/setiv.f b/examples/smpi/NAS/LU/setiv.f
deleted file mode 100644
index 73725cbe5c..0000000000
--- a/examples/smpi/NAS/LU/setiv.f
+++ /dev/null
@@ -1,67 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-      subroutine setiv
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c   set the initial values of independent variables based on tri-linear
-c   interpolation of boundary values in the computational space.
-c
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer i, j, k, m
-      integer iglob, jglob
-      double precision  xi, eta, zeta
-      double precision  pxi, peta, pzeta
-      double precision  ue_1jk(5),ue_nx0jk(5),ue_i1k(5),
-     >        ue_iny0k(5),ue_ij1(5),ue_ijnz(5)
-
-
-      do k = 2, nz - 1
-         zeta = ( dble (k-1) ) / (nz-1)
-         do j = 1, ny
-          jglob = jpt + j
-          IF (jglob.ne.1.and.jglob.ne.ny0) then
-            eta = ( dble (jglob-1) ) / (ny0-1)
-            do i = 1, nx
-              iglob = ipt + i
-              IF (iglob.ne.1.and.iglob.ne.nx0) then
-               xi = ( dble (iglob-1) ) / (nx0-1)
-               call exact (1,jglob,k,ue_1jk)
-               call exact (nx0,jglob,k,ue_nx0jk)
-               call exact (iglob,1,k,ue_i1k)
-               call exact (iglob,ny0,k,ue_iny0k)
-               call exact (iglob,jglob,1,ue_ij1)
-               call exact (iglob,jglob,nz,ue_ijnz)
-               do m = 1, 5
-                  pxi =   ( 1.0d+00 - xi ) * ue_1jk(m)
-     >                              + xi   * ue_nx0jk(m)
-                  peta =  ( 1.0d+00 - eta ) * ue_i1k(m)
-     >                              + eta   * ue_iny0k(m)
-                  pzeta = ( 1.0d+00 - zeta ) * ue_ij1(m)
-     >                              + zeta   * ue_ijnz(m)
-
-                  u( m, i, j, k ) = pxi + peta + pzeta
-     >                 - pxi * peta - peta * pzeta - pzeta * pxi
-     >                 + pxi * peta * pzeta
-
-               end do
-              END IF
-            end do
-          END IF
-         end do
-      end do
-
-      return
-      end
diff --git a/examples/smpi/NAS/LU/ssor.f b/examples/smpi/NAS/LU/ssor.f
deleted file mode 100644
index cf4eed0eb7..0000000000
--- a/examples/smpi/NAS/LU/ssor.f
+++ /dev/null
@@ -1,241 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine ssor(niter)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   to perform pseudo-time stepping SSOR iterations
-c   for five nonlinear pde's.
-c---------------------------------------------------------------------
-
-      implicit none
-      integer  niter
-
-      include 'mpinpb.h'
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer i, j, k, m
-      integer istep
-      double precision  tmp
-      double precision  delunm(5), tv(5,isiz1,isiz2)
-
-      external timer_read
-      double precision wtime, timer_read
-
-      integer IERROR
-
- 
-      ROOT = 0
- 
-c---------------------------------------------------------------------
-c   begin pseudo-time stepping iterations
-c---------------------------------------------------------------------
-      tmp = 1.0d+00 / ( omega * ( 2.0d+00 - omega ) ) 
-
-c---------------------------------------------------------------------
-c   initialize a,b,c,d to zero (guarantees that page tables have been
-c   formed, if applicable on given architecture, before timestepping).
-c---------------------------------------------------------------------
-      do m=1,isiz2
-         do k=1,isiz1
-            do j=1,5
-               do i=1,5
-                  a(i,j,k,m) = 0.d0
-                  b(i,j,k,m) = 0.d0
-                  c(i,j,k,m) = 0.d0
-                  d(i,j,k,m) = 0.d0
-               enddo
-            enddo
-         enddo
-      enddo
-
-c---------------------------------------------------------------------
-c   compute the steady-state residuals
-c---------------------------------------------------------------------
-      call rhs
- 
-c---------------------------------------------------------------------
-c   compute the L2 norms of newton iteration residuals
-c---------------------------------------------------------------------
-      call l2norm( isiz1, isiz2, isiz3, nx0, ny0, nz0,
-     >             ist, iend, jst, jend,
-     >             rsd, rsdnm )
-  
-      call MPI_BARRIER( MPI_COMM_WORLD, IERROR )
- 
-      call timer_clear(1)
-      call timer_start(1)
-
-c---------------------------------------------------------------------
-c   the timestep loop
-c---------------------------------------------------------------------
-      do istep = 1, niter
-
-         if (id .eq. 0) then
-            if (mod ( istep, 20) .eq. 0 .or.
-     >            istep .eq. itmax .or.
-     >            istep .eq. 1) then
-               if (niter .gt. 1) write( *, 200) istep
- 200           format(' Time step ', i4)
-            endif
-         endif
- 
-c---------------------------------------------------------------------
-c   perform SSOR iteration
-c---------------------------------------------------------------------
-         do k = 2, nz - 1
-            do j = jst, jend
-               do i = ist, iend
-                  do m = 1, 5
-                     rsd(m,i,j,k) = dt * rsd(m,i,j,k)
-                  end do
-               end do
-            end do
-         end do
- 
-         DO k = 2, nz -1 
-c---------------------------------------------------------------------
-c   form the lower triangular part of the jacobian matrix
-c---------------------------------------------------------------------
-            call jacld(k)
- 
-c---------------------------------------------------------------------
-c   perform the lower triangular solution
-c---------------------------------------------------------------------
-            call blts( isiz1, isiz2, isiz3,
-     >                 nx, ny, nz, k,
-     >                 omega,
-     >                 rsd,
-     >                 a, b, c, d,
-     >                 ist, iend, jst, jend, 
-     >                 nx0, ny0, ipt, jpt)
-          END DO
- 
-          DO k = nz - 1, 2, -1
-c---------------------------------------------------------------------
-c   form the strictly upper triangular part of the jacobian matrix
-c---------------------------------------------------------------------
-            call jacu(k)
-
-c---------------------------------------------------------------------
-c   perform the upper triangular solution
-c---------------------------------------------------------------------
-            call buts( isiz1, isiz2, isiz3,
-     >                 nx, ny, nz, k,
-     >                 omega,
-     >                 rsd, tv,
-     >                 d, a, b, c,
-     >                 ist, iend, jst, jend,
-     >                 nx0, ny0, ipt, jpt)
-          END DO
- 
-c---------------------------------------------------------------------
-c   update the variables
-c---------------------------------------------------------------------
- 
-         do k = 2, nz-1
-            do j = jst, jend
-               do i = ist, iend
-                  do m = 1, 5
-                     u( m, i, j, k ) = u( m, i, j, k )
-     >                    + tmp * rsd( m, i, j, k )
-                  end do
-               end do
-            end do
-         end do
- 
-c---------------------------------------------------------------------
-c   compute the max-norms of newton iteration corrections
-c---------------------------------------------------------------------
-         if ( mod ( istep, inorm ) .eq. 0 ) then
-            call l2norm( isiz1, isiz2, isiz3, nx0, ny0, nz0,
-     >                   ist, iend, jst, jend,
-     >                   rsd, delunm )
-c            if ( ipr .eq. 1 .and. id .eq. 0 ) then
-c                write (*,1006) ( delunm(m), m = 1, 5 )
-c            else if ( ipr .eq. 2 .and. id .eq. 0 ) then
-c                write (*,'(i5,f15.6)') istep,delunm(5)
-c            end if
-         end if
- 
-c---------------------------------------------------------------------
-c   compute the steady-state residuals
-c---------------------------------------------------------------------
-         call rhs
- 
-c---------------------------------------------------------------------
-c   compute the max-norms of newton iteration residuals
-c---------------------------------------------------------------------
-         if ( ( mod ( istep, inorm ) .eq. 0 ) .or.
-     >        ( istep .eq. itmax ) ) then
-            call l2norm( isiz1, isiz2, isiz3, nx0, ny0, nz0,
-     >                   ist, iend, jst, jend,
-     >                   rsd, rsdnm )
-c            if ( ipr .eq. 1.and.id.eq.0 ) then
-c                write (*,1007) ( rsdnm(m), m = 1, 5 )
-c            end if
-         end if
-
-c---------------------------------------------------------------------
-c   check the newton-iteration residuals against the tolerance levels
-c---------------------------------------------------------------------
-         if ( ( rsdnm(1) .lt. tolrsd(1) ) .and.
-     >        ( rsdnm(2) .lt. tolrsd(2) ) .and.
-     >        ( rsdnm(3) .lt. tolrsd(3) ) .and.
-     >        ( rsdnm(4) .lt. tolrsd(4) ) .and.
-     >        ( rsdnm(5) .lt. tolrsd(5) ) ) then
-c            if (ipr .eq. 1 .and. id.eq.0) then
-c               write (*,1004) istep
-c            end if
-            return
-         end if
- 
-      end do
- 
-      call timer_stop(1)
-      wtime = timer_read(1)
- 
-
-      call MPI_ALLREDUCE( wtime, 
-     >                    maxtime, 
-     >                    1, 
-     >                    MPI_DOUBLE_PRECISION, 
-     >                    MPI_MAX, 
-     >                    MPI_COMM_WORLD,
-     >                    IERROR )
- 
-
-
-      return
-      
- 1001 format (1x/5x,'pseudo-time SSOR iteration no.=',i4/)
- 1004 format (1x/1x,'convergence was achieved after ',i4,
-     >   ' pseudo-time steps' )
- 1006 format (1x/1x,'RMS-norm of SSOR-iteration correction ',
-     > 'for first pde  = ',1pe12.5/,
-     > 1x,'RMS-norm of SSOR-iteration correction ',
-     > 'for second pde = ',1pe12.5/,
-     > 1x,'RMS-norm of SSOR-iteration correction ',
-     > 'for third pde  = ',1pe12.5/,
-     > 1x,'RMS-norm of SSOR-iteration correction ',
-     > 'for fourth pde = ',1pe12.5/,
-     > 1x,'RMS-norm of SSOR-iteration correction ',
-     > 'for fifth pde  = ',1pe12.5)
- 1007 format (1x/1x,'RMS-norm of steady-state residual for ',
-     > 'first pde  = ',1pe12.5/,
-     > 1x,'RMS-norm of steady-state residual for ',
-     > 'second pde = ',1pe12.5/,
-     > 1x,'RMS-norm of steady-state residual for ',
-     > 'third pde  = ',1pe12.5/,
-     > 1x,'RMS-norm of steady-state residual for ',
-     > 'fourth pde = ',1pe12.5/,
-     > 1x,'RMS-norm of steady-state residual for ',
-     > 'fifth pde  = ',1pe12.5)
- 
-      end
diff --git a/examples/smpi/NAS/LU/subdomain.f b/examples/smpi/NAS/LU/subdomain.f
deleted file mode 100644
index 388bbf4494..0000000000
--- a/examples/smpi/NAS/LU/subdomain.f
+++ /dev/null
@@ -1,103 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine subdomain
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-      include 'applu.incl'
-
-c---------------------------------------------------------------------
-c  local variables
-c---------------------------------------------------------------------
-      integer mm, ierror, errorcode
-
-
-c---------------------------------------------------------------------
-c
-c   set up the sub-domain sizes
-c
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   x dimension
-c---------------------------------------------------------------------
-      mm   = mod(nx0,xdim)
-      if (row.le.mm) then
-        nx = nx0/xdim + 1
-        ipt = (row-1)*nx
-      else
-        nx = nx0/xdim
-        ipt = (row-1)*nx + mm
-      end if
-
-c---------------------------------------------------------------------
-c   y dimension
-c---------------------------------------------------------------------
-      mm   = mod(ny0,ydim)
-      if (col.le.mm) then
-        ny = ny0/ydim + 1
-        jpt = (col-1)*ny
-      else
-        ny = ny0/ydim
-        jpt = (col-1)*ny + mm
-      end if
-
-c---------------------------------------------------------------------
-c   z dimension
-c---------------------------------------------------------------------
-      nz = nz0
-
-c---------------------------------------------------------------------
-c   check the sub-domain size
-c---------------------------------------------------------------------
-      if ( ( nx .lt. 4 ) .or.
-     >     ( ny .lt. 4 ) .or.
-     >     ( nz .lt. 4 ) ) then
-         write (*,2001) nx, ny, nz
- 2001    format (5x,'SUBDOMAIN SIZE IS TOO SMALL - ',
-     >        /5x,'ADJUST PROBLEM SIZE OR NUMBER OF PROCESSORS',
-     >        /5x,'SO THAT NX, NY AND NZ ARE GREATER THAN OR EQUAL',
-     >        /5x,'TO 4 THEY ARE CURRENTLY', 3I3)
-          CALL MPI_ABORT( MPI_COMM_WORLD,
-     >                    ERRORCODE,
-     >                    IERROR )
-      end if
-
-      if ( ( nx .gt. isiz1 ) .or.
-     >     ( ny .gt. isiz2 ) .or.
-     >     ( nz .gt. isiz3 ) ) then
-         write (*,2002) nx, ny, nz
- 2002    format (5x,'SUBDOMAIN SIZE IS TOO LARGE - ',
-     >        /5x,'ADJUST PROBLEM SIZE OR NUMBER OF PROCESSORS',
-     >        /5x,'SO THAT NX, NY AND NZ ARE LESS THAN OR EQUAL TO ',
-     >        /5x,'ISIZ1, ISIZ2 AND ISIZ3 RESPECTIVELY.  THEY ARE',
-     >        /5x,'CURRENTLY', 3I4)
-          CALL MPI_ABORT( MPI_COMM_WORLD,
-     >                    ERRORCODE,
-     >                    IERROR )
-      end if
-
-
-c---------------------------------------------------------------------
-c   set up the start and end in i and j extents for all processors
-c---------------------------------------------------------------------
-      ist = 1
-      iend = nx
-      if (north.eq.-1) ist = 2
-      if (south.eq.-1) iend = nx - 1
-
-      jst = 1
-      jend = ny
-      if (west.eq.-1) jst = 2
-      if (east.eq.-1) jend = ny - 1
-
-      return
-      end
-
-
diff --git a/examples/smpi/NAS/LU/verify.f b/examples/smpi/NAS/LU/verify.f
deleted file mode 100644
index 2572441a44..0000000000
--- a/examples/smpi/NAS/LU/verify.f
+++ /dev/null
@@ -1,403 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-        subroutine verify(xcr, xce, xci, class, verified)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c  verification routine                         
-c---------------------------------------------------------------------
-
-        implicit none
-        include 'mpinpb.h'
-        include 'applu.incl'
-
-        double precision xcr(5), xce(5), xci
-        double precision xcrref(5),xceref(5),xciref, 
-     >                   xcrdif(5),xcedif(5),xcidif,
-     >                   epsilon, dtref
-        integer m
-        character class
-        logical verified
-
-c---------------------------------------------------------------------
-c   tolerance level
-c---------------------------------------------------------------------
-        epsilon = 1.0d-08
-
-        class = 'U'
-        verified = .true.
-
-        do m = 1,5
-           xcrref(m) = 1.0
-           xceref(m) = 1.0
-        end do
-        xciref = 1.0
-
-        if ( (nx0  .eq. 12     ) .and. 
-     >       (ny0  .eq. 12     ) .and.
-     >       (nz0  .eq. 12     ) .and.
-     >       (itmax   .eq. 50    ))  then
-
-           class = 'S'
-           dtref = 5.0d-1
-c---------------------------------------------------------------------
-c   Reference values of RMS-norms of residual, for the (12X12X12) grid,
-c   after 50 time steps, with  DT = 5.0d-01
-c---------------------------------------------------------------------
-         xcrref(1) = 1.6196343210976702d-02
-         xcrref(2) = 2.1976745164821318d-03
-         xcrref(3) = 1.5179927653399185d-03
-         xcrref(4) = 1.5029584435994323d-03
-         xcrref(5) = 3.4264073155896461d-02
-
-c---------------------------------------------------------------------
-c   Reference values of RMS-norms of solution error, for the (12X12X12) grid,
-c   after 50 time steps, with  DT = 5.0d-01
-c---------------------------------------------------------------------
-         xceref(1) = 6.4223319957960924d-04
-         xceref(2) = 8.4144342047347926d-05
-         xceref(3) = 5.8588269616485186d-05
-         xceref(4) = 5.8474222595157350d-05
-         xceref(5) = 1.3103347914111294d-03
-
-c---------------------------------------------------------------------
-c   Reference value of surface integral, for the (12X12X12) grid,
-c   after 50 time steps, with DT = 5.0d-01
-c---------------------------------------------------------------------
-         xciref = 7.8418928865937083d+00
-
-
-        elseif ( (nx0 .eq. 33) .and. 
-     >           (ny0 .eq. 33) .and.
-     >           (nz0 .eq. 33) .and.
-     >           (itmax . eq. 300) ) then
-
-           class = 'W'   !SPEC95fp size
-           dtref = 1.5d-3
-c---------------------------------------------------------------------
-c   Reference values of RMS-norms of residual, for the (33x33x33) grid,
-c   after 300 time steps, with  DT = 1.5d-3
-c---------------------------------------------------------------------
-           xcrref(1) =   0.1236511638192d+02
-           xcrref(2) =   0.1317228477799d+01
-           xcrref(3) =   0.2550120713095d+01
-           xcrref(4) =   0.2326187750252d+01
-           xcrref(5) =   0.2826799444189d+02
-
-
-c---------------------------------------------------------------------
-c   Reference values of RMS-norms of solution error, for the (33X33X33) grid,
-c---------------------------------------------------------------------
-           xceref(1) =   0.4867877144216d+00
-           xceref(2) =   0.5064652880982d-01
-           xceref(3) =   0.9281818101960d-01
-           xceref(4) =   0.8570126542733d-01
-           xceref(5) =   0.1084277417792d+01
-
-
-c---------------------------------------------------------------------
-c   Reference value of surface integral, for the (33X33X33) grid,
-c   after 300 time steps, with  DT = 1.5d-3
-c---------------------------------------------------------------------
-           xciref    =   0.1161399311023d+02
-
-        elseif ( (nx0 .eq. 64) .and. 
-     >           (ny0 .eq. 64) .and.
-     >           (nz0 .eq. 64) .and.
-     >           (itmax . eq. 250) ) then
-
-           class = 'A'
-           dtref = 2.0d+0
-c---------------------------------------------------------------------
-c   Reference values of RMS-norms of residual, for the (64X64X64) grid,
-c   after 250 time steps, with  DT = 2.0d+00
-c---------------------------------------------------------------------
-         xcrref(1) = 7.7902107606689367d+02
-         xcrref(2) = 6.3402765259692870d+01
-         xcrref(3) = 1.9499249727292479d+02
-         xcrref(4) = 1.7845301160418537d+02
-         xcrref(5) = 1.8384760349464247d+03
-
-c---------------------------------------------------------------------
-c   Reference values of RMS-norms of solution error, for the (64X64X64) grid,
-c   after 250 time steps, with  DT = 2.0d+00
-c---------------------------------------------------------------------
-         xceref(1) = 2.9964085685471943d+01
-         xceref(2) = 2.8194576365003349d+00
-         xceref(3) = 7.3473412698774742d+00
-         xceref(4) = 6.7139225687777051d+00
-         xceref(5) = 7.0715315688392578d+01
-
-c---------------------------------------------------------------------
-c   Reference value of surface integral, for the (64X64X64) grid,
-c   after 250 time steps, with DT = 2.0d+00
-c---------------------------------------------------------------------
-         xciref = 2.6030925604886277d+01
-
-
-        elseif ( (nx0 .eq. 102) .and. 
-     >           (ny0 .eq. 102) .and.
-     >           (nz0 .eq. 102) .and.
-     >           (itmax . eq. 250) ) then
-
-           class = 'B'
-           dtref = 2.0d+0
-
-c---------------------------------------------------------------------
-c   Reference values of RMS-norms of residual, for the (102X102X102) grid,
-c   after 250 time steps, with  DT = 2.0d+00
-c---------------------------------------------------------------------
-         xcrref(1) = 3.5532672969982736d+03
-         xcrref(2) = 2.6214750795310692d+02
-         xcrref(3) = 8.8333721850952190d+02
-         xcrref(4) = 7.7812774739425265d+02
-         xcrref(5) = 7.3087969592545314d+03
-
-c---------------------------------------------------------------------
-c   Reference values of RMS-norms of solution error, for the (102X102X102) 
-c   grid, after 250 time steps, with  DT = 2.0d+00
-c---------------------------------------------------------------------
-         xceref(1) = 1.1401176380212709d+02
-         xceref(2) = 8.1098963655421574d+00
-         xceref(3) = 2.8480597317698308d+01
-         xceref(4) = 2.5905394567832939d+01
-         xceref(5) = 2.6054907504857413d+02
-
-c---------------------------------------------------------------------
-c   Reference value of surface integral, for the (102X102X102) grid,
-c   after 250 time steps, with DT = 2.0d+00
-c---------------------------------------------------------------------
-         xciref = 4.7887162703308227d+01
-
-        elseif ( (nx0 .eq. 162) .and. 
-     >           (ny0 .eq. 162) .and.
-     >           (nz0 .eq. 162) .and.
-     >           (itmax . eq. 250) ) then
-
-           class = 'C'
-           dtref = 2.0d+0
-
-c---------------------------------------------------------------------
-c   Reference values of RMS-norms of residual, for the (162X162X162) grid,
-c   after 250 time steps, with  DT = 2.0d+00
-c---------------------------------------------------------------------
-         xcrref(1) = 1.03766980323537846d+04
-         xcrref(2) = 8.92212458801008552d+02
-         xcrref(3) = 2.56238814582660871d+03
-         xcrref(4) = 2.19194343857831427d+03
-         xcrref(5) = 1.78078057261061185d+04
-
-c---------------------------------------------------------------------
-c   Reference values of RMS-norms of solution error, for the (162X162X162) 
-c   grid, after 250 time steps, with  DT = 2.0d+00
-c---------------------------------------------------------------------
-         xceref(1) = 2.15986399716949279d+02
-         xceref(2) = 1.55789559239863600d+01
-         xceref(3) = 5.41318863077207766d+01
-         xceref(4) = 4.82262643154045421d+01
-         xceref(5) = 4.55902910043250358d+02
-
-c---------------------------------------------------------------------
-c   Reference value of surface integral, for the (162X162X162) grid,
-c   after 250 time steps, with DT = 2.0d+00
-c---------------------------------------------------------------------
-         xciref = 6.66404553572181300d+01
-
-        elseif ( (nx0 .eq. 408) .and. 
-     >           (ny0 .eq. 408) .and.
-     >           (nz0 .eq. 408) .and.
-     >           (itmax . eq. 300) ) then
-
-           class = 'D'
-           dtref = 1.0d+0
-
-c---------------------------------------------------------------------
-c   Reference values of RMS-norms of residual, for the (408X408X408) grid,
-c   after 300 time steps, with  DT = 1.0d+00
-c---------------------------------------------------------------------
-         xcrref(1) = 0.4868417937025d+05
-         xcrref(2) = 0.4696371050071d+04
-         xcrref(3) = 0.1218114549776d+05 
-         xcrref(4) = 0.1033801493461d+05
-         xcrref(5) = 0.7142398413817d+05
-
-c---------------------------------------------------------------------
-c   Reference values of RMS-norms of solution error, for the (408X408X408) 
-c   grid, after 300 time steps, with  DT = 1.0d+00
-c---------------------------------------------------------------------
-         xceref(1) = 0.3752393004482d+03
-         xceref(2) = 0.3084128893659d+02
-         xceref(3) = 0.9434276905469d+02
-         xceref(4) = 0.8230686681928d+02
-         xceref(5) = 0.7002620636210d+03
-
-c---------------------------------------------------------------------
-c   Reference value of surface integral, for the (408X408X408) grid,
-c   after 300 time steps, with DT = 1.0d+00
-c---------------------------------------------------------------------
-         xciref =    0.8334101392503d+02
-
-        elseif ( (nx0 .eq. 1020) .and. 
-     >           (ny0 .eq. 1020) .and.
-     >           (nz0 .eq. 1020) .and.
-     >           (itmax . eq. 300) ) then
-
-           class = 'E'
-           dtref = 0.5d+0
-
-c---------------------------------------------------------------------
-c   Reference values of RMS-norms of residual, for the (1020X1020X1020) grid,
-c   after 300 time steps, with  DT = 0.5d+00
-c---------------------------------------------------------------------
-         xcrref(1) = 0.2099641687874d+06
-         xcrref(2) = 0.2130403143165d+05
-         xcrref(3) = 0.5319228789371d+05 
-         xcrref(4) = 0.4509761639833d+05
-         xcrref(5) = 0.2932360006590d+06
-
-c---------------------------------------------------------------------
-c   Reference values of RMS-norms of solution error, for the (1020X1020X1020) 
-c   grid, after 300 time steps, with  DT = 0.5d+00
-c---------------------------------------------------------------------
-         xceref(1) = 0.4800572578333d+03
-         xceref(2) = 0.4221993400184d+02
-         xceref(3) = 0.1210851906824d+03
-         xceref(4) = 0.1047888986770d+03
-         xceref(5) = 0.8363028257389d+03
-
-c---------------------------------------------------------------------
-c   Reference value of surface integral, for the (1020X1020X1020) grid,
-c   after 300 time steps, with DT = 0.5d+00
-c---------------------------------------------------------------------
-         xciref =    0.9512163272273d+02
-
-        else
-           verified = .FALSE.
-        endif
-
-c---------------------------------------------------------------------
-c    verification test for residuals if gridsize is one of 
-c    the defined grid sizes above (class .ne. 'U')
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c    Compute the difference of solution values and the known reference values.
-c---------------------------------------------------------------------
-        do m = 1, 5
-           
-           xcrdif(m) = dabs((xcr(m)-xcrref(m))/xcrref(m)) 
-           xcedif(m) = dabs((xce(m)-xceref(m))/xceref(m))
-           
-        enddo
-        xcidif = dabs((xci - xciref)/xciref)
-
-
-c---------------------------------------------------------------------
-c    Output the comparison of computed results to known cases.
-c---------------------------------------------------------------------
-
-        if (class .ne. 'U') then
-           write(*, 1990) class
- 1990      format(/, ' Verification being performed for class ', a)
-           write (*,2000) epsilon
- 2000      format(' Accuracy setting for epsilon = ', E20.13)
-           verified = (dabs(dt-dtref) .le. epsilon)
-           if (.not.verified) then  
-              class = 'U'
-              write (*,1000) dtref
- 1000         format(' DT does not match the reference value of ', 
-     >                 E15.8)
-           endif
-        else 
-           write(*, 1995)
- 1995      format(' Unknown class')
-        endif
-
-
-        if (class .ne. 'U') then
-           write (*,2001) 
-        else
-           write (*, 2005)
-        endif
-
- 2001   format(' Comparison of RMS-norms of residual')
- 2005   format(' RMS-norms of residual')
-        do m = 1, 5
-           if (class .eq. 'U') then
-              write(*, 2015) m, xcr(m)
-           else if (xcrdif(m) .le. epsilon) then
-              write (*,2011) m,xcr(m),xcrref(m),xcrdif(m)
-           else 
-              verified = .false.
-              write (*,2010) m,xcr(m),xcrref(m),xcrdif(m)
-           endif
-        enddo
-
-        if (class .ne. 'U') then
-           write (*,2002)
-        else
-           write (*,2006)
-        endif
- 2002   format(' Comparison of RMS-norms of solution error')
- 2006   format(' RMS-norms of solution error')
-        
-        do m = 1, 5
-           if (class .eq. 'U') then
-              write(*, 2015) m, xce(m)
-           else if (xcedif(m) .le. epsilon) then
-              write (*,2011) m,xce(m),xceref(m),xcedif(m)
-           else
-              verified = .false.
-              write (*,2010) m,xce(m),xceref(m),xcedif(m)
-           endif
-        enddo
-        
- 2010   format(' FAILURE: ', i2, 2x, E20.13, E20.13, E20.13)
- 2011   format('          ', i2, 2x, E20.13, E20.13, E20.13)
- 2015   format('          ', i2, 2x, E20.13)
-        
-        if (class .ne. 'U') then
-           write (*,2025)
-        else
-           write (*,2026)
-        endif
- 2025   format(' Comparison of surface integral')
- 2026   format(' Surface integral')
-
-
-        if (class .eq. 'U') then
-           write(*, 2030) xci
-        else if (xcidif .le. epsilon) then
-           write(*, 2032) xci, xciref, xcidif
-        else
-           verified = .false.
-           write(*, 2031) xci, xciref, xcidif
-        endif
-
- 2030   format('          ', 4x, E20.13)
- 2031   format(' FAILURE: ', 4x, E20.13, E20.13, E20.13)
- 2032   format('          ', 4x, E20.13, E20.13, E20.13)
-
-
-
-        if (class .eq. 'U') then
-           write(*, 2022)
-           write(*, 2023)
- 2022      format(' No reference values provided')
- 2023      format(' No verification performed')
-        else if (verified) then
-           write(*, 2020)
- 2020      format(' Verification Successful')
-        else
-           write(*, 2021)
- 2021      format(' Verification failed')
-        endif
-
-        return
-
-
-        end
diff --git a/examples/smpi/NAS/MG/Makefile b/examples/smpi/NAS/MG/Makefile
deleted file mode 100644
index 1554bedeea..0000000000
--- a/examples/smpi/NAS/MG/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-SHELL=/bin/sh
-BENCHMARK=mg
-BENCHMARKU=MG
-
-include ../config/make.def
-
-OBJS = mg.o ${COMMON}/print_results.o  \
-       ${COMMON}/${RAND}.o ${COMMON}/timers.o
-
-include ../sys/make.common
-
-${PROGRAM}: config ${OBJS}
-	${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
-
-mg.o:		mg.f  globals.h mpinpb.h npbparams.h
-	${FCOMPILE} mg.f
-
-clean:
-	- rm -f *.o *~ 
-	- rm -f npbparams.h core
-
-
-
diff --git a/examples/smpi/NAS/MG/README b/examples/smpi/NAS/MG/README
deleted file mode 100644
index 6c03f78527..0000000000
--- a/examples/smpi/NAS/MG/README
+++ /dev/null
@@ -1,138 +0,0 @@
-Some info about the MG benchmark
-================================
-    
-'mg_demo' demonstrates the capabilities of a very simple multigrid
-solver in computing a three dimensional potential field.  This is
-a simplified multigrid solver in two important respects:
-
-  (1) it solves only a constant coefficient equation,
-  and that only on a uniform cubical grid,
-    
-  (2) it solves only a single equation, representing
-  a scalar field rather than a vector field.
-
-We chose it for its portability and simplicity, and expect that a
-supercomputer which can run it effectively will also be able to
-run more complex multigrid programs at least as well.
-     
-     Eric Barszcz                         Paul Frederickson
-     RIACS
-     NASA Ames Research Center            NASA Ames Research Center
-
-========================================================================
-Running the program:  (Note: also see parameter lm information in the
-                       two sections immediately below this section)
-
-The program may be run with or without an input deck (called "mg.input"). 
-The following describes a few things about the input deck if you want to 
-use one. 
-
-The four lines below are the "mg.input" file required to run a
-problem of total size 256x256x256, for 4 iterations (Class "A"),
-and presumes the use of 8 processors:
-
-   8 = top level
-   256 256 256 = nx ny nz
-   4 = nit
-   0 0 0 0 0 0 0 0 = debug_vec
-
-The first line of input indicates how many levels of multi-grid
-cycle will be applied to a particular subpartition.  Presuming that
-8 processors are solving this problem (recall that the number of 
-processors is specified to MPI as a run parameter, and MPI subsequently
-determines this for the code via an MPI subroutine call), a 2x2x2 
-processor grid is  formed, and thus each partition on a processor is 
-of size 128x128x128.  Therefore, a maximum of 8 multi-grid levels may 
-be used.  These are of size 128,64,32,16,8,4,2,1, with the coarsest 
-level being a single point on a given processor.
-
-
-Next, consider the same size problem but running on 1 processor.  The
-following "mg.input" file is appropriate:
-
-    9 = top level
-    256 256 256 = nx ny nz
-    4 = nit
-    0 0 0 0 0 0 0 0 = debug_vec
-
-Since this processor must solve the full 256x256x256 problem, this
-permits 9 multi-grid levels (256,128,64,32,16,8,4,2,1), resulting in 
-a coarsest multi-grid level of a single point on the processor
-
-
-Next, consider the same size problem but running on 2 processors.  The
-following "mg.input" file is required:
-
-    8 = top level
-    256 256 256 = nx ny nz
-    4 = nit
-    0 0 0 0 0 0 0 0 = debug_vec
-
-The algorithm for partitioning the full grid onto some power of 2 number 
-of processors is to start by splitting the last dimension of the grid
-(z dimension) in 2: the problem is now partitioned onto 2 processors.
-Next the middle dimension (y dimension) is split in 2: the problem is now
-partitioned onto 4 processors.  Next, first dimension (x dimension) is
-split in 2: the problem is now partitioned onto 8 processors.  Next, the
-last dimension (z dimension) is split again in 2: the problem is now
-partitioned onto 16 processors.  This partitioning is repeated until all 
-of the power of 2 processors have been allocated.
-
-Thus to run the above problem on 2 processors, the grid partitioning 
-algorithm will allocate the two processors across the last dimension, 
-creating two partitions each of size 256x256x128. The coarsest level of 
-multi-grid must be a single point surrounded by a cubic number of grid 
-points.  Therefore, each of the two processor partitions will contain 4 
-coarsest multi-grid level points, each surrounded by a cube of grid points 
-of size 128x128x128, indicated by a top level of 8.
-
-
-Next, consider the same size problem but running on 4 processors.  The
-following "mg.input" file is required:
-
-    8 = top level
-    256 256 256 = nx ny nz
-    4 = nit
-    0 0 0 0 0 0 0 0 = debug_vec
-
-The partitioning algorithm will create 4 partitions, each of size
-256x128x128.  Each partition will contain 2 coarsest multi-grid level
-points each surrounded by a cube of grid points of size 128x128x128, 
-indicated by a top level of 8.
-
-
-Next, consider the same size problem but running on 16 processors.  The
-following "mg.input" file is required:
-
-    7 = top level
-    256 256 256 = nx ny nz
-    4 = nit
-    0 0 0 0 0 0 0 0 = debug_vec
-
-On each node a partition of size 128x128x64 will be created.  A maximum
-of 7 multi-grid levels (64,32,16,8,4,2,1) may be used, resulting in each 
-partions containing 4 coarsest multi-grid level points, each surrounded 
-by a cube of grid points of size 64x64x64, indicated by a top level of 7.
-
-
-
-
-Note that non-cubic problem sizes may also be considered:
-
-The four lines below are the "mg.input" file appropriate for running a
-problem of total size 256x512x512, for 20 iterations and presumes the 
-use of 32 processors (note: this is NOT a class C problem):
-
-    8 = top level
-    256 512 512 = nx ny nz
-    20 = nit
-    0 0 0 0 0 0 0 0 = debug_vec
-
-The first line of input indicates how many levels of multi-grid
-cycle will be applied to a particular subpartition.  Presuming that
-32 processors are solving this problem, a 2x4x4 processor grid is
-formed, and thus each partition on a processor is of size 128x128x128.
-Therefore, a maximum of 8 multi-grid levels may be used.  These are of
-size 128,64,32,16,8,4,2,1, with the coarsest level being a single 
-point on a given processor.
-
diff --git a/examples/smpi/NAS/MG/globals.h b/examples/smpi/NAS/MG/globals.h
deleted file mode 100644
index 99573e3c20..0000000000
--- a/examples/smpi/NAS/MG/globals.h
+++ /dev/null
@@ -1,55 +0,0 @@
-c---------------------------------------------------------------------
-c  Parameter lm (declared and set in "npbparams.h") is the log-base2 of 
-c  the edge size max for the partition on a given node, so must be changed 
-c  either to save space (if running a small case) or made bigger for larger 
-c  cases, for example, 512^3. Thus lm=7 means that the largest dimension 
-c  of a partition that can be solved on a node is 2^7 = 128. lm is set 
-c  automatically in npbparams.h
-c  Parameters ndim1, ndim2, ndim3 are the local problem dimensions. 
-c---------------------------------------------------------------------
-
-      include 'npbparams.h'
-
-      integer nm      ! actual dimension including ghost cells for communications
-     >      , nv      ! size of rhs array
-     >      , nr      ! size of residual array
-     >      , nm2     ! size of communication buffer
-     >      , maxlevel! maximum number of levels
-
-      parameter( nm=2+2**lm, nv=(2+2**ndim1)*(2+2**ndim2)*(2+2**ndim3) )
-      parameter( nm2=2*nm*nm, maxlevel=(lt_default+1) )
-      parameter( nr = (8*(nv+nm**2+5*nm+14*lt_default-7*lm))/7 )
-      integer maxprocs
-      parameter( maxprocs = 131072 )  ! this is the upper proc limit that 
-                                      ! the current "nr" parameter can handle
-c---------------------------------------------------------------------
-      integer nbr(3,-1:1,maxlevel), msg_type(3,-1:1)
-      integer  msg_id(3,-1:1,2),nx(maxlevel),ny(maxlevel),nz(maxlevel)
-      common /mg3/ nbr,msg_type,msg_id,nx,ny,nz
-
-      character class
-      common /ClassType/class
-
-      integer debug_vec(0:7)
-      common /my_debug/ debug_vec
-
-      integer ir(maxlevel), m1(maxlevel), m2(maxlevel), m3(maxlevel)
-      integer lt, lb
-      common /fap/ ir,m1,m2,m3,lt,lb
-
-      logical dead(maxlevel), give_ex(3,maxlevel), take_ex(3,maxlevel)
-      common /comm_ex/ dead, give_ex, take_ex
-
-c---------------------------------------------------------------------
-c  Set at m=1024, can handle cases up to 1024^3 case
-c---------------------------------------------------------------------
-      integer m
-c      parameter( m=1037 )
-      parameter( m=nm+1 )
-
-      double precision buff(nm2,4)
-      common /buffer/ buff
-
-
-
-
diff --git a/examples/smpi/NAS/MG/mg.f b/examples/smpi/NAS/MG/mg.f
deleted file mode 100644
index b0352ae2df..0000000000
--- a/examples/smpi/NAS/MG/mg.f
+++ /dev/null
@@ -1,2479 +0,0 @@
-!-------------------------------------------------------------------------!
-!                                                                         !
-!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
-!                                                                         !
-!                                   M G                                   !
-!                                                                         !
-!-------------------------------------------------------------------------!
-!                                                                         !
-!    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
-!    It is described in NAS Technical Reports 95-020 and 02-007           !
-!                                                                         !
-!    Permission to use, copy, distribute and modify this software         !
-!    for any purpose with or without fee is hereby granted.  We           !
-!    request, however, that all derived work reference the NAS            !
-!    Parallel Benchmarks 3.3. This software is provided "as is"           !
-!    without express or implied warranty.                                 !
-!                                                                         !
-!    Information on NPB 3.3, including the technical report, the          !
-!    original specifications, source code, results and information        !
-!    on how to submit new results, is available at:                       !
-!                                                                         !
-!           http://www.nas.nasa.gov/Software/NPB/                         !
-!                                                                         !
-!    Send comments or suggestions to  npb@nas.nasa.gov                    !
-!                                                                         !
-!          NAS Parallel Benchmarks Group                                  !
-!          NASA Ames Research Center                                      !
-!          Mail Stop: T27A-1                                              !
-!          Moffett Field, CA   94035-1000                                 !
-!                                                                         !
-!          E-mail:  npb@nas.nasa.gov                                      !
-!          Fax:     (650) 604-3957                                        !
-!                                                                         !
-!-------------------------------------------------------------------------!
-
-
-c---------------------------------------------------------------------
-c
-c Authors: E. Barszcz
-c          P. Frederickson
-c          A. Woo
-c          M. Yarrow
-c          R. F. Van der Wijngaart
-c
-c---------------------------------------------------------------------
-
-
-c---------------------------------------------------------------------
-      program mg_mpi
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-      include 'globals.h'
-
-c---------------------------------------------------------------------------c
-c k is the current level. It is passed down through subroutine args
-c and is NOT global. it is the current iteration
-c---------------------------------------------------------------------------c
-
-      integer k, it
-      
-      external timer_read
-      double precision t, t0, tinit, mflops, timer_read
-
-c---------------------------------------------------------------------------c
-c These arrays are in common because they are quite large
-c and probably shouldn't be allocated on the stack. They
-c are always passed as subroutine args. 
-c---------------------------------------------------------------------------c
-
-      double precision u(nr),v(nv),r(nr),a(0:3),c(0:3)
-      common /noautom/ u,v,r   
-
-      double precision rnm2, rnmu, old2, oldu, epsilon
-      integer n1, n2, n3, nit
-      double precision nn, verify_value, err
-      logical verified
-
-      integer ierr,i, fstatus
-      integer T_bench, T_init
-      parameter (T_bench=1, T_init=2)
-
-      call mpi_init(ierr)
-      call mpi_comm_rank(mpi_comm_world, me, ierr)
-      call mpi_comm_size(mpi_comm_world, nprocs, ierr)
-
-      root = 0
-      if (nprocs_compiled .gt. maxprocs) then
-         if (me .eq. root) write(*,20) nprocs_compiled, maxprocs
- 20      format(' ERROR: compiled for ',i8,' processes'//
-     &          ' The maximum size allowed for this benchmark is ',i6)
-         call mpi_abort(MPI_COMM_WORLD, ierr)
-         stop
-      endif
-
-      if (.not. convertdouble) then
-         dp_type = MPI_DOUBLE_PRECISION
-      else
-         dp_type = MPI_REAL
-      endif
-
-
-      call timer_clear(T_bench)
-      call timer_clear(T_init)
-
-      call mpi_barrier(MPI_COMM_WORLD, ierr)
-
-      call timer_start(T_init)
-      
-
-c---------------------------------------------------------------------
-c Read in and broadcast input data
-c---------------------------------------------------------------------
-
-      if( me .eq. root )then
-         write (*, 1000) 
-
-         open(unit=7,file="mg.input", status="old", iostat=fstatus)
-         if (fstatus .eq. 0) then
-            write(*,50) 
- 50         format(' Reading from input file mg.input')
-            read(7,*) lt
-            read(7,*) nx(lt), ny(lt), nz(lt)
-            read(7,*) nit
-            read(7,*) (debug_vec(i),i=0,7)
-         else
-            write(*,51) 
- 51         format(' No input file. Using compiled defaults ')
-            lt = lt_default
-            nit = nit_default
-            nx(lt) = nx_default
-            ny(lt) = ny_default
-            nz(lt) = nz_default
-            do i = 0,7
-               debug_vec(i) = debug_default
-            end do
-         endif
-      endif
-
-      call mpi_bcast(lt, 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
-      call mpi_bcast(nit, 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
-      call mpi_bcast(nx(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
-      call mpi_bcast(ny(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
-      call mpi_bcast(nz(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
-      call mpi_bcast(debug_vec(0), 8, MPI_INTEGER, 0, 
-     >               mpi_comm_world, ierr)
-
-      if ( (nx(lt) .ne. ny(lt)) .or. (nx(lt) .ne. nz(lt)) ) then
-         Class = 'U' 
-      else if( nx(lt) .eq. 32 .and. nit .eq. 4 ) then
-         Class = 'S'
-      else if( nx(lt) .eq. 128 .and. nit .eq. 4 ) then
-         Class = 'W'
-      else if( nx(lt) .eq. 256 .and. nit .eq. 4 ) then  
-         Class = 'A'
-      else if( nx(lt) .eq. 256 .and. nit .eq. 20 ) then
-         Class = 'B'
-      else if( nx(lt) .eq. 512 .and. nit .eq. 20 ) then  
-         Class = 'C'
-      else if( nx(lt) .eq. 1024 .and. nit .eq. 50 ) then  
-         Class = 'D'
-      else if( nx(lt) .eq. 2048 .and. nit .eq. 50 ) then  
-         Class = 'E'
-      else
-         Class = 'U'
-      endif
-
-c---------------------------------------------------------------------
-c  Use these for debug info:
-c---------------------------------------------------------------------
-c     debug_vec(0) = 1 !=> report all norms
-c     debug_vec(1) = 1 !=> some setup information
-c     debug_vec(1) = 2 !=> more setup information
-c     debug_vec(2) = k => at level k or below, show result of resid
-c     debug_vec(3) = k => at level k or below, show result of psinv
-c     debug_vec(4) = k => at level k or below, show result of rprj
-c     debug_vec(5) = k => at level k or below, show result of interp
-c     debug_vec(6) = 1 => (unused)
-c     debug_vec(7) = 1 => (unused)
-c---------------------------------------------------------------------
-      a(0) = -8.0D0/3.0D0 
-      a(1) =  0.0D0 
-      a(2) =  1.0D0/6.0D0 
-      a(3) =  1.0D0/12.0D0
-      
-      if(Class .eq. 'A' .or. Class .eq. 'S'.or. Class .eq.'W') then
-c---------------------------------------------------------------------
-c     Coefficients for the S(a) smoother
-c---------------------------------------------------------------------
-         c(0) =  -3.0D0/8.0D0
-         c(1) =  +1.0D0/32.0D0
-         c(2) =  -1.0D0/64.0D0
-         c(3) =   0.0D0
-      else
-c---------------------------------------------------------------------
-c     Coefficients for the S(b) smoother
-c---------------------------------------------------------------------
-         c(0) =  -3.0D0/17.0D0
-         c(1) =  +1.0D0/33.0D0
-         c(2) =  -1.0D0/61.0D0
-         c(3) =   0.0D0
-      endif
-      lb = 1
-      k  = lt
-
-      call setup(n1,n2,n3,k)
-      call zero3(u,n1,n2,n3)
-      call zran3(v,n1,n2,n3,nx(lt),ny(lt),k)
-
-      call norm2u3(v,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
-
-      if( me .eq. root )then
-         write (*, 1001) nx(lt),ny(lt),nz(lt), Class
-         write (*, 1002) nit
-
- 1000 format(//,' NAS Parallel Benchmarks 3.3 -- MG Benchmark', /)
- 1001    format(' Size: ', i4, 'x', i4, 'x', i4, '  (class ', A, ')' )
- 1002    format(' Iterations: ', i4)
- 1003    format(' Number of processes: ', i6)
-         if (nprocs .ne. nprocs_compiled) then
-           write (*, 1004) nprocs_compiled
-           write (*, 1005) nprocs
- 1004      format(' WARNING: compiled for ', i6, ' processes ')
- 1005      format(' Number of active processes: ', i6, /)
-         else
-           write (*, 1003) nprocs
-         endif
-      endif
-
-      call resid(u,v,r,n1,n2,n3,a,k)
-      call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
-      old2 = rnm2
-      oldu = rnmu
-
-c---------------------------------------------------------------------
-c     One iteration for startup
-c---------------------------------------------------------------------
-      call mg3P(u,v,r,a,c,n1,n2,n3,k)
-      call resid(u,v,r,n1,n2,n3,a,k)
-      call setup(n1,n2,n3,k)
-      call zero3(u,n1,n2,n3)
-      call zran3(v,n1,n2,n3,nx(lt),ny(lt),k)
-
-      call timer_stop(T_init)
-      if( me .eq. root )then
-         tinit = timer_read(T_init)
-         write( *,'(/A,F15.3,A/)' ) 
-     >        ' Initialization time: ',tinit, ' seconds'
-      endif
-
-      call mpi_barrier(mpi_comm_world,ierr)
-
-      call timer_start(T_bench)
-
-      call resid(u,v,r,n1,n2,n3,a,k)
-      call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
-      old2 = rnm2
-      oldu = rnmu
-
-      do  it=1,nit
-         if (it.eq.1 .or. it.eq.nit .or. mod(it,5).eq.0) then
-            if (me .eq. root) write(*,80) it
-   80       format('  iter ',i4)
-         endif
-         call mg3P(u,v,r,a,c,n1,n2,n3,k)
-         call resid(u,v,r,n1,n2,n3,a,k)
-      enddo
-
-
-      call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
-
-      call timer_stop(T_bench)
-
-      t0 = timer_read(T_bench)
-
-      call mpi_reduce(t0,t,1,dp_type,
-     >     mpi_max,root,mpi_comm_world,ierr)
-      verified = .FALSE.
-      verify_value = 0.0
-      if( me .eq. root )then
-         write(*,100)
- 100     format(/' Benchmark completed ')
-
-         epsilon = 1.d-8
-         if (Class .ne. 'U') then
-            if(Class.eq.'S') then
-               verify_value = 0.5307707005734d-04
-            elseif(Class.eq.'W') then
-               verify_value = 0.6467329375339d-05
-            elseif(Class.eq.'A') then
-               verify_value = 0.2433365309069d-05
-            elseif(Class.eq.'B') then
-               verify_value = 0.1800564401355d-05
-            elseif(Class.eq.'C') then
-               verify_value = 0.5706732285740d-06
-            elseif(Class.eq.'D') then
-               verify_value = 0.1583275060440d-09
-            elseif(Class.eq.'E') then
-               verify_value = 0.5630442584711d-10
-            endif
-
-            err = abs( rnm2 - verify_value ) / verify_value
-            if( err .le. epsilon ) then
-               verified = .TRUE.
-               write(*, 200)
-               write(*, 201) rnm2
-               write(*, 202) err
- 200           format(' VERIFICATION SUCCESSFUL ')
- 201           format(' L2 Norm is ', E20.13)
- 202           format(' Error is   ', E20.13)
-            else
-               verified = .FALSE.
-               write(*, 300) 
-               write(*, 301) rnm2
-               write(*, 302) verify_value
- 300           format(' VERIFICATION FAILED')
- 301           format(' L2 Norm is             ', E20.13)
- 302           format(' The correct L2 Norm is ', E20.13)
-            endif
-         else
-            verified = .FALSE.
-            write (*, 400)
-            write (*, 401)
-            write (*, 201) rnm2
- 400        format(' Problem size unknown')
- 401        format(' NO VERIFICATION PERFORMED')
-         endif
-
-         nn = 1.0d0*nx(lt)*ny(lt)*nz(lt)
-
-         if( t .ne. 0. ) then
-            mflops = 58.*1.0D-6*nit*nn / t
-         else
-            mflops = 0.0
-         endif
-
-         call print_results('MG', class, nx(lt), ny(lt), nz(lt), 
-     >                      nit, nprocs_compiled, nprocs, t,
-     >                      mflops, '          floating point', 
-     >                      verified, npbversion, compiletime,
-     >                      cs1, cs2, cs3, cs4, cs5, cs6, cs7)
-
-
-      endif
-
-
-      call mpi_finalize(ierr)
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine setup(n1,n2,n3,k)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-      include 'globals.h'
-
-      integer  is1, is2, is3, ie1, ie2, ie3
-      common /grid/ is1,is2,is3,ie1,ie2,ie3
-
-      integer n1,n2,n3,k
-      integer dx, dy, log_p, d, i, j
-
-      integer ax, next(3),mi(3,maxlevel),mip(3,maxlevel)
-      integer ng(3,maxlevel)
-      integer idi(3), pi(3), idin(3,-1:1)
-      integer s, dir,ierr
-
-      do  j=-1,1,1
-         do  d=1,3
-            msg_type(d,j) = 100*(j+2+10*d)
-         enddo
-      enddo
-
-      ng(1,lt) = nx(lt)
-      ng(2,lt) = ny(lt)
-      ng(3,lt) = nz(lt)
-      do  ax=1,3
-         next(ax) = 1
-         do  k=lt-1,1,-1
-            ng(ax,k) = ng(ax,k+1)/2
-         enddo
-      enddo
- 61   format(10i4)
-      do  k=lt,1,-1
-         nx(k) = ng(1,k)
-         ny(k) = ng(2,k)
-         nz(k) = ng(3,k)
-      enddo
-
-      log_p  = log(float(nprocs)+0.0001)/log(2.0)
-      dx     = log_p/3
-      pi(1)  = 2**dx
-      idi(1) = mod(me,pi(1))
-
-      dy     = (log_p-dx)/2
-      pi(2)  = 2**dy
-      idi(2) = mod((me/pi(1)),pi(2))
-
-      pi(3)  = nprocs/(pi(1)*pi(2))
-      idi(3) = me/(pi(1)*pi(2))
-
-      do  k = lt,1,-1
-         dead(k) = .false.
-         do  ax = 1,3
-            take_ex(ax,k) = .false.
-            give_ex(ax,k) = .false.
-
-            mi(ax,k) = 2 + 
-     >           ((idi(ax)+1)*ng(ax,k))/pi(ax) -
-     >           ((idi(ax)+0)*ng(ax,k))/pi(ax)
-            mip(ax,k) = 2 + 
-     >           ((next(ax)+idi(ax)+1)*ng(ax,k))/pi(ax) -
-     >           ((next(ax)+idi(ax)+0)*ng(ax,k))/pi(ax) 
-
-            if(mip(ax,k).eq.2.or.mi(ax,k).eq.2)then
-               next(ax) = 2*next(ax)
-            endif
-
-            if( k+1 .le. lt )then
-               if((mip(ax,k).eq.2).and.(mi(ax,k).eq.3))then
-                  give_ex(ax,k+1) = .true.
-               endif
-               if((mip(ax,k).eq.3).and.(mi(ax,k).eq.2))then
-                  take_ex(ax,k+1) = .true.
-               endif
-            endif
-         enddo
-
-         if( mi(1,k).eq.2 .or. 
-     >        mi(2,k).eq.2 .or. 
-     >        mi(3,k).eq.2      )then
-            dead(k) = .true.
-         endif
-         m1(k) = mi(1,k)
-         m2(k) = mi(2,k)
-         m3(k) = mi(3,k)
-
-         do  ax=1,3
-            idin(ax,+1) = mod( idi(ax) + next(ax) + pi(ax) , pi(ax) )
-            idin(ax,-1) = mod( idi(ax) - next(ax) + pi(ax) , pi(ax) )
-         enddo
-         do  dir = 1,-1,-2
-            nbr(1,dir,k) = idin(1,dir) + pi(1)
-     >           *(idi(2)      + pi(2)
-     >           * idi(3))
-            nbr(2,dir,k) = idi(1)      + pi(1)
-     >           *(idin(2,dir) + pi(2)
-     >           * idi(3))
-            nbr(3,dir,k) = idi(1)      + pi(1)
-     >           *(idi(2)      + pi(2)
-     >           * idin(3,dir))
-         enddo
-      enddo
-
-      k = lt
-      is1 = 2 + ng(1,k) - ((pi(1)  -idi(1))*ng(1,lt))/pi(1)
-      ie1 = 1 + ng(1,k) - ((pi(1)-1-idi(1))*ng(1,lt))/pi(1)
-      n1 = 3 + ie1 - is1
-      is2 = 2 + ng(2,k) - ((pi(2)  -idi(2))*ng(2,lt))/pi(2)
-      ie2 = 1 + ng(2,k) - ((pi(2)-1-idi(2))*ng(2,lt))/pi(2)
-      n2 = 3 + ie2 - is2
-      is3 = 2 + ng(3,k) - ((pi(3)  -idi(3))*ng(3,lt))/pi(3)
-      ie3 = 1 + ng(3,k) - ((pi(3)-1-idi(3))*ng(3,lt))/pi(3)
-      n3 = 3 + ie3 - is3
-
-
-      ir(lt)=1
-      do  j = lt-1, 1, -1
-         ir(j)=ir(j+1)+m1(j+1)*m2(j+1)*m3(j+1)
-      enddo
-
-
-      if( debug_vec(1) .ge. 1 )then
-         if( me .eq. root )write(*,*)' in setup, '
-         if( me .eq. root )write(*,*)' me   k  lt  nx  ny  nz ',
-     >        ' n1  n2  n3 is1 is2 is3 ie1 ie2 ie3'
-         do  i=0,nprocs-1
-            if( me .eq. i )then
-               write(*,9) me,k,lt,ng(1,k),ng(2,k),ng(3,k),
-     >              n1,n2,n3,is1,is2,is3,ie1,ie2,ie3
- 9             format(15i4)
-            endif
-            call mpi_barrier(mpi_comm_world,ierr)
-         enddo
-      endif
-      if( debug_vec(1) .ge. 2 )then
-         do  i=0,nprocs-1
-            if( me .eq. i )then
-               write(*,*)' '
-               write(*,*)' processor =',me
-               do  k=lt,1,-1
-                  write(*,7)k,idi(1),idi(2),idi(3),
-     >                 ((nbr(d,j,k),j=-1,1,2),d=1,3),
-     >                 (mi(d,k),d=1,3)
-               enddo
- 7             format(i4,'idi=',3i4,'nbr=',3(2i4,'  '),'mi=',3i4,' ')
-               write(*,*)'idi(s) = ',(idi(s),s=1,3)
-               write(*,*)'dead(2), dead(1) = ',dead(2),dead(1)
-               do  ax=1,3
-                  write(*,*)'give_ex(ax,2)= ',give_ex(ax,2)
-                  write(*,*)'take_ex(ax,2)= ',take_ex(ax,2)
-               enddo
-            endif
-            call mpi_barrier(mpi_comm_world,ierr)
-         enddo
-      endif
-
-      k = lt
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine mg3P(u,v,r,a,c,n1,n2,n3,k)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     multigrid V-cycle routine
-c---------------------------------------------------------------------
-      implicit none
-
-      include 'mpinpb.h'
-      include 'globals.h'
-
-      integer n1, n2, n3, k
-      double precision u(nr),v(nv),r(nr)
-      double precision a(0:3),c(0:3)
-
-      integer j
-
-c---------------------------------------------------------------------
-c     down cycle.
-c     restrict the residual from the find grid to the coarse
-c---------------------------------------------------------------------
-
-      do  k= lt, lb+1 , -1
-         j = k-1
-         call rprj3(r(ir(k)),m1(k),m2(k),m3(k),
-     >        r(ir(j)),m1(j),m2(j),m3(j),k)
-      enddo
-
-      k = lb
-c---------------------------------------------------------------------
-c     compute an approximate solution on the coarsest grid
-c---------------------------------------------------------------------
-      call zero3(u(ir(k)),m1(k),m2(k),m3(k))
-      call psinv(r(ir(k)),u(ir(k)),m1(k),m2(k),m3(k),c,k)
-
-      do  k = lb+1, lt-1     
-          j = k-1
-c---------------------------------------------------------------------
-c        prolongate from level k-1  to k
-c---------------------------------------------------------------------
-         call zero3(u(ir(k)),m1(k),m2(k),m3(k))
-         call interp(u(ir(j)),m1(j),m2(j),m3(j),
-     >               u(ir(k)),m1(k),m2(k),m3(k),k)
-c---------------------------------------------------------------------
-c        compute residual for level k
-c---------------------------------------------------------------------
-         call resid(u(ir(k)),r(ir(k)),r(ir(k)),m1(k),m2(k),m3(k),a,k)
-c---------------------------------------------------------------------
-c        apply smoother
-c---------------------------------------------------------------------
-         call psinv(r(ir(k)),u(ir(k)),m1(k),m2(k),m3(k),c,k)
-      enddo
- 200  continue
-      j = lt - 1
-      k = lt
-      call interp(u(ir(j)),m1(j),m2(j),m3(j),u,n1,n2,n3,k)
-      call resid(u,v,r,n1,n2,n3,a,k)
-      call psinv(r,u,n1,n2,n3,c,k)
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine psinv( r,u,n1,n2,n3,c,k)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     psinv applies an approximate inverse as smoother:  u = u + Cr
-c
-c     This  implementation costs  15A + 4M per result, where
-c     A and M denote the costs of Addition and Multiplication.  
-c     Presuming coefficient c(3) is zero (the NPB assumes this,
-c     but it is thus not a general case), 2A + 1M may be eliminated,
-c     resulting in 13A + 3M.
-c     Note that this vectorizes, and is also fine for cache 
-c     based machines.  
-c---------------------------------------------------------------------
-      implicit none
-
-      include 'globals.h'
-
-      integer n1,n2,n3,k
-      double precision u(n1,n2,n3),r(n1,n2,n3),c(0:3)
-      integer i3, i2, i1
-
-      double precision r1(m), r2(m)
-      
-      do i3=2,n3-1
-         do i2=2,n2-1
-            do i1=1,n1
-               r1(i1) = r(i1,i2-1,i3) + r(i1,i2+1,i3)
-     >                + r(i1,i2,i3-1) + r(i1,i2,i3+1)
-               r2(i1) = r(i1,i2-1,i3-1) + r(i1,i2+1,i3-1)
-     >                + r(i1,i2-1,i3+1) + r(i1,i2+1,i3+1)
-            enddo
-            do i1=2,n1-1
-               u(i1,i2,i3) = u(i1,i2,i3)
-     >                     + c(0) * r(i1,i2,i3)
-     >                     + c(1) * ( r(i1-1,i2,i3) + r(i1+1,i2,i3)
-     >                              + r1(i1) )
-     >                     + c(2) * ( r2(i1) + r1(i1-1) + r1(i1+1) )
-c---------------------------------------------------------------------
-c  Assume c(3) = 0    (Enable line below if c(3) not= 0)
-c---------------------------------------------------------------------
-c    >                     + c(3) * ( r2(i1-1) + r2(i1+1) )
-c---------------------------------------------------------------------
-            enddo
-         enddo
-      enddo
-
-c---------------------------------------------------------------------
-c     exchange boundary points
-c---------------------------------------------------------------------
-      call comm3(u,n1,n2,n3,k)
-
-      if( debug_vec(0) .ge. 1 )then
-         call rep_nrm(u,n1,n2,n3,'   psinv',k)
-      endif
-
-      if( debug_vec(3) .ge. k )then
-         call showall(u,n1,n2,n3)
-      endif
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine resid( u,v,r,n1,n2,n3,a,k )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     resid computes the residual:  r = v - Au
-c
-c     This  implementation costs  15A + 4M per result, where
-c     A and M denote the costs of Addition (or Subtraction) and 
-c     Multiplication, respectively. 
-c     Presuming coefficient a(1) is zero (the NPB assumes this,
-c     but it is thus not a general case), 3A + 1M may be eliminated,
-c     resulting in 12A + 3M.
-c     Note that this vectorizes, and is also fine for cache 
-c     based machines.  
-c---------------------------------------------------------------------
-      implicit none
-
-      include 'globals.h'
-
-      integer n1,n2,n3,k
-      double precision u(n1,n2,n3),v(n1,n2,n3),r(n1,n2,n3),a(0:3)
-      integer i3, i2, i1
-      double precision u1(m), u2(m)
-
-      do i3=2,n3-1
-         do i2=2,n2-1
-            do i1=1,n1
-               u1(i1) = u(i1,i2-1,i3) + u(i1,i2+1,i3)
-     >                + u(i1,i2,i3-1) + u(i1,i2,i3+1)
-               u2(i1) = u(i1,i2-1,i3-1) + u(i1,i2+1,i3-1)
-     >                + u(i1,i2-1,i3+1) + u(i1,i2+1,i3+1)
-            enddo
-            do i1=2,n1-1
-               r(i1,i2,i3) = v(i1,i2,i3)
-     >                     - a(0) * u(i1,i2,i3)
-c---------------------------------------------------------------------
-c  Assume a(1) = 0      (Enable 2 lines below if a(1) not= 0)
-c---------------------------------------------------------------------
-c    >                     - a(1) * ( u(i1-1,i2,i3) + u(i1+1,i2,i3)
-c    >                              + u1(i1) )
-c---------------------------------------------------------------------
-     >                     - a(2) * ( u2(i1) + u1(i1-1) + u1(i1+1) )
-     >                     - a(3) * ( u2(i1-1) + u2(i1+1) )
-            enddo
-         enddo
-      enddo
-
-c---------------------------------------------------------------------
-c     exchange boundary data
-c---------------------------------------------------------------------
-      call comm3(r,n1,n2,n3,k)
-
-      if( debug_vec(0) .ge. 1 )then
-         call rep_nrm(r,n1,n2,n3,'   resid',k)
-      endif
-
-      if( debug_vec(2) .ge. k )then
-         call showall(r,n1,n2,n3)
-      endif
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine rprj3( r,m1k,m2k,m3k,s,m1j,m2j,m3j,k )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     rprj3 projects onto the next coarser grid, 
-c     using a trilinear Finite Element projection:  s = r' = P r
-c     
-c     This  implementation costs  20A + 4M per result, where
-c     A and M denote the costs of Addition and Multiplication.  
-c     Note that this vectorizes, and is also fine for cache 
-c     based machines.  
-c---------------------------------------------------------------------
-      implicit none
-
-      include 'mpinpb.h'
-      include 'globals.h'
-
-      integer m1k, m2k, m3k, m1j, m2j, m3j,k
-      double precision r(m1k,m2k,m3k), s(m1j,m2j,m3j)
-      integer j3, j2, j1, i3, i2, i1, d1, d2, d3, j
-
-      double precision x1(m), y1(m), x2,y2
-
-
-      if(m1k.eq.3)then
-        d1 = 2
-      else
-        d1 = 1
-      endif
-
-      if(m2k.eq.3)then
-        d2 = 2
-      else
-        d2 = 1
-      endif
-
-      if(m3k.eq.3)then
-        d3 = 2
-      else
-        d3 = 1
-      endif
-
-      do  j3=2,m3j-1
-         i3 = 2*j3-d3
-C        i3 = 2*j3-1
-         do  j2=2,m2j-1
-            i2 = 2*j2-d2
-C           i2 = 2*j2-1
-
-            do j1=2,m1j
-              i1 = 2*j1-d1
-C             i1 = 2*j1-1
-              x1(i1-1) = r(i1-1,i2-1,i3  ) + r(i1-1,i2+1,i3  )
-     >                 + r(i1-1,i2,  i3-1) + r(i1-1,i2,  i3+1)
-              y1(i1-1) = r(i1-1,i2-1,i3-1) + r(i1-1,i2-1,i3+1)
-     >                 + r(i1-1,i2+1,i3-1) + r(i1-1,i2+1,i3+1)
-            enddo
-
-            do  j1=2,m1j-1
-              i1 = 2*j1-d1
-C             i1 = 2*j1-1
-              y2 = r(i1,  i2-1,i3-1) + r(i1,  i2-1,i3+1)
-     >           + r(i1,  i2+1,i3-1) + r(i1,  i2+1,i3+1)
-              x2 = r(i1,  i2-1,i3  ) + r(i1,  i2+1,i3  )
-     >           + r(i1,  i2,  i3-1) + r(i1,  i2,  i3+1)
-              s(j1,j2,j3) =
-     >               0.5D0 * r(i1,i2,i3)
-     >             + 0.25D0 * ( r(i1-1,i2,i3) + r(i1+1,i2,i3) + x2)
-     >             + 0.125D0 * ( x1(i1-1) + x1(i1+1) + y2)
-     >             + 0.0625D0 * ( y1(i1-1) + y1(i1+1) )
-            enddo
-
-         enddo
-      enddo
-
-
-      j = k-1
-      call comm3(s,m1j,m2j,m3j,j)
-
-      if( debug_vec(0) .ge. 1 )then
-         call rep_nrm(s,m1j,m2j,m3j,'   rprj3',k-1)
-      endif
-
-      if( debug_vec(4) .ge. k )then
-         call showall(s,m1j,m2j,m3j)
-      endif
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine interp( z,mm1,mm2,mm3,u,n1,n2,n3,k )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     interp adds the trilinear interpolation of the correction
-c     from the coarser grid to the current approximation:  u = u + Qu'
-c     
-c     Observe that this  implementation costs  16A + 4M, where
-c     A and M denote the costs of Addition and Multiplication.  
-c     Note that this vectorizes, and is also fine for cache 
-c     based machines.  Vector machines may get slightly better 
-c     performance however, with 8 separate "do i1" loops, rather than 4.
-c---------------------------------------------------------------------
-      implicit none
-
-      include 'mpinpb.h'
-      include 'globals.h'
-
-      integer mm1, mm2, mm3, n1, n2, n3,k
-      double precision z(mm1,mm2,mm3),u(n1,n2,n3)
-      integer i3, i2, i1, d1, d2, d3, t1, t2, t3
-
-c note that m = 1037 in globals.h but for this only need to be
-c 535 to handle up to 1024^3
-c      integer m
-c      parameter( m=535 )
-      double precision z1(m),z2(m),z3(m)
-
-
-      if( n1 .ne. 3 .and. n2 .ne. 3 .and. n3 .ne. 3 ) then
-
-         do  i3=1,mm3-1
-            do  i2=1,mm2-1
-
-               do i1=1,mm1
-                  z1(i1) = z(i1,i2+1,i3) + z(i1,i2,i3)
-                  z2(i1) = z(i1,i2,i3+1) + z(i1,i2,i3)
-                  z3(i1) = z(i1,i2+1,i3+1) + z(i1,i2,i3+1) + z1(i1)
-               enddo
-
-               do  i1=1,mm1-1
-                  u(2*i1-1,2*i2-1,2*i3-1)=u(2*i1-1,2*i2-1,2*i3-1)
-     >                 +z(i1,i2,i3)
-                  u(2*i1,2*i2-1,2*i3-1)=u(2*i1,2*i2-1,2*i3-1)
-     >                 +0.5d0*(z(i1+1,i2,i3)+z(i1,i2,i3))
-               enddo
-               do i1=1,mm1-1
-                  u(2*i1-1,2*i2,2*i3-1)=u(2*i1-1,2*i2,2*i3-1)
-     >                 +0.5d0 * z1(i1)
-                  u(2*i1,2*i2,2*i3-1)=u(2*i1,2*i2,2*i3-1)
-     >                 +0.25d0*( z1(i1) + z1(i1+1) )
-               enddo
-               do i1=1,mm1-1
-                  u(2*i1-1,2*i2-1,2*i3)=u(2*i1-1,2*i2-1,2*i3)
-     >                 +0.5d0 * z2(i1)
-                  u(2*i1,2*i2-1,2*i3)=u(2*i1,2*i2-1,2*i3)
-     >                 +0.25d0*( z2(i1) + z2(i1+1) )
-               enddo
-               do i1=1,mm1-1
-                  u(2*i1-1,2*i2,2*i3)=u(2*i1-1,2*i2,2*i3)
-     >                 +0.25d0* z3(i1)
-                  u(2*i1,2*i2,2*i3)=u(2*i1,2*i2,2*i3)
-     >                 +0.125d0*( z3(i1) + z3(i1+1) )
-               enddo
-            enddo
-         enddo
-
-      else
-
-         if(n1.eq.3)then
-            d1 = 2
-            t1 = 1
-         else
-            d1 = 1
-            t1 = 0
-         endif
-         
-         if(n2.eq.3)then
-            d2 = 2
-            t2 = 1
-         else
-            d2 = 1
-            t2 = 0
-         endif
-         
-         if(n3.eq.3)then
-            d3 = 2
-            t3 = 1
-         else
-            d3 = 1
-            t3 = 0
-         endif
-         
-         do  i3=d3,mm3-1
-            do  i2=d2,mm2-1
-               do  i1=d1,mm1-1
-                  u(2*i1-d1,2*i2-d2,2*i3-d3)=u(2*i1-d1,2*i2-d2,2*i3-d3)
-     >                 +z(i1,i2,i3)
-               enddo
-               do  i1=1,mm1-1
-                  u(2*i1-t1,2*i2-d2,2*i3-d3)=u(2*i1-t1,2*i2-d2,2*i3-d3)
-     >                 +0.5D0*(z(i1+1,i2,i3)+z(i1,i2,i3))
-               enddo
-            enddo
-            do  i2=1,mm2-1
-               do  i1=d1,mm1-1
-                  u(2*i1-d1,2*i2-t2,2*i3-d3)=u(2*i1-d1,2*i2-t2,2*i3-d3)
-     >                 +0.5D0*(z(i1,i2+1,i3)+z(i1,i2,i3))
-               enddo
-               do  i1=1,mm1-1
-                  u(2*i1-t1,2*i2-t2,2*i3-d3)=u(2*i1-t1,2*i2-t2,2*i3-d3)
-     >                 +0.25D0*(z(i1+1,i2+1,i3)+z(i1+1,i2,i3)
-     >                 +z(i1,  i2+1,i3)+z(i1,  i2,i3))
-               enddo
-            enddo
-         enddo
-
-         do  i3=1,mm3-1
-            do  i2=d2,mm2-1
-               do  i1=d1,mm1-1
-                  u(2*i1-d1,2*i2-d2,2*i3-t3)=u(2*i1-d1,2*i2-d2,2*i3-t3)
-     >                 +0.5D0*(z(i1,i2,i3+1)+z(i1,i2,i3))
-               enddo
-               do  i1=1,mm1-1
-                  u(2*i1-t1,2*i2-d2,2*i3-t3)=u(2*i1-t1,2*i2-d2,2*i3-t3)
-     >                 +0.25D0*(z(i1+1,i2,i3+1)+z(i1,i2,i3+1)
-     >                 +z(i1+1,i2,i3  )+z(i1,i2,i3  ))
-               enddo
-            enddo
-            do  i2=1,mm2-1
-               do  i1=d1,mm1-1
-                  u(2*i1-d1,2*i2-t2,2*i3-t3)=u(2*i1-d1,2*i2-t2,2*i3-t3)
-     >                 +0.25D0*(z(i1,i2+1,i3+1)+z(i1,i2,i3+1)
-     >                 +z(i1,i2+1,i3  )+z(i1,i2,i3  ))
-               enddo
-               do  i1=1,mm1-1
-                  u(2*i1-t1,2*i2-t2,2*i3-t3)=u(2*i1-t1,2*i2-t2,2*i3-t3)
-     >                 +0.125D0*(z(i1+1,i2+1,i3+1)+z(i1+1,i2,i3+1)
-     >                 +z(i1  ,i2+1,i3+1)+z(i1  ,i2,i3+1)
-     >                 +z(i1+1,i2+1,i3  )+z(i1+1,i2,i3  )
-     >                 +z(i1  ,i2+1,i3  )+z(i1  ,i2,i3  ))
-               enddo
-            enddo
-         enddo
-
-      endif
-
-      call comm3_ex(u,n1,n2,n3,k)
-
-      if( debug_vec(0) .ge. 1 )then
-         call rep_nrm(z,mm1,mm2,mm3,'z: inter',k-1)
-         call rep_nrm(u,n1,n2,n3,'u: inter',k)
-      endif
-
-      if( debug_vec(5) .ge. k )then
-         call showall(z,mm1,mm2,mm3)
-         call showall(u,n1,n2,n3)
-      endif
-
-      return 
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine norm2u3(r,n1,n2,n3,rnm2,rnmu,nx,ny,nz)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     norm2u3 evaluates approximations to the L2 norm and the
-c     uniform (or L-infinity or Chebyshev) norm, under the
-c     assumption that the boundaries are periodic or zero.  Add the
-c     boundaries in with half weight (quarter weight on the edges
-c     and eighth weight at the corners) for inhomogeneous boundaries.
-c---------------------------------------------------------------------
-      implicit none
-
-      include 'mpinpb.h'
-
-      integer n1, n2, n3, nx, ny, nz
-      double precision rnm2, rnmu, r(n1,n2,n3)
-      double precision s, a, ss
-      integer i3, i2, i1, ierr
-
-      double precision dn
-
-      dn = 1.0d0*nx*ny*nz
-
-      s=0.0D0
-      rnmu = 0.0D0
-      do  i3=2,n3-1
-         do  i2=2,n2-1
-            do  i1=2,n1-1
-               s=s+r(i1,i2,i3)**2
-               a=abs(r(i1,i2,i3))
-               if(a.gt.rnmu)rnmu=a
-            enddo
-         enddo
-      enddo
-
-      call mpi_allreduce(rnmu,ss,1,dp_type,
-     >     mpi_max,mpi_comm_world,ierr)
-      rnmu = ss
-      call mpi_allreduce(s, ss, 1, dp_type,
-     >     mpi_sum,mpi_comm_world,ierr)
-      s = ss
-      rnm2=sqrt( s / dn )
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine rep_nrm(u,n1,n2,n3,title,kk)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     report on norm
-c---------------------------------------------------------------------
-      implicit none
-
-      include 'mpinpb.h'
-      include 'globals.h'
-
-      integer n1, n2, n3, kk
-      double precision u(n1,n2,n3)
-      character*8 title
-
-      double precision rnm2, rnmu
-
-
-      call norm2u3(u,n1,n2,n3,rnm2,rnmu,nx(kk),ny(kk),nz(kk))
-      if( me .eq. root )then
-         write(*,7)kk,title,rnm2,rnmu
- 7       format(' Level',i2,' in ',a8,': norms =',D21.14,D21.14)
-      endif
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine comm3(u,n1,n2,n3,kk)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     comm3 organizes the communication on all borders 
-c---------------------------------------------------------------------
-      implicit none
-
-      include 'mpinpb.h'
-      include 'globals.h'
-
-      integer n1, n2, n3, kk
-      double precision u(n1,n2,n3)
-      integer axis
-
-      if( .not. dead(kk) )then
-         do  axis = 1, 3
-            if( nprocs .ne. 1) then
-   
-               call ready( axis, -1, kk )
-               call ready( axis, +1, kk )
-   
-               call give3( axis, +1, u, n1, n2, n3, kk )
-               call give3( axis, -1, u, n1, n2, n3, kk )
-   
-               call take3( axis, -1, u, n1, n2, n3 )
-               call take3( axis, +1, u, n1, n2, n3 )
-   
-            else
-               call comm1p( axis, u, n1, n2, n3, kk )
-            endif
-         enddo
-      else
-         call zero3(u,n1,n2,n3)
-      endif
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine comm3_ex(u,n1,n2,n3,kk)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     comm3_ex  communicates to expand the number of processors
-c---------------------------------------------------------------------
-      implicit none
-
-      include 'mpinpb.h'
-      include 'globals.h'
-
-      integer n1, n2, n3, kk
-      double precision u(n1,n2,n3)
-      integer axis
-
-      do  axis = 1, 3
-         if( nprocs .ne. 1 ) then
-            if( take_ex( axis, kk ) )then
-               call ready( axis, -1, kk )
-               call ready( axis, +1, kk )
-               call take3_ex( axis, -1, u, n1, n2, n3 )
-               call take3_ex( axis, +1, u, n1, n2, n3 )
-            endif
-   
-            if( give_ex( axis, kk ) )then
-               call give3_ex( axis, +1, u, n1, n2, n3, kk )
-               call give3_ex( axis, -1, u, n1, n2, n3, kk )
-            endif
-         else
-            call comm1p_ex( axis, u, n1, n2, n3, kk )
-         endif
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine ready( axis, dir, k )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     ready allocates a buffer to take in a message
-c---------------------------------------------------------------------
-      implicit none
-
-      include 'mpinpb.h'
-      include 'globals.h'
-
-      integer axis, dir, k
-      integer buff_id,buff_len,i,ierr
-
-      buff_id = 3 + dir
-      buff_len = nm2
-
-      do  i=1,nm2
-         buff(i,buff_id) = 0.0D0
-      enddo
-
-
-c---------------------------------------------------------------------
-c     fake message request type
-c---------------------------------------------------------------------
-      msg_id(axis,dir,1) = msg_type(axis,dir) +1000*me
-
-      call mpi_irecv( buff(1,buff_id), buff_len,
-     >     dp_type, nbr(axis,-dir,k), msg_type(axis,dir), 
-     >     mpi_comm_world, msg_id(axis,dir,1), ierr)
-      return
-      end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine give3( axis, dir, u, n1, n2, n3, k )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     give3 sends border data out in the requested direction
-c---------------------------------------------------------------------
-      implicit none
-
-      include 'mpinpb.h'
-      include 'globals.h'
-
-      integer axis, dir, n1, n2, n3, k, ierr
-      double precision u( n1, n2, n3 )
-
-      integer i3, i2, i1, buff_len,buff_id
-
-      buff_id = 2 + dir 
-      buff_len = 0
-
-      if( axis .eq.  1 )then
-         if( dir .eq. -1 )then
-
-            do  i3=2,n3-1
-               do  i2=2,n2-1
-                  buff_len = buff_len + 1
-                  buff(buff_len,buff_id ) = u( 2,  i2,i3)
-               enddo
-            enddo
-
-            call mpi_send( 
-     >           buff(1, buff_id ), buff_len,dp_type,
-     >           nbr( axis, dir, k ), msg_type(axis,dir), 
-     >           mpi_comm_world, ierr)
-
-         else if( dir .eq. +1 ) then
-
-            do  i3=2,n3-1
-               do  i2=2,n2-1
-                  buff_len = buff_len + 1
-                  buff(buff_len, buff_id ) = u( n1-1, i2,i3)
-               enddo
-            enddo
-
-            call mpi_send( 
-     >           buff(1, buff_id ), buff_len,dp_type,
-     >           nbr( axis, dir, k ), msg_type(axis,dir), 
-     >           mpi_comm_world, ierr)
-
-         endif
-      endif
-
-      if( axis .eq.  2 )then
-         if( dir .eq. -1 )then
-
-            do  i3=2,n3-1
-               do  i1=1,n1
-                  buff_len = buff_len + 1
-                  buff(buff_len, buff_id ) = u( i1,  2,i3)
-               enddo
-            enddo
-
-            call mpi_send( 
-     >           buff(1, buff_id ), buff_len,dp_type,
-     >           nbr( axis, dir, k ), msg_type(axis,dir), 
-     >           mpi_comm_world, ierr)
-
-         else if( dir .eq. +1 ) then
-
-            do  i3=2,n3-1
-               do  i1=1,n1
-                  buff_len = buff_len + 1
-                  buff(buff_len,  buff_id )= u( i1,n2-1,i3)
-               enddo
-            enddo
-
-            call mpi_send( 
-     >           buff(1, buff_id ), buff_len,dp_type,
-     >           nbr( axis, dir, k ), msg_type(axis,dir), 
-     >           mpi_comm_world, ierr)
-
-         endif
-      endif
-
-      if( axis .eq.  3 )then
-         if( dir .eq. -1 )then
-
-            do  i2=1,n2
-               do  i1=1,n1
-                  buff_len = buff_len + 1
-                  buff(buff_len, buff_id ) = u( i1,i2,2)
-               enddo
-            enddo
-
-            call mpi_send( 
-     >           buff(1, buff_id ), buff_len,dp_type,
-     >           nbr( axis, dir, k ), msg_type(axis,dir), 
-     >           mpi_comm_world, ierr)
-
-         else if( dir .eq. +1 ) then
-
-            do  i2=1,n2
-               do  i1=1,n1
-                  buff_len = buff_len + 1
-                  buff(buff_len, buff_id ) = u( i1,i2,n3-1)
-               enddo
-            enddo
-
-            call mpi_send( 
-     >           buff(1, buff_id ), buff_len,dp_type,
-     >           nbr( axis, dir, k ), msg_type(axis,dir), 
-     >           mpi_comm_world, ierr)
-
-         endif
-      endif
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine take3( axis, dir, u, n1, n2, n3 )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     take3 copies in border data from the requested direction
-c---------------------------------------------------------------------
-      implicit none
-
-      include 'mpinpb.h'
-      include 'globals.h'
-
-      integer axis, dir, n1, n2, n3
-      double precision u( n1, n2, n3 )
-
-      integer buff_id, indx
-
-      integer status(mpi_status_size), ierr
-
-      integer i3, i2, i1
-
-      call mpi_wait( msg_id( axis, dir, 1 ),status,ierr)
-      buff_id = 3 + dir
-      indx = 0
-
-      if( axis .eq.  1 )then
-         if( dir .eq. -1 )then
-
-            do  i3=2,n3-1
-               do  i2=2,n2-1
-                  indx = indx + 1
-                  u(n1,i2,i3) = buff(indx, buff_id )
-               enddo
-            enddo
-
-         else if( dir .eq. +1 ) then
-
-            do  i3=2,n3-1
-               do  i2=2,n2-1
-                  indx = indx + 1
-                  u(1,i2,i3) = buff(indx, buff_id )
-               enddo
-            enddo
-
-         endif
-      endif
-
-      if( axis .eq.  2 )then
-         if( dir .eq. -1 )then
-
-            do  i3=2,n3-1
-               do  i1=1,n1
-                  indx = indx + 1
-                  u(i1,n2,i3) = buff(indx, buff_id )
-               enddo
-            enddo
-
-         else if( dir .eq. +1 ) then
-
-            do  i3=2,n3-1
-               do  i1=1,n1
-                  indx = indx + 1
-                  u(i1,1,i3) = buff(indx, buff_id )
-               enddo
-            enddo
-
-         endif
-      endif
-
-      if( axis .eq.  3 )then
-         if( dir .eq. -1 )then
-
-            do  i2=1,n2
-               do  i1=1,n1
-                  indx = indx + 1
-                  u(i1,i2,n3) = buff(indx, buff_id )
-               enddo
-            enddo
-
-         else if( dir .eq. +1 ) then
-
-            do  i2=1,n2
-               do  i1=1,n1
-                  indx = indx + 1
-                  u(i1,i2,1) = buff(indx, buff_id )
-               enddo
-            enddo
-
-         endif
-      endif
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine give3_ex( axis, dir, u, n1, n2, n3, k )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     give3_ex sends border data out to expand number of processors
-c---------------------------------------------------------------------
-      implicit none
-
-      include 'mpinpb.h'
-      include 'globals.h'
-
-      integer axis, dir, n1, n2, n3, k, ierr
-      double precision u( n1, n2, n3 )
-
-      integer i3, i2, i1, buff_len, buff_id
-
-      buff_id = 2 + dir 
-      buff_len = 0
-
-      if( axis .eq.  1 )then
-         if( dir .eq. -1 )then
-
-            do  i3=1,n3
-               do  i2=1,n2
-                  buff_len = buff_len + 1
-                  buff(buff_len,buff_id ) = u( 2,  i2,i3)
-               enddo
-            enddo
-
-            call mpi_send( 
-     >           buff(1, buff_id ), buff_len,dp_type,
-     >           nbr( axis, dir, k ), msg_type(axis,dir), 
-     >           mpi_comm_world, ierr)
-
-         else if( dir .eq. +1 ) then
-
-            do  i3=1,n3
-               do  i2=1,n2
-                  do  i1=n1-1,n1
-                     buff_len = buff_len + 1
-                     buff(buff_len,buff_id)= u(i1,i2,i3)
-                  enddo
-               enddo
-            enddo
-
-            call mpi_send( 
-     >           buff(1, buff_id ), buff_len,dp_type,
-     >           nbr( axis, dir, k ), msg_type(axis,dir), 
-     >           mpi_comm_world, ierr)
-
-         endif
-      endif
-
-      if( axis .eq.  2 )then
-         if( dir .eq. -1 )then
-
-            do  i3=1,n3
-               do  i1=1,n1
-                  buff_len = buff_len + 1
-                  buff(buff_len, buff_id ) = u( i1,  2,i3)
-               enddo
-            enddo
-
-            call mpi_send( 
-     >           buff(1, buff_id ), buff_len,dp_type,
-     >           nbr( axis, dir, k ), msg_type(axis,dir), 
-     >           mpi_comm_world, ierr)
-
-         else if( dir .eq. +1 ) then
-
-            do  i3=1,n3
-               do  i2=n2-1,n2
-                  do  i1=1,n1
-                     buff_len = buff_len + 1
-                     buff(buff_len,buff_id )= u(i1,i2,i3)
-                  enddo
-               enddo
-            enddo
-
-            call mpi_send( 
-     >           buff(1, buff_id ), buff_len,dp_type,
-     >           nbr( axis, dir, k ), msg_type(axis,dir), 
-     >           mpi_comm_world, ierr)
-
-         endif
-      endif
-
-      if( axis .eq.  3 )then
-         if( dir .eq. -1 )then
-
-            do  i2=1,n2
-               do  i1=1,n1
-                  buff_len = buff_len + 1
-                  buff(buff_len, buff_id ) = u( i1,i2,2)
-               enddo
-            enddo
-
-            call mpi_send( 
-     >           buff(1, buff_id ), buff_len,dp_type,
-     >           nbr( axis, dir, k ), msg_type(axis,dir), 
-     >           mpi_comm_world, ierr)
-
-         else if( dir .eq. +1 ) then
-
-            do  i3=n3-1,n3
-               do  i2=1,n2
-                  do  i1=1,n1
-                     buff_len = buff_len + 1
-                     buff(buff_len, buff_id ) = u( i1,i2,i3)
-                  enddo
-               enddo
-            enddo
-
-            call mpi_send( 
-     >           buff(1, buff_id ), buff_len,dp_type,
-     >           nbr( axis, dir, k ), msg_type(axis,dir), 
-     >           mpi_comm_world, ierr)
-
-         endif
-      endif
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine take3_ex( axis, dir, u, n1, n2, n3 )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     take3_ex copies in border data to expand number of processors
-c---------------------------------------------------------------------
-      implicit none
-
-      include 'mpinpb.h'
-      include 'globals.h'
-
-      integer axis, dir, n1, n2, n3
-      double precision u( n1, n2, n3 )
-
-      integer buff_id, indx
-
-      integer status(mpi_status_size) , ierr
-
-      integer i3, i2, i1
-
-      call mpi_wait( msg_id( axis, dir, 1 ),status,ierr)
-      buff_id = 3 + dir
-      indx = 0
-
-      if( axis .eq.  1 )then
-         if( dir .eq. -1 )then
-
-            do  i3=1,n3
-               do  i2=1,n2
-                  indx = indx + 1
-                  u(n1,i2,i3) = buff(indx, buff_id )
-               enddo
-            enddo
-
-         else if( dir .eq. +1 ) then
-
-            do  i3=1,n3
-               do  i2=1,n2
-                  do  i1=1,2
-                     indx = indx + 1
-                     u(i1,i2,i3) = buff(indx,buff_id)
-                  enddo
-               enddo
-            enddo
-
-         endif
-      endif
-
-      if( axis .eq.  2 )then
-         if( dir .eq. -1 )then
-
-            do  i3=1,n3
-               do  i1=1,n1
-                  indx = indx + 1
-                  u(i1,n2,i3) = buff(indx, buff_id )
-               enddo
-            enddo
-
-         else if( dir .eq. +1 ) then
-
-            do  i3=1,n3
-               do  i2=1,2
-                  do  i1=1,n1
-                     indx = indx + 1
-                     u(i1,i2,i3) = buff(indx,buff_id)
-                  enddo
-               enddo
-            enddo
-
-         endif
-      endif
-
-      if( axis .eq.  3 )then
-         if( dir .eq. -1 )then
-
-            do  i2=1,n2
-               do  i1=1,n1
-                  indx = indx + 1
-                  u(i1,i2,n3) = buff(indx, buff_id )
-               enddo
-            enddo
-
-         else if( dir .eq. +1 ) then
-
-            do  i3=1,2
-               do  i2=1,n2
-                  do  i1=1,n1
-                     indx = indx + 1
-                     u(i1,i2,i3) = buff(indx,buff_id)
-                  enddo
-               enddo
-            enddo
-
-         endif
-      endif
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine comm1p( axis, u, n1, n2, n3, kk )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-      include 'globals.h'
-
-      integer axis, dir, n1, n2, n3
-      double precision u( n1, n2, n3 )
-
-      integer i3, i2, i1, buff_len,buff_id
-      integer i, kk, indx
-
-      dir = -1
-
-      buff_id = 3 + dir
-      buff_len = nm2
-
-      do  i=1,nm2
-         buff(i,buff_id) = 0.0D0
-      enddo
-
-
-      dir = +1
-
-      buff_id = 3 + dir
-      buff_len = nm2
-
-      do  i=1,nm2
-         buff(i,buff_id) = 0.0D0
-      enddo
-
-      dir = +1
-
-      buff_id = 2 + dir 
-      buff_len = 0
-
-      if( axis .eq.  1 )then
-         do  i3=2,n3-1
-            do  i2=2,n2-1
-               buff_len = buff_len + 1
-               buff(buff_len, buff_id ) = u( n1-1, i2,i3)
-            enddo
-         enddo
-      endif
-
-      if( axis .eq.  2 )then
-         do  i3=2,n3-1
-            do  i1=1,n1
-               buff_len = buff_len + 1
-               buff(buff_len,  buff_id )= u( i1,n2-1,i3)
-            enddo
-         enddo
-      endif
-
-      if( axis .eq.  3 )then
-         do  i2=1,n2
-            do  i1=1,n1
-               buff_len = buff_len + 1
-               buff(buff_len, buff_id ) = u( i1,i2,n3-1)
-            enddo
-         enddo
-      endif
-
-      dir = -1
-
-      buff_id = 2 + dir 
-      buff_len = 0
-
-      if( axis .eq.  1 )then
-         do  i3=2,n3-1
-            do  i2=2,n2-1
-               buff_len = buff_len + 1
-               buff(buff_len,buff_id ) = u( 2,  i2,i3)
-            enddo
-         enddo
-      endif
-
-      if( axis .eq.  2 )then
-         do  i3=2,n3-1
-            do  i1=1,n1
-               buff_len = buff_len + 1
-               buff(buff_len, buff_id ) = u( i1,  2,i3)
-            enddo
-         enddo
-      endif
-
-      if( axis .eq.  3 )then
-         do  i2=1,n2
-            do  i1=1,n1
-               buff_len = buff_len + 1
-               buff(buff_len, buff_id ) = u( i1,i2,2)
-            enddo
-         enddo
-      endif
-
-      do  i=1,nm2
-         buff(i,4) = buff(i,3)
-         buff(i,2) = buff(i,1)
-      enddo
-
-      dir = -1
-
-      buff_id = 3 + dir
-      indx = 0
-
-      if( axis .eq.  1 )then
-         do  i3=2,n3-1
-            do  i2=2,n2-1
-               indx = indx + 1
-               u(n1,i2,i3) = buff(indx, buff_id )
-            enddo
-         enddo
-      endif
-
-      if( axis .eq.  2 )then
-         do  i3=2,n3-1
-            do  i1=1,n1
-               indx = indx + 1
-               u(i1,n2,i3) = buff(indx, buff_id )
-            enddo
-         enddo
-      endif
-
-      if( axis .eq.  3 )then
-         do  i2=1,n2
-            do  i1=1,n1
-               indx = indx + 1
-               u(i1,i2,n3) = buff(indx, buff_id )
-            enddo
-         enddo
-      endif
-
-
-      dir = +1
-
-      buff_id = 3 + dir
-      indx = 0
-
-      if( axis .eq.  1 )then
-         do  i3=2,n3-1
-            do  i2=2,n2-1
-               indx = indx + 1
-               u(1,i2,i3) = buff(indx, buff_id )
-            enddo
-         enddo
-      endif
-
-      if( axis .eq.  2 )then
-         do  i3=2,n3-1
-            do  i1=1,n1
-               indx = indx + 1
-               u(i1,1,i3) = buff(indx, buff_id )
-            enddo
-         enddo
-      endif
-
-      if( axis .eq.  3 )then
-         do  i2=1,n2
-            do  i1=1,n1
-               indx = indx + 1
-               u(i1,i2,1) = buff(indx, buff_id )
-            enddo
-         enddo
-      endif
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine comm1p_ex( axis, u, n1, n2, n3, kk )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-      include 'globals.h'
-
-      integer axis, dir, n1, n2, n3
-      double precision u( n1, n2, n3 )
-
-      integer i3, i2, i1, buff_len,buff_id
-      integer i, kk, indx
-
-      if( take_ex( axis, kk ) ) then
-
-         dir = -1
-
-         buff_id = 3 + dir
-         buff_len = nm2
-
-         do  i=1,nm2
-            buff(i,buff_id) = 0.0D0
-         enddo
-
-
-         dir = +1
-
-         buff_id = 3 + dir
-         buff_len = nm2
-
-         do  i=1,nm2
-            buff(i,buff_id) = 0.0D0
-         enddo
-
-
-         dir = -1
-
-         buff_id = 3 + dir
-         indx = 0
-
-         if( axis .eq.  1 )then
-            do  i3=1,n3
-               do  i2=1,n2
-                  indx = indx + 1
-                  u(n1,i2,i3) = buff(indx, buff_id )
-               enddo
-            enddo
-         endif
-
-         if( axis .eq.  2 )then
-            do  i3=1,n3
-               do  i1=1,n1
-                  indx = indx + 1
-                  u(i1,n2,i3) = buff(indx, buff_id )
-               enddo
-            enddo
-         endif
-
-         if( axis .eq.  3 )then
-            do  i2=1,n2
-               do  i1=1,n1
-                  indx = indx + 1
-                  u(i1,i2,n3) = buff(indx, buff_id )
-               enddo
-            enddo
-         endif
-
-         dir = +1
-
-         buff_id = 3 + dir
-         indx = 0
-
-         if( axis .eq.  1 )then
-            do  i3=1,n3
-               do  i2=1,n2
-                  do  i1=1,2
-                     indx = indx + 1
-                     u(i1,i2,i3) = buff(indx,buff_id)
-                  enddo
-               enddo
-            enddo
-         endif
-
-         if( axis .eq.  2 )then
-            do  i3=1,n3
-               do  i2=1,2
-                  do  i1=1,n1
-                     indx = indx + 1
-                     u(i1,i2,i3) = buff(indx,buff_id)
-                  enddo
-               enddo
-            enddo
-         endif
-
-         if( axis .eq.  3 )then
-            do  i3=1,2
-               do  i2=1,n2
-                  do  i1=1,n1
-                     indx = indx + 1
-                     u(i1,i2,i3) = buff(indx,buff_id)
-                  enddo
-               enddo
-            enddo
-         endif
-
-      endif
-
-      if( give_ex( axis, kk ) )then
-
-         dir = +1
-
-         buff_id = 2 + dir 
-         buff_len = 0
-
-         if( axis .eq.  1 )then
-            do  i3=1,n3
-               do  i2=1,n2
-                  do  i1=n1-1,n1
-                     buff_len = buff_len + 1
-                     buff(buff_len,buff_id)= u(i1,i2,i3)
-                  enddo
-               enddo
-            enddo
-         endif
-
-         if( axis .eq.  2 )then
-            do  i3=1,n3
-               do  i2=n2-1,n2
-                  do  i1=1,n1
-                     buff_len = buff_len + 1
-                     buff(buff_len,buff_id )= u(i1,i2,i3)
-                  enddo
-               enddo
-            enddo
-         endif
-
-         if( axis .eq.  3 )then
-            do  i3=n3-1,n3
-               do  i2=1,n2
-                  do  i1=1,n1
-                     buff_len = buff_len + 1
-                     buff(buff_len, buff_id ) = u( i1,i2,i3)
-                  enddo
-               enddo
-            enddo
-         endif
-
-         dir = -1
-
-         buff_id = 2 + dir 
-         buff_len = 0
-
-         if( axis .eq.  1 )then
-            do  i3=1,n3
-               do  i2=1,n2
-                  buff_len = buff_len + 1
-                  buff(buff_len,buff_id ) = u( 2,  i2,i3)
-               enddo
-            enddo
-         endif
-
-         if( axis .eq.  2 )then
-            do  i3=1,n3
-               do  i1=1,n1
-                  buff_len = buff_len + 1
-                  buff(buff_len, buff_id ) = u( i1,  2,i3)
-               enddo
-            enddo
-         endif
-
-         if( axis .eq.  3 )then
-            do  i2=1,n2
-               do  i1=1,n1
-                  buff_len = buff_len + 1
-                  buff(buff_len, buff_id ) = u( i1,i2,2)
-               enddo
-            enddo
-         endif
-
-      endif
-
-      do  i=1,nm2
-         buff(i,4) = buff(i,3)
-         buff(i,2) = buff(i,1)
-      enddo
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine zran3(z,n1,n2,n3,nx,ny,k)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     zran3  loads +1 at ten randomly chosen points,
-c     loads -1 at a different ten random points,
-c     and zero elsewhere.
-c---------------------------------------------------------------------
-      implicit none
-
-      include 'mpinpb.h'
-
-      integer  is1, is2, is3, ie1, ie2, ie3
-      common /grid/ is1,is2,is3,ie1,ie2,ie3
-
-      integer n1, n2, n3, k, nx, ny, ierr, i0, m0, m1
-      double precision z(n1,n2,n3)
-
-      integer mm, i1, i2, i3, d1, e1, e2, e3
-      double precision x, a
-      double precision xx, x0, x1, a1, a2, ai, power
-      parameter( mm = 10,  a = 5.D0 ** 13, x = 314159265.D0)
-      double precision ten( mm, 0:1 ), temp, best
-      integer i, j1( mm, 0:1 ), j2( mm, 0:1 ), j3( mm, 0:1 )
-      integer jg( 0:3, mm, 0:1 ), jg_temp(4)
-
-      external randlc
-      double precision randlc, rdummy
-
-      a1 = power( a, nx, 1, 0 )
-      a2 = power( a, nx, ny, 0 )
-
-      call zero3(z,n1,n2,n3)
-
-c      i = is1-2+nx*(is2-2+ny*(is3-2))
-
-      ai = power( a, nx, is2-2+ny*(is3-2), is1-2 )
-      d1 = ie1 - is1 + 1
-      e1 = ie1 - is1 + 2
-      e2 = ie2 - is2 + 2
-      e3 = ie3 - is3 + 2
-      x0 = x
-      rdummy = randlc( x0, ai )
-      do  i3 = 2, e3
-         x1 = x0
-         do  i2 = 2, e2
-            xx = x1
-            call vranlc( d1, xx, a, z( 2, i2, i3 ))
-            rdummy = randlc( x1, a1 )
-         enddo
-         rdummy = randlc( x0, a2 )
-      enddo
-
-c---------------------------------------------------------------------
-c       call comm3(z,n1,n2,n3)
-c       call showall(z,n1,n2,n3)
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     each processor looks for twenty candidates
-c---------------------------------------------------------------------
-      do  i=1,mm
-         ten( i, 1 ) = 0.0D0
-         j1( i, 1 ) = 0
-         j2( i, 1 ) = 0
-         j3( i, 1 ) = 0
-         ten( i, 0 ) = 1.0D0
-         j1( i, 0 ) = 0
-         j2( i, 0 ) = 0
-         j3( i, 0 ) = 0
-      enddo
-
-      do  i3=2,n3-1
-         do  i2=2,n2-1
-            do  i1=2,n1-1
-               if( z(i1,i2,i3) .gt. ten( 1, 1 ) )then
-                  ten(1,1) = z(i1,i2,i3) 
-                  j1(1,1) = i1
-                  j2(1,1) = i2
-                  j3(1,1) = i3
-                  call bubble( ten, j1, j2, j3, mm, 1 )
-               endif
-               if( z(i1,i2,i3) .lt. ten( 1, 0 ) )then
-                  ten(1,0) = z(i1,i2,i3) 
-                  j1(1,0) = i1
-                  j2(1,0) = i2
-                  j3(1,0) = i3
-                  call bubble( ten, j1, j2, j3, mm, 0 )
-               endif
-            enddo
-         enddo
-      enddo
-
-      call mpi_barrier(mpi_comm_world,ierr)
-
-c---------------------------------------------------------------------
-c     Now which of these are globally best?
-c---------------------------------------------------------------------
-      i1 = mm
-      i0 = mm
-      do  i=mm,1,-1
-
-         best = z( j1(i1,1), j2(i1,1), j3(i1,1) )
-         call mpi_allreduce(best,temp,1,dp_type,
-     >        mpi_max,mpi_comm_world,ierr)
-         best = temp
-         if(best.eq.z(j1(i1,1),j2(i1,1),j3(i1,1)))then
-            jg( 0, i, 1) = me
-            jg( 1, i, 1) = is1 - 2 + j1( i1, 1 ) 
-            jg( 2, i, 1) = is2 - 2 + j2( i1, 1 ) 
-            jg( 3, i, 1) = is3 - 2 + j3( i1, 1 ) 
-            i1 = i1-1
-         else
-            jg( 0, i, 1) = 0
-            jg( 1, i, 1) = 0
-            jg( 2, i, 1) = 0
-            jg( 3, i, 1) = 0
-         endif
-         ten( i, 1 ) = best
-         call mpi_allreduce(jg(0,i,1), jg_temp,4,MPI_INTEGER,
-     >        mpi_max,mpi_comm_world,ierr)
-         jg( 0, i, 1) =  jg_temp(1)
-         jg( 1, i, 1) =  jg_temp(2)
-         jg( 2, i, 1) =  jg_temp(3)
-         jg( 3, i, 1) =  jg_temp(4)
-
-         best = z( j1(i0,0), j2(i0,0), j3(i0,0) )
-         call mpi_allreduce(best,temp,1,dp_type,
-     >        mpi_min,mpi_comm_world,ierr)
-         best = temp
-         if(best.eq.z(j1(i0,0),j2(i0,0),j3(i0,0)))then
-            jg( 0, i, 0) = me
-            jg( 1, i, 0) = is1 - 2 + j1( i0, 0 ) 
-            jg( 2, i, 0) = is2 - 2 + j2( i0, 0 ) 
-            jg( 3, i, 0) = is3 - 2 + j3( i0, 0 ) 
-            i0 = i0-1
-         else
-            jg( 0, i, 0) = 0
-            jg( 1, i, 0) = 0
-            jg( 2, i, 0) = 0
-            jg( 3, i, 0) = 0
-         endif
-         ten( i, 0 ) = best
-         call mpi_allreduce(jg(0,i,0), jg_temp,4,MPI_INTEGER,
-     >        mpi_max,mpi_comm_world,ierr)
-         jg( 0, i, 0) =  jg_temp(1)
-         jg( 1, i, 0) =  jg_temp(2)
-         jg( 2, i, 0) =  jg_temp(3)
-         jg( 3, i, 0) =  jg_temp(4)
-
-      enddo
-      m1 = i1+1
-      m0 = i0+1
-
-c      if( me .eq. root) then
-c         write(*,*)' '
-c         write(*,*)' negative charges at'
-c         write(*,9)(jg(1,i,0),jg(2,i,0),jg(3,i,0),i=1,mm)
-c         write(*,*)' positive charges at'
-c         write(*,9)(jg(1,i,1),jg(2,i,1),jg(3,i,1),i=1,mm)
-c         write(*,*)' small random numbers were'
-c         write(*,8)(ten( i,0),i=mm,1,-1)
-c         write(*,*)' and they were found on processor number'
-c         write(*,7)(jg(0,i,0),i=mm,1,-1)
-c         write(*,*)' large random numbers were'
-c         write(*,8)(ten( i,1),i=mm,1,-1)
-c         write(*,*)' and they were found on processor number'
-c         write(*,7)(jg(0,i,1),i=mm,1,-1)
-c      endif
-c 9    format(5(' (',i3,2(',',i3),')'))
-c 8    format(5D15.8)
-c 7    format(10i4)
-      call mpi_barrier(mpi_comm_world,ierr)
-      do  i3=1,n3
-         do  i2=1,n2
-            do  i1=1,n1
-               z(i1,i2,i3) = 0.0D0
-            enddo
-         enddo
-      enddo
-      do  i=mm,m0,-1
-         z( j1(i,0), j2(i,0), j3(i,0) ) = -1.0D0
-      enddo
-      do  i=mm,m1,-1
-         z( j1(i,1), j2(i,1), j3(i,1) ) = +1.0D0
-      enddo
-      call comm3(z,n1,n2,n3,k)
-
-c---------------------------------------------------------------------
-c          call showall(z,n1,n2,n3)
-c---------------------------------------------------------------------
-
-      return 
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine show_l(z,n1,n2,n3)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-
-      integer n1,n2,n3,i1,i2,i3,ierr
-      double precision z(n1,n2,n3)
-      integer m1, m2, m3,i
-
-      m1 = min(n1,18)
-      m2 = min(n2,14)
-      m3 = min(n3,18)
-
-      write(*,*)'  '
-      do  i=0,nprocs-1
-         if( me .eq. i )then
-            write(*,*)' id = ', me
-            do  i3=1,m3
-               do  i1=1,m1
-                  write(*,6)(z(i1,i2,i3),i2=1,m2)
-               enddo
-               write(*,*)' - - - - - - - '
-            enddo
-            write(*,*)'  '
- 6          format(6f15.11)
-         endif
-         call mpi_barrier(mpi_comm_world,ierr)
-      enddo
-
-      return 
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine showall(z,n1,n2,n3)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-
-      integer n1,n2,n3,i1,i2,i3,i,ierr
-      double precision z(n1,n2,n3)
-      integer m1, m2, m3
-
-      m1 = min(n1,18)
-      m2 = min(n2,14)
-      m3 = min(n3,18)
-
-      write(*,*)'  '
-      do  i=0,nprocs-1
-         if( me .eq. i )then
-            write(*,*)' id = ', me
-            do  i3=1,m3
-               do  i1=1,m1
-                  write(*,6)(z(i1,i2,i3),i2=1,m2)
-               enddo
-               write(*,*)' - - - - - - - '
-            enddo
-            write(*,*)'  '
- 6          format(15f6.3)
-         endif
-         call mpi_barrier(mpi_comm_world,ierr)
-      enddo
-
-      return 
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine show(z,n1,n2,n3)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-      integer n1,n2,n3,i1,i2,i3,ierr,i
-      double precision z(n1,n2,n3)
-
-      write(*,*)'  '
-      do  i=0,nprocs-1
-         if( me .eq. i )then
-            write(*,*)' id = ', me
-            do  i3=2,n3-1
-               do  i1=2,n1-1
-                  write(*,6)(z(i1,i2,i3),i2=2,n1-1)
-               enddo
-               write(*,*)' - - - - - - - '
-            enddo
-            write(*,*)'  '
- 6          format(8D10.3)
-         endif
-         call mpi_barrier(mpi_comm_world,ierr)
-      enddo
-
-c     call comm3(z,n1,n2,n3)
-
-      return 
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      double precision function power( a, n1, n2, n3 )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     power  raises an integer, disguised as a double
-c     precision real, to an integer power.
-c     This version tries to avoid integer overflow by treating
-c     it as expressed in a form of "n1*n2+n3".
-c---------------------------------------------------------------------
-      implicit none
-
-      double precision a, aj
-      integer n1, n2, n3
-
-      integer n1j, n2j, nj
-      external randlc
-      double precision randlc, rdummy
-
-      power = 1.0d0
-      aj = a
-      nj = n3
-      n1j = n1
-      n2j = n2
- 100  continue
-
-      if( n2j .gt. 0 ) then
-         if( mod(n2j,2) .eq. 1 ) nj = nj + n1j
-         n2j = n2j/2
-      else if( nj .eq. 0 ) then
-         go to 200
-      endif
-      if( mod(nj,2) .eq. 1 ) rdummy =  randlc( power, aj )
-      rdummy = randlc( aj, aj )
-      nj = nj/2
-      go to 100
-
- 200  continue
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine bubble( ten, j1, j2, j3, m, ind )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c     bubble        does a bubble sort in direction dir
-c---------------------------------------------------------------------
-      implicit none
-
-      include 'mpinpb.h'
-
-      integer m, ind, j1( m, 0:1 ), j2( m, 0:1 ), j3( m, 0:1 )
-      double precision ten( m, 0:1 )
-      double precision temp
-      integer i, j_temp
-
-      if( ind .eq. 1 )then
-
-         do  i=1,m-1
-            if( ten(i,ind) .gt. ten(i+1,ind) )then
-
-               temp = ten( i+1, ind )
-               ten( i+1, ind ) = ten( i, ind )
-               ten( i, ind ) = temp
-
-               j_temp           = j1( i+1, ind )
-               j1( i+1, ind ) = j1( i,   ind )
-               j1( i,   ind ) = j_temp
-
-               j_temp           = j2( i+1, ind )
-               j2( i+1, ind ) = j2( i,   ind )
-               j2( i,   ind ) = j_temp
-
-               j_temp           = j3( i+1, ind )
-               j3( i+1, ind ) = j3( i,   ind )
-               j3( i,   ind ) = j_temp
-
-            else 
-               return
-            endif
-         enddo
-
-      else
-
-         do  i=1,m-1
-            if( ten(i,ind) .lt. ten(i+1,ind) )then
-
-               temp = ten( i+1, ind )
-               ten( i+1, ind ) = ten( i, ind )
-               ten( i, ind ) = temp
-
-               j_temp           = j1( i+1, ind )
-               j1( i+1, ind ) = j1( i,   ind )
-               j1( i,   ind ) = j_temp
-
-               j_temp           = j2( i+1, ind )
-               j2( i+1, ind ) = j2( i,   ind )
-               j2( i,   ind ) = j_temp
-
-               j_temp           = j3( i+1, ind )
-               j3( i+1, ind ) = j3( i,   ind )
-               j3( i,   ind ) = j_temp
-
-            else 
-               return
-            endif
-         enddo
-
-      endif
-
-      return
-      end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine zero3(z,n1,n2,n3)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-      include 'mpinpb.h'
-
-      integer n1, n2, n3
-      double precision z(n1,n2,n3)
-      integer i1, i2, i3
-
-      do  i3=1,n3
-         do  i2=1,n2
-            do  i1=1,n1
-               z(i1,i2,i3)=0.0D0
-            enddo
-         enddo
-      enddo
-
-      return
-      end
-
-
-c----- end of program ------------------------------------------------
diff --git a/examples/smpi/NAS/MG/mg.input.sample b/examples/smpi/NAS/MG/mg.input.sample
deleted file mode 100644
index a4dcf81275..0000000000
--- a/examples/smpi/NAS/MG/mg.input.sample
+++ /dev/null
@@ -1,4 +0,0 @@
- 8 = top level
- 256 256 256 = nx ny nz
- 20 = nit
- 0 0 0 0 0 0 0 0 = debug_vec
diff --git a/examples/smpi/NAS/MG/mpinpb.h b/examples/smpi/NAS/MG/mpinpb.h
deleted file mode 100644
index 1f0368c0b7..0000000000
--- a/examples/smpi/NAS/MG/mpinpb.h
+++ /dev/null
@@ -1,9 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include           'mpif.h'
-
-      integer           me, nprocs, root, dp_type
-      common /mpistuff/ me, nprocs, root, dp_type
-
diff --git a/examples/smpi/NAS/Makefile b/examples/smpi/NAS/Makefile
index f40f6b1226..7f1bee88a4 100644
--- a/examples/smpi/NAS/Makefile
+++ b/examples/smpi/NAS/Makefile
@@ -8,26 +8,6 @@ SFILE=config/suite.def
 default: header
 	@ sys/print_instructions
 
-BT: bt
-bt: header
-	cd BT; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) SUBTYPE=$(SUBTYPE) VERSION=$(VERSION)
-
-SP: sp
-sp: header
-	cd SP; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
-
-LU: lu
-lu: header
-	cd LU; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) VERSION=$(VERSION)
-
-MG: mg
-mg: header
-	cd MG; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
-
-FT: ft
-ft: header
-	cd FT; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
-
 IS: is
 is: header
 	cd IS; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
@@ -36,10 +16,6 @@ IS-trace: is-trace
 is-trace: header
 	cd IS-trace; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
 
-CG: cg
-cg: header
-	cd CG; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
-
 EP: ep
 ep: header
 	cd EP; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
diff --git a/examples/smpi/NAS/SP/Makefile b/examples/smpi/NAS/SP/Makefile
deleted file mode 100644
index 01508aa935..0000000000
--- a/examples/smpi/NAS/SP/Makefile
+++ /dev/null
@@ -1,60 +0,0 @@
-SHELL=/bin/sh
-BENCHMARK=sp
-BENCHMARKU=SP
-
-include ../config/make.def
-
-
-OBJS = sp.o make_set.o initialize.o exact_solution.o exact_rhs.o \
-       set_constants.o adi.o define.o copy_faces.o rhs.o      \
-       lhsx.o lhsy.o lhsz.o x_solve.o ninvr.o y_solve.o pinvr.o    \
-       z_solve.o tzetar.o add.o txinvr.o error.o verify.o setup_mpi.o \
-       ${COMMON}/print_results.o ${COMMON}/timers.o
-
-include ../sys/make.common
-
-# npbparams.h is included by header.h
-# The following rule should do the trick but many make programs (not gmake)
-# will do the wrong thing and rebuild the world every time (because the
-# mod time on header.h is not changed. One solution would be to 
-# touch header.h but this might cause confusion if someone has
-# accidentally deleted it. Instead, make the dependency on npbparams.h
-# explicit in all the lines below (even though dependence is indirect). 
-
-# header.h: npbparams.h
-
-${PROGRAM}: config ${OBJS}
-	${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
-
-.f.o:
-	${FCOMPILE} $<
-
-sp.o:             sp.f  header.h npbparams.h  mpinpb.h
-make_set.o:       make_set.f  header.h npbparams.h  mpinpb.h
-initialize.o:     initialize.f  header.h npbparams.h
-exact_solution.o: exact_solution.f  header.h npbparams.h
-exact_rhs.o:      exact_rhs.f  header.h npbparams.h
-set_constants.o:  set_constants.f  header.h npbparams.h
-adi.o:            adi.f  header.h npbparams.h
-define.o:         define.f  header.h npbparams.h
-copy_faces.o:     copy_faces.f  header.h npbparams.h  mpinpb.h
-rhs.o:            rhs.f  header.h npbparams.h
-lhsx.o:           lhsx.f  header.h npbparams.h
-lhsy.o:           lhsy.f  header.h npbparams.h
-lhsz.o:           lhsz.f  header.h npbparams.h
-x_solve.o:        x_solve.f  header.h npbparams.h  mpinpb.h
-ninvr.o:          ninvr.f  header.h npbparams.h
-y_solve.o:        y_solve.f  header.h npbparams.h  mpinpb.h
-pinvr.o:          pinvr.f  header.h npbparams.h
-z_solve.o:        z_solve.f  header.h npbparams.h  mpinpb.h
-tzetar.o:         tzetar.f  header.h npbparams.h
-add.o:            add.f  header.h npbparams.h
-txinvr.o:         txinvr.f  header.h npbparams.h
-error.o:          error.f  header.h npbparams.h  mpinpb.h
-verify.o:         verify.f  header.h npbparams.h  mpinpb.h
-setup_mpi.o:      setup_mpi.f mpinpb.h npbparams.h 
-
-
-clean:
-	- rm -f *.o *~ mputil*
-	- rm -f npbparams.h core
diff --git a/examples/smpi/NAS/SP/README b/examples/smpi/NAS/SP/README
deleted file mode 100644
index fe423db43f..0000000000
--- a/examples/smpi/NAS/SP/README
+++ /dev/null
@@ -1,17 +0,0 @@
-
-This code implements a 3D Multi-partition algorithm for the solution 
-of the uncoupled systems of linear equations resulting from 
-Beam-Warming approximate factorization.  Consequently, the program 
-must be run on a square number of processors.  The included file 
-"npbparams.h" contains a parameter statement which sets "maxcells" 
-and "problem_size".  The parameter maxcells must be set to the 
-square root of the number of processors.  For example, if running 
-on 25 processors, then set max_cells=5.  The standard problem sizes 
-are problem_size=64 for class A, 102 for class B, and 162 for class C.
-
-The number of time steps and the time step size dt are set in the 
-npbparams.h but may be overridden in the input deck "inputsp.data".  
-The number of time steps is 400 for all three 
-standard problems, and the appropriate time step sizes "dt" are 
-0.0015d0 for class A, 0.001d0 for class B, and 0.00067 for class C.  
-
diff --git a/examples/smpi/NAS/SP/add.f b/examples/smpi/NAS/SP/add.f
deleted file mode 100644
index cdc4765cbf..0000000000
--- a/examples/smpi/NAS/SP/add.f
+++ /dev/null
@@ -1,31 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine  add
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c addition of update to the vector u
-c---------------------------------------------------------------------
-
-       include 'header.h'
-
-       integer  c, i, j, k, m
-
-       do  c = 1, ncells
-          do m = 1, 5
-             do  k = start(3,c), cell_size(3,c)-end(3,c)-1
-                do  j = start(2,c), cell_size(2,c)-end(2,c)-1
-                   do  i = start(1,c), cell_size(1,c)-end(1,c)-1
-                      u(i,j,k,m,c) = u(i,j,k,m,c) + rhs(i,j,k,m,c)
-                   end do
-                end do
-             end do
-          end do
-       end do
-
-       return
-       end
diff --git a/examples/smpi/NAS/SP/adi.f b/examples/smpi/NAS/SP/adi.f
deleted file mode 100644
index e55cfd60da..0000000000
--- a/examples/smpi/NAS/SP/adi.f
+++ /dev/null
@@ -1,24 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine  adi
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       call copy_faces
-
-       call txinvr
-
-       call x_solve
-
-       call y_solve
-
-       call z_solve
-
-       call add
-
-       return
-       end
-
diff --git a/examples/smpi/NAS/SP/copy_faces.f b/examples/smpi/NAS/SP/copy_faces.f
deleted file mode 100644
index 41824d2198..0000000000
--- a/examples/smpi/NAS/SP/copy_faces.f
+++ /dev/null
@@ -1,306 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine copy_faces
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c this function copies the face values of a variable defined on a set 
-c of cells to the overlap locations of the adjacent sets of cells. 
-c Because a set of cells interfaces in each direction with exactly one 
-c other set, we only need to fill six different buffers. We could try to 
-c overlap communication with computation, by computing
-c some internal values while communicating boundary values, but this
-c adds so much overhead that it's not clearly useful. 
-c---------------------------------------------------------------------
-
-       include 'header.h'
-       include 'mpinpb.h'
-
-       integer i, j, k, c, m, requests(0:11), p0, p1, 
-     >         p2, p3, p4, p5, b_size(0:5), ss(0:5), 
-     >         sr(0:5), error, statuses(MPI_STATUS_SIZE, 0:11)
-
-c---------------------------------------------------------------------
-c      exit immediately if there are no faces to be copied           
-c---------------------------------------------------------------------
-       if (no_nodes .eq. 1) then
-          call compute_rhs
-          return
-       endif
-
-
-       ss(0) = start_send_east
-       ss(1) = start_send_west
-       ss(2) = start_send_north
-       ss(3) = start_send_south
-       ss(4) = start_send_top
-       ss(5) = start_send_bottom
-
-       sr(0) = start_recv_east
-       sr(1) = start_recv_west
-       sr(2) = start_recv_north
-       sr(3) = start_recv_south
-       sr(4) = start_recv_top
-       sr(5) = start_recv_bottom
-
-       b_size(0) = east_size   
-       b_size(1) = west_size   
-       b_size(2) = north_size  
-       b_size(3) = south_size  
-       b_size(4) = top_size    
-       b_size(5) = bottom_size 
-
-c---------------------------------------------------------------------
-c because the difference stencil for the diagonalized scheme is 
-c orthogonal, we do not have to perform the staged copying of faces, 
-c but can send all face information simultaneously to the neighboring 
-c cells in all directions          
-c---------------------------------------------------------------------
-       p0 = 0
-       p1 = 0
-       p2 = 0
-       p3 = 0
-       p4 = 0
-       p5 = 0
-
-       do  c = 1, ncells
-          do   m = 1, 5
-
-c---------------------------------------------------------------------
-c            fill the buffer to be sent to eastern neighbors (i-dir)
-c---------------------------------------------------------------------
-             if (cell_coord(1,c) .ne. ncells) then
-                do   k = 0, cell_size(3,c)-1
-                   do   j = 0, cell_size(2,c)-1
-                      do   i = cell_size(1,c)-2, cell_size(1,c)-1
-                         out_buffer(ss(0)+p0) = u(i,j,k,m,c)
-                         p0 = p0 + 1
-                      end do
-                   end do
-                end do
-             endif
-
-c---------------------------------------------------------------------
-c            fill the buffer to be sent to western neighbors 
-c---------------------------------------------------------------------
-             if (cell_coord(1,c) .ne. 1) then
-                do   k = 0, cell_size(3,c)-1
-                   do   j = 0, cell_size(2,c)-1
-                      do   i = 0, 1
-                         out_buffer(ss(1)+p1) = u(i,j,k,m,c)
-                         p1 = p1 + 1
-                      end do
-                   end do
-                end do
-
-
-             endif
-
-c---------------------------------------------------------------------
-c            fill the buffer to be sent to northern neighbors (j_dir)
-c---------------------------------------------------------------------
-             if (cell_coord(2,c) .ne. ncells) then
-                do   k = 0, cell_size(3,c)-1
-                   do   j = cell_size(2,c)-2, cell_size(2,c)-1
-                      do   i = 0, cell_size(1,c)-1
-                         out_buffer(ss(2)+p2) = u(i,j,k,m,c)
-                         p2 = p2 + 1
-                      end do
-                   end do
-                end do
-             endif
-
-c---------------------------------------------------------------------
-c            fill the buffer to be sent to southern neighbors 
-c---------------------------------------------------------------------
-             if (cell_coord(2,c).ne. 1) then
-                do   k = 0, cell_size(3,c)-1
-                   do   j = 0, 1
-                      do   i = 0, cell_size(1,c)-1   
-                         out_buffer(ss(3)+p3) = u(i,j,k,m,c)
-                         p3 = p3 + 1
-                      end do
-                   end do
-                end do
-             endif
-
-c---------------------------------------------------------------------
-c            fill the buffer to be sent to top neighbors (k-dir)
-c---------------------------------------------------------------------
-             if (cell_coord(3,c) .ne. ncells) then
-                do   k = cell_size(3,c)-2, cell_size(3,c)-1
-                   do   j = 0, cell_size(2,c)-1
-                      do   i = 0, cell_size(1,c)-1
-                         out_buffer(ss(4)+p4) = u(i,j,k,m,c)
-                         p4 = p4 + 1
-                      end do
-                   end do
-                end do
-             endif
-
-c---------------------------------------------------------------------
-c            fill the buffer to be sent to bottom neighbors
-c---------------------------------------------------------------------
-             if (cell_coord(3,c).ne. 1) then
-                 do    k=0, 1
-                    do   j = 0, cell_size(2,c)-1
-                       do   i = 0, cell_size(1,c)-1
-                          out_buffer(ss(5)+p5) = u(i,j,k,m,c)
-                          p5 = p5 + 1
-                       end do
-                    end do
-                 end do
-              endif
-
-c---------------------------------------------------------------------
-c          m loop
-c---------------------------------------------------------------------
-           end do
-
-c---------------------------------------------------------------------
-c       cell loop
-c---------------------------------------------------------------------
-        end do
-
-       call mpi_irecv(in_buffer(sr(0)), b_size(0), 
-     >                dp_type, successor(1), WEST,  
-     >                comm_rhs, requests(0), error)
-       call mpi_irecv(in_buffer(sr(1)), b_size(1), 
-     >                dp_type, predecessor(1), EAST,  
-     >                comm_rhs, requests(1), error)
-       call mpi_irecv(in_buffer(sr(2)), b_size(2), 
-     >                dp_type, successor(2), SOUTH, 
-     >                comm_rhs, requests(2), error)
-       call mpi_irecv(in_buffer(sr(3)), b_size(3), 
-     >                dp_type, predecessor(2), NORTH, 
-     >                comm_rhs, requests(3), error)
-       call mpi_irecv(in_buffer(sr(4)), b_size(4), 
-     >                dp_type, successor(3), BOTTOM,
-     >                comm_rhs, requests(4), error)
-       call mpi_irecv(in_buffer(sr(5)), b_size(5), 
-     >                dp_type, predecessor(3), TOP,   
-     >                comm_rhs, requests(5), error)
-
-       call mpi_isend(out_buffer(ss(0)), b_size(0), 
-     >                dp_type, successor(1),   EAST, 
-     >                comm_rhs, requests(6), error)
-       call mpi_isend(out_buffer(ss(1)), b_size(1), 
-     >                dp_type, predecessor(1), WEST, 
-     >                comm_rhs, requests(7), error)
-       call mpi_isend(out_buffer(ss(2)), b_size(2), 
-     >                dp_type,successor(2),   NORTH, 
-     >                comm_rhs, requests(8), error)
-       call mpi_isend(out_buffer(ss(3)), b_size(3), 
-     >                dp_type,predecessor(2), SOUTH, 
-     >                comm_rhs, requests(9), error)
-       call mpi_isend(out_buffer(ss(4)), b_size(4), 
-     >                dp_type,successor(3),   TOP, 
-     >                comm_rhs,   requests(10), error)
-       call mpi_isend(out_buffer(ss(5)), b_size(5), 
-     >                dp_type,predecessor(3), BOTTOM, 
-     >                comm_rhs,requests(11), error)
-
-
-       call mpi_waitall(12, requests, statuses, error)
-
-c---------------------------------------------------------------------
-c unpack the data that has just been received;             
-c---------------------------------------------------------------------
-       p0 = 0
-       p1 = 0
-       p2 = 0
-       p3 = 0
-       p4 = 0
-       p5 = 0
-
-       do   c = 1, ncells
-          do    m = 1, 5
-
-             if (cell_coord(1,c) .ne. 1) then
-                do   k = 0, cell_size(3,c)-1
-                   do   j = 0, cell_size(2,c)-1
-                      do   i = -2, -1
-                         u(i,j,k,m,c) = in_buffer(sr(1)+p0)
-                         p0 = p0 + 1
-                      end do
-                   end do
-                end do
-             endif
-
-             if (cell_coord(1,c) .ne. ncells) then
-                do  k = 0, cell_size(3,c)-1
-                   do  j = 0, cell_size(2,c)-1
-                      do  i = cell_size(1,c), cell_size(1,c)+1
-                         u(i,j,k,m,c) = in_buffer(sr(0)+p1)
-                         p1 = p1 + 1
-                      end do
-                   end do
-                end do
-             end if
- 
-             if (cell_coord(2,c) .ne. 1) then
-                do  k = 0, cell_size(3,c)-1
-                   do   j = -2, -1
-                      do  i = 0, cell_size(1,c)-1
-                         u(i,j,k,m,c) = in_buffer(sr(3)+p2)
-                         p2 = p2 + 1
-                      end do
-                   end do
-                end do
-
-             endif
- 
-             if (cell_coord(2,c) .ne. ncells) then
-                do  k = 0, cell_size(3,c)-1
-                   do   j = cell_size(2,c), cell_size(2,c)+1
-                      do  i = 0, cell_size(1,c)-1
-                         u(i,j,k,m,c) = in_buffer(sr(2)+p3)
-                         p3 = p3 + 1
-                      end do
-                   end do
-                end do
-             endif
-
-             if (cell_coord(3,c) .ne. 1) then
-                do  k = -2, -1
-                   do  j = 0, cell_size(2,c)-1
-                      do  i = 0, cell_size(1,c)-1
-                         u(i,j,k,m,c) = in_buffer(sr(5)+p4)
-                         p4 = p4 + 1
-                      end do
-                   end do
-                end do
-             endif
-
-             if (cell_coord(3,c) .ne. ncells) then
-                do  k = cell_size(3,c), cell_size(3,c)+1
-                   do  j = 0, cell_size(2,c)-1
-                      do  i = 0, cell_size(1,c)-1
-                         u(i,j,k,m,c) = in_buffer(sr(4)+p5)
-                         p5 = p5 + 1
-                      end do
-                   end do
-                end do
-             endif
-
-c---------------------------------------------------------------------
-c         m loop            
-c---------------------------------------------------------------------
-          end do
-
-c---------------------------------------------------------------------
-c      cells loop
-c---------------------------------------------------------------------
-       end do
-
-c---------------------------------------------------------------------
-c now that we have all the data, compute the rhs
-c---------------------------------------------------------------------
-       call compute_rhs
-
-       return
-       end
diff --git a/examples/smpi/NAS/SP/define.f b/examples/smpi/NAS/SP/define.f
deleted file mode 100644
index c465533f9a..0000000000
--- a/examples/smpi/NAS/SP/define.f
+++ /dev/null
@@ -1,66 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine compute_buffer_size(dim)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       include 'header.h'
-
-       integer  c, dim, face_size
-
-       if (ncells .eq. 1) return
-
-c---------------------------------------------------------------------
-c      compute the actual sizes of the buffers; note that there is 
-c      always one cell face that doesn't need buffer space, because it 
-c      is at the boundary of the grid
-c---------------------------------------------------------------------
-
-       west_size = 0
-       east_size = 0
-
-       do   c = 1, ncells
-          face_size = cell_size(2,c) * cell_size(3,c) * dim * 2
-          if (cell_coord(1,c).ne.1) west_size = west_size + face_size
-          if (cell_coord(1,c).ne.ncells) east_size = east_size + 
-     >                                                 face_size 
-       end do
-
-       north_size = 0
-       south_size = 0
-       do   c = 1, ncells
-          face_size = cell_size(1,c)*cell_size(3,c) * dim * 2
-          if (cell_coord(2,c).ne.1) south_size = south_size + face_size
-          if (cell_coord(2,c).ne.ncells) north_size = north_size + 
-     >                                                  face_size 
-       end do
-
-       top_size = 0
-       bottom_size = 0
-       do   c = 1, ncells
-          face_size = cell_size(1,c) * cell_size(2,c) * dim * 2
-          if (cell_coord(3,c).ne.1) bottom_size = bottom_size + 
-     >                                            face_size
-          if (cell_coord(3,c).ne.ncells) top_size = top_size +
-     >                                                face_size     
-       end do
-
-       start_send_west   = 1
-       start_send_east   = start_send_west   + west_size
-       start_send_south  = start_send_east   + east_size
-       start_send_north  = start_send_south  + south_size
-       start_send_bottom = start_send_north  + north_size
-       start_send_top    = start_send_bottom + bottom_size
-       start_recv_west   = 1
-       start_recv_east   = start_recv_west   + west_size
-       start_recv_south  = start_recv_east   + east_size
-       start_recv_north  = start_recv_south  + south_size
-       start_recv_bottom = start_recv_north  + north_size
-       start_recv_top    = start_recv_bottom + bottom_size
-
-       return
-       end
-
diff --git a/examples/smpi/NAS/SP/error.f b/examples/smpi/NAS/SP/error.f
deleted file mode 100644
index fd9aab37b3..0000000000
--- a/examples/smpi/NAS/SP/error.f
+++ /dev/null
@@ -1,105 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine error_norm(rms)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c this function computes the norm of the difference between the
-c computed solution and the exact solution
-c---------------------------------------------------------------------
-
-       include 'header.h'
-       include 'mpinpb.h'
-
-       integer c, i, j, k, m, ii, jj, kk, d, error
-       double precision xi, eta, zeta, u_exact(5), rms(5), rms_work(5),
-     >                  add
-
-       do   m = 1, 5 
-          rms_work(m) = 0.0d0
-       end do
-
-       do   c = 1, ncells
-          kk = 0
-          do   k = cell_low(3,c), cell_high(3,c)
-             zeta = dble(k) * dnzm1
-             jj = 0
-             do   j = cell_low(2,c), cell_high(2,c)
-                eta = dble(j) * dnym1
-                ii = 0
-                do   i = cell_low(1,c), cell_high(1,c)
-                   xi = dble(i) * dnxm1
-                   call exact_solution(xi, eta, zeta, u_exact)
-
-                   do   m = 1, 5
-                      add = u(ii,jj,kk,m,c)-u_exact(m)
-                      rms_work(m) = rms_work(m) + add*add
-                   end do
-                   ii = ii + 1
-                end do
-                jj = jj + 1
-             end do
-             kk = kk + 1
-          end do
-       end do
-
-       call mpi_allreduce(rms_work, rms, 5, dp_type, 
-     >                 MPI_SUM, comm_setup, error)
-
-       do    m = 1, 5
-          do    d = 1, 3
-             rms(m) = rms(m) / dble(grid_points(d)-2)
-          end do
-          rms(m) = dsqrt(rms(m))
-       end do
-
-       return
-       end
-
-
-
-       subroutine rhs_norm(rms)
-
-       include 'header.h'
-       include 'mpinpb.h'
-
-       integer c, i, j, k, d, m, error
-       double precision rms(5), rms_work(5), add
-
-       do    m = 1, 5
-          rms_work(m) = 0.0d0
-       end do
-
-       do   c = 1, ncells
-          do   k = start(3,c), cell_size(3,c)-end(3,c)-1
-             do   j = start(2,c), cell_size(2,c)-end(2,c)-1
-                do   i = start(1,c), cell_size(1,c)-end(1,c)-1
-                   do   m = 1, 5
-                      add = rhs(i,j,k,m,c)
-                      rms_work(m) = rms_work(m) + add*add
-                   end do
-                end do
-             end do
-          end do
-       end do
-
-
-
-       call mpi_allreduce(rms_work, rms, 5, dp_type, 
-     >                 MPI_SUM, comm_setup, error)
-
-       do   m = 1, 5
-          do   d = 1, 3
-             rms(m) = rms(m) / dble(grid_points(d)-2)
-          end do
-          rms(m) = dsqrt(rms(m))
-       end do
-
-       return
-       end
-
-
diff --git a/examples/smpi/NAS/SP/exact_rhs.f b/examples/smpi/NAS/SP/exact_rhs.f
deleted file mode 100644
index b589668126..0000000000
--- a/examples/smpi/NAS/SP/exact_rhs.f
+++ /dev/null
@@ -1,363 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine exact_rhs
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c compute the right hand side based on exact solution
-c---------------------------------------------------------------------
-
-       include 'header.h'
-
-       double precision dtemp(5), xi, eta, zeta, dtpp
-       integer          c, m, i, j, k, ip1, im1, jp1, 
-     >                  jm1, km1, kp1
-
-c---------------------------------------------------------------------
-c loop over all cells owned by this node                   
-c---------------------------------------------------------------------
-       do   c = 1, ncells
-
-c---------------------------------------------------------------------
-c         initialize                                  
-c---------------------------------------------------------------------
-          do   m = 1, 5
-             do   k= 0, cell_size(3,c)-1
-                do   j = 0, cell_size(2,c)-1
-                   do   i = 0, cell_size(1,c)-1
-                      forcing(i,j,k,m,c) = 0.0d0
-                   end do
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c xi-direction flux differences                      
-c---------------------------------------------------------------------
-          do   k = start(3,c), cell_size(3,c)-end(3,c)-1
-             zeta = dble(k+cell_low(3,c)) * dnzm1
-             do   j = start(2,c), cell_size(2,c)-end(2,c)-1
-                eta = dble(j+cell_low(2,c)) * dnym1
-
-                do  i=-2*(1-start(1,c)), cell_size(1,c)+1-2*end(1,c)
-                   xi = dble(i+cell_low(1,c)) * dnxm1
-
-                   call exact_solution(xi, eta, zeta, dtemp)
-                   do  m = 1, 5
-                      ue(i,m) = dtemp(m)
-                   end do
-
-                   dtpp = 1.0d0 / dtemp(1)
-
-                   do  m = 2, 5
-                      buf(i,m) = dtpp * dtemp(m)
-                   end do
-
-                   cuf(i)   = buf(i,2) * buf(i,2)
-                   buf(i,1) = cuf(i) + buf(i,3) * buf(i,3) + 
-     >                        buf(i,4) * buf(i,4) 
-                   q(i) = 0.5d0*(buf(i,2)*ue(i,2) + buf(i,3)*ue(i,3) +
-     >                           buf(i,4)*ue(i,4))
-
-                end do
- 
-                do  i = start(1,c), cell_size(1,c)-end(1,c)-1
-                   im1 = i-1
-                   ip1 = i+1
-
-                   forcing(i,j,k,1,c) = forcing(i,j,k,1,c) -
-     >                 tx2*( ue(ip1,2)-ue(im1,2) )+
-     >                 dx1tx1*(ue(ip1,1)-2.0d0*ue(i,1)+ue(im1,1))
-
-                   forcing(i,j,k,2,c) = forcing(i,j,k,2,c) - tx2 * (
-     >                (ue(ip1,2)*buf(ip1,2)+c2*(ue(ip1,5)-q(ip1)))-
-     >                (ue(im1,2)*buf(im1,2)+c2*(ue(im1,5)-q(im1))))+
-     >                 xxcon1*(buf(ip1,2)-2.0d0*buf(i,2)+buf(im1,2))+
-     >                 dx2tx1*( ue(ip1,2)-2.0d0* ue(i,2)+ue(im1,2))
-
-                   forcing(i,j,k,3,c) = forcing(i,j,k,3,c) - tx2 * (
-     >                 ue(ip1,3)*buf(ip1,2)-ue(im1,3)*buf(im1,2))+
-     >                 xxcon2*(buf(ip1,3)-2.0d0*buf(i,3)+buf(im1,3))+
-     >                 dx3tx1*( ue(ip1,3)-2.0d0*ue(i,3) +ue(im1,3))
-                  
-                   forcing(i,j,k,4,c) = forcing(i,j,k,4,c) - tx2*(
-     >                 ue(ip1,4)*buf(ip1,2)-ue(im1,4)*buf(im1,2))+
-     >                 xxcon2*(buf(ip1,4)-2.0d0*buf(i,4)+buf(im1,4))+
-     >                 dx4tx1*( ue(ip1,4)-2.0d0* ue(i,4)+ ue(im1,4))
-
-                   forcing(i,j,k,5,c) = forcing(i,j,k,5,c) - tx2*(
-     >                 buf(ip1,2)*(c1*ue(ip1,5)-c2*q(ip1))-
-     >                 buf(im1,2)*(c1*ue(im1,5)-c2*q(im1)))+
-     >                 0.5d0*xxcon3*(buf(ip1,1)-2.0d0*buf(i,1)+
-     >                               buf(im1,1))+
-     >                 xxcon4*(cuf(ip1)-2.0d0*cuf(i)+cuf(im1))+
-     >                 xxcon5*(buf(ip1,5)-2.0d0*buf(i,5)+buf(im1,5))+
-     >                 dx5tx1*( ue(ip1,5)-2.0d0* ue(i,5)+ ue(im1,5))
-                end do
-
-c---------------------------------------------------------------------
-c Fourth-order dissipation                         
-c---------------------------------------------------------------------
-                if (start(1,c) .gt. 0) then
-                   do   m = 1, 5
-                      i = 1
-                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
-     >                    (5.0d0*ue(i,m) - 4.0d0*ue(i+1,m) +ue(i+2,m))
-                      i = 2
-                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
-     >                   (-4.0d0*ue(i-1,m) + 6.0d0*ue(i,m) -
-     >                     4.0d0*ue(i+1,m) +       ue(i+2,m))
-                   end do
-                endif
-
-                do   m = 1, 5
-                   do  i = start(1,c)*3, cell_size(1,c)-3*end(1,c)-1
-                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp*
-     >                   (ue(i-2,m) - 4.0d0*ue(i-1,m) +
-     >                    6.0d0*ue(i,m) - 4.0d0*ue(i+1,m) + ue(i+2,m))
-                   end do
-                end do
-
-                if (end(1,c) .gt. 0) then
-                   do   m = 1, 5
-                      i = cell_size(1,c)-3
-                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
-     >                   (ue(i-2,m) - 4.0d0*ue(i-1,m) +
-     >                    6.0d0*ue(i,m) - 4.0d0*ue(i+1,m))
-                      i = cell_size(1,c)-2
-                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
-     >                   (ue(i-2,m) - 4.0d0*ue(i-1,m) + 5.0d0*ue(i,m))
-                   end do
-                endif
-
-             end do
-          end do
-c---------------------------------------------------------------------
-c  eta-direction flux differences             
-c---------------------------------------------------------------------
-          do   k = start(3,c), cell_size(3,c)-end(3,c)-1          
-             zeta = dble(k+cell_low(3,c)) * dnzm1
-             do   i=start(1,c), cell_size(1,c)-end(1,c)-1
-                xi = dble(i+cell_low(1,c)) * dnxm1
-
-                do  j=-2*(1-start(2,c)), cell_size(2,c)+1-2*end(2,c)
-                   eta = dble(j+cell_low(2,c)) * dnym1
-
-                   call exact_solution(xi, eta, zeta, dtemp)
-                   do   m = 1, 5 
-                      ue(j,m) = dtemp(m)
-                   end do
-                   dtpp = 1.0d0/dtemp(1)
-
-                   do  m = 2, 5
-                      buf(j,m) = dtpp * dtemp(m)
-                   end do
-
-                   cuf(j)   = buf(j,3) * buf(j,3)
-                   buf(j,1) = cuf(j) + buf(j,2) * buf(j,2) + 
-     >                        buf(j,4) * buf(j,4)
-                   q(j) = 0.5d0*(buf(j,2)*ue(j,2) + buf(j,3)*ue(j,3) +
-     >                           buf(j,4)*ue(j,4))
-                end do
-
-                do  j = start(2,c), cell_size(2,c)-end(2,c)-1
-                   jm1 = j-1
-                   jp1 = j+1
-                  
-                   forcing(i,j,k,1,c) = forcing(i,j,k,1,c) -
-     >                ty2*( ue(jp1,3)-ue(jm1,3) )+
-     >                dy1ty1*(ue(jp1,1)-2.0d0*ue(j,1)+ue(jm1,1))
-
-                   forcing(i,j,k,2,c) = forcing(i,j,k,2,c) - ty2*(
-     >                ue(jp1,2)*buf(jp1,3)-ue(jm1,2)*buf(jm1,3))+
-     >                yycon2*(buf(jp1,2)-2.0d0*buf(j,2)+buf(jm1,2))+
-     >                dy2ty1*( ue(jp1,2)-2.0* ue(j,2)+ ue(jm1,2))
-
-                   forcing(i,j,k,3,c) = forcing(i,j,k,3,c) - ty2*(
-     >                (ue(jp1,3)*buf(jp1,3)+c2*(ue(jp1,5)-q(jp1)))-
-     >                (ue(jm1,3)*buf(jm1,3)+c2*(ue(jm1,5)-q(jm1))))+
-     >                yycon1*(buf(jp1,3)-2.0d0*buf(j,3)+buf(jm1,3))+
-     >                dy3ty1*( ue(jp1,3)-2.0d0*ue(j,3) +ue(jm1,3))
-
-                   forcing(i,j,k,4,c) = forcing(i,j,k,4,c) - ty2*(
-     >                ue(jp1,4)*buf(jp1,3)-ue(jm1,4)*buf(jm1,3))+
-     >                yycon2*(buf(jp1,4)-2.0d0*buf(j,4)+buf(jm1,4))+
-     >                dy4ty1*( ue(jp1,4)-2.0d0*ue(j,4)+ ue(jm1,4))
-
-                   forcing(i,j,k,5,c) = forcing(i,j,k,5,c) - ty2*(
-     >                buf(jp1,3)*(c1*ue(jp1,5)-c2*q(jp1))-
-     >                buf(jm1,3)*(c1*ue(jm1,5)-c2*q(jm1)))+
-     >                0.5d0*yycon3*(buf(jp1,1)-2.0d0*buf(j,1)+
-     >                              buf(jm1,1))+
-     >                yycon4*(cuf(jp1)-2.0d0*cuf(j)+cuf(jm1))+
-     >                yycon5*(buf(jp1,5)-2.0d0*buf(j,5)+buf(jm1,5))+
-     >                dy5ty1*(ue(jp1,5)-2.0d0*ue(j,5)+ue(jm1,5))
-                end do
-
-c---------------------------------------------------------------------
-c Fourth-order dissipation                      
-c---------------------------------------------------------------------
-                if (start(2,c) .gt. 0) then
-                   do   m = 1, 5
-                      j = 1
-                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
-     >                    (5.0d0*ue(j,m) - 4.0d0*ue(j+1,m) +ue(j+2,m))
-                      j = 2
-                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
-     >                   (-4.0d0*ue(j-1,m) + 6.0d0*ue(j,m) -
-     >                     4.0d0*ue(j+1,m) +       ue(j+2,m))
-                   end do
-                endif
-
-                do   m = 1, 5
-                   do  j = start(2,c)*3, cell_size(2,c)-3*end(2,c)-1
-                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp*
-     >                   (ue(j-2,m) - 4.0d0*ue(j-1,m) +
-     >                    6.0d0*ue(j,m) - 4.0d0*ue(j+1,m) + ue(j+2,m))
-                   end do
-                end do
-                if (end(2,c) .gt. 0) then
-                   do   m = 1, 5
-                      j = cell_size(2,c)-3
-                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
-     >                   (ue(j-2,m) - 4.0d0*ue(j-1,m) +
-     >                    6.0d0*ue(j,m) - 4.0d0*ue(j+1,m))
-                      j = cell_size(2,c)-2
-                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
-     >                   (ue(j-2,m) - 4.0d0*ue(j-1,m) + 5.0d0*ue(j,m))
-
-                   end do
-                endif
-
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c zeta-direction flux differences                      
-c---------------------------------------------------------------------
-          do  j=start(2,c), cell_size(2,c)-end(2,c)-1
-             eta = dble(j+cell_low(2,c)) * dnym1
-             do   i = start(1,c), cell_size(1,c)-end(1,c)-1
-                xi = dble(i+cell_low(1,c)) * dnxm1
-
-                do k=-2*(1-start(3,c)), cell_size(3,c)+1-2*end(3,c)
-                   zeta = dble(k+cell_low(3,c)) * dnzm1
-
-                   call exact_solution(xi, eta, zeta, dtemp)
-                   do   m = 1, 5
-                      ue(k,m) = dtemp(m)
-                   end do
-
-                   dtpp = 1.0d0/dtemp(1)
-
-                   do   m = 2, 5
-                      buf(k,m) = dtpp * dtemp(m)
-                   end do
-
-                   cuf(k)   = buf(k,4) * buf(k,4)
-                   buf(k,1) = cuf(k) + buf(k,2) * buf(k,2) + 
-     >                        buf(k,3) * buf(k,3)
-                   q(k) = 0.5d0*(buf(k,2)*ue(k,2) + buf(k,3)*ue(k,3) +
-     >                           buf(k,4)*ue(k,4))
-                end do
-
-                do    k=start(3,c), cell_size(3,c)-end(3,c)-1
-                   km1 = k-1
-                   kp1 = k+1
-                  
-                   forcing(i,j,k,1,c) = forcing(i,j,k,1,c) -
-     >                 tz2*( ue(kp1,4)-ue(km1,4) )+
-     >                 dz1tz1*(ue(kp1,1)-2.0d0*ue(k,1)+ue(km1,1))
-
-                   forcing(i,j,k,2,c) = forcing(i,j,k,2,c) - tz2 * (
-     >                 ue(kp1,2)*buf(kp1,4)-ue(km1,2)*buf(km1,4))+
-     >                 zzcon2*(buf(kp1,2)-2.0d0*buf(k,2)+buf(km1,2))+
-     >                 dz2tz1*( ue(kp1,2)-2.0d0* ue(k,2)+ ue(km1,2))
-
-                   forcing(i,j,k,3,c) = forcing(i,j,k,3,c) - tz2 * (
-     >                 ue(kp1,3)*buf(kp1,4)-ue(km1,3)*buf(km1,4))+
-     >                 zzcon2*(buf(kp1,3)-2.0d0*buf(k,3)+buf(km1,3))+
-     >                 dz3tz1*(ue(kp1,3)-2.0d0*ue(k,3)+ue(km1,3))
-
-                   forcing(i,j,k,4,c) = forcing(i,j,k,4,c) - tz2 * (
-     >                (ue(kp1,4)*buf(kp1,4)+c2*(ue(kp1,5)-q(kp1)))-
-     >                (ue(km1,4)*buf(km1,4)+c2*(ue(km1,5)-q(km1))))+
-     >                zzcon1*(buf(kp1,4)-2.0d0*buf(k,4)+buf(km1,4))+
-     >                dz4tz1*( ue(kp1,4)-2.0d0*ue(k,4) +ue(km1,4))
-
-                   forcing(i,j,k,5,c) = forcing(i,j,k,5,c) - tz2 * (
-     >                 buf(kp1,4)*(c1*ue(kp1,5)-c2*q(kp1))-
-     >                 buf(km1,4)*(c1*ue(km1,5)-c2*q(km1)))+
-     >                 0.5d0*zzcon3*(buf(kp1,1)-2.0d0*buf(k,1)
-     >                              +buf(km1,1))+
-     >                 zzcon4*(cuf(kp1)-2.0d0*cuf(k)+cuf(km1))+
-     >                 zzcon5*(buf(kp1,5)-2.0d0*buf(k,5)+buf(km1,5))+
-     >                 dz5tz1*( ue(kp1,5)-2.0d0*ue(k,5)+ ue(km1,5))
-                end do
-
-c---------------------------------------------------------------------
-c Fourth-order dissipation                        
-c---------------------------------------------------------------------
-                if (start(3,c) .gt. 0) then
-                   do   m = 1, 5
-                      k = 1
-                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
-     >                    (5.0d0*ue(k,m) - 4.0d0*ue(k+1,m) +ue(k+2,m))
-                      k = 2
-                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
-     >                   (-4.0d0*ue(k-1,m) + 6.0d0*ue(k,m) -
-     >                     4.0d0*ue(k+1,m) +       ue(k+2,m))
-                   end do
-                endif
-
-                do   m = 1, 5
-                   do  k = start(3,c)*3, cell_size(3,c)-3*end(3,c)-1
-                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp*
-     >                   (ue(k-2,m) - 4.0d0*ue(k-1,m) +
-     >                    6.0d0*ue(k,m) - 4.0d0*ue(k+1,m) + ue(k+2,m))
-                   end do
-                end do
-
-                if (end(3,c) .gt. 0) then
-                   do    m = 1, 5
-                      k = cell_size(3,c)-3
-                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
-     >                   (ue(k-2,m) - 4.0d0*ue(k-1,m) +
-     >                    6.0d0*ue(k,m) - 4.0d0*ue(k+1,m))
-                      k = cell_size(3,c)-2
-                      forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
-     >                   (ue(k-2,m) - 4.0d0*ue(k-1,m) + 5.0d0*ue(k,m))
-                   end do
-                endif
-
-             end do
-          end do
-c---------------------------------------------------------------------
-c now change the sign of the forcing function, 
-c---------------------------------------------------------------------
-          do   m = 1, 5
-             do   k = start(3,c), cell_size(3,c)-end(3,c)-1
-                do   j = start(2,c), cell_size(2,c)-end(2,c)-1
-                   do   i = start(1,c), cell_size(1,c)-end(1,c)-1
-                      forcing(i,j,k,m,c) = -1.d0 * forcing(i,j,k,m,c)
-                   end do
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c      cell loop
-c---------------------------------------------------------------------
-       end do
-
-       return
-       end
-
-
-
-
-
diff --git a/examples/smpi/NAS/SP/exact_solution.f b/examples/smpi/NAS/SP/exact_solution.f
deleted file mode 100644
index 2644f0b8f9..0000000000
--- a/examples/smpi/NAS/SP/exact_solution.f
+++ /dev/null
@@ -1,30 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine exact_solution(xi,eta,zeta,dtemp)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c this function returns the exact solution at point xi, eta, zeta  
-c---------------------------------------------------------------------
-
-       include 'header.h'
-
-       double precision  xi, eta, zeta, dtemp(5)
-       integer m
-
-       do  m = 1, 5
-          dtemp(m) =  ce(m,1) +
-     >    xi*(ce(m,2) + xi*(ce(m,5) + xi*(ce(m,8) + xi*ce(m,11)))) +
-     >    eta*(ce(m,3) + eta*(ce(m,6) + eta*(ce(m,9) + eta*ce(m,12))))+
-     >    zeta*(ce(m,4) + zeta*(ce(m,7) + zeta*(ce(m,10) + 
-     >    zeta*ce(m,13))))
-       end do
-
-       return
-       end
-
-
diff --git a/examples/smpi/NAS/SP/header.h b/examples/smpi/NAS/SP/header.h
deleted file mode 100644
index 663515a104..0000000000
--- a/examples/smpi/NAS/SP/header.h
+++ /dev/null
@@ -1,113 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      implicit none
-
-c---------------------------------------------------------------------
-c The following include file is generated automatically by the
-c "setparams" utility. It defines 
-c      maxcells:      the square root of the maximum number of processors
-c      problem_size:  12, 64, 102, 162 (for class T, A, B, C)
-c      dt_default:    default time step for this problem size if no
-c                     config file
-c      niter_default: default number of iterations for this problem size
-c---------------------------------------------------------------------
-
-      include 'npbparams.h'
-
-      integer           ncells, grid_points(3)
-      common /global/   ncells, grid_points
-
-      double precision  tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, 
-     >                  dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, 
-     >                  dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, 
-     >                  ce(5,13), dxmax, dymax, dzmax, xxcon1, xxcon2, 
-     >                  xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
-     >                  dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
-     >                  yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
-     >                  zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, 
-     >                  dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, 
-     >                  dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, 
-     >                  c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
-     >                  dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, 
-     >                  c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, 
-     >                  c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16
-
-      common /constants/ tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3,
-     >                  dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, 
-     >                  dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, 
-     >                  ce, dxmax, dymax, dzmax, xxcon1, xxcon2, 
-     >                  xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
-     >                  dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
-     >                  yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
-     >                  zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, 
-     >                  dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, 
-     >                  dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, 
-     >                  c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
-     >                  dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, 
-     >                  c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, 
-     >                  c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16
-
-      integer           EAST, WEST, NORTH, SOUTH, 
-     >                  BOTTOM, TOP
-
-      parameter (EAST=2000, WEST=3000,      NORTH=4000, SOUTH=5000,
-     >           BOTTOM=6000, TOP=7000)
-
-      integer cell_coord (3,maxcells), cell_low (3,maxcells), 
-     >        cell_high  (3,maxcells), cell_size(3,maxcells),
-     >        predecessor(3),          slice    (3,maxcells),
-     >        grid_size  (3),          successor(3),
-     >        start      (3,maxcells), end      (3,maxcells)
-      common /partition/ cell_coord, cell_low, cell_high, cell_size,
-     >                   grid_size, successor, predecessor, slice,
-     >                   start, end
-
-      integer IMAX, JMAX, KMAX, MAX_CELL_DIM, BUF_SIZE, IMAXP, JMAXP
-
-      parameter (MAX_CELL_DIM = (problem_size/maxcells)+1)
-
-      parameter (IMAX=MAX_CELL_DIM,JMAX=MAX_CELL_DIM,KMAX=MAX_CELL_DIM)
-      parameter (IMAXP=IMAX/2*2+1,JMAXP=JMAX/2*2+1)
-
-c---------------------------------------------------------------------
-c +1 at end to avoid zero length arrays for 1 node
-c---------------------------------------------------------------------
-      parameter (BUF_SIZE=MAX_CELL_DIM*MAX_CELL_DIM*(maxcells-1)*60*2+1)
-
-      double precision 
-     >   u       (-2:IMAXP+1,-2:JMAXP+1,-2:KMAX+1, 5,maxcells),
-     >   us      (-1:IMAX,   -1:JMAX,   -1:KMAX,     maxcells),
-     >   vs      (-1:IMAX,   -1:JMAX,   -1:KMAX,     maxcells),
-     >   ws      (-1:IMAX,   -1:JMAX,   -1:KMAX,     maxcells),
-     >   qs      (-1:IMAX,   -1:JMAX,   -1:KMAX,     maxcells),
-     >   ainv    (-1:IMAX,   -1:JMAX,   -1:KMAX,     maxcells),
-     >   rho_i   (-1:IMAX,   -1:JMAX,   -1:KMAX,     maxcells),
-     >   speed   (-1:IMAX,   -1:JMAX,   -1:KMAX,     maxcells),
-     >   square  (-1:IMAX,   -1:JMAX,   -1:KMAX,     maxcells),
-     >   rhs     ( 0:IMAXP-1, 0:JMAXP-1, 0:KMAX-1, 5,maxcells),
-     >   forcing ( 0:IMAXP-1, 0:JMAXP-1, 0:KMAX-1, 5,maxcells),
-     >   lhs     ( 0:IMAXP-1, 0:JMAXP-1, 0:KMAX-1,15,maxcells),
-     >   in_buffer(BUF_SIZE), out_buffer(BUF_SIZE)
-      common /fields/  u, us, vs, ws, qs, ainv, rho_i, speed, square, 
-     >                 rhs, forcing, lhs, in_buffer, out_buffer
-
-      double precision cv(-2:MAX_CELL_DIM+1),   rhon(-2:MAX_CELL_DIM+1),
-     >                 rhos(-2:MAX_CELL_DIM+1), rhoq(-2:MAX_CELL_DIM+1),
-     >                 cuf(-2:MAX_CELL_DIM+1),  q(-2:MAX_CELL_DIM+1),
-     >                 ue(-2:MAX_CELL_DIM+1,5), buf(-2:MAX_CELL_DIM+1,5)
-      common /work_1d/ cv, rhon, rhos, rhoq, cuf, q, ue, buf
-
-      integer  west_size, east_size, bottom_size, top_size,
-     >         north_size, south_size, start_send_west, 
-     >         start_send_east, start_send_south, start_send_north,
-     >         start_send_bottom, start_send_top, start_recv_west,
-     >         start_recv_east, start_recv_south, start_recv_north,
-     >         start_recv_bottom, start_recv_top
-      common /box/ west_size, east_size, bottom_size,
-     >             top_size, north_size, south_size, 
-     >             start_send_west, start_send_east, start_send_south,
-     >             start_send_north, start_send_bottom, start_send_top,
-     >             start_recv_west, start_recv_east, start_recv_south,
-     >             start_recv_north, start_recv_bottom, start_recv_top
diff --git a/examples/smpi/NAS/SP/initialize.f b/examples/smpi/NAS/SP/initialize.f
deleted file mode 100644
index 655c8d9369..0000000000
--- a/examples/smpi/NAS/SP/initialize.f
+++ /dev/null
@@ -1,286 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine  initialize
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c This subroutine initializes the field variable u using 
-c tri-linear transfinite interpolation of the boundary values     
-c---------------------------------------------------------------------
-
-       include 'header.h'
-  
-       integer c, i, j, k, m, ii, jj, kk, ix, iy, iz
-       double precision  xi, eta, zeta, Pface(5,3,2), Pxi, Peta, 
-     >                   Pzeta, temp(5)
-
-
-c---------------------------------------------------------------------
-c  Later (in compute_rhs) we compute 1/u for every element. A few of 
-c  the corner elements are not used, but it convenient (and faster) 
-c  to compute the whole thing with a simple loop. Make sure those 
-c  values are nonzero by initializing the whole thing here. 
-c---------------------------------------------------------------------
-      do c = 1, ncells
-         do kk = -1, IMAX
-            do jj = -1, IMAX
-               do ii = -1, IMAX
-                  u(ii, jj, kk, 1, c) = 1.0
-                  u(ii, jj, kk, 2, c) = 0.0
-                  u(ii, jj, kk, 3, c) = 0.0
-                  u(ii, jj, kk, 4, c) = 0.0
-                  u(ii, jj, kk, 5, c) = 1.0
-               end do
-            end do
-         end do
-      end do
-
-c---------------------------------------------------------------------
-c first store the "interpolated" values everywhere on the grid    
-c---------------------------------------------------------------------
-       do  c=1, ncells
-          kk = 0
-          do  k = cell_low(3,c), cell_high(3,c)
-             zeta = dble(k) * dnzm1
-             jj = 0
-             do  j = cell_low(2,c), cell_high(2,c)
-                eta = dble(j) * dnym1
-                ii = 0
-                do   i = cell_low(1,c), cell_high(1,c)
-                   xi = dble(i) * dnxm1
-                  
-                   do ix = 1, 2
-                      call exact_solution(dble(ix-1), eta, zeta, 
-     >                                    Pface(1,1,ix))
-                   end do
-
-                   do    iy = 1, 2
-                      call exact_solution(xi, dble(iy-1) , zeta, 
-     >                                    Pface(1,2,iy))
-                   end do
-
-                   do    iz = 1, 2
-                      call exact_solution(xi, eta, dble(iz-1),   
-     >                                    Pface(1,3,iz))
-                   end do
-
-                   do   m = 1, 5
-                      Pxi   = xi   * Pface(m,1,2) + 
-     >                        (1.0d0-xi)   * Pface(m,1,1)
-                      Peta  = eta  * Pface(m,2,2) + 
-     >                        (1.0d0-eta)  * Pface(m,2,1)
-                      Pzeta = zeta * Pface(m,3,2) + 
-     >                        (1.0d0-zeta) * Pface(m,3,1)
- 
-                      u(ii,jj,kk,m,c) = Pxi + Peta + Pzeta - 
-     >                          Pxi*Peta - Pxi*Pzeta - Peta*Pzeta + 
-     >                          Pxi*Peta*Pzeta
-
-                   end do
-                   ii = ii + 1
-                end do
-                jj = jj + 1
-             end do
-             kk = kk+1
-          end do
-       end do
-
-c---------------------------------------------------------------------
-c now store the exact values on the boundaries        
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c west face                                                  
-c---------------------------------------------------------------------
-       c = slice(1,1)
-       ii = 0
-       xi = 0.0d0
-       kk = 0
-       do  k = cell_low(3,c), cell_high(3,c)
-          zeta = dble(k) * dnzm1
-          jj = 0
-          do   j = cell_low(2,c), cell_high(2,c)
-             eta = dble(j) * dnym1
-             call exact_solution(xi, eta, zeta, temp)
-             do   m = 1, 5
-                u(ii,jj,kk,m,c) = temp(m)
-             end do
-             jj = jj + 1
-          end do
-          kk = kk + 1
-       end do
-
-c---------------------------------------------------------------------
-c east face                                                      
-c---------------------------------------------------------------------
-       c  = slice(1,ncells)
-       ii = cell_size(1,c)-1
-       xi = 1.0d0
-       kk = 0
-       do   k = cell_low(3,c), cell_high(3,c)
-          zeta = dble(k) * dnzm1
-          jj = 0
-          do   j = cell_low(2,c), cell_high(2,c)
-             eta = dble(j) * dnym1
-             call exact_solution(xi, eta, zeta, temp)
-             do   m = 1, 5
-                u(ii,jj,kk,m,c) = temp(m)
-             end do
-             jj = jj + 1
-          end do
-          kk = kk + 1
-       end do
-
-c---------------------------------------------------------------------
-c south face                                                 
-c---------------------------------------------------------------------
-       c = slice(2,1)
-       jj = 0
-       eta = 0.0d0
-       kk = 0
-       do  k = cell_low(3,c), cell_high(3,c)
-          zeta = dble(k) * dnzm1
-          ii = 0
-          do   i = cell_low(1,c), cell_high(1,c)
-             xi = dble(i) * dnxm1
-             call exact_solution(xi, eta, zeta, temp)
-             do   m = 1, 5
-                u(ii,jj,kk,m,c) = temp(m)
-             end do
-             ii = ii + 1
-          end do
-          kk = kk + 1
-       end do
-
-
-c---------------------------------------------------------------------
-c north face                                    
-c---------------------------------------------------------------------
-       c = slice(2,ncells)
-       jj = cell_size(2,c)-1
-       eta = 1.0d0
-       kk = 0
-       do   k = cell_low(3,c), cell_high(3,c)
-          zeta = dble(k) * dnzm1
-          ii = 0
-          do   i = cell_low(1,c), cell_high(1,c)
-             xi = dble(i) * dnxm1
-             call exact_solution(xi, eta, zeta, temp)
-             do   m = 1, 5
-                u(ii,jj,kk,m,c) = temp(m)
-             end do
-             ii = ii + 1
-          end do
-          kk = kk + 1
-       end do
-
-c---------------------------------------------------------------------
-c bottom face                                       
-c---------------------------------------------------------------------
-       c = slice(3,1)
-       kk = 0
-       zeta = 0.0d0
-       jj = 0
-       do   j = cell_low(2,c), cell_high(2,c)
-          eta = dble(j) * dnym1
-          ii = 0
-          do   i =cell_low(1,c), cell_high(1,c)
-             xi = dble(i) *dnxm1
-             call exact_solution(xi, eta, zeta, temp)
-             do   m = 1, 5
-                u(ii,jj,kk,m,c) = temp(m)
-             end do
-             ii = ii + 1
-          end do
-          jj = jj + 1
-       end do
-
-c---------------------------------------------------------------------
-c top face     
-c---------------------------------------------------------------------
-       c = slice(3,ncells)
-       kk = cell_size(3,c)-1
-       zeta = 1.0d0
-       jj = 0
-       do   j = cell_low(2,c), cell_high(2,c)
-          eta = dble(j) * dnym1
-          ii = 0
-          do   i =cell_low(1,c), cell_high(1,c)
-             xi = dble(i) * dnxm1
-             call exact_solution(xi, eta, zeta, temp)
-             do   m = 1, 5
-                u(ii,jj,kk,m,c) = temp(m)
-             end do
-             ii = ii + 1
-          end do
-          jj = jj + 1
-       end do
-
-       return
-       end
-
-
-       subroutine lhsinit
-
-       include 'header.h'
-       
-       integer i, j, k, d, c, n
-
-c---------------------------------------------------------------------
-c loop over all cells                                       
-c---------------------------------------------------------------------
-       do  c = 1, ncells
-
-c---------------------------------------------------------------------
-c         first, initialize the start and end arrays
-c---------------------------------------------------------------------
-          do  d = 1, 3
-             if (cell_coord(d,c) .eq. 1) then
-                start(d,c) = 1
-             else 
-                start(d,c) = 0
-             endif
-             if (cell_coord(d,c) .eq. ncells) then
-                end(d,c) = 1
-             else
-                end(d,c) = 0
-             endif
-          end do
-
-c---------------------------------------------------------------------
-c     zap the whole left hand side for starters
-c---------------------------------------------------------------------
-          do  n = 1, 15
-             do  k = 0, cell_size(3,c)-1
-                do  j = 0, cell_size(2,c)-1
-                   do  i = 0, cell_size(1,c)-1
-                      lhs(i,j,k,n,c) = 0.0d0
-                   end do
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c next, set all diagonal values to 1. This is overkill, but convenient
-c---------------------------------------------------------------------
-          do   n = 1, 3
-             do   k = 0, cell_size(3,c)-1
-                do   j = 0, cell_size(2,c)-1
-                   do   i = 0, cell_size(1,c)-1
-                      lhs(i,j,k,5*n-2,c) = 1.0d0
-                   end do
-                end do
-             end do
-          end do
-
-       end do
-
-      return
-      end
-
-
-
diff --git a/examples/smpi/NAS/SP/inputsp.data.sample b/examples/smpi/NAS/SP/inputsp.data.sample
deleted file mode 100644
index ae3801fdb7..0000000000
--- a/examples/smpi/NAS/SP/inputsp.data.sample
+++ /dev/null
@@ -1,3 +0,0 @@
-400       number of time steps
-0.0015d0  dt for class A = 0.0015d0. class B = 0.001d0  class C = 0.00067d0
-64 64 64
diff --git a/examples/smpi/NAS/SP/lhsx.f b/examples/smpi/NAS/SP/lhsx.f
deleted file mode 100644
index cae7779122..0000000000
--- a/examples/smpi/NAS/SP/lhsx.f
+++ /dev/null
@@ -1,124 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine lhsx(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c This function computes the left hand side for the three x-factors  
-c---------------------------------------------------------------------
-
-       include 'header.h'
-
-       double precision ru1
-       integer          i, j, k, c
-
-
-c---------------------------------------------------------------------
-c      treat only cell c             
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c      first fill the lhs for the u-eigenvalue                   
-c---------------------------------------------------------------------
-       do  k = start(3,c), cell_size(3,c)-end(3,c)-1
-          do  j = start(2,c), cell_size(2,c)-end(2,c)-1
-             do  i = start(1,c)-1, cell_size(1,c)-end(1,c)
-                ru1 = c3c4*rho_i(i,j,k,c)
-                cv(i) = us(i,j,k,c)
-                rhon(i) = dmax1(dx2+con43*ru1, 
-     >                          dx5+c1c5*ru1,
-     >                          dxmax+ru1,
-     >                          dx1)
-             end do
-
-             do  i = start(1,c), cell_size(1,c)-end(1,c)-1
-                lhs(i,j,k,1,c) =   0.0d0
-                lhs(i,j,k,2,c) = - dttx2 * cv(i-1) - dttx1 * rhon(i-1)
-                lhs(i,j,k,3,c) =   1.0d0 + c2dttx1 * rhon(i)
-                lhs(i,j,k,4,c) =   dttx2 * cv(i+1) - dttx1 * rhon(i+1)
-                lhs(i,j,k,5,c) =   0.0d0
-             end do
-          end do
-       end do
-
-c---------------------------------------------------------------------
-c      add fourth order dissipation                             
-c---------------------------------------------------------------------
-       if (start(1,c) .gt. 0) then
-          i = 1
-          do   k = start(3,c), cell_size(3,c)-end(3,c)-1
-             do   j = start(2,c), cell_size(2,c)-end(2,c)-1
-                lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz5
-                lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
-                lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
-  
-                lhs(i+1,j,k,2,c) = lhs(i+1,j,k,2,c) - comz4
-                lhs(i+1,j,k,3,c) = lhs(i+1,j,k,3,c) + comz6
-                lhs(i+1,j,k,4,c) = lhs(i+1,j,k,4,c) - comz4
-                lhs(i+1,j,k,5,c) = lhs(i+1,j,k,5,c) + comz1
-             end do
-          end do
-       endif
-
-       do   k = start(3,c), cell_size(3,c)-end(3,c)-1
-          do   j = start(2,c), cell_size(2,c)-end(2,c)-1
-             do   i=3*start(1,c), cell_size(1,c)-3*end(1,c)-1
-                lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
-                lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
-                lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
-                lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
-                lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
-             end do
-          end do
-       end do
-
-       if (end(1,c) .gt. 0) then
-          i = cell_size(1,c)-3
-          do   k = start(3,c), cell_size(3,c)-end(3,c)-1
-             do   j = start(2,c), cell_size(2,c)-end(2,c)-1
-                lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
-                lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
-                lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
-                lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
-
-                lhs(i+1,j,k,1,c) = lhs(i+1,j,k,1,c) + comz1
-                lhs(i+1,j,k,2,c) = lhs(i+1,j,k,2,c) - comz4
-                lhs(i+1,j,k,3,c) = lhs(i+1,j,k,3,c) + comz5
-             end do
-          end do
-       endif
-
-c---------------------------------------------------------------------
-c      subsequently, fill the other factors (u+c), (u-c) by a4ing to 
-c      the first  
-c---------------------------------------------------------------------
-       do   k = start(3,c), cell_size(3,c)-end(3,c)-1
-          do   j = start(2,c), cell_size(2,c)-end(2,c)-1
-             do   i = start(1,c), cell_size(1,c)-end(1,c)-1
-                lhs(i,j,k,1+5,c)  = lhs(i,j,k,1,c)
-                lhs(i,j,k,2+5,c)  = lhs(i,j,k,2,c) - 
-     >                            dttx2 * speed(i-1,j,k,c)
-                lhs(i,j,k,3+5,c)  = lhs(i,j,k,3,c)
-                lhs(i,j,k,4+5,c)  = lhs(i,j,k,4,c) + 
-     >                            dttx2 * speed(i+1,j,k,c)
-                lhs(i,j,k,5+5,c) = lhs(i,j,k,5,c)
-                lhs(i,j,k,1+10,c) = lhs(i,j,k,1,c)
-                lhs(i,j,k,2+10,c) = lhs(i,j,k,2,c) + 
-     >                            dttx2 * speed(i-1,j,k,c)
-                lhs(i,j,k,3+10,c) = lhs(i,j,k,3,c)
-                lhs(i,j,k,4+10,c) = lhs(i,j,k,4,c) - 
-     >                            dttx2 * speed(i+1,j,k,c)
-                lhs(i,j,k,5+10,c) = lhs(i,j,k,5,c)
-             end do
-          end do
-       end do
-
-       return
-       end
-
-
-
diff --git a/examples/smpi/NAS/SP/lhsy.f b/examples/smpi/NAS/SP/lhsy.f
deleted file mode 100644
index 9c07a35538..0000000000
--- a/examples/smpi/NAS/SP/lhsy.f
+++ /dev/null
@@ -1,125 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine lhsy(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c This function computes the left hand side for the three y-factors   
-c---------------------------------------------------------------------
-
-       include 'header.h'
-
-       double precision ru1
-       integer          i, j, k, c
-
-c---------------------------------------------------------------------
-c      treat only cell c
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c      first fill the lhs for the u-eigenvalue         
-c---------------------------------------------------------------------
-       do  k = start(3,c), cell_size(3,c)-end(3,c)-1
-          do  i = start(1,c), cell_size(1,c)-end(1,c)-1
-
-             do  j = start(2,c)-1, cell_size(2,c)-end(2,c)
-                ru1 = c3c4*rho_i(i,j,k,c)
-                cv(j) = vs(i,j,k,c)
-                rhoq(j) = dmax1( dy3 + con43 * ru1,
-     >                           dy5 + c1c5*ru1,
-     >                           dymax + ru1,
-     >                           dy1)
-             end do
-            
-             do  j = start(2,c), cell_size(2,c)-end(2,c)-1
-                lhs(i,j,k,1,c) =  0.0d0
-                lhs(i,j,k,2,c) = -dtty2 * cv(j-1) - dtty1 * rhoq(j-1)
-                lhs(i,j,k,3,c) =  1.0 + c2dtty1 * rhoq(j)
-                lhs(i,j,k,4,c) =  dtty2 * cv(j+1) - dtty1 * rhoq(j+1)
-                lhs(i,j,k,5,c) =  0.0d0
-             end do
-          end do
-       end do
-
-c---------------------------------------------------------------------
-c      add fourth order dissipation                             
-c---------------------------------------------------------------------
-       if (start(2,c) .gt. 0) then
-          j = 1
-          do   k = start(3,c), cell_size(3,c)-end(3,c)-1
-             do   i = start(1,c), cell_size(1,c)-end(1,c)-1
-
-                lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz5
-                lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
-                lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
-       
-                lhs(i,j+1,k,2,c) = lhs(i,j+1,k,2,c) - comz4
-                lhs(i,j+1,k,3,c) = lhs(i,j+1,k,3,c) + comz6
-                lhs(i,j+1,k,4,c) = lhs(i,j+1,k,4,c) - comz4
-                lhs(i,j+1,k,5,c) = lhs(i,j+1,k,5,c) + comz1
-             end do
-          end do
-       endif
-
-       do   k = start(3,c), cell_size(3,c)-end(3,c)-1
-          do   j=3*start(2,c), cell_size(2,c)-3*end(2,c)-1
-             do   i = start(1,c), cell_size(1,c)-end(1,c)-1
-
-                lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
-                lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
-                lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
-                lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
-                lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
-             end do
-          end do
-       end do
-
-       if (end(2,c) .gt. 0) then
-          j = cell_size(2,c)-3
-          do   k = start(3,c), cell_size(3,c)-end(3,c)-1
-             do   i = start(1,c), cell_size(1,c)-end(1,c)-1
-                lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
-                lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
-                lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
-                lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
-
-                lhs(i,j+1,k,1,c) = lhs(i,j+1,k,1,c) + comz1
-                lhs(i,j+1,k,2,c) = lhs(i,j+1,k,2,c) - comz4
-                lhs(i,j+1,k,3,c) = lhs(i,j+1,k,3,c) + comz5
-             end do
-          end do
-       endif
-
-c---------------------------------------------------------------------
-c      subsequently, do the other two factors                    
-c---------------------------------------------------------------------
-       do    k = start(3,c), cell_size(3,c)-end(3,c)-1
-          do    j = start(2,c), cell_size(2,c)-end(2,c)-1
-             do    i = start(1,c), cell_size(1,c)-end(1,c)-1
-                lhs(i,j,k,1+5,c)  = lhs(i,j,k,1,c)
-                lhs(i,j,k,2+5,c)  = lhs(i,j,k,2,c) - 
-     >                            dtty2 * speed(i,j-1,k,c)
-                lhs(i,j,k,3+5,c)  = lhs(i,j,k,3,c)
-                lhs(i,j,k,4+5,c)  = lhs(i,j,k,4,c) + 
-     >                            dtty2 * speed(i,j+1,k,c)
-                lhs(i,j,k,5+5,c) = lhs(i,j,k,5,c)
-                lhs(i,j,k,1+10,c) = lhs(i,j,k,1,c)
-                lhs(i,j,k,2+10,c) = lhs(i,j,k,2,c) + 
-     >                            dtty2 * speed(i,j-1,k,c)
-                lhs(i,j,k,3+10,c) = lhs(i,j,k,3,c)
-                lhs(i,j,k,4+10,c) = lhs(i,j,k,4,c) - 
-     >                            dtty2 * speed(i,j+1,k,c)
-                lhs(i,j,k,5+10,c) = lhs(i,j,k,5,c)
-             end do
-          end do
-       end do
-
-       return
-       end
-
-
-
diff --git a/examples/smpi/NAS/SP/lhsz.f b/examples/smpi/NAS/SP/lhsz.f
deleted file mode 100644
index 08ea0bc24d..0000000000
--- a/examples/smpi/NAS/SP/lhsz.f
+++ /dev/null
@@ -1,123 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine lhsz(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c This function computes the left hand side for the three z-factors   
-c---------------------------------------------------------------------
-
-       include 'header.h'
-
-       double precision ru1
-       integer i, j, k, c
-
-c---------------------------------------------------------------------
-c      treat only cell c                                         
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c first fill the lhs for the u-eigenvalue                          
-c---------------------------------------------------------------------
-       do   j = start(2,c), cell_size(2,c)-end(2,c)-1
-          do   i = start(1,c), cell_size(1,c)-end(1,c)-1
-
-             do   k = start(3,c)-1, cell_size(3,c)-end(3,c)
-                ru1 = c3c4*rho_i(i,j,k,c)
-                cv(k) = ws(i,j,k,c)
-                rhos(k) = dmax1(dz4 + con43 * ru1,
-     >                          dz5 + c1c5 * ru1,
-     >                          dzmax + ru1,
-     >                          dz1)
-             end do
-
-             do   k =  start(3,c), cell_size(3,c)-end(3,c)-1
-                lhs(i,j,k,1,c) =  0.0d0
-                lhs(i,j,k,2,c) = -dttz2 * cv(k-1) - dttz1 * rhos(k-1)
-                lhs(i,j,k,3,c) =  1.0 + c2dttz1 * rhos(k)
-                lhs(i,j,k,4,c) =  dttz2 * cv(k+1) - dttz1 * rhos(k+1)
-                lhs(i,j,k,5,c) =  0.0d0
-             end do
-          end do
-       end do
-
-c---------------------------------------------------------------------
-c      add fourth order dissipation                                  
-c---------------------------------------------------------------------
-       if (start(3,c) .gt. 0) then
-          k = 1
-          do    j = start(2,c), cell_size(2,c)-end(2,c)-1
-             do    i = start(1,c), cell_size(1,c)-end(1,c)-1
-                lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz5
-                lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
-                lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
-
-                lhs(i,j,k+1,2,c) = lhs(i,j,k+1,2,c) - comz4
-                lhs(i,j,k+1,3,c) = lhs(i,j,k+1,3,c) + comz6
-                lhs(i,j,k+1,4,c) = lhs(i,j,k+1,4,c) - comz4
-                lhs(i,j,k+1,5,c) = lhs(i,j,k+1,5,c) + comz1
-             end do
-          end do
-       endif
-
-       do    k = 3*start(3,c), cell_size(3,c)-3*end(3,c)-1
-          do    j = start(2,c), cell_size(2,c)-end(2,c)-1
-             do    i = start(1,c), cell_size(1,c)-end(1,c)-1
-                lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
-                lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
-                lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
-                lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
-                lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
-             end do
-          end do
-       end do
-
-       if (end(3,c) .gt. 0) then
-          k = cell_size(3,c)-3 
-          do    j = start(2,c), cell_size(2,c)-end(2,c)-1
-             do    i = start(1,c), cell_size(1,c)-end(1,c)-1
-                lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
-                lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
-                lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
-                lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
-
-                lhs(i,j,k+1,1,c) = lhs(i,j,k+1,1,c) + comz1
-                lhs(i,j,k+1,2,c) = lhs(i,j,k+1,2,c) - comz4
-                lhs(i,j,k+1,3,c) = lhs(i,j,k+1,3,c) + comz5
-             end do
-          end do
-       endif
-
-
-c---------------------------------------------------------------------
-c      subsequently, fill the other factors (u+c), (u-c) 
-c---------------------------------------------------------------------
-       do    k = start(3,c), cell_size(3,c)-end(3,c)-1
-          do    j = start(2,c), cell_size(2,c)-end(2,c)-1
-             do    i = start(1,c), cell_size(1,c)-end(1,c)-1
-                lhs(i,j,k,1+5,c)  = lhs(i,j,k,1,c)
-                lhs(i,j,k,2+5,c)  = lhs(i,j,k,2,c) - 
-     >                            dttz2 * speed(i,j,k-1,c)
-                lhs(i,j,k,3+5,c)  = lhs(i,j,k,3,c)
-                lhs(i,j,k,4+5,c)  = lhs(i,j,k,4,c) + 
-     >                            dttz2 * speed(i,j,k+1,c)
-                lhs(i,j,k,5+5,c) = lhs(i,j,k,5,c)
-                lhs(i,j,k,1+10,c) = lhs(i,j,k,1,c)
-                lhs(i,j,k,2+10,c) = lhs(i,j,k,2,c) + 
-     >                            dttz2 * speed(i,j,k-1,c)
-                lhs(i,j,k,3+10,c) = lhs(i,j,k,3,c)
-                lhs(i,j,k,4+10,c) = lhs(i,j,k,4,c) - 
-     >                            dttz2 * speed(i,j,k+1,c)
-                lhs(i,j,k,5+10,c) = lhs(i,j,k,5,c)
-             end do
-          end do
-       end do
-
-       return
-       end
-
-
diff --git a/examples/smpi/NAS/SP/make_set.f b/examples/smpi/NAS/SP/make_set.f
deleted file mode 100644
index 7a84e93010..0000000000
--- a/examples/smpi/NAS/SP/make_set.f
+++ /dev/null
@@ -1,120 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine make_set
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c This function allocates space for a set of cells and fills the set     
-c such that communication between cells on different nodes is only
-c nearest neighbor                                                   
-c---------------------------------------------------------------------
-
-       include 'header.h'
-       include 'mpinpb.h'
-
-       integer p, i, j, c, dir, size, excess, ierr,ierrcode
-
-c---------------------------------------------------------------------
-c     compute square root; add small number to allow for roundoff
-c     (note: this is computed in setup_mpi.f also, but prefer to do
-c     it twice because of some include file problems).
-c---------------------------------------------------------------------
-      ncells = dint(dsqrt(dble(no_nodes) + 0.00001d0))
-
-c---------------------------------------------------------------------
-c      this makes coding easier
-c---------------------------------------------------------------------
-       p = ncells
-   
-c---------------------------------------------------------------------
-c      determine the location of the cell at the bottom of the 3D 
-c      array of cells
-c---------------------------------------------------------------------
-       cell_coord(1,1) = mod(node,p) 
-       cell_coord(2,1) = node/p 
-       cell_coord(3,1) = 0
-
-c---------------------------------------------------------------------
-c      set the cell_coords for cells in the rest of the z-layers; 
-c      this comes down to a simple linear numbering in the z-direct-
-c      ion, and to the doubly-cyclic numbering in the other dirs     
-c---------------------------------------------------------------------
-       do    c=2, p
-          cell_coord(1,c) = mod(cell_coord(1,c-1)+1,p) 
-          cell_coord(2,c) = mod(cell_coord(2,c-1)-1+p,p) 
-          cell_coord(3,c) = c-1
-       end do
-
-c---------------------------------------------------------------------
-c      offset all the coordinates by 1 to adjust for Fortran arrays
-c---------------------------------------------------------------------
-       do    dir = 1, 3
-          do    c = 1, p
-             cell_coord(dir,c) = cell_coord(dir,c) + 1
-          end do
-       end do
-   
-c---------------------------------------------------------------------
-c      slice(dir,n) contains the sequence number of the cell that is in
-c      coordinate plane n in the dir direction
-c---------------------------------------------------------------------
-       do   dir = 1, 3
-          do   c = 1, p
-             slice(dir,cell_coord(dir,c)) = c
-          end do
-       end do
-
-
-c---------------------------------------------------------------------
-c      fill the predecessor and successor entries, using the indices 
-c      of the bottom cells (they are the same at each level of k 
-c      anyway) acting as if full periodicity pertains; note that p is
-c      added to those arguments to the mod functions that might
-c      otherwise return wrong values when using the modulo function
-c---------------------------------------------------------------------
-       i = cell_coord(1,1)-1
-       j = cell_coord(2,1)-1
-
-       predecessor(1) = mod(i-1+p,p) + p*j
-       predecessor(2) = i + p*mod(j-1+p,p)
-       predecessor(3) = mod(i+1,p) + p*mod(j-1+p,p)
-       successor(1)   = mod(i+1,p) + p*j
-       successor(2)   = i + p*mod(j+1,p)
-       successor(3)   = mod(i-1+p,p) + p*mod(j+1,p)
-
-c---------------------------------------------------------------------
-c now compute the sizes of the cells                                    
-c---------------------------------------------------------------------
-       do    dir= 1, 3
-c---------------------------------------------------------------------
-c         set cell_coord range for each direction                            
-c---------------------------------------------------------------------
-          size   = grid_points(dir)/p
-          excess = mod(grid_points(dir),p)
-          do    c=1, ncells
-             if (cell_coord(dir,c) .le. excess) then
-                cell_size(dir,c) = size+1
-                cell_low(dir,c) = (cell_coord(dir,c)-1)*(size+1)
-                cell_high(dir,c) = cell_low(dir,c)+size
-             else 
-                cell_size(dir,c) = size
-                cell_low(dir,c)  = excess*(size+1)+
-     >                   (cell_coord(dir,c)-excess-1)*size
-                cell_high(dir,c) = cell_low(dir,c)+size-1
-             endif
-             if (cell_size(dir, c) .le. 2) then
-                write(*,50)
- 50             format(' Error: Cell size too small. Min size is 3')
-                call MPI_Abort(mpi_comm_world,ierrcode,ierr)
-                stop
-             endif
-          end do
-       end do
-
-       return
-       end
-
diff --git a/examples/smpi/NAS/SP/mpinpb.h b/examples/smpi/NAS/SP/mpinpb.h
deleted file mode 100644
index 439db34f60..0000000000
--- a/examples/smpi/NAS/SP/mpinpb.h
+++ /dev/null
@@ -1,13 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include 'mpif.h'
-
-      integer           node, no_nodes, total_nodes, root, comm_setup, 
-     >                  comm_solve, comm_rhs, dp_type
-      logical           active
-      common /mpistuff/ node, no_nodes, total_nodes, root, comm_setup, 
-     >                  comm_solve, comm_rhs, dp_type, active
-      integer           DEFAULT_TAG
-      parameter         (DEFAULT_TAG = 0)
diff --git a/examples/smpi/NAS/SP/ninvr.f b/examples/smpi/NAS/SP/ninvr.f
deleted file mode 100644
index 146d046e8b..0000000000
--- a/examples/smpi/NAS/SP/ninvr.f
+++ /dev/null
@@ -1,45 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine  ninvr(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   block-diagonal matrix-vector multiplication              
-c---------------------------------------------------------------------
-
-       include 'header.h'
-
-       integer  c,  i, j, k
-       double precision r1, r2, r3, r4, r5, t1, t2
-
-c---------------------------------------------------------------------
-c      treat only one cell                           
-c---------------------------------------------------------------------
-       do k = start(3,c), cell_size(3,c)-end(3,c)-1
-          do j = start(2,c), cell_size(2,c)-end(2,c)-1
-             do i = start(1,c), cell_size(1,c)-end(1,c)-1
-
-                r1 = rhs(i,j,k,1,c)
-                r2 = rhs(i,j,k,2,c)
-                r3 = rhs(i,j,k,3,c)
-                r4 = rhs(i,j,k,4,c)
-                r5 = rhs(i,j,k,5,c)
-               
-                t1 = bt * r3
-                t2 = 0.5d0 * ( r4 + r5 )
-
-                rhs(i,j,k,1,c) = -r2
-                rhs(i,j,k,2,c) =  r1
-                rhs(i,j,k,3,c) = bt * ( r4 - r5 )
-                rhs(i,j,k,4,c) = -t1 + t2
-                rhs(i,j,k,5,c) =  t1 + t2
-             enddo    
-          enddo
-       enddo
-
-       return
-       end
diff --git a/examples/smpi/NAS/SP/pinvr.f b/examples/smpi/NAS/SP/pinvr.f
deleted file mode 100644
index 060f0a57ef..0000000000
--- a/examples/smpi/NAS/SP/pinvr.f
+++ /dev/null
@@ -1,48 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine pinvr(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   block-diagonal matrix-vector multiplication                       
-c---------------------------------------------------------------------
-
-       include 'header.h'
-
-       integer i, j, k, c
-       double precision r1, r2, r3, r4, r5, t1, t2
-
-c---------------------------------------------------------------------
-c      treat only one cell                                   
-c---------------------------------------------------------------------
-       do   k = start(3,c), cell_size(3,c)-end(3,c)-1
-          do   j = start(2,c), cell_size(2,c)-end(2,c)-1
-             do   i = start(1,c), cell_size(1,c)-end(1,c)-1
-
-                r1 = rhs(i,j,k,1,c)
-                r2 = rhs(i,j,k,2,c)
-                r3 = rhs(i,j,k,3,c)
-                r4 = rhs(i,j,k,4,c)
-                r5 = rhs(i,j,k,5,c)
-
-                t1 = bt * r1
-                t2 = 0.5d0 * ( r4 + r5 )
-
-                rhs(i,j,k,1,c) =  bt * ( r4 - r5 )
-                rhs(i,j,k,2,c) = -r3
-                rhs(i,j,k,3,c) =  r2
-                rhs(i,j,k,4,c) = -t1 + t2
-                rhs(i,j,k,5,c) =  t1 + t2
-             end do
-          end do
-       end do
-
-       return
-       end
-
-
-
diff --git a/examples/smpi/NAS/SP/rhs.f b/examples/smpi/NAS/SP/rhs.f
deleted file mode 100644
index 34e562a4e3..0000000000
--- a/examples/smpi/NAS/SP/rhs.f
+++ /dev/null
@@ -1,446 +0,0 @@
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine compute_rhs
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       include 'header.h'
-
-       integer c, i, j, k, m
-       double precision aux, rho_inv, uijk, up1, um1, vijk, vp1, vm1,
-     >                  wijk, wp1, wm1
-
-
-c---------------------------------------------------------------------
-c loop over all cells owned by this node                           
-c---------------------------------------------------------------------
-       do    c = 1, ncells
-
-c---------------------------------------------------------------------
-c         compute the reciprocal of density, and the kinetic energy, 
-c         and the speed of sound. 
-c---------------------------------------------------------------------
-
-          do    k = -1, cell_size(3,c)
-             do    j = -1, cell_size(2,c)
-                do    i = -1, cell_size(1,c)
-                   rho_inv = 1.0d0/u(i,j,k,1,c)
-                   rho_i(i,j,k,c) = rho_inv
-                   us(i,j,k,c) = u(i,j,k,2,c) * rho_inv
-                   vs(i,j,k,c) = u(i,j,k,3,c) * rho_inv
-                   ws(i,j,k,c) = u(i,j,k,4,c) * rho_inv
-                   square(i,j,k,c)     = 0.5d0* (
-     >                        u(i,j,k,2,c)*u(i,j,k,2,c) + 
-     >                        u(i,j,k,3,c)*u(i,j,k,3,c) +
-     >                        u(i,j,k,4,c)*u(i,j,k,4,c) ) * rho_inv
-                   qs(i,j,k,c) = square(i,j,k,c) * rho_inv
-c---------------------------------------------------------------------
-c                  (don't need speed and ainx until the lhs computation)
-c---------------------------------------------------------------------
-                   aux = c1c2*rho_inv* (u(i,j,k,5,c) - square(i,j,k,c))
-                   aux = dsqrt(aux)
-                   speed(i,j,k,c) = aux
-                   ainv(i,j,k,c)  = 1.0d0/aux
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c copy the exact forcing term to the right hand side;  because 
-c this forcing term is known, we can store it on the whole of every 
-c cell,  including the boundary                   
-c---------------------------------------------------------------------
-
-          do   m = 1, 5
-             do   k = 0, cell_size(3,c)-1
-                do   j = 0, cell_size(2,c)-1
-                   do   i = 0, cell_size(1,c)-1
-                      rhs(i,j,k,m,c) = forcing(i,j,k,m,c)
-                   end do
-                end do
-             end do
-          end do
-
-
-c---------------------------------------------------------------------
-c         compute xi-direction fluxes 
-c---------------------------------------------------------------------
-          do    k = start(3,c), cell_size(3,c)-end(3,c)-1
-             do    j = start(2,c), cell_size(2,c)-end(2,c)-1
-                do    i = start(1,c), cell_size(1,c)-end(1,c)-1
-                   uijk = us(i,j,k,c)
-                   up1  = us(i+1,j,k,c)
-                   um1  = us(i-1,j,k,c)
-
-                   rhs(i,j,k,1,c) = rhs(i,j,k,1,c) + dx1tx1 * 
-     >                    (u(i+1,j,k,1,c) - 2.0d0*u(i,j,k,1,c) + 
-     >                     u(i-1,j,k,1,c)) -
-     >                    tx2 * (u(i+1,j,k,2,c) - u(i-1,j,k,2,c))
-
-                   rhs(i,j,k,2,c) = rhs(i,j,k,2,c) + dx2tx1 * 
-     >                    (u(i+1,j,k,2,c) - 2.0d0*u(i,j,k,2,c) + 
-     >                     u(i-1,j,k,2,c)) +
-     >                    xxcon2*con43 * (up1 - 2.0d0*uijk + um1) -
-     >                    tx2 * (u(i+1,j,k,2,c)*up1 - 
-     >                           u(i-1,j,k,2,c)*um1 +
-     >                           (u(i+1,j,k,5,c)- square(i+1,j,k,c)-
-     >                            u(i-1,j,k,5,c)+ square(i-1,j,k,c))*
-     >                            c2)
-
-                   rhs(i,j,k,3,c) = rhs(i,j,k,3,c) + dx3tx1 * 
-     >                    (u(i+1,j,k,3,c) - 2.0d0*u(i,j,k,3,c) +
-     >                     u(i-1,j,k,3,c)) +
-     >                    xxcon2 * (vs(i+1,j,k,c) - 2.0d0*vs(i,j,k,c) +
-     >                              vs(i-1,j,k,c)) -
-     >                    tx2 * (u(i+1,j,k,3,c)*up1 - 
-     >                           u(i-1,j,k,3,c)*um1)
-
-                   rhs(i,j,k,4,c) = rhs(i,j,k,4,c) + dx4tx1 * 
-     >                    (u(i+1,j,k,4,c) - 2.0d0*u(i,j,k,4,c) +
-     >                     u(i-1,j,k,4,c)) +
-     >                    xxcon2 * (ws(i+1,j,k,c) - 2.0d0*ws(i,j,k,c) +
-     >                              ws(i-1,j,k,c)) -
-     >                    tx2 * (u(i+1,j,k,4,c)*up1 - 
-     >                           u(i-1,j,k,4,c)*um1)
-
-                   rhs(i,j,k,5,c) = rhs(i,j,k,5,c) + dx5tx1 * 
-     >                    (u(i+1,j,k,5,c) - 2.0d0*u(i,j,k,5,c) +
-     >                     u(i-1,j,k,5,c)) +
-     >                    xxcon3 * (qs(i+1,j,k,c) - 2.0d0*qs(i,j,k,c) +
-     >                              qs(i-1,j,k,c)) +
-     >                    xxcon4 * (up1*up1 -       2.0d0*uijk*uijk + 
-     >                              um1*um1) +
-     >                    xxcon5 * (u(i+1,j,k,5,c)*rho_i(i+1,j,k,c) - 
-     >                              2.0d0*u(i,j,k,5,c)*rho_i(i,j,k,c) +
-     >                              u(i-1,j,k,5,c)*rho_i(i-1,j,k,c)) -
-     >                    tx2 * ( (c1*u(i+1,j,k,5,c) - 
-     >                             c2*square(i+1,j,k,c))*up1 -
-     >                            (c1*u(i-1,j,k,5,c) - 
-     >                             c2*square(i-1,j,k,c))*um1 )
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c         add fourth order xi-direction dissipation               
-c---------------------------------------------------------------------
-          if (start(1,c) .gt. 0) then
-             i = 1
-             do    m = 1, 5
-                do    k = start(3,c), cell_size(3,c)-end(3,c)-1
-                   do    j = start(2,c), cell_size(2,c)-end(2,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c)- dssp * 
-     >                    ( 5.0d0*u(i,j,k,m,c) - 4.0d0*u(i+1,j,k,m,c) +
-     >                            u(i+2,j,k,m,c))
-                   end do
-                end do
-             end do
-
-             i = 2
-             do    m = 1, 5
-                do    k = start(3,c), cell_size(3,c)-end(3,c)-1
-                   do    j = start(2,c), cell_size(2,c)-end(2,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * 
-     >                    (-4.0d0*u(i-1,j,k,m,c) + 6.0d0*u(i,j,k,m,c) -
-     >                      4.0d0*u(i+1,j,k,m,c) + u(i+2,j,k,m,c))
-                   end do
-                end do
-             end do
-          endif
-
-          do     m = 1, 5
-             do     k = start(3,c), cell_size(3,c)-end(3,c)-1
-                do     j = start(2,c), cell_size(2,c)-end(2,c)-1
-                   do  i = 3*start(1,c),cell_size(1,c)-3*end(1,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * 
-     >                    (  u(i-2,j,k,m,c) - 4.0d0*u(i-1,j,k,m,c) + 
-     >                     6.0*u(i,j,k,m,c) - 4.0d0*u(i+1,j,k,m,c) + 
-     >                         u(i+2,j,k,m,c) )
-                   end do
-                end do
-             end do
-          end do
- 
-
-          if (end(1,c) .gt. 0) then
-             i = cell_size(1,c)-3
-             do     m = 1, 5
-                do     k = start(3,c), cell_size(3,c)-end(3,c)-1
-                   do     j = start(2,c), cell_size(2,c)-end(2,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
-     >                    ( u(i-2,j,k,m,c) - 4.0d0*u(i-1,j,k,m,c) + 
-     >                      6.0d0*u(i,j,k,m,c) - 4.0d0*u(i+1,j,k,m,c) )
-                   end do
-                end do
-             end do
-
-             i = cell_size(1,c)-2
-             do     m = 1, 5
-                do     k = start(3,c), cell_size(3,c)-end(3,c)-1
-                   do     j = start(2,c), cell_size(2,c)-end(2,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
-     >                    ( u(i-2,j,k,m,c) - 4.d0*u(i-1,j,k,m,c) +
-     >                      5.d0*u(i,j,k,m,c) )
-                   end do
-                end do
-             end do
-          endif
-
-c---------------------------------------------------------------------
-c         compute eta-direction fluxes 
-c---------------------------------------------------------------------
-          do     k = start(3,c), cell_size(3,c)-end(3,c)-1
-             do     j = start(2,c), cell_size(2,c)-end(2,c)-1
-                do     i = start(1,c), cell_size(1,c)-end(1,c)-1
-                   vijk = vs(i,j,k,c)
-                   vp1  = vs(i,j+1,k,c)
-                   vm1  = vs(i,j-1,k,c)
-                   rhs(i,j,k,1,c) = rhs(i,j,k,1,c) + dy1ty1 * 
-     >                   (u(i,j+1,k,1,c) - 2.0d0*u(i,j,k,1,c) + 
-     >                    u(i,j-1,k,1,c)) -
-     >                   ty2 * (u(i,j+1,k,3,c) - u(i,j-1,k,3,c))
-                   rhs(i,j,k,2,c) = rhs(i,j,k,2,c) + dy2ty1 * 
-     >                   (u(i,j+1,k,2,c) - 2.0d0*u(i,j,k,2,c) + 
-     >                    u(i,j-1,k,2,c)) +
-     >                   yycon2 * (us(i,j+1,k,c) - 2.0d0*us(i,j,k,c) + 
-     >                             us(i,j-1,k,c)) -
-     >                   ty2 * (u(i,j+1,k,2,c)*vp1 - 
-     >                          u(i,j-1,k,2,c)*vm1)
-                   rhs(i,j,k,3,c) = rhs(i,j,k,3,c) + dy3ty1 * 
-     >                   (u(i,j+1,k,3,c) - 2.0d0*u(i,j,k,3,c) + 
-     >                    u(i,j-1,k,3,c)) +
-     >                   yycon2*con43 * (vp1 - 2.0d0*vijk + vm1) -
-     >                   ty2 * (u(i,j+1,k,3,c)*vp1 - 
-     >                          u(i,j-1,k,3,c)*vm1 +
-     >                          (u(i,j+1,k,5,c) - square(i,j+1,k,c) - 
-     >                           u(i,j-1,k,5,c) + square(i,j-1,k,c))
-     >                          *c2)
-                   rhs(i,j,k,4,c) = rhs(i,j,k,4,c) + dy4ty1 * 
-     >                   (u(i,j+1,k,4,c) - 2.0d0*u(i,j,k,4,c) + 
-     >                    u(i,j-1,k,4,c)) +
-     >                   yycon2 * (ws(i,j+1,k,c) - 2.0d0*ws(i,j,k,c) + 
-     >                             ws(i,j-1,k,c)) -
-     >                   ty2 * (u(i,j+1,k,4,c)*vp1 - 
-     >                          u(i,j-1,k,4,c)*vm1)
-                   rhs(i,j,k,5,c) = rhs(i,j,k,5,c) + dy5ty1 * 
-     >                   (u(i,j+1,k,5,c) - 2.0d0*u(i,j,k,5,c) + 
-     >                    u(i,j-1,k,5,c)) +
-     >                   yycon3 * (qs(i,j+1,k,c) - 2.0d0*qs(i,j,k,c) + 
-     >                             qs(i,j-1,k,c)) +
-     >                   yycon4 * (vp1*vp1       - 2.0d0*vijk*vijk + 
-     >                             vm1*vm1) +
-     >                   yycon5 * (u(i,j+1,k,5,c)*rho_i(i,j+1,k,c) - 
-     >                             2.0d0*u(i,j,k,5,c)*rho_i(i,j,k,c) +
-     >                             u(i,j-1,k,5,c)*rho_i(i,j-1,k,c)) -
-     >                   ty2 * ((c1*u(i,j+1,k,5,c) - 
-     >                           c2*square(i,j+1,k,c)) * vp1 -
-     >                          (c1*u(i,j-1,k,5,c) - 
-     >                           c2*square(i,j-1,k,c)) * vm1)
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c         add fourth order eta-direction dissipation         
-c---------------------------------------------------------------------
-          if (start(2,c) .gt. 0) then
-             j = 1
-             do     m = 1, 5
-                do     k = start(3,c), cell_size(3,c)-end(3,c)-1
-                   do     i = start(1,c), cell_size(1,c)-end(1,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c)- dssp * 
-     >                    ( 5.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j+1,k,m,c) +
-     >                            u(i,j+2,k,m,c))
-                   end do
-                end do
-             end do
-
-             j = 2
-             do     m = 1, 5
-                do     k = start(3,c), cell_size(3,c)-end(3,c)-1
-                   do     i = start(1,c), cell_size(1,c)-end(1,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * 
-     >                    (-4.0d0*u(i,j-1,k,m,c) + 6.0d0*u(i,j,k,m,c) -
-     >                      4.0d0*u(i,j+1,k,m,c) + u(i,j+2,k,m,c))
-                   end do
-                end do
-             end do
-          endif
-
-          do     m = 1, 5
-             do     k = start(3,c), cell_size(3,c)-end(3,c)-1
-                do    j = 3*start(2,c), cell_size(2,c)-3*end(2,c)-1
-                   do  i = start(1,c),cell_size(1,c)-end(1,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * 
-     >                    (  u(i,j-2,k,m,c) - 4.0d0*u(i,j-1,k,m,c) + 
-     >                     6.0*u(i,j,k,m,c) - 4.0d0*u(i,j+1,k,m,c) + 
-     >                         u(i,j+2,k,m,c) )
-                   end do
-                end do
-             end do
-          end do
- 
-          if (end(2,c) .gt. 0) then
-             j = cell_size(2,c)-3
-             do     m = 1, 5
-                do     k = start(3,c), cell_size(3,c)-end(3,c)-1
-                   do     i = start(1,c), cell_size(1,c)-end(1,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
-     >                    ( u(i,j-2,k,m,c) - 4.0d0*u(i,j-1,k,m,c) + 
-     >                      6.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j+1,k,m,c) )
-                   end do
-                end do
-             end do
-
-             j = cell_size(2,c)-2
-             do     m = 1, 5
-                do     k = start(3,c), cell_size(3,c)-end(3,c)-1
-                   do     i = start(1,c), cell_size(1,c)-end(1,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
-     >                    ( u(i,j-2,k,m,c) - 4.d0*u(i,j-1,k,m,c) +
-     >                      5.d0*u(i,j,k,m,c) )
-                   end do
-                end do
-             end do
-          endif
-
-
-c---------------------------------------------------------------------
-c         compute zeta-direction fluxes 
-c---------------------------------------------------------------------
-          do    k = start(3,c), cell_size(3,c)-end(3,c)-1
-             do     j = start(2,c), cell_size(2,c)-end(2,c)-1
-                do     i = start(1,c), cell_size(1,c)-end(1,c)-1
-                   wijk = ws(i,j,k,c)
-                   wp1  = ws(i,j,k+1,c)
-                   wm1  = ws(i,j,k-1,c)
-
-                   rhs(i,j,k,1,c) = rhs(i,j,k,1,c) + dz1tz1 * 
-     >                   (u(i,j,k+1,1,c) - 2.0d0*u(i,j,k,1,c) + 
-     >                    u(i,j,k-1,1,c)) -
-     >                   tz2 * (u(i,j,k+1,4,c) - u(i,j,k-1,4,c))
-                   rhs(i,j,k,2,c) = rhs(i,j,k,2,c) + dz2tz1 * 
-     >                   (u(i,j,k+1,2,c) - 2.0d0*u(i,j,k,2,c) + 
-     >                    u(i,j,k-1,2,c)) +
-     >                   zzcon2 * (us(i,j,k+1,c) - 2.0d0*us(i,j,k,c) + 
-     >                             us(i,j,k-1,c)) -
-     >                   tz2 * (u(i,j,k+1,2,c)*wp1 - 
-     >                          u(i,j,k-1,2,c)*wm1)
-                   rhs(i,j,k,3,c) = rhs(i,j,k,3,c) + dz3tz1 * 
-     >                   (u(i,j,k+1,3,c) - 2.0d0*u(i,j,k,3,c) + 
-     >                    u(i,j,k-1,3,c)) +
-     >                   zzcon2 * (vs(i,j,k+1,c) - 2.0d0*vs(i,j,k,c) + 
-     >                             vs(i,j,k-1,c)) -
-     >                   tz2 * (u(i,j,k+1,3,c)*wp1 - 
-     >                          u(i,j,k-1,3,c)*wm1)
-                   rhs(i,j,k,4,c) = rhs(i,j,k,4,c) + dz4tz1 * 
-     >                   (u(i,j,k+1,4,c) - 2.0d0*u(i,j,k,4,c) + 
-     >                    u(i,j,k-1,4,c)) +
-     >                   zzcon2*con43 * (wp1 - 2.0d0*wijk + wm1) -
-     >                   tz2 * (u(i,j,k+1,4,c)*wp1 - 
-     >                          u(i,j,k-1,4,c)*wm1 +
-     >                          (u(i,j,k+1,5,c) - square(i,j,k+1,c) - 
-     >                           u(i,j,k-1,5,c) + square(i,j,k-1,c))
-     >                          *c2)
-                   rhs(i,j,k,5,c) = rhs(i,j,k,5,c) + dz5tz1 * 
-     >                   (u(i,j,k+1,5,c) - 2.0d0*u(i,j,k,5,c) + 
-     >                    u(i,j,k-1,5,c)) +
-     >                   zzcon3 * (qs(i,j,k+1,c) - 2.0d0*qs(i,j,k,c) + 
-     >                             qs(i,j,k-1,c)) +
-     >                   zzcon4 * (wp1*wp1 - 2.0d0*wijk*wijk + 
-     >                             wm1*wm1) +
-     >                   zzcon5 * (u(i,j,k+1,5,c)*rho_i(i,j,k+1,c) - 
-     >                             2.0d0*u(i,j,k,5,c)*rho_i(i,j,k,c) +
-     >                             u(i,j,k-1,5,c)*rho_i(i,j,k-1,c)) -
-     >                   tz2 * ( (c1*u(i,j,k+1,5,c) - 
-     >                            c2*square(i,j,k+1,c))*wp1 -
-     >                           (c1*u(i,j,k-1,5,c) - 
-     >                            c2*square(i,j,k-1,c))*wm1)
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c         add fourth order zeta-direction dissipation                
-c---------------------------------------------------------------------
-          if (start(3,c) .gt. 0) then
-             k = 1
-             do     m = 1, 5
-                do     j = start(2,c), cell_size(2,c)-end(2,c)-1
-                   do     i = start(1,c), cell_size(1,c)-end(1,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c)- dssp * 
-     >                    ( 5.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j,k+1,m,c) +
-     >                            u(i,j,k+2,m,c))
-                   end do
-                end do
-             end do
-
-             k = 2
-             do     m = 1, 5
-                do     j = start(2,c), cell_size(2,c)-end(2,c)-1
-                   do     i = start(1,c), cell_size(1,c)-end(1,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * 
-     >                    (-4.0d0*u(i,j,k-1,m,c) + 6.0d0*u(i,j,k,m,c) -
-     >                      4.0d0*u(i,j,k+1,m,c) + u(i,j,k+2,m,c))
-                   end do
-                end do
-             end do
-          endif
-
-          do     m = 1, 5
-             do     k = 3*start(3,c), cell_size(3,c)-3*end(3,c)-1
-                do     j = start(2,c), cell_size(2,c)-end(2,c)-1
-                   do     i = start(1,c),cell_size(1,c)-end(1,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * 
-     >                    (  u(i,j,k-2,m,c) - 4.0d0*u(i,j,k-1,m,c) + 
-     >                     6.0*u(i,j,k,m,c) - 4.0d0*u(i,j,k+1,m,c) + 
-     >                         u(i,j,k+2,m,c) )
-                   end do
-                end do
-             end do
-          end do
- 
-          if (end(3,c) .gt. 0) then
-             k = cell_size(3,c)-3
-             do     m = 1, 5
-                do     j = start(2,c), cell_size(2,c)-end(2,c)-1
-                   do     i = start(1,c), cell_size(1,c)-end(1,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
-     >                    ( u(i,j,k-2,m,c) - 4.0d0*u(i,j,k-1,m,c) + 
-     >                      6.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j,k+1,m,c) )
-                   end do
-                end do
-             end do
-
-             k = cell_size(3,c)-2
-             do     m = 1, 5
-                do     j = start(2,c), cell_size(2,c)-end(2,c)-1
-                   do     i = start(1,c), cell_size(1,c)-end(1,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
-     >                    ( u(i,j,k-2,m,c) - 4.d0*u(i,j,k-1,m,c) +
-     >                      5.d0*u(i,j,k,m,c) )
-                   end do
-                end do
-             end do
-          endif
-
-          do     m = 1, 5
-             do     k = start(3,c), cell_size(3,c)-end(3,c)-1
-                do     j = start(2,c), cell_size(2,c)-end(2,c)-1
-                   do    i = start(1,c), cell_size(1,c)-end(1,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) * dt
-                   end do
-                end do
-             end do
-          end do
-
-       end do
-    
-       return
-       end
-
-
-
-
diff --git a/examples/smpi/NAS/SP/set_constants.f b/examples/smpi/NAS/SP/set_constants.f
deleted file mode 100644
index 63ce72bb9b..0000000000
--- a/examples/smpi/NAS/SP/set_constants.f
+++ /dev/null
@@ -1,203 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine  set_constants
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       include 'header.h'
-  
-       ce(1,1)  = 2.0d0
-       ce(1,2)  = 0.0d0
-       ce(1,3)  = 0.0d0
-       ce(1,4)  = 4.0d0
-       ce(1,5)  = 5.0d0
-       ce(1,6)  = 3.0d0
-       ce(1,7)  = 0.5d0
-       ce(1,8)  = 0.02d0
-       ce(1,9)  = 0.01d0
-       ce(1,10) = 0.03d0
-       ce(1,11) = 0.5d0
-       ce(1,12) = 0.4d0
-       ce(1,13) = 0.3d0
- 
-       ce(2,1)  = 1.0d0
-       ce(2,2)  = 0.0d0
-       ce(2,3)  = 0.0d0
-       ce(2,4)  = 0.0d0
-       ce(2,5)  = 1.0d0
-       ce(2,6)  = 2.0d0
-       ce(2,7)  = 3.0d0
-       ce(2,8)  = 0.01d0
-       ce(2,9)  = 0.03d0
-       ce(2,10) = 0.02d0
-       ce(2,11) = 0.4d0
-       ce(2,12) = 0.3d0
-       ce(2,13) = 0.5d0
-
-       ce(3,1)  = 2.0d0
-       ce(3,2)  = 2.0d0
-       ce(3,3)  = 0.0d0
-       ce(3,4)  = 0.0d0
-       ce(3,5)  = 0.0d0
-       ce(3,6)  = 2.0d0
-       ce(3,7)  = 3.0d0
-       ce(3,8)  = 0.04d0
-       ce(3,9)  = 0.03d0
-       ce(3,10) = 0.05d0
-       ce(3,11) = 0.3d0
-       ce(3,12) = 0.5d0
-       ce(3,13) = 0.4d0
-
-       ce(4,1)  = 2.0d0
-       ce(4,2)  = 2.0d0
-       ce(4,3)  = 0.0d0
-       ce(4,4)  = 0.0d0
-       ce(4,5)  = 0.0d0
-       ce(4,6)  = 2.0d0
-       ce(4,7)  = 3.0d0
-       ce(4,8)  = 0.03d0
-       ce(4,9)  = 0.05d0
-       ce(4,10) = 0.04d0
-       ce(4,11) = 0.2d0
-       ce(4,12) = 0.1d0
-       ce(4,13) = 0.3d0
-
-       ce(5,1)  = 5.0d0
-       ce(5,2)  = 4.0d0
-       ce(5,3)  = 3.0d0
-       ce(5,4)  = 2.0d0
-       ce(5,5)  = 0.1d0
-       ce(5,6)  = 0.4d0
-       ce(5,7)  = 0.3d0
-       ce(5,8)  = 0.05d0
-       ce(5,9)  = 0.04d0
-       ce(5,10) = 0.03d0
-       ce(5,11) = 0.1d0
-       ce(5,12) = 0.3d0
-       ce(5,13) = 0.2d0
-
-       c1 = 1.4d0
-       c2 = 0.4d0
-       c3 = 0.1d0
-       c4 = 1.0d0
-       c5 = 1.4d0
-
-       bt = dsqrt(0.5d0)
-
-       dnxm1 = 1.0d0 / dble(grid_points(1)-1)
-       dnym1 = 1.0d0 / dble(grid_points(2)-1)
-       dnzm1 = 1.0d0 / dble(grid_points(3)-1)
-
-       c1c2 = c1 * c2
-       c1c5 = c1 * c5
-       c3c4 = c3 * c4
-       c1345 = c1c5 * c3c4
-
-       conz1 = (1.0d0-c1c5)
-
-       tx1 = 1.0d0 / (dnxm1 * dnxm1)
-       tx2 = 1.0d0 / (2.0d0 * dnxm1)
-       tx3 = 1.0d0 / dnxm1
-
-       ty1 = 1.0d0 / (dnym1 * dnym1)
-       ty2 = 1.0d0 / (2.0d0 * dnym1)
-       ty3 = 1.0d0 / dnym1
- 
-       tz1 = 1.0d0 / (dnzm1 * dnzm1)
-       tz2 = 1.0d0 / (2.0d0 * dnzm1)
-       tz3 = 1.0d0 / dnzm1
-
-       dx1 = 0.75d0
-       dx2 = 0.75d0
-       dx3 = 0.75d0
-       dx4 = 0.75d0
-       dx5 = 0.75d0
-
-       dy1 = 0.75d0
-       dy2 = 0.75d0
-       dy3 = 0.75d0
-       dy4 = 0.75d0
-       dy5 = 0.75d0
-
-       dz1 = 1.0d0
-       dz2 = 1.0d0
-       dz3 = 1.0d0
-       dz4 = 1.0d0
-       dz5 = 1.0d0
-
-       dxmax = dmax1(dx3, dx4)
-       dymax = dmax1(dy2, dy4)
-       dzmax = dmax1(dz2, dz3)
-
-       dssp = 0.25d0 * dmax1(dx1, dmax1(dy1, dz1) )
-
-       c4dssp = 4.0d0 * dssp
-       c5dssp = 5.0d0 * dssp
-
-       dttx1 = dt*tx1
-       dttx2 = dt*tx2
-       dtty1 = dt*ty1
-       dtty2 = dt*ty2
-       dttz1 = dt*tz1
-       dttz2 = dt*tz2
-
-       c2dttx1 = 2.0d0*dttx1
-       c2dtty1 = 2.0d0*dtty1
-       c2dttz1 = 2.0d0*dttz1
-
-       dtdssp = dt*dssp
-
-       comz1  = dtdssp
-       comz4  = 4.0d0*dtdssp
-       comz5  = 5.0d0*dtdssp
-       comz6  = 6.0d0*dtdssp
-
-       c3c4tx3 = c3c4*tx3
-       c3c4ty3 = c3c4*ty3
-       c3c4tz3 = c3c4*tz3
-
-       dx1tx1 = dx1*tx1
-       dx2tx1 = dx2*tx1
-       dx3tx1 = dx3*tx1
-       dx4tx1 = dx4*tx1
-       dx5tx1 = dx5*tx1
-        
-       dy1ty1 = dy1*ty1
-       dy2ty1 = dy2*ty1
-       dy3ty1 = dy3*ty1
-       dy4ty1 = dy4*ty1
-       dy5ty1 = dy5*ty1
-        
-       dz1tz1 = dz1*tz1
-       dz2tz1 = dz2*tz1
-       dz3tz1 = dz3*tz1
-       dz4tz1 = dz4*tz1
-       dz5tz1 = dz5*tz1
-
-       c2iv  = 2.5d0
-       con43 = 4.0d0/3.0d0
-       con16 = 1.0d0/6.0d0
-        
-       xxcon1 = c3c4tx3*con43*tx3
-       xxcon2 = c3c4tx3*tx3
-       xxcon3 = c3c4tx3*conz1*tx3
-       xxcon4 = c3c4tx3*con16*tx3
-       xxcon5 = c3c4tx3*c1c5*tx3
-
-       yycon1 = c3c4ty3*con43*ty3
-       yycon2 = c3c4ty3*ty3
-       yycon3 = c3c4ty3*conz1*ty3
-       yycon4 = c3c4ty3*con16*ty3
-       yycon5 = c3c4ty3*c1c5*ty3
-
-       zzcon1 = c3c4tz3*con43*tz3
-       zzcon2 = c3c4tz3*tz3
-       zzcon3 = c3c4tz3*conz1*tz3
-       zzcon4 = c3c4tz3*con16*tz3
-       zzcon5 = c3c4tz3*c1c5*tz3
-
-       return
-       end
diff --git a/examples/smpi/NAS/SP/setup_mpi.f b/examples/smpi/NAS/SP/setup_mpi.f
deleted file mode 100644
index 2d98f7dd02..0000000000
--- a/examples/smpi/NAS/SP/setup_mpi.f
+++ /dev/null
@@ -1,65 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      subroutine setup_mpi
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c set up MPI stuff
-c---------------------------------------------------------------------
-
-      implicit none
-      include 'mpinpb.h'
-      include 'npbparams.h'
-      integer error, nc, color
-
-      call mpi_init(error)
-      
-      call mpi_comm_size(MPI_COMM_WORLD, total_nodes, error)
-      call mpi_comm_rank(MPI_COMM_WORLD, node, error)
-
-      if (.not. convertdouble) then
-         dp_type = MPI_DOUBLE_PRECISION
-      else
-         dp_type = MPI_REAL
-      endif
-
-c---------------------------------------------------------------------
-c     compute square root; add small number to allow for roundoff
-c---------------------------------------------------------------------
-      nc = dint(dsqrt(dble(total_nodes) + 0.00001d0))
-
-c---------------------------------------------------------------------
-c We handle a non-square number of nodes by making the excess nodes
-c inactive. However, we can never handle more cells than were compiled
-c in. 
-c---------------------------------------------------------------------
-
-      if (nc .gt. maxcells) nc = maxcells
-
-      if (node .ge. nc*nc) then
-         active = .false.
-         color = 1
-      else
-         active = .true.
-         color = 0
-      end if
-      
-      call mpi_comm_split(MPI_COMM_WORLD,color,node,comm_setup,error)
-      if (.not. active) return
-
-      call mpi_comm_size(comm_setup, no_nodes, error)
-      call mpi_comm_dup(comm_setup, comm_solve, error)
-      call mpi_comm_dup(comm_setup, comm_rhs, error)
-      
-c---------------------------------------------------------------------
-c     let node 0 be the root for the group (there is only one)
-c---------------------------------------------------------------------
-      root = 0
-
-      return
-      end
-
diff --git a/examples/smpi/NAS/SP/sp.f b/examples/smpi/NAS/SP/sp.f
deleted file mode 100644
index 740cadee46..0000000000
--- a/examples/smpi/NAS/SP/sp.f
+++ /dev/null
@@ -1,194 +0,0 @@
-!-------------------------------------------------------------------------!
-!                                                                         !
-!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
-!                                                                         !
-!                                   S P                                   !
-!                                                                         !
-!-------------------------------------------------------------------------!
-!                                                                         !
-!    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
-!    It is described in NAS Technical Reports 95-020 and 02-007           !
-!                                                                         !
-!    Permission to use, copy, distribute and modify this software         !
-!    for any purpose with or without fee is hereby granted.  We           !
-!    request, however, that all derived work reference the NAS            !
-!    Parallel Benchmarks 3.3. This software is provided "as is"           !
-!    without express or implied warranty.                                 !
-!                                                                         !
-!    Information on NPB 3.3, including the technical report, the          !
-!    original specifications, source code, results and information        !
-!    on how to submit new results, is available at:                       !
-!                                                                         !
-!           http://www.nas.nasa.gov/Software/NPB/                         !
-!                                                                         !
-!    Send comments or suggestions to  npb@nas.nasa.gov                    !
-!                                                                         !
-!          NAS Parallel Benchmarks Group                                  !
-!          NASA Ames Research Center                                      !
-!          Mail Stop: T27A-1                                              !
-!          Moffett Field, CA   94035-1000                                 !
-!                                                                         !
-!          E-mail:  npb@nas.nasa.gov                                      !
-!          Fax:     (650) 604-3957                                        !
-!                                                                         !
-!-------------------------------------------------------------------------!
-
-
-c---------------------------------------------------------------------
-c
-c Authors: R. F. Van der Wijngaart
-c          W. Saphir
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-       program MPSP
-c---------------------------------------------------------------------
-
-       include  'header.h'
-       include  'mpinpb.h'
-      
-       integer          i, niter, step, c, error, fstatus
-       external timer_read
-       double precision mflops, t, tmax, timer_read
-       logical          verified
-       character        class
-
-       call setup_mpi
-       if (.not. active) goto 999
-
-c---------------------------------------------------------------------
-c      Root node reads input file (if it exists) else takes
-c      defaults from parameters
-c---------------------------------------------------------------------
-       if (node .eq. root) then
-          
-          write(*, 1000)
-          open (unit=2,file='inputsp.data',status='old', iostat=fstatus)
-c
-          if (fstatus .eq. 0) then
-            write(*,233) 
- 233        format(' Reading from input file inputsp.data')
-            read (2,*) niter
-            read (2,*) dt
-            read (2,*) grid_points(1), grid_points(2), grid_points(3)
-            close(2)
-          else
-            write(*,234) 
-            niter = niter_default
-            dt    = dt_default
-            grid_points(1) = problem_size
-            grid_points(2) = problem_size
-            grid_points(3) = problem_size
-          endif
- 234      format(' No input file inputsp.data. Using compiled defaults')
-
-          write(*, 1001) grid_points(1), grid_points(2), grid_points(3)
-          write(*, 1002) niter, dt
-          if (no_nodes .ne. total_nodes) write(*, 1004) total_nodes
-          if (no_nodes .ne. maxcells*maxcells) 
-     >        write(*, 1005) maxcells*maxcells
-          write(*, 1003) no_nodes
-
- 1000 format(//,' NAS Parallel Benchmarks 3.3 -- SP Benchmark',/)
- 1001     format(' Size: ', i4, 'x', i4, 'x', i4)
- 1002     format(' Iterations: ', i4, '    dt: ', F11.7)
- 1004     format(' Total number of processes: ', i5)
- 1005     format(' WARNING: compiled for ', i5, ' processes ')
- 1003     format(' Number of active processes: ', i5, /)
-
-       endif
-
-       call mpi_bcast(niter, 1, MPI_INTEGER, 
-     >                root, comm_setup, error)
-
-       call mpi_bcast(dt, 1, dp_type, 
-     >                root, comm_setup, error)
-
-       call mpi_bcast(grid_points(1), 3, MPI_INTEGER, 
-     >                root, comm_setup, error)
-
-
-       call make_set
-
-       do  c = 1, ncells
-          if ( (cell_size(1,c) .gt. IMAX) .or.
-     >         (cell_size(2,c) .gt. JMAX) .or.
-     >         (cell_size(3,c) .gt. KMAX) ) then
-             print *,node, c, (cell_size(i,c),i=1,3)
-             print *,' Problem size too big for compiled array sizes'
-             goto 999
-          endif
-       end do
-
-       call set_constants
-
-       call initialize
-
-c       call mpi_finalize(error)
-c       stop
-
-       call lhsinit
-
-       call exact_rhs
-
-       call compute_buffer_size(5)
-
-c---------------------------------------------------------------------
-c      do one time step to touch all code, and reinitialize
-c---------------------------------------------------------------------
-       call adi
-       call initialize
-
-c---------------------------------------------------------------------
-c      Synchronize before placing time stamp
-c---------------------------------------------------------------------
-       call mpi_barrier(comm_setup, error)
-
-       call timer_clear(1)
-       call timer_start(1)
-
-       do  step = 1, niter
-
-          if (node .eq. root) then
-             if (mod(step, 20) .eq. 0 .or. 
-     >           step .eq. 1) then
-                write(*, 200) step
- 200            format(' Time step ', i4)
-              endif
-          endif
-
-          call adi
-
-       end do
-
-       call timer_stop(1)
-       t = timer_read(1)
-       
-       call verify(niter, class, verified)
-
-       call mpi_reduce(t, tmax, 1, 
-     >                 dp_type, MPI_MAX, 
-     >                 root, comm_setup, error)
-
-       if( node .eq. root ) then
-          if( tmax .ne. 0. ) then
-             mflops = (881.174*float( problem_size )**3
-     >                -4683.91*float( problem_size )**2
-     >                +11484.5*float( problem_size )
-     >                -19272.4) * float( niter ) / (tmax*1000000.0d0)
-          else
-             mflops = 0.0
-          endif
-
-         call print_results('SP', class, grid_points(1), 
-     >     grid_points(2), grid_points(3), niter, maxcells*maxcells, 
-     >     total_nodes, tmax, mflops, '          floating point', 
-     >     verified, npbversion,compiletime, cs1, cs2, cs3, cs4, cs5, 
-     >     cs6, '(none)')
-       endif
-
- 999   continue
-       call mpi_barrier(MPI_COMM_WORLD, error)
-       call mpi_finalize(error)
-
-       end
diff --git a/examples/smpi/NAS/SP/txinvr.f b/examples/smpi/NAS/SP/txinvr.f
deleted file mode 100644
index b5ca4616f5..0000000000
--- a/examples/smpi/NAS/SP/txinvr.f
+++ /dev/null
@@ -1,59 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine  txinvr
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c block-diagonal matrix-vector multiplication                  
-c---------------------------------------------------------------------
-
-       include 'header.h'
-
-       integer c, i, j, k
-       double precision t1, t2, t3, ac, ru1, uu, vv, ww, r1, r2, r3, 
-     >                  r4, r5, ac2inv
-
-c---------------------------------------------------------------------
-c      loop over all cells owned by this node          
-c---------------------------------------------------------------------
-       do   c = 1, ncells
-          do    k = start(3,c), cell_size(3,c)-end(3,c)-1
-             do    j = start(2,c), cell_size(2,c)-end(2,c)-1
-                do    i = start(1,c), cell_size(1,c)-end(1,c)-1
-
-                   ru1 = rho_i(i,j,k,c)
-                   uu = us(i,j,k,c)
-                   vv = vs(i,j,k,c)
-                   ww = ws(i,j,k,c)
-                   ac = speed(i,j,k,c)
-                   ac2inv = ainv(i,j,k,c)*ainv(i,j,k,c)
-
-                   r1 = rhs(i,j,k,1,c)
-                   r2 = rhs(i,j,k,2,c)
-                   r3 = rhs(i,j,k,3,c)
-                   r4 = rhs(i,j,k,4,c)
-                   r5 = rhs(i,j,k,5,c)
-
-                   t1 = c2 * ac2inv * ( qs(i,j,k,c)*r1 - uu*r2  - 
-     >                  vv*r3 - ww*r4 + r5 )
-                   t2 = bt * ru1 * ( uu * r1 - r2 )
-                   t3 = ( bt * ru1 * ac ) * t1
-
-                   rhs(i,j,k,1,c) = r1 - t1
-                   rhs(i,j,k,2,c) = - ru1 * ( ww*r1 - r4 )
-                   rhs(i,j,k,3,c) =   ru1 * ( vv*r1 - r3 )
-                   rhs(i,j,k,4,c) = - t2 + t3
-                   rhs(i,j,k,5,c) =   t2 + t3
-                end do
-             end do
-          end do
-       end do
-
-       return
-       end
-
-
diff --git a/examples/smpi/NAS/SP/tzetar.f b/examples/smpi/NAS/SP/tzetar.f
deleted file mode 100644
index 554066d6fc..0000000000
--- a/examples/smpi/NAS/SP/tzetar.f
+++ /dev/null
@@ -1,60 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine  tzetar(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c   block-diagonal matrix-vector multiplication                       
-c---------------------------------------------------------------------
-
-       include 'header.h'
-
-       integer i, j, k, c
-       double precision  t1, t2, t3, ac, xvel, yvel, zvel, r1, r2, r3, 
-     >                   r4, r5, btuz, acinv, ac2u, uzik1
-
-c---------------------------------------------------------------------
-c      treat only one cell                                             
-c---------------------------------------------------------------------
-       do    k = start(3,c), cell_size(3,c)-end(3,c)-1
-          do    j = start(2,c), cell_size(2,c)-end(2,c)-1
-             do    i = start(1,c), cell_size(1,c)-end(1,c)-1
-
-                xvel = us(i,j,k,c)
-                yvel = vs(i,j,k,c)
-                zvel = ws(i,j,k,c)
-                ac   = speed(i,j,k,c)
-                acinv = ainv(i,j,k,c)
-
-                ac2u = ac*ac
-
-                r1 = rhs(i,j,k,1,c)
-                r2 = rhs(i,j,k,2,c)
-                r3 = rhs(i,j,k,3,c)
-                r4 = rhs(i,j,k,4,c)
-                r5 = rhs(i,j,k,5,c)      
-
-                uzik1 = u(i,j,k,1,c)
-                btuz  = bt * uzik1
-
-                t1 = btuz*acinv * (r4 + r5)
-                t2 = r3 + t1
-                t3 = btuz * (r4 - r5)
-
-                rhs(i,j,k,1,c) = t2
-                rhs(i,j,k,2,c) = -uzik1*r2 + xvel*t2
-                rhs(i,j,k,3,c) =  uzik1*r1 + yvel*t2
-                rhs(i,j,k,4,c) =  zvel*t2  + t3
-                rhs(i,j,k,5,c) =  uzik1*(-xvel*r2 + yvel*r1) + 
-     >                    qs(i,j,k,c)*t2 + c2iv*ac2u*t1 + zvel*t3
-
-             end do
-          end do
-       end do
-
-       return
-       end
diff --git a/examples/smpi/NAS/SP/verify.f b/examples/smpi/NAS/SP/verify.f
deleted file mode 100644
index 08be79c8a6..0000000000
--- a/examples/smpi/NAS/SP/verify.f
+++ /dev/null
@@ -1,358 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-        subroutine verify(no_time_steps, class, verified)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c  verification routine                         
-c---------------------------------------------------------------------
-
-        include 'header.h'
-        include 'mpinpb.h'
-
-        double precision xcrref(5),xceref(5),xcrdif(5),xcedif(5), 
-     >                   epsilon, xce(5), xcr(5), dtref
-        integer m, no_time_steps
-        character class
-        logical verified
-
-c---------------------------------------------------------------------
-c   tolerance level
-c---------------------------------------------------------------------
-        epsilon = 1.0d-08
-
-
-c---------------------------------------------------------------------
-c   compute the error norm and the residual norm, and exit if not printing
-c---------------------------------------------------------------------
-        call error_norm(xce)
-        call copy_faces
-
-        call rhs_norm(xcr)
-
-        do m = 1, 5
-           xcr(m) = xcr(m) / dt
-        enddo
-
-        if (node .ne. 0) return
-
-        class = 'U'
-        verified = .true.
-
-        do m = 1,5
-           xcrref(m) = 1.0
-           xceref(m) = 1.0
-        end do
-
-c---------------------------------------------------------------------
-c    reference data for 12X12X12 grids after 100 time steps, with DT = 1.50d-02
-c---------------------------------------------------------------------
-        if ( (grid_points(1)  .eq. 12     ) .and. 
-     >       (grid_points(2)  .eq. 12     ) .and.
-     >       (grid_points(3)  .eq. 12     ) .and.
-     >       (no_time_steps   .eq. 100    ))  then
-
-           class = 'S'
-           dtref = 1.5d-2
-
-c---------------------------------------------------------------------
-c    Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
-           xcrref(1) = 2.7470315451339479d-02
-           xcrref(2) = 1.0360746705285417d-02
-           xcrref(3) = 1.6235745065095532d-02
-           xcrref(4) = 1.5840557224455615d-02
-           xcrref(5) = 3.4849040609362460d-02
-
-c---------------------------------------------------------------------
-c    Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-           xceref(1) = 2.7289258557377227d-05
-           xceref(2) = 1.0364446640837285d-05
-           xceref(3) = 1.6154798287166471d-05
-           xceref(4) = 1.5750704994480102d-05
-           xceref(5) = 3.4177666183390531d-05
-
-
-c---------------------------------------------------------------------
-c    reference data for 36X36X36 grids after 400 time steps, with DT = 1.5d-03
-c---------------------------------------------------------------------
-        elseif ( (grid_points(1) .eq. 36) .and. 
-     >           (grid_points(2) .eq. 36) .and.
-     >           (grid_points(3) .eq. 36) .and.
-     >           (no_time_steps . eq. 400) ) then
-
-           class = 'W'
-           dtref = 1.5d-3
-
-c---------------------------------------------------------------------
-c    Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
-           xcrref(1) = 0.1893253733584d-02
-           xcrref(2) = 0.1717075447775d-03
-           xcrref(3) = 0.2778153350936d-03
-           xcrref(4) = 0.2887475409984d-03
-           xcrref(5) = 0.3143611161242d-02
-
-c---------------------------------------------------------------------
-c    Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-           xceref(1) = 0.7542088599534d-04
-           xceref(2) = 0.6512852253086d-05
-           xceref(3) = 0.1049092285688d-04
-           xceref(4) = 0.1128838671535d-04
-           xceref(5) = 0.1212845639773d-03
-
-c---------------------------------------------------------------------
-c    reference data for 64X64X64 grids after 400 time steps, with DT = 1.5d-03
-c---------------------------------------------------------------------
-        elseif ( (grid_points(1) .eq. 64) .and. 
-     >           (grid_points(2) .eq. 64) .and.
-     >           (grid_points(3) .eq. 64) .and.
-     >           (no_time_steps . eq. 400) ) then
-
-           class = 'A'
-           dtref = 1.5d-3
-
-c---------------------------------------------------------------------
-c    Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
-           xcrref(1) = 2.4799822399300195d0
-           xcrref(2) = 1.1276337964368832d0
-           xcrref(3) = 1.5028977888770491d0
-           xcrref(4) = 1.4217816211695179d0
-           xcrref(5) = 2.1292113035138280d0
-
-c---------------------------------------------------------------------
-c    Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-           xceref(1) = 1.0900140297820550d-04
-           xceref(2) = 3.7343951769282091d-05
-           xceref(3) = 5.0092785406541633d-05
-           xceref(4) = 4.7671093939528255d-05
-           xceref(5) = 1.3621613399213001d-04
-
-c---------------------------------------------------------------------
-c    reference data for 102X102X102 grids after 400 time steps,
-c    with DT = 1.0d-03
-c---------------------------------------------------------------------
-        elseif ( (grid_points(1) .eq. 102) .and. 
-     >           (grid_points(2) .eq. 102) .and.
-     >           (grid_points(3) .eq. 102) .and.
-     >           (no_time_steps . eq. 400) ) then
-
-           class = 'B'
-           dtref = 1.0d-3
-
-c---------------------------------------------------------------------
-c    Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
-           xcrref(1) = 0.6903293579998d+02
-           xcrref(2) = 0.3095134488084d+02
-           xcrref(3) = 0.4103336647017d+02
-           xcrref(4) = 0.3864769009604d+02
-           xcrref(5) = 0.5643482272596d+02
-
-c---------------------------------------------------------------------
-c    Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-           xceref(1) = 0.9810006190188d-02
-           xceref(2) = 0.1022827905670d-02
-           xceref(3) = 0.1720597911692d-02
-           xceref(4) = 0.1694479428231d-02
-           xceref(5) = 0.1847456263981d-01
-
-c---------------------------------------------------------------------
-c    reference data for 162X162X162 grids after 400 time steps,
-c    with DT = 0.67d-03
-c---------------------------------------------------------------------
-        elseif ( (grid_points(1) .eq. 162) .and. 
-     >           (grid_points(2) .eq. 162) .and.
-     >           (grid_points(3) .eq. 162) .and.
-     >           (no_time_steps . eq. 400) ) then
-
-           class = 'C'
-           dtref = 0.67d-3
-
-c---------------------------------------------------------------------
-c    Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
-           xcrref(1) = 0.5881691581829d+03
-           xcrref(2) = 0.2454417603569d+03
-           xcrref(3) = 0.3293829191851d+03
-           xcrref(4) = 0.3081924971891d+03
-           xcrref(5) = 0.4597223799176d+03
-
-c---------------------------------------------------------------------
-c    Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-           xceref(1) = 0.2598120500183d+00
-           xceref(2) = 0.2590888922315d-01
-           xceref(3) = 0.5132886416320d-01
-           xceref(4) = 0.4806073419454d-01
-           xceref(5) = 0.5483377491301d+00
-
-c---------------------------------------------------------------------
-c    reference data for 408X408X408 grids after 500 time steps,
-c    with DT = 0.3d-03
-c---------------------------------------------------------------------
-        elseif ( (grid_points(1) .eq. 408) .and. 
-     >           (grid_points(2) .eq. 408) .and.
-     >           (grid_points(3) .eq. 408) .and.
-     >           (no_time_steps . eq. 500) ) then
-
-           class = 'D'
-           dtref = 0.30d-3
-
-c---------------------------------------------------------------------
-c    Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
-           xcrref(1) = 0.1044696216887d+05
-           xcrref(2) = 0.3204427762578d+04
-           xcrref(3) = 0.4648680733032d+04
-           xcrref(4) = 0.4238923283697d+04
-           xcrref(5) = 0.7588412036136d+04
-
-c---------------------------------------------------------------------
-c    Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-           xceref(1) = 0.5089471423669d+01
-           xceref(2) = 0.5323514855894d+00
-           xceref(3) = 0.1187051008971d+01
-           xceref(4) = 0.1083734951938d+01
-           xceref(5) = 0.1164108338568d+02
-
-c---------------------------------------------------------------------
-c    reference data for 1020X1020X1020 grids after 500 time steps,
-c    with DT = 0.1d-03
-c---------------------------------------------------------------------
-        elseif ( (grid_points(1) .eq. 1020) .and. 
-     >           (grid_points(2) .eq. 1020) .and.
-     >           (grid_points(3) .eq. 1020) .and.
-     >           (no_time_steps . eq. 500) ) then
-
-           class = 'E'
-           dtref = 0.10d-3
-
-c---------------------------------------------------------------------
-c    Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
-           xcrref(1) = 0.6255387422609d+05
-           xcrref(2) = 0.1495317020012d+05
-           xcrref(3) = 0.2347595750586d+05
-           xcrref(4) = 0.2091099783534d+05
-           xcrref(5) = 0.4770412841218d+05
-
-c---------------------------------------------------------------------
-c    Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-           xceref(1) = 0.6742735164909d+02
-           xceref(2) = 0.5390656036938d+01
-           xceref(3) = 0.1680647196477d+02
-           xceref(4) = 0.1536963126457d+02
-           xceref(5) = 0.1575330146156d+03
-
-        else
-           verified = .false.
-        endif
-
-c---------------------------------------------------------------------
-c    verification test for residuals if gridsize is one of 
-c    the defined grid sizes above (class .ne. 'U')
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c    Compute the difference of solution values and the known reference values.
-c---------------------------------------------------------------------
-        do m = 1, 5
-           
-           xcrdif(m) = dabs((xcr(m)-xcrref(m))/xcrref(m)) 
-           xcedif(m) = dabs((xce(m)-xceref(m))/xceref(m))
-           
-        enddo
-
-c---------------------------------------------------------------------
-c    Output the comparison of computed results to known cases.
-c---------------------------------------------------------------------
-
-        if (class .ne. 'U') then
-           write(*, 1990) class
- 1990      format(' Verification being performed for class ', a)
-           write (*,2000) epsilon
- 2000      format(' accuracy setting for epsilon = ', E20.13)
-           verified = (dabs(dt-dtref) .le. epsilon)
-           if (.not.verified) then  
-              class = 'U'
-              write (*,1000) dtref
- 1000         format(' DT does not match the reference value of ', 
-     >                 E15.8)
-           endif
-        else 
-           write(*, 1995)
- 1995      format(' Unknown class')
-        endif
-
-
-        if (class .ne. 'U') then
-           write (*,2001) 
-        else
-           write (*, 2005)
-        endif
-
- 2001   format(' Comparison of RMS-norms of residual')
- 2005   format(' RMS-norms of residual')
-        do m = 1, 5
-           if (class .eq. 'U') then
-              write(*, 2015) m, xcr(m)
-           else if (xcrdif(m) .le. epsilon) then
-              write (*,2011) m,xcr(m),xcrref(m),xcrdif(m)
-           else 
-              verified = .false.
-              write (*,2010) m,xcr(m),xcrref(m),xcrdif(m)
-           endif
-        enddo
-
-        if (class .ne. 'U') then
-           write (*,2002)
-        else
-           write (*,2006)
-        endif
- 2002   format(' Comparison of RMS-norms of solution error')
- 2006   format(' RMS-norms of solution error')
-        
-        do m = 1, 5
-           if (class .eq. 'U') then
-              write(*, 2015) m, xce(m)
-           else if (xcedif(m) .le. epsilon) then
-              write (*,2011) m,xce(m),xceref(m),xcedif(m)
-           else
-              verified = .false.
-              write (*,2010) m,xce(m),xceref(m),xcedif(m)
-           endif
-        enddo
-        
- 2010   format(' FAILURE: ', i2, E20.13, E20.13, E20.13)
- 2011   format('          ', i2, E20.13, E20.13, E20.13)
- 2015   format('          ', i2, E20.13)
-        
-        if (class .eq. 'U') then
-           write(*, 2022)
-           write(*, 2023)
- 2022      format(' No reference values provided')
- 2023      format(' No verification performed')
-        else if (verified) then
-           write(*, 2020)
- 2020      format(' Verification Successful')
-        else
-           write(*, 2021)
- 2021      format(' Verification failed')
-        endif
-
-        return
-
-
-        end
diff --git a/examples/smpi/NAS/SP/x_solve.f b/examples/smpi/NAS/SP/x_solve.f
deleted file mode 100644
index cd40756ec3..0000000000
--- a/examples/smpi/NAS/SP/x_solve.f
+++ /dev/null
@@ -1,545 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine x_solve
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c this function performs the solution of the approximate factorization
-c step in the x-direction for all five matrix components
-c simultaneously. The Thomas algorithm is employed to solve the
-c systems for the x-lines. Boundary conditions are non-periodic
-c---------------------------------------------------------------------
-
-       include 'header.h'
-       include 'mpinpb.h'
-
-
-       integer i, j, k, jp, kp, n, iend, jsize, ksize, i1, i2,
-     >         buffer_size, c, m, p, istart, stage, error,
-     >         requests(2), statuses(MPI_STATUS_SIZE, 2)
-       double precision  r1, r2, d, e, s(5), sm1, sm2,
-     >                   fac1, fac2
-
-
-
-c---------------------------------------------------------------------
-c      OK, now we know that there are multiple processors
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
-c on this node in the direction of increasing i for the forward sweep,
-c and after that reversing the direction for the backsubstitution.
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c                          FORWARD ELIMINATION  
-c---------------------------------------------------------------------
-       do    stage = 1, ncells
-          c         = slice(1,stage)
-
-          istart = 0
-          iend   = cell_size(1,c)-1
-
-          jsize     = cell_size(2,c)
-          ksize     = cell_size(3,c)
-          jp        = cell_coord(2,c)-1
-          kp        = cell_coord(3,c)-1
-
-          buffer_size = (jsize-start(2,c)-end(2,c)) * 
-     >                  (ksize-start(3,c)-end(3,c))
-
-          if ( stage .ne. 1) then
-
-c---------------------------------------------------------------------
-c            if this is not the first processor in this row of cells, 
-c            receive data from predecessor containing the right hand
-c            sides and the upper diagonal elements of the previous two rows
-c---------------------------------------------------------------------
-             call mpi_irecv(in_buffer, 22*buffer_size, 
-     >                      dp_type, predecessor(1), 
-     >                      DEFAULT_TAG,  comm_solve, 
-     >                      requests(1), error)
-
-
-c---------------------------------------------------------------------
-c            communication has already been started. 
-c            compute the left hand side while waiting for the msg
-c---------------------------------------------------------------------
-             call lhsx(c)
-
-c---------------------------------------------------------------------
-c            wait for pending communication to complete
-c            This waits on the current receive and on the send
-c            from the previous stage. They always come in pairs. 
-c---------------------------------------------------------------------
-
-             call mpi_waitall(2, requests, statuses, error)
-
-c---------------------------------------------------------------------
-c            unpack the buffer                                 
-c---------------------------------------------------------------------
-             i  = istart
-             i1 = istart + 1
-             n = 0
-
-c---------------------------------------------------------------------
-c            create a running pointer
-c---------------------------------------------------------------------
-             p = 0
-             do    k = start(3,c), ksize-end(3,c)-1
-                do    j = start(2,c), jsize-end(2,c)-1
-                   lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
-     >                       in_buffer(p+1) * lhs(i,j,k,n+1,c)
-                   lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
-     >                       in_buffer(p+2) * lhs(i,j,k,n+1,c)
-                   do    m = 1, 3
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
-     >                       in_buffer(p+2+m) * lhs(i,j,k,n+1,c)
-                   end do
-                   d            = in_buffer(p+6)
-                   e            = in_buffer(p+7)
-                   do    m = 1, 3
-                      s(m) = in_buffer(p+7+m)
-                   end do
-                   r1 = lhs(i,j,k,n+2,c)
-                   lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
-                   lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
-                   do    m = 1, 3
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
-                   end do
-                   r2 = lhs(i1,j,k,n+1,c)
-                   lhs(i1,j,k,n+2,c) = lhs(i1,j,k,n+2,c) - d * r2
-                   lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - e * r2
-                   do    m = 1, 3
-                      rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - s(m) * r2
-                   end do
-                   p = p + 10
-                end do
-             end do
-
-             do    m = 4, 5
-                n = (m-3)*5
-                do    k = start(3,c), ksize-end(3,c)-1
-                   do    j = start(2,c), jsize-end(2,c)-1
-                      lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
-     >                          in_buffer(p+1) * lhs(i,j,k,n+1,c)
-                      lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
-     >                          in_buffer(p+2) * lhs(i,j,k,n+1,c)
-                      rhs(i,j,k,m,c)   = rhs(i,j,k,m,c) -
-     >                          in_buffer(p+3) * lhs(i,j,k,n+1,c)
-                      d                = in_buffer(p+4)
-                      e                = in_buffer(p+5)
-                      s(m)             = in_buffer(p+6)
-                      r1 = lhs(i,j,k,n+2,c)
-                      lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
-                      lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
-                      rhs(i,j,k,m,c)   = rhs(i,j,k,m,c) - s(m) * r1
-                      r2 = lhs(i1,j,k,n+1,c)
-                      lhs(i1,j,k,n+2,c) = lhs(i1,j,k,n+2,c) - d * r2
-                      lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - e * r2
-                      rhs(i1,j,k,m,c)   = rhs(i1,j,k,m,c) - s(m) * r2
-                      p = p + 6
-                   end do
-                end do
-             end do
-
-          else            
-
-c---------------------------------------------------------------------
-c            if this IS the first cell, we still compute the lhs
-c---------------------------------------------------------------------
-             call lhsx(c)
-          endif
-
-c---------------------------------------------------------------------
-c         perform the Thomas algorithm; first, FORWARD ELIMINATION     
-c---------------------------------------------------------------------
-          n = 0
-
-          do    k = start(3,c), ksize-end(3,c)-1
-             do    j = start(2,c), jsize-end(2,c)-1
-                do    i = istart, iend-2
-                   i1 = i  + 1
-                   i2 = i  + 2
-                   fac1               = 1.d0/lhs(i,j,k,n+3,c)
-                   lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
-                   lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
-                   do    m = 1, 3
-                      rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
-                   end do
-                   lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
-     >                         lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
-                   lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
-     >                         lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
-                   do    m = 1, 3
-                      rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
-     >                         lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
-                   end do
-                   lhs(i2,j,k,n+2,c) = lhs(i2,j,k,n+2,c) -
-     >                         lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+4,c)
-                   lhs(i2,j,k,n+3,c) = lhs(i2,j,k,n+3,c) -
-     >                         lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+5,c)
-                   do    m = 1, 3
-                      rhs(i2,j,k,m,c) = rhs(i2,j,k,m,c) -
-     >                         lhs(i2,j,k,n+1,c)*rhs(i,j,k,m,c)
-                   end do
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c         The last two rows in this grid block are a bit different, 
-c         since they do not have two more rows available for the
-c         elimination of off-diagonal entries
-c---------------------------------------------------------------------
-
-          i  = iend - 1
-          i1 = iend
-          do    k = start(3,c), ksize-end(3,c)-1
-             do    j = start(2,c), jsize-end(2,c)-1
-                fac1               = 1.d0/lhs(i,j,k,n+3,c)
-                lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
-                lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
-                do    m = 1, 3
-                   rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
-                end do
-                lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
-     >                      lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
-                lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
-     >                      lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
-                do    m = 1, 3
-                   rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
-     >                      lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
-                end do
-c---------------------------------------------------------------------
-c               scale the last row immediately (some of this is
-c               overkill in case this is the last cell)
-c---------------------------------------------------------------------
-                fac2               = 1.d0/lhs(i1,j,k,n+3,c)
-                lhs(i1,j,k,n+4,c) = fac2*lhs(i1,j,k,n+4,c)
-                lhs(i1,j,k,n+5,c) = fac2*lhs(i1,j,k,n+5,c)  
-                do    m = 1, 3
-                   rhs(i1,j,k,m,c) = fac2*rhs(i1,j,k,m,c)
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c         do the u+c and the u-c factors                 
-c---------------------------------------------------------------------
-
-          do    m = 4, 5
-             n = (m-3)*5
-             do    k = start(3,c), ksize-end(3,c)-1
-                do    j = start(2,c), jsize-end(2,c)-1
-                   do    i = istart, iend-2
-                   i1 = i  + 1
-                   i2 = i  + 2
-                   fac1               = 1.d0/lhs(i,j,k,n+3,c)
-                   lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
-                   lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
-                   rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
-                   lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
-     >                         lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
-                   lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
-     >                         lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
-                   rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
-     >                         lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
-                   lhs(i2,j,k,n+2,c) = lhs(i2,j,k,n+2,c) -
-     >                         lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+4,c)
-                   lhs(i2,j,k,n+3,c) = lhs(i2,j,k,n+3,c) -
-     >                         lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+5,c)
-                   rhs(i2,j,k,m,c) = rhs(i2,j,k,m,c) -
-     >                         lhs(i2,j,k,n+1,c)*rhs(i,j,k,m,c)
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c            And again the last two rows separately
-c---------------------------------------------------------------------
-             i  = iend - 1
-             i1 = iend
-             do    k = start(3,c), ksize-end(3,c)-1
-                do    j = start(2,c), jsize-end(2,c)-1
-                fac1               = 1.d0/lhs(i,j,k,n+3,c)
-                lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
-                lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
-                rhs(i,j,k,m,c)     = fac1*rhs(i,j,k,m,c)
-                lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
-     >                      lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
-                lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
-     >                      lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
-                rhs(i1,j,k,m,c)   = rhs(i1,j,k,m,c) -
-     >                      lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
-c---------------------------------------------------------------------
-c               Scale the last row immediately (some of this is overkill
-c               if this is the last cell)
-c---------------------------------------------------------------------
-                fac2               = 1.d0/lhs(i1,j,k,n+3,c)
-                lhs(i1,j,k,n+4,c) = fac2*lhs(i1,j,k,n+4,c)
-                lhs(i1,j,k,n+5,c) = fac2*lhs(i1,j,k,n+5,c)
-                rhs(i1,j,k,m,c)   = fac2*rhs(i1,j,k,m,c)
-
-             end do
-          end do
-       end do
-
-c---------------------------------------------------------------------
-c         send information to the next processor, except when this
-c         is the last grid block
-c---------------------------------------------------------------------
-          if (stage .ne. ncells) then
-
-c---------------------------------------------------------------------
-c            create a running pointer for the send buffer  
-c---------------------------------------------------------------------
-             p = 0
-             n = 0
-             do    k = start(3,c), ksize-end(3,c)-1
-                do    j = start(2,c), jsize-end(2,c)-1
-                   do    i = iend-1, iend
-                      out_buffer(p+1) = lhs(i,j,k,n+4,c)
-                      out_buffer(p+2) = lhs(i,j,k,n+5,c)
-                      do    m = 1, 3
-                         out_buffer(p+2+m) = rhs(i,j,k,m,c)
-                      end do
-                      p = p+5
-                   end do
-                end do
-             end do
-
-             do    m = 4, 5
-                n = (m-3)*5
-                do    k = start(3,c), ksize-end(3,c)-1
-                   do    j = start(2,c), jsize-end(2,c)-1
-                      do    i = iend-1, iend
-                         out_buffer(p+1) = lhs(i,j,k,n+4,c)
-                         out_buffer(p+2) = lhs(i,j,k,n+5,c)
-                         out_buffer(p+3) = rhs(i,j,k,m,c)
-                         p = p + 3
-                      end do
-                   end do
-                end do
-             end do
-
-c---------------------------------------------------------------------
-c send data to next phase
-c can't receive data yet because buffer size will be wrong 
-c---------------------------------------------------------------------
-             call mpi_isend(out_buffer, 22*buffer_size, 
-     >                     dp_type, successor(1), 
-     >                     DEFAULT_TAG, comm_solve, 
-     >                     requests(2), error)
-
-          endif
-       end do
-
-c---------------------------------------------------------------------
-c      now go in the reverse direction                      
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c                         BACKSUBSTITUTION 
-c---------------------------------------------------------------------
-       do    stage = ncells, 1, -1
-          c = slice(1,stage)
-
-          istart = 0
-          iend   = cell_size(1,c)-1
-
-          jsize = cell_size(2,c)
-          ksize = cell_size(3,c)
-          jp    = cell_coord(2,c)-1
-          kp    = cell_coord(3,c)-1
-
-          buffer_size = (jsize-start(2,c)-end(2,c)) * 
-     >                  (ksize-start(3,c)-end(3,c))
-
-          if (stage .ne. ncells) then
-
-c---------------------------------------------------------------------
-c            if this is not the starting cell in this row of cells, 
-c            wait for a message to be received, containing the 
-c            solution of the previous two stations     
-c---------------------------------------------------------------------
-             call mpi_irecv(in_buffer, 10*buffer_size, 
-     >                      dp_type, successor(1), 
-     >                      DEFAULT_TAG, comm_solve, 
-     >                      requests(1), error)
-
-
-c---------------------------------------------------------------------
-c            communication has already been started
-c            while waiting, do the block-diagonal inversion for the 
-c            cell that was just finished                
-c---------------------------------------------------------------------
-
-             call ninvr(slice(1,stage+1))
-
-c---------------------------------------------------------------------
-c            wait for pending communication to complete
-c---------------------------------------------------------------------
-             call mpi_waitall(2, requests, statuses, error)
-
-c---------------------------------------------------------------------
-c            unpack the buffer for the first three factors         
-c---------------------------------------------------------------------
-             n = 0
-             p = 0
-             i  = iend
-             i1 = i - 1
-             do    m = 1, 3
-                do   k = start(3,c), ksize-end(3,c)-1
-                   do   j = start(2,c), jsize-end(2,c)-1
-                      sm1 = in_buffer(p+1)
-                      sm2 = in_buffer(p+2)
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
-     >                        lhs(i,j,k,n+4,c)*sm1 -
-     >                        lhs(i,j,k,n+5,c)*sm2
-                      rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
-     >                        lhs(i1,j,k,n+4,c) * rhs(i,j,k,m,c) - 
-     >                        lhs(i1,j,k,n+5,c) * sm1
-                      p = p + 2
-                   end do
-                end do
-             end do
-
-c---------------------------------------------------------------------
-c            now unpack the buffer for the remaining two factors
-c---------------------------------------------------------------------
-             do    m = 4, 5
-                n = (m-3)*5
-                do   k = start(3,c), ksize-end(3,c)-1
-                   do   j = start(2,c), jsize-end(2,c)-1
-                      sm1 = in_buffer(p+1)
-                      sm2 = in_buffer(p+2)
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
-     >                        lhs(i,j,k,n+4,c)*sm1 -
-     >                        lhs(i,j,k,n+5,c)*sm2
-                      rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
-     >                        lhs(i1,j,k,n+4,c) * rhs(i,j,k,m,c) - 
-     >                        lhs(i1,j,k,n+5,c) * sm1
-                      p = p + 2
-                   end do
-                end do
-             end do
-
-          else
-
-c---------------------------------------------------------------------
-c            now we know this is the first grid block on the back sweep,
-c            so we don't need a message to start the substitution. 
-c---------------------------------------------------------------------
-             i  = iend-1
-             i1 = iend
-             n = 0
-             do   m = 1, 3
-                do   k = start(3,c), ksize-end(3,c)-1
-                   do   j = start(2,c), jsize-end(2,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
-     >                             lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c)
-                   end do
-                end do
-             end do
-
-             do    m = 4, 5
-                n = (m-3)*5
-                do   k = start(3,c), ksize-end(3,c)-1
-                   do   j = start(2,c), jsize-end(2,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
-     >                             lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c)
-                   end do
-                end do
-             end do
-          endif
-
-c---------------------------------------------------------------------
-c         Whether or not this is the last processor, we always have
-c         to complete the back-substitution 
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c         The first three factors
-c---------------------------------------------------------------------
-          n = 0
-          do   m = 1, 3
-             do   k = start(3,c), ksize-end(3,c)-1
-                do   j = start(2,c), jsize-end(2,c)-1
-                   do    i = iend-2, istart, -1
-                      i1 = i  + 1
-                      i2 = i  + 2
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
-     >                          lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) -
-     >                          lhs(i,j,k,n+5,c)*rhs(i2,j,k,m,c)
-                   end do
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c         And the remaining two
-c---------------------------------------------------------------------
-          do    m = 4, 5
-             n = (m-3)*5
-             do   k = start(3,c), ksize-end(3,c)-1
-                do   j = start(2,c), jsize-end(2,c)-1
-                   do    i = iend-2, istart, -1
-                      i1 = i  + 1
-                      i2 = i  + 2
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
-     >                          lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) -
-     >                          lhs(i,j,k,n+5,c)*rhs(i2,j,k,m,c)
-                   end do
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c         send on information to the previous processor, if needed
-c---------------------------------------------------------------------
-          if (stage .ne.  1) then
-             i  = istart
-             i1 = istart+1
-             p = 0
-             do    m = 1, 5
-                do    k = start(3,c), ksize-end(3,c)-1
-                   do    j = start(2,c), jsize-end(2,c)-1
-                      out_buffer(p+1) = rhs(i,j,k,m,c)
-                       out_buffer(p+2) = rhs(i1,j,k,m,c)
-                      p = p + 2
-                   end do
-                end do
-             end do
-
-c---------------------------------------------------------------------
-c            pack and send the buffer
-c---------------------------------------------------------------------
-             call mpi_isend(out_buffer, 10*buffer_size, 
-     >                     dp_type, predecessor(1), 
-     >                     DEFAULT_TAG, comm_solve, 
-     >                     requests(2), error)
-
-          endif
-
-c---------------------------------------------------------------------
-c         If this was the last stage, do the block-diagonal inversion          
-c---------------------------------------------------------------------
-          if (stage .eq. 1) call ninvr(c)
-
-       end do
-
-       return
-       end
-    
-
-
-
-
-
-
diff --git a/examples/smpi/NAS/SP/y_solve.f b/examples/smpi/NAS/SP/y_solve.f
deleted file mode 100644
index fdcbb4d03f..0000000000
--- a/examples/smpi/NAS/SP/y_solve.f
+++ /dev/null
@@ -1,538 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine y_solve
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c this function performs the solution of the approximate factorization
-c step in the y-direction for all five matrix components
-c simultaneously. The Thomas algorithm is employed to solve the
-c systems for the y-lines. Boundary conditions are non-periodic
-c---------------------------------------------------------------------
-
-       include 'header.h'
-       include 'mpinpb.h'
-
-       integer i, j, k, stage, ip, kp, n, isize, jend, ksize, j1, j2,
-     >         buffer_size, c, m, p, jstart, error,
-     >         requests(2), statuses(MPI_STATUS_SIZE, 2)
-       double precision  r1, r2, d, e, s(5), sm1, sm2,
-     >                   fac1, fac2
-
-
-c---------------------------------------------------------------------
-c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
-c on this node in the direction of increasing i for the forward sweep,
-c and after that reversing the direction for the backsubstitution  
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c                          FORWARD ELIMINATION  
-c---------------------------------------------------------------------
-       do    stage = 1, ncells
-          c      = slice(2,stage)
-
-          jstart = 0
-          jend   = cell_size(2,c)-1
-
-          isize     = cell_size(1,c)
-          ksize     = cell_size(3,c)
-          ip        = cell_coord(1,c)-1
-          kp        = cell_coord(3,c)-1
-
-          buffer_size = (isize-start(1,c)-end(1,c)) * 
-     >                  (ksize-start(3,c)-end(3,c))
-
-          if ( stage .ne. 1) then
-
-c---------------------------------------------------------------------
-c            if this is not the first processor in this row of cells, 
-c            receive data from predecessor containing the right hand
-c            sides and the upper diagonal elements of the previous two rows
-c---------------------------------------------------------------------
-
-             call mpi_irecv(in_buffer, 22*buffer_size, 
-     >                      dp_type, predecessor(2), 
-     >                      DEFAULT_TAG, comm_solve, 
-     >                      requests(1), error)
-
-c---------------------------------------------------------------------
-c            communication has already been started. 
-c            compute the left hand side while waiting for the msg
-c---------------------------------------------------------------------
-             call lhsy(c)
-
-c---------------------------------------------------------------------
-c            wait for pending communication to complete
-c            This waits on the current receive and on the send
-c            from the previous stage. They always come in pairs. 
-c---------------------------------------------------------------------
-             call mpi_waitall(2, requests, statuses, error)
-
-c---------------------------------------------------------------------
-c            unpack the buffer                                 
-c---------------------------------------------------------------------
-             j  = jstart
-             j1 = jstart + 1
-             n = 0
-c---------------------------------------------------------------------
-c            create a running pointer
-c---------------------------------------------------------------------
-             p = 0
-             do    k = start(3,c), ksize-end(3,c)-1
-                do    i = start(1,c), isize-end(1,c)-1
-                   lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
-     >                       in_buffer(p+1) * lhs(i,j,k,n+1,c)
-                   lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
-     >                       in_buffer(p+2) * lhs(i,j,k,n+1,c)
-                   do    m = 1, 3
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
-     >                       in_buffer(p+2+m) * lhs(i,j,k,n+1,c)
-                   end do
-                   d            = in_buffer(p+6)
-                   e            = in_buffer(p+7)
-                   do    m = 1, 3
-                      s(m) = in_buffer(p+7+m)
-                   end do
-                   r1 = lhs(i,j,k,n+2,c)
-                   lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
-                   lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
-                   do    m = 1, 3
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
-                   end do
-                   r2 = lhs(i,j1,k,n+1,c)
-                   lhs(i,j1,k,n+2,c) = lhs(i,j1,k,n+2,c) - d * r2
-                   lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - e * r2
-                   do    m = 1, 3
-                      rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - s(m) * r2
-                   end do
-                   p = p + 10
-                end do
-             end do
-
-             do    m = 4, 5
-                n = (m-3)*5
-                do    k = start(3,c), ksize-end(3,c)-1
-                   do    i = start(1,c), isize-end(1,c)-1
-                      lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
-     >                          in_buffer(p+1) * lhs(i,j,k,n+1,c)
-                      lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
-     >                          in_buffer(p+2) * lhs(i,j,k,n+1,c)
-                      rhs(i,j,k,m,c)   = rhs(i,j,k,m,c) -
-     >                          in_buffer(p+3) * lhs(i,j,k,n+1,c)
-                      d                = in_buffer(p+4)
-                      e                = in_buffer(p+5)
-                      s(m)             = in_buffer(p+6)
-                      r1 = lhs(i,j,k,n+2,c)
-                      lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
-                      lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
-                      rhs(i,j,k,m,c)   = rhs(i,j,k,m,c) - s(m) * r1
-                      r2 = lhs(i,j1,k,n+1,c)
-                      lhs(i,j1,k,n+2,c) = lhs(i,j1,k,n+2,c) - d * r2
-                      lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - e * r2
-                      rhs(i,j1,k,m,c)   = rhs(i,j1,k,m,c) - s(m) * r2
-                      p = p + 6
-                   end do
-                end do
-             end do
-
-          else            
-
-c---------------------------------------------------------------------
-c            if this IS the first cell, we still compute the lhs
-c---------------------------------------------------------------------
-             call lhsy(c)
-          endif
-
-c---------------------------------------------------------------------
-c         perform the Thomas algorithm; first, FORWARD ELIMINATION     
-c---------------------------------------------------------------------
-          n = 0
-
-          do    k = start(3,c), ksize-end(3,c)-1
-             do    j = jstart, jend-2
-                do    i = start(1,c), isize-end(1,c)-1
-                   j1 = j  + 1
-                   j2 = j  + 2
-                   fac1               = 1.d0/lhs(i,j,k,n+3,c)
-                   lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
-                   lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
-                   do    m = 1, 3
-                      rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
-                   end do
-                   lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
-     >                         lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
-                   lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
-     >                         lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
-                   do    m = 1, 3
-                      rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
-     >                         lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
-                   end do
-                   lhs(i,j2,k,n+2,c) = lhs(i,j2,k,n+2,c) -
-     >                         lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+4,c)
-                   lhs(i,j2,k,n+3,c) = lhs(i,j2,k,n+3,c) -
-     >                         lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+5,c)
-                   do    m = 1, 3
-                      rhs(i,j2,k,m,c) = rhs(i,j2,k,m,c) -
-     >                         lhs(i,j2,k,n+1,c)*rhs(i,j,k,m,c)
-                   end do
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c         The last two rows in this grid block are a bit different, 
-c         since they do not have two more rows available for the
-c         elimination of off-diagonal entries
-c---------------------------------------------------------------------
-
-          j  = jend - 1
-          j1 = jend
-          do    k = start(3,c), ksize-end(3,c)-1
-             do    i = start(1,c), isize-end(1,c)-1
-                fac1               = 1.d0/lhs(i,j,k,n+3,c)
-                lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
-                lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
-                do    m = 1, 3
-                   rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
-                end do
-                lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
-     >                      lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
-                lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
-     >                      lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
-                do    m = 1, 3
-                   rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
-     >                      lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
-                end do
-c---------------------------------------------------------------------
-c               scale the last row immediately (some of this is
-c               overkill in case this is the last cell)
-c---------------------------------------------------------------------
-                fac2               = 1.d0/lhs(i,j1,k,n+3,c)
-                lhs(i,j1,k,n+4,c) = fac2*lhs(i,j1,k,n+4,c)
-                lhs(i,j1,k,n+5,c) = fac2*lhs(i,j1,k,n+5,c)  
-                do    m = 1, 3
-                   rhs(i,j1,k,m,c) = fac2*rhs(i,j1,k,m,c)
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c         do the u+c and the u-c factors                 
-c---------------------------------------------------------------------
-          do    m = 4, 5
-             n = (m-3)*5
-             do    k = start(3,c), ksize-end(3,c)-1
-                do    j = jstart, jend-2
-                   do    i = start(1,c), isize-end(1,c)-1
-                   j1 = j  + 1
-                   j2 = j  + 2
-                   fac1               = 1.d0/lhs(i,j,k,n+3,c)
-                   lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
-                   lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
-                   rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
-                   lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
-     >                         lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
-                   lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
-     >                         lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
-                   rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
-     >                         lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
-                   lhs(i,j2,k,n+2,c) = lhs(i,j2,k,n+2,c) -
-     >                         lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+4,c)
-                   lhs(i,j2,k,n+3,c) = lhs(i,j2,k,n+3,c) -
-     >                         lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+5,c)
-                   rhs(i,j2,k,m,c) = rhs(i,j2,k,m,c) -
-     >                         lhs(i,j2,k,n+1,c)*rhs(i,j,k,m,c)
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c            And again the last two rows separately
-c---------------------------------------------------------------------
-             j  = jend - 1
-             j1 = jend
-             do    k = start(3,c), ksize-end(3,c)-1
-                do    i = start(1,c), isize-end(1,c)-1
-                fac1               = 1.d0/lhs(i,j,k,n+3,c)
-                lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
-                lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
-                rhs(i,j,k,m,c)     = fac1*rhs(i,j,k,m,c)
-                lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
-     >                      lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
-                lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
-     >                      lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
-                rhs(i,j1,k,m,c)   = rhs(i,j1,k,m,c) -
-     >                      lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
-c---------------------------------------------------------------------
-c               Scale the last row immediately (some of this is overkill
-c               if this is the last cell)
-c---------------------------------------------------------------------
-                fac2               = 1.d0/lhs(i,j1,k,n+3,c)
-                lhs(i,j1,k,n+4,c) = fac2*lhs(i,j1,k,n+4,c)
-                lhs(i,j1,k,n+5,c) = fac2*lhs(i,j1,k,n+5,c)
-                rhs(i,j1,k,m,c)   = fac2*rhs(i,j1,k,m,c)
-
-             end do
-          end do
-       end do
-
-c---------------------------------------------------------------------
-c         send information to the next processor, except when this
-c         is the last grid block;
-c---------------------------------------------------------------------
-
-          if (stage .ne. ncells) then
-
-c---------------------------------------------------------------------
-c            create a running pointer for the send buffer  
-c---------------------------------------------------------------------
-             p = 0
-             n = 0
-             do    k = start(3,c), ksize-end(3,c)-1
-                do    i = start(1,c), isize-end(1,c)-1
-                   do    j = jend-1, jend
-                      out_buffer(p+1) = lhs(i,j,k,n+4,c)
-                      out_buffer(p+2) = lhs(i,j,k,n+5,c)
-                      do    m = 1, 3
-                         out_buffer(p+2+m) = rhs(i,j,k,m,c)
-                      end do
-                      p = p+5
-                   end do
-                end do
-             end do
-
-             do    m = 4, 5
-                n = (m-3)*5
-                do    k = start(3,c), ksize-end(3,c)-1
-                   do    i = start(1,c), isize-end(1,c)-1
-                      do    j = jend-1, jend
-                         out_buffer(p+1) = lhs(i,j,k,n+4,c)
-                         out_buffer(p+2) = lhs(i,j,k,n+5,c)
-                         out_buffer(p+3) = rhs(i,j,k,m,c)
-                         p = p + 3
-                      end do
-                   end do
-                end do
-             end do
-
-c---------------------------------------------------------------------
-c            pack and send the buffer
-c---------------------------------------------------------------------
-             call mpi_isend(out_buffer, 22*buffer_size, 
-     >                     dp_type, successor(2), 
-     >                     DEFAULT_TAG, comm_solve, 
-     >                     requests(2), error)
-
-          endif
-       end do
-
-c---------------------------------------------------------------------
-c      now go in the reverse direction                      
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c                         BACKSUBSTITUTION 
-c---------------------------------------------------------------------
-       do    stage = ncells, 1, -1
-          c = slice(2,stage)
-
-          jstart = 0
-          jend   = cell_size(2,c)-1
-
-          isize = cell_size(1,c)
-          ksize = cell_size(3,c)
-          ip    = cell_coord(1,c)-1
-          kp    = cell_coord(3,c)-1
-
-          buffer_size = (isize-start(1,c)-end(1,c)) * 
-     >                  (ksize-start(3,c)-end(3,c))
-
-          if (stage .ne. ncells) then
-
-c---------------------------------------------------------------------
-c            if this is not the starting cell in this row of cells, 
-c            wait for a message to be received, containing the 
-c            solution of the previous two stations     
-c---------------------------------------------------------------------
-
-             call mpi_irecv(in_buffer, 10*buffer_size, 
-     >                      dp_type, successor(2), 
-     >                      DEFAULT_TAG, comm_solve, 
-     >                      requests(1), error)
-
-
-c---------------------------------------------------------------------
-c            communication has already been started
-c            while waiting, do the block-diagonal inversion for the 
-c            cell that was just finished                
-c---------------------------------------------------------------------
-
-             call pinvr(slice(2,stage+1))
-
-c---------------------------------------------------------------------
-c            wait for pending communication to complete
-c---------------------------------------------------------------------
-             call mpi_waitall(2, requests, statuses, error)
-
-c---------------------------------------------------------------------
-c            unpack the buffer for the first three factors         
-c---------------------------------------------------------------------
-             n = 0
-             p = 0
-             j  = jend
-             j1 = j - 1
-             do    m = 1, 3
-                do   k = start(3,c), ksize-end(3,c)-1
-                   do   i = start(1,c), isize-end(1,c)-1
-                      sm1 = in_buffer(p+1)
-                      sm2 = in_buffer(p+2)
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
-     >                        lhs(i,j,k,n+4,c)*sm1 -
-     >                        lhs(i,j,k,n+5,c)*sm2
-                      rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
-     >                        lhs(i,j1,k,n+4,c) * rhs(i,j,k,m,c) - 
-     >                        lhs(i,j1,k,n+5,c) * sm1
-                      p = p + 2
-                   end do
-                end do
-             end do
-
-c---------------------------------------------------------------------
-c            now unpack the buffer for the remaining two factors
-c---------------------------------------------------------------------
-             do    m = 4, 5
-                n = (m-3)*5
-                do   k = start(3,c), ksize-end(3,c)-1
-                   do   i = start(1,c), isize-end(1,c)-1
-                      sm1 = in_buffer(p+1)
-                      sm2 = in_buffer(p+2)
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
-     >                        lhs(i,j,k,n+4,c)*sm1 -
-     >                        lhs(i,j,k,n+5,c)*sm2
-                      rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
-     >                        lhs(i,j1,k,n+4,c) * rhs(i,j,k,m,c) - 
-     >                        lhs(i,j1,k,n+5,c) * sm1
-                      p = p + 2
-                   end do
-                end do
-             end do
-
-          else
-c---------------------------------------------------------------------
-c            now we know this is the first grid block on the back sweep,
-c            so we don't need a message to start the substitution. 
-c---------------------------------------------------------------------
-
-             j  = jend - 1
-             j1 = jend
-             n = 0
-             do   m = 1, 3
-                do   k = start(3,c), ksize-end(3,c)-1
-                   do   i = start(1,c), isize-end(1,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
-     >                             lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c)
-                   end do
-                end do
-             end do
-
-             do    m = 4, 5
-                n = (m-3)*5
-                do   k = start(3,c), ksize-end(3,c)-1
-                   do   i = start(1,c), isize-end(1,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
-     >                             lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c)
-                   end do
-                end do
-             end do
-          endif
-
-c---------------------------------------------------------------------
-c         Whether or not this is the last processor, we always have
-c         to complete the back-substitution 
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c         The first three factors
-c---------------------------------------------------------------------
-          n = 0
-          do   m = 1, 3
-             do   k = start(3,c), ksize-end(3,c)-1
-                do   j = jend-2, jstart, -1
-                   do    i = start(1,c), isize-end(1,c)-1
-                      j1 = j  + 1
-                      j2 = j  + 2
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
-     >                          lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) -
-     >                          lhs(i,j,k,n+5,c)*rhs(i,j2,k,m,c)
-                   end do
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c         And the remaining two
-c---------------------------------------------------------------------
-          do    m = 4, 5
-             n = (m-3)*5
-             do   k = start(3,c), ksize-end(3,c)-1
-                do   j = jend-2, jstart, -1
-                   do    i = start(1,c), isize-end(1,c)-1
-                      j1 = j  + 1
-                      j2 = j1 + 1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
-     >                          lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) -
-     >                          lhs(i,j,k,n+5,c)*rhs(i,j2,k,m,c)
-                   end do
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c         send on information to the previous processor, if needed
-c---------------------------------------------------------------------
-          if (stage .ne.  1) then
-             j  = jstart
-             j1 = jstart + 1
-             p = 0
-             do    m = 1, 5
-                do    k = start(3,c), ksize-end(3,c)-1
-                   do    i = start(1,c), isize-end(1,c)-1
-                      out_buffer(p+1) = rhs(i,j,k,m,c)
-                      out_buffer(p+2) = rhs(i,j1,k,m,c)
-                      p = p + 2
-                   end do
-                end do
-             end do
-
-c---------------------------------------------------------------------
-c            pack and send the buffer
-c---------------------------------------------------------------------
-
-             call mpi_isend(out_buffer, 10*buffer_size, 
-     >                     dp_type, predecessor(2), 
-     >                     DEFAULT_TAG, comm_solve, 
-     >                     requests(2), error)
-
-          endif
-
-c---------------------------------------------------------------------
-c         If this was the last stage, do the block-diagonal inversion          
-c---------------------------------------------------------------------
-          if (stage .eq. 1) call pinvr(c)
-
-       end do
-
-       return
-       end
-    
-
-
-
-
-
-
diff --git a/examples/smpi/NAS/SP/z_solve.f b/examples/smpi/NAS/SP/z_solve.f
deleted file mode 100644
index ad0dc7e727..0000000000
--- a/examples/smpi/NAS/SP/z_solve.f
+++ /dev/null
@@ -1,532 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-       subroutine z_solve
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c this function performs the solution of the approximate factorization
-c step in the z-direction for all five matrix components
-c simultaneously. The Thomas algorithm is employed to solve the
-c systems for the z-lines. Boundary conditions are non-periodic
-c---------------------------------------------------------------------
-
-       include 'header.h'
-       include 'mpinpb.h'
-
-       integer i, j, k, stage, ip, jp, n, isize, jsize, kend, k1, k2,
-     >         buffer_size, c, m, p, kstart, error,
-     >         requests(2), statuses(MPI_STATUS_SIZE, 2)
-       double precision  r1, r2, d, e, s(5), sm1, sm2,
-     >                   fac1, fac2
-
-c---------------------------------------------------------------------
-c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
-c on this node in the direction of increasing i for the forward sweep,
-c and after that reversing the direction for the backsubstitution  
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c                          FORWARD ELIMINATION  
-c---------------------------------------------------------------------
-       do    stage = 1, ncells
-          c         = slice(3,stage)
-
-          kstart = 0
-          kend   = cell_size(3,c)-1
-
-          isize     = cell_size(1,c)
-          jsize     = cell_size(2,c)
-          ip        = cell_coord(1,c)-1
-          jp        = cell_coord(2,c)-1
-
-          buffer_size = (isize-start(1,c)-end(1,c)) * 
-     >                  (jsize-start(2,c)-end(2,c))
-
-          if (stage .ne. 1) then
-
-
-c---------------------------------------------------------------------
-c            if this is not the first processor in this row of cells, 
-c            receive data from predecessor containing the right hand
-c            sides and the upper diagonal elements of the previous two rows
-c---------------------------------------------------------------------
-
-             call mpi_irecv(in_buffer, 22*buffer_size, 
-     >                      dp_type, predecessor(3), 
-     >                      DEFAULT_TAG, comm_solve, 
-     >                      requests(1), error)
-
-
-c---------------------------------------------------------------------
-c            communication has already been started. 
-c            compute the left hand side while waiting for the msg
-c---------------------------------------------------------------------
-             call lhsz(c)
-
-c---------------------------------------------------------------------
-c            wait for pending communication to complete
-c---------------------------------------------------------------------
-             call mpi_waitall(2, requests, statuses, error)
-             
-c---------------------------------------------------------------------
-c            unpack the buffer                                 
-c---------------------------------------------------------------------
-             k  = kstart
-             k1 = kstart + 1
-             n = 0
-
-c---------------------------------------------------------------------
-c            create a running pointer
-c---------------------------------------------------------------------
-             p = 0
-             do    j = start(2,c), jsize-end(2,c)-1
-                do    i = start(1,c), isize-end(1,c)-1
-                   lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
-     >                       in_buffer(p+1) * lhs(i,j,k,n+1,c)
-                   lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
-     >                       in_buffer(p+2) * lhs(i,j,k,n+1,c)
-                   do    m = 1, 3
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
-     >                       in_buffer(p+2+m) * lhs(i,j,k,n+1,c)
-                   end do
-                   d            = in_buffer(p+6)
-                   e            = in_buffer(p+7)
-                   do    m = 1, 3
-                      s(m) = in_buffer(p+7+m)
-                   end do
-                   r1 = lhs(i,j,k,n+2,c)
-                   lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
-                   lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
-                   do    m = 1, 3
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
-                   end do
-                   r2 = lhs(i,j,k1,n+1,c)
-                   lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2
-                   lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2
-                   do    m = 1, 3
-                      rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - s(m) * r2
-                   end do
-                   p = p + 10
-                end do
-             end do
-
-             do    m = 4, 5
-                n = (m-3)*5
-                do    j = start(2,c), jsize-end(2,c)-1
-                   do    i = start(1,c), isize-end(1,c)-1
-                      lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
-     >                          in_buffer(p+1) * lhs(i,j,k,n+1,c)
-                      lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
-     >                          in_buffer(p+2) * lhs(i,j,k,n+1,c)
-                      rhs(i,j,k,m,c)   = rhs(i,j,k,m,c) -
-     >                          in_buffer(p+3) * lhs(i,j,k,n+1,c)
-                      d                = in_buffer(p+4)
-                      e                = in_buffer(p+5)
-                      s(m)             = in_buffer(p+6)
-                      r1 = lhs(i,j,k,n+2,c)
-                      lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
-                      lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
-                      rhs(i,j,k,m,c)   = rhs(i,j,k,m,c) - s(m) * r1
-                      r2 = lhs(i,j,k1,n+1,c)
-                      lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2
-                      lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2
-                      rhs(i,j,k1,m,c)   = rhs(i,j,k1,m,c) - s(m) * r2
-                      p = p + 6
-                   end do
-                end do
-             end do
-
-          else            
-
-c---------------------------------------------------------------------
-c            if this IS the first cell, we still compute the lhs
-c---------------------------------------------------------------------
-             call lhsz(c)
-          endif
-
-c---------------------------------------------------------------------
-c         perform the Thomas algorithm; first, FORWARD ELIMINATION     
-c---------------------------------------------------------------------
-          n = 0
-
-          do    k = kstart, kend-2
-             do    j = start(2,c), jsize-end(2,c)-1
-                do    i = start(1,c), isize-end(1,c)-1
-                   k1 = k  + 1
-                   k2 = k  + 2
-                   fac1               = 1.d0/lhs(i,j,k,n+3,c)
-                   lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
-                   lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
-                   do    m = 1, 3
-                      rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
-                   end do
-                   lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
-     >                         lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
-                   lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
-     >                         lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
-                   do    m = 1, 3
-                      rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
-     >                         lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
-                   end do
-                   lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) -
-     >                         lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c)
-                   lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) -
-     >                         lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c)
-                   do    m = 1, 3
-                      rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) -
-     >                         lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c)
-                   end do
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c         The last two rows in this grid block are a bit different, 
-c         since they do not have two more rows available for the
-c         elimination of off-diagonal entries
-c---------------------------------------------------------------------
-          k  = kend - 1
-          k1 = kend
-          do    j = start(2,c), jsize-end(2,c)-1
-             do    i = start(1,c), isize-end(1,c)-1
-                fac1               = 1.d0/lhs(i,j,k,n+3,c)
-                lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
-                lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
-                do    m = 1, 3
-                   rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
-                end do
-                lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
-     >                      lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
-                lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
-     >                      lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
-                do    m = 1, 3
-                   rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
-     >                      lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
-                end do
-c---------------------------------------------------------------------
-c               scale the last row immediately (some of this is
-c               overkill in case this is the last cell)
-c---------------------------------------------------------------------
-                fac2               = 1.d0/lhs(i,j,k1,n+3,c)
-                lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c)
-                lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c)  
-                do    m = 1, 3
-                   rhs(i,j,k1,m,c) = fac2*rhs(i,j,k1,m,c)
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c         do the u+c and the u-c factors               
-c---------------------------------------------------------------------
-          do   m = 4, 5
-             n = (m-3)*5
-             do    k = kstart, kend-2
-                do    j = start(2,c), jsize-end(2,c)-1
-                   do    i = start(1,c), isize-end(1,c)-1
-                   k1 = k  + 1
-                   k2 = k  + 2
-                   fac1               = 1.d0/lhs(i,j,k,n+3,c)
-                   lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
-                   lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
-                   rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
-                   lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
-     >                         lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
-                   lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
-     >                         lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
-                   rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
-     >                         lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
-                   lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) -
-     >                         lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c)
-                   lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) -
-     >                         lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c)
-                   rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) -
-     >                         lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c)
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c            And again the last two rows separately
-c---------------------------------------------------------------------
-             k  = kend - 1
-             k1 = kend
-             do    j = start(2,c), jsize-end(2,c)-1
-                do    i = start(1,c), isize-end(1,c)-1
-                fac1               = 1.d0/lhs(i,j,k,n+3,c)
-                lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
-                lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
-                rhs(i,j,k,m,c)     = fac1*rhs(i,j,k,m,c)
-                lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
-     >                      lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
-                lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
-     >                      lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
-                rhs(i,j,k1,m,c)   = rhs(i,j,k1,m,c) -
-     >                      lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
-c---------------------------------------------------------------------
-c               Scale the last row immediately (some of this is overkill
-c               if this is the last cell)
-c---------------------------------------------------------------------
-                fac2               = 1.d0/lhs(i,j,k1,n+3,c)
-                lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c)
-                lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c)
-                rhs(i,j,k1,m,c)   = fac2*rhs(i,j,k1,m,c)
-
-             end do
-          end do
-       end do
-
-c---------------------------------------------------------------------
-c         send information to the next processor, except when this
-c         is the last grid block,
-c---------------------------------------------------------------------
-
-          if (stage .ne. ncells) then
-
-c---------------------------------------------------------------------
-c            create a running pointer for the send buffer  
-c---------------------------------------------------------------------
-             p = 0
-             n = 0
-             do    j = start(2,c), jsize-end(2,c)-1
-                do    i = start(1,c), isize-end(1,c)-1
-                   do    k = kend-1, kend
-                      out_buffer(p+1) = lhs(i,j,k,n+4,c)
-                      out_buffer(p+2) = lhs(i,j,k,n+5,c)
-                      do    m = 1, 3
-                         out_buffer(p+2+m) = rhs(i,j,k,m,c)
-                      end do
-                      p = p+5
-                   end do
-                end do
-             end do
-
-             do    m = 4, 5
-                n = (m-3)*5
-                do    j = start(2,c), jsize-end(2,c)-1
-                   do    i = start(1,c), isize-end(1,c)-1
-                      do    k = kend-1, kend
-                         out_buffer(p+1) = lhs(i,j,k,n+4,c)
-                         out_buffer(p+2) = lhs(i,j,k,n+5,c)
-                         out_buffer(p+3) = rhs(i,j,k,m,c)
-                         p = p + 3
-                      end do
-                   end do
-                end do
-             end do
-
-
-             call mpi_isend(out_buffer, 22*buffer_size, 
-     >                     dp_type, successor(3), 
-     >                     DEFAULT_TAG, comm_solve, 
-     >                     requests(2), error)
-
-          endif
-       end do
-
-c---------------------------------------------------------------------
-c      now go in the reverse direction                      
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c                         BACKSUBSTITUTION 
-c---------------------------------------------------------------------
-       do    stage = ncells, 1, -1
-          c = slice(3,stage)
-
-          kstart = 0
-          kend   = cell_size(3,c)-1
-
-          isize     = cell_size(1,c)
-          jsize     = cell_size(2,c)
-          ip        = cell_coord(1,c)-1
-          jp        = cell_coord(2,c)-1
-
-          buffer_size = (isize-start(1,c)-end(1,c)) * 
-     >                  (jsize-start(2,c)-end(2,c))
-
-          if (stage .ne. ncells) then
-
-c---------------------------------------------------------------------
-c            if this is not the starting cell in this row of cells, 
-c            wait for a message to be received, containing the 
-c            solution of the previous two stations     
-c---------------------------------------------------------------------
-
-             call mpi_irecv(in_buffer, 10*buffer_size, 
-     >                      dp_type, successor(3), 
-     >                      DEFAULT_TAG, comm_solve, 
-     >                      requests(1), error)
-
-
-c---------------------------------------------------------------------
-c            communication has already been started
-c            while waiting, do the  block-diagonal inversion for the 
-c            cell that was just finished                
-c---------------------------------------------------------------------
-
-             call tzetar(slice(3,stage+1))
-
-c---------------------------------------------------------------------
-c            wait for pending communication to complete
-c---------------------------------------------------------------------
-             call mpi_waitall(2, requests, statuses, error)
-
-c---------------------------------------------------------------------
-c            unpack the buffer for the first three factors         
-c---------------------------------------------------------------------
-             n = 0
-             p = 0
-             k  = kend
-             k1 = k - 1
-             do    m = 1, 3
-                do   j = start(2,c), jsize-end(2,c)-1
-                   do   i = start(1,c), isize-end(1,c)-1
-                      sm1 = in_buffer(p+1)
-                      sm2 = in_buffer(p+2)
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
-     >                        lhs(i,j,k,n+4,c)*sm1 -
-     >                        lhs(i,j,k,n+5,c)*sm2
-                      rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
-     >                        lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) - 
-     >                        lhs(i,j,k1,n+5,c) * sm1
-                      p = p + 2
-                   end do
-                end do
-             end do
-
-c---------------------------------------------------------------------
-c            now unpack the buffer for the remaining two factors
-c---------------------------------------------------------------------
-             do    m = 4, 5
-                n = (m-3)*5
-                do   j = start(2,c), jsize-end(2,c)-1
-                   do   i = start(1,c), isize-end(1,c)-1
-                      sm1 = in_buffer(p+1)
-                      sm2 = in_buffer(p+2)
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
-     >                        lhs(i,j,k,n+4,c)*sm1 -
-     >                        lhs(i,j,k,n+5,c)*sm2
-                      rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
-     >                        lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) - 
-     >                        lhs(i,j,k1,n+5,c) * sm1
-                      p = p + 2
-                   end do
-                end do
-             end do
-
-          else
-
-c---------------------------------------------------------------------
-c            now we know this is the first grid block on the back sweep,
-c            so we don't need a message to start the substitution. 
-c---------------------------------------------------------------------
-
-             k  = kend - 1
-             k1 = kend
-             n = 0
-             do   m = 1, 3
-                do   j = start(2,c), jsize-end(2,c)-1
-                   do   i = start(1,c), isize-end(1,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
-     >                             lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c)
-                   end do
-                end do
-             end do
-
-             do    m = 4, 5
-                n = (m-3)*5
-                do   j = start(2,c), jsize-end(2,c)-1
-                   do   i = start(1,c), isize-end(1,c)-1
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
-     >                             lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c)
-                   end do
-                end do
-             end do
-          endif
-
-c---------------------------------------------------------------------
-c         Whether or not this is the last processor, we always have
-c         to complete the back-substitution 
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c         The first three factors
-c---------------------------------------------------------------------
-          n = 0
-          do   m = 1, 3
-             do   k = kend-2, kstart, -1
-                do   j = start(2,c), jsize-end(2,c)-1
-                   do    i = start(1,c), isize-end(1,c)-1
-                      k1 = k  + 1
-                      k2 = k  + 2
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
-     >                          lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) -
-     >                          lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c)
-                   end do
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c         And the remaining two
-c---------------------------------------------------------------------
-          do    m = 4, 5
-             n = (m-3)*5
-             do   k = kend-2, kstart, -1
-                do   j = start(2,c), jsize-end(2,c)-1
-                   do    i = start(1,c), isize-end(1,c)-1
-                      k1 = k  + 1
-                      k2 = k  + 2
-                      rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
-     >                          lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) -
-     >                          lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c)
-                   end do
-                end do
-             end do
-          end do
-
-c---------------------------------------------------------------------
-c         send on information to the previous processor, if needed
-c---------------------------------------------------------------------
-          if (stage .ne.  1) then
-             k  = kstart
-             k1 = kstart + 1
-             p = 0
-             do    m = 1, 5
-                do    j = start(2,c), jsize-end(2,c)-1
-                   do    i = start(1,c), isize-end(1,c)-1
-                      out_buffer(p+1) = rhs(i,j,k,m,c)
-                      out_buffer(p+2) = rhs(i,j,k1,m,c)
-                      p = p + 2
-                   end do
-                end do
-             end do
-
-             call mpi_isend(out_buffer, 10*buffer_size, 
-     >                     dp_type, predecessor(3), 
-     >                     DEFAULT_TAG, comm_solve, 
-     >                     requests(2), error)
-
-          endif
-
-c---------------------------------------------------------------------
-c         If this was the last stage, do the block-diagonal inversion
-c---------------------------------------------------------------------
-          if (stage .eq. 1) call tzetar(c)
-
-       end do
-
-       return
-       end
-    
-
-
-
-
-
-