From 4c74bb7b6f2398da81ce462cbdfd9c5a77ffa683 Mon Sep 17 00:00:00 2001 From: pini Date: Thu, 11 Mar 2010 14:47:35 +0000 Subject: [PATCH] Added our tweaked version of NAS benchmarks. git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/simgrid/simgrid/trunk@7221 48e7efb5-ca39-0410-a469-dd3cf9ba447f --- examples/smpi/NAS/BT/Makefile | 106 + examples/smpi/NAS/BT/add.f | 30 + examples/smpi/NAS/BT/adi.f | 21 + examples/smpi/NAS/BT/bt.f | 275 ++ examples/smpi/NAS/BT/btio.f | 72 + examples/smpi/NAS/BT/btio_common.f | 30 + examples/smpi/NAS/BT/copy_faces.f | 316 +++ examples/smpi/NAS/BT/define.f | 64 + examples/smpi/NAS/BT/epio.f | 165 ++ examples/smpi/NAS/BT/error.f | 106 + examples/smpi/NAS/BT/exact_rhs.f | 360 +++ examples/smpi/NAS/BT/exact_solution.f | 29 + examples/smpi/NAS/BT/fortran_io.f | 174 ++ examples/smpi/NAS/BT/full_mpiio.f | 307 ++ examples/smpi/NAS/BT/header.h | 137 + examples/smpi/NAS/BT/initialize.f | 308 ++ examples/smpi/NAS/BT/inputbt.data.sample | 5 + examples/smpi/NAS/BT/make_set.f | 124 + examples/smpi/NAS/BT/mpinpb.h | 12 + examples/smpi/NAS/BT/rhs.f | 425 +++ examples/smpi/NAS/BT/set_constants.f | 202 ++ examples/smpi/NAS/BT/setup_mpi.f | 64 + examples/smpi/NAS/BT/simple_mpiio.f | 213 ++ examples/smpi/NAS/BT/solve_subs.f | 642 +++++ examples/smpi/NAS/BT/verify.f | 435 +++ examples/smpi/NAS/BT/work_lhs.h | 14 + examples/smpi/NAS/BT/work_lhs_vec.h | 14 + examples/smpi/NAS/BT/x_solve.f | 761 +++++ examples/smpi/NAS/BT/x_solve_vec.f | 789 ++++++ examples/smpi/NAS/BT/y_solve.f | 771 +++++ examples/smpi/NAS/BT/y_solve_vec.f | 788 ++++++ examples/smpi/NAS/BT/z_solve.f | 776 ++++++ examples/smpi/NAS/BT/z_solve_vec.f | 793 ++++++ examples/smpi/NAS/CG/Makefile | 23 + examples/smpi/NAS/CG/cg.f | 1787 ++++++++++++ examples/smpi/NAS/CG/mpinpb.h | 9 + examples/smpi/NAS/DT/DGraph.c | 184 ++ examples/smpi/NAS/DT/DGraph.h | 43 + examples/smpi/NAS/DT/Makefile | 26 + examples/smpi/NAS/DT/README | 22 + examples/smpi/NAS/DT/dt.c | 759 +++++ examples/smpi/NAS/EP/Makefile | 28 + examples/smpi/NAS/EP/README | 6 + examples/smpi/NAS/EP/ep.c | 440 +++ examples/smpi/NAS/EP/ep.f | 316 +++ examples/smpi/NAS/EP/mpinpb.h | 9 + examples/smpi/NAS/EP/randlc.c | 107 + examples/smpi/NAS/EP/randlc.h | 3 + examples/smpi/NAS/FT/Makefile | 23 + examples/smpi/NAS/FT/README | 5 + examples/smpi/NAS/FT/ft.f | 1993 +++++++++++++ examples/smpi/NAS/FT/global.h | 134 + examples/smpi/NAS/FT/inputft.data.sample | 3 + examples/smpi/NAS/FT/mpinpb.h | 4 + examples/smpi/NAS/IS/Makefile | 23 + examples/smpi/NAS/IS/is.c | 1150 ++++++++ examples/smpi/NAS/LU/Makefile | 74 + examples/smpi/NAS/LU/applu.incl | 153 + examples/smpi/NAS/LU/bcast_inputs.f | 41 + examples/smpi/NAS/LU/blts.f | 261 ++ examples/smpi/NAS/LU/blts_vec.f | 334 +++ examples/smpi/NAS/LU/buts.f | 259 ++ examples/smpi/NAS/LU/buts_vec.f | 332 +++ examples/smpi/NAS/LU/erhs.f | 536 ++++ examples/smpi/NAS/LU/error.f | 81 + examples/smpi/NAS/LU/exact.f | 53 + examples/smpi/NAS/LU/exchange_1.f | 180 ++ examples/smpi/NAS/LU/exchange_3.f | 312 +++ examples/smpi/NAS/LU/exchange_4.f | 133 + examples/smpi/NAS/LU/exchange_5.f | 81 + examples/smpi/NAS/LU/exchange_6.f | 81 + examples/smpi/NAS/LU/init_comm.f | 57 + examples/smpi/NAS/LU/inputlu.data.sample | 24 + examples/smpi/NAS/LU/jacld.f | 384 +++ examples/smpi/NAS/LU/jacu.f | 384 +++ examples/smpi/NAS/LU/l2norm.f | 68 + examples/smpi/NAS/LU/lu.f | 164 ++ examples/smpi/NAS/LU/mpinpb.h | 11 + examples/smpi/NAS/LU/neighbors.f | 48 + examples/smpi/NAS/LU/nodedim.f | 36 + examples/smpi/NAS/LU/pintgr.f | 288 ++ examples/smpi/NAS/LU/proc_grid.f | 36 + examples/smpi/NAS/LU/read_input.f | 127 + examples/smpi/NAS/LU/rhs.f | 504 ++++ examples/smpi/NAS/LU/setbv.f | 79 + examples/smpi/NAS/LU/setcoeff.f | 159 ++ examples/smpi/NAS/LU/sethyper.f | 94 + examples/smpi/NAS/LU/setiv.f | 67 + examples/smpi/NAS/LU/ssor.f | 241 ++ examples/smpi/NAS/LU/subdomain.f | 103 + examples/smpi/NAS/LU/verify.f | 403 +++ examples/smpi/NAS/MG/Makefile | 23 + examples/smpi/NAS/MG/README | 138 + examples/smpi/NAS/MG/globals.h | 55 + examples/smpi/NAS/MG/mg.f | 2479 +++++++++++++++++ examples/smpi/NAS/MG/mg.input.sample | 4 + examples/smpi/NAS/MG/mpinpb.h | 9 + examples/smpi/NAS/MPI_dummy/Makefile | 38 + examples/smpi/NAS/MPI_dummy/README | 52 + examples/smpi/NAS/MPI_dummy/mpi.h | 112 + examples/smpi/NAS/MPI_dummy/mpi_dummy.c | 267 ++ examples/smpi/NAS/MPI_dummy/mpi_dummy.f | 309 ++ examples/smpi/NAS/MPI_dummy/mpif.h | 27 + examples/smpi/NAS/MPI_dummy/test.f | 10 + examples/smpi/NAS/MPI_dummy/wtime.c | 13 + examples/smpi/NAS/MPI_dummy/wtime.f | 12 + examples/smpi/NAS/MPI_dummy/wtime.h | 12 + examples/smpi/NAS/MPI_dummy/wtime_sgi64.c | 74 + examples/smpi/NAS/Makefile | 69 + examples/smpi/NAS/README | 52 + examples/smpi/NAS/README.install | 156 ++ examples/smpi/NAS/SP/Makefile | 60 + examples/smpi/NAS/SP/README | 17 + examples/smpi/NAS/SP/add.f | 31 + examples/smpi/NAS/SP/adi.f | 24 + examples/smpi/NAS/SP/copy_faces.f | 306 ++ examples/smpi/NAS/SP/define.f | 66 + examples/smpi/NAS/SP/error.f | 105 + examples/smpi/NAS/SP/exact_rhs.f | 363 +++ examples/smpi/NAS/SP/exact_solution.f | 30 + examples/smpi/NAS/SP/header.h | 113 + examples/smpi/NAS/SP/initialize.f | 286 ++ examples/smpi/NAS/SP/inputsp.data.sample | 3 + examples/smpi/NAS/SP/lhsx.f | 124 + examples/smpi/NAS/SP/lhsy.f | 125 + examples/smpi/NAS/SP/lhsz.f | 123 + examples/smpi/NAS/SP/make_set.f | 120 + examples/smpi/NAS/SP/mpinpb.h | 13 + examples/smpi/NAS/SP/ninvr.f | 45 + examples/smpi/NAS/SP/pinvr.f | 48 + examples/smpi/NAS/SP/rhs.f | 446 +++ examples/smpi/NAS/SP/set_constants.f | 203 ++ examples/smpi/NAS/SP/setup_mpi.f | 65 + examples/smpi/NAS/SP/sp.f | 194 ++ examples/smpi/NAS/SP/txinvr.f | 59 + examples/smpi/NAS/SP/tzetar.f | 60 + examples/smpi/NAS/SP/verify.f | 358 +++ examples/smpi/NAS/SP/x_solve.f | 545 ++++ examples/smpi/NAS/SP/y_solve.f | 538 ++++ examples/smpi/NAS/SP/z_solve.f | 532 ++++ examples/smpi/NAS/common/c_print_results.c | 94 + examples/smpi/NAS/common/c_timers.c | 45 + examples/smpi/NAS/common/print_results.f | 115 + examples/smpi/NAS/common/randdp.c | 64 + examples/smpi/NAS/common/randdp.f | 137 + examples/smpi/NAS/common/randdpvec.f | 186 ++ examples/smpi/NAS/common/randi8.f | 79 + examples/smpi/NAS/common/randi8_safe.f | 64 + examples/smpi/NAS/common/timers.f | 78 + examples/smpi/NAS/config/NAS.samples/README | 7 + .../NAS/config/NAS.samples/make.def.dec_alpha | 18 + .../NAS/config/NAS.samples/make.def.irix6.2 | 16 + .../NAS/config/NAS.samples/make.def.origin | 20 + .../NAS.samples/make.def.sgi_powerchallenge | 16 + .../config/NAS.samples/make.def.sp2_babbage | 17 + .../NAS.samples/make.def.sun_ultra_sparc | 30 + .../config/NAS.samples/make.def.t3d_cosmos | 25 + .../NAS/config/NAS.samples/make.def_sun_mpich | 165 ++ .../smpi/NAS/config/NAS.samples/suite.def.bt | 37 + .../smpi/NAS/config/NAS.samples/suite.def.cg | 29 + .../smpi/NAS/config/NAS.samples/suite.def.ep | 29 + .../smpi/NAS/config/NAS.samples/suite.def.ft | 29 + .../smpi/NAS/config/NAS.samples/suite.def.is | 29 + .../smpi/NAS/config/NAS.samples/suite.def.lu | 29 + .../smpi/NAS/config/NAS.samples/suite.def.mg | 29 + .../NAS/config/NAS.samples/suite.def.small | 8 + .../smpi/NAS/config/NAS.samples/suite.def.sp | 37 + examples/smpi/NAS/config/make.def | 162 ++ examples/smpi/NAS/config/make.def.template | 162 ++ examples/smpi/NAS/config/make.dummy | 7 + examples/smpi/NAS/config/suite.def.template | 24 + examples/smpi/NAS/sys/Makefile | 22 + examples/smpi/NAS/sys/README | 39 + examples/smpi/NAS/sys/make.common | 54 + examples/smpi/NAS/sys/print_header | 5 + examples/smpi/NAS/sys/print_instructions | 26 + examples/smpi/NAS/sys/setparams.c | 1210 ++++++++ examples/smpi/NAS/sys/suite.awk | 20 + 178 files changed, 34689 insertions(+) create mode 100644 examples/smpi/NAS/BT/Makefile create mode 100644 examples/smpi/NAS/BT/add.f create mode 100644 examples/smpi/NAS/BT/adi.f create mode 100644 examples/smpi/NAS/BT/bt.f create mode 100644 examples/smpi/NAS/BT/btio.f create mode 100644 examples/smpi/NAS/BT/btio_common.f create mode 100644 examples/smpi/NAS/BT/copy_faces.f create mode 100644 examples/smpi/NAS/BT/define.f create mode 100644 examples/smpi/NAS/BT/epio.f create mode 100644 examples/smpi/NAS/BT/error.f create mode 100644 examples/smpi/NAS/BT/exact_rhs.f create mode 100644 examples/smpi/NAS/BT/exact_solution.f create mode 100644 examples/smpi/NAS/BT/fortran_io.f create mode 100644 examples/smpi/NAS/BT/full_mpiio.f create mode 100644 examples/smpi/NAS/BT/header.h create mode 100644 examples/smpi/NAS/BT/initialize.f create mode 100644 examples/smpi/NAS/BT/inputbt.data.sample create mode 100644 examples/smpi/NAS/BT/make_set.f create mode 100644 examples/smpi/NAS/BT/mpinpb.h create mode 100644 examples/smpi/NAS/BT/rhs.f create mode 100644 examples/smpi/NAS/BT/set_constants.f create mode 100644 examples/smpi/NAS/BT/setup_mpi.f create mode 100644 examples/smpi/NAS/BT/simple_mpiio.f create mode 100644 examples/smpi/NAS/BT/solve_subs.f create mode 100644 examples/smpi/NAS/BT/verify.f create mode 100644 examples/smpi/NAS/BT/work_lhs.h create mode 100644 examples/smpi/NAS/BT/work_lhs_vec.h create mode 100644 examples/smpi/NAS/BT/x_solve.f create mode 100644 examples/smpi/NAS/BT/x_solve_vec.f create mode 100644 examples/smpi/NAS/BT/y_solve.f create mode 100644 examples/smpi/NAS/BT/y_solve_vec.f create mode 100644 examples/smpi/NAS/BT/z_solve.f create mode 100644 examples/smpi/NAS/BT/z_solve_vec.f create mode 100644 examples/smpi/NAS/CG/Makefile create mode 100644 examples/smpi/NAS/CG/cg.f create mode 100644 examples/smpi/NAS/CG/mpinpb.h create mode 100644 examples/smpi/NAS/DT/DGraph.c create mode 100644 examples/smpi/NAS/DT/DGraph.h create mode 100644 examples/smpi/NAS/DT/Makefile create mode 100644 examples/smpi/NAS/DT/README create mode 100644 examples/smpi/NAS/DT/dt.c create mode 100644 examples/smpi/NAS/EP/Makefile create mode 100644 examples/smpi/NAS/EP/README create mode 100644 examples/smpi/NAS/EP/ep.c create mode 100644 examples/smpi/NAS/EP/ep.f create mode 100644 examples/smpi/NAS/EP/mpinpb.h create mode 100644 examples/smpi/NAS/EP/randlc.c create mode 100644 examples/smpi/NAS/EP/randlc.h create mode 100644 examples/smpi/NAS/FT/Makefile create mode 100644 examples/smpi/NAS/FT/README create mode 100644 examples/smpi/NAS/FT/ft.f create mode 100644 examples/smpi/NAS/FT/global.h create mode 100644 examples/smpi/NAS/FT/inputft.data.sample create mode 100644 examples/smpi/NAS/FT/mpinpb.h create mode 100644 examples/smpi/NAS/IS/Makefile create mode 100644 examples/smpi/NAS/IS/is.c create mode 100644 examples/smpi/NAS/LU/Makefile create mode 100644 examples/smpi/NAS/LU/applu.incl create mode 100644 examples/smpi/NAS/LU/bcast_inputs.f create mode 100644 examples/smpi/NAS/LU/blts.f create mode 100644 examples/smpi/NAS/LU/blts_vec.f create mode 100644 examples/smpi/NAS/LU/buts.f create mode 100644 examples/smpi/NAS/LU/buts_vec.f create mode 100644 examples/smpi/NAS/LU/erhs.f create mode 100644 examples/smpi/NAS/LU/error.f create mode 100644 examples/smpi/NAS/LU/exact.f create mode 100644 examples/smpi/NAS/LU/exchange_1.f create mode 100644 examples/smpi/NAS/LU/exchange_3.f create mode 100644 examples/smpi/NAS/LU/exchange_4.f create mode 100644 examples/smpi/NAS/LU/exchange_5.f create mode 100644 examples/smpi/NAS/LU/exchange_6.f create mode 100644 examples/smpi/NAS/LU/init_comm.f create mode 100644 examples/smpi/NAS/LU/inputlu.data.sample create mode 100644 examples/smpi/NAS/LU/jacld.f create mode 100644 examples/smpi/NAS/LU/jacu.f create mode 100644 examples/smpi/NAS/LU/l2norm.f create mode 100644 examples/smpi/NAS/LU/lu.f create mode 100644 examples/smpi/NAS/LU/mpinpb.h create mode 100644 examples/smpi/NAS/LU/neighbors.f create mode 100644 examples/smpi/NAS/LU/nodedim.f create mode 100644 examples/smpi/NAS/LU/pintgr.f create mode 100644 examples/smpi/NAS/LU/proc_grid.f create mode 100644 examples/smpi/NAS/LU/read_input.f create mode 100644 examples/smpi/NAS/LU/rhs.f create mode 100644 examples/smpi/NAS/LU/setbv.f create mode 100644 examples/smpi/NAS/LU/setcoeff.f create mode 100644 examples/smpi/NAS/LU/sethyper.f create mode 100644 examples/smpi/NAS/LU/setiv.f create mode 100644 examples/smpi/NAS/LU/ssor.f create mode 100644 examples/smpi/NAS/LU/subdomain.f create mode 100644 examples/smpi/NAS/LU/verify.f create mode 100644 examples/smpi/NAS/MG/Makefile create mode 100644 examples/smpi/NAS/MG/README create mode 100644 examples/smpi/NAS/MG/globals.h create mode 100644 examples/smpi/NAS/MG/mg.f create mode 100644 examples/smpi/NAS/MG/mg.input.sample create mode 100644 examples/smpi/NAS/MG/mpinpb.h create mode 100644 examples/smpi/NAS/MPI_dummy/Makefile create mode 100644 examples/smpi/NAS/MPI_dummy/README create mode 100644 examples/smpi/NAS/MPI_dummy/mpi.h create mode 100644 examples/smpi/NAS/MPI_dummy/mpi_dummy.c create mode 100644 examples/smpi/NAS/MPI_dummy/mpi_dummy.f create mode 100644 examples/smpi/NAS/MPI_dummy/mpif.h create mode 100644 examples/smpi/NAS/MPI_dummy/test.f create mode 100644 examples/smpi/NAS/MPI_dummy/wtime.c create mode 100644 examples/smpi/NAS/MPI_dummy/wtime.f create mode 100644 examples/smpi/NAS/MPI_dummy/wtime.h create mode 100644 examples/smpi/NAS/MPI_dummy/wtime_sgi64.c create mode 100644 examples/smpi/NAS/Makefile create mode 100644 examples/smpi/NAS/README create mode 100644 examples/smpi/NAS/README.install create mode 100644 examples/smpi/NAS/SP/Makefile create mode 100644 examples/smpi/NAS/SP/README create mode 100644 examples/smpi/NAS/SP/add.f create mode 100644 examples/smpi/NAS/SP/adi.f create mode 100644 examples/smpi/NAS/SP/copy_faces.f create mode 100644 examples/smpi/NAS/SP/define.f create mode 100644 examples/smpi/NAS/SP/error.f create mode 100644 examples/smpi/NAS/SP/exact_rhs.f create mode 100644 examples/smpi/NAS/SP/exact_solution.f create mode 100644 examples/smpi/NAS/SP/header.h create mode 100644 examples/smpi/NAS/SP/initialize.f create mode 100644 examples/smpi/NAS/SP/inputsp.data.sample create mode 100644 examples/smpi/NAS/SP/lhsx.f create mode 100644 examples/smpi/NAS/SP/lhsy.f create mode 100644 examples/smpi/NAS/SP/lhsz.f create mode 100644 examples/smpi/NAS/SP/make_set.f create mode 100644 examples/smpi/NAS/SP/mpinpb.h create mode 100644 examples/smpi/NAS/SP/ninvr.f create mode 100644 examples/smpi/NAS/SP/pinvr.f create mode 100644 examples/smpi/NAS/SP/rhs.f create mode 100644 examples/smpi/NAS/SP/set_constants.f create mode 100644 examples/smpi/NAS/SP/setup_mpi.f create mode 100644 examples/smpi/NAS/SP/sp.f create mode 100644 examples/smpi/NAS/SP/txinvr.f create mode 100644 examples/smpi/NAS/SP/tzetar.f create mode 100644 examples/smpi/NAS/SP/verify.f create mode 100644 examples/smpi/NAS/SP/x_solve.f create mode 100644 examples/smpi/NAS/SP/y_solve.f create mode 100644 examples/smpi/NAS/SP/z_solve.f create mode 100644 examples/smpi/NAS/common/c_print_results.c create mode 100644 examples/smpi/NAS/common/c_timers.c create mode 100644 examples/smpi/NAS/common/print_results.f create mode 100644 examples/smpi/NAS/common/randdp.c create mode 100644 examples/smpi/NAS/common/randdp.f create mode 100644 examples/smpi/NAS/common/randdpvec.f create mode 100644 examples/smpi/NAS/common/randi8.f create mode 100644 examples/smpi/NAS/common/randi8_safe.f create mode 100644 examples/smpi/NAS/common/timers.f create mode 100644 examples/smpi/NAS/config/NAS.samples/README create mode 100644 examples/smpi/NAS/config/NAS.samples/make.def.dec_alpha create mode 100644 examples/smpi/NAS/config/NAS.samples/make.def.irix6.2 create mode 100644 examples/smpi/NAS/config/NAS.samples/make.def.origin create mode 100644 examples/smpi/NAS/config/NAS.samples/make.def.sgi_powerchallenge create mode 100644 examples/smpi/NAS/config/NAS.samples/make.def.sp2_babbage create mode 100644 examples/smpi/NAS/config/NAS.samples/make.def.sun_ultra_sparc create mode 100644 examples/smpi/NAS/config/NAS.samples/make.def.t3d_cosmos create mode 100644 examples/smpi/NAS/config/NAS.samples/make.def_sun_mpich create mode 100644 examples/smpi/NAS/config/NAS.samples/suite.def.bt create mode 100644 examples/smpi/NAS/config/NAS.samples/suite.def.cg create mode 100644 examples/smpi/NAS/config/NAS.samples/suite.def.ep create mode 100644 examples/smpi/NAS/config/NAS.samples/suite.def.ft create mode 100644 examples/smpi/NAS/config/NAS.samples/suite.def.is create mode 100644 examples/smpi/NAS/config/NAS.samples/suite.def.lu create mode 100644 examples/smpi/NAS/config/NAS.samples/suite.def.mg create mode 100644 examples/smpi/NAS/config/NAS.samples/suite.def.small create mode 100644 examples/smpi/NAS/config/NAS.samples/suite.def.sp create mode 100644 examples/smpi/NAS/config/make.def create mode 100644 examples/smpi/NAS/config/make.def.template create mode 100644 examples/smpi/NAS/config/make.dummy create mode 100644 examples/smpi/NAS/config/suite.def.template create mode 100644 examples/smpi/NAS/sys/Makefile create mode 100644 examples/smpi/NAS/sys/README create mode 100644 examples/smpi/NAS/sys/make.common create mode 100755 examples/smpi/NAS/sys/print_header create mode 100755 examples/smpi/NAS/sys/print_instructions create mode 100644 examples/smpi/NAS/sys/setparams.c create mode 100644 examples/smpi/NAS/sys/suite.awk diff --git a/examples/smpi/NAS/BT/Makefile b/examples/smpi/NAS/BT/Makefile new file mode 100644 index 0000000000..dd27503221 --- /dev/null +++ b/examples/smpi/NAS/BT/Makefile @@ -0,0 +1,106 @@ +SHELL=/bin/sh +BENCHMARK=bt +BENCHMARKU=BT +VEC= + +include ../config/make.def + + +OBJS = bt.o make_set.o initialize.o exact_solution.o exact_rhs.o \ + set_constants.o adi.o define.o copy_faces.o rhs.o solve_subs.o \ + x_solve$(VEC).o y_solve$(VEC).o z_solve$(VEC).o add.o error.o \ + verify.o setup_mpi.o \ + ${COMMON}/print_results.o ${COMMON}/timers.o + +include ../sys/make.common + +# npbparams.h is included by header.h +# The following rule should do the trick but many make programs (not gmake) +# will do the wrong thing and rebuild the world every time (because the +# mod time on header.h is not changed. One solution would be to +# touch header.h but this might cause confusion if someone has +# accidentally deleted it. Instead, make the dependency on npbparams.h +# explicit in all the lines below (even though dependence is indirect). + +# header.h: npbparams.h + +${PROGRAM}: config + @if [ x$(VERSION) = xvec ] ; then \ + ${MAKE} VEC=_vec exec; \ + elif [ x$(VERSION) = xVEC ] ; then \ + ${MAKE} VEC=_vec exec; \ + else \ + ${MAKE} exec; \ + fi + +exec: $(OBJS) + @if [ x$(SUBTYPE) = xfull ] ; then \ + ${MAKE} bt-full; \ + elif [ x$(SUBTYPE) = xFULL ] ; then \ + ${MAKE} bt-full; \ + elif [ x$(SUBTYPE) = xsimple ] ; then \ + ${MAKE} bt-simple; \ + elif [ x$(SUBTYPE) = xSIMPLE ] ; then \ + ${MAKE} bt-simple; \ + elif [ x$(SUBTYPE) = xfortran ] ; then \ + ${MAKE} bt-fortran; \ + elif [ x$(SUBTYPE) = xFORTRAN ] ; then \ + ${MAKE} bt-fortran; \ + elif [ x$(SUBTYPE) = xepio ] ; then \ + ${MAKE} bt-epio; \ + elif [ x$(SUBTYPE) = xEPIO ] ; then \ + ${MAKE} bt-epio; \ + else \ + ${MAKE} bt-bt; \ + fi + +bt-bt: ${OBJS} btio.o + ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} btio.o ${FMPI_LIB} + +bt-full: ${OBJS} full_mpiio.o btio_common.o + ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.mpi_io_full ${OBJS} btio_common.o full_mpiio.o ${FMPI_LIB} + +bt-simple: ${OBJS} simple_mpiio.o btio_common.o + ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.mpi_io_simple ${OBJS} btio_common.o simple_mpiio.o ${FMPI_LIB} + +bt-fortran: ${OBJS} fortran_io.o btio_common.o + ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.fortran_io ${OBJS} btio_common.o fortran_io.o ${FMPI_LIB} + +bt-epio: ${OBJS} epio.o btio_common.o + ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.ep_io ${OBJS} btio_common.o epio.o ${FMPI_LIB} + +.f.o: + ${FCOMPILE} $< + +.c.o: + ${CCOMPILE} $< + + +bt.o: bt.f header.h npbparams.h mpinpb.h +make_set.o: make_set.f header.h npbparams.h mpinpb.h +initialize.o: initialize.f header.h npbparams.h +exact_solution.o: exact_solution.f header.h npbparams.h +exact_rhs.o: exact_rhs.f header.h npbparams.h +set_constants.o: set_constants.f header.h npbparams.h +adi.o: adi.f header.h npbparams.h +define.o: define.f header.h npbparams.h +copy_faces.o: copy_faces.f header.h npbparams.h mpinpb.h +rhs.o: rhs.f header.h npbparams.h +x_solve$(VEC).o: x_solve$(VEC).f header.h work_lhs$(VEC).h npbparams.h mpinpb.h +y_solve$(VEC).o: y_solve$(VEC).f header.h work_lhs$(VEC).h npbparams.h mpinpb.h +z_solve$(VEC).o: z_solve$(VEC).f header.h work_lhs$(VEC).h npbparams.h mpinpb.h +solve_subs.o: solve_subs.f npbparams.h +add.o: add.f header.h npbparams.h +error.o: error.f header.h npbparams.h mpinpb.h +verify.o: verify.f header.h npbparams.h mpinpb.h +setup_mpi.o: setup_mpi.f mpinpb.h npbparams.h +btio.o: btio.f header.h npbparams.h +btio_common.o: btio_common.f mpinpb.h npbparams.h +fortran_io.o: fortran_io.f mpinpb.h npbparams.h +simple_mpiio.o: simple_mpiio.f mpinpb.h npbparams.h +full_mpiio.o: full_mpiio.f mpinpb.h npbparams.h +epio.o: epio.f mpinpb.h npbparams.h + +clean: + - rm -f *.o *~ mputil* + - rm -f npbparams.h core diff --git a/examples/smpi/NAS/BT/add.f b/examples/smpi/NAS/BT/add.f new file mode 100644 index 0000000000..e14cde46ef --- /dev/null +++ b/examples/smpi/NAS/BT/add.f @@ -0,0 +1,30 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine add + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c addition of update to the vector u +c--------------------------------------------------------------------- + + include 'header.h' + + integer c, i, j, k, m + + do c = 1, ncells + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + do m = 1, 5 + u(m,i,j,k,c) = u(m,i,j,k,c) + rhs(m,i,j,k,c) + enddo + enddo + enddo + enddo + enddo + + return + end diff --git a/examples/smpi/NAS/BT/adi.f b/examples/smpi/NAS/BT/adi.f new file mode 100644 index 0000000000..58450c028e --- /dev/null +++ b/examples/smpi/NAS/BT/adi.f @@ -0,0 +1,21 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine adi + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + call copy_faces + + call x_solve + + call y_solve + + call z_solve + + call add + + return + end + diff --git a/examples/smpi/NAS/BT/bt.f b/examples/smpi/NAS/BT/bt.f new file mode 100644 index 0000000000..36e50781b3 --- /dev/null +++ b/examples/smpi/NAS/BT/bt.f @@ -0,0 +1,275 @@ +!-------------------------------------------------------------------------! +! ! +! N A S P A R A L L E L B E N C H M A R K S 3.3 ! +! ! +! B T ! +! ! +!-------------------------------------------------------------------------! +! ! +! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. ! +! It is described in NAS Technical Reports 95-020 and 02-007. ! +! ! +! Permission to use, copy, distribute and modify this software ! +! for any purpose with or without fee is hereby granted. We ! +! request, however, that all derived work reference the NAS ! +! Parallel Benchmarks 3.3. This software is provided "as is" ! +! without express or implied warranty. ! +! ! +! Information on NPB 3.3, including the technical report, the ! +! original specifications, source code, results and information ! +! on how to submit new results, is available at: ! +! ! +! http://www.nas.nasa.gov/Software/NPB/ ! +! ! +! Send comments or suggestions to npb@nas.nasa.gov ! +! ! +! NAS Parallel Benchmarks Group ! +! NASA Ames Research Center ! +! Mail Stop: T27A-1 ! +! Moffett Field, CA 94035-1000 ! +! ! +! E-mail: npb@nas.nasa.gov ! +! Fax: (650) 604-3957 ! +! ! +!-------------------------------------------------------------------------! + +c--------------------------------------------------------------------- +c +c Authors: R. F. Van der Wijngaart +c T. Harris +c M. Yarrow +c +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- + program MPBT +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer i, niter, step, c, error, fstatus + double precision navg, mflops, mbytes, n3 + + external timer_read + double precision t, tmax, tiominv, tpc, timer_read + logical verified + character class, cbuff*40 + + integer wr_interval + + call setup_mpi + if (.not. active) goto 999 + +c--------------------------------------------------------------------- +c Root node reads input file (if it exists) else takes +c defaults from parameters +c--------------------------------------------------------------------- + if (node .eq. root) then + + write(*, 1000) + open (unit=2,file='inputbt.data',status='old', iostat=fstatus) +c + rd_interval = 0 + if (fstatus .eq. 0) then + write(*,233) + 233 format(' Reading from input file inputbt.data') + read (2,*) niter + read (2,*) dt + read (2,*) grid_points(1), grid_points(2), grid_points(3) + if (iotype .ne. 0) then + read (2,'(A)') cbuff + read (cbuff,*,iostat=i) wr_interval, rd_interval + if (i .ne. 0) rd_interval = 0 + if (wr_interval .le. 0) wr_interval = wr_default + endif + if (iotype .eq. 1) then + read (2,*) collbuf_nodes, collbuf_size + write(*,*) 'collbuf_nodes ', collbuf_nodes + write(*,*) 'collbuf_size ', collbuf_size + endif + close(2) + else + write(*,234) + niter = niter_default + dt = dt_default + grid_points(1) = problem_size + grid_points(2) = problem_size + grid_points(3) = problem_size + wr_interval = wr_default + if (iotype .eq. 1) then +c set number of nodes involved in collective buffering to 4, +c unless total number of nodes is smaller than that. +c set buffer size for collective buffering to 1MB per node +c collbuf_nodes = min(4,no_nodes) +c set default to No-File-Hints with a value of 0 + collbuf_nodes = 0 + collbuf_size = 1000000 + endif + endif + 234 format(' No input file inputbt.data. Using compiled defaults') + + write(*, 1001) grid_points(1), grid_points(2), grid_points(3) + write(*, 1002) niter, dt + if (no_nodes .ne. total_nodes) write(*, 1004) total_nodes + if (no_nodes .ne. maxcells*maxcells) + > write(*, 1005) maxcells*maxcells + write(*, 1003) no_nodes + + if (iotype .eq. 1) write(*, 1006) 'FULL MPI-IO', wr_interval + if (iotype .eq. 2) write(*, 1006) 'SIMPLE MPI-IO', wr_interval + if (iotype .eq. 3) write(*, 1006) 'EPIO', wr_interval + if (iotype .eq. 4) write(*, 1006) 'FORTRAN IO', wr_interval + + 1000 format(//, ' NAS Parallel Benchmarks 3.3 -- BT Benchmark ',/) + 1001 format(' Size: ', i4, 'x', i4, 'x', i4) + 1002 format(' Iterations: ', i4, ' dt: ', F11.7) + 1004 format(' Total number of processes: ', i5) + 1005 format(' WARNING: compiled for ', i5, ' processes ') + 1003 format(' Number of active processes: ', i5, /) + 1006 format(' BTIO -- ', A, ' write interval: ', i3 /) + + endif + + call mpi_bcast(niter, 1, MPI_INTEGER, + > root, comm_setup, error) + + call mpi_bcast(dt, 1, dp_type, + > root, comm_setup, error) + + call mpi_bcast(grid_points(1), 3, MPI_INTEGER, + > root, comm_setup, error) + + call mpi_bcast(wr_interval, 1, MPI_INTEGER, + > root, comm_setup, error) + + call mpi_bcast(rd_interval, 1, MPI_INTEGER, + > root, comm_setup, error) + + call make_set + + do c = 1, maxcells + if ( (cell_size(1,c) .gt. IMAX) .or. + > (cell_size(2,c) .gt. JMAX) .or. + > (cell_size(3,c) .gt. KMAX) ) then + print *,node, c, (cell_size(i,c),i=1,3) + print *,' Problem size too big for compiled array sizes' + goto 999 + endif + end do + + call set_constants + + call initialize + + call setup_btio + idump = 0 + + call lhsinit + + call exact_rhs + + call compute_buffer_size(5) + +c--------------------------------------------------------------------- +c do one time step to touch all code, and reinitialize +c--------------------------------------------------------------------- + call adi + call initialize + + call timer_clear(2) + +c--------------------------------------------------------------------- +c Synchronize before placing time stamp +c--------------------------------------------------------------------- + call mpi_barrier(comm_setup, error) + + call timer_clear(1) + call timer_start(1) + + do step = 1, niter + + if (node .eq. root) then + if (mod(step, 20) .eq. 0 .or. step .eq. niter .or. + > step .eq. 1) then + write(*, 200) step + 200 format(' Time step ', i4) + endif + endif + + call adi + + if (iotype .ne. 0) then + call timer_start(2) + if (mod(step, wr_interval).eq.0 .or. step .eq. niter) then + if (node .eq. root) then + print *, 'Writing data set, time step', step + endif + if (step .eq. niter .and. rd_interval .gt. 1) then + rd_interval = 1 + endif + call output_timestep + idump = idump + 1 + endif + call timer_stop(2) + endif + end do + + call btio_cleanup + + call timer_stop(1) + t = timer_read(1) + + call verify(niter, class, verified) + + call mpi_reduce(t, tmax, 1, + > dp_type, MPI_MAX, + > root, comm_setup, error) + + if (iotype .ne. 0) then + t = timer_read(2) + if (t .ne. 0.d0) t = 1.0d0 / t + call mpi_reduce(t, tiominv, 1, + > dp_type, MPI_SUM, + > root, comm_setup, error) + endif + + if( node .eq. root ) then + n3 = 1.0d0*grid_points(1)*grid_points(2)*grid_points(3) + navg = (grid_points(1)+grid_points(2)+grid_points(3))/3.0 + if( tmax .ne. 0. ) then + mflops = 1.0e-6*float(niter)* + > (3478.8*n3-17655.7*navg**2+28023.7*navg) + > / tmax + else + mflops = 0.0 + endif + + if (iotype .ne. 0) then + mbytes = n3 * 40.0 * idump * 1.0d-6 + tiominv = tiominv / no_nodes + t = 0.0 + if (tiominv .ne. 0.) t = 1.d0 / tiominv + tpc = 0.0 + if (tmax .ne. 0.) tpc = t * 100.0 / tmax + write(*,1100) t, tpc, mbytes, mbytes*tiominv + 1100 format(/' BTIO -- statistics:'/ + > ' I/O timing in seconds : ', f14.2/ + > ' I/O timing percentage : ', f14.2/ + > ' Total data written (MB) : ', f14.2/ + > ' I/O data rate (MB/sec) : ', f14.2) + endif + + call print_results('BT', class, grid_points(1), + > grid_points(2), grid_points(3), niter, maxcells*maxcells, + > total_nodes, tmax, mflops, ' floating point', + > verified, npbversion,compiletime, cs1, cs2, cs3, cs4, cs5, + > cs6, '(none)') + endif + + 999 continue + call mpi_barrier(MPI_COMM_WORLD, error) + call mpi_finalize(error) + + end + diff --git a/examples/smpi/NAS/BT/btio.f b/examples/smpi/NAS/BT/btio.f new file mode 100644 index 0000000000..1fb730b1f6 --- /dev/null +++ b/examples/smpi/NAS/BT/btio.f @@ -0,0 +1,72 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine setup_btio + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine output_timestep + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine btio_cleanup + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine btio_verify(verified) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + logical verified + + verified = .true. + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine accumulate_norms(xce_acc) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + double precision xce_acc(5) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine checksum_timestep + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + return + end diff --git a/examples/smpi/NAS/BT/btio_common.f b/examples/smpi/NAS/BT/btio_common.f new file mode 100644 index 0000000000..9227a12b70 --- /dev/null +++ b/examples/smpi/NAS/BT/btio_common.f @@ -0,0 +1,30 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine clear_timestep + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer cio, kio, jio, ix + + do cio=1,ncells + do kio=0, cell_size(3,cio)-1 + do jio=0, cell_size(2,cio)-1 + do ix=0,cell_size(1,cio)-1 + u(1,ix, jio,kio,cio) = 0 + u(2,ix, jio,kio,cio) = 0 + u(3,ix, jio,kio,cio) = 0 + u(4,ix, jio,kio,cio) = 0 + u(5,ix, jio,kio,cio) = 0 + enddo + enddo + enddo + enddo + + return + end + diff --git a/examples/smpi/NAS/BT/copy_faces.f b/examples/smpi/NAS/BT/copy_faces.f new file mode 100644 index 0000000000..14b82caf83 --- /dev/null +++ b/examples/smpi/NAS/BT/copy_faces.f @@ -0,0 +1,316 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine copy_faces + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c This function copies the face values of a variable defined on a set +c of cells to the overlap locations of the adjacent sets of cells. +c Because a set of cells interfaces in each direction with exactly one +c other set, we only need to fill six different buffers. We could try to +c overlap communication with computation, by computing +c some internal values while communicating boundary values, but this +c adds so much overhead that it's not clearly useful. +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer i, j, k, c, m, requests(0:11), p0, p1, + > p2, p3, p4, p5, b_size(0:5), ss(0:5), + > sr(0:5), error, statuses(MPI_STATUS_SIZE, 0:11) + +c--------------------------------------------------------------------- +c exit immediately if there are no faces to be copied +c--------------------------------------------------------------------- + if (no_nodes .eq. 1) then + call compute_rhs + return + endif + + ss(0) = start_send_east + ss(1) = start_send_west + ss(2) = start_send_north + ss(3) = start_send_south + ss(4) = start_send_top + ss(5) = start_send_bottom + + sr(0) = start_recv_east + sr(1) = start_recv_west + sr(2) = start_recv_north + sr(3) = start_recv_south + sr(4) = start_recv_top + sr(5) = start_recv_bottom + + b_size(0) = east_size + b_size(1) = west_size + b_size(2) = north_size + b_size(3) = south_size + b_size(4) = top_size + b_size(5) = bottom_size + +c--------------------------------------------------------------------- +c because the difference stencil for the diagonalized scheme is +c orthogonal, we do not have to perform the staged copying of faces, +c but can send all face information simultaneously to the neighboring +c cells in all directions +c--------------------------------------------------------------------- + p0 = 0 + p1 = 0 + p2 = 0 + p3 = 0 + p4 = 0 + p5 = 0 + + do c = 1, ncells + +c--------------------------------------------------------------------- +c fill the buffer to be sent to eastern neighbors (i-dir) +c--------------------------------------------------------------------- + if (cell_coord(1,c) .ne. ncells) then + do k = 0, cell_size(3,c)-1 + do j = 0, cell_size(2,c)-1 + do i = cell_size(1,c)-2, cell_size(1,c)-1 + do m = 1, 5 + out_buffer(ss(0)+p0) = u(m,i,j,k,c) + p0 = p0 + 1 + end do + end do + end do + end do + endif + +c--------------------------------------------------------------------- +c fill the buffer to be sent to western neighbors +c--------------------------------------------------------------------- + if (cell_coord(1,c) .ne. 1) then + do k = 0, cell_size(3,c)-1 + do j = 0, cell_size(2,c)-1 + do i = 0, 1 + do m = 1, 5 + out_buffer(ss(1)+p1) = u(m,i,j,k,c) + p1 = p1 + 1 + end do + end do + end do + end do + + endif + +c--------------------------------------------------------------------- +c fill the buffer to be sent to northern neighbors (j_dir) +c--------------------------------------------------------------------- + if (cell_coord(2,c) .ne. ncells) then + do k = 0, cell_size(3,c)-1 + do j = cell_size(2,c)-2, cell_size(2,c)-1 + do i = 0, cell_size(1,c)-1 + do m = 1, 5 + out_buffer(ss(2)+p2) = u(m,i,j,k,c) + p2 = p2 + 1 + end do + end do + end do + end do + endif + +c--------------------------------------------------------------------- +c fill the buffer to be sent to southern neighbors +c--------------------------------------------------------------------- + if (cell_coord(2,c).ne. 1) then + do k = 0, cell_size(3,c)-1 + do j = 0, 1 + do i = 0, cell_size(1,c)-1 + do m = 1, 5 + out_buffer(ss(3)+p3) = u(m,i,j,k,c) + p3 = p3 + 1 + end do + end do + end do + end do + endif + +c--------------------------------------------------------------------- +c fill the buffer to be sent to top neighbors (k-dir) +c--------------------------------------------------------------------- + if (cell_coord(3,c) .ne. ncells) then + do k = cell_size(3,c)-2, cell_size(3,c)-1 + do j = 0, cell_size(2,c)-1 + do i = 0, cell_size(1,c)-1 + do m = 1, 5 + out_buffer(ss(4)+p4) = u(m,i,j,k,c) + p4 = p4 + 1 + end do + end do + end do + end do + endif + +c--------------------------------------------------------------------- +c fill the buffer to be sent to bottom neighbors +c--------------------------------------------------------------------- + if (cell_coord(3,c).ne. 1) then + do k=0, 1 + do j = 0, cell_size(2,c)-1 + do i = 0, cell_size(1,c)-1 + do m = 1, 5 + out_buffer(ss(5)+p5) = u(m,i,j,k,c) + p5 = p5 + 1 + end do + end do + end do + end do + endif + +c--------------------------------------------------------------------- +c cell loop +c--------------------------------------------------------------------- + end do + + call mpi_irecv(in_buffer(sr(0)), b_size(0), + > dp_type, successor(1), WEST, + > comm_rhs, requests(0), error) + call mpi_irecv(in_buffer(sr(1)), b_size(1), + > dp_type, predecessor(1), EAST, + > comm_rhs, requests(1), error) + call mpi_irecv(in_buffer(sr(2)), b_size(2), + > dp_type, successor(2), SOUTH, + > comm_rhs, requests(2), error) + call mpi_irecv(in_buffer(sr(3)), b_size(3), + > dp_type, predecessor(2), NORTH, + > comm_rhs, requests(3), error) + call mpi_irecv(in_buffer(sr(4)), b_size(4), + > dp_type, successor(3), BOTTOM, + > comm_rhs, requests(4), error) + call mpi_irecv(in_buffer(sr(5)), b_size(5), + > dp_type, predecessor(3), TOP, + > comm_rhs, requests(5), error) + + call mpi_isend(out_buffer(ss(0)), b_size(0), + > dp_type, successor(1), EAST, + > comm_rhs, requests(6), error) + call mpi_isend(out_buffer(ss(1)), b_size(1), + > dp_type, predecessor(1), WEST, + > comm_rhs, requests(7), error) + call mpi_isend(out_buffer(ss(2)), b_size(2), + > dp_type,successor(2), NORTH, + > comm_rhs, requests(8), error) + call mpi_isend(out_buffer(ss(3)), b_size(3), + > dp_type,predecessor(2), SOUTH, + > comm_rhs, requests(9), error) + call mpi_isend(out_buffer(ss(4)), b_size(4), + > dp_type,successor(3), TOP, + > comm_rhs, requests(10), error) + call mpi_isend(out_buffer(ss(5)), b_size(5), + > dp_type,predecessor(3), BOTTOM, + > comm_rhs,requests(11), error) + + + call mpi_waitall(12, requests, statuses, error) + +c--------------------------------------------------------------------- +c unpack the data that has just been received; +c--------------------------------------------------------------------- + p0 = 0 + p1 = 0 + p2 = 0 + p3 = 0 + p4 = 0 + p5 = 0 + + do c = 1, ncells + + if (cell_coord(1,c) .ne. 1) then + do k = 0, cell_size(3,c)-1 + do j = 0, cell_size(2,c)-1 + do i = -2, -1 + do m = 1, 5 + u(m,i,j,k,c) = in_buffer(sr(1)+p0) + p0 = p0 + 1 + end do + end do + end do + end do + endif + + if (cell_coord(1,c) .ne. ncells) then + do k = 0, cell_size(3,c)-1 + do j = 0, cell_size(2,c)-1 + do i = cell_size(1,c), cell_size(1,c)+1 + do m = 1, 5 + u(m,i,j,k,c) = in_buffer(sr(0)+p1) + p1 = p1 + 1 + end do + end do + end do + end do + end if + + if (cell_coord(2,c) .ne. 1) then + do k = 0, cell_size(3,c)-1 + do j = -2, -1 + do i = 0, cell_size(1,c)-1 + do m = 1, 5 + u(m,i,j,k,c) = in_buffer(sr(3)+p2) + p2 = p2 + 1 + end do + end do + end do + end do + + endif + + if (cell_coord(2,c) .ne. ncells) then + do k = 0, cell_size(3,c)-1 + do j = cell_size(2,c), cell_size(2,c)+1 + do i = 0, cell_size(1,c)-1 + do m = 1, 5 + u(m,i,j,k,c) = in_buffer(sr(2)+p3) + p3 = p3 + 1 + end do + end do + end do + end do + endif + + if (cell_coord(3,c) .ne. 1) then + do k = -2, -1 + do j = 0, cell_size(2,c)-1 + do i = 0, cell_size(1,c)-1 + do m = 1, 5 + u(m,i,j,k,c) = in_buffer(sr(5)+p4) + p4 = p4 + 1 + end do + end do + end do + end do + endif + + if (cell_coord(3,c) .ne. ncells) then + do k = cell_size(3,c), cell_size(3,c)+1 + do j = 0, cell_size(2,c)-1 + do i = 0, cell_size(1,c)-1 + do m = 1, 5 + u(m,i,j,k,c) = in_buffer(sr(4)+p5) + p5 = p5 + 1 + end do + end do + end do + end do + endif + +c--------------------------------------------------------------------- +c cells loop +c--------------------------------------------------------------------- + end do + +c--------------------------------------------------------------------- +c do the rest of the rhs that uses the copied face values +c--------------------------------------------------------------------- + call compute_rhs + + return + end diff --git a/examples/smpi/NAS/BT/define.f b/examples/smpi/NAS/BT/define.f new file mode 100644 index 0000000000..03c4c6edd7 --- /dev/null +++ b/examples/smpi/NAS/BT/define.f @@ -0,0 +1,64 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine compute_buffer_size(dim) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'header.h' + + integer c, dim, face_size + + if (ncells .eq. 1) return + +c--------------------------------------------------------------------- +c compute the actual sizes of the buffers; note that there is +c always one cell face that doesn't need buffer space, because it +c is at the boundary of the grid +c--------------------------------------------------------------------- + west_size = 0 + east_size = 0 + + do c = 1, ncells + face_size = cell_size(2,c) * cell_size(3,c) * dim * 2 + if (cell_coord(1,c).ne.1) west_size = west_size + face_size + if (cell_coord(1,c).ne.ncells) east_size = east_size + + > face_size + end do + + north_size = 0 + south_size = 0 + do c = 1, ncells + face_size = cell_size(1,c)*cell_size(3,c) * dim * 2 + if (cell_coord(2,c).ne.1) south_size = south_size + face_size + if (cell_coord(2,c).ne.ncells) north_size = north_size + + > face_size + end do + + top_size = 0 + bottom_size = 0 + do c = 1, ncells + face_size = cell_size(1,c) * cell_size(2,c) * dim * 2 + if (cell_coord(3,c).ne.1) bottom_size = bottom_size + + > face_size + if (cell_coord(3,c).ne.ncells) top_size = top_size + + > face_size + end do + + start_send_west = 1 + start_send_east = start_send_west + west_size + start_send_south = start_send_east + east_size + start_send_north = start_send_south + south_size + start_send_bottom = start_send_north + north_size + start_send_top = start_send_bottom + bottom_size + start_recv_west = 1 + start_recv_east = start_recv_west + west_size + start_recv_south = start_recv_east + east_size + start_recv_north = start_recv_south + south_size + start_recv_bottom = start_recv_north + north_size + start_recv_top = start_recv_bottom + bottom_size + + return + end + diff --git a/examples/smpi/NAS/BT/epio.f b/examples/smpi/NAS/BT/epio.f new file mode 100644 index 0000000000..52b630999d --- /dev/null +++ b/examples/smpi/NAS/BT/epio.f @@ -0,0 +1,165 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine setup_btio + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + character*(128) newfilenm + integer m + + if (node .lt. 10000) then + write (newfilenm, 996) filenm,node + else + print *, 'error generating file names (> 10000 nodes)' + stop + endif + +996 format (a,'.',i4.4) + + open (unit=99, file=newfilenm, form='unformatted', + $ status='unknown') + + do m = 1, 5 + xce_sub(m) = 0.d0 + end do + + idump_sub = 0 + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine output_timestep + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + include 'header.h' + include 'mpinpb.h' + + integer ix, iio, jio, kio, cio, aio + + do cio=1,ncells + write(99) + $ ((((u(aio,ix, jio,kio,cio),aio=1,5), + $ ix=0, cell_size(1,cio)-1), + $ jio=0, cell_size(2,cio)-1), + $ kio=0, cell_size(3,cio)-1) + enddo + + idump_sub = idump_sub + 1 + if (rd_interval .gt. 0) then + if (idump_sub .ge. rd_interval) then + + rewind(99) + call acc_sub_norms(idump+1) + + rewind(99) + idump_sub = 0 + endif + endif + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine acc_sub_norms(idump_cur) + + include 'header.h' + include 'mpinpb.h' + + integer idump_cur + + integer ix, jio, kio, cio, ii, m, ichunk + double precision xce_single(5) + + ichunk = idump_cur - idump_sub + 1 + do ii=0, idump_sub-1 + do cio=1,ncells + read(99) + $ ((((u(m,ix, jio,kio,cio),m=1,5), + $ ix=0, cell_size(1,cio)-1), + $ jio=0, cell_size(2,cio)-1), + $ kio=0, cell_size(3,cio)-1) + enddo + + if (node .eq. root) print *, 'Reading data set ', ii+ichunk + + call error_norm(xce_single) + do m = 1, 5 + xce_sub(m) = xce_sub(m) + xce_single(m) + end do + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine btio_cleanup + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + close(unit=99) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine accumulate_norms(xce_acc) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + double precision xce_acc(5) + + character*(128) newfilenm + integer m + + if (rd_interval .gt. 0) goto 20 + + if (node .lt. 10000) then + write (newfilenm, 996) filenm,node + else + print *, 'error generating file names (> 10000 nodes)' + stop + endif + +996 format (a,'.',i4.4) + + open (unit=99, file=newfilenm, + $ form='unformatted') + +c clear the last time step + + call clear_timestep + +c read back the time steps and accumulate norms + + call acc_sub_norms(idump) + + close(unit=99) + + 20 continue + do m = 1, 5 + xce_acc(m) = xce_sub(m) / dble(idump) + end do + + return + end diff --git a/examples/smpi/NAS/BT/error.f b/examples/smpi/NAS/BT/error.f new file mode 100644 index 0000000000..147a582b58 --- /dev/null +++ b/examples/smpi/NAS/BT/error.f @@ -0,0 +1,106 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine error_norm(rms) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c this function computes the norm of the difference between the +c computed solution and the exact solution +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer c, i, j, k, m, ii, jj, kk, d, error + double precision xi, eta, zeta, u_exact(5), rms(5), rms_work(5), + > add + + do m = 1, 5 + rms_work(m) = 0.0d0 + enddo + + do c = 1, ncells + kk = 0 + do k = cell_low(3,c), cell_high(3,c) + zeta = dble(k) * dnzm1 + jj = 0 + do j = cell_low(2,c), cell_high(2,c) + eta = dble(j) * dnym1 + ii = 0 + do i = cell_low(1,c), cell_high(1,c) + xi = dble(i) * dnxm1 + call exact_solution(xi, eta, zeta, u_exact) + + do m = 1, 5 + add = u(m,ii,jj,kk,c)-u_exact(m) + rms_work(m) = rms_work(m) + add*add + enddo + ii = ii + 1 + enddo + jj = jj + 1 + enddo + kk = kk + 1 + enddo + enddo + + call mpi_allreduce(rms_work, rms, 5, dp_type, + > MPI_SUM, comm_setup, error) + + do m = 1, 5 + do d = 1, 3 + rms(m) = rms(m) / dble(grid_points(d)-2) + enddo + rms(m) = dsqrt(rms(m)) + enddo + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine rhs_norm(rms) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer c, i, j, k, d, m, error + double precision rms(5), rms_work(5), add + + do m = 1, 5 + rms_work(m) = 0.0d0 + enddo + + do c = 1, ncells + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + do m = 1, 5 + add = rhs(m,i,j,k,c) + rms_work(m) = rms_work(m) + add*add + enddo + enddo + enddo + enddo + enddo + + call mpi_allreduce(rms_work, rms, 5, dp_type, + > MPI_SUM, comm_setup, error) + + do m = 1, 5 + do d = 1, 3 + rms(m) = rms(m) / dble(grid_points(d)-2) + enddo + rms(m) = dsqrt(rms(m)) + enddo + + return + end + diff --git a/examples/smpi/NAS/BT/exact_rhs.f b/examples/smpi/NAS/BT/exact_rhs.f new file mode 100644 index 0000000000..26a2871d20 --- /dev/null +++ b/examples/smpi/NAS/BT/exact_rhs.f @@ -0,0 +1,360 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine exact_rhs + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c compute the right hand side based on exact solution +c--------------------------------------------------------------------- + + include 'header.h' + + double precision dtemp(5), xi, eta, zeta, dtpp + integer c, m, i, j, k, ip1, im1, jp1, + > jm1, km1, kp1 + + +c--------------------------------------------------------------------- +c loop over all cells owned by this node +c--------------------------------------------------------------------- + do c = 1, ncells + +c--------------------------------------------------------------------- +c initialize +c--------------------------------------------------------------------- + do k= 0, cell_size(3,c)-1 + do j = 0, cell_size(2,c)-1 + do i = 0, cell_size(1,c)-1 + do m = 1, 5 + forcing(m,i,j,k,c) = 0.0d0 + enddo + enddo + enddo + enddo + +c--------------------------------------------------------------------- +c xi-direction flux differences +c--------------------------------------------------------------------- + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + zeta = dble(k+cell_low(3,c)) * dnzm1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + eta = dble(j+cell_low(2,c)) * dnym1 + + do i=-2*(1-start(1,c)), cell_size(1,c)+1-2*end(1,c) + xi = dble(i+cell_low(1,c)) * dnxm1 + + call exact_solution(xi, eta, zeta, dtemp) + do m = 1, 5 + ue(i,m) = dtemp(m) + enddo + + dtpp = 1.0d0 / dtemp(1) + + do m = 2, 5 + buf(i,m) = dtpp * dtemp(m) + enddo + + cuf(i) = buf(i,2) * buf(i,2) + buf(i,1) = cuf(i) + buf(i,3) * buf(i,3) + + > buf(i,4) * buf(i,4) + q(i) = 0.5d0*(buf(i,2)*ue(i,2) + buf(i,3)*ue(i,3) + + > buf(i,4)*ue(i,4)) + + enddo + + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + im1 = i-1 + ip1 = i+1 + + forcing(1,i,j,k,c) = forcing(1,i,j,k,c) - + > tx2*( ue(ip1,2)-ue(im1,2) )+ + > dx1tx1*(ue(ip1,1)-2.0d0*ue(i,1)+ue(im1,1)) + + forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - tx2 * ( + > (ue(ip1,2)*buf(ip1,2)+c2*(ue(ip1,5)-q(ip1)))- + > (ue(im1,2)*buf(im1,2)+c2*(ue(im1,5)-q(im1))))+ + > xxcon1*(buf(ip1,2)-2.0d0*buf(i,2)+buf(im1,2))+ + > dx2tx1*( ue(ip1,2)-2.0d0* ue(i,2)+ue(im1,2)) + + forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - tx2 * ( + > ue(ip1,3)*buf(ip1,2)-ue(im1,3)*buf(im1,2))+ + > xxcon2*(buf(ip1,3)-2.0d0*buf(i,3)+buf(im1,3))+ + > dx3tx1*( ue(ip1,3)-2.0d0*ue(i,3) +ue(im1,3)) + + forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - tx2*( + > ue(ip1,4)*buf(ip1,2)-ue(im1,4)*buf(im1,2))+ + > xxcon2*(buf(ip1,4)-2.0d0*buf(i,4)+buf(im1,4))+ + > dx4tx1*( ue(ip1,4)-2.0d0* ue(i,4)+ ue(im1,4)) + + forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - tx2*( + > buf(ip1,2)*(c1*ue(ip1,5)-c2*q(ip1))- + > buf(im1,2)*(c1*ue(im1,5)-c2*q(im1)))+ + > 0.5d0*xxcon3*(buf(ip1,1)-2.0d0*buf(i,1)+ + > buf(im1,1))+ + > xxcon4*(cuf(ip1)-2.0d0*cuf(i)+cuf(im1))+ + > xxcon5*(buf(ip1,5)-2.0d0*buf(i,5)+buf(im1,5))+ + > dx5tx1*( ue(ip1,5)-2.0d0* ue(i,5)+ ue(im1,5)) + enddo + +c--------------------------------------------------------------------- +c Fourth-order dissipation +c--------------------------------------------------------------------- + if (start(1,c) .gt. 0) then + do m = 1, 5 + i = 1 + forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * + > (5.0d0*ue(i,m) - 4.0d0*ue(i+1,m) +ue(i+2,m)) + i = 2 + forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * + > (-4.0d0*ue(i-1,m) + 6.0d0*ue(i,m) - + > 4.0d0*ue(i+1,m) + ue(i+2,m)) + enddo + endif + + do i = start(1,c)*3, cell_size(1,c)-3*end(1,c)-1 + do m = 1, 5 + forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp* + > (ue(i-2,m) - 4.0d0*ue(i-1,m) + + > 6.0d0*ue(i,m) - 4.0d0*ue(i+1,m) + ue(i+2,m)) + enddo + enddo + + if (end(1,c) .gt. 0) then + do m = 1, 5 + i = cell_size(1,c)-3 + forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * + > (ue(i-2,m) - 4.0d0*ue(i-1,m) + + > 6.0d0*ue(i,m) - 4.0d0*ue(i+1,m)) + i = cell_size(1,c)-2 + forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * + > (ue(i-2,m) - 4.0d0*ue(i-1,m) + 5.0d0*ue(i,m)) + enddo + endif + + enddo + enddo + +c--------------------------------------------------------------------- +c eta-direction flux differences +c--------------------------------------------------------------------- + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + zeta = dble(k+cell_low(3,c)) * dnzm1 + do i=start(1,c), cell_size(1,c)-end(1,c)-1 + xi = dble(i+cell_low(1,c)) * dnxm1 + + do j=-2*(1-start(2,c)), cell_size(2,c)+1-2*end(2,c) + eta = dble(j+cell_low(2,c)) * dnym1 + + call exact_solution(xi, eta, zeta, dtemp) + do m = 1, 5 + ue(j,m) = dtemp(m) + enddo + + dtpp = 1.0d0/dtemp(1) + + do m = 2, 5 + buf(j,m) = dtpp * dtemp(m) + enddo + + cuf(j) = buf(j,3) * buf(j,3) + buf(j,1) = cuf(j) + buf(j,2) * buf(j,2) + + > buf(j,4) * buf(j,4) + q(j) = 0.5d0*(buf(j,2)*ue(j,2) + buf(j,3)*ue(j,3) + + > buf(j,4)*ue(j,4)) + enddo + + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + jm1 = j-1 + jp1 = j+1 + + forcing(1,i,j,k,c) = forcing(1,i,j,k,c) - + > ty2*( ue(jp1,3)-ue(jm1,3) )+ + > dy1ty1*(ue(jp1,1)-2.0d0*ue(j,1)+ue(jm1,1)) + + forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - ty2*( + > ue(jp1,2)*buf(jp1,3)-ue(jm1,2)*buf(jm1,3))+ + > yycon2*(buf(jp1,2)-2.0d0*buf(j,2)+buf(jm1,2))+ + > dy2ty1*( ue(jp1,2)-2.0* ue(j,2)+ ue(jm1,2)) + + forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - ty2*( + > (ue(jp1,3)*buf(jp1,3)+c2*(ue(jp1,5)-q(jp1)))- + > (ue(jm1,3)*buf(jm1,3)+c2*(ue(jm1,5)-q(jm1))))+ + > yycon1*(buf(jp1,3)-2.0d0*buf(j,3)+buf(jm1,3))+ + > dy3ty1*( ue(jp1,3)-2.0d0*ue(j,3) +ue(jm1,3)) + + forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - ty2*( + > ue(jp1,4)*buf(jp1,3)-ue(jm1,4)*buf(jm1,3))+ + > yycon2*(buf(jp1,4)-2.0d0*buf(j,4)+buf(jm1,4))+ + > dy4ty1*( ue(jp1,4)-2.0d0*ue(j,4)+ ue(jm1,4)) + + forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - ty2*( + > buf(jp1,3)*(c1*ue(jp1,5)-c2*q(jp1))- + > buf(jm1,3)*(c1*ue(jm1,5)-c2*q(jm1)))+ + > 0.5d0*yycon3*(buf(jp1,1)-2.0d0*buf(j,1)+ + > buf(jm1,1))+ + > yycon4*(cuf(jp1)-2.0d0*cuf(j)+cuf(jm1))+ + > yycon5*(buf(jp1,5)-2.0d0*buf(j,5)+buf(jm1,5))+ + > dy5ty1*(ue(jp1,5)-2.0d0*ue(j,5)+ue(jm1,5)) + enddo + +c--------------------------------------------------------------------- +c Fourth-order dissipation +c--------------------------------------------------------------------- + if (start(2,c) .gt. 0) then + do m = 1, 5 + j = 1 + forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * + > (5.0d0*ue(j,m) - 4.0d0*ue(j+1,m) +ue(j+2,m)) + j = 2 + forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * + > (-4.0d0*ue(j-1,m) + 6.0d0*ue(j,m) - + > 4.0d0*ue(j+1,m) + ue(j+2,m)) + enddo + endif + + do j = start(2,c)*3, cell_size(2,c)-3*end(2,c)-1 + do m = 1, 5 + forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp* + > (ue(j-2,m) - 4.0d0*ue(j-1,m) + + > 6.0d0*ue(j,m) - 4.0d0*ue(j+1,m) + ue(j+2,m)) + enddo + enddo + + if (end(2,c) .gt. 0) then + do m = 1, 5 + j = cell_size(2,c)-3 + forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * + > (ue(j-2,m) - 4.0d0*ue(j-1,m) + + > 6.0d0*ue(j,m) - 4.0d0*ue(j+1,m)) + j = cell_size(2,c)-2 + forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * + > (ue(j-2,m) - 4.0d0*ue(j-1,m) + 5.0d0*ue(j,m)) + + enddo + endif + + enddo + enddo + +c--------------------------------------------------------------------- +c zeta-direction flux differences +c--------------------------------------------------------------------- + do j=start(2,c), cell_size(2,c)-end(2,c)-1 + eta = dble(j+cell_low(2,c)) * dnym1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + xi = dble(i+cell_low(1,c)) * dnxm1 + + do k=-2*(1-start(3,c)), cell_size(3,c)+1-2*end(3,c) + zeta = dble(k+cell_low(3,c)) * dnzm1 + + call exact_solution(xi, eta, zeta, dtemp) + do m = 1, 5 + ue(k,m) = dtemp(m) + enddo + + dtpp = 1.0d0/dtemp(1) + + do m = 2, 5 + buf(k,m) = dtpp * dtemp(m) + enddo + + cuf(k) = buf(k,4) * buf(k,4) + buf(k,1) = cuf(k) + buf(k,2) * buf(k,2) + + > buf(k,3) * buf(k,3) + q(k) = 0.5d0*(buf(k,2)*ue(k,2) + buf(k,3)*ue(k,3) + + > buf(k,4)*ue(k,4)) + enddo + + do k=start(3,c), cell_size(3,c)-end(3,c)-1 + km1 = k-1 + kp1 = k+1 + + forcing(1,i,j,k,c) = forcing(1,i,j,k,c) - + > tz2*( ue(kp1,4)-ue(km1,4) )+ + > dz1tz1*(ue(kp1,1)-2.0d0*ue(k,1)+ue(km1,1)) + + forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - tz2 * ( + > ue(kp1,2)*buf(kp1,4)-ue(km1,2)*buf(km1,4))+ + > zzcon2*(buf(kp1,2)-2.0d0*buf(k,2)+buf(km1,2))+ + > dz2tz1*( ue(kp1,2)-2.0d0* ue(k,2)+ ue(km1,2)) + + forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - tz2 * ( + > ue(kp1,3)*buf(kp1,4)-ue(km1,3)*buf(km1,4))+ + > zzcon2*(buf(kp1,3)-2.0d0*buf(k,3)+buf(km1,3))+ + > dz3tz1*(ue(kp1,3)-2.0d0*ue(k,3)+ue(km1,3)) + + forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - tz2 * ( + > (ue(kp1,4)*buf(kp1,4)+c2*(ue(kp1,5)-q(kp1)))- + > (ue(km1,4)*buf(km1,4)+c2*(ue(km1,5)-q(km1))))+ + > zzcon1*(buf(kp1,4)-2.0d0*buf(k,4)+buf(km1,4))+ + > dz4tz1*( ue(kp1,4)-2.0d0*ue(k,4) +ue(km1,4)) + + forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - tz2 * ( + > buf(kp1,4)*(c1*ue(kp1,5)-c2*q(kp1))- + > buf(km1,4)*(c1*ue(km1,5)-c2*q(km1)))+ + > 0.5d0*zzcon3*(buf(kp1,1)-2.0d0*buf(k,1) + > +buf(km1,1))+ + > zzcon4*(cuf(kp1)-2.0d0*cuf(k)+cuf(km1))+ + > zzcon5*(buf(kp1,5)-2.0d0*buf(k,5)+buf(km1,5))+ + > dz5tz1*( ue(kp1,5)-2.0d0*ue(k,5)+ ue(km1,5)) + enddo + +c--------------------------------------------------------------------- +c Fourth-order dissipation +c--------------------------------------------------------------------- + if (start(3,c) .gt. 0) then + do m = 1, 5 + k = 1 + forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * + > (5.0d0*ue(k,m) - 4.0d0*ue(k+1,m) +ue(k+2,m)) + k = 2 + forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * + > (-4.0d0*ue(k-1,m) + 6.0d0*ue(k,m) - + > 4.0d0*ue(k+1,m) + ue(k+2,m)) + enddo + endif + + do k = start(3,c)*3, cell_size(3,c)-3*end(3,c)-1 + do m = 1, 5 + forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp* + > (ue(k-2,m) - 4.0d0*ue(k-1,m) + + > 6.0d0*ue(k,m) - 4.0d0*ue(k+1,m) + ue(k+2,m)) + enddo + enddo + + if (end(3,c) .gt. 0) then + do m = 1, 5 + k = cell_size(3,c)-3 + forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * + > (ue(k-2,m) - 4.0d0*ue(k-1,m) + + > 6.0d0*ue(k,m) - 4.0d0*ue(k+1,m)) + k = cell_size(3,c)-2 + forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * + > (ue(k-2,m) - 4.0d0*ue(k-1,m) + 5.0d0*ue(k,m)) + enddo + endif + + enddo + enddo + +c--------------------------------------------------------------------- +c now change the sign of the forcing function, +c--------------------------------------------------------------------- + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + do m = 1, 5 + forcing(m,i,j,k,c) = -1.d0 * forcing(m,i,j,k,c) + enddo + enddo + enddo + enddo + + enddo + + return + end diff --git a/examples/smpi/NAS/BT/exact_solution.f b/examples/smpi/NAS/BT/exact_solution.f new file mode 100644 index 0000000000..b093b46d16 --- /dev/null +++ b/examples/smpi/NAS/BT/exact_solution.f @@ -0,0 +1,29 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine exact_solution(xi,eta,zeta,dtemp) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c this function returns the exact solution at point xi, eta, zeta +c--------------------------------------------------------------------- + + include 'header.h' + + double precision xi, eta, zeta, dtemp(5) + integer m + + do m = 1, 5 + dtemp(m) = ce(m,1) + + > xi*(ce(m,2) + xi*(ce(m,5) + xi*(ce(m,8) + xi*ce(m,11)))) + + > eta*(ce(m,3) + eta*(ce(m,6) + eta*(ce(m,9) + eta*ce(m,12))))+ + > zeta*(ce(m,4) + zeta*(ce(m,7) + zeta*(ce(m,10) + + > zeta*ce(m,13)))) + enddo + + return + end + + diff --git a/examples/smpi/NAS/BT/fortran_io.f b/examples/smpi/NAS/BT/fortran_io.f new file mode 100644 index 0000000000..d3085a030a --- /dev/null +++ b/examples/smpi/NAS/BT/fortran_io.f @@ -0,0 +1,174 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine setup_btio + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + character*(128) newfilenm + integer m, ierr + + if (node.eq.root) record_length = 40/fortran_rec_sz + call mpi_bcast(record_length, 1, MPI_INTEGER, + > root, comm_setup, ierr) + + open (unit=99, file=filenm, + $ form='unformatted', access='direct', + $ recl=record_length) + + do m = 1, 5 + xce_sub(m) = 0.d0 + end do + + idump_sub = 0 + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine output_timestep + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + include 'header.h' + include 'mpinpb.h' + + integer ix, jio, kio, cio + + do cio=1,ncells + do kio=0, cell_size(3,cio)-1 + do jio=0, cell_size(2,cio)-1 + iseek=(cell_low(1,cio) + + $ PROBLEM_SIZE*((cell_low(2,cio)+jio) + + $ PROBLEM_SIZE*((cell_low(3,cio)+kio) + + $ PROBLEM_SIZE*idump_sub))) + + do ix=0,cell_size(1,cio)-1 + write(99, rec=iseek+ix+1) + $ u(1,ix, jio,kio,cio), + $ u(2,ix, jio,kio,cio), + $ u(3,ix, jio,kio,cio), + $ u(4,ix, jio,kio,cio), + $ u(5,ix, jio,kio,cio) + enddo + enddo + enddo + enddo + + idump_sub = idump_sub + 1 + if (rd_interval .gt. 0) then + if (idump_sub .ge. rd_interval) then + + call acc_sub_norms(idump+1) + + idump_sub = 0 + endif + endif + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine acc_sub_norms(idump_cur) + + include 'header.h' + include 'mpinpb.h' + + integer idump_cur + + integer ix, jio, kio, cio, ii, m, ichunk + double precision xce_single(5) + + ichunk = idump_cur - idump_sub + 1 + do ii=0, idump_sub-1 + do cio=1,ncells + do kio=0, cell_size(3,cio)-1 + do jio=0, cell_size(2,cio)-1 + iseek=(cell_low(1,cio) + + $ PROBLEM_SIZE*((cell_low(2,cio)+jio) + + $ PROBLEM_SIZE*((cell_low(3,cio)+kio) + + $ PROBLEM_SIZE*ii))) + + + do ix=0,cell_size(1,cio)-1 + read(99, rec=iseek+ix+1) + $ u(1,ix, jio,kio,cio), + $ u(2,ix, jio,kio,cio), + $ u(3,ix, jio,kio,cio), + $ u(4,ix, jio,kio,cio), + $ u(5,ix, jio,kio,cio) + enddo + enddo + enddo + enddo + + if (node .eq. root) print *, 'Reading data set ', ii+ichunk + + call error_norm(xce_single) + do m = 1, 5 + xce_sub(m) = xce_sub(m) + xce_single(m) + end do + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine btio_cleanup + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + close(unit=99) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine accumulate_norms(xce_acc) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + include 'header.h' + include 'mpinpb.h' + + double precision xce_acc(5) + integer m + + if (rd_interval .gt. 0) goto 20 + + open (unit=99, file=filenm, + $ form='unformatted', access='direct', + $ recl=record_length) + +c clear the last time step + + call clear_timestep + +c read back the time steps and accumulate norms + + call acc_sub_norms(idump) + + close(unit=99) + + 20 continue + do m = 1, 5 + xce_acc(m) = xce_sub(m) / dble(idump) + end do + + return + end diff --git a/examples/smpi/NAS/BT/full_mpiio.f b/examples/smpi/NAS/BT/full_mpiio.f new file mode 100644 index 0000000000..ecfd41ca73 --- /dev/null +++ b/examples/smpi/NAS/BT/full_mpiio.f @@ -0,0 +1,307 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine setup_btio + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer ierr + integer mstatus(MPI_STATUS_SIZE) + integer sizes(4), starts(4), subsizes(4) + integer cell_btype(maxcells), cell_ftype(maxcells) + integer cell_blength(maxcells) + integer info + character*20 cb_nodes, cb_size + integer c, m + integer cell_disp(maxcells) + + call mpi_bcast(collbuf_nodes, 1, MPI_INTEGER, + > root, comm_setup, ierr) + + call mpi_bcast(collbuf_size, 1, MPI_INTEGER, + > root, comm_setup, ierr) + + if (collbuf_nodes .eq. 0) then + info = MPI_INFO_NULL + else + write (cb_nodes,*) collbuf_nodes + write (cb_size,*) collbuf_size + call MPI_Info_create(info, ierr) + call MPI_Info_set(info, 'cb_nodes', cb_nodes, ierr) + call MPI_Info_set(info, 'cb_buffer_size', cb_size, ierr) + call MPI_Info_set(info, 'collective_buffering', 'true', ierr) + endif + + call MPI_Type_contiguous(5, MPI_DOUBLE_PRECISION, + $ element, ierr) + call MPI_Type_commit(element, ierr) + call MPI_Type_extent(element, eltext, ierr) + + do c = 1, ncells +c +c Outer array dimensions ar same for every cell +c + sizes(1) = IMAX+4 + sizes(2) = JMAX+4 + sizes(3) = KMAX+4 +c +c 4th dimension is cell number, total of maxcells cells +c + sizes(4) = maxcells +c +c Internal dimensions of cells can differ slightly between cells +c + subsizes(1) = cell_size(1, c) + subsizes(2) = cell_size(2, c) + subsizes(3) = cell_size(3, c) +c +c Cell is 4th dimension, 1 cell per cell type to handle varying +c cell sub-array sizes +c + subsizes(4) = 1 + +c +c type constructors use 0-based start addresses +c + starts(1) = 2 + starts(2) = 2 + starts(3) = 2 + starts(4) = c-1 + +c +c Create buftype for a cell +c + call MPI_Type_create_subarray(4, sizes, subsizes, + $ starts, MPI_ORDER_FORTRAN, element, + $ cell_btype(c), ierr) +c +c block length and displacement for joining cells - +c 1 cell buftype per block, cell buftypes have own displacment +c generated from cell number (4th array dimension) +c + cell_blength(c) = 1 + cell_disp(c) = 0 + + enddo +c +c Create combined buftype for all cells +c + call MPI_Type_struct(ncells, cell_blength, cell_disp, + $ cell_btype, combined_btype, ierr) + call MPI_Type_commit(combined_btype, ierr) + + do c = 1, ncells +c +c Entire array size +c + sizes(1) = PROBLEM_SIZE + sizes(2) = PROBLEM_SIZE + sizes(3) = PROBLEM_SIZE + +c +c Size of c'th cell +c + subsizes(1) = cell_size(1, c) + subsizes(2) = cell_size(2, c) + subsizes(3) = cell_size(3, c) + +c +c Starting point in full array of c'th cell +c + starts(1) = cell_low(1,c) + starts(2) = cell_low(2,c) + starts(3) = cell_low(3,c) + + call MPI_Type_create_subarray(3, sizes, subsizes, + $ starts, MPI_ORDER_FORTRAN, + $ element, cell_ftype(c), ierr) + cell_blength(c) = 1 + cell_disp(c) = 0 + enddo + + call MPI_Type_struct(ncells, cell_blength, cell_disp, + $ cell_ftype, combined_ftype, ierr) + call MPI_Type_commit(combined_ftype, ierr) + + iseek=0 + if (node .eq. root) then + call MPI_File_delete(filenm, MPI_INFO_NULL, ierr) + endif + + + call MPI_Barrier(comm_solve, ierr) + + call MPI_File_open(comm_solve, + $ filenm, + $ MPI_MODE_RDWR+MPI_MODE_CREATE, + $ MPI_INFO_NULL, fp, ierr) + + if (ierr .ne. MPI_SUCCESS) then + print *, 'Error opening file' + stop + endif + + call MPI_File_set_view(fp, iseek, element, + $ combined_ftype, 'native', info, ierr) + + if (ierr .ne. MPI_SUCCESS) then + print *, 'Error setting file view' + stop + endif + + do m = 1, 5 + xce_sub(m) = 0.d0 + end do + + idump_sub = 0 + + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine output_timestep + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + include 'header.h' + include 'mpinpb.h' + + integer mstatus(MPI_STATUS_SIZE) + integer ierr + + call MPI_File_write_at_all(fp, iseek, u, + $ 1, combined_btype, mstatus, ierr) + if (ierr .ne. MPI_SUCCESS) then + print *, 'Error writing to file' + stop + endif + + call MPI_Type_size(combined_btype, iosize, ierr) + iseek = iseek + iosize/eltext + + idump_sub = idump_sub + 1 + if (rd_interval .gt. 0) then + if (idump_sub .ge. rd_interval) then + + iseek = 0 + call acc_sub_norms(idump+1) + + iseek = 0 + idump_sub = 0 + endif + endif + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine acc_sub_norms(idump_cur) + + include 'header.h' + include 'mpinpb.h' + + integer idump_cur + + integer ii, m, ichunk + integer ierr + integer mstatus(MPI_STATUS_SIZE) + double precision xce_single(5) + + ichunk = idump_cur - idump_sub + 1 + do ii=0, idump_sub-1 + + call MPI_File_read_at_all(fp, iseek, u, + $ 1, combined_btype, mstatus, ierr) + if (ierr .ne. MPI_SUCCESS) then + print *, 'Error reading back file' + call MPI_File_close(fp, ierr) + stop + endif + + call MPI_Type_size(combined_btype, iosize, ierr) + iseek = iseek + iosize/eltext + + if (node .eq. root) print *, 'Reading data set ', ii+ichunk + + call error_norm(xce_single) + do m = 1, 5 + xce_sub(m) = xce_sub(m) + xce_single(m) + end do + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine btio_cleanup + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + include 'header.h' + include 'mpinpb.h' + + integer ierr + + call MPI_File_close(fp, ierr) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + + subroutine accumulate_norms(xce_acc) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + double precision xce_acc(5) + integer m, ierr + + if (rd_interval .gt. 0) goto 20 + + call MPI_File_open(comm_solve, + $ filenm, + $ MPI_MODE_RDONLY, + $ MPI_INFO_NULL, + $ fp, + $ ierr) + + iseek = 0 + call MPI_File_set_view(fp, iseek, element, combined_ftype, + $ 'native', MPI_INFO_NULL, ierr) + +c clear the last time step + + call clear_timestep + +c read back the time steps and accumulate norms + + call acc_sub_norms(idump) + + call MPI_File_close(fp, ierr) + + 20 continue + do m = 1, 5 + xce_acc(m) = xce_sub(m) / dble(idump) + end do + + return + end + diff --git a/examples/smpi/NAS/BT/header.h b/examples/smpi/NAS/BT/header.h new file mode 100644 index 0000000000..47719da674 --- /dev/null +++ b/examples/smpi/NAS/BT/header.h @@ -0,0 +1,137 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- +c +c header.h +c +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + +c--------------------------------------------------------------------- +c The following include file is generated automatically by the +c "setparams" utility. It defines +c maxcells: the square root of the maximum number of processors +c problem_size: 12, 64, 102, 162 (for class T, A, B, C) +c dt_default: default time step for this problem size if no +c config file +c niter_default: default number of iterations for this problem size +c--------------------------------------------------------------------- + + include 'npbparams.h' + + integer aa, bb, cc, BLOCK_SIZE + parameter (aa=1, bb=2, cc=3, BLOCK_SIZE=5) + + integer ncells, grid_points(3) + double precision elapsed_time + common /global/ elapsed_time, ncells, grid_points + + double precision tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, + > dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, + > dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, + > ce(5,13), dxmax, dymax, dzmax, xxcon1, xxcon2, + > xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1, + > dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4, + > yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1, + > zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, + > dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, + > dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, + > c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt, + > dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, + > c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, + > c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16 + + common /constants/ tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, + > dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, + > dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, + > ce, dxmax, dymax, dzmax, xxcon1, xxcon2, + > xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1, + > dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4, + > yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1, + > zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, + > dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, + > dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, + > c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt, + > dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, + > c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, + > c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16 + + integer EAST, WEST, NORTH, SOUTH, + > BOTTOM, TOP + + parameter (EAST=2000, WEST=3000, NORTH=4000, SOUTH=5000, + > BOTTOM=6000, TOP=7000) + + integer cell_coord (3,maxcells), cell_low (3,maxcells), + > cell_high (3,maxcells), cell_size(3,maxcells), + > predecessor(3), slice (3,maxcells), + > grid_size (3), successor(3) , + > start (3,maxcells), end (3,maxcells) + common /partition/ cell_coord, cell_low, cell_high, cell_size, + > grid_size, successor, predecessor, slice, + > start, end + + integer IMAX, JMAX, KMAX, MAX_CELL_DIM, BUF_SIZE + + parameter (MAX_CELL_DIM = (problem_size/maxcells)+1) + + parameter (IMAX=MAX_CELL_DIM,JMAX=MAX_CELL_DIM,KMAX=MAX_CELL_DIM) + + parameter (BUF_SIZE=MAX_CELL_DIM*MAX_CELL_DIM*(maxcells-1)*60+1) + + double precision + > us ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells), + > vs ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells), + > ws ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells), + > qs ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells), + > rho_i ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells), + > square ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells), + > forcing (5, 0:IMAX-1, 0:JMAX-1, 0:KMAX-1, maxcells), + > u (5, -2:IMAX+1,-2:JMAX+1,-2:KMAX+1, maxcells), + > rhs (5, -1:IMAX-1,-1:JMAX-1,-1:KMAX-1, maxcells), + > lhsc (5,5,-1:IMAX-1,-1:JMAX-1,-1:KMAX-1, maxcells), + > backsub_info (5, 0:MAX_CELL_DIM, 0:MAX_CELL_DIM, maxcells), + > in_buffer(BUF_SIZE), out_buffer(BUF_SIZE) + common /fields/ u, us, vs, ws, qs, rho_i, square, + > rhs, forcing, lhsc, in_buffer, out_buffer, + > backsub_info + + double precision cv(-2:MAX_CELL_DIM+1), rhon(-2:MAX_CELL_DIM+1), + > rhos(-2:MAX_CELL_DIM+1), rhoq(-2:MAX_CELL_DIM+1), + > cuf(-2:MAX_CELL_DIM+1), q(-2:MAX_CELL_DIM+1), + > ue(-2:MAX_CELL_DIM+1,5), buf(-2:MAX_CELL_DIM+1,5) + common /work_1d/ cv, rhon, rhos, rhoq, cuf, q, ue, buf + + integer west_size, east_size, bottom_size, top_size, + > north_size, south_size, start_send_west, + > start_send_east, start_send_south, start_send_north, + > start_send_bottom, start_send_top, start_recv_west, + > start_recv_east, start_recv_south, start_recv_north, + > start_recv_bottom, start_recv_top + common /box/ west_size, east_size, bottom_size, + > top_size, north_size, south_size, + > start_send_west, start_send_east, start_send_south, + > start_send_north, start_send_bottom, start_send_top, + > start_recv_west, start_recv_east, start_recv_south, + > start_recv_north, start_recv_bottom, start_recv_top + + double precision tmp_block(5,5), b_inverse(5,5), tmp_vec(5) + common /work_solve/ tmp_block, b_inverse, tmp_vec + +c +c These are used by btio +c + integer collbuf_nodes, collbuf_size, iosize, eltext, + $ combined_btype, fp, idump, record_length, element, + $ combined_ftype, idump_sub, rd_interval + common /btio/ collbuf_nodes, collbuf_size, iosize, eltext, + $ combined_btype, fp, idump, record_length, + $ idump_sub, rd_interval + double precision sum(niter_default), xce_sub(5) + common /btio/ sum, xce_sub + integer*8 iseek + common /btio/ iseek, element, combined_ftype + + + diff --git a/examples/smpi/NAS/BT/initialize.f b/examples/smpi/NAS/BT/initialize.f new file mode 100644 index 0000000000..274cdb1899 --- /dev/null +++ b/examples/smpi/NAS/BT/initialize.f @@ -0,0 +1,308 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine initialize + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c This subroutine initializes the field variable u using +c tri-linear transfinite interpolation of the boundary values +c--------------------------------------------------------------------- + + include 'header.h' + + integer c, i, j, k, m, ii, jj, kk, ix, iy, iz + double precision xi, eta, zeta, Pface(5,3,2), Pxi, Peta, + > Pzeta, temp(5) + +c--------------------------------------------------------------------- +c Later (in compute_rhs) we compute 1/u for every element. A few of +c the corner elements are not used, but it convenient (and faster) +c to compute the whole thing with a simple loop. Make sure those +c values are nonzero by initializing the whole thing here. +c--------------------------------------------------------------------- + do c = 1, ncells + do kk = -1, KMAX + do jj = -1, JMAX + do ii = -1, IMAX + do m = 1, 5 + u(m, ii, jj, kk, c) = 1.0 + end do + end do + end do + end do + end do +c--------------------------------------------------------------------- + + + +c--------------------------------------------------------------------- +c first store the "interpolated" values everywhere on the grid +c--------------------------------------------------------------------- + do c=1, ncells + kk = 0 + do k = cell_low(3,c), cell_high(3,c) + zeta = dble(k) * dnzm1 + jj = 0 + do j = cell_low(2,c), cell_high(2,c) + eta = dble(j) * dnym1 + ii = 0 + do i = cell_low(1,c), cell_high(1,c) + xi = dble(i) * dnxm1 + + do ix = 1, 2 + call exact_solution(dble(ix-1), eta, zeta, + > Pface(1,1,ix)) + enddo + + do iy = 1, 2 + call exact_solution(xi, dble(iy-1) , zeta, + > Pface(1,2,iy)) + enddo + + do iz = 1, 2 + call exact_solution(xi, eta, dble(iz-1), + > Pface(1,3,iz)) + enddo + + do m = 1, 5 + Pxi = xi * Pface(m,1,2) + + > (1.0d0-xi) * Pface(m,1,1) + Peta = eta * Pface(m,2,2) + + > (1.0d0-eta) * Pface(m,2,1) + Pzeta = zeta * Pface(m,3,2) + + > (1.0d0-zeta) * Pface(m,3,1) + + u(m,ii,jj,kk,c) = Pxi + Peta + Pzeta - + > Pxi*Peta - Pxi*Pzeta - Peta*Pzeta + + > Pxi*Peta*Pzeta + + enddo + ii = ii + 1 + enddo + jj = jj + 1 + enddo + kk = kk+1 + enddo + enddo + +c--------------------------------------------------------------------- +c now store the exact values on the boundaries +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c west face +c--------------------------------------------------------------------- + c = slice(1,1) + ii = 0 + xi = 0.0d0 + kk = 0 + do k = cell_low(3,c), cell_high(3,c) + zeta = dble(k) * dnzm1 + jj = 0 + do j = cell_low(2,c), cell_high(2,c) + eta = dble(j) * dnym1 + call exact_solution(xi, eta, zeta, temp) + do m = 1, 5 + u(m,ii,jj,kk,c) = temp(m) + enddo + jj = jj + 1 + enddo + kk = kk + 1 + enddo + +c--------------------------------------------------------------------- +c east face +c--------------------------------------------------------------------- + c = slice(1,ncells) + ii = cell_size(1,c)-1 + xi = 1.0d0 + kk = 0 + do k = cell_low(3,c), cell_high(3,c) + zeta = dble(k) * dnzm1 + jj = 0 + do j = cell_low(2,c), cell_high(2,c) + eta = dble(j) * dnym1 + call exact_solution(xi, eta, zeta, temp) + do m = 1, 5 + u(m,ii,jj,kk,c) = temp(m) + enddo + jj = jj + 1 + enddo + kk = kk + 1 + enddo + +c--------------------------------------------------------------------- +c south face +c--------------------------------------------------------------------- + c = slice(2,1) + jj = 0 + eta = 0.0d0 + kk = 0 + do k = cell_low(3,c), cell_high(3,c) + zeta = dble(k) * dnzm1 + ii = 0 + do i = cell_low(1,c), cell_high(1,c) + xi = dble(i) * dnxm1 + call exact_solution(xi, eta, zeta, temp) + do m = 1, 5 + u(m,ii,jj,kk,c) = temp(m) + enddo + ii = ii + 1 + enddo + kk = kk + 1 + enddo + + +c--------------------------------------------------------------------- +c north face +c--------------------------------------------------------------------- + c = slice(2,ncells) + jj = cell_size(2,c)-1 + eta = 1.0d0 + kk = 0 + do k = cell_low(3,c), cell_high(3,c) + zeta = dble(k) * dnzm1 + ii = 0 + do i = cell_low(1,c), cell_high(1,c) + xi = dble(i) * dnxm1 + call exact_solution(xi, eta, zeta, temp) + do m = 1, 5 + u(m,ii,jj,kk,c) = temp(m) + enddo + ii = ii + 1 + enddo + kk = kk + 1 + enddo + +c--------------------------------------------------------------------- +c bottom face +c--------------------------------------------------------------------- + c = slice(3,1) + kk = 0 + zeta = 0.0d0 + jj = 0 + do j = cell_low(2,c), cell_high(2,c) + eta = dble(j) * dnym1 + ii = 0 + do i =cell_low(1,c), cell_high(1,c) + xi = dble(i) *dnxm1 + call exact_solution(xi, eta, zeta, temp) + do m = 1, 5 + u(m,ii,jj,kk,c) = temp(m) + enddo + ii = ii + 1 + enddo + jj = jj + 1 + enddo + +c--------------------------------------------------------------------- +c top face +c--------------------------------------------------------------------- + c = slice(3,ncells) + kk = cell_size(3,c)-1 + zeta = 1.0d0 + jj = 0 + do j = cell_low(2,c), cell_high(2,c) + eta = dble(j) * dnym1 + ii = 0 + do i =cell_low(1,c), cell_high(1,c) + xi = dble(i) * dnxm1 + call exact_solution(xi, eta, zeta, temp) + do m = 1, 5 + u(m,ii,jj,kk,c) = temp(m) + enddo + ii = ii + 1 + enddo + jj = jj + 1 + enddo + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine lhsinit + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'header.h' + + integer i, j, k, d, c, m, n + +c--------------------------------------------------------------------- +c loop over all cells +c--------------------------------------------------------------------- + do c = 1, ncells + +c--------------------------------------------------------------------- +c first, initialize the start and end arrays +c--------------------------------------------------------------------- + do d = 1, 3 + if (cell_coord(d,c) .eq. 1) then + start(d,c) = 1 + else + start(d,c) = 0 + endif + if (cell_coord(d,c) .eq. ncells) then + end(d,c) = 1 + else + end(d,c) = 0 + endif + enddo + +c--------------------------------------------------------------------- +c zero the whole left hand side for starters +c--------------------------------------------------------------------- + do k = 0, cell_size(3,c)-1 + do j = 0, cell_size(2,c)-1 + do i = 0, cell_size(1,c)-1 + do m = 1,5 + do n = 1, 5 + lhsc(m,n,i,j,k,c) = 0.0d0 + enddo + enddo + enddo + enddo + enddo + + enddo + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine lhsabinit(lhsa, lhsb, size) + implicit none + + integer size + double precision lhsa(5, 5, -1:size), lhsb(5, 5, -1:size) + + integer i, m, n + +c--------------------------------------------------------------------- +c next, set all diagonal values to 1. This is overkill, but convenient +c--------------------------------------------------------------------- + do i = 0, size + do m = 1, 5 + do n = 1, 5 + lhsa(m,n,i) = 0.0d0 + lhsb(m,n,i) = 0.0d0 + enddo + lhsb(m,m,i) = 1.0d0 + enddo + enddo + + return + end + + + diff --git a/examples/smpi/NAS/BT/inputbt.data.sample b/examples/smpi/NAS/BT/inputbt.data.sample new file mode 100644 index 0000000000..776654e8d0 --- /dev/null +++ b/examples/smpi/NAS/BT/inputbt.data.sample @@ -0,0 +1,5 @@ +200 number of time steps +0.0008d0 dt for class A = 0.0008d0. class B = 0.0003d0 class C = 0.0001d0 +64 64 64 +5 0 write interval (optional read interval) for BTIO +0 1000000 number of nodes in collective buffering and buffer size for BTIO diff --git a/examples/smpi/NAS/BT/make_set.f b/examples/smpi/NAS/BT/make_set.f new file mode 100644 index 0000000000..b8d90c65a4 --- /dev/null +++ b/examples/smpi/NAS/BT/make_set.f @@ -0,0 +1,124 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine make_set + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c This function allocates space for a set of cells and fills the set +c such that communication between cells on different nodes is only +c nearest neighbor +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + + integer p, i, j, c, dir, size, excess, ierr,ierrcode + +c--------------------------------------------------------------------- +c compute square root; add small number to allow for roundoff +c (note: this is computed in setup_mpi.f also, but prefer to do +c it twice because of some include file problems). +c--------------------------------------------------------------------- + ncells = dint(dsqrt(dble(no_nodes) + 0.00001d0)) + +c--------------------------------------------------------------------- +c this makes coding easier +c--------------------------------------------------------------------- + p = ncells + +c--------------------------------------------------------------------- +c determine the location of the cell at the bottom of the 3D +c array of cells +c--------------------------------------------------------------------- + cell_coord(1,1) = mod(node,p) + cell_coord(2,1) = node/p + cell_coord(3,1) = 0 + +c--------------------------------------------------------------------- +c set the cell_coords for cells in the rest of the z-layers; +c this comes down to a simple linear numbering in the z-direct- +c ion, and to the doubly-cyclic numbering in the other dirs +c--------------------------------------------------------------------- + do c=2, p + cell_coord(1,c) = mod(cell_coord(1,c-1)+1,p) + cell_coord(2,c) = mod(cell_coord(2,c-1)-1+p,p) + cell_coord(3,c) = c-1 + end do + +c--------------------------------------------------------------------- +c offset all the coordinates by 1 to adjust for Fortran arrays +c--------------------------------------------------------------------- + do dir = 1, 3 + do c = 1, p + cell_coord(dir,c) = cell_coord(dir,c) + 1 + end do + end do + +c--------------------------------------------------------------------- +c slice(dir,n) contains the sequence number of the cell that is in +c coordinate plane n in the dir direction +c--------------------------------------------------------------------- + do dir = 1, 3 + do c = 1, p + slice(dir,cell_coord(dir,c)) = c + end do + end do + + +c--------------------------------------------------------------------- +c fill the predecessor and successor entries, using the indices +c of the bottom cells (they are the same at each level of k +c anyway) acting as if full periodicity pertains; note that p is +c added to those arguments to the mod functions that might +c otherwise return wrong values when using the modulo function +c--------------------------------------------------------------------- + i = cell_coord(1,1)-1 + j = cell_coord(2,1)-1 + + predecessor(1) = mod(i-1+p,p) + p*j + predecessor(2) = i + p*mod(j-1+p,p) + predecessor(3) = mod(i+1,p) + p*mod(j-1+p,p) + successor(1) = mod(i+1,p) + p*j + successor(2) = i + p*mod(j+1,p) + successor(3) = mod(i-1+p,p) + p*mod(j+1,p) + +c--------------------------------------------------------------------- +c now compute the sizes of the cells +c--------------------------------------------------------------------- + do dir= 1, 3 +c--------------------------------------------------------------------- +c set cell_coord range for each direction +c--------------------------------------------------------------------- + size = grid_points(dir)/p + excess = mod(grid_points(dir),p) + do c=1, ncells + if (cell_coord(dir,c) .le. excess) then + cell_size(dir,c) = size+1 + cell_low(dir,c) = (cell_coord(dir,c)-1)*(size+1) + cell_high(dir,c) = cell_low(dir,c)+size + else + cell_size(dir,c) = size + cell_low(dir,c) = excess*(size+1)+ + > (cell_coord(dir,c)-excess-1)*size + cell_high(dir,c) = cell_low(dir,c)+size-1 + endif + if (cell_size(dir, c) .le. 2) then + write(*,50) + 50 format(' Error: Cell size too small. Min size is 3') + call MPI_Abort(mpi_comm_world,ierrcode,ierr) + stop + endif + end do + end do + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + diff --git a/examples/smpi/NAS/BT/mpinpb.h b/examples/smpi/NAS/BT/mpinpb.h new file mode 100644 index 0000000000..f621f08b64 --- /dev/null +++ b/examples/smpi/NAS/BT/mpinpb.h @@ -0,0 +1,12 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'mpif.h' + + integer node, no_nodes, total_nodes, root, comm_setup, + > comm_solve, comm_rhs, dp_type + logical active + common /mpistuff/ node, no_nodes, total_nodes, root, comm_setup, + > comm_solve, comm_rhs, dp_type, active + diff --git a/examples/smpi/NAS/BT/rhs.f b/examples/smpi/NAS/BT/rhs.f new file mode 100644 index 0000000000..89171a6741 --- /dev/null +++ b/examples/smpi/NAS/BT/rhs.f @@ -0,0 +1,425 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine compute_rhs + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'header.h' + + integer c, i, j, k, m + double precision rho_inv, uijk, up1, um1, vijk, vp1, vm1, + > wijk, wp1, wm1 + + +c--------------------------------------------------------------------- +c loop over all cells owned by this node +c--------------------------------------------------------------------- + do c = 1, ncells + +c--------------------------------------------------------------------- +c compute the reciprocal of density, and the kinetic energy, +c and the speed of sound. +c--------------------------------------------------------------------- + do k = -1, cell_size(3,c) + do j = -1, cell_size(2,c) + do i = -1, cell_size(1,c) + rho_inv = 1.0d0/u(1,i,j,k,c) + rho_i(i,j,k,c) = rho_inv + us(i,j,k,c) = u(2,i,j,k,c) * rho_inv + vs(i,j,k,c) = u(3,i,j,k,c) * rho_inv + ws(i,j,k,c) = u(4,i,j,k,c) * rho_inv + square(i,j,k,c) = 0.5d0* ( + > u(2,i,j,k,c)*u(2,i,j,k,c) + + > u(3,i,j,k,c)*u(3,i,j,k,c) + + > u(4,i,j,k,c)*u(4,i,j,k,c) ) * rho_inv + qs(i,j,k,c) = square(i,j,k,c) * rho_inv + enddo + enddo + enddo + +c--------------------------------------------------------------------- +c copy the exact forcing term to the right hand side; because +c this forcing term is known, we can store it on the whole of every +c cell, including the boundary +c--------------------------------------------------------------------- + + do k = 0, cell_size(3,c)-1 + do j = 0, cell_size(2,c)-1 + do i = 0, cell_size(1,c)-1 + do m = 1, 5 + rhs(m,i,j,k,c) = forcing(m,i,j,k,c) + enddo + enddo + enddo + enddo + + +c--------------------------------------------------------------------- +c compute xi-direction fluxes +c--------------------------------------------------------------------- + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + uijk = us(i,j,k,c) + up1 = us(i+1,j,k,c) + um1 = us(i-1,j,k,c) + + rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dx1tx1 * + > (u(1,i+1,j,k,c) - 2.0d0*u(1,i,j,k,c) + + > u(1,i-1,j,k,c)) - + > tx2 * (u(2,i+1,j,k,c) - u(2,i-1,j,k,c)) + + rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dx2tx1 * + > (u(2,i+1,j,k,c) - 2.0d0*u(2,i,j,k,c) + + > u(2,i-1,j,k,c)) + + > xxcon2*con43 * (up1 - 2.0d0*uijk + um1) - + > tx2 * (u(2,i+1,j,k,c)*up1 - + > u(2,i-1,j,k,c)*um1 + + > (u(5,i+1,j,k,c)- square(i+1,j,k,c)- + > u(5,i-1,j,k,c)+ square(i-1,j,k,c))* + > c2) + + rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dx3tx1 * + > (u(3,i+1,j,k,c) - 2.0d0*u(3,i,j,k,c) + + > u(3,i-1,j,k,c)) + + > xxcon2 * (vs(i+1,j,k,c) - 2.0d0*vs(i,j,k,c) + + > vs(i-1,j,k,c)) - + > tx2 * (u(3,i+1,j,k,c)*up1 - + > u(3,i-1,j,k,c)*um1) + + rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dx4tx1 * + > (u(4,i+1,j,k,c) - 2.0d0*u(4,i,j,k,c) + + > u(4,i-1,j,k,c)) + + > xxcon2 * (ws(i+1,j,k,c) - 2.0d0*ws(i,j,k,c) + + > ws(i-1,j,k,c)) - + > tx2 * (u(4,i+1,j,k,c)*up1 - + > u(4,i-1,j,k,c)*um1) + + rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dx5tx1 * + > (u(5,i+1,j,k,c) - 2.0d0*u(5,i,j,k,c) + + > u(5,i-1,j,k,c)) + + > xxcon3 * (qs(i+1,j,k,c) - 2.0d0*qs(i,j,k,c) + + > qs(i-1,j,k,c)) + + > xxcon4 * (up1*up1 - 2.0d0*uijk*uijk + + > um1*um1) + + > xxcon5 * (u(5,i+1,j,k,c)*rho_i(i+1,j,k,c) - + > 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) + + > u(5,i-1,j,k,c)*rho_i(i-1,j,k,c)) - + > tx2 * ( (c1*u(5,i+1,j,k,c) - + > c2*square(i+1,j,k,c))*up1 - + > (c1*u(5,i-1,j,k,c) - + > c2*square(i-1,j,k,c))*um1 ) + enddo + enddo + enddo + +c--------------------------------------------------------------------- +c add fourth order xi-direction dissipation +c--------------------------------------------------------------------- + if (start(1,c) .gt. 0) then + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + i = 1 + do m = 1, 5 + rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp * + > ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) + + > u(m,i+2,j,k,c)) + enddo + + i = 2 + do m = 1, 5 + rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * + > (-4.0d0*u(m,i-1,j,k,c) + 6.0d0*u(m,i,j,k,c) - + > 4.0d0*u(m,i+1,j,k,c) + u(m,i+2,j,k,c)) + enddo + enddo + enddo + endif + + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = 3*start(1,c),cell_size(1,c)-3*end(1,c)-1 + do m = 1, 5 + rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * + > ( u(m,i-2,j,k,c) - 4.0d0*u(m,i-1,j,k,c) + + > 6.0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) + + > u(m,i+2,j,k,c) ) + enddo + enddo + enddo + enddo + + + if (end(1,c) .gt. 0) then + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + i = cell_size(1,c)-3 + do m = 1, 5 + rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * + > ( u(m,i-2,j,k,c) - 4.0d0*u(m,i-1,j,k,c) + + > 6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) ) + enddo + + i = cell_size(1,c)-2 + do m = 1, 5 + rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * + > ( u(m,i-2,j,k,c) - 4.d0*u(m,i-1,j,k,c) + + > 5.d0*u(m,i,j,k,c) ) + enddo + enddo + enddo + endif + +c--------------------------------------------------------------------- +c compute eta-direction fluxes +c--------------------------------------------------------------------- + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + vijk = vs(i,j,k,c) + vp1 = vs(i,j+1,k,c) + vm1 = vs(i,j-1,k,c) + rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dy1ty1 * + > (u(1,i,j+1,k,c) - 2.0d0*u(1,i,j,k,c) + + > u(1,i,j-1,k,c)) - + > ty2 * (u(3,i,j+1,k,c) - u(3,i,j-1,k,c)) + rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dy2ty1 * + > (u(2,i,j+1,k,c) - 2.0d0*u(2,i,j,k,c) + + > u(2,i,j-1,k,c)) + + > yycon2 * (us(i,j+1,k,c) - 2.0d0*us(i,j,k,c) + + > us(i,j-1,k,c)) - + > ty2 * (u(2,i,j+1,k,c)*vp1 - + > u(2,i,j-1,k,c)*vm1) + rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dy3ty1 * + > (u(3,i,j+1,k,c) - 2.0d0*u(3,i,j,k,c) + + > u(3,i,j-1,k,c)) + + > yycon2*con43 * (vp1 - 2.0d0*vijk + vm1) - + > ty2 * (u(3,i,j+1,k,c)*vp1 - + > u(3,i,j-1,k,c)*vm1 + + > (u(5,i,j+1,k,c) - square(i,j+1,k,c) - + > u(5,i,j-1,k,c) + square(i,j-1,k,c)) + > *c2) + rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dy4ty1 * + > (u(4,i,j+1,k,c) - 2.0d0*u(4,i,j,k,c) + + > u(4,i,j-1,k,c)) + + > yycon2 * (ws(i,j+1,k,c) - 2.0d0*ws(i,j,k,c) + + > ws(i,j-1,k,c)) - + > ty2 * (u(4,i,j+1,k,c)*vp1 - + > u(4,i,j-1,k,c)*vm1) + rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dy5ty1 * + > (u(5,i,j+1,k,c) - 2.0d0*u(5,i,j,k,c) + + > u(5,i,j-1,k,c)) + + > yycon3 * (qs(i,j+1,k,c) - 2.0d0*qs(i,j,k,c) + + > qs(i,j-1,k,c)) + + > yycon4 * (vp1*vp1 - 2.0d0*vijk*vijk + + > vm1*vm1) + + > yycon5 * (u(5,i,j+1,k,c)*rho_i(i,j+1,k,c) - + > 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) + + > u(5,i,j-1,k,c)*rho_i(i,j-1,k,c)) - + > ty2 * ((c1*u(5,i,j+1,k,c) - + > c2*square(i,j+1,k,c)) * vp1 - + > (c1*u(5,i,j-1,k,c) - + > c2*square(i,j-1,k,c)) * vm1) + enddo + enddo + enddo + +c--------------------------------------------------------------------- +c add fourth order eta-direction dissipation +c--------------------------------------------------------------------- + if (start(2,c) .gt. 0) then + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + j = 1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + do m = 1, 5 + rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp * + > ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) + + > u(m,i,j+2,k,c)) + enddo + enddo + + j = 2 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + do m = 1, 5 + rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * + > (-4.0d0*u(m,i,j-1,k,c) + 6.0d0*u(m,i,j,k,c) - + > 4.0d0*u(m,i,j+1,k,c) + u(m,i,j+2,k,c)) + enddo + enddo + enddo + endif + + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = 3*start(2,c), cell_size(2,c)-3*end(2,c)-1 + do i = start(1,c),cell_size(1,c)-end(1,c)-1 + do m = 1, 5 + rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * + > ( u(m,i,j-2,k,c) - 4.0d0*u(m,i,j-1,k,c) + + > 6.0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) + + > u(m,i,j+2,k,c) ) + enddo + enddo + enddo + enddo + + if (end(2,c) .gt. 0) then + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + j = cell_size(2,c)-3 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + do m = 1, 5 + rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * + > ( u(m,i,j-2,k,c) - 4.0d0*u(m,i,j-1,k,c) + + > 6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) ) + enddo + enddo + + j = cell_size(2,c)-2 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + do m = 1, 5 + rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * + > ( u(m,i,j-2,k,c) - 4.d0*u(m,i,j-1,k,c) + + > 5.d0*u(m,i,j,k,c) ) + enddo + enddo + enddo + endif + +c--------------------------------------------------------------------- +c compute zeta-direction fluxes +c--------------------------------------------------------------------- + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + wijk = ws(i,j,k,c) + wp1 = ws(i,j,k+1,c) + wm1 = ws(i,j,k-1,c) + + rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dz1tz1 * + > (u(1,i,j,k+1,c) - 2.0d0*u(1,i,j,k,c) + + > u(1,i,j,k-1,c)) - + > tz2 * (u(4,i,j,k+1,c) - u(4,i,j,k-1,c)) + rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dz2tz1 * + > (u(2,i,j,k+1,c) - 2.0d0*u(2,i,j,k,c) + + > u(2,i,j,k-1,c)) + + > zzcon2 * (us(i,j,k+1,c) - 2.0d0*us(i,j,k,c) + + > us(i,j,k-1,c)) - + > tz2 * (u(2,i,j,k+1,c)*wp1 - + > u(2,i,j,k-1,c)*wm1) + rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dz3tz1 * + > (u(3,i,j,k+1,c) - 2.0d0*u(3,i,j,k,c) + + > u(3,i,j,k-1,c)) + + > zzcon2 * (vs(i,j,k+1,c) - 2.0d0*vs(i,j,k,c) + + > vs(i,j,k-1,c)) - + > tz2 * (u(3,i,j,k+1,c)*wp1 - + > u(3,i,j,k-1,c)*wm1) + rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dz4tz1 * + > (u(4,i,j,k+1,c) - 2.0d0*u(4,i,j,k,c) + + > u(4,i,j,k-1,c)) + + > zzcon2*con43 * (wp1 - 2.0d0*wijk + wm1) - + > tz2 * (u(4,i,j,k+1,c)*wp1 - + > u(4,i,j,k-1,c)*wm1 + + > (u(5,i,j,k+1,c) - square(i,j,k+1,c) - + > u(5,i,j,k-1,c) + square(i,j,k-1,c)) + > *c2) + rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dz5tz1 * + > (u(5,i,j,k+1,c) - 2.0d0*u(5,i,j,k,c) + + > u(5,i,j,k-1,c)) + + > zzcon3 * (qs(i,j,k+1,c) - 2.0d0*qs(i,j,k,c) + + > qs(i,j,k-1,c)) + + > zzcon4 * (wp1*wp1 - 2.0d0*wijk*wijk + + > wm1*wm1) + + > zzcon5 * (u(5,i,j,k+1,c)*rho_i(i,j,k+1,c) - + > 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) + + > u(5,i,j,k-1,c)*rho_i(i,j,k-1,c)) - + > tz2 * ( (c1*u(5,i,j,k+1,c) - + > c2*square(i,j,k+1,c))*wp1 - + > (c1*u(5,i,j,k-1,c) - + > c2*square(i,j,k-1,c))*wm1) + enddo + enddo + enddo + +c--------------------------------------------------------------------- +c add fourth order zeta-direction dissipation +c--------------------------------------------------------------------- + if (start(3,c) .gt. 0) then + k = 1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + do m = 1, 5 + rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp * + > ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) + + > u(m,i,j,k+2,c)) + enddo + enddo + enddo + + k = 2 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + do m = 1, 5 + rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * + > (-4.0d0*u(m,i,j,k-1,c) + 6.0d0*u(m,i,j,k,c) - + > 4.0d0*u(m,i,j,k+1,c) + u(m,i,j,k+2,c)) + enddo + enddo + enddo + endif + + do k = 3*start(3,c), cell_size(3,c)-3*end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c),cell_size(1,c)-end(1,c)-1 + do m = 1, 5 + rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * + > ( u(m,i,j,k-2,c) - 4.0d0*u(m,i,j,k-1,c) + + > 6.0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) + + > u(m,i,j,k+2,c) ) + enddo + enddo + enddo + enddo + + if (end(3,c) .gt. 0) then + k = cell_size(3,c)-3 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + do m = 1, 5 + rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * + > ( u(m,i,j,k-2,c) - 4.0d0*u(m,i,j,k-1,c) + + > 6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) ) + enddo + enddo + enddo + + k = cell_size(3,c)-2 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + do m = 1, 5 + rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * + > ( u(m,i,j,k-2,c) - 4.d0*u(m,i,j,k-1,c) + + > 5.d0*u(m,i,j,k,c) ) + enddo + enddo + enddo + endif + + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + do m = 1, 5 + rhs(m,i,j,k,c) = rhs(m,i,j,k,c) * dt + enddo + enddo + enddo + enddo + + enddo + + return + end + + + + diff --git a/examples/smpi/NAS/BT/set_constants.f b/examples/smpi/NAS/BT/set_constants.f new file mode 100644 index 0000000000..81397d4bcf --- /dev/null +++ b/examples/smpi/NAS/BT/set_constants.f @@ -0,0 +1,202 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine set_constants + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'header.h' + + ce(1,1) = 2.0d0 + ce(1,2) = 0.0d0 + ce(1,3) = 0.0d0 + ce(1,4) = 4.0d0 + ce(1,5) = 5.0d0 + ce(1,6) = 3.0d0 + ce(1,7) = 0.5d0 + ce(1,8) = 0.02d0 + ce(1,9) = 0.01d0 + ce(1,10) = 0.03d0 + ce(1,11) = 0.5d0 + ce(1,12) = 0.4d0 + ce(1,13) = 0.3d0 + + ce(2,1) = 1.0d0 + ce(2,2) = 0.0d0 + ce(2,3) = 0.0d0 + ce(2,4) = 0.0d0 + ce(2,5) = 1.0d0 + ce(2,6) = 2.0d0 + ce(2,7) = 3.0d0 + ce(2,8) = 0.01d0 + ce(2,9) = 0.03d0 + ce(2,10) = 0.02d0 + ce(2,11) = 0.4d0 + ce(2,12) = 0.3d0 + ce(2,13) = 0.5d0 + + ce(3,1) = 2.0d0 + ce(3,2) = 2.0d0 + ce(3,3) = 0.0d0 + ce(3,4) = 0.0d0 + ce(3,5) = 0.0d0 + ce(3,6) = 2.0d0 + ce(3,7) = 3.0d0 + ce(3,8) = 0.04d0 + ce(3,9) = 0.03d0 + ce(3,10) = 0.05d0 + ce(3,11) = 0.3d0 + ce(3,12) = 0.5d0 + ce(3,13) = 0.4d0 + + ce(4,1) = 2.0d0 + ce(4,2) = 2.0d0 + ce(4,3) = 0.0d0 + ce(4,4) = 0.0d0 + ce(4,5) = 0.0d0 + ce(4,6) = 2.0d0 + ce(4,7) = 3.0d0 + ce(4,8) = 0.03d0 + ce(4,9) = 0.05d0 + ce(4,10) = 0.04d0 + ce(4,11) = 0.2d0 + ce(4,12) = 0.1d0 + ce(4,13) = 0.3d0 + + ce(5,1) = 5.0d0 + ce(5,2) = 4.0d0 + ce(5,3) = 3.0d0 + ce(5,4) = 2.0d0 + ce(5,5) = 0.1d0 + ce(5,6) = 0.4d0 + ce(5,7) = 0.3d0 + ce(5,8) = 0.05d0 + ce(5,9) = 0.04d0 + ce(5,10) = 0.03d0 + ce(5,11) = 0.1d0 + ce(5,12) = 0.3d0 + ce(5,13) = 0.2d0 + + c1 = 1.4d0 + c2 = 0.4d0 + c3 = 0.1d0 + c4 = 1.0d0 + c5 = 1.4d0 + + bt = dsqrt(0.5d0) + + dnxm1 = 1.0d0 / dble(grid_points(1)-1) + dnym1 = 1.0d0 / dble(grid_points(2)-1) + dnzm1 = 1.0d0 / dble(grid_points(3)-1) + + c1c2 = c1 * c2 + c1c5 = c1 * c5 + c3c4 = c3 * c4 + c1345 = c1c5 * c3c4 + + conz1 = (1.0d0-c1c5) + + tx1 = 1.0d0 / (dnxm1 * dnxm1) + tx2 = 1.0d0 / (2.0d0 * dnxm1) + tx3 = 1.0d0 / dnxm1 + + ty1 = 1.0d0 / (dnym1 * dnym1) + ty2 = 1.0d0 / (2.0d0 * dnym1) + ty3 = 1.0d0 / dnym1 + + tz1 = 1.0d0 / (dnzm1 * dnzm1) + tz2 = 1.0d0 / (2.0d0 * dnzm1) + tz3 = 1.0d0 / dnzm1 + + dx1 = 0.75d0 + dx2 = 0.75d0 + dx3 = 0.75d0 + dx4 = 0.75d0 + dx5 = 0.75d0 + + dy1 = 0.75d0 + dy2 = 0.75d0 + dy3 = 0.75d0 + dy4 = 0.75d0 + dy5 = 0.75d0 + + dz1 = 1.0d0 + dz2 = 1.0d0 + dz3 = 1.0d0 + dz4 = 1.0d0 + dz5 = 1.0d0 + + dxmax = dmax1(dx3, dx4) + dymax = dmax1(dy2, dy4) + dzmax = dmax1(dz2, dz3) + + dssp = 0.25d0 * dmax1(dx1, dmax1(dy1, dz1) ) + + c4dssp = 4.0d0 * dssp + c5dssp = 5.0d0 * dssp + + dttx1 = dt*tx1 + dttx2 = dt*tx2 + dtty1 = dt*ty1 + dtty2 = dt*ty2 + dttz1 = dt*tz1 + dttz2 = dt*tz2 + + c2dttx1 = 2.0d0*dttx1 + c2dtty1 = 2.0d0*dtty1 + c2dttz1 = 2.0d0*dttz1 + + dtdssp = dt*dssp + + comz1 = dtdssp + comz4 = 4.0d0*dtdssp + comz5 = 5.0d0*dtdssp + comz6 = 6.0d0*dtdssp + + c3c4tx3 = c3c4*tx3 + c3c4ty3 = c3c4*ty3 + c3c4tz3 = c3c4*tz3 + + dx1tx1 = dx1*tx1 + dx2tx1 = dx2*tx1 + dx3tx1 = dx3*tx1 + dx4tx1 = dx4*tx1 + dx5tx1 = dx5*tx1 + + dy1ty1 = dy1*ty1 + dy2ty1 = dy2*ty1 + dy3ty1 = dy3*ty1 + dy4ty1 = dy4*ty1 + dy5ty1 = dy5*ty1 + + dz1tz1 = dz1*tz1 + dz2tz1 = dz2*tz1 + dz3tz1 = dz3*tz1 + dz4tz1 = dz4*tz1 + dz5tz1 = dz5*tz1 + + c2iv = 2.5d0 + con43 = 4.0d0/3.0d0 + con16 = 1.0d0/6.0d0 + + xxcon1 = c3c4tx3*con43*tx3 + xxcon2 = c3c4tx3*tx3 + xxcon3 = c3c4tx3*conz1*tx3 + xxcon4 = c3c4tx3*con16*tx3 + xxcon5 = c3c4tx3*c1c5*tx3 + + yycon1 = c3c4ty3*con43*ty3 + yycon2 = c3c4ty3*ty3 + yycon3 = c3c4ty3*conz1*ty3 + yycon4 = c3c4ty3*con16*ty3 + yycon5 = c3c4ty3*c1c5*ty3 + + zzcon1 = c3c4tz3*con43*tz3 + zzcon2 = c3c4tz3*tz3 + zzcon3 = c3c4tz3*conz1*tz3 + zzcon4 = c3c4tz3*con16*tz3 + zzcon5 = c3c4tz3*c1c5*tz3 + + return + end diff --git a/examples/smpi/NAS/BT/setup_mpi.f b/examples/smpi/NAS/BT/setup_mpi.f new file mode 100644 index 0000000000..987c6bfba4 --- /dev/null +++ b/examples/smpi/NAS/BT/setup_mpi.f @@ -0,0 +1,64 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine setup_mpi + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c set up MPI stuff +c--------------------------------------------------------------------- + + implicit none + include 'mpinpb.h' + include 'npbparams.h' + integer error, color, nc + + call mpi_init(error) + + call mpi_comm_size(MPI_COMM_WORLD, total_nodes, error) + call mpi_comm_rank(MPI_COMM_WORLD, node, error) + + if (.not. convertdouble) then + dp_type = MPI_DOUBLE_PRECISION + else + dp_type = MPI_REAL + endif + +c--------------------------------------------------------------------- +c compute square root; add small number to allow for roundoff +c--------------------------------------------------------------------- + nc = dint(dsqrt(dble(total_nodes) + 0.00001d0)) + +c--------------------------------------------------------------------- +c We handle a non-square number of nodes by making the excess nodes +c inactive. However, we can never handle more cells than were compiled +c in. +c--------------------------------------------------------------------- + + if (nc .gt. maxcells) nc = maxcells + if (node .ge. nc*nc) then + active = .false. + color = 1 + else + active = .true. + color = 0 + end if + + call mpi_comm_split(MPI_COMM_WORLD,color,node,comm_setup,error) + if (.not. active) return + + call mpi_comm_size(comm_setup, no_nodes, error) + call mpi_comm_dup(comm_setup, comm_solve, error) + call mpi_comm_dup(comm_setup, comm_rhs, error) + +c--------------------------------------------------------------------- +c let node 0 be the root for the group (there is only one) +c--------------------------------------------------------------------- + root = 0 + + return + end + diff --git a/examples/smpi/NAS/BT/simple_mpiio.f b/examples/smpi/NAS/BT/simple_mpiio.f new file mode 100644 index 0000000000..02e2700177 --- /dev/null +++ b/examples/smpi/NAS/BT/simple_mpiio.f @@ -0,0 +1,213 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine setup_btio + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer m, ierr + + iseek=0 + + if (node .eq. root) then + call MPI_File_delete(filenm, MPI_INFO_NULL, ierr) + endif + + call MPI_Barrier(comm_solve, ierr) + + call MPI_File_open(comm_solve, + $ filenm, + $ MPI_MODE_RDWR + MPI_MODE_CREATE, + $ MPI_INFO_NULL, + $ fp, + $ ierr) + + call MPI_File_set_view(fp, + $ iseek, MPI_DOUBLE_PRECISION, MPI_DOUBLE_PRECISION, + $ 'native', MPI_INFO_NULL, ierr) + + if (ierr .ne. MPI_SUCCESS) then + print *, 'Error opening file' + stop + endif + + do m = 1, 5 + xce_sub(m) = 0.d0 + end do + + idump_sub = 0 + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine output_timestep + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + include 'header.h' + include 'mpinpb.h' + + integer count, jio, kio, cio, aio + integer ierr + integer mstatus(MPI_STATUS_SIZE) + + do cio=1,ncells + do kio=0, cell_size(3,cio)-1 + do jio=0, cell_size(2,cio)-1 + iseek=5*(cell_low(1,cio) + + $ PROBLEM_SIZE*((cell_low(2,cio)+jio) + + $ PROBLEM_SIZE*((cell_low(3,cio)+kio) + + $ PROBLEM_SIZE*idump_sub))) + + count=5*cell_size(1,cio) + + call MPI_File_write_at(fp, iseek, + $ u(1,0,jio,kio,cio), + $ count, MPI_DOUBLE_PRECISION, + $ mstatus, ierr) + + if (ierr .ne. MPI_SUCCESS) then + print *, 'Error writing to file' + stop + endif + enddo + enddo + enddo + + idump_sub = idump_sub + 1 + if (rd_interval .gt. 0) then + if (idump_sub .ge. rd_interval) then + + call acc_sub_norms(idump+1) + + idump_sub = 0 + endif + endif + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine acc_sub_norms(idump_cur) + + include 'header.h' + include 'mpinpb.h' + + integer idump_cur + + integer count, jio, kio, cio, ii, m, ichunk + integer ierr + integer mstatus(MPI_STATUS_SIZE) + double precision xce_single(5) + + ichunk = idump_cur - idump_sub + 1 + do ii=0, idump_sub-1 + do cio=1,ncells + do kio=0, cell_size(3,cio)-1 + do jio=0, cell_size(2,cio)-1 + iseek=5*(cell_low(1,cio) + + $ PROBLEM_SIZE*((cell_low(2,cio)+jio) + + $ PROBLEM_SIZE*((cell_low(3,cio)+kio) + + $ PROBLEM_SIZE*ii))) + + count=5*cell_size(1,cio) + + call MPI_File_read_at(fp, iseek, + $ u(1,0,jio,kio,cio), + $ count, MPI_DOUBLE_PRECISION, + $ mstatus, ierr) + + if (ierr .ne. MPI_SUCCESS) then + print *, 'Error reading back file' + call MPI_File_close(fp, ierr) + stop + endif + enddo + enddo + enddo + + if (node .eq. root) print *, 'Reading data set ', ii+ichunk + + call error_norm(xce_single) + do m = 1, 5 + xce_sub(m) = xce_sub(m) + xce_single(m) + end do + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine btio_cleanup + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer ierr + + call MPI_File_close(fp, ierr) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine accumulate_norms(xce_acc) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + double precision xce_acc(5) + integer m, ierr + + if (rd_interval .gt. 0) goto 20 + + call MPI_File_open(comm_solve, + $ filenm, + $ MPI_MODE_RDONLY, + $ MPI_INFO_NULL, + $ fp, + $ ierr) + + iseek = 0 + call MPI_File_set_view(fp, + $ iseek, MPI_DOUBLE_PRECISION, MPI_DOUBLE_PRECISION, + $ 'native', MPI_INFO_NULL, ierr) + +c clear the last time step + + call clear_timestep + +c read back the time steps and accumulate norms + + call acc_sub_norms(idump) + + call MPI_File_close(fp, ierr) + + 20 continue + do m = 1, 5 + xce_acc(m) = xce_sub(m) / dble(idump) + end do + + return + end + diff --git a/examples/smpi/NAS/BT/solve_subs.f b/examples/smpi/NAS/BT/solve_subs.f new file mode 100644 index 0000000000..351489a313 --- /dev/null +++ b/examples/smpi/NAS/BT/solve_subs.f @@ -0,0 +1,642 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine matvec_sub(ablock,avec,bvec) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c subtracts bvec=bvec - ablock*avec +c--------------------------------------------------------------------- + + implicit none + + double precision ablock,avec,bvec + dimension ablock(5,5),avec(5),bvec(5) + +c--------------------------------------------------------------------- +c rhs(i,ic,jc,kc,ccell) = rhs(i,ic,jc,kc,ccell) +c $ - lhs(i,1,ablock,ia,ja,ka,acell)* +c--------------------------------------------------------------------- + bvec(1) = bvec(1) - ablock(1,1)*avec(1) + > - ablock(1,2)*avec(2) + > - ablock(1,3)*avec(3) + > - ablock(1,4)*avec(4) + > - ablock(1,5)*avec(5) + bvec(2) = bvec(2) - ablock(2,1)*avec(1) + > - ablock(2,2)*avec(2) + > - ablock(2,3)*avec(3) + > - ablock(2,4)*avec(4) + > - ablock(2,5)*avec(5) + bvec(3) = bvec(3) - ablock(3,1)*avec(1) + > - ablock(3,2)*avec(2) + > - ablock(3,3)*avec(3) + > - ablock(3,4)*avec(4) + > - ablock(3,5)*avec(5) + bvec(4) = bvec(4) - ablock(4,1)*avec(1) + > - ablock(4,2)*avec(2) + > - ablock(4,3)*avec(3) + > - ablock(4,4)*avec(4) + > - ablock(4,5)*avec(5) + bvec(5) = bvec(5) - ablock(5,1)*avec(1) + > - ablock(5,2)*avec(2) + > - ablock(5,3)*avec(3) + > - ablock(5,4)*avec(4) + > - ablock(5,5)*avec(5) + + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine matmul_sub(ablock, bblock, cblock) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c subtracts a(i,j,k) X b(i,j,k) from c(i,j,k) +c--------------------------------------------------------------------- + + implicit none + + double precision ablock, bblock, cblock + dimension ablock(5,5), bblock(5,5), cblock(5,5) + + + cblock(1,1) = cblock(1,1) - ablock(1,1)*bblock(1,1) + > - ablock(1,2)*bblock(2,1) + > - ablock(1,3)*bblock(3,1) + > - ablock(1,4)*bblock(4,1) + > - ablock(1,5)*bblock(5,1) + cblock(2,1) = cblock(2,1) - ablock(2,1)*bblock(1,1) + > - ablock(2,2)*bblock(2,1) + > - ablock(2,3)*bblock(3,1) + > - ablock(2,4)*bblock(4,1) + > - ablock(2,5)*bblock(5,1) + cblock(3,1) = cblock(3,1) - ablock(3,1)*bblock(1,1) + > - ablock(3,2)*bblock(2,1) + > - ablock(3,3)*bblock(3,1) + > - ablock(3,4)*bblock(4,1) + > - ablock(3,5)*bblock(5,1) + cblock(4,1) = cblock(4,1) - ablock(4,1)*bblock(1,1) + > - ablock(4,2)*bblock(2,1) + > - ablock(4,3)*bblock(3,1) + > - ablock(4,4)*bblock(4,1) + > - ablock(4,5)*bblock(5,1) + cblock(5,1) = cblock(5,1) - ablock(5,1)*bblock(1,1) + > - ablock(5,2)*bblock(2,1) + > - ablock(5,3)*bblock(3,1) + > - ablock(5,4)*bblock(4,1) + > - ablock(5,5)*bblock(5,1) + cblock(1,2) = cblock(1,2) - ablock(1,1)*bblock(1,2) + > - ablock(1,2)*bblock(2,2) + > - ablock(1,3)*bblock(3,2) + > - ablock(1,4)*bblock(4,2) + > - ablock(1,5)*bblock(5,2) + cblock(2,2) = cblock(2,2) - ablock(2,1)*bblock(1,2) + > - ablock(2,2)*bblock(2,2) + > - ablock(2,3)*bblock(3,2) + > - ablock(2,4)*bblock(4,2) + > - ablock(2,5)*bblock(5,2) + cblock(3,2) = cblock(3,2) - ablock(3,1)*bblock(1,2) + > - ablock(3,2)*bblock(2,2) + > - ablock(3,3)*bblock(3,2) + > - ablock(3,4)*bblock(4,2) + > - ablock(3,5)*bblock(5,2) + cblock(4,2) = cblock(4,2) - ablock(4,1)*bblock(1,2) + > - ablock(4,2)*bblock(2,2) + > - ablock(4,3)*bblock(3,2) + > - ablock(4,4)*bblock(4,2) + > - ablock(4,5)*bblock(5,2) + cblock(5,2) = cblock(5,2) - ablock(5,1)*bblock(1,2) + > - ablock(5,2)*bblock(2,2) + > - ablock(5,3)*bblock(3,2) + > - ablock(5,4)*bblock(4,2) + > - ablock(5,5)*bblock(5,2) + cblock(1,3) = cblock(1,3) - ablock(1,1)*bblock(1,3) + > - ablock(1,2)*bblock(2,3) + > - ablock(1,3)*bblock(3,3) + > - ablock(1,4)*bblock(4,3) + > - ablock(1,5)*bblock(5,3) + cblock(2,3) = cblock(2,3) - ablock(2,1)*bblock(1,3) + > - ablock(2,2)*bblock(2,3) + > - ablock(2,3)*bblock(3,3) + > - ablock(2,4)*bblock(4,3) + > - ablock(2,5)*bblock(5,3) + cblock(3,3) = cblock(3,3) - ablock(3,1)*bblock(1,3) + > - ablock(3,2)*bblock(2,3) + > - ablock(3,3)*bblock(3,3) + > - ablock(3,4)*bblock(4,3) + > - ablock(3,5)*bblock(5,3) + cblock(4,3) = cblock(4,3) - ablock(4,1)*bblock(1,3) + > - ablock(4,2)*bblock(2,3) + > - ablock(4,3)*bblock(3,3) + > - ablock(4,4)*bblock(4,3) + > - ablock(4,5)*bblock(5,3) + cblock(5,3) = cblock(5,3) - ablock(5,1)*bblock(1,3) + > - ablock(5,2)*bblock(2,3) + > - ablock(5,3)*bblock(3,3) + > - ablock(5,4)*bblock(4,3) + > - ablock(5,5)*bblock(5,3) + cblock(1,4) = cblock(1,4) - ablock(1,1)*bblock(1,4) + > - ablock(1,2)*bblock(2,4) + > - ablock(1,3)*bblock(3,4) + > - ablock(1,4)*bblock(4,4) + > - ablock(1,5)*bblock(5,4) + cblock(2,4) = cblock(2,4) - ablock(2,1)*bblock(1,4) + > - ablock(2,2)*bblock(2,4) + > - ablock(2,3)*bblock(3,4) + > - ablock(2,4)*bblock(4,4) + > - ablock(2,5)*bblock(5,4) + cblock(3,4) = cblock(3,4) - ablock(3,1)*bblock(1,4) + > - ablock(3,2)*bblock(2,4) + > - ablock(3,3)*bblock(3,4) + > - ablock(3,4)*bblock(4,4) + > - ablock(3,5)*bblock(5,4) + cblock(4,4) = cblock(4,4) - ablock(4,1)*bblock(1,4) + > - ablock(4,2)*bblock(2,4) + > - ablock(4,3)*bblock(3,4) + > - ablock(4,4)*bblock(4,4) + > - ablock(4,5)*bblock(5,4) + cblock(5,4) = cblock(5,4) - ablock(5,1)*bblock(1,4) + > - ablock(5,2)*bblock(2,4) + > - ablock(5,3)*bblock(3,4) + > - ablock(5,4)*bblock(4,4) + > - ablock(5,5)*bblock(5,4) + cblock(1,5) = cblock(1,5) - ablock(1,1)*bblock(1,5) + > - ablock(1,2)*bblock(2,5) + > - ablock(1,3)*bblock(3,5) + > - ablock(1,4)*bblock(4,5) + > - ablock(1,5)*bblock(5,5) + cblock(2,5) = cblock(2,5) - ablock(2,1)*bblock(1,5) + > - ablock(2,2)*bblock(2,5) + > - ablock(2,3)*bblock(3,5) + > - ablock(2,4)*bblock(4,5) + > - ablock(2,5)*bblock(5,5) + cblock(3,5) = cblock(3,5) - ablock(3,1)*bblock(1,5) + > - ablock(3,2)*bblock(2,5) + > - ablock(3,3)*bblock(3,5) + > - ablock(3,4)*bblock(4,5) + > - ablock(3,5)*bblock(5,5) + cblock(4,5) = cblock(4,5) - ablock(4,1)*bblock(1,5) + > - ablock(4,2)*bblock(2,5) + > - ablock(4,3)*bblock(3,5) + > - ablock(4,4)*bblock(4,5) + > - ablock(4,5)*bblock(5,5) + cblock(5,5) = cblock(5,5) - ablock(5,1)*bblock(1,5) + > - ablock(5,2)*bblock(2,5) + > - ablock(5,3)*bblock(3,5) + > - ablock(5,4)*bblock(4,5) + > - ablock(5,5)*bblock(5,5) + + + return + end + + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine binvcrhs( lhs,c,r ) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c--------------------------------------------------------------------- + + implicit none + + double precision pivot, coeff, lhs + dimension lhs(5,5) + double precision c(5,5), r(5) + +c--------------------------------------------------------------------- +c +c--------------------------------------------------------------------- + + pivot = 1.00d0/lhs(1,1) + lhs(1,2) = lhs(1,2)*pivot + lhs(1,3) = lhs(1,3)*pivot + lhs(1,4) = lhs(1,4)*pivot + lhs(1,5) = lhs(1,5)*pivot + c(1,1) = c(1,1)*pivot + c(1,2) = c(1,2)*pivot + c(1,3) = c(1,3)*pivot + c(1,4) = c(1,4)*pivot + c(1,5) = c(1,5)*pivot + r(1) = r(1) *pivot + + coeff = lhs(2,1) + lhs(2,2)= lhs(2,2) - coeff*lhs(1,2) + lhs(2,3)= lhs(2,3) - coeff*lhs(1,3) + lhs(2,4)= lhs(2,4) - coeff*lhs(1,4) + lhs(2,5)= lhs(2,5) - coeff*lhs(1,5) + c(2,1) = c(2,1) - coeff*c(1,1) + c(2,2) = c(2,2) - coeff*c(1,2) + c(2,3) = c(2,3) - coeff*c(1,3) + c(2,4) = c(2,4) - coeff*c(1,4) + c(2,5) = c(2,5) - coeff*c(1,5) + r(2) = r(2) - coeff*r(1) + + coeff = lhs(3,1) + lhs(3,2)= lhs(3,2) - coeff*lhs(1,2) + lhs(3,3)= lhs(3,3) - coeff*lhs(1,3) + lhs(3,4)= lhs(3,4) - coeff*lhs(1,4) + lhs(3,5)= lhs(3,5) - coeff*lhs(1,5) + c(3,1) = c(3,1) - coeff*c(1,1) + c(3,2) = c(3,2) - coeff*c(1,2) + c(3,3) = c(3,3) - coeff*c(1,3) + c(3,4) = c(3,4) - coeff*c(1,4) + c(3,5) = c(3,5) - coeff*c(1,5) + r(3) = r(3) - coeff*r(1) + + coeff = lhs(4,1) + lhs(4,2)= lhs(4,2) - coeff*lhs(1,2) + lhs(4,3)= lhs(4,3) - coeff*lhs(1,3) + lhs(4,4)= lhs(4,4) - coeff*lhs(1,4) + lhs(4,5)= lhs(4,5) - coeff*lhs(1,5) + c(4,1) = c(4,1) - coeff*c(1,1) + c(4,2) = c(4,2) - coeff*c(1,2) + c(4,3) = c(4,3) - coeff*c(1,3) + c(4,4) = c(4,4) - coeff*c(1,4) + c(4,5) = c(4,5) - coeff*c(1,5) + r(4) = r(4) - coeff*r(1) + + coeff = lhs(5,1) + lhs(5,2)= lhs(5,2) - coeff*lhs(1,2) + lhs(5,3)= lhs(5,3) - coeff*lhs(1,3) + lhs(5,4)= lhs(5,4) - coeff*lhs(1,4) + lhs(5,5)= lhs(5,5) - coeff*lhs(1,5) + c(5,1) = c(5,1) - coeff*c(1,1) + c(5,2) = c(5,2) - coeff*c(1,2) + c(5,3) = c(5,3) - coeff*c(1,3) + c(5,4) = c(5,4) - coeff*c(1,4) + c(5,5) = c(5,5) - coeff*c(1,5) + r(5) = r(5) - coeff*r(1) + + + pivot = 1.00d0/lhs(2,2) + lhs(2,3) = lhs(2,3)*pivot + lhs(2,4) = lhs(2,4)*pivot + lhs(2,5) = lhs(2,5)*pivot + c(2,1) = c(2,1)*pivot + c(2,2) = c(2,2)*pivot + c(2,3) = c(2,3)*pivot + c(2,4) = c(2,4)*pivot + c(2,5) = c(2,5)*pivot + r(2) = r(2) *pivot + + coeff = lhs(1,2) + lhs(1,3)= lhs(1,3) - coeff*lhs(2,3) + lhs(1,4)= lhs(1,4) - coeff*lhs(2,4) + lhs(1,5)= lhs(1,5) - coeff*lhs(2,5) + c(1,1) = c(1,1) - coeff*c(2,1) + c(1,2) = c(1,2) - coeff*c(2,2) + c(1,3) = c(1,3) - coeff*c(2,3) + c(1,4) = c(1,4) - coeff*c(2,4) + c(1,5) = c(1,5) - coeff*c(2,5) + r(1) = r(1) - coeff*r(2) + + coeff = lhs(3,2) + lhs(3,3)= lhs(3,3) - coeff*lhs(2,3) + lhs(3,4)= lhs(3,4) - coeff*lhs(2,4) + lhs(3,5)= lhs(3,5) - coeff*lhs(2,5) + c(3,1) = c(3,1) - coeff*c(2,1) + c(3,2) = c(3,2) - coeff*c(2,2) + c(3,3) = c(3,3) - coeff*c(2,3) + c(3,4) = c(3,4) - coeff*c(2,4) + c(3,5) = c(3,5) - coeff*c(2,5) + r(3) = r(3) - coeff*r(2) + + coeff = lhs(4,2) + lhs(4,3)= lhs(4,3) - coeff*lhs(2,3) + lhs(4,4)= lhs(4,4) - coeff*lhs(2,4) + lhs(4,5)= lhs(4,5) - coeff*lhs(2,5) + c(4,1) = c(4,1) - coeff*c(2,1) + c(4,2) = c(4,2) - coeff*c(2,2) + c(4,3) = c(4,3) - coeff*c(2,3) + c(4,4) = c(4,4) - coeff*c(2,4) + c(4,5) = c(4,5) - coeff*c(2,5) + r(4) = r(4) - coeff*r(2) + + coeff = lhs(5,2) + lhs(5,3)= lhs(5,3) - coeff*lhs(2,3) + lhs(5,4)= lhs(5,4) - coeff*lhs(2,4) + lhs(5,5)= lhs(5,5) - coeff*lhs(2,5) + c(5,1) = c(5,1) - coeff*c(2,1) + c(5,2) = c(5,2) - coeff*c(2,2) + c(5,3) = c(5,3) - coeff*c(2,3) + c(5,4) = c(5,4) - coeff*c(2,4) + c(5,5) = c(5,5) - coeff*c(2,5) + r(5) = r(5) - coeff*r(2) + + + pivot = 1.00d0/lhs(3,3) + lhs(3,4) = lhs(3,4)*pivot + lhs(3,5) = lhs(3,5)*pivot + c(3,1) = c(3,1)*pivot + c(3,2) = c(3,2)*pivot + c(3,3) = c(3,3)*pivot + c(3,4) = c(3,4)*pivot + c(3,5) = c(3,5)*pivot + r(3) = r(3) *pivot + + coeff = lhs(1,3) + lhs(1,4)= lhs(1,4) - coeff*lhs(3,4) + lhs(1,5)= lhs(1,5) - coeff*lhs(3,5) + c(1,1) = c(1,1) - coeff*c(3,1) + c(1,2) = c(1,2) - coeff*c(3,2) + c(1,3) = c(1,3) - coeff*c(3,3) + c(1,4) = c(1,4) - coeff*c(3,4) + c(1,5) = c(1,5) - coeff*c(3,5) + r(1) = r(1) - coeff*r(3) + + coeff = lhs(2,3) + lhs(2,4)= lhs(2,4) - coeff*lhs(3,4) + lhs(2,5)= lhs(2,5) - coeff*lhs(3,5) + c(2,1) = c(2,1) - coeff*c(3,1) + c(2,2) = c(2,2) - coeff*c(3,2) + c(2,3) = c(2,3) - coeff*c(3,3) + c(2,4) = c(2,4) - coeff*c(3,4) + c(2,5) = c(2,5) - coeff*c(3,5) + r(2) = r(2) - coeff*r(3) + + coeff = lhs(4,3) + lhs(4,4)= lhs(4,4) - coeff*lhs(3,4) + lhs(4,5)= lhs(4,5) - coeff*lhs(3,5) + c(4,1) = c(4,1) - coeff*c(3,1) + c(4,2) = c(4,2) - coeff*c(3,2) + c(4,3) = c(4,3) - coeff*c(3,3) + c(4,4) = c(4,4) - coeff*c(3,4) + c(4,5) = c(4,5) - coeff*c(3,5) + r(4) = r(4) - coeff*r(3) + + coeff = lhs(5,3) + lhs(5,4)= lhs(5,4) - coeff*lhs(3,4) + lhs(5,5)= lhs(5,5) - coeff*lhs(3,5) + c(5,1) = c(5,1) - coeff*c(3,1) + c(5,2) = c(5,2) - coeff*c(3,2) + c(5,3) = c(5,3) - coeff*c(3,3) + c(5,4) = c(5,4) - coeff*c(3,4) + c(5,5) = c(5,5) - coeff*c(3,5) + r(5) = r(5) - coeff*r(3) + + + pivot = 1.00d0/lhs(4,4) + lhs(4,5) = lhs(4,5)*pivot + c(4,1) = c(4,1)*pivot + c(4,2) = c(4,2)*pivot + c(4,3) = c(4,3)*pivot + c(4,4) = c(4,4)*pivot + c(4,5) = c(4,5)*pivot + r(4) = r(4) *pivot + + coeff = lhs(1,4) + lhs(1,5)= lhs(1,5) - coeff*lhs(4,5) + c(1,1) = c(1,1) - coeff*c(4,1) + c(1,2) = c(1,2) - coeff*c(4,2) + c(1,3) = c(1,3) - coeff*c(4,3) + c(1,4) = c(1,4) - coeff*c(4,4) + c(1,5) = c(1,5) - coeff*c(4,5) + r(1) = r(1) - coeff*r(4) + + coeff = lhs(2,4) + lhs(2,5)= lhs(2,5) - coeff*lhs(4,5) + c(2,1) = c(2,1) - coeff*c(4,1) + c(2,2) = c(2,2) - coeff*c(4,2) + c(2,3) = c(2,3) - coeff*c(4,3) + c(2,4) = c(2,4) - coeff*c(4,4) + c(2,5) = c(2,5) - coeff*c(4,5) + r(2) = r(2) - coeff*r(4) + + coeff = lhs(3,4) + lhs(3,5)= lhs(3,5) - coeff*lhs(4,5) + c(3,1) = c(3,1) - coeff*c(4,1) + c(3,2) = c(3,2) - coeff*c(4,2) + c(3,3) = c(3,3) - coeff*c(4,3) + c(3,4) = c(3,4) - coeff*c(4,4) + c(3,5) = c(3,5) - coeff*c(4,5) + r(3) = r(3) - coeff*r(4) + + coeff = lhs(5,4) + lhs(5,5)= lhs(5,5) - coeff*lhs(4,5) + c(5,1) = c(5,1) - coeff*c(4,1) + c(5,2) = c(5,2) - coeff*c(4,2) + c(5,3) = c(5,3) - coeff*c(4,3) + c(5,4) = c(5,4) - coeff*c(4,4) + c(5,5) = c(5,5) - coeff*c(4,5) + r(5) = r(5) - coeff*r(4) + + + pivot = 1.00d0/lhs(5,5) + c(5,1) = c(5,1)*pivot + c(5,2) = c(5,2)*pivot + c(5,3) = c(5,3)*pivot + c(5,4) = c(5,4)*pivot + c(5,5) = c(5,5)*pivot + r(5) = r(5) *pivot + + coeff = lhs(1,5) + c(1,1) = c(1,1) - coeff*c(5,1) + c(1,2) = c(1,2) - coeff*c(5,2) + c(1,3) = c(1,3) - coeff*c(5,3) + c(1,4) = c(1,4) - coeff*c(5,4) + c(1,5) = c(1,5) - coeff*c(5,5) + r(1) = r(1) - coeff*r(5) + + coeff = lhs(2,5) + c(2,1) = c(2,1) - coeff*c(5,1) + c(2,2) = c(2,2) - coeff*c(5,2) + c(2,3) = c(2,3) - coeff*c(5,3) + c(2,4) = c(2,4) - coeff*c(5,4) + c(2,5) = c(2,5) - coeff*c(5,5) + r(2) = r(2) - coeff*r(5) + + coeff = lhs(3,5) + c(3,1) = c(3,1) - coeff*c(5,1) + c(3,2) = c(3,2) - coeff*c(5,2) + c(3,3) = c(3,3) - coeff*c(5,3) + c(3,4) = c(3,4) - coeff*c(5,4) + c(3,5) = c(3,5) - coeff*c(5,5) + r(3) = r(3) - coeff*r(5) + + coeff = lhs(4,5) + c(4,1) = c(4,1) - coeff*c(5,1) + c(4,2) = c(4,2) - coeff*c(5,2) + c(4,3) = c(4,3) - coeff*c(5,3) + c(4,4) = c(4,4) - coeff*c(5,4) + c(4,5) = c(4,5) - coeff*c(5,5) + r(4) = r(4) - coeff*r(5) + + + return + end + + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine binvrhs( lhs,r ) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c--------------------------------------------------------------------- + + implicit none + + double precision pivot, coeff, lhs + dimension lhs(5,5) + double precision r(5) + +c--------------------------------------------------------------------- +c +c--------------------------------------------------------------------- + + + pivot = 1.00d0/lhs(1,1) + lhs(1,2) = lhs(1,2)*pivot + lhs(1,3) = lhs(1,3)*pivot + lhs(1,4) = lhs(1,4)*pivot + lhs(1,5) = lhs(1,5)*pivot + r(1) = r(1) *pivot + + coeff = lhs(2,1) + lhs(2,2)= lhs(2,2) - coeff*lhs(1,2) + lhs(2,3)= lhs(2,3) - coeff*lhs(1,3) + lhs(2,4)= lhs(2,4) - coeff*lhs(1,4) + lhs(2,5)= lhs(2,5) - coeff*lhs(1,5) + r(2) = r(2) - coeff*r(1) + + coeff = lhs(3,1) + lhs(3,2)= lhs(3,2) - coeff*lhs(1,2) + lhs(3,3)= lhs(3,3) - coeff*lhs(1,3) + lhs(3,4)= lhs(3,4) - coeff*lhs(1,4) + lhs(3,5)= lhs(3,5) - coeff*lhs(1,5) + r(3) = r(3) - coeff*r(1) + + coeff = lhs(4,1) + lhs(4,2)= lhs(4,2) - coeff*lhs(1,2) + lhs(4,3)= lhs(4,3) - coeff*lhs(1,3) + lhs(4,4)= lhs(4,4) - coeff*lhs(1,4) + lhs(4,5)= lhs(4,5) - coeff*lhs(1,5) + r(4) = r(4) - coeff*r(1) + + coeff = lhs(5,1) + lhs(5,2)= lhs(5,2) - coeff*lhs(1,2) + lhs(5,3)= lhs(5,3) - coeff*lhs(1,3) + lhs(5,4)= lhs(5,4) - coeff*lhs(1,4) + lhs(5,5)= lhs(5,5) - coeff*lhs(1,5) + r(5) = r(5) - coeff*r(1) + + + pivot = 1.00d0/lhs(2,2) + lhs(2,3) = lhs(2,3)*pivot + lhs(2,4) = lhs(2,4)*pivot + lhs(2,5) = lhs(2,5)*pivot + r(2) = r(2) *pivot + + coeff = lhs(1,2) + lhs(1,3)= lhs(1,3) - coeff*lhs(2,3) + lhs(1,4)= lhs(1,4) - coeff*lhs(2,4) + lhs(1,5)= lhs(1,5) - coeff*lhs(2,5) + r(1) = r(1) - coeff*r(2) + + coeff = lhs(3,2) + lhs(3,3)= lhs(3,3) - coeff*lhs(2,3) + lhs(3,4)= lhs(3,4) - coeff*lhs(2,4) + lhs(3,5)= lhs(3,5) - coeff*lhs(2,5) + r(3) = r(3) - coeff*r(2) + + coeff = lhs(4,2) + lhs(4,3)= lhs(4,3) - coeff*lhs(2,3) + lhs(4,4)= lhs(4,4) - coeff*lhs(2,4) + lhs(4,5)= lhs(4,5) - coeff*lhs(2,5) + r(4) = r(4) - coeff*r(2) + + coeff = lhs(5,2) + lhs(5,3)= lhs(5,3) - coeff*lhs(2,3) + lhs(5,4)= lhs(5,4) - coeff*lhs(2,4) + lhs(5,5)= lhs(5,5) - coeff*lhs(2,5) + r(5) = r(5) - coeff*r(2) + + + pivot = 1.00d0/lhs(3,3) + lhs(3,4) = lhs(3,4)*pivot + lhs(3,5) = lhs(3,5)*pivot + r(3) = r(3) *pivot + + coeff = lhs(1,3) + lhs(1,4)= lhs(1,4) - coeff*lhs(3,4) + lhs(1,5)= lhs(1,5) - coeff*lhs(3,5) + r(1) = r(1) - coeff*r(3) + + coeff = lhs(2,3) + lhs(2,4)= lhs(2,4) - coeff*lhs(3,4) + lhs(2,5)= lhs(2,5) - coeff*lhs(3,5) + r(2) = r(2) - coeff*r(3) + + coeff = lhs(4,3) + lhs(4,4)= lhs(4,4) - coeff*lhs(3,4) + lhs(4,5)= lhs(4,5) - coeff*lhs(3,5) + r(4) = r(4) - coeff*r(3) + + coeff = lhs(5,3) + lhs(5,4)= lhs(5,4) - coeff*lhs(3,4) + lhs(5,5)= lhs(5,5) - coeff*lhs(3,5) + r(5) = r(5) - coeff*r(3) + + + pivot = 1.00d0/lhs(4,4) + lhs(4,5) = lhs(4,5)*pivot + r(4) = r(4) *pivot + + coeff = lhs(1,4) + lhs(1,5)= lhs(1,5) - coeff*lhs(4,5) + r(1) = r(1) - coeff*r(4) + + coeff = lhs(2,4) + lhs(2,5)= lhs(2,5) - coeff*lhs(4,5) + r(2) = r(2) - coeff*r(4) + + coeff = lhs(3,4) + lhs(3,5)= lhs(3,5) - coeff*lhs(4,5) + r(3) = r(3) - coeff*r(4) + + coeff = lhs(5,4) + lhs(5,5)= lhs(5,5) - coeff*lhs(4,5) + r(5) = r(5) - coeff*r(4) + + + pivot = 1.00d0/lhs(5,5) + r(5) = r(5) *pivot + + coeff = lhs(1,5) + r(1) = r(1) - coeff*r(5) + + coeff = lhs(2,5) + r(2) = r(2) - coeff*r(5) + + coeff = lhs(3,5) + r(3) = r(3) - coeff*r(5) + + coeff = lhs(4,5) + r(4) = r(4) - coeff*r(5) + + + return + end + + + diff --git a/examples/smpi/NAS/BT/verify.f b/examples/smpi/NAS/BT/verify.f new file mode 100644 index 0000000000..7dbc8a96a2 --- /dev/null +++ b/examples/smpi/NAS/BT/verify.f @@ -0,0 +1,435 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine verify(no_time_steps, class, verified) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c verification routine +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + double precision xcrref(5),xceref(5),xcrdif(5),xcedif(5), + > epsilon, xce(5), xcr(5), dtref + integer m, no_time_steps + character class + logical verified + +c--------------------------------------------------------------------- +c tolerance level +c--------------------------------------------------------------------- + epsilon = 1.0d-08 + verified = .true. + +c--------------------------------------------------------------------- +c compute the error norm and the residual norm, and exit if not printing +c--------------------------------------------------------------------- + + if (iotype .ne. 0) then + call accumulate_norms(xce) + else + call error_norm(xce) + endif + + call copy_faces + + call rhs_norm(xcr) + + do m = 1, 5 + xcr(m) = xcr(m) / dt + enddo + + if (node .ne. 0) return + + class = 'U' + + do m = 1,5 + xcrref(m) = 1.0 + xceref(m) = 1.0 + end do + +c--------------------------------------------------------------------- +c reference data for 12X12X12 grids after 60 time steps, with DT = 1.0d-02 +c--------------------------------------------------------------------- + if ( (grid_points(1) .eq. 12 ) .and. + > (grid_points(2) .eq. 12 ) .and. + > (grid_points(3) .eq. 12 ) .and. + > (no_time_steps .eq. 60 )) then + + class = 'S' + dtref = 1.0d-2 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual. +c--------------------------------------------------------------------- + xcrref(1) = 1.7034283709541311d-01 + xcrref(2) = 1.2975252070034097d-02 + xcrref(3) = 3.2527926989486055d-02 + xcrref(4) = 2.6436421275166801d-02 + xcrref(5) = 1.9211784131744430d-01 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error. +c--------------------------------------------------------------------- + + if (iotype .eq. 0) then + xceref(1) = 4.9976913345811579d-04 + xceref(2) = 4.5195666782961927d-05 + xceref(3) = 7.3973765172921357d-05 + xceref(4) = 7.3821238632439731d-05 + xceref(5) = 8.9269630987491446d-04 + else + xceref(1) = 0.1149036328945d+02 + xceref(2) = 0.9156788904727d+00 + xceref(3) = 0.2857899428614d+01 + xceref(4) = 0.2598273346734d+01 + xceref(5) = 0.2652795397547d+02 + endif + +c--------------------------------------------------------------------- +c reference data for 24X24X24 grids after 200 time steps, with DT = 0.8d-3 +c--------------------------------------------------------------------- + elseif ( (grid_points(1) .eq. 24) .and. + > (grid_points(2) .eq. 24) .and. + > (grid_points(3) .eq. 24) .and. + > (no_time_steps . eq. 200) ) then + + class = 'W' + dtref = 0.8d-3 +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual. +c--------------------------------------------------------------------- + xcrref(1) = 0.1125590409344d+03 + xcrref(2) = 0.1180007595731d+02 + xcrref(3) = 0.2710329767846d+02 + xcrref(4) = 0.2469174937669d+02 + xcrref(5) = 0.2638427874317d+03 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error. +c--------------------------------------------------------------------- + + if (iotype .eq. 0) then + xceref(1) = 0.4419655736008d+01 + xceref(2) = 0.4638531260002d+00 + xceref(3) = 0.1011551749967d+01 + xceref(4) = 0.9235878729944d+00 + xceref(5) = 0.1018045837718d+02 + else + xceref(1) = 0.6729594398612d+02 + xceref(2) = 0.5264523081690d+01 + xceref(3) = 0.1677107142637d+02 + xceref(4) = 0.1508721463436d+02 + xceref(5) = 0.1477018363393d+03 + endif + + +c--------------------------------------------------------------------- +c reference data for 64X64X64 grids after 200 time steps, with DT = 0.8d-3 +c--------------------------------------------------------------------- + elseif ( (grid_points(1) .eq. 64) .and. + > (grid_points(2) .eq. 64) .and. + > (grid_points(3) .eq. 64) .and. + > (no_time_steps . eq. 200) ) then + + class = 'A' + dtref = 0.8d-3 +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual. +c--------------------------------------------------------------------- + xcrref(1) = 1.0806346714637264d+02 + xcrref(2) = 1.1319730901220813d+01 + xcrref(3) = 2.5974354511582465d+01 + xcrref(4) = 2.3665622544678910d+01 + xcrref(5) = 2.5278963211748344d+02 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error. +c--------------------------------------------------------------------- + + if (iotype .eq. 0) then + xceref(1) = 4.2348416040525025d+00 + xceref(2) = 4.4390282496995698d-01 + xceref(3) = 9.6692480136345650d-01 + xceref(4) = 8.8302063039765474d-01 + xceref(5) = 9.7379901770829278d+00 + else + xceref(1) = 0.6482218724961d+02 + xceref(2) = 0.5066461714527d+01 + xceref(3) = 0.1613931961359d+02 + xceref(4) = 0.1452010201481d+02 + xceref(5) = 0.1420099377681d+03 + endif + +c--------------------------------------------------------------------- +c reference data for 102X102X102 grids after 200 time steps, +c with DT = 3.0d-04 +c--------------------------------------------------------------------- + elseif ( (grid_points(1) .eq. 102) .and. + > (grid_points(2) .eq. 102) .and. + > (grid_points(3) .eq. 102) .and. + > (no_time_steps . eq. 200) ) then + + class = 'B' + dtref = 3.0d-4 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual. +c--------------------------------------------------------------------- + xcrref(1) = 1.4233597229287254d+03 + xcrref(2) = 9.9330522590150238d+01 + xcrref(3) = 3.5646025644535285d+02 + xcrref(4) = 3.2485447959084092d+02 + xcrref(5) = 3.2707541254659363d+03 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error. +c--------------------------------------------------------------------- + + if (iotype .eq. 0) then + xceref(1) = 5.2969847140936856d+01 + xceref(2) = 4.4632896115670668d+00 + xceref(3) = 1.3122573342210174d+01 + xceref(4) = 1.2006925323559144d+01 + xceref(5) = 1.2459576151035986d+02 + else + xceref(1) = 0.1477545106464d+03 + xceref(2) = 0.1108895555053d+02 + xceref(3) = 0.3698065590331d+02 + xceref(4) = 0.3310505581440d+02 + xceref(5) = 0.3157928282563d+03 + endif + +c--------------------------------------------------------------------- +c reference data for 162X162X162 grids after 200 time steps, +c with DT = 1.0d-04 +c--------------------------------------------------------------------- + elseif ( (grid_points(1) .eq. 162) .and. + > (grid_points(2) .eq. 162) .and. + > (grid_points(3) .eq. 162) .and. + > (no_time_steps . eq. 200) ) then + + class = 'C' + dtref = 1.0d-4 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual. +c--------------------------------------------------------------------- + xcrref(1) = 0.62398116551764615d+04 + xcrref(2) = 0.50793239190423964d+03 + xcrref(3) = 0.15423530093013596d+04 + xcrref(4) = 0.13302387929291190d+04 + xcrref(5) = 0.11604087428436455d+05 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error. +c--------------------------------------------------------------------- + + if (iotype .eq. 0) then + xceref(1) = 0.16462008369091265d+03 + xceref(2) = 0.11497107903824313d+02 + xceref(3) = 0.41207446207461508d+02 + xceref(4) = 0.37087651059694167d+02 + xceref(5) = 0.36211053051841265d+03 + else + xceref(1) = 0.2597156483475d+03 + xceref(2) = 0.1985384289495d+02 + xceref(3) = 0.6517950485788d+02 + xceref(4) = 0.5757235541520d+02 + xceref(5) = 0.5215668188726d+03 + endif + + +c--------------------------------------------------------------------- +c reference data for 408x408x408 grids after 250 time steps, +c with DT = 0.2d-04 +c--------------------------------------------------------------------- + elseif ( (grid_points(1) .eq. 408) .and. + > (grid_points(2) .eq. 408) .and. + > (grid_points(3) .eq. 408) .and. + > (no_time_steps . eq. 250) ) then + + class = 'D' + dtref = 0.2d-4 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual. +c--------------------------------------------------------------------- + xcrref(1) = 0.2533188551738d+05 + xcrref(2) = 0.2346393716980d+04 + xcrref(3) = 0.6294554366904d+04 + xcrref(4) = 0.5352565376030d+04 + xcrref(5) = 0.3905864038618d+05 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error. +c--------------------------------------------------------------------- + + if (iotype .eq. 0) then + xceref(1) = 0.3100009377557d+03 + xceref(2) = 0.2424086324913d+02 + xceref(3) = 0.7782212022645d+02 + xceref(4) = 0.6835623860116d+02 + xceref(5) = 0.6065737200368d+03 + else + xceref(1) = 0.3813781566713d+03 + xceref(2) = 0.3160872966198d+02 + xceref(3) = 0.9593576357290d+02 + xceref(4) = 0.8363391989815d+02 + xceref(5) = 0.7063466087423d+03 + endif + + +c--------------------------------------------------------------------- +c reference data for 1020x1020x1020 grids after 250 time steps, +c with DT = 0.4d-05 +c--------------------------------------------------------------------- + elseif ( (grid_points(1) .eq. 1020) .and. + > (grid_points(2) .eq. 1020) .and. + > (grid_points(3) .eq. 1020) .and. + > (no_time_steps . eq. 250) ) then + + class = 'E' + dtref = 0.4d-5 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual. +c--------------------------------------------------------------------- + xcrref(1) = 0.9795372484517d+05 + xcrref(2) = 0.9739814511521d+04 + xcrref(3) = 0.2467606342965d+05 + xcrref(4) = 0.2092419572860d+05 + xcrref(5) = 0.1392138856939d+06 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error. +c--------------------------------------------------------------------- + + if (iotype .eq. 0) then + xceref(1) = 0.4327562208414d+03 + xceref(2) = 0.3699051964887d+02 + xceref(3) = 0.1089845040954d+03 + xceref(4) = 0.9462517622043d+02 + xceref(5) = 0.7765512765309d+03 + else +c wr_interval = 5 + xceref(1) = 0.4729898413058d+03 + xceref(2) = 0.4145899331704d+02 + xceref(3) = 0.1192850917138d+03 + xceref(4) = 0.1032746026932d+03 + xceref(5) = 0.8270322177634d+03 +c wr_interval = 10 +c xceref(1) = 0.4718135916251d+03 +c xceref(2) = 0.4132620259096d+02 +c xceref(3) = 0.1189831133503d+03 +c xceref(4) = 0.1030212798803d+03 +c xceref(5) = 0.8255924078458d+03 + endif + + else + verified = .false. + endif + +c--------------------------------------------------------------------- +c verification test for residuals if gridsize is one of +c the defined grid sizes above (class .ne. 'U') +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c Compute the difference of solution values and the known reference +c values. +c--------------------------------------------------------------------- + do m = 1, 5 + + xcrdif(m) = dabs((xcr(m)-xcrref(m))/xcrref(m)) + xcedif(m) = dabs((xce(m)-xceref(m))/xceref(m)) + + enddo + +c--------------------------------------------------------------------- +c Output the comparison of computed results to known cases. +c--------------------------------------------------------------------- + + if (class .ne. 'U') then + write(*, 1990) class + 1990 format(' Verification being performed for class ', a) + write (*,2000) epsilon + 2000 format(' accuracy setting for epsilon = ', E20.13) + verified = (dabs(dt-dtref) .le. epsilon) + if (.not.verified) then + verified = .false. + class = 'U' + write (*,1000) dtref + 1000 format(' DT does not match the reference value of ', + > E15.8) + endif + else + write(*, 1995) + 1995 format(' Unknown class') + endif + + + if (class .ne. 'U') then + write (*,2001) + else + write (*, 2005) + endif + + 2001 format(' Comparison of RMS-norms of residual') + 2005 format(' RMS-norms of residual') + do m = 1, 5 + if (class .eq. 'U') then + write(*, 2015) m, xcr(m) + else if (xcrdif(m) .le. epsilon) then + write (*,2011) m,xcr(m),xcrref(m),xcrdif(m) + else + verified = .false. + write (*,2010) m,xcr(m),xcrref(m),xcrdif(m) + endif + enddo + + if (class .ne. 'U') then + write (*,2002) + else + write (*,2006) + endif + 2002 format(' Comparison of RMS-norms of solution error') + 2006 format(' RMS-norms of solution error') + + do m = 1, 5 + if (class .eq. 'U') then + write(*, 2015) m, xce(m) + else if (xcedif(m) .le. epsilon) then + write (*,2011) m,xce(m),xceref(m),xcedif(m) + else + verified = .false. + write (*,2010) m,xce(m),xceref(m),xcedif(m) + endif + enddo + + 2010 format(' FAILURE: ', i2, E20.13, E20.13, E20.13) + 2011 format(' ', i2, E20.13, E20.13, E20.13) + 2015 format(' ', i2, E20.13) + + if (class .eq. 'U') then + write(*, 2022) + write(*, 2023) + 2022 format(' No reference values provided') + 2023 format(' No verification performed') + else if (verified) then + write(*, 2020) + 2020 format(' Verification Successful') + else + write(*, 2021) + 2021 format(' Verification failed') + endif + + return + + + end diff --git a/examples/smpi/NAS/BT/work_lhs.h b/examples/smpi/NAS/BT/work_lhs.h new file mode 100644 index 0000000000..d9bc9e4d66 --- /dev/null +++ b/examples/smpi/NAS/BT/work_lhs.h @@ -0,0 +1,14 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- +c +c work_lhs.h +c +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + double precision fjac(5, 5, -2:MAX_CELL_DIM+1), + > njac(5, 5, -2:MAX_CELL_DIM+1), + > lhsa(5, 5, -1:MAX_CELL_DIM), + > lhsb(5, 5, -1:MAX_CELL_DIM), + > tmp1, tmp2, tmp3 + common /work_lhs/ fjac, njac, lhsa, lhsb, tmp1, tmp2, tmp3 diff --git a/examples/smpi/NAS/BT/work_lhs_vec.h b/examples/smpi/NAS/BT/work_lhs_vec.h new file mode 100644 index 0000000000..a97054f419 --- /dev/null +++ b/examples/smpi/NAS/BT/work_lhs_vec.h @@ -0,0 +1,14 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- +c +c work_lhs_vec.h +c +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + double precision fjac(5, 5, -2:MAX_CELL_DIM+1, -2:MAX_CELL_DIM+1), + > njac(5, 5, -2:MAX_CELL_DIM+1, -2:MAX_CELL_DIM+1), + > lhsa(5, 5, -1:MAX_CELL_DIM, -1:MAX_CELL_DIM), + > lhsb(5, 5, -1:MAX_CELL_DIM, -1:MAX_CELL_DIM), + > tmp1, tmp2, tmp3 + common /work_lhs/ fjac, njac, lhsa, lhsb, tmp1, tmp2, tmp3 diff --git a/examples/smpi/NAS/BT/x_solve.f b/examples/smpi/NAS/BT/x_solve.f new file mode 100644 index 0000000000..5386732616 --- /dev/null +++ b/examples/smpi/NAS/BT/x_solve.f @@ -0,0 +1,761 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine x_solve + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c Performs line solves in X direction by first factoring +c the block-tridiagonal matrix into an upper triangular matrix, +c and then performing back substitution to solve for the unknow +c vectors of each line. +c +c Make sure we treat elements zero to cell_size in the direction +c of the sweep. +c +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + integer c, istart, stage, + > first, last, recv_id, error, r_status(MPI_STATUS_SIZE), + > isize,jsize,ksize,send_id + + istart = 0 + +c--------------------------------------------------------------------- +c in our terminology stage is the number of the cell in the x-direction +c i.e. stage = 1 means the start of the line stage=ncells means end +c--------------------------------------------------------------------- + do stage = 1,ncells + c = slice(1,stage) + isize = cell_size(1,c) - 1 + jsize = cell_size(2,c) - 1 + ksize = cell_size(3,c) - 1 + +c--------------------------------------------------------------------- +c set last-cell flag +c--------------------------------------------------------------------- + if (stage .eq. ncells) then + last = 1 + else + last = 0 + endif + + if (stage .eq. 1) then +c--------------------------------------------------------------------- +c This is the first cell, so solve without receiving data +c--------------------------------------------------------------------- + first = 1 +c call lhsx(c) + call x_solve_cell(first,last,c) + else +c--------------------------------------------------------------------- +c Not the first cell of this line, so receive info from +c processor working on preceeding cell +c--------------------------------------------------------------------- + first = 0 + call x_receive_solve_info(recv_id,c) +c--------------------------------------------------------------------- +c overlap computations and communications +c--------------------------------------------------------------------- +c call lhsx(c) +c--------------------------------------------------------------------- +c wait for completion +c--------------------------------------------------------------------- + call mpi_wait(send_id,r_status,error) + call mpi_wait(recv_id,r_status,error) +c--------------------------------------------------------------------- +c install C'(istart) and rhs'(istart) to be used in this cell +c--------------------------------------------------------------------- + call x_unpack_solve_info(c) + call x_solve_cell(first,last,c) + endif + + if (last .eq. 0) call x_send_solve_info(send_id,c) + enddo + +c--------------------------------------------------------------------- +c now perform backsubstitution in reverse direction +c--------------------------------------------------------------------- + do stage = ncells, 1, -1 + c = slice(1,stage) + first = 0 + last = 0 + if (stage .eq. 1) first = 1 + if (stage .eq. ncells) then + last = 1 +c--------------------------------------------------------------------- +c last cell, so perform back substitute without waiting +c--------------------------------------------------------------------- + call x_backsubstitute(first, last,c) + else + call x_receive_backsub_info(recv_id,c) + call mpi_wait(send_id,r_status,error) + call mpi_wait(recv_id,r_status,error) + call x_unpack_backsub_info(c) + call x_backsubstitute(first,last,c) + endif + if (first .eq. 0) call x_send_backsub_info(send_id,c) + enddo + + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine x_unpack_solve_info(c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c unpack C'(-1) and rhs'(-1) for +c all j and k +c--------------------------------------------------------------------- + + include 'header.h' + integer j,k,m,n,ptr,c,istart + + istart = 0 + ptr = 0 + do k=0,KMAX-1 + do j=0,JMAX-1 + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + lhsc(m,n,istart-1,j,k,c) = out_buffer(ptr+n) + enddo + ptr = ptr+BLOCK_SIZE + enddo + do n=1,BLOCK_SIZE + rhs(n,istart-1,j,k,c) = out_buffer(ptr+n) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine x_send_solve_info(send_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c pack up and send C'(iend) and rhs'(iend) for +c all j and k +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer j,k,m,n,isize,ptr,c,jp,kp + integer error,send_id,buffer_size + + isize = cell_size(1,c)-1 + jp = cell_coord(2,c) - 1 + kp = cell_coord(3,c) - 1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* + > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) + +c--------------------------------------------------------------------- +c pack up buffer +c--------------------------------------------------------------------- + ptr = 0 + do k=0,KMAX-1 + do j=0,JMAX-1 + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + in_buffer(ptr+n) = lhsc(m,n,isize,j,k,c) + enddo + ptr = ptr+BLOCK_SIZE + enddo + do n=1,BLOCK_SIZE + in_buffer(ptr+n) = rhs(n,isize,j,k,c) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + +c--------------------------------------------------------------------- +c send buffer +c--------------------------------------------------------------------- + call mpi_isend(in_buffer, buffer_size, + > dp_type, successor(1), + > WEST+jp+kp*NCELLS, comm_solve, + > send_id,error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine x_send_backsub_info(send_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c pack up and send U(istart) for all j and k +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer j,k,n,ptr,c,istart,jp,kp + integer error,send_id,buffer_size + +c--------------------------------------------------------------------- +c Send element 0 to previous processor +c--------------------------------------------------------------------- + istart = 0 + jp = cell_coord(2,c)-1 + kp = cell_coord(3,c)-1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE + ptr = 0 + do k=0,KMAX-1 + do j=0,JMAX-1 + do n=1,BLOCK_SIZE + in_buffer(ptr+n) = rhs(n,istart,j,k,c) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + call mpi_isend(in_buffer, buffer_size, + > dp_type, predecessor(1), + > EAST+jp+kp*NCELLS, comm_solve, + > send_id,error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine x_unpack_backsub_info(c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c unpack U(isize) for all j and k +c--------------------------------------------------------------------- + + include 'header.h' + integer j,k,n,ptr,c + + ptr = 0 + do k=0,KMAX-1 + do j=0,JMAX-1 + do n=1,BLOCK_SIZE + backsub_info(n,j,k,c) = out_buffer(ptr+n) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine x_receive_backsub_info(recv_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c post mpi receives +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer error,recv_id,jp,kp,c,buffer_size + jp = cell_coord(2,c) - 1 + kp = cell_coord(3,c) - 1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE + call mpi_irecv(out_buffer, buffer_size, + > dp_type, successor(1), + > EAST+jp+kp*NCELLS, comm_solve, + > recv_id, error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine x_receive_solve_info(recv_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c post mpi receives +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer jp,kp,recv_id,error,c,buffer_size + jp = cell_coord(2,c) - 1 + kp = cell_coord(3,c) - 1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* + > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) + call mpi_irecv(out_buffer, buffer_size, + > dp_type, predecessor(1), + > WEST+jp+kp*NCELLS, comm_solve, + > recv_id, error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine x_backsubstitute(first, last, c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c back solve: if last cell, then generate U(isize)=rhs(isize) +c else assume U(isize) is loaded in un pack backsub_info +c so just use it +c after call u(istart) will be sent to next cell +c--------------------------------------------------------------------- + + include 'header.h' + + integer first, last, c, i, j, k + integer m,n,isize,jsize,ksize,istart + + istart = 0 + isize = cell_size(1,c)-1 + jsize = cell_size(2,c)-end(2,c)-1 + ksize = cell_size(3,c)-end(3,c)-1 + if (last .eq. 0) then + do k=start(3,c),ksize + do j=start(2,c),jsize +c--------------------------------------------------------------------- +c U(isize) uses info from previous cell if not last cell +c--------------------------------------------------------------------- + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) + > - lhsc(m,n,isize,j,k,c)* + > backsub_info(n,j,k,c) +c--------------------------------------------------------------------- +c rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) +c $ - lhsc(m,n,isize,j,k,c)*rhs(n,isize+1,j,k,c) +c--------------------------------------------------------------------- + enddo + enddo + enddo + enddo + endif + do k=start(3,c),ksize + do j=start(2,c),jsize + do i=isize-1,istart,-1 + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + rhs(m,i,j,k,c) = rhs(m,i,j,k,c) + > - lhsc(m,n,i,j,k,c)*rhs(n,i+1,j,k,c) + enddo + enddo + enddo + enddo + enddo + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine x_solve_cell(first,last,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c performs guaussian elimination on this cell. +c +c assumes that unpacking routines for non-first cells +c preload C' and rhs' from previous cell. +c +c assumed send happens outside this routine, but that +c c'(IMAX) and rhs'(IMAX) will be sent to next cell +c--------------------------------------------------------------------- + + include 'header.h' + include 'work_lhs.h' + + integer first,last,c + integer i,j,k,isize,ksize,jsize,istart + + istart = 0 + isize = cell_size(1,c)-1 + jsize = cell_size(2,c)-end(2,c)-1 + ksize = cell_size(3,c)-end(3,c)-1 + + call lhsabinit(lhsa, lhsb, isize) + + do k=start(3,c),ksize + do j=start(2,c),jsize + +c--------------------------------------------------------------------- +c This function computes the left hand side in the xi-direction +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c determine a (labeled f) and n jacobians for cell c +c--------------------------------------------------------------------- + do i = start(1,c)-1, cell_size(1,c) - end(1,c) + + tmp1 = rho_i(i,j,k,c) + tmp2 = tmp1 * tmp1 + tmp3 = tmp1 * tmp2 +c--------------------------------------------------------------------- +c +c--------------------------------------------------------------------- + fjac(1,1,i) = 0.0d+00 + fjac(1,2,i) = 1.0d+00 + fjac(1,3,i) = 0.0d+00 + fjac(1,4,i) = 0.0d+00 + fjac(1,5,i) = 0.0d+00 + + fjac(2,1,i) = -(u(2,i,j,k,c) * tmp2 * + > u(2,i,j,k,c)) + > + c2 * qs(i,j,k,c) + fjac(2,2,i) = ( 2.0d+00 - c2 ) + > * ( u(2,i,j,k,c) * tmp1 ) + fjac(2,3,i) = - c2 * ( u(3,i,j,k,c) * tmp1 ) + fjac(2,4,i) = - c2 * ( u(4,i,j,k,c) * tmp1 ) + fjac(2,5,i) = c2 + + fjac(3,1,i) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) ) * tmp2 + fjac(3,2,i) = u(3,i,j,k,c) * tmp1 + fjac(3,3,i) = u(2,i,j,k,c) * tmp1 + fjac(3,4,i) = 0.0d+00 + fjac(3,5,i) = 0.0d+00 + + fjac(4,1,i) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) * tmp2 + fjac(4,2,i) = u(4,i,j,k,c) * tmp1 + fjac(4,3,i) = 0.0d+00 + fjac(4,4,i) = u(2,i,j,k,c) * tmp1 + fjac(4,5,i) = 0.0d+00 + + fjac(5,1,i) = ( c2 * 2.0d0 * qs(i,j,k,c) + > - c1 * ( u(5,i,j,k,c) * tmp1 ) ) + > * ( u(2,i,j,k,c) * tmp1 ) + fjac(5,2,i) = c1 * u(5,i,j,k,c) * tmp1 + > - c2 + > * ( u(2,i,j,k,c)*u(2,i,j,k,c) * tmp2 + > + qs(i,j,k,c) ) + fjac(5,3,i) = - c2 * ( u(3,i,j,k,c)*u(2,i,j,k,c) ) + > * tmp2 + fjac(5,4,i) = - c2 * ( u(4,i,j,k,c)*u(2,i,j,k,c) ) + > * tmp2 + fjac(5,5,i) = c1 * ( u(2,i,j,k,c) * tmp1 ) + + njac(1,1,i) = 0.0d+00 + njac(1,2,i) = 0.0d+00 + njac(1,3,i) = 0.0d+00 + njac(1,4,i) = 0.0d+00 + njac(1,5,i) = 0.0d+00 + + njac(2,1,i) = - con43 * c3c4 * tmp2 * u(2,i,j,k,c) + njac(2,2,i) = con43 * c3c4 * tmp1 + njac(2,3,i) = 0.0d+00 + njac(2,4,i) = 0.0d+00 + njac(2,5,i) = 0.0d+00 + + njac(3,1,i) = - c3c4 * tmp2 * u(3,i,j,k,c) + njac(3,2,i) = 0.0d+00 + njac(3,3,i) = c3c4 * tmp1 + njac(3,4,i) = 0.0d+00 + njac(3,5,i) = 0.0d+00 + + njac(4,1,i) = - c3c4 * tmp2 * u(4,i,j,k,c) + njac(4,2,i) = 0.0d+00 + njac(4,3,i) = 0.0d+00 + njac(4,4,i) = c3c4 * tmp1 + njac(4,5,i) = 0.0d+00 + + njac(5,1,i) = - ( con43 * c3c4 + > - c1345 ) * tmp3 * (u(2,i,j,k,c)**2) + > - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2) + > - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2) + > - c1345 * tmp2 * u(5,i,j,k,c) + + njac(5,2,i) = ( con43 * c3c4 + > - c1345 ) * tmp2 * u(2,i,j,k,c) + njac(5,3,i) = ( c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c) + njac(5,4,i) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c) + njac(5,5,i) = ( c1345 ) * tmp1 + + enddo +c--------------------------------------------------------------------- +c now jacobians set, so form left hand side in x direction +c--------------------------------------------------------------------- + do i = start(1,c), isize - end(1,c) + + tmp1 = dt * tx1 + tmp2 = dt * tx2 + + lhsa(1,1,i) = - tmp2 * fjac(1,1,i-1) + > - tmp1 * njac(1,1,i-1) + > - tmp1 * dx1 + lhsa(1,2,i) = - tmp2 * fjac(1,2,i-1) + > - tmp1 * njac(1,2,i-1) + lhsa(1,3,i) = - tmp2 * fjac(1,3,i-1) + > - tmp1 * njac(1,3,i-1) + lhsa(1,4,i) = - tmp2 * fjac(1,4,i-1) + > - tmp1 * njac(1,4,i-1) + lhsa(1,5,i) = - tmp2 * fjac(1,5,i-1) + > - tmp1 * njac(1,5,i-1) + + lhsa(2,1,i) = - tmp2 * fjac(2,1,i-1) + > - tmp1 * njac(2,1,i-1) + lhsa(2,2,i) = - tmp2 * fjac(2,2,i-1) + > - tmp1 * njac(2,2,i-1) + > - tmp1 * dx2 + lhsa(2,3,i) = - tmp2 * fjac(2,3,i-1) + > - tmp1 * njac(2,3,i-1) + lhsa(2,4,i) = - tmp2 * fjac(2,4,i-1) + > - tmp1 * njac(2,4,i-1) + lhsa(2,5,i) = - tmp2 * fjac(2,5,i-1) + > - tmp1 * njac(2,5,i-1) + + lhsa(3,1,i) = - tmp2 * fjac(3,1,i-1) + > - tmp1 * njac(3,1,i-1) + lhsa(3,2,i) = - tmp2 * fjac(3,2,i-1) + > - tmp1 * njac(3,2,i-1) + lhsa(3,3,i) = - tmp2 * fjac(3,3,i-1) + > - tmp1 * njac(3,3,i-1) + > - tmp1 * dx3 + lhsa(3,4,i) = - tmp2 * fjac(3,4,i-1) + > - tmp1 * njac(3,4,i-1) + lhsa(3,5,i) = - tmp2 * fjac(3,5,i-1) + > - tmp1 * njac(3,5,i-1) + + lhsa(4,1,i) = - tmp2 * fjac(4,1,i-1) + > - tmp1 * njac(4,1,i-1) + lhsa(4,2,i) = - tmp2 * fjac(4,2,i-1) + > - tmp1 * njac(4,2,i-1) + lhsa(4,3,i) = - tmp2 * fjac(4,3,i-1) + > - tmp1 * njac(4,3,i-1) + lhsa(4,4,i) = - tmp2 * fjac(4,4,i-1) + > - tmp1 * njac(4,4,i-1) + > - tmp1 * dx4 + lhsa(4,5,i) = - tmp2 * fjac(4,5,i-1) + > - tmp1 * njac(4,5,i-1) + + lhsa(5,1,i) = - tmp2 * fjac(5,1,i-1) + > - tmp1 * njac(5,1,i-1) + lhsa(5,2,i) = - tmp2 * fjac(5,2,i-1) + > - tmp1 * njac(5,2,i-1) + lhsa(5,3,i) = - tmp2 * fjac(5,3,i-1) + > - tmp1 * njac(5,3,i-1) + lhsa(5,4,i) = - tmp2 * fjac(5,4,i-1) + > - tmp1 * njac(5,4,i-1) + lhsa(5,5,i) = - tmp2 * fjac(5,5,i-1) + > - tmp1 * njac(5,5,i-1) + > - tmp1 * dx5 + + lhsb(1,1,i) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(1,1,i) + > + tmp1 * 2.0d+00 * dx1 + lhsb(1,2,i) = tmp1 * 2.0d+00 * njac(1,2,i) + lhsb(1,3,i) = tmp1 * 2.0d+00 * njac(1,3,i) + lhsb(1,4,i) = tmp1 * 2.0d+00 * njac(1,4,i) + lhsb(1,5,i) = tmp1 * 2.0d+00 * njac(1,5,i) + + lhsb(2,1,i) = tmp1 * 2.0d+00 * njac(2,1,i) + lhsb(2,2,i) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(2,2,i) + > + tmp1 * 2.0d+00 * dx2 + lhsb(2,3,i) = tmp1 * 2.0d+00 * njac(2,3,i) + lhsb(2,4,i) = tmp1 * 2.0d+00 * njac(2,4,i) + lhsb(2,5,i) = tmp1 * 2.0d+00 * njac(2,5,i) + + lhsb(3,1,i) = tmp1 * 2.0d+00 * njac(3,1,i) + lhsb(3,2,i) = tmp1 * 2.0d+00 * njac(3,2,i) + lhsb(3,3,i) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(3,3,i) + > + tmp1 * 2.0d+00 * dx3 + lhsb(3,4,i) = tmp1 * 2.0d+00 * njac(3,4,i) + lhsb(3,5,i) = tmp1 * 2.0d+00 * njac(3,5,i) + + lhsb(4,1,i) = tmp1 * 2.0d+00 * njac(4,1,i) + lhsb(4,2,i) = tmp1 * 2.0d+00 * njac(4,2,i) + lhsb(4,3,i) = tmp1 * 2.0d+00 * njac(4,3,i) + lhsb(4,4,i) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(4,4,i) + > + tmp1 * 2.0d+00 * dx4 + lhsb(4,5,i) = tmp1 * 2.0d+00 * njac(4,5,i) + + lhsb(5,1,i) = tmp1 * 2.0d+00 * njac(5,1,i) + lhsb(5,2,i) = tmp1 * 2.0d+00 * njac(5,2,i) + lhsb(5,3,i) = tmp1 * 2.0d+00 * njac(5,3,i) + lhsb(5,4,i) = tmp1 * 2.0d+00 * njac(5,4,i) + lhsb(5,5,i) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(5,5,i) + > + tmp1 * 2.0d+00 * dx5 + + lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,i+1) + > - tmp1 * njac(1,1,i+1) + > - tmp1 * dx1 + lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,i+1) + > - tmp1 * njac(1,2,i+1) + lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,i+1) + > - tmp1 * njac(1,3,i+1) + lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,i+1) + > - tmp1 * njac(1,4,i+1) + lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,i+1) + > - tmp1 * njac(1,5,i+1) + + lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,i+1) + > - tmp1 * njac(2,1,i+1) + lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,i+1) + > - tmp1 * njac(2,2,i+1) + > - tmp1 * dx2 + lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,i+1) + > - tmp1 * njac(2,3,i+1) + lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,i+1) + > - tmp1 * njac(2,4,i+1) + lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,i+1) + > - tmp1 * njac(2,5,i+1) + + lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,i+1) + > - tmp1 * njac(3,1,i+1) + lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,i+1) + > - tmp1 * njac(3,2,i+1) + lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,i+1) + > - tmp1 * njac(3,3,i+1) + > - tmp1 * dx3 + lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,i+1) + > - tmp1 * njac(3,4,i+1) + lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,i+1) + > - tmp1 * njac(3,5,i+1) + + lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,i+1) + > - tmp1 * njac(4,1,i+1) + lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,i+1) + > - tmp1 * njac(4,2,i+1) + lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,i+1) + > - tmp1 * njac(4,3,i+1) + lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,i+1) + > - tmp1 * njac(4,4,i+1) + > - tmp1 * dx4 + lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,i+1) + > - tmp1 * njac(4,5,i+1) + + lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,i+1) + > - tmp1 * njac(5,1,i+1) + lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,i+1) + > - tmp1 * njac(5,2,i+1) + lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,i+1) + > - tmp1 * njac(5,3,i+1) + lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,i+1) + > - tmp1 * njac(5,4,i+1) + lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,i+1) + > - tmp1 * njac(5,5,i+1) + > - tmp1 * dx5 + + enddo + + +c--------------------------------------------------------------------- +c outer most do loops - sweeping in i direction +c--------------------------------------------------------------------- + if (first .eq. 1) then + +c--------------------------------------------------------------------- +c multiply c(istart,j,k) by b_inverse and copy back to c +c multiply rhs(istart) by b_inverse(istart) and copy to rhs +c--------------------------------------------------------------------- + call binvcrhs( lhsb(1,1,istart), + > lhsc(1,1,istart,j,k,c), + > rhs(1,istart,j,k,c) ) + + endif + +c--------------------------------------------------------------------- +c begin inner most do loop +c do all the elements of the cell unless last +c--------------------------------------------------------------------- + do i=istart+first,isize-last + +c--------------------------------------------------------------------- +c rhs(i) = rhs(i) - A*rhs(i-1) +c--------------------------------------------------------------------- + call matvec_sub(lhsa(1,1,i), + > rhs(1,i-1,j,k,c),rhs(1,i,j,k,c)) + +c--------------------------------------------------------------------- +c B(i) = B(i) - C(i-1)*A(i) +c--------------------------------------------------------------------- + call matmul_sub(lhsa(1,1,i), + > lhsc(1,1,i-1,j,k,c), + > lhsb(1,1,i)) + + +c--------------------------------------------------------------------- +c multiply c(i,j,k) by b_inverse and copy back to c +c multiply rhs(1,j,k) by b_inverse(1,j,k) and copy to rhs +c--------------------------------------------------------------------- + call binvcrhs( lhsb(1,1,i), + > lhsc(1,1,i,j,k,c), + > rhs(1,i,j,k,c) ) + + enddo + +c--------------------------------------------------------------------- +c Now finish up special cases for last cell +c--------------------------------------------------------------------- + if (last .eq. 1) then + +c--------------------------------------------------------------------- +c rhs(isize) = rhs(isize) - A*rhs(isize-1) +c--------------------------------------------------------------------- + call matvec_sub(lhsa(1,1,isize), + > rhs(1,isize-1,j,k,c),rhs(1,isize,j,k,c)) + +c--------------------------------------------------------------------- +c B(isize) = B(isize) - C(isize-1)*A(isize) +c--------------------------------------------------------------------- + call matmul_sub(lhsa(1,1,isize), + > lhsc(1,1,isize-1,j,k,c), + > lhsb(1,1,isize)) + +c--------------------------------------------------------------------- +c multiply rhs() by b_inverse() and copy to rhs +c--------------------------------------------------------------------- + call binvrhs( lhsb(1,1,isize), + > rhs(1,isize,j,k,c) ) + + endif + enddo + enddo + + + return + end + diff --git a/examples/smpi/NAS/BT/x_solve_vec.f b/examples/smpi/NAS/BT/x_solve_vec.f new file mode 100644 index 0000000000..8f1c1371db --- /dev/null +++ b/examples/smpi/NAS/BT/x_solve_vec.f @@ -0,0 +1,789 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine x_solve + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c Performs line solves in X direction by first factoring +c the block-tridiagonal matrix into an upper triangular matrix, +c and then performing back substitution to solve for the unknow +c vectors of each line. +c +c Make sure we treat elements zero to cell_size in the direction +c of the sweep. +c +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + integer c, istart, stage, + > first, last, recv_id, error, r_status(MPI_STATUS_SIZE), + > isize,jsize,ksize,send_id + + istart = 0 + +c--------------------------------------------------------------------- +c in our terminology stage is the number of the cell in the x-direct +c i.e. stage = 1 means the start of the line stage=ncells means end +c--------------------------------------------------------------------- + do stage = 1,ncells + c = slice(1,stage) + isize = cell_size(1,c) - 1 + jsize = cell_size(2,c) - 1 + ksize = cell_size(3,c) - 1 + +c--------------------------------------------------------------------- +c set last-cell flag +c--------------------------------------------------------------------- + if (stage .eq. ncells) then + last = 1 + else + last = 0 + endif + + if (stage .eq. 1) then +c--------------------------------------------------------------------- +c This is the first cell, so solve without receiving data +c--------------------------------------------------------------------- + first = 1 +c call lhsx(c) + call x_solve_cell(first,last,c) + else +c--------------------------------------------------------------------- +c Not the first cell of this line, so receive info from +c processor working on preceeding cell +c--------------------------------------------------------------------- + first = 0 + call x_receive_solve_info(recv_id,c) +c--------------------------------------------------------------------- +c overlap computations and communications +c--------------------------------------------------------------------- +c call lhsx(c) +c--------------------------------------------------------------------- +c wait for completion +c--------------------------------------------------------------------- + call mpi_wait(send_id,r_status,error) + call mpi_wait(recv_id,r_status,error) +c--------------------------------------------------------------------- +c install C'(istart) and rhs'(istart) to be used in this cell +c--------------------------------------------------------------------- + call x_unpack_solve_info(c) + call x_solve_cell(first,last,c) + endif + + if (last .eq. 0) call x_send_solve_info(send_id,c) + enddo + +c--------------------------------------------------------------------- +c now perform backsubstitution in reverse direction +c--------------------------------------------------------------------- + do stage = ncells, 1, -1 + c = slice(1,stage) + first = 0 + last = 0 + if (stage .eq. 1) first = 1 + if (stage .eq. ncells) then + last = 1 +c--------------------------------------------------------------------- +c last cell, so perform back substitute without waiting +c--------------------------------------------------------------------- + call x_backsubstitute(first, last,c) + else + call x_receive_backsub_info(recv_id,c) + call mpi_wait(send_id,r_status,error) + call mpi_wait(recv_id,r_status,error) + call x_unpack_backsub_info(c) + call x_backsubstitute(first,last,c) + endif + if (first .eq. 0) call x_send_backsub_info(send_id,c) + enddo + + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine x_unpack_solve_info(c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c unpack C'(-1) and rhs'(-1) for +c all j and k +c--------------------------------------------------------------------- + + include 'header.h' + integer j,k,m,n,ptr,c,istart + + istart = 0 + ptr = 0 + do k=0,KMAX-1 + do j=0,JMAX-1 + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + lhsc(m,n,istart-1,j,k,c) = out_buffer(ptr+n) + enddo + ptr = ptr+BLOCK_SIZE + enddo + do n=1,BLOCK_SIZE + rhs(n,istart-1,j,k,c) = out_buffer(ptr+n) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine x_send_solve_info(send_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c pack up and send C'(iend) and rhs'(iend) for +c all j and k +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer j,k,m,n,isize,ptr,c,jp,kp + integer error,send_id,buffer_size + + isize = cell_size(1,c)-1 + jp = cell_coord(2,c) - 1 + kp = cell_coord(3,c) - 1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* + > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) + +c--------------------------------------------------------------------- +c pack up buffer +c--------------------------------------------------------------------- + ptr = 0 + do k=0,KMAX-1 + do j=0,JMAX-1 + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + in_buffer(ptr+n) = lhsc(m,n,isize,j,k,c) + enddo + ptr = ptr+BLOCK_SIZE + enddo + do n=1,BLOCK_SIZE + in_buffer(ptr+n) = rhs(n,isize,j,k,c) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + +c--------------------------------------------------------------------- +c send buffer +c--------------------------------------------------------------------- + call mpi_isend(in_buffer, buffer_size, + > dp_type, successor(1), + > WEST+jp+kp*NCELLS, comm_solve, + > send_id,error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine x_send_backsub_info(send_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c pack up and send U(istart) for all j and k +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer j,k,n,ptr,c,istart,jp,kp + integer error,send_id,buffer_size + +c--------------------------------------------------------------------- +c Send element 0 to previous processor +c--------------------------------------------------------------------- + istart = 0 + jp = cell_coord(2,c)-1 + kp = cell_coord(3,c)-1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE + ptr = 0 + do k=0,KMAX-1 + do j=0,JMAX-1 + do n=1,BLOCK_SIZE + in_buffer(ptr+n) = rhs(n,istart,j,k,c) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + call mpi_isend(in_buffer, buffer_size, + > dp_type, predecessor(1), + > EAST+jp+kp*NCELLS, comm_solve, + > send_id,error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine x_unpack_backsub_info(c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c unpack U(isize) for all j and k +c--------------------------------------------------------------------- + + include 'header.h' + integer j,k,n,ptr,c + + ptr = 0 + do k=0,KMAX-1 + do j=0,JMAX-1 + do n=1,BLOCK_SIZE + backsub_info(n,j,k,c) = out_buffer(ptr+n) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine x_receive_backsub_info(recv_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c post mpi receives +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer error,recv_id,jp,kp,c,buffer_size + jp = cell_coord(2,c) - 1 + kp = cell_coord(3,c) - 1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE + call mpi_irecv(out_buffer, buffer_size, + > dp_type, successor(1), + > EAST+jp+kp*NCELLS, comm_solve, + > recv_id, error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine x_receive_solve_info(recv_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c post mpi receives +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer jp,kp,recv_id,error,c,buffer_size + jp = cell_coord(2,c) - 1 + kp = cell_coord(3,c) - 1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* + > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) + call mpi_irecv(out_buffer, buffer_size, + > dp_type, predecessor(1), + > WEST+jp+kp*NCELLS, comm_solve, + > recv_id, error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine x_backsubstitute(first, last, c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c back solve: if last cell, then generate U(isize)=rhs(isize) +c else assume U(isize) is loaded in un pack backsub_info +c so just use it +c after call u(istart) will be sent to next cell +c--------------------------------------------------------------------- + + include 'header.h' + + integer first, last, c, i, j, k + integer m,n,isize,jsize,ksize,istart + + istart = 0 + isize = cell_size(1,c)-1 + jsize = cell_size(2,c)-end(2,c)-1 + ksize = cell_size(3,c)-end(3,c)-1 + if (last .eq. 0) then + do k=start(3,c),ksize + do j=start(2,c),jsize +c--------------------------------------------------------------------- +c U(isize) uses info from previous cell if not last cell +c--------------------------------------------------------------------- + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) + > - lhsc(m,n,isize,j,k,c)* + > backsub_info(n,j,k,c) +c--------------------------------------------------------------------- +c rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) +c $ - lhsc(m,n,isize,j,k,c)*rhs(n,isize+1,j,k,c) +c--------------------------------------------------------------------- + enddo + enddo + enddo + enddo + endif + do k=start(3,c),ksize + do j=start(2,c),jsize + do i=isize-1,istart,-1 + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + rhs(m,i,j,k,c) = rhs(m,i,j,k,c) + > - lhsc(m,n,i,j,k,c)*rhs(n,i+1,j,k,c) + enddo + enddo + enddo + enddo + enddo + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine x_solve_cell(first,last,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c performs guaussian elimination on this cell. +c +c assumes that unpacking routines for non-first cells +c preload C' and rhs' from previous cell. +c +c assumed send happens outside this routine, but that +c c'(IMAX) and rhs'(IMAX) will be sent to next cell +c--------------------------------------------------------------------- + + include 'header.h' + include 'work_lhs_vec.h' + + integer first,last,c + integer i,j,k,m,n,isize,ksize,jsize,istart + + istart = 0 + isize = cell_size(1,c)-1 + jsize = cell_size(2,c)-end(2,c)-1 + ksize = cell_size(3,c)-end(3,c)-1 + +c--------------------------------------------------------------------- +c zero the left hand side for starters +c set diagonal values to 1. This is overkill, but convenient +c--------------------------------------------------------------------- + do j = 0, jsize + do m = 1, 5 + do n = 1, 5 + lhsa(m,n,0,j) = 0.0d0 + lhsb(m,n,0,j) = 0.0d0 + lhsa(m,n,isize,j) = 0.0d0 + lhsb(m,n,isize,j) = 0.0d0 + enddo + lhsb(m,m,0,j) = 1.0d0 + lhsb(m,m,isize,j) = 1.0d0 + enddo + enddo + + do k=start(3,c),ksize + +c--------------------------------------------------------------------- +c This function computes the left hand side in the xi-direction +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c determine a (labeled f) and n jacobians for cell c +c--------------------------------------------------------------------- + do j=start(2,c),jsize + do i = start(1,c)-1, cell_size(1,c) - end(1,c) + + tmp1 = rho_i(i,j,k,c) + tmp2 = tmp1 * tmp1 + tmp3 = tmp1 * tmp2 +c--------------------------------------------------------------------- +c +c--------------------------------------------------------------------- + fjac(1,1,i,j) = 0.0d+00 + fjac(1,2,i,j) = 1.0d+00 + fjac(1,3,i,j) = 0.0d+00 + fjac(1,4,i,j) = 0.0d+00 + fjac(1,5,i,j) = 0.0d+00 + + fjac(2,1,i,j) = -(u(2,i,j,k,c) * tmp2 * + > u(2,i,j,k,c)) + > + c2 * qs(i,j,k,c) + fjac(2,2,i,j) = ( 2.0d+00 - c2 ) + > * ( u(2,i,j,k,c) * tmp1 ) + fjac(2,3,i,j) = - c2 * ( u(3,i,j,k,c) * tmp1 ) + fjac(2,4,i,j) = - c2 * ( u(4,i,j,k,c) * tmp1 ) + fjac(2,5,i,j) = c2 + + fjac(3,1,i,j) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) ) * tmp2 + fjac(3,2,i,j) = u(3,i,j,k,c) * tmp1 + fjac(3,3,i,j) = u(2,i,j,k,c) * tmp1 + fjac(3,4,i,j) = 0.0d+00 + fjac(3,5,i,j) = 0.0d+00 + + fjac(4,1,i,j) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) * tmp2 + fjac(4,2,i,j) = u(4,i,j,k,c) * tmp1 + fjac(4,3,i,j) = 0.0d+00 + fjac(4,4,i,j) = u(2,i,j,k,c) * tmp1 + fjac(4,5,i,j) = 0.0d+00 + + fjac(5,1,i,j) = ( c2 * 2.0d0 * qs(i,j,k,c) + > - c1 * ( u(5,i,j,k,c) * tmp1 ) ) + > * ( u(2,i,j,k,c) * tmp1 ) + fjac(5,2,i,j) = c1 * u(5,i,j,k,c) * tmp1 + > - c2 + > * ( u(2,i,j,k,c)*u(2,i,j,k,c) * tmp2 + > + qs(i,j,k,c) ) + fjac(5,3,i,j) = - c2 * ( u(3,i,j,k,c)*u(2,i,j,k,c) ) + > * tmp2 + fjac(5,4,i,j) = - c2 * ( u(4,i,j,k,c)*u(2,i,j,k,c) ) + > * tmp2 + fjac(5,5,i,j) = c1 * ( u(2,i,j,k,c) * tmp1 ) + + njac(1,1,i,j) = 0.0d+00 + njac(1,2,i,j) = 0.0d+00 + njac(1,3,i,j) = 0.0d+00 + njac(1,4,i,j) = 0.0d+00 + njac(1,5,i,j) = 0.0d+00 + + njac(2,1,i,j) = - con43 * c3c4 * tmp2 * u(2,i,j,k,c) + njac(2,2,i,j) = con43 * c3c4 * tmp1 + njac(2,3,i,j) = 0.0d+00 + njac(2,4,i,j) = 0.0d+00 + njac(2,5,i,j) = 0.0d+00 + + njac(3,1,i,j) = - c3c4 * tmp2 * u(3,i,j,k,c) + njac(3,2,i,j) = 0.0d+00 + njac(3,3,i,j) = c3c4 * tmp1 + njac(3,4,i,j) = 0.0d+00 + njac(3,5,i,j) = 0.0d+00 + + njac(4,1,i,j) = - c3c4 * tmp2 * u(4,i,j,k,c) + njac(4,2,i,j) = 0.0d+00 + njac(4,3,i,j) = 0.0d+00 + njac(4,4,i,j) = c3c4 * tmp1 + njac(4,5,i,j) = 0.0d+00 + + njac(5,1,i,j) = - ( con43 * c3c4 + > - c1345 ) * tmp3 * (u(2,i,j,k,c)**2) + > - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2) + > - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2) + > - c1345 * tmp2 * u(5,i,j,k,c) + + njac(5,2,i,j) = ( con43 * c3c4 + > - c1345 ) * tmp2 * u(2,i,j,k,c) + njac(5,3,i,j) = ( c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c) + njac(5,4,i,j) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c) + njac(5,5,i,j) = ( c1345 ) * tmp1 + + enddo + enddo + +c--------------------------------------------------------------------- +c now jacobians set, so form left hand side in x direction +c--------------------------------------------------------------------- + do j=start(2,c),jsize + do i = start(1,c), isize - end(1,c) + + tmp1 = dt * tx1 + tmp2 = dt * tx2 + + lhsa(1,1,i,j) = - tmp2 * fjac(1,1,i-1,j) + > - tmp1 * njac(1,1,i-1,j) + > - tmp1 * dx1 + lhsa(1,2,i,j) = - tmp2 * fjac(1,2,i-1,j) + > - tmp1 * njac(1,2,i-1,j) + lhsa(1,3,i,j) = - tmp2 * fjac(1,3,i-1,j) + > - tmp1 * njac(1,3,i-1,j) + lhsa(1,4,i,j) = - tmp2 * fjac(1,4,i-1,j) + > - tmp1 * njac(1,4,i-1,j) + lhsa(1,5,i,j) = - tmp2 * fjac(1,5,i-1,j) + > - tmp1 * njac(1,5,i-1,j) + + lhsa(2,1,i,j) = - tmp2 * fjac(2,1,i-1,j) + > - tmp1 * njac(2,1,i-1,j) + lhsa(2,2,i,j) = - tmp2 * fjac(2,2,i-1,j) + > - tmp1 * njac(2,2,i-1,j) + > - tmp1 * dx2 + lhsa(2,3,i,j) = - tmp2 * fjac(2,3,i-1,j) + > - tmp1 * njac(2,3,i-1,j) + lhsa(2,4,i,j) = - tmp2 * fjac(2,4,i-1,j) + > - tmp1 * njac(2,4,i-1,j) + lhsa(2,5,i,j) = - tmp2 * fjac(2,5,i-1,j) + > - tmp1 * njac(2,5,i-1,j) + + lhsa(3,1,i,j) = - tmp2 * fjac(3,1,i-1,j) + > - tmp1 * njac(3,1,i-1,j) + lhsa(3,2,i,j) = - tmp2 * fjac(3,2,i-1,j) + > - tmp1 * njac(3,2,i-1,j) + lhsa(3,3,i,j) = - tmp2 * fjac(3,3,i-1,j) + > - tmp1 * njac(3,3,i-1,j) + > - tmp1 * dx3 + lhsa(3,4,i,j) = - tmp2 * fjac(3,4,i-1,j) + > - tmp1 * njac(3,4,i-1,j) + lhsa(3,5,i,j) = - tmp2 * fjac(3,5,i-1,j) + > - tmp1 * njac(3,5,i-1,j) + + lhsa(4,1,i,j) = - tmp2 * fjac(4,1,i-1,j) + > - tmp1 * njac(4,1,i-1,j) + lhsa(4,2,i,j) = - tmp2 * fjac(4,2,i-1,j) + > - tmp1 * njac(4,2,i-1,j) + lhsa(4,3,i,j) = - tmp2 * fjac(4,3,i-1,j) + > - tmp1 * njac(4,3,i-1,j) + lhsa(4,4,i,j) = - tmp2 * fjac(4,4,i-1,j) + > - tmp1 * njac(4,4,i-1,j) + > - tmp1 * dx4 + lhsa(4,5,i,j) = - tmp2 * fjac(4,5,i-1,j) + > - tmp1 * njac(4,5,i-1,j) + + lhsa(5,1,i,j) = - tmp2 * fjac(5,1,i-1,j) + > - tmp1 * njac(5,1,i-1,j) + lhsa(5,2,i,j) = - tmp2 * fjac(5,2,i-1,j) + > - tmp1 * njac(5,2,i-1,j) + lhsa(5,3,i,j) = - tmp2 * fjac(5,3,i-1,j) + > - tmp1 * njac(5,3,i-1,j) + lhsa(5,4,i,j) = - tmp2 * fjac(5,4,i-1,j) + > - tmp1 * njac(5,4,i-1,j) + lhsa(5,5,i,j) = - tmp2 * fjac(5,5,i-1,j) + > - tmp1 * njac(5,5,i-1,j) + > - tmp1 * dx5 + + lhsb(1,1,i,j) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(1,1,i,j) + > + tmp1 * 2.0d+00 * dx1 + lhsb(1,2,i,j) = tmp1 * 2.0d+00 * njac(1,2,i,j) + lhsb(1,3,i,j) = tmp1 * 2.0d+00 * njac(1,3,i,j) + lhsb(1,4,i,j) = tmp1 * 2.0d+00 * njac(1,4,i,j) + lhsb(1,5,i,j) = tmp1 * 2.0d+00 * njac(1,5,i,j) + + lhsb(2,1,i,j) = tmp1 * 2.0d+00 * njac(2,1,i,j) + lhsb(2,2,i,j) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(2,2,i,j) + > + tmp1 * 2.0d+00 * dx2 + lhsb(2,3,i,j) = tmp1 * 2.0d+00 * njac(2,3,i,j) + lhsb(2,4,i,j) = tmp1 * 2.0d+00 * njac(2,4,i,j) + lhsb(2,5,i,j) = tmp1 * 2.0d+00 * njac(2,5,i,j) + + lhsb(3,1,i,j) = tmp1 * 2.0d+00 * njac(3,1,i,j) + lhsb(3,2,i,j) = tmp1 * 2.0d+00 * njac(3,2,i,j) + lhsb(3,3,i,j) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(3,3,i,j) + > + tmp1 * 2.0d+00 * dx3 + lhsb(3,4,i,j) = tmp1 * 2.0d+00 * njac(3,4,i,j) + lhsb(3,5,i,j) = tmp1 * 2.0d+00 * njac(3,5,i,j) + + lhsb(4,1,i,j) = tmp1 * 2.0d+00 * njac(4,1,i,j) + lhsb(4,2,i,j) = tmp1 * 2.0d+00 * njac(4,2,i,j) + lhsb(4,3,i,j) = tmp1 * 2.0d+00 * njac(4,3,i,j) + lhsb(4,4,i,j) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(4,4,i,j) + > + tmp1 * 2.0d+00 * dx4 + lhsb(4,5,i,j) = tmp1 * 2.0d+00 * njac(4,5,i,j) + + lhsb(5,1,i,j) = tmp1 * 2.0d+00 * njac(5,1,i,j) + lhsb(5,2,i,j) = tmp1 * 2.0d+00 * njac(5,2,i,j) + lhsb(5,3,i,j) = tmp1 * 2.0d+00 * njac(5,3,i,j) + lhsb(5,4,i,j) = tmp1 * 2.0d+00 * njac(5,4,i,j) + lhsb(5,5,i,j) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(5,5,i,j) + > + tmp1 * 2.0d+00 * dx5 + + lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,i+1,j) + > - tmp1 * njac(1,1,i+1,j) + > - tmp1 * dx1 + lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,i+1,j) + > - tmp1 * njac(1,2,i+1,j) + lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,i+1,j) + > - tmp1 * njac(1,3,i+1,j) + lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,i+1,j) + > - tmp1 * njac(1,4,i+1,j) + lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,i+1,j) + > - tmp1 * njac(1,5,i+1,j) + + lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,i+1,j) + > - tmp1 * njac(2,1,i+1,j) + lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,i+1,j) + > - tmp1 * njac(2,2,i+1,j) + > - tmp1 * dx2 + lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,i+1,j) + > - tmp1 * njac(2,3,i+1,j) + lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,i+1,j) + > - tmp1 * njac(2,4,i+1,j) + lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,i+1,j) + > - tmp1 * njac(2,5,i+1,j) + + lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,i+1,j) + > - tmp1 * njac(3,1,i+1,j) + lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,i+1,j) + > - tmp1 * njac(3,2,i+1,j) + lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,i+1,j) + > - tmp1 * njac(3,3,i+1,j) + > - tmp1 * dx3 + lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,i+1,j) + > - tmp1 * njac(3,4,i+1,j) + lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,i+1,j) + > - tmp1 * njac(3,5,i+1,j) + + lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,i+1,j) + > - tmp1 * njac(4,1,i+1,j) + lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,i+1,j) + > - tmp1 * njac(4,2,i+1,j) + lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,i+1,j) + > - tmp1 * njac(4,3,i+1,j) + lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,i+1,j) + > - tmp1 * njac(4,4,i+1,j) + > - tmp1 * dx4 + lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,i+1,j) + > - tmp1 * njac(4,5,i+1,j) + + lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,i+1,j) + > - tmp1 * njac(5,1,i+1,j) + lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,i+1,j) + > - tmp1 * njac(5,2,i+1,j) + lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,i+1,j) + > - tmp1 * njac(5,3,i+1,j) + lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,i+1,j) + > - tmp1 * njac(5,4,i+1,j) + lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,i+1,j) + > - tmp1 * njac(5,5,i+1,j) + > - tmp1 * dx5 + + enddo + enddo + + +c--------------------------------------------------------------------- +c outer most do loops - sweeping in i direction +c--------------------------------------------------------------------- + if (first .eq. 1) then + +c--------------------------------------------------------------------- +c multiply c(istart,j,k) by b_inverse and copy back to c +c multiply rhs(istart) by b_inverse(istart) and copy to rhs +c--------------------------------------------------------------------- +!dir$ ivdep + do j=start(2,c),jsize + call binvcrhs( lhsb(1,1,istart,j), + > lhsc(1,1,istart,j,k,c), + > rhs(1,istart,j,k,c) ) + enddo + + endif + +c--------------------------------------------------------------------- +c begin inner most do loop +c do all the elements of the cell unless last +c--------------------------------------------------------------------- +!dir$ ivdep +!dir$ interchange(i,j) + do j=start(2,c),jsize + do i=istart+first,isize-last + +c--------------------------------------------------------------------- +c rhs(i) = rhs(i) - A*rhs(i-1) +c--------------------------------------------------------------------- + call matvec_sub(lhsa(1,1,i,j), + > rhs(1,i-1,j,k,c),rhs(1,i,j,k,c)) + +c--------------------------------------------------------------------- +c B(i) = B(i) - C(i-1)*A(i) +c--------------------------------------------------------------------- + call matmul_sub(lhsa(1,1,i,j), + > lhsc(1,1,i-1,j,k,c), + > lhsb(1,1,i,j)) + + +c--------------------------------------------------------------------- +c multiply c(i,j,k) by b_inverse and copy back to c +c multiply rhs(1,j,k) by b_inverse(1,j,k) and copy to rhs +c--------------------------------------------------------------------- + call binvcrhs( lhsb(1,1,i,j), + > lhsc(1,1,i,j,k,c), + > rhs(1,i,j,k,c) ) + + enddo + enddo + +c--------------------------------------------------------------------- +c Now finish up special cases for last cell +c--------------------------------------------------------------------- + if (last .eq. 1) then + +!dir$ ivdep + do j=start(2,c),jsize +c--------------------------------------------------------------------- +c rhs(isize) = rhs(isize) - A*rhs(isize-1) +c--------------------------------------------------------------------- + call matvec_sub(lhsa(1,1,isize,j), + > rhs(1,isize-1,j,k,c),rhs(1,isize,j,k,c)) + +c--------------------------------------------------------------------- +c B(isize) = B(isize) - C(isize-1)*A(isize) +c--------------------------------------------------------------------- + call matmul_sub(lhsa(1,1,isize,j), + > lhsc(1,1,isize-1,j,k,c), + > lhsb(1,1,isize,j)) + +c--------------------------------------------------------------------- +c multiply rhs() by b_inverse() and copy to rhs +c--------------------------------------------------------------------- + call binvrhs( lhsb(1,1,isize,j), + > rhs(1,isize,j,k,c) ) + enddo + + endif + enddo + + + return + end + diff --git a/examples/smpi/NAS/BT/y_solve.f b/examples/smpi/NAS/BT/y_solve.f new file mode 100644 index 0000000000..33e2ebc018 --- /dev/null +++ b/examples/smpi/NAS/BT/y_solve.f @@ -0,0 +1,771 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine y_solve + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c Performs line solves in Y direction by first factoring +c the block-tridiagonal matrix into an upper triangular matrix, +c and then performing back substitution to solve for the unknow +c vectors of each line. +c +c Make sure we treat elements zero to cell_size in the direction +c of the sweep. +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer + > c, jstart, stage, + > first, last, recv_id, error, r_status(MPI_STATUS_SIZE), + > isize,jsize,ksize,send_id + + jstart = 0 + +c--------------------------------------------------------------------- +c in our terminology stage is the number of the cell in the y-direction +c i.e. stage = 1 means the start of the line stage=ncells means end +c--------------------------------------------------------------------- + do stage = 1,ncells + c = slice(2,stage) + isize = cell_size(1,c) - 1 + jsize = cell_size(2,c) - 1 + ksize = cell_size(3,c) - 1 + +c--------------------------------------------------------------------- +c set last-cell flag +c--------------------------------------------------------------------- + if (stage .eq. ncells) then + last = 1 + else + last = 0 + endif + + if (stage .eq. 1) then +c--------------------------------------------------------------------- +c This is the first cell, so solve without receiving data +c--------------------------------------------------------------------- + first = 1 +c call lhsy(c) + call y_solve_cell(first,last,c) + else +c--------------------------------------------------------------------- +c Not the first cell of this line, so receive info from +c processor working on preceeding cell +c--------------------------------------------------------------------- + first = 0 + call y_receive_solve_info(recv_id,c) +c--------------------------------------------------------------------- +c overlap computations and communications +c--------------------------------------------------------------------- +c call lhsy(c) +c--------------------------------------------------------------------- +c wait for completion +c--------------------------------------------------------------------- + call mpi_wait(send_id,r_status,error) + call mpi_wait(recv_id,r_status,error) +c--------------------------------------------------------------------- +c install C'(jstart+1) and rhs'(jstart+1) to be used in this cell +c--------------------------------------------------------------------- + call y_unpack_solve_info(c) + call y_solve_cell(first,last,c) + endif + + if (last .eq. 0) call y_send_solve_info(send_id,c) + enddo + +c--------------------------------------------------------------------- +c now perform backsubstitution in reverse direction +c--------------------------------------------------------------------- + do stage = ncells, 1, -1 + c = slice(2,stage) + first = 0 + last = 0 + if (stage .eq. 1) first = 1 + if (stage .eq. ncells) then + last = 1 +c--------------------------------------------------------------------- +c last cell, so perform back substitute without waiting +c--------------------------------------------------------------------- + call y_backsubstitute(first, last,c) + else + call y_receive_backsub_info(recv_id,c) + call mpi_wait(send_id,r_status,error) + call mpi_wait(recv_id,r_status,error) + call y_unpack_backsub_info(c) + call y_backsubstitute(first,last,c) + endif + if (first .eq. 0) call y_send_backsub_info(send_id,c) + enddo + + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine y_unpack_solve_info(c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c unpack C'(-1) and rhs'(-1) for +c all i and k +c--------------------------------------------------------------------- + + include 'header.h' + + integer i,k,m,n,ptr,c,jstart + + jstart = 0 + ptr = 0 + do k=0,KMAX-1 + do i=0,IMAX-1 + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + lhsc(m,n,i,jstart-1,k,c) = out_buffer(ptr+n) + enddo + ptr = ptr+BLOCK_SIZE + enddo + do n=1,BLOCK_SIZE + rhs(n,i,jstart-1,k,c) = out_buffer(ptr+n) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine y_send_solve_info(send_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c pack up and send C'(jend) and rhs'(jend) for +c all i and k +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer i,k,m,n,jsize,ptr,c,ip,kp + integer error,send_id,buffer_size + + jsize = cell_size(2,c)-1 + ip = cell_coord(1,c) - 1 + kp = cell_coord(3,c) - 1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* + > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) + +c--------------------------------------------------------------------- +c pack up buffer +c--------------------------------------------------------------------- + ptr = 0 + do k=0,KMAX-1 + do i=0,IMAX-1 + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + in_buffer(ptr+n) = lhsc(m,n,i,jsize,k,c) + enddo + ptr = ptr+BLOCK_SIZE + enddo + do n=1,BLOCK_SIZE + in_buffer(ptr+n) = rhs(n,i,jsize,k,c) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + +c--------------------------------------------------------------------- +c send buffer +c--------------------------------------------------------------------- + call mpi_isend(in_buffer, buffer_size, + > dp_type, successor(2), + > SOUTH+ip+kp*NCELLS, comm_solve, + > send_id,error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine y_send_backsub_info(send_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c pack up and send U(jstart) for all i and k +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer i,k,n,ptr,c,jstart,ip,kp + integer error,send_id,buffer_size + +c--------------------------------------------------------------------- +c Send element 0 to previous processor +c--------------------------------------------------------------------- + jstart = 0 + ip = cell_coord(1,c)-1 + kp = cell_coord(3,c)-1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE + ptr = 0 + do k=0,KMAX-1 + do i=0,IMAX-1 + do n=1,BLOCK_SIZE + in_buffer(ptr+n) = rhs(n,i,jstart,k,c) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + call mpi_isend(in_buffer, buffer_size, + > dp_type, predecessor(2), + > NORTH+ip+kp*NCELLS, comm_solve, + > send_id,error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine y_unpack_backsub_info(c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c unpack U(jsize) for all i and k +c--------------------------------------------------------------------- + + include 'header.h' + + integer i,k,n,ptr,c + + ptr = 0 + do k=0,KMAX-1 + do i=0,IMAX-1 + do n=1,BLOCK_SIZE + backsub_info(n,i,k,c) = out_buffer(ptr+n) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine y_receive_backsub_info(recv_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c post mpi receives +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer error,recv_id,ip,kp,c,buffer_size + ip = cell_coord(1,c) - 1 + kp = cell_coord(3,c) - 1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE + call mpi_irecv(out_buffer, buffer_size, + > dp_type, successor(2), + > NORTH+ip+kp*NCELLS, comm_solve, + > recv_id, error) + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine y_receive_solve_info(recv_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c post mpi receives +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer ip,kp,recv_id,error,c,buffer_size + ip = cell_coord(1,c) - 1 + kp = cell_coord(3,c) - 1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* + > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) + call mpi_irecv(out_buffer, buffer_size, + > dp_type, predecessor(2), + > SOUTH+ip+kp*NCELLS, comm_solve, + > recv_id, error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine y_backsubstitute(first, last, c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c back solve: if last cell, then generate U(jsize)=rhs(jsize) +c else assume U(jsize) is loaded in un pack backsub_info +c so just use it +c after call u(jstart) will be sent to next cell +c--------------------------------------------------------------------- + + include 'header.h' + + integer first, last, c, i, k + integer m,n,j,jsize,isize,ksize,jstart + + jstart = 0 + isize = cell_size(1,c)-end(1,c)-1 + jsize = cell_size(2,c)-1 + ksize = cell_size(3,c)-end(3,c)-1 + if (last .eq. 0) then + do k=start(3,c),ksize + do i=start(1,c),isize +c--------------------------------------------------------------------- +c U(jsize) uses info from previous cell if not last cell +c--------------------------------------------------------------------- + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + rhs(m,i,jsize,k,c) = rhs(m,i,jsize,k,c) + > - lhsc(m,n,i,jsize,k,c)* + > backsub_info(n,i,k,c) + enddo + enddo + enddo + enddo + endif + do k=start(3,c),ksize + do j=jsize-1,jstart,-1 + do i=start(1,c),isize + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + rhs(m,i,j,k,c) = rhs(m,i,j,k,c) + > - lhsc(m,n,i,j,k,c)*rhs(n,i,j+1,k,c) + enddo + enddo + enddo + enddo + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine y_solve_cell(first,last,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c performs guaussian elimination on this cell. +c +c assumes that unpacking routines for non-first cells +c preload C' and rhs' from previous cell. +c +c assumed send happens outside this routine, but that +c c'(JMAX) and rhs'(JMAX) will be sent to next cell +c--------------------------------------------------------------------- + + include 'header.h' + include 'work_lhs.h' + + integer first,last,c + integer i,j,k,isize,ksize,jsize,jstart + double precision utmp(6,-2:JMAX+1) + + jstart = 0 + isize = cell_size(1,c)-end(1,c)-1 + jsize = cell_size(2,c)-1 + ksize = cell_size(3,c)-end(3,c)-1 + + call lhsabinit(lhsa, lhsb, jsize) + + do k=start(3,c),ksize + do i=start(1,c),isize + +c--------------------------------------------------------------------- +c This function computes the left hand side for the three y-factors +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c Compute the indices for storing the tri-diagonal matrix; +c determine a (labeled f) and n jacobians for cell c +c--------------------------------------------------------------------- + do j = start(2,c)-1, cell_size(2,c)-end(2,c) + utmp(1,j) = 1.0d0 / u(1,i,j,k,c) + utmp(2,j) = u(2,i,j,k,c) + utmp(3,j) = u(3,i,j,k,c) + utmp(4,j) = u(4,i,j,k,c) + utmp(5,j) = u(5,i,j,k,c) + utmp(6,j) = qs(i,j,k,c) + end do + + do j = start(2,c)-1, cell_size(2,c)-end(2,c) + + tmp1 = utmp(1,j) + tmp2 = tmp1 * tmp1 + tmp3 = tmp1 * tmp2 + + fjac(1,1,j) = 0.0d+00 + fjac(1,2,j) = 0.0d+00 + fjac(1,3,j) = 1.0d+00 + fjac(1,4,j) = 0.0d+00 + fjac(1,5,j) = 0.0d+00 + + fjac(2,1,j) = - ( utmp(2,j)*utmp(3,j) ) + > * tmp2 + fjac(2,2,j) = utmp(3,j) * tmp1 + fjac(2,3,j) = utmp(2,j) * tmp1 + fjac(2,4,j) = 0.0d+00 + fjac(2,5,j) = 0.0d+00 + + fjac(3,1,j) = - ( utmp(3,j)*utmp(3,j)*tmp2) + > + c2 * utmp(6,j) + fjac(3,2,j) = - c2 * utmp(2,j) * tmp1 + fjac(3,3,j) = ( 2.0d+00 - c2 ) + > * utmp(3,j) * tmp1 + fjac(3,4,j) = - c2 * utmp(4,j) * tmp1 + fjac(3,5,j) = c2 + + fjac(4,1,j) = - ( utmp(3,j)*utmp(4,j) ) + > * tmp2 + fjac(4,2,j) = 0.0d+00 + fjac(4,3,j) = utmp(4,j) * tmp1 + fjac(4,4,j) = utmp(3,j) * tmp1 + fjac(4,5,j) = 0.0d+00 + + fjac(5,1,j) = ( c2 * 2.0d0 * utmp(6,j) + > - c1 * utmp(5,j) * tmp1 ) + > * utmp(3,j) * tmp1 + fjac(5,2,j) = - c2 * utmp(2,j)*utmp(3,j) + > * tmp2 + fjac(5,3,j) = c1 * utmp(5,j) * tmp1 + > - c2 * ( utmp(6,j) + > + utmp(3,j)*utmp(3,j) * tmp2 ) + fjac(5,4,j) = - c2 * ( utmp(3,j)*utmp(4,j) ) + > * tmp2 + fjac(5,5,j) = c1 * utmp(3,j) * tmp1 + + njac(1,1,j) = 0.0d+00 + njac(1,2,j) = 0.0d+00 + njac(1,3,j) = 0.0d+00 + njac(1,4,j) = 0.0d+00 + njac(1,5,j) = 0.0d+00 + + njac(2,1,j) = - c3c4 * tmp2 * utmp(2,j) + njac(2,2,j) = c3c4 * tmp1 + njac(2,3,j) = 0.0d+00 + njac(2,4,j) = 0.0d+00 + njac(2,5,j) = 0.0d+00 + + njac(3,1,j) = - con43 * c3c4 * tmp2 * utmp(3,j) + njac(3,2,j) = 0.0d+00 + njac(3,3,j) = con43 * c3c4 * tmp1 + njac(3,4,j) = 0.0d+00 + njac(3,5,j) = 0.0d+00 + + njac(4,1,j) = - c3c4 * tmp2 * utmp(4,j) + njac(4,2,j) = 0.0d+00 + njac(4,3,j) = 0.0d+00 + njac(4,4,j) = c3c4 * tmp1 + njac(4,5,j) = 0.0d+00 + + njac(5,1,j) = - ( c3c4 + > - c1345 ) * tmp3 * (utmp(2,j)**2) + > - ( con43 * c3c4 + > - c1345 ) * tmp3 * (utmp(3,j)**2) + > - ( c3c4 - c1345 ) * tmp3 * (utmp(4,j)**2) + > - c1345 * tmp2 * utmp(5,j) + + njac(5,2,j) = ( c3c4 - c1345 ) * tmp2 * utmp(2,j) + njac(5,3,j) = ( con43 * c3c4 + > - c1345 ) * tmp2 * utmp(3,j) + njac(5,4,j) = ( c3c4 - c1345 ) * tmp2 * utmp(4,j) + njac(5,5,j) = ( c1345 ) * tmp1 + + enddo + +c--------------------------------------------------------------------- +c now joacobians set, so form left hand side in y direction +c--------------------------------------------------------------------- + do j = start(2,c), jsize-end(2,c) + + tmp1 = dt * ty1 + tmp2 = dt * ty2 + + lhsa(1,1,j) = - tmp2 * fjac(1,1,j-1) + > - tmp1 * njac(1,1,j-1) + > - tmp1 * dy1 + lhsa(1,2,j) = - tmp2 * fjac(1,2,j-1) + > - tmp1 * njac(1,2,j-1) + lhsa(1,3,j) = - tmp2 * fjac(1,3,j-1) + > - tmp1 * njac(1,3,j-1) + lhsa(1,4,j) = - tmp2 * fjac(1,4,j-1) + > - tmp1 * njac(1,4,j-1) + lhsa(1,5,j) = - tmp2 * fjac(1,5,j-1) + > - tmp1 * njac(1,5,j-1) + + lhsa(2,1,j) = - tmp2 * fjac(2,1,j-1) + > - tmp1 * njac(2,1,j-1) + lhsa(2,2,j) = - tmp2 * fjac(2,2,j-1) + > - tmp1 * njac(2,2,j-1) + > - tmp1 * dy2 + lhsa(2,3,j) = - tmp2 * fjac(2,3,j-1) + > - tmp1 * njac(2,3,j-1) + lhsa(2,4,j) = - tmp2 * fjac(2,4,j-1) + > - tmp1 * njac(2,4,j-1) + lhsa(2,5,j) = - tmp2 * fjac(2,5,j-1) + > - tmp1 * njac(2,5,j-1) + + lhsa(3,1,j) = - tmp2 * fjac(3,1,j-1) + > - tmp1 * njac(3,1,j-1) + lhsa(3,2,j) = - tmp2 * fjac(3,2,j-1) + > - tmp1 * njac(3,2,j-1) + lhsa(3,3,j) = - tmp2 * fjac(3,3,j-1) + > - tmp1 * njac(3,3,j-1) + > - tmp1 * dy3 + lhsa(3,4,j) = - tmp2 * fjac(3,4,j-1) + > - tmp1 * njac(3,4,j-1) + lhsa(3,5,j) = - tmp2 * fjac(3,5,j-1) + > - tmp1 * njac(3,5,j-1) + + lhsa(4,1,j) = - tmp2 * fjac(4,1,j-1) + > - tmp1 * njac(4,1,j-1) + lhsa(4,2,j) = - tmp2 * fjac(4,2,j-1) + > - tmp1 * njac(4,2,j-1) + lhsa(4,3,j) = - tmp2 * fjac(4,3,j-1) + > - tmp1 * njac(4,3,j-1) + lhsa(4,4,j) = - tmp2 * fjac(4,4,j-1) + > - tmp1 * njac(4,4,j-1) + > - tmp1 * dy4 + lhsa(4,5,j) = - tmp2 * fjac(4,5,j-1) + > - tmp1 * njac(4,5,j-1) + + lhsa(5,1,j) = - tmp2 * fjac(5,1,j-1) + > - tmp1 * njac(5,1,j-1) + lhsa(5,2,j) = - tmp2 * fjac(5,2,j-1) + > - tmp1 * njac(5,2,j-1) + lhsa(5,3,j) = - tmp2 * fjac(5,3,j-1) + > - tmp1 * njac(5,3,j-1) + lhsa(5,4,j) = - tmp2 * fjac(5,4,j-1) + > - tmp1 * njac(5,4,j-1) + lhsa(5,5,j) = - tmp2 * fjac(5,5,j-1) + > - tmp1 * njac(5,5,j-1) + > - tmp1 * dy5 + + lhsb(1,1,j) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(1,1,j) + > + tmp1 * 2.0d+00 * dy1 + lhsb(1,2,j) = tmp1 * 2.0d+00 * njac(1,2,j) + lhsb(1,3,j) = tmp1 * 2.0d+00 * njac(1,3,j) + lhsb(1,4,j) = tmp1 * 2.0d+00 * njac(1,4,j) + lhsb(1,5,j) = tmp1 * 2.0d+00 * njac(1,5,j) + + lhsb(2,1,j) = tmp1 * 2.0d+00 * njac(2,1,j) + lhsb(2,2,j) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(2,2,j) + > + tmp1 * 2.0d+00 * dy2 + lhsb(2,3,j) = tmp1 * 2.0d+00 * njac(2,3,j) + lhsb(2,4,j) = tmp1 * 2.0d+00 * njac(2,4,j) + lhsb(2,5,j) = tmp1 * 2.0d+00 * njac(2,5,j) + + lhsb(3,1,j) = tmp1 * 2.0d+00 * njac(3,1,j) + lhsb(3,2,j) = tmp1 * 2.0d+00 * njac(3,2,j) + lhsb(3,3,j) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(3,3,j) + > + tmp1 * 2.0d+00 * dy3 + lhsb(3,4,j) = tmp1 * 2.0d+00 * njac(3,4,j) + lhsb(3,5,j) = tmp1 * 2.0d+00 * njac(3,5,j) + + lhsb(4,1,j) = tmp1 * 2.0d+00 * njac(4,1,j) + lhsb(4,2,j) = tmp1 * 2.0d+00 * njac(4,2,j) + lhsb(4,3,j) = tmp1 * 2.0d+00 * njac(4,3,j) + lhsb(4,4,j) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(4,4,j) + > + tmp1 * 2.0d+00 * dy4 + lhsb(4,5,j) = tmp1 * 2.0d+00 * njac(4,5,j) + + lhsb(5,1,j) = tmp1 * 2.0d+00 * njac(5,1,j) + lhsb(5,2,j) = tmp1 * 2.0d+00 * njac(5,2,j) + lhsb(5,3,j) = tmp1 * 2.0d+00 * njac(5,3,j) + lhsb(5,4,j) = tmp1 * 2.0d+00 * njac(5,4,j) + lhsb(5,5,j) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(5,5,j) + > + tmp1 * 2.0d+00 * dy5 + + lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,j+1) + > - tmp1 * njac(1,1,j+1) + > - tmp1 * dy1 + lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,j+1) + > - tmp1 * njac(1,2,j+1) + lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,j+1) + > - tmp1 * njac(1,3,j+1) + lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,j+1) + > - tmp1 * njac(1,4,j+1) + lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,j+1) + > - tmp1 * njac(1,5,j+1) + + lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,j+1) + > - tmp1 * njac(2,1,j+1) + lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,j+1) + > - tmp1 * njac(2,2,j+1) + > - tmp1 * dy2 + lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,j+1) + > - tmp1 * njac(2,3,j+1) + lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,j+1) + > - tmp1 * njac(2,4,j+1) + lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,j+1) + > - tmp1 * njac(2,5,j+1) + + lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,j+1) + > - tmp1 * njac(3,1,j+1) + lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,j+1) + > - tmp1 * njac(3,2,j+1) + lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,j+1) + > - tmp1 * njac(3,3,j+1) + > - tmp1 * dy3 + lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,j+1) + > - tmp1 * njac(3,4,j+1) + lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,j+1) + > - tmp1 * njac(3,5,j+1) + + lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,j+1) + > - tmp1 * njac(4,1,j+1) + lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,j+1) + > - tmp1 * njac(4,2,j+1) + lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,j+1) + > - tmp1 * njac(4,3,j+1) + lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,j+1) + > - tmp1 * njac(4,4,j+1) + > - tmp1 * dy4 + lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,j+1) + > - tmp1 * njac(4,5,j+1) + + lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,j+1) + > - tmp1 * njac(5,1,j+1) + lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,j+1) + > - tmp1 * njac(5,2,j+1) + lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,j+1) + > - tmp1 * njac(5,3,j+1) + lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,j+1) + > - tmp1 * njac(5,4,j+1) + lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,j+1) + > - tmp1 * njac(5,5,j+1) + > - tmp1 * dy5 + + enddo + + +c--------------------------------------------------------------------- +c outer most do loops - sweeping in i direction +c--------------------------------------------------------------------- + if (first .eq. 1) then + +c--------------------------------------------------------------------- +c multiply c(i,jstart,k) by b_inverse and copy back to c +c multiply rhs(jstart) by b_inverse(jstart) and copy to rhs +c--------------------------------------------------------------------- + call binvcrhs( lhsb(1,1,jstart), + > lhsc(1,1,i,jstart,k,c), + > rhs(1,i,jstart,k,c) ) + + endif + +c--------------------------------------------------------------------- +c begin inner most do loop +c do all the elements of the cell unless last +c--------------------------------------------------------------------- + do j=jstart+first,jsize-last + +c--------------------------------------------------------------------- +c subtract A*lhs_vector(j-1) from lhs_vector(j) +c +c rhs(j) = rhs(j) - A*rhs(j-1) +c--------------------------------------------------------------------- + call matvec_sub(lhsa(1,1,j), + > rhs(1,i,j-1,k,c),rhs(1,i,j,k,c)) + +c--------------------------------------------------------------------- +c B(j) = B(j) - C(j-1)*A(j) +c--------------------------------------------------------------------- + call matmul_sub(lhsa(1,1,j), + > lhsc(1,1,i,j-1,k,c), + > lhsb(1,1,j)) + +c--------------------------------------------------------------------- +c multiply c(i,j,k) by b_inverse and copy back to c +c multiply rhs(i,1,k) by b_inverse(i,1,k) and copy to rhs +c--------------------------------------------------------------------- + call binvcrhs( lhsb(1,1,j), + > lhsc(1,1,i,j,k,c), + > rhs(1,i,j,k,c) ) + + enddo + +c--------------------------------------------------------------------- +c Now finish up special cases for last cell +c--------------------------------------------------------------------- + if (last .eq. 1) then + +c--------------------------------------------------------------------- +c rhs(jsize) = rhs(jsize) - A*rhs(jsize-1) +c--------------------------------------------------------------------- + call matvec_sub(lhsa(1,1,jsize), + > rhs(1,i,jsize-1,k,c),rhs(1,i,jsize,k,c)) + +c--------------------------------------------------------------------- +c B(jsize) = B(jsize) - C(jsize-1)*A(jsize) +c call matmul_sub(aa,i,jsize,k,c, +c $ cc,i,jsize-1,k,c,bb,i,jsize,k,c) +c--------------------------------------------------------------------- + call matmul_sub(lhsa(1,1,jsize), + > lhsc(1,1,i,jsize-1,k,c), + > lhsb(1,1,jsize)) + +c--------------------------------------------------------------------- +c multiply rhs(jsize) by b_inverse(jsize) and copy to rhs +c--------------------------------------------------------------------- + call binvrhs( lhsb(1,1,jsize), + > rhs(1,i,jsize,k,c) ) + + endif + enddo + enddo + + + return + end + + + diff --git a/examples/smpi/NAS/BT/y_solve_vec.f b/examples/smpi/NAS/BT/y_solve_vec.f new file mode 100644 index 0000000000..e21cfa36ce --- /dev/null +++ b/examples/smpi/NAS/BT/y_solve_vec.f @@ -0,0 +1,788 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine y_solve + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c Performs line solves in Y direction by first factoring +c the block-tridiagonal matrix into an upper triangular matrix, +c and then performing back substitution to solve for the unknow +c vectors of each line. +c +c Make sure we treat elements zero to cell_size in the direction +c of the sweep. +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer + > c, jstart, stage, + > first, last, recv_id, error, r_status(MPI_STATUS_SIZE), + > isize,jsize,ksize,send_id + + jstart = 0 + +c--------------------------------------------------------------------- +c in our terminology stage is the number of the cell in the y-direct +c i.e. stage = 1 means the start of the line stage=ncells means end +c--------------------------------------------------------------------- + do stage = 1,ncells + c = slice(2,stage) + isize = cell_size(1,c) - 1 + jsize = cell_size(2,c) - 1 + ksize = cell_size(3,c) - 1 + +c--------------------------------------------------------------------- +c set last-cell flag +c--------------------------------------------------------------------- + if (stage .eq. ncells) then + last = 1 + else + last = 0 + endif + + if (stage .eq. 1) then +c--------------------------------------------------------------------- +c This is the first cell, so solve without receiving data +c--------------------------------------------------------------------- + first = 1 +c call lhsy(c) + call y_solve_cell(first,last,c) + else +c--------------------------------------------------------------------- +c Not the first cell of this line, so receive info from +c processor working on preceeding cell +c--------------------------------------------------------------------- + first = 0 + call y_receive_solve_info(recv_id,c) +c--------------------------------------------------------------------- +c overlap computations and communications +c--------------------------------------------------------------------- +c call lhsy(c) +c--------------------------------------------------------------------- +c wait for completion +c--------------------------------------------------------------------- + call mpi_wait(send_id,r_status,error) + call mpi_wait(recv_id,r_status,error) +c--------------------------------------------------------------------- +c install C'(jstart+1) and rhs'(jstart+1) to be used in this cell +c--------------------------------------------------------------------- + call y_unpack_solve_info(c) + call y_solve_cell(first,last,c) + endif + + if (last .eq. 0) call y_send_solve_info(send_id,c) + enddo + +c--------------------------------------------------------------------- +c now perform backsubstitution in reverse direction +c--------------------------------------------------------------------- + do stage = ncells, 1, -1 + c = slice(2,stage) + first = 0 + last = 0 + if (stage .eq. 1) first = 1 + if (stage .eq. ncells) then + last = 1 +c--------------------------------------------------------------------- +c last cell, so perform back substitute without waiting +c--------------------------------------------------------------------- + call y_backsubstitute(first, last,c) + else + call y_receive_backsub_info(recv_id,c) + call mpi_wait(send_id,r_status,error) + call mpi_wait(recv_id,r_status,error) + call y_unpack_backsub_info(c) + call y_backsubstitute(first,last,c) + endif + if (first .eq. 0) call y_send_backsub_info(send_id,c) + enddo + + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine y_unpack_solve_info(c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c unpack C'(-1) and rhs'(-1) for +c all i and k +c--------------------------------------------------------------------- + + include 'header.h' + + integer i,k,m,n,ptr,c,jstart + + jstart = 0 + ptr = 0 + do k=0,KMAX-1 + do i=0,IMAX-1 + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + lhsc(m,n,i,jstart-1,k,c) = out_buffer(ptr+n) + enddo + ptr = ptr+BLOCK_SIZE + enddo + do n=1,BLOCK_SIZE + rhs(n,i,jstart-1,k,c) = out_buffer(ptr+n) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine y_send_solve_info(send_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c pack up and send C'(jend) and rhs'(jend) for +c all i and k +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer i,k,m,n,jsize,ptr,c,ip,kp + integer error,send_id,buffer_size + + jsize = cell_size(2,c)-1 + ip = cell_coord(1,c) - 1 + kp = cell_coord(3,c) - 1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* + > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) + +c--------------------------------------------------------------------- +c pack up buffer +c--------------------------------------------------------------------- + ptr = 0 + do k=0,KMAX-1 + do i=0,IMAX-1 + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + in_buffer(ptr+n) = lhsc(m,n,i,jsize,k,c) + enddo + ptr = ptr+BLOCK_SIZE + enddo + do n=1,BLOCK_SIZE + in_buffer(ptr+n) = rhs(n,i,jsize,k,c) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + +c--------------------------------------------------------------------- +c send buffer +c--------------------------------------------------------------------- + call mpi_isend(in_buffer, buffer_size, + > dp_type, successor(2), + > SOUTH+ip+kp*NCELLS, comm_solve, + > send_id,error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine y_send_backsub_info(send_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c pack up and send U(jstart) for all i and k +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer i,k,n,ptr,c,jstart,ip,kp + integer error,send_id,buffer_size + +c--------------------------------------------------------------------- +c Send element 0 to previous processor +c--------------------------------------------------------------------- + jstart = 0 + ip = cell_coord(1,c)-1 + kp = cell_coord(3,c)-1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE + ptr = 0 + do k=0,KMAX-1 + do i=0,IMAX-1 + do n=1,BLOCK_SIZE + in_buffer(ptr+n) = rhs(n,i,jstart,k,c) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + call mpi_isend(in_buffer, buffer_size, + > dp_type, predecessor(2), + > NORTH+ip+kp*NCELLS, comm_solve, + > send_id,error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine y_unpack_backsub_info(c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c unpack U(jsize) for all i and k +c--------------------------------------------------------------------- + + include 'header.h' + + integer i,k,n,ptr,c + + ptr = 0 + do k=0,KMAX-1 + do i=0,IMAX-1 + do n=1,BLOCK_SIZE + backsub_info(n,i,k,c) = out_buffer(ptr+n) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine y_receive_backsub_info(recv_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c post mpi receives +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer error,recv_id,ip,kp,c,buffer_size + ip = cell_coord(1,c) - 1 + kp = cell_coord(3,c) - 1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE + call mpi_irecv(out_buffer, buffer_size, + > dp_type, successor(2), + > NORTH+ip+kp*NCELLS, comm_solve, + > recv_id, error) + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine y_receive_solve_info(recv_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c post mpi receives +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer ip,kp,recv_id,error,c,buffer_size + ip = cell_coord(1,c) - 1 + kp = cell_coord(3,c) - 1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* + > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) + call mpi_irecv(out_buffer, buffer_size, + > dp_type, predecessor(2), + > SOUTH+ip+kp*NCELLS, comm_solve, + > recv_id, error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine y_backsubstitute(first, last, c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c back solve: if last cell, then generate U(jsize)=rhs(jsize) +c else assume U(jsize) is loaded in un pack backsub_info +c so just use it +c after call u(jstart) will be sent to next cell +c--------------------------------------------------------------------- + + include 'header.h' + + integer first, last, c, i, k + integer m,n,j,jsize,isize,ksize,jstart + + jstart = 0 + isize = cell_size(1,c)-end(1,c)-1 + jsize = cell_size(2,c)-1 + ksize = cell_size(3,c)-end(3,c)-1 + if (last .eq. 0) then + do k=start(3,c),ksize + do i=start(1,c),isize +c--------------------------------------------------------------------- +c U(jsize) uses info from previous cell if not last cell +c--------------------------------------------------------------------- + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + rhs(m,i,jsize,k,c) = rhs(m,i,jsize,k,c) + > - lhsc(m,n,i,jsize,k,c)* + > backsub_info(n,i,k,c) + enddo + enddo + enddo + enddo + endif + do k=start(3,c),ksize + do j=jsize-1,jstart,-1 + do i=start(1,c),isize + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + rhs(m,i,j,k,c) = rhs(m,i,j,k,c) + > - lhsc(m,n,i,j,k,c)*rhs(n,i,j+1,k,c) + enddo + enddo + enddo + enddo + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine y_solve_cell(first,last,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c performs guaussian elimination on this cell. +c +c assumes that unpacking routines for non-first cells +c preload C' and rhs' from previous cell. +c +c assumed send happens outside this routine, but that +c c'(JMAX) and rhs'(JMAX) will be sent to next cell +c--------------------------------------------------------------------- + + include 'header.h' + include 'work_lhs_vec.h' + + integer first,last,c + integer i,j,k,m,n,isize,ksize,jsize,jstart + + jstart = 0 + isize = cell_size(1,c)-end(1,c)-1 + jsize = cell_size(2,c)-1 + ksize = cell_size(3,c)-end(3,c)-1 + +c--------------------------------------------------------------------- +c zero the left hand side for starters +c set diagonal values to 1. This is overkill, but convenient +c--------------------------------------------------------------------- + do i = 0, isize + do m = 1, 5 + do n = 1, 5 + lhsa(m,n,i,0) = 0.0d0 + lhsb(m,n,i,0) = 0.0d0 + lhsa(m,n,i,jsize) = 0.0d0 + lhsb(m,n,i,jsize) = 0.0d0 + enddo + lhsb(m,m,i,0) = 1.0d0 + lhsb(m,m,i,jsize) = 1.0d0 + enddo + enddo + + do k=start(3,c),ksize + +c--------------------------------------------------------------------- +c This function computes the left hand side for the three y-factors +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c Compute the indices for storing the tri-diagonal matrix; +c determine a (labeled f) and n jacobians for cell c +c--------------------------------------------------------------------- + + do j = start(2,c)-1, cell_size(2,c)-end(2,c) + do i=start(1,c),isize + + tmp1 = 1.0d0 / u(1,i,j,k,c) + tmp2 = tmp1 * tmp1 + tmp3 = tmp1 * tmp2 + + fjac(1,1,i,j) = 0.0d+00 + fjac(1,2,i,j) = 0.0d+00 + fjac(1,3,i,j) = 1.0d+00 + fjac(1,4,i,j) = 0.0d+00 + fjac(1,5,i,j) = 0.0d+00 + + fjac(2,1,i,j) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) ) + > * tmp2 + fjac(2,2,i,j) = u(3,i,j,k,c) * tmp1 + fjac(2,3,i,j) = u(2,i,j,k,c) * tmp1 + fjac(2,4,i,j) = 0.0d+00 + fjac(2,5,i,j) = 0.0d+00 + + fjac(3,1,i,j) = - ( u(3,i,j,k,c)*u(3,i,j,k,c)*tmp2) + > + c2 * qs(i,j,k,c) + fjac(3,2,i,j) = - c2 * u(2,i,j,k,c) * tmp1 + fjac(3,3,i,j) = ( 2.0d+00 - c2 ) + > * u(3,i,j,k,c) * tmp1 + fjac(3,4,i,j) = - c2 * u(4,i,j,k,c) * tmp1 + fjac(3,5,i,j) = c2 + + fjac(4,1,i,j) = - ( u(3,i,j,k,c)*u(4,i,j,k,c) ) + > * tmp2 + fjac(4,2,i,j) = 0.0d+00 + fjac(4,3,i,j) = u(4,i,j,k,c) * tmp1 + fjac(4,4,i,j) = u(3,i,j,k,c) * tmp1 + fjac(4,5,i,j) = 0.0d+00 + + fjac(5,1,i,j) = ( c2 * 2.0d0 * qs(i,j,k,c) + > - c1 * u(5,i,j,k,c) * tmp1 ) + > * u(3,i,j,k,c) * tmp1 + fjac(5,2,i,j) = - c2 * u(2,i,j,k,c)*u(3,i,j,k,c) + > * tmp2 + fjac(5,3,i,j) = c1 * u(5,i,j,k,c) * tmp1 + > - c2 * ( qs(i,j,k,c) + > + u(3,i,j,k,c)*u(3,i,j,k,c) * tmp2 ) + fjac(5,4,i,j) = - c2 * ( u(3,i,j,k,c)*u(4,i,j,k,c) ) + > * tmp2 + fjac(5,5,i,j) = c1 * u(3,i,j,k,c) * tmp1 + + njac(1,1,i,j) = 0.0d+00 + njac(1,2,i,j) = 0.0d+00 + njac(1,3,i,j) = 0.0d+00 + njac(1,4,i,j) = 0.0d+00 + njac(1,5,i,j) = 0.0d+00 + + njac(2,1,i,j) = - c3c4 * tmp2 * u(2,i,j,k,c) + njac(2,2,i,j) = c3c4 * tmp1 + njac(2,3,i,j) = 0.0d+00 + njac(2,4,i,j) = 0.0d+00 + njac(2,5,i,j) = 0.0d+00 + + njac(3,1,i,j) = - con43 * c3c4 * tmp2 * u(3,i,j,k,c) + njac(3,2,i,j) = 0.0d+00 + njac(3,3,i,j) = con43 * c3c4 * tmp1 + njac(3,4,i,j) = 0.0d+00 + njac(3,5,i,j) = 0.0d+00 + + njac(4,1,i,j) = - c3c4 * tmp2 * u(4,i,j,k,c) + njac(4,2,i,j) = 0.0d+00 + njac(4,3,i,j) = 0.0d+00 + njac(4,4,i,j) = c3c4 * tmp1 + njac(4,5,i,j) = 0.0d+00 + + njac(5,1,i,j) = - ( c3c4 + > - c1345 ) * tmp3 * (u(2,i,j,k,c)**2) + > - ( con43 * c3c4 + > - c1345 ) * tmp3 * (u(3,i,j,k,c)**2) + > - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2) + > - c1345 * tmp2 * u(5,i,j,k,c) + + njac(5,2,i,j) = ( c3c4 - c1345 ) * tmp2 * u(2,i,j,k,c) + njac(5,3,i,j) = ( con43 * c3c4 + > - c1345 ) * tmp2 * u(3,i,j,k,c) + njac(5,4,i,j) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c) + njac(5,5,i,j) = ( c1345 ) * tmp1 + + enddo + enddo + +c--------------------------------------------------------------------- +c now joacobians set, so form left hand side in y direction +c--------------------------------------------------------------------- + do j = start(2,c), jsize-end(2,c) + do i=start(1,c),isize + + tmp1 = dt * ty1 + tmp2 = dt * ty2 + + lhsa(1,1,i,j) = - tmp2 * fjac(1,1,i,j-1) + > - tmp1 * njac(1,1,i,j-1) + > - tmp1 * dy1 + lhsa(1,2,i,j) = - tmp2 * fjac(1,2,i,j-1) + > - tmp1 * njac(1,2,i,j-1) + lhsa(1,3,i,j) = - tmp2 * fjac(1,3,i,j-1) + > - tmp1 * njac(1,3,i,j-1) + lhsa(1,4,i,j) = - tmp2 * fjac(1,4,i,j-1) + > - tmp1 * njac(1,4,i,j-1) + lhsa(1,5,i,j) = - tmp2 * fjac(1,5,i,j-1) + > - tmp1 * njac(1,5,i,j-1) + + lhsa(2,1,i,j) = - tmp2 * fjac(2,1,i,j-1) + > - tmp1 * njac(2,1,i,j-1) + lhsa(2,2,i,j) = - tmp2 * fjac(2,2,i,j-1) + > - tmp1 * njac(2,2,i,j-1) + > - tmp1 * dy2 + lhsa(2,3,i,j) = - tmp2 * fjac(2,3,i,j-1) + > - tmp1 * njac(2,3,i,j-1) + lhsa(2,4,i,j) = - tmp2 * fjac(2,4,i,j-1) + > - tmp1 * njac(2,4,i,j-1) + lhsa(2,5,i,j) = - tmp2 * fjac(2,5,i,j-1) + > - tmp1 * njac(2,5,i,j-1) + + lhsa(3,1,i,j) = - tmp2 * fjac(3,1,i,j-1) + > - tmp1 * njac(3,1,i,j-1) + lhsa(3,2,i,j) = - tmp2 * fjac(3,2,i,j-1) + > - tmp1 * njac(3,2,i,j-1) + lhsa(3,3,i,j) = - tmp2 * fjac(3,3,i,j-1) + > - tmp1 * njac(3,3,i,j-1) + > - tmp1 * dy3 + lhsa(3,4,i,j) = - tmp2 * fjac(3,4,i,j-1) + > - tmp1 * njac(3,4,i,j-1) + lhsa(3,5,i,j) = - tmp2 * fjac(3,5,i,j-1) + > - tmp1 * njac(3,5,i,j-1) + + lhsa(4,1,i,j) = - tmp2 * fjac(4,1,i,j-1) + > - tmp1 * njac(4,1,i,j-1) + lhsa(4,2,i,j) = - tmp2 * fjac(4,2,i,j-1) + > - tmp1 * njac(4,2,i,j-1) + lhsa(4,3,i,j) = - tmp2 * fjac(4,3,i,j-1) + > - tmp1 * njac(4,3,i,j-1) + lhsa(4,4,i,j) = - tmp2 * fjac(4,4,i,j-1) + > - tmp1 * njac(4,4,i,j-1) + > - tmp1 * dy4 + lhsa(4,5,i,j) = - tmp2 * fjac(4,5,i,j-1) + > - tmp1 * njac(4,5,i,j-1) + + lhsa(5,1,i,j) = - tmp2 * fjac(5,1,i,j-1) + > - tmp1 * njac(5,1,i,j-1) + lhsa(5,2,i,j) = - tmp2 * fjac(5,2,i,j-1) + > - tmp1 * njac(5,2,i,j-1) + lhsa(5,3,i,j) = - tmp2 * fjac(5,3,i,j-1) + > - tmp1 * njac(5,3,i,j-1) + lhsa(5,4,i,j) = - tmp2 * fjac(5,4,i,j-1) + > - tmp1 * njac(5,4,i,j-1) + lhsa(5,5,i,j) = - tmp2 * fjac(5,5,i,j-1) + > - tmp1 * njac(5,5,i,j-1) + > - tmp1 * dy5 + + lhsb(1,1,i,j) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(1,1,i,j) + > + tmp1 * 2.0d+00 * dy1 + lhsb(1,2,i,j) = tmp1 * 2.0d+00 * njac(1,2,i,j) + lhsb(1,3,i,j) = tmp1 * 2.0d+00 * njac(1,3,i,j) + lhsb(1,4,i,j) = tmp1 * 2.0d+00 * njac(1,4,i,j) + lhsb(1,5,i,j) = tmp1 * 2.0d+00 * njac(1,5,i,j) + + lhsb(2,1,i,j) = tmp1 * 2.0d+00 * njac(2,1,i,j) + lhsb(2,2,i,j) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(2,2,i,j) + > + tmp1 * 2.0d+00 * dy2 + lhsb(2,3,i,j) = tmp1 * 2.0d+00 * njac(2,3,i,j) + lhsb(2,4,i,j) = tmp1 * 2.0d+00 * njac(2,4,i,j) + lhsb(2,5,i,j) = tmp1 * 2.0d+00 * njac(2,5,i,j) + + lhsb(3,1,i,j) = tmp1 * 2.0d+00 * njac(3,1,i,j) + lhsb(3,2,i,j) = tmp1 * 2.0d+00 * njac(3,2,i,j) + lhsb(3,3,i,j) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(3,3,i,j) + > + tmp1 * 2.0d+00 * dy3 + lhsb(3,4,i,j) = tmp1 * 2.0d+00 * njac(3,4,i,j) + lhsb(3,5,i,j) = tmp1 * 2.0d+00 * njac(3,5,i,j) + + lhsb(4,1,i,j) = tmp1 * 2.0d+00 * njac(4,1,i,j) + lhsb(4,2,i,j) = tmp1 * 2.0d+00 * njac(4,2,i,j) + lhsb(4,3,i,j) = tmp1 * 2.0d+00 * njac(4,3,i,j) + lhsb(4,4,i,j) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(4,4,i,j) + > + tmp1 * 2.0d+00 * dy4 + lhsb(4,5,i,j) = tmp1 * 2.0d+00 * njac(4,5,i,j) + + lhsb(5,1,i,j) = tmp1 * 2.0d+00 * njac(5,1,i,j) + lhsb(5,2,i,j) = tmp1 * 2.0d+00 * njac(5,2,i,j) + lhsb(5,3,i,j) = tmp1 * 2.0d+00 * njac(5,3,i,j) + lhsb(5,4,i,j) = tmp1 * 2.0d+00 * njac(5,4,i,j) + lhsb(5,5,i,j) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(5,5,i,j) + > + tmp1 * 2.0d+00 * dy5 + + lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,i,j+1) + > - tmp1 * njac(1,1,i,j+1) + > - tmp1 * dy1 + lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,i,j+1) + > - tmp1 * njac(1,2,i,j+1) + lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,i,j+1) + > - tmp1 * njac(1,3,i,j+1) + lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,i,j+1) + > - tmp1 * njac(1,4,i,j+1) + lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,i,j+1) + > - tmp1 * njac(1,5,i,j+1) + + lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,i,j+1) + > - tmp1 * njac(2,1,i,j+1) + lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,i,j+1) + > - tmp1 * njac(2,2,i,j+1) + > - tmp1 * dy2 + lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,i,j+1) + > - tmp1 * njac(2,3,i,j+1) + lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,i,j+1) + > - tmp1 * njac(2,4,i,j+1) + lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,i,j+1) + > - tmp1 * njac(2,5,i,j+1) + + lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,i,j+1) + > - tmp1 * njac(3,1,i,j+1) + lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,i,j+1) + > - tmp1 * njac(3,2,i,j+1) + lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,i,j+1) + > - tmp1 * njac(3,3,i,j+1) + > - tmp1 * dy3 + lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,i,j+1) + > - tmp1 * njac(3,4,i,j+1) + lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,i,j+1) + > - tmp1 * njac(3,5,i,j+1) + + lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,i,j+1) + > - tmp1 * njac(4,1,i,j+1) + lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,i,j+1) + > - tmp1 * njac(4,2,i,j+1) + lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,i,j+1) + > - tmp1 * njac(4,3,i,j+1) + lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,i,j+1) + > - tmp1 * njac(4,4,i,j+1) + > - tmp1 * dy4 + lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,i,j+1) + > - tmp1 * njac(4,5,i,j+1) + + lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,i,j+1) + > - tmp1 * njac(5,1,i,j+1) + lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,i,j+1) + > - tmp1 * njac(5,2,i,j+1) + lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,i,j+1) + > - tmp1 * njac(5,3,i,j+1) + lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,i,j+1) + > - tmp1 * njac(5,4,i,j+1) + lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,i,j+1) + > - tmp1 * njac(5,5,i,j+1) + > - tmp1 * dy5 + + enddo + enddo + + +c--------------------------------------------------------------------- +c outer most do loops - sweeping in i direction +c--------------------------------------------------------------------- + if (first .eq. 1) then + +c--------------------------------------------------------------------- +c multiply c(i,jstart,k) by b_inverse and copy back to c +c multiply rhs(jstart) by b_inverse(jstart) and copy to rhs +c--------------------------------------------------------------------- +!dir$ ivdep + do i=start(1,c),isize + call binvcrhs( lhsb(1,1,i,jstart), + > lhsc(1,1,i,jstart,k,c), + > rhs(1,i,jstart,k,c) ) + enddo + + endif + +c--------------------------------------------------------------------- +c begin inner most do loop +c do all the elements of the cell unless last +c--------------------------------------------------------------------- + do j=jstart+first,jsize-last +!dir$ ivdep + do i=start(1,c),isize + +c--------------------------------------------------------------------- +c subtract A*lhs_vector(j-1) from lhs_vector(j) +c +c rhs(j) = rhs(j) - A*rhs(j-1) +c--------------------------------------------------------------------- + call matvec_sub(lhsa(1,1,i,j), + > rhs(1,i,j-1,k,c),rhs(1,i,j,k,c)) + +c--------------------------------------------------------------------- +c B(j) = B(j) - C(j-1)*A(j) +c--------------------------------------------------------------------- + call matmul_sub(lhsa(1,1,i,j), + > lhsc(1,1,i,j-1,k,c), + > lhsb(1,1,i,j)) + +c--------------------------------------------------------------------- +c multiply c(i,j,k) by b_inverse and copy back to c +c multiply rhs(i,1,k) by b_inverse(i,1,k) and copy to rhs +c--------------------------------------------------------------------- + call binvcrhs( lhsb(1,1,i,j), + > lhsc(1,1,i,j,k,c), + > rhs(1,i,j,k,c) ) + + enddo + enddo + +c--------------------------------------------------------------------- +c Now finish up special cases for last cell +c--------------------------------------------------------------------- + if (last .eq. 1) then + +!dir$ ivdep + do i=start(1,c),isize +c--------------------------------------------------------------------- +c rhs(jsize) = rhs(jsize) - A*rhs(jsize-1) +c--------------------------------------------------------------------- + call matvec_sub(lhsa(1,1,i,jsize), + > rhs(1,i,jsize-1,k,c),rhs(1,i,jsize,k,c)) + +c--------------------------------------------------------------------- +c B(jsize) = B(jsize) - C(jsize-1)*A(jsize) +c call matmul_sub(aa,i,jsize,k,c, +c $ cc,i,jsize-1,k,c,bb,i,jsize,k,c) +c--------------------------------------------------------------------- + call matmul_sub(lhsa(1,1,i,jsize), + > lhsc(1,1,i,jsize-1,k,c), + > lhsb(1,1,i,jsize)) + +c--------------------------------------------------------------------- +c multiply rhs(jsize) by b_inverse(jsize) and copy to rhs +c--------------------------------------------------------------------- + call binvrhs( lhsb(1,1,i,jsize), + > rhs(1,i,jsize,k,c) ) + enddo + + endif + enddo + + + return + end + + + diff --git a/examples/smpi/NAS/BT/z_solve.f b/examples/smpi/NAS/BT/z_solve.f new file mode 100644 index 0000000000..d7a5a2f1ec --- /dev/null +++ b/examples/smpi/NAS/BT/z_solve.f @@ -0,0 +1,776 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine z_solve + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c Performs line solves in Z direction by first factoring +c the block-tridiagonal matrix into an upper triangular matrix, +c and then performing back substitution to solve for the unknow +c vectors of each line. +c +c Make sure we treat elements zero to cell_size in the direction +c of the sweep. +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer c, kstart, stage, + > first, last, recv_id, error, r_status(MPI_STATUS_SIZE), + > isize,jsize,ksize,send_id + + kstart = 0 + +c--------------------------------------------------------------------- +c in our terminology stage is the number of the cell in the y-direction +c i.e. stage = 1 means the start of the line stage=ncells means end +c--------------------------------------------------------------------- + do stage = 1,ncells + c = slice(3,stage) + isize = cell_size(1,c) - 1 + jsize = cell_size(2,c) - 1 + ksize = cell_size(3,c) - 1 +c--------------------------------------------------------------------- +c set last-cell flag +c--------------------------------------------------------------------- + if (stage .eq. ncells) then + last = 1 + else + last = 0 + endif + + if (stage .eq. 1) then +c--------------------------------------------------------------------- +c This is the first cell, so solve without receiving data +c--------------------------------------------------------------------- + first = 1 +c call lhsz(c) + call z_solve_cell(first,last,c) + else +c--------------------------------------------------------------------- +c Not the first cell of this line, so receive info from +c processor working on preceeding cell +c--------------------------------------------------------------------- + first = 0 + call z_receive_solve_info(recv_id,c) +c--------------------------------------------------------------------- +c overlap computations and communications +c--------------------------------------------------------------------- +c call lhsz(c) +c--------------------------------------------------------------------- +c wait for completion +c--------------------------------------------------------------------- + call mpi_wait(send_id,r_status,error) + call mpi_wait(recv_id,r_status,error) +c--------------------------------------------------------------------- +c install C'(kstart+1) and rhs'(kstart+1) to be used in this cell +c--------------------------------------------------------------------- + call z_unpack_solve_info(c) + call z_solve_cell(first,last,c) + endif + + if (last .eq. 0) call z_send_solve_info(send_id,c) + enddo + +c--------------------------------------------------------------------- +c now perform backsubstitution in reverse direction +c--------------------------------------------------------------------- + do stage = ncells, 1, -1 + c = slice(3,stage) + first = 0 + last = 0 + if (stage .eq. 1) first = 1 + if (stage .eq. ncells) then + last = 1 +c--------------------------------------------------------------------- +c last cell, so perform back substitute without waiting +c--------------------------------------------------------------------- + call z_backsubstitute(first, last,c) + else + call z_receive_backsub_info(recv_id,c) + call mpi_wait(send_id,r_status,error) + call mpi_wait(recv_id,r_status,error) + call z_unpack_backsub_info(c) + call z_backsubstitute(first,last,c) + endif + if (first .eq. 0) call z_send_backsub_info(send_id,c) + enddo + + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine z_unpack_solve_info(c) +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c unpack C'(-1) and rhs'(-1) for +c all i and j +c--------------------------------------------------------------------- + + include 'header.h' + + integer i,j,m,n,ptr,c,kstart + + kstart = 0 + ptr = 0 + do j=0,JMAX-1 + do i=0,IMAX-1 + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + lhsc(m,n,i,j,kstart-1,c) = out_buffer(ptr+n) + enddo + ptr = ptr+BLOCK_SIZE + enddo + do n=1,BLOCK_SIZE + rhs(n,i,j,kstart-1,c) = out_buffer(ptr+n) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine z_send_solve_info(send_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c pack up and send C'(kend) and rhs'(kend) for +c all i and j +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer i,j,m,n,ksize,ptr,c,ip,jp + integer error,send_id,buffer_size + + ksize = cell_size(3,c)-1 + ip = cell_coord(1,c) - 1 + jp = cell_coord(2,c) - 1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* + > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) + +c--------------------------------------------------------------------- +c pack up buffer +c--------------------------------------------------------------------- + ptr = 0 + do j=0,JMAX-1 + do i=0,IMAX-1 + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + in_buffer(ptr+n) = lhsc(m,n,i,j,ksize,c) + enddo + ptr = ptr+BLOCK_SIZE + enddo + do n=1,BLOCK_SIZE + in_buffer(ptr+n) = rhs(n,i,j,ksize,c) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + +c--------------------------------------------------------------------- +c send buffer +c--------------------------------------------------------------------- + call mpi_isend(in_buffer, buffer_size, + > dp_type, successor(3), + > BOTTOM+ip+jp*NCELLS, comm_solve, + > send_id,error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine z_send_backsub_info(send_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c pack up and send U(jstart) for all i and j +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer i,j,n,ptr,c,kstart,ip,jp + integer error,send_id,buffer_size + +c--------------------------------------------------------------------- +c Send element 0 to previous processor +c--------------------------------------------------------------------- + kstart = 0 + ip = cell_coord(1,c)-1 + jp = cell_coord(2,c)-1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE + ptr = 0 + do j=0,JMAX-1 + do i=0,IMAX-1 + do n=1,BLOCK_SIZE + in_buffer(ptr+n) = rhs(n,i,j,kstart,c) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + + call mpi_isend(in_buffer, buffer_size, + > dp_type, predecessor(3), + > TOP+ip+jp*NCELLS, comm_solve, + > send_id,error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine z_unpack_backsub_info(c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c unpack U(ksize) for all i and j +c--------------------------------------------------------------------- + + include 'header.h' + + integer i,j,n,ptr,c + + ptr = 0 + do j=0,JMAX-1 + do i=0,IMAX-1 + do n=1,BLOCK_SIZE + backsub_info(n,i,j,c) = out_buffer(ptr+n) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine z_receive_backsub_info(recv_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c post mpi receives +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer error,recv_id,ip,jp,c,buffer_size + ip = cell_coord(1,c) - 1 + jp = cell_coord(2,c) - 1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE + call mpi_irecv(out_buffer, buffer_size, + > dp_type, successor(3), + > TOP+ip+jp*NCELLS, comm_solve, + > recv_id, error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine z_receive_solve_info(recv_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c post mpi receives +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer ip,jp,recv_id,error,c,buffer_size + ip = cell_coord(1,c) - 1 + jp = cell_coord(2,c) - 1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* + > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) + call mpi_irecv(out_buffer, buffer_size, + > dp_type, predecessor(3), + > BOTTOM+ip+jp*NCELLS, comm_solve, + > recv_id, error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine z_backsubstitute(first, last, c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c back solve: if last cell, then generate U(ksize)=rhs(ksize) +c else assume U(ksize) is loaded in un pack backsub_info +c so just use it +c after call u(kstart) will be sent to next cell +c--------------------------------------------------------------------- + + include 'header.h' + + integer first, last, c, i, k + integer m,n,j,jsize,isize,ksize,kstart + + kstart = 0 + isize = cell_size(1,c)-end(1,c)-1 + jsize = cell_size(2,c)-end(2,c)-1 + ksize = cell_size(3,c)-1 + if (last .eq. 0) then + do j=start(2,c),jsize + do i=start(1,c),isize +c--------------------------------------------------------------------- +c U(jsize) uses info from previous cell if not last cell +c--------------------------------------------------------------------- + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + rhs(m,i,j,ksize,c) = rhs(m,i,j,ksize,c) + > - lhsc(m,n,i,j,ksize,c)* + > backsub_info(n,i,j,c) + enddo + enddo + enddo + enddo + endif + do k=ksize-1,kstart,-1 + do j=start(2,c),jsize + do i=start(1,c),isize + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + rhs(m,i,j,k,c) = rhs(m,i,j,k,c) + > - lhsc(m,n,i,j,k,c)*rhs(n,i,j,k+1,c) + enddo + enddo + enddo + enddo + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine z_solve_cell(first,last,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c performs guaussian elimination on this cell. +c +c assumes that unpacking routines for non-first cells +c preload C' and rhs' from previous cell. +c +c assumed send happens outside this routine, but that +c c'(KMAX) and rhs'(KMAX) will be sent to next cell. +c--------------------------------------------------------------------- + + include 'header.h' + include 'work_lhs.h' + + integer first,last,c + integer i,j,k,isize,ksize,jsize,kstart + double precision utmp(6,-2:KMAX+1) + + kstart = 0 + isize = cell_size(1,c)-end(1,c)-1 + jsize = cell_size(2,c)-end(2,c)-1 + ksize = cell_size(3,c)-1 + + call lhsabinit(lhsa, lhsb, ksize) + + do j=start(2,c),jsize + do i=start(1,c),isize + +c--------------------------------------------------------------------- +c This function computes the left hand side for the three z-factors +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c Compute the indices for storing the block-diagonal matrix; +c determine c (labeled f) and s jacobians for cell c +c--------------------------------------------------------------------- + do k = start(3,c)-1, cell_size(3,c)-end(3,c) + utmp(1,k) = 1.0d0 / u(1,i,j,k,c) + utmp(2,k) = u(2,i,j,k,c) + utmp(3,k) = u(3,i,j,k,c) + utmp(4,k) = u(4,i,j,k,c) + utmp(5,k) = u(5,i,j,k,c) + utmp(6,k) = qs(i,j,k,c) + end do + + do k = start(3,c)-1, cell_size(3,c)-end(3,c) + + tmp1 = utmp(1,k) + tmp2 = tmp1 * tmp1 + tmp3 = tmp1 * tmp2 + + fjac(1,1,k) = 0.0d+00 + fjac(1,2,k) = 0.0d+00 + fjac(1,3,k) = 0.0d+00 + fjac(1,4,k) = 1.0d+00 + fjac(1,5,k) = 0.0d+00 + + fjac(2,1,k) = - ( utmp(2,k)*utmp(4,k) ) + > * tmp2 + fjac(2,2,k) = utmp(4,k) * tmp1 + fjac(2,3,k) = 0.0d+00 + fjac(2,4,k) = utmp(2,k) * tmp1 + fjac(2,5,k) = 0.0d+00 + + fjac(3,1,k) = - ( utmp(3,k)*utmp(4,k) ) + > * tmp2 + fjac(3,2,k) = 0.0d+00 + fjac(3,3,k) = utmp(4,k) * tmp1 + fjac(3,4,k) = utmp(3,k) * tmp1 + fjac(3,5,k) = 0.0d+00 + + fjac(4,1,k) = - (utmp(4,k)*utmp(4,k) * tmp2 ) + > + c2 * utmp(6,k) + fjac(4,2,k) = - c2 * utmp(2,k) * tmp1 + fjac(4,3,k) = - c2 * utmp(3,k) * tmp1 + fjac(4,4,k) = ( 2.0d+00 - c2 ) + > * utmp(4,k) * tmp1 + fjac(4,5,k) = c2 + + fjac(5,1,k) = ( c2 * 2.0d0 * utmp(6,k) + > - c1 * ( utmp(5,k) * tmp1 ) ) + > * ( utmp(4,k) * tmp1 ) + fjac(5,2,k) = - c2 * ( utmp(2,k)*utmp(4,k) ) + > * tmp2 + fjac(5,3,k) = - c2 * ( utmp(3,k)*utmp(4,k) ) + > * tmp2 + fjac(5,4,k) = c1 * ( utmp(5,k) * tmp1 ) + > - c2 * ( utmp(6,k) + > + utmp(4,k)*utmp(4,k) * tmp2 ) + fjac(5,5,k) = c1 * utmp(4,k) * tmp1 + + njac(1,1,k) = 0.0d+00 + njac(1,2,k) = 0.0d+00 + njac(1,3,k) = 0.0d+00 + njac(1,4,k) = 0.0d+00 + njac(1,5,k) = 0.0d+00 + + njac(2,1,k) = - c3c4 * tmp2 * utmp(2,k) + njac(2,2,k) = c3c4 * tmp1 + njac(2,3,k) = 0.0d+00 + njac(2,4,k) = 0.0d+00 + njac(2,5,k) = 0.0d+00 + + njac(3,1,k) = - c3c4 * tmp2 * utmp(3,k) + njac(3,2,k) = 0.0d+00 + njac(3,3,k) = c3c4 * tmp1 + njac(3,4,k) = 0.0d+00 + njac(3,5,k) = 0.0d+00 + + njac(4,1,k) = - con43 * c3c4 * tmp2 * utmp(4,k) + njac(4,2,k) = 0.0d+00 + njac(4,3,k) = 0.0d+00 + njac(4,4,k) = con43 * c3 * c4 * tmp1 + njac(4,5,k) = 0.0d+00 + + njac(5,1,k) = - ( c3c4 + > - c1345 ) * tmp3 * (utmp(2,k)**2) + > - ( c3c4 - c1345 ) * tmp3 * (utmp(3,k)**2) + > - ( con43 * c3c4 + > - c1345 ) * tmp3 * (utmp(4,k)**2) + > - c1345 * tmp2 * utmp(5,k) + + njac(5,2,k) = ( c3c4 - c1345 ) * tmp2 * utmp(2,k) + njac(5,3,k) = ( c3c4 - c1345 ) * tmp2 * utmp(3,k) + njac(5,4,k) = ( con43 * c3c4 + > - c1345 ) * tmp2 * utmp(4,k) + njac(5,5,k) = ( c1345 )* tmp1 + + + enddo + +c--------------------------------------------------------------------- +c now joacobians set, so form left hand side in z direction +c--------------------------------------------------------------------- + do k = start(3,c), ksize-end(3,c) + + tmp1 = dt * tz1 + tmp2 = dt * tz2 + + lhsa(1,1,k) = - tmp2 * fjac(1,1,k-1) + > - tmp1 * njac(1,1,k-1) + > - tmp1 * dz1 + lhsa(1,2,k) = - tmp2 * fjac(1,2,k-1) + > - tmp1 * njac(1,2,k-1) + lhsa(1,3,k) = - tmp2 * fjac(1,3,k-1) + > - tmp1 * njac(1,3,k-1) + lhsa(1,4,k) = - tmp2 * fjac(1,4,k-1) + > - tmp1 * njac(1,4,k-1) + lhsa(1,5,k) = - tmp2 * fjac(1,5,k-1) + > - tmp1 * njac(1,5,k-1) + + lhsa(2,1,k) = - tmp2 * fjac(2,1,k-1) + > - tmp1 * njac(2,1,k-1) + lhsa(2,2,k) = - tmp2 * fjac(2,2,k-1) + > - tmp1 * njac(2,2,k-1) + > - tmp1 * dz2 + lhsa(2,3,k) = - tmp2 * fjac(2,3,k-1) + > - tmp1 * njac(2,3,k-1) + lhsa(2,4,k) = - tmp2 * fjac(2,4,k-1) + > - tmp1 * njac(2,4,k-1) + lhsa(2,5,k) = - tmp2 * fjac(2,5,k-1) + > - tmp1 * njac(2,5,k-1) + + lhsa(3,1,k) = - tmp2 * fjac(3,1,k-1) + > - tmp1 * njac(3,1,k-1) + lhsa(3,2,k) = - tmp2 * fjac(3,2,k-1) + > - tmp1 * njac(3,2,k-1) + lhsa(3,3,k) = - tmp2 * fjac(3,3,k-1) + > - tmp1 * njac(3,3,k-1) + > - tmp1 * dz3 + lhsa(3,4,k) = - tmp2 * fjac(3,4,k-1) + > - tmp1 * njac(3,4,k-1) + lhsa(3,5,k) = - tmp2 * fjac(3,5,k-1) + > - tmp1 * njac(3,5,k-1) + + lhsa(4,1,k) = - tmp2 * fjac(4,1,k-1) + > - tmp1 * njac(4,1,k-1) + lhsa(4,2,k) = - tmp2 * fjac(4,2,k-1) + > - tmp1 * njac(4,2,k-1) + lhsa(4,3,k) = - tmp2 * fjac(4,3,k-1) + > - tmp1 * njac(4,3,k-1) + lhsa(4,4,k) = - tmp2 * fjac(4,4,k-1) + > - tmp1 * njac(4,4,k-1) + > - tmp1 * dz4 + lhsa(4,5,k) = - tmp2 * fjac(4,5,k-1) + > - tmp1 * njac(4,5,k-1) + + lhsa(5,1,k) = - tmp2 * fjac(5,1,k-1) + > - tmp1 * njac(5,1,k-1) + lhsa(5,2,k) = - tmp2 * fjac(5,2,k-1) + > - tmp1 * njac(5,2,k-1) + lhsa(5,3,k) = - tmp2 * fjac(5,3,k-1) + > - tmp1 * njac(5,3,k-1) + lhsa(5,4,k) = - tmp2 * fjac(5,4,k-1) + > - tmp1 * njac(5,4,k-1) + lhsa(5,5,k) = - tmp2 * fjac(5,5,k-1) + > - tmp1 * njac(5,5,k-1) + > - tmp1 * dz5 + + lhsb(1,1,k) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(1,1,k) + > + tmp1 * 2.0d+00 * dz1 + lhsb(1,2,k) = tmp1 * 2.0d+00 * njac(1,2,k) + lhsb(1,3,k) = tmp1 * 2.0d+00 * njac(1,3,k) + lhsb(1,4,k) = tmp1 * 2.0d+00 * njac(1,4,k) + lhsb(1,5,k) = tmp1 * 2.0d+00 * njac(1,5,k) + + lhsb(2,1,k) = tmp1 * 2.0d+00 * njac(2,1,k) + lhsb(2,2,k) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(2,2,k) + > + tmp1 * 2.0d+00 * dz2 + lhsb(2,3,k) = tmp1 * 2.0d+00 * njac(2,3,k) + lhsb(2,4,k) = tmp1 * 2.0d+00 * njac(2,4,k) + lhsb(2,5,k) = tmp1 * 2.0d+00 * njac(2,5,k) + + lhsb(3,1,k) = tmp1 * 2.0d+00 * njac(3,1,k) + lhsb(3,2,k) = tmp1 * 2.0d+00 * njac(3,2,k) + lhsb(3,3,k) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(3,3,k) + > + tmp1 * 2.0d+00 * dz3 + lhsb(3,4,k) = tmp1 * 2.0d+00 * njac(3,4,k) + lhsb(3,5,k) = tmp1 * 2.0d+00 * njac(3,5,k) + + lhsb(4,1,k) = tmp1 * 2.0d+00 * njac(4,1,k) + lhsb(4,2,k) = tmp1 * 2.0d+00 * njac(4,2,k) + lhsb(4,3,k) = tmp1 * 2.0d+00 * njac(4,3,k) + lhsb(4,4,k) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(4,4,k) + > + tmp1 * 2.0d+00 * dz4 + lhsb(4,5,k) = tmp1 * 2.0d+00 * njac(4,5,k) + + lhsb(5,1,k) = tmp1 * 2.0d+00 * njac(5,1,k) + lhsb(5,2,k) = tmp1 * 2.0d+00 * njac(5,2,k) + lhsb(5,3,k) = tmp1 * 2.0d+00 * njac(5,3,k) + lhsb(5,4,k) = tmp1 * 2.0d+00 * njac(5,4,k) + lhsb(5,5,k) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(5,5,k) + > + tmp1 * 2.0d+00 * dz5 + + lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,k+1) + > - tmp1 * njac(1,1,k+1) + > - tmp1 * dz1 + lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,k+1) + > - tmp1 * njac(1,2,k+1) + lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,k+1) + > - tmp1 * njac(1,3,k+1) + lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,k+1) + > - tmp1 * njac(1,4,k+1) + lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,k+1) + > - tmp1 * njac(1,5,k+1) + + lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,k+1) + > - tmp1 * njac(2,1,k+1) + lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,k+1) + > - tmp1 * njac(2,2,k+1) + > - tmp1 * dz2 + lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,k+1) + > - tmp1 * njac(2,3,k+1) + lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,k+1) + > - tmp1 * njac(2,4,k+1) + lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,k+1) + > - tmp1 * njac(2,5,k+1) + + lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,k+1) + > - tmp1 * njac(3,1,k+1) + lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,k+1) + > - tmp1 * njac(3,2,k+1) + lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,k+1) + > - tmp1 * njac(3,3,k+1) + > - tmp1 * dz3 + lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,k+1) + > - tmp1 * njac(3,4,k+1) + lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,k+1) + > - tmp1 * njac(3,5,k+1) + + lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,k+1) + > - tmp1 * njac(4,1,k+1) + lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,k+1) + > - tmp1 * njac(4,2,k+1) + lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,k+1) + > - tmp1 * njac(4,3,k+1) + lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,k+1) + > - tmp1 * njac(4,4,k+1) + > - tmp1 * dz4 + lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,k+1) + > - tmp1 * njac(4,5,k+1) + + lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,k+1) + > - tmp1 * njac(5,1,k+1) + lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,k+1) + > - tmp1 * njac(5,2,k+1) + lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,k+1) + > - tmp1 * njac(5,3,k+1) + lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,k+1) + > - tmp1 * njac(5,4,k+1) + lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,k+1) + > - tmp1 * njac(5,5,k+1) + > - tmp1 * dz5 + + enddo + + +c--------------------------------------------------------------------- +c outer most do loops - sweeping in i direction +c--------------------------------------------------------------------- + if (first .eq. 1) then + +c--------------------------------------------------------------------- +c multiply c(i,j,kstart) by b_inverse and copy back to c +c multiply rhs(kstart) by b_inverse(kstart) and copy to rhs +c--------------------------------------------------------------------- + call binvcrhs( lhsb(1,1,kstart), + > lhsc(1,1,i,j,kstart,c), + > rhs(1,i,j,kstart,c) ) + + endif + +c--------------------------------------------------------------------- +c begin inner most do loop +c do all the elements of the cell unless last +c--------------------------------------------------------------------- + do k=kstart+first,ksize-last + +c--------------------------------------------------------------------- +c subtract A*lhs_vector(k-1) from lhs_vector(k) +c +c rhs(k) = rhs(k) - A*rhs(k-1) +c--------------------------------------------------------------------- + call matvec_sub(lhsa(1,1,k), + > rhs(1,i,j,k-1,c),rhs(1,i,j,k,c)) + +c--------------------------------------------------------------------- +c B(k) = B(k) - C(k-1)*A(k) +c call matmul_sub(aa,i,j,k,c,cc,i,j,k-1,c,bb,i,j,k,c) +c--------------------------------------------------------------------- + call matmul_sub(lhsa(1,1,k), + > lhsc(1,1,i,j,k-1,c), + > lhsb(1,1,k)) + +c--------------------------------------------------------------------- +c multiply c(i,j,k) by b_inverse and copy back to c +c multiply rhs(i,j,1) by b_inverse(i,j,1) and copy to rhs +c--------------------------------------------------------------------- + call binvcrhs( lhsb(1,1,k), + > lhsc(1,1,i,j,k,c), + > rhs(1,i,j,k,c) ) + + enddo + +c--------------------------------------------------------------------- +c Now finish up special cases for last cell +c--------------------------------------------------------------------- + if (last .eq. 1) then + +c--------------------------------------------------------------------- +c rhs(ksize) = rhs(ksize) - A*rhs(ksize-1) +c--------------------------------------------------------------------- + call matvec_sub(lhsa(1,1,ksize), + > rhs(1,i,j,ksize-1,c),rhs(1,i,j,ksize,c)) + +c--------------------------------------------------------------------- +c B(ksize) = B(ksize) - C(ksize-1)*A(ksize) +c call matmul_sub(aa,i,j,ksize,c, +c $ cc,i,j,ksize-1,c,bb,i,j,ksize,c) +c--------------------------------------------------------------------- + call matmul_sub(lhsa(1,1,ksize), + > lhsc(1,1,i,j,ksize-1,c), + > lhsb(1,1,ksize)) + +c--------------------------------------------------------------------- +c multiply rhs(ksize) by b_inverse(ksize) and copy to rhs +c--------------------------------------------------------------------- + call binvrhs( lhsb(1,1,ksize), + > rhs(1,i,j,ksize,c) ) + + endif + enddo + enddo + + + return + end + + + + + + diff --git a/examples/smpi/NAS/BT/z_solve_vec.f b/examples/smpi/NAS/BT/z_solve_vec.f new file mode 100644 index 0000000000..2c27fb00d1 --- /dev/null +++ b/examples/smpi/NAS/BT/z_solve_vec.f @@ -0,0 +1,793 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine z_solve + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c Performs line solves in Z direction by first factoring +c the block-tridiagonal matrix into an upper triangular matrix, +c and then performing back substitution to solve for the unknow +c vectors of each line. +c +c Make sure we treat elements zero to cell_size in the direction +c of the sweep. +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer c, kstart, stage, + > first, last, recv_id, error, r_status(MPI_STATUS_SIZE), + > isize,jsize,ksize,send_id + + kstart = 0 + +c--------------------------------------------------------------------- +c in our terminology stage is the number of the cell in the y-direct +c i.e. stage = 1 means the start of the line stage=ncells means end +c--------------------------------------------------------------------- + do stage = 1,ncells + c = slice(3,stage) + isize = cell_size(1,c) - 1 + jsize = cell_size(2,c) - 1 + ksize = cell_size(3,c) - 1 +c--------------------------------------------------------------------- +c set last-cell flag +c--------------------------------------------------------------------- + if (stage .eq. ncells) then + last = 1 + else + last = 0 + endif + + if (stage .eq. 1) then +c--------------------------------------------------------------------- +c This is the first cell, so solve without receiving data +c--------------------------------------------------------------------- + first = 1 +c call lhsz(c) + call z_solve_cell(first,last,c) + else +c--------------------------------------------------------------------- +c Not the first cell of this line, so receive info from +c processor working on preceeding cell +c--------------------------------------------------------------------- + first = 0 + call z_receive_solve_info(recv_id,c) +c--------------------------------------------------------------------- +c overlap computations and communications +c--------------------------------------------------------------------- +c call lhsz(c) +c--------------------------------------------------------------------- +c wait for completion +c--------------------------------------------------------------------- + call mpi_wait(send_id,r_status,error) + call mpi_wait(recv_id,r_status,error) +c--------------------------------------------------------------------- +c install C'(kstart+1) and rhs'(kstart+1) to be used in this cell +c--------------------------------------------------------------------- + call z_unpack_solve_info(c) + call z_solve_cell(first,last,c) + endif + + if (last .eq. 0) call z_send_solve_info(send_id,c) + enddo + +c--------------------------------------------------------------------- +c now perform backsubstitution in reverse direction +c--------------------------------------------------------------------- + do stage = ncells, 1, -1 + c = slice(3,stage) + first = 0 + last = 0 + if (stage .eq. 1) first = 1 + if (stage .eq. ncells) then + last = 1 +c--------------------------------------------------------------------- +c last cell, so perform back substitute without waiting +c--------------------------------------------------------------------- + call z_backsubstitute(first, last,c) + else + call z_receive_backsub_info(recv_id,c) + call mpi_wait(send_id,r_status,error) + call mpi_wait(recv_id,r_status,error) + call z_unpack_backsub_info(c) + call z_backsubstitute(first,last,c) + endif + if (first .eq. 0) call z_send_backsub_info(send_id,c) + enddo + + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine z_unpack_solve_info(c) +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c unpack C'(-1) and rhs'(-1) for +c all i and j +c--------------------------------------------------------------------- + + include 'header.h' + + integer i,j,m,n,ptr,c,kstart + + kstart = 0 + ptr = 0 + do j=0,JMAX-1 + do i=0,IMAX-1 + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + lhsc(m,n,i,j,kstart-1,c) = out_buffer(ptr+n) + enddo + ptr = ptr+BLOCK_SIZE + enddo + do n=1,BLOCK_SIZE + rhs(n,i,j,kstart-1,c) = out_buffer(ptr+n) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine z_send_solve_info(send_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c pack up and send C'(kend) and rhs'(kend) for +c all i and j +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer i,j,m,n,ksize,ptr,c,ip,jp + integer error,send_id,buffer_size + + ksize = cell_size(3,c)-1 + ip = cell_coord(1,c) - 1 + jp = cell_coord(2,c) - 1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* + > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) + +c--------------------------------------------------------------------- +c pack up buffer +c--------------------------------------------------------------------- + ptr = 0 + do j=0,JMAX-1 + do i=0,IMAX-1 + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + in_buffer(ptr+n) = lhsc(m,n,i,j,ksize,c) + enddo + ptr = ptr+BLOCK_SIZE + enddo + do n=1,BLOCK_SIZE + in_buffer(ptr+n) = rhs(n,i,j,ksize,c) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + +c--------------------------------------------------------------------- +c send buffer +c--------------------------------------------------------------------- + call mpi_isend(in_buffer, buffer_size, + > dp_type, successor(3), + > BOTTOM+ip+jp*NCELLS, comm_solve, + > send_id,error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine z_send_backsub_info(send_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c pack up and send U(jstart) for all i and j +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer i,j,n,ptr,c,kstart,ip,jp + integer error,send_id,buffer_size + +c--------------------------------------------------------------------- +c Send element 0 to previous processor +c--------------------------------------------------------------------- + kstart = 0 + ip = cell_coord(1,c)-1 + jp = cell_coord(2,c)-1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE + ptr = 0 + do j=0,JMAX-1 + do i=0,IMAX-1 + do n=1,BLOCK_SIZE + in_buffer(ptr+n) = rhs(n,i,j,kstart,c) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + + call mpi_isend(in_buffer, buffer_size, + > dp_type, predecessor(3), + > TOP+ip+jp*NCELLS, comm_solve, + > send_id,error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine z_unpack_backsub_info(c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c unpack U(ksize) for all i and j +c--------------------------------------------------------------------- + + include 'header.h' + + integer i,j,n,ptr,c + + ptr = 0 + do j=0,JMAX-1 + do i=0,IMAX-1 + do n=1,BLOCK_SIZE + backsub_info(n,i,j,c) = out_buffer(ptr+n) + enddo + ptr = ptr+BLOCK_SIZE + enddo + enddo + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine z_receive_backsub_info(recv_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c post mpi receives +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer error,recv_id,ip,jp,c,buffer_size + ip = cell_coord(1,c) - 1 + jp = cell_coord(2,c) - 1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE + call mpi_irecv(out_buffer, buffer_size, + > dp_type, successor(3), + > TOP+ip+jp*NCELLS, comm_solve, + > recv_id, error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine z_receive_solve_info(recv_id,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c post mpi receives +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer ip,jp,recv_id,error,c,buffer_size + ip = cell_coord(1,c) - 1 + jp = cell_coord(2,c) - 1 + buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* + > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) + call mpi_irecv(out_buffer, buffer_size, + > dp_type, predecessor(3), + > BOTTOM+ip+jp*NCELLS, comm_solve, + > recv_id, error) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine z_backsubstitute(first, last, c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c back solve: if last cell, then generate U(ksize)=rhs(ksize) +c else assume U(ksize) is loaded in un pack backsub_info +c so just use it +c after call u(kstart) will be sent to next cell +c--------------------------------------------------------------------- + + include 'header.h' + + integer first, last, c, i, k + integer m,n,j,jsize,isize,ksize,kstart + + kstart = 0 + isize = cell_size(1,c)-end(1,c)-1 + jsize = cell_size(2,c)-end(2,c)-1 + ksize = cell_size(3,c)-1 + if (last .eq. 0) then + do j=start(2,c),jsize + do i=start(1,c),isize +c--------------------------------------------------------------------- +c U(jsize) uses info from previous cell if not last cell +c--------------------------------------------------------------------- + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + rhs(m,i,j,ksize,c) = rhs(m,i,j,ksize,c) + > - lhsc(m,n,i,j,ksize,c)* + > backsub_info(n,i,j,c) + enddo + enddo + enddo + enddo + endif + do k=ksize-1,kstart,-1 + do j=start(2,c),jsize + do i=start(1,c),isize + do m=1,BLOCK_SIZE + do n=1,BLOCK_SIZE + rhs(m,i,j,k,c) = rhs(m,i,j,k,c) + > - lhsc(m,n,i,j,k,c)*rhs(n,i,j,k+1,c) + enddo + enddo + enddo + enddo + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine z_solve_cell(first,last,c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c performs guaussian elimination on this cell. +c +c assumes that unpacking routines for non-first cells +c preload C' and rhs' from previous cell. +c +c assumed send happens outside this routine, but that +c c'(KMAX) and rhs'(KMAX) will be sent to next cell. +c--------------------------------------------------------------------- + + include 'header.h' + include 'work_lhs_vec.h' + + integer first,last,c + integer i,j,k,m,n,isize,ksize,jsize,kstart + + kstart = 0 + isize = cell_size(1,c)-end(1,c)-1 + jsize = cell_size(2,c)-end(2,c)-1 + ksize = cell_size(3,c)-1 + +c--------------------------------------------------------------------- +c zero the left hand side for starters +c set diagonal values to 1. This is overkill, but convenient +c--------------------------------------------------------------------- + do i = 0, isize + do m = 1, 5 + do n = 1, 5 + lhsa(m,n,i,0) = 0.0d0 + lhsb(m,n,i,0) = 0.0d0 + lhsa(m,n,i,ksize) = 0.0d0 + lhsb(m,n,i,ksize) = 0.0d0 + enddo + lhsb(m,m,i,0) = 1.0d0 + lhsb(m,m,i,ksize) = 1.0d0 + enddo + enddo + + do j=start(2,c),jsize + +c--------------------------------------------------------------------- +c This function computes the left hand side for the three z-factors +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c Compute the indices for storing the block-diagonal matrix; +c determine c (labeled f) and s jacobians for cell c +c--------------------------------------------------------------------- + + do k = start(3,c)-1, cell_size(3,c)-end(3,c) + do i=start(1,c),isize + + tmp1 = 1.0d0 / u(1,i,j,k,c) + tmp2 = tmp1 * tmp1 + tmp3 = tmp1 * tmp2 + + fjac(1,1,i,k) = 0.0d+00 + fjac(1,2,i,k) = 0.0d+00 + fjac(1,3,i,k) = 0.0d+00 + fjac(1,4,i,k) = 1.0d+00 + fjac(1,5,i,k) = 0.0d+00 + + fjac(2,1,i,k) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) + > * tmp2 + fjac(2,2,i,k) = u(4,i,j,k,c) * tmp1 + fjac(2,3,i,k) = 0.0d+00 + fjac(2,4,i,k) = u(2,i,j,k,c) * tmp1 + fjac(2,5,i,k) = 0.0d+00 + + fjac(3,1,i,k) = - ( u(3,i,j,k,c)*u(4,i,j,k,c) ) + > * tmp2 + fjac(3,2,i,k) = 0.0d+00 + fjac(3,3,i,k) = u(4,i,j,k,c) * tmp1 + fjac(3,4,i,k) = u(3,i,j,k,c) * tmp1 + fjac(3,5,i,k) = 0.0d+00 + + fjac(4,1,i,k) = - (u(4,i,j,k,c)*u(4,i,j,k,c) * tmp2 ) + > + c2 * qs(i,j,k,c) + fjac(4,2,i,k) = - c2 * u(2,i,j,k,c) * tmp1 + fjac(4,3,i,k) = - c2 * u(3,i,j,k,c) * tmp1 + fjac(4,4,i,k) = ( 2.0d+00 - c2 ) + > * u(4,i,j,k,c) * tmp1 + fjac(4,5,i,k) = c2 + + fjac(5,1,i,k) = ( c2 * 2.0d0 * qs(i,j,k,c) + > - c1 * ( u(5,i,j,k,c) * tmp1 ) ) + > * ( u(4,i,j,k,c) * tmp1 ) + fjac(5,2,i,k) = - c2 * ( u(2,i,j,k,c)*u(4,i,j,k,c) ) + > * tmp2 + fjac(5,3,i,k) = - c2 * ( u(3,i,j,k,c)*u(4,i,j,k,c) ) + > * tmp2 + fjac(5,4,i,k) = c1 * ( u(5,i,j,k,c) * tmp1 ) + > - c2 * ( qs(i,j,k,c) + > + u(4,i,j,k,c)*u(4,i,j,k,c) * tmp2 ) + fjac(5,5,i,k) = c1 * u(4,i,j,k,c) * tmp1 + + njac(1,1,i,k) = 0.0d+00 + njac(1,2,i,k) = 0.0d+00 + njac(1,3,i,k) = 0.0d+00 + njac(1,4,i,k) = 0.0d+00 + njac(1,5,i,k) = 0.0d+00 + + njac(2,1,i,k) = - c3c4 * tmp2 * u(2,i,j,k,c) + njac(2,2,i,k) = c3c4 * tmp1 + njac(2,3,i,k) = 0.0d+00 + njac(2,4,i,k) = 0.0d+00 + njac(2,5,i,k) = 0.0d+00 + + njac(3,1,i,k) = - c3c4 * tmp2 * u(3,i,j,k,c) + njac(3,2,i,k) = 0.0d+00 + njac(3,3,i,k) = c3c4 * tmp1 + njac(3,4,i,k) = 0.0d+00 + njac(3,5,i,k) = 0.0d+00 + + njac(4,1,i,k) = - con43 * c3c4 * tmp2 * u(4,i,j,k,c) + njac(4,2,i,k) = 0.0d+00 + njac(4,3,i,k) = 0.0d+00 + njac(4,4,i,k) = con43 * c3 * c4 * tmp1 + njac(4,5,i,k) = 0.0d+00 + + njac(5,1,i,k) = - ( c3c4 + > - c1345 ) * tmp3 * (u(2,i,j,k,c)**2) + > - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2) + > - ( con43 * c3c4 + > - c1345 ) * tmp3 * (u(4,i,j,k,c)**2) + > - c1345 * tmp2 * u(5,i,j,k,c) + + njac(5,2,i,k) = ( c3c4 - c1345 ) * tmp2 * u(2,i,j,k,c) + njac(5,3,i,k) = ( c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c) + njac(5,4,i,k) = ( con43 * c3c4 + > - c1345 ) * tmp2 * u(4,i,j,k,c) + njac(5,5,i,k) = ( c1345 )* tmp1 + + + enddo + enddo + +c--------------------------------------------------------------------- +c now joacobians set, so form left hand side in z direction +c--------------------------------------------------------------------- + do k = start(3,c), ksize-end(3,c) + do i=start(1,c),isize + + tmp1 = dt * tz1 + tmp2 = dt * tz2 + + lhsa(1,1,i,k) = - tmp2 * fjac(1,1,i,k-1) + > - tmp1 * njac(1,1,i,k-1) + > - tmp1 * dz1 + lhsa(1,2,i,k) = - tmp2 * fjac(1,2,i,k-1) + > - tmp1 * njac(1,2,i,k-1) + lhsa(1,3,i,k) = - tmp2 * fjac(1,3,i,k-1) + > - tmp1 * njac(1,3,i,k-1) + lhsa(1,4,i,k) = - tmp2 * fjac(1,4,i,k-1) + > - tmp1 * njac(1,4,i,k-1) + lhsa(1,5,i,k) = - tmp2 * fjac(1,5,i,k-1) + > - tmp1 * njac(1,5,i,k-1) + + lhsa(2,1,i,k) = - tmp2 * fjac(2,1,i,k-1) + > - tmp1 * njac(2,1,i,k-1) + lhsa(2,2,i,k) = - tmp2 * fjac(2,2,i,k-1) + > - tmp1 * njac(2,2,i,k-1) + > - tmp1 * dz2 + lhsa(2,3,i,k) = - tmp2 * fjac(2,3,i,k-1) + > - tmp1 * njac(2,3,i,k-1) + lhsa(2,4,i,k) = - tmp2 * fjac(2,4,i,k-1) + > - tmp1 * njac(2,4,i,k-1) + lhsa(2,5,i,k) = - tmp2 * fjac(2,5,i,k-1) + > - tmp1 * njac(2,5,i,k-1) + + lhsa(3,1,i,k) = - tmp2 * fjac(3,1,i,k-1) + > - tmp1 * njac(3,1,i,k-1) + lhsa(3,2,i,k) = - tmp2 * fjac(3,2,i,k-1) + > - tmp1 * njac(3,2,i,k-1) + lhsa(3,3,i,k) = - tmp2 * fjac(3,3,i,k-1) + > - tmp1 * njac(3,3,i,k-1) + > - tmp1 * dz3 + lhsa(3,4,i,k) = - tmp2 * fjac(3,4,i,k-1) + > - tmp1 * njac(3,4,i,k-1) + lhsa(3,5,i,k) = - tmp2 * fjac(3,5,i,k-1) + > - tmp1 * njac(3,5,i,k-1) + + lhsa(4,1,i,k) = - tmp2 * fjac(4,1,i,k-1) + > - tmp1 * njac(4,1,i,k-1) + lhsa(4,2,i,k) = - tmp2 * fjac(4,2,i,k-1) + > - tmp1 * njac(4,2,i,k-1) + lhsa(4,3,i,k) = - tmp2 * fjac(4,3,i,k-1) + > - tmp1 * njac(4,3,i,k-1) + lhsa(4,4,i,k) = - tmp2 * fjac(4,4,i,k-1) + > - tmp1 * njac(4,4,i,k-1) + > - tmp1 * dz4 + lhsa(4,5,i,k) = - tmp2 * fjac(4,5,i,k-1) + > - tmp1 * njac(4,5,i,k-1) + + lhsa(5,1,i,k) = - tmp2 * fjac(5,1,i,k-1) + > - tmp1 * njac(5,1,i,k-1) + lhsa(5,2,i,k) = - tmp2 * fjac(5,2,i,k-1) + > - tmp1 * njac(5,2,i,k-1) + lhsa(5,3,i,k) = - tmp2 * fjac(5,3,i,k-1) + > - tmp1 * njac(5,3,i,k-1) + lhsa(5,4,i,k) = - tmp2 * fjac(5,4,i,k-1) + > - tmp1 * njac(5,4,i,k-1) + lhsa(5,5,i,k) = - tmp2 * fjac(5,5,i,k-1) + > - tmp1 * njac(5,5,i,k-1) + > - tmp1 * dz5 + + lhsb(1,1,i,k) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(1,1,i,k) + > + tmp1 * 2.0d+00 * dz1 + lhsb(1,2,i,k) = tmp1 * 2.0d+00 * njac(1,2,i,k) + lhsb(1,3,i,k) = tmp1 * 2.0d+00 * njac(1,3,i,k) + lhsb(1,4,i,k) = tmp1 * 2.0d+00 * njac(1,4,i,k) + lhsb(1,5,i,k) = tmp1 * 2.0d+00 * njac(1,5,i,k) + + lhsb(2,1,i,k) = tmp1 * 2.0d+00 * njac(2,1,i,k) + lhsb(2,2,i,k) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(2,2,i,k) + > + tmp1 * 2.0d+00 * dz2 + lhsb(2,3,i,k) = tmp1 * 2.0d+00 * njac(2,3,i,k) + lhsb(2,4,i,k) = tmp1 * 2.0d+00 * njac(2,4,i,k) + lhsb(2,5,i,k) = tmp1 * 2.0d+00 * njac(2,5,i,k) + + lhsb(3,1,i,k) = tmp1 * 2.0d+00 * njac(3,1,i,k) + lhsb(3,2,i,k) = tmp1 * 2.0d+00 * njac(3,2,i,k) + lhsb(3,3,i,k) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(3,3,i,k) + > + tmp1 * 2.0d+00 * dz3 + lhsb(3,4,i,k) = tmp1 * 2.0d+00 * njac(3,4,i,k) + lhsb(3,5,i,k) = tmp1 * 2.0d+00 * njac(3,5,i,k) + + lhsb(4,1,i,k) = tmp1 * 2.0d+00 * njac(4,1,i,k) + lhsb(4,2,i,k) = tmp1 * 2.0d+00 * njac(4,2,i,k) + lhsb(4,3,i,k) = tmp1 * 2.0d+00 * njac(4,3,i,k) + lhsb(4,4,i,k) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(4,4,i,k) + > + tmp1 * 2.0d+00 * dz4 + lhsb(4,5,i,k) = tmp1 * 2.0d+00 * njac(4,5,i,k) + + lhsb(5,1,i,k) = tmp1 * 2.0d+00 * njac(5,1,i,k) + lhsb(5,2,i,k) = tmp1 * 2.0d+00 * njac(5,2,i,k) + lhsb(5,3,i,k) = tmp1 * 2.0d+00 * njac(5,3,i,k) + lhsb(5,4,i,k) = tmp1 * 2.0d+00 * njac(5,4,i,k) + lhsb(5,5,i,k) = 1.0d+00 + > + tmp1 * 2.0d+00 * njac(5,5,i,k) + > + tmp1 * 2.0d+00 * dz5 + + lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,i,k+1) + > - tmp1 * njac(1,1,i,k+1) + > - tmp1 * dz1 + lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,i,k+1) + > - tmp1 * njac(1,2,i,k+1) + lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,i,k+1) + > - tmp1 * njac(1,3,i,k+1) + lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,i,k+1) + > - tmp1 * njac(1,4,i,k+1) + lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,i,k+1) + > - tmp1 * njac(1,5,i,k+1) + + lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,i,k+1) + > - tmp1 * njac(2,1,i,k+1) + lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,i,k+1) + > - tmp1 * njac(2,2,i,k+1) + > - tmp1 * dz2 + lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,i,k+1) + > - tmp1 * njac(2,3,i,k+1) + lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,i,k+1) + > - tmp1 * njac(2,4,i,k+1) + lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,i,k+1) + > - tmp1 * njac(2,5,i,k+1) + + lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,i,k+1) + > - tmp1 * njac(3,1,i,k+1) + lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,i,k+1) + > - tmp1 * njac(3,2,i,k+1) + lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,i,k+1) + > - tmp1 * njac(3,3,i,k+1) + > - tmp1 * dz3 + lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,i,k+1) + > - tmp1 * njac(3,4,i,k+1) + lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,i,k+1) + > - tmp1 * njac(3,5,i,k+1) + + lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,i,k+1) + > - tmp1 * njac(4,1,i,k+1) + lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,i,k+1) + > - tmp1 * njac(4,2,i,k+1) + lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,i,k+1) + > - tmp1 * njac(4,3,i,k+1) + lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,i,k+1) + > - tmp1 * njac(4,4,i,k+1) + > - tmp1 * dz4 + lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,i,k+1) + > - tmp1 * njac(4,5,i,k+1) + + lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,i,k+1) + > - tmp1 * njac(5,1,i,k+1) + lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,i,k+1) + > - tmp1 * njac(5,2,i,k+1) + lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,i,k+1) + > - tmp1 * njac(5,3,i,k+1) + lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,i,k+1) + > - tmp1 * njac(5,4,i,k+1) + lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,i,k+1) + > - tmp1 * njac(5,5,i,k+1) + > - tmp1 * dz5 + + enddo + enddo + + +c--------------------------------------------------------------------- +c outer most do loops - sweeping in i direction +c--------------------------------------------------------------------- + if (first .eq. 1) then + +c--------------------------------------------------------------------- +c multiply c(i,j,kstart) by b_inverse and copy back to c +c multiply rhs(kstart) by b_inverse(kstart) and copy to rhs +c--------------------------------------------------------------------- +!dir$ ivdep + do i=start(1,c),isize + call binvcrhs( lhsb(1,1,i,kstart), + > lhsc(1,1,i,j,kstart,c), + > rhs(1,i,j,kstart,c) ) + enddo + + endif + +c--------------------------------------------------------------------- +c begin inner most do loop +c do all the elements of the cell unless last +c--------------------------------------------------------------------- + do k=kstart+first,ksize-last +!dir$ ivdep + do i=start(1,c),isize + +c--------------------------------------------------------------------- +c subtract A*lhs_vector(k-1) from lhs_vector(k) +c +c rhs(k) = rhs(k) - A*rhs(k-1) +c--------------------------------------------------------------------- + call matvec_sub(lhsa(1,1,i,k), + > rhs(1,i,j,k-1,c),rhs(1,i,j,k,c)) + +c--------------------------------------------------------------------- +c B(k) = B(k) - C(k-1)*A(k) +c call matmul_sub(aa,i,j,k,c,cc,i,j,k-1,c,bb,i,j,k,c) +c--------------------------------------------------------------------- + call matmul_sub(lhsa(1,1,i,k), + > lhsc(1,1,i,j,k-1,c), + > lhsb(1,1,i,k)) + +c--------------------------------------------------------------------- +c multiply c(i,j,k) by b_inverse and copy back to c +c multiply rhs(i,j,1) by b_inverse(i,j,1) and copy to rhs +c--------------------------------------------------------------------- + call binvcrhs( lhsb(1,1,i,k), + > lhsc(1,1,i,j,k,c), + > rhs(1,i,j,k,c) ) + + enddo + enddo + +c--------------------------------------------------------------------- +c Now finish up special cases for last cell +c--------------------------------------------------------------------- + if (last .eq. 1) then + +!dir$ ivdep + do i=start(1,c),isize +c--------------------------------------------------------------------- +c rhs(ksize) = rhs(ksize) - A*rhs(ksize-1) +c--------------------------------------------------------------------- + call matvec_sub(lhsa(1,1,i,ksize), + > rhs(1,i,j,ksize-1,c),rhs(1,i,j,ksize,c)) + +c--------------------------------------------------------------------- +c B(ksize) = B(ksize) - C(ksize-1)*A(ksize) +c call matmul_sub(aa,i,j,ksize,c, +c $ cc,i,j,ksize-1,c,bb,i,j,ksize,c) +c--------------------------------------------------------------------- + call matmul_sub(lhsa(1,1,i,ksize), + > lhsc(1,1,i,j,ksize-1,c), + > lhsb(1,1,i,ksize)) + +c--------------------------------------------------------------------- +c multiply rhs(ksize) by b_inverse(ksize) and copy to rhs +c--------------------------------------------------------------------- + call binvrhs( lhsb(1,1,i,ksize), + > rhs(1,i,j,ksize,c) ) + enddo + + endif + enddo + + + return + end + + + + + + diff --git a/examples/smpi/NAS/CG/Makefile b/examples/smpi/NAS/CG/Makefile new file mode 100644 index 0000000000..33e52c697b --- /dev/null +++ b/examples/smpi/NAS/CG/Makefile @@ -0,0 +1,23 @@ +SHELL=/bin/sh +BENCHMARK=cg +BENCHMARKU=CG + +include ../config/make.def + +OBJS = cg.o ${COMMON}/print_results.o \ + ${COMMON}/${RAND}.o ${COMMON}/timers.o + +include ../sys/make.common + +${PROGRAM}: config ${OBJS} + ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB} + +cg.o: cg.f mpinpb.h npbparams.h + ${FCOMPILE} cg.f + +clean: + - rm -f *.o *~ + - rm -f npbparams.h core + + + diff --git a/examples/smpi/NAS/CG/cg.f b/examples/smpi/NAS/CG/cg.f new file mode 100644 index 0000000000..0d425d78a4 --- /dev/null +++ b/examples/smpi/NAS/CG/cg.f @@ -0,0 +1,1787 @@ +!-------------------------------------------------------------------------! +! ! +! N A S P A R A L L E L B E N C H M A R K S 3.3 ! +! ! +! C G ! +! ! +!-------------------------------------------------------------------------! +! ! +! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. ! +! It is described in NAS Technical Reports 95-020 and 02-007 ! +! ! +! Permission to use, copy, distribute and modify this software ! +! for any purpose with or without fee is hereby granted. We ! +! request, however, that all derived work reference the NAS ! +! Parallel Benchmarks 3.3. This software is provided "as is" ! +! without express or implied warranty. ! +! ! +! Information on NPB 3.3, including the technical report, the ! +! original specifications, source code, results and information ! +! on how to submit new results, is available at: ! +! ! +! http://www.nas.nasa.gov/Software/NPB/ ! +! ! +! Send comments or suggestions to npb@nas.nasa.gov ! +! ! +! NAS Parallel Benchmarks Group ! +! NASA Ames Research Center ! +! Mail Stop: T27A-1 ! +! Moffett Field, CA 94035-1000 ! +! ! +! E-mail: npb@nas.nasa.gov ! +! Fax: (650) 604-3957 ! +! ! +!-------------------------------------------------------------------------! + + +c--------------------------------------------------------------------- +c +c Authors: M. Yarrow +c C. Kuszmaul +c R. F. Van der Wijngaart +c H. Jin +c +c--------------------------------------------------------------------- + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + program cg +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + + implicit none + + include 'mpinpb.h' + integer status(MPI_STATUS_SIZE), request, ierr + + include 'npbparams.h' + +c--------------------------------------------------------------------- +c num_procs must be a power of 2, and num_procs=num_proc_cols*num_proc_rows. +c num_proc_cols and num_proc_cols are to be found in npbparams.h. +c When num_procs is not square, then num_proc_cols must be = 2*num_proc_rows. +c--------------------------------------------------------------------- + integer num_procs + parameter( num_procs = num_proc_cols * num_proc_rows ) + + + +c--------------------------------------------------------------------- +c Class specific parameters: +c It appears here for reference only. +c These are their values, however, this info is imported in the npbparams.h +c include file, which is written by the sys/setparams.c program. +c--------------------------------------------------------------------- + +C---------- +C Class S: +C---------- +CC parameter( na=1400, +CC > nonzer=7, +CC > shift=10., +CC > niter=15, +CC > rcond=1.0d-1 ) +C---------- +C Class W: +C---------- +CC parameter( na=7000, +CC > nonzer=8, +CC > shift=12., +CC > niter=15, +CC > rcond=1.0d-1 ) +C---------- +C Class A: +C---------- +CC parameter( na=14000, +CC > nonzer=11, +CC > shift=20., +CC > niter=15, +CC > rcond=1.0d-1 ) +C---------- +C Class B: +C---------- +CC parameter( na=75000, +CC > nonzer=13, +CC > shift=60., +CC > niter=75, +CC > rcond=1.0d-1 ) +C---------- +C Class C: +C---------- +CC parameter( na=150000, +CC > nonzer=15, +CC > shift=110., +CC > niter=75, +CC > rcond=1.0d-1 ) +C---------- +C Class D: +C---------- +CC parameter( na=1500000, +CC > nonzer=21, +CC > shift=500., +CC > niter=100, +CC > rcond=1.0d-1 ) +C---------- +C Class E: +C---------- +CC parameter( na=9000000, +CC > nonzer=26, +CC > shift=1500., +CC > niter=100, +CC > rcond=1.0d-1 ) + + + + integer nz + parameter( nz = na*(nonzer+1)/num_procs*(nonzer+1)+nonzer + > + na*(nonzer+2+num_procs/256)/num_proc_cols ) + + + + common / partit_size / naa, nzz, + > npcols, nprows, + > proc_col, proc_row, + > firstrow, + > lastrow, + > firstcol, + > lastcol, + > exch_proc, + > exch_recv_length, + > send_start, + > send_len + integer naa, nzz, + > npcols, nprows, + > proc_col, proc_row, + > firstrow, + > lastrow, + > firstcol, + > lastcol, + > exch_proc, + > exch_recv_length, + > send_start, + > send_len + + + common / main_int_mem / colidx, rowstr, + > iv, arow, acol + integer colidx(nz), rowstr(na+1), + > iv(2*na+1), arow(nz), acol(nz) + + + common / main_flt_mem / v, aelt, a, + > x, + > z, + > p, + > q, + > r, + > w + double precision v(na+1), aelt(nz), a(nz), + > x(na/num_proc_rows+2), + > z(na/num_proc_rows+2), + > p(na/num_proc_rows+2), + > q(na/num_proc_rows+2), + > r(na/num_proc_rows+2), + > w(na/num_proc_rows+2) + + + common /urando/ amult, tran + double precision amult, tran + + + + integer l2npcols + integer reduce_exch_proc(num_proc_cols) + integer reduce_send_starts(num_proc_cols) + integer reduce_send_lengths(num_proc_cols) + integer reduce_recv_starts(num_proc_cols) + integer reduce_recv_lengths(num_proc_cols) + + integer i, j, k, it + + double precision zeta, randlc + external randlc + double precision rnorm + double precision norm_temp1(2), norm_temp2(2) + + double precision t, tmax, mflops + external timer_read + double precision timer_read + character class + logical verified + double precision zeta_verify_value, epsilon, err + + +c--------------------------------------------------------------------- +c Set up mpi initialization and number of proc testing +c--------------------------------------------------------------------- + call initialize_mpi + + + if( na .eq. 1400 .and. + & nonzer .eq. 7 .and. + & niter .eq. 15 .and. + & shift .eq. 10.d0 ) then + class = 'S' + zeta_verify_value = 8.5971775078648d0 + else if( na .eq. 7000 .and. + & nonzer .eq. 8 .and. + & niter .eq. 15 .and. + & shift .eq. 12.d0 ) then + class = 'W' + zeta_verify_value = 10.362595087124d0 + else if( na .eq. 14000 .and. + & nonzer .eq. 11 .and. + & niter .eq. 15 .and. + & shift .eq. 20.d0 ) then + class = 'A' + zeta_verify_value = 17.130235054029d0 + else if( na .eq. 75000 .and. + & nonzer .eq. 13 .and. + & niter .eq. 75 .and. + & shift .eq. 60.d0 ) then + class = 'B' + zeta_verify_value = 22.712745482631d0 + else if( na .eq. 150000 .and. + & nonzer .eq. 15 .and. + & niter .eq. 75 .and. + & shift .eq. 110.d0 ) then + class = 'C' + zeta_verify_value = 28.973605592845d0 + else if( na .eq. 1500000 .and. + & nonzer .eq. 21 .and. + & niter .eq. 100 .and. + & shift .eq. 500.d0 ) then + class = 'D' + zeta_verify_value = 52.514532105794d0 + else if( na .eq. 9000000 .and. + & nonzer .eq. 26 .and. + & niter .eq. 100 .and. + & shift .eq. 1.5d3 ) then + class = 'E' + zeta_verify_value = 77.522164599383d0 + else + class = 'U' + endif + + if( me .eq. root )then + write( *,1000 ) + write( *,1001 ) na + write( *,1002 ) niter + write( *,1003 ) nprocs + write( *,1004 ) nonzer + write( *,1005 ) shift + 1000 format(//,' NAS Parallel Benchmarks 3.3 -- CG Benchmark', /) + 1001 format(' Size: ', i10 ) + 1002 format(' Iterations: ', i5 ) + 1003 format(' Number of active processes: ', i5 ) + 1004 format(' Number of nonzeroes per row: ', i8) + 1005 format(' Eigenvalue shift: ', e8.3) + endif + + if (.not. convertdouble) then + dp_type = MPI_DOUBLE_PRECISION + else + dp_type = MPI_REAL + endif + + + naa = na + nzz = nz + + +c--------------------------------------------------------------------- +c Set up processor info, such as whether sq num of procs, etc +c--------------------------------------------------------------------- + call setup_proc_info( num_procs, + > num_proc_rows, + > num_proc_cols ) + + +c--------------------------------------------------------------------- +c Set up partition's submatrix info: firstcol, lastcol, firstrow, lastrow +c--------------------------------------------------------------------- + call setup_submatrix_info( l2npcols, + > reduce_exch_proc, + > reduce_send_starts, + > reduce_send_lengths, + > reduce_recv_starts, + > reduce_recv_lengths ) + + + +c--------------------------------------------------------------------- +c Inialize random number generator +c--------------------------------------------------------------------- + tran = 314159265.0D0 + amult = 1220703125.0D0 + zeta = randlc( tran, amult ) + +c--------------------------------------------------------------------- +c Set up partition's sparse random matrix for given class size +c--------------------------------------------------------------------- + call makea(naa, nzz, a, colidx, rowstr, nonzer, + > firstrow, lastrow, firstcol, lastcol, + > rcond, arow, acol, aelt, v, iv, shift) + + + +c--------------------------------------------------------------------- +c Note: as a result of the above call to makea: +c values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1 +c values of colidx which are col indexes go from firstcol --> lastcol +c So: +c Shift the col index vals from actual (firstcol --> lastcol ) +c to local, i.e., (1 --> lastcol-firstcol+1) +c--------------------------------------------------------------------- + do j=1,lastrow-firstrow+1 + do k=rowstr(j),rowstr(j+1)-1 + colidx(k) = colidx(k) - firstcol + 1 + enddo + enddo + +c--------------------------------------------------------------------- +c set starting vector to (1, 1, .... 1) +c--------------------------------------------------------------------- + do i = 1, na/num_proc_rows+1 + x(i) = 1.0D0 + enddo + + zeta = 0.0d0 + +c--------------------------------------------------------------------- +c----> +c Do one iteration untimed to init all code and data page tables +c----> (then reinit, start timing, to niter its) +c--------------------------------------------------------------------- + do it = 1, 1 + +c--------------------------------------------------------------------- +c The call to the conjugate gradient routine: +c--------------------------------------------------------------------- + call conj_grad ( colidx, + > rowstr, + > x, + > z, + > a, + > p, + > q, + > r, + > w, + > rnorm, + > l2npcols, + > reduce_exch_proc, + > reduce_send_starts, + > reduce_send_lengths, + > reduce_recv_starts, + > reduce_recv_lengths ) + +c--------------------------------------------------------------------- +c zeta = shift + 1/(x.z) +c So, first: (x.z) +c Also, find norm of z +c So, first: (z.z) +c--------------------------------------------------------------------- + norm_temp1(1) = 0.0d0 + norm_temp1(2) = 0.0d0 + do j=1, lastcol-firstcol+1 + norm_temp1(1) = norm_temp1(1) + x(j)*z(j) + norm_temp1(2) = norm_temp1(2) + z(j)*z(j) + enddo + + do i = 1, l2npcols + call mpi_irecv( norm_temp2, + > 2, + > dp_type, + > reduce_exch_proc(i), + > i, + > mpi_comm_world, + > request, + > ierr ) + call mpi_send( norm_temp1, + > 2, + > dp_type, + > reduce_exch_proc(i), + > i, + > mpi_comm_world, + > ierr ) + call mpi_wait( request, status, ierr ) + + norm_temp1(1) = norm_temp1(1) + norm_temp2(1) + norm_temp1(2) = norm_temp1(2) + norm_temp2(2) + enddo + + norm_temp1(2) = 1.0d0 / sqrt( norm_temp1(2) ) + + +c--------------------------------------------------------------------- +c Normalize z to obtain x +c--------------------------------------------------------------------- + do j=1, lastcol-firstcol+1 + x(j) = norm_temp1(2)*z(j) + enddo + + + enddo ! end of do one iteration untimed + + +c--------------------------------------------------------------------- +c set starting vector to (1, 1, .... 1) +c--------------------------------------------------------------------- +c +c NOTE: a questionable limit on size: should this be na/num_proc_cols+1 ? +c + do i = 1, na/num_proc_rows+1 + x(i) = 1.0D0 + enddo + + zeta = 0.0d0 + +c--------------------------------------------------------------------- +c Synchronize and start timing +c--------------------------------------------------------------------- + call mpi_barrier( mpi_comm_world, + > ierr ) + + call timer_clear( 1 ) + call timer_start( 1 ) + +c--------------------------------------------------------------------- +c----> +c Main Iteration for inverse power method +c----> +c--------------------------------------------------------------------- + do it = 1, niter + +c--------------------------------------------------------------------- +c The call to the conjugate gradient routine: +c--------------------------------------------------------------------- + call conj_grad ( colidx, + > rowstr, + > x, + > z, + > a, + > p, + > q, + > r, + > w, + > rnorm, + > l2npcols, + > reduce_exch_proc, + > reduce_send_starts, + > reduce_send_lengths, + > reduce_recv_starts, + > reduce_recv_lengths ) + + +c--------------------------------------------------------------------- +c zeta = shift + 1/(x.z) +c So, first: (x.z) +c Also, find norm of z +c So, first: (z.z) +c--------------------------------------------------------------------- + norm_temp1(1) = 0.0d0 + norm_temp1(2) = 0.0d0 + do j=1, lastcol-firstcol+1 + norm_temp1(1) = norm_temp1(1) + x(j)*z(j) + norm_temp1(2) = norm_temp1(2) + z(j)*z(j) + enddo + + do i = 1, l2npcols + call mpi_irecv( norm_temp2, + > 2, + > dp_type, + > reduce_exch_proc(i), + > i, + > mpi_comm_world, + > request, + > ierr ) + call mpi_send( norm_temp1, + > 2, + > dp_type, + > reduce_exch_proc(i), + > i, + > mpi_comm_world, + > ierr ) + call mpi_wait( request, status, ierr ) + + norm_temp1(1) = norm_temp1(1) + norm_temp2(1) + norm_temp1(2) = norm_temp1(2) + norm_temp2(2) + enddo + + norm_temp1(2) = 1.0d0 / sqrt( norm_temp1(2) ) + + + if( me .eq. root )then + zeta = shift + 1.0d0 / norm_temp1(1) + if( it .eq. 1 ) write( *,9000 ) + write( *,9001 ) it, rnorm, zeta + endif + 9000 format( /,' iteration ||r|| zeta' ) + 9001 format( 4x, i5, 7x, e20.14, f20.13 ) + +c--------------------------------------------------------------------- +c Normalize z to obtain x +c--------------------------------------------------------------------- + do j=1, lastcol-firstcol+1 + x(j) = norm_temp1(2)*z(j) + enddo + + + enddo ! end of main iter inv pow meth + + call timer_stop( 1 ) + +c--------------------------------------------------------------------- +c End of timed section +c--------------------------------------------------------------------- + + t = timer_read( 1 ) + + call mpi_reduce( t, + > tmax, + > 1, + > dp_type, + > MPI_MAX, + > root, + > mpi_comm_world, + > ierr ) + + if( me .eq. root )then + write(*,100) + 100 format(' Benchmark completed ') + + epsilon = 1.d-10 + if (class .ne. 'U') then + + err = abs( zeta - zeta_verify_value )/zeta_verify_value + if( err .le. epsilon ) then + verified = .TRUE. + write(*, 200) + write(*, 201) zeta + write(*, 202) err + 200 format(' VERIFICATION SUCCESSFUL ') + 201 format(' Zeta is ', E20.13) + 202 format(' Error is ', E20.13) + else + verified = .FALSE. + write(*, 300) + write(*, 301) zeta + write(*, 302) zeta_verify_value + 300 format(' VERIFICATION FAILED') + 301 format(' Zeta ', E20.13) + 302 format(' The correct zeta is ', E20.13) + endif + else + verified = .FALSE. + write (*, 400) + write (*, 401) + write (*, 201) zeta + 400 format(' Problem size unknown') + 401 format(' NO VERIFICATION PERFORMED') + endif + + + if( tmax .ne. 0. ) then + mflops = float( 2*niter*na ) + & * ( 3.+float( nonzer*(nonzer+1) ) + & + 25.*(5.+float( nonzer*(nonzer+1) )) + & + 3. ) / tmax / 1000000.0 + else + mflops = 0.0 + endif + + call print_results('CG', class, na, 0, 0, + > niter, nnodes_compiled, nprocs, tmax, + > mflops, ' floating point', + > verified, npbversion, compiletime, + > cs1, cs2, cs3, cs4, cs5, cs6, cs7) + + + endif + + + call mpi_finalize(ierr) + + + + end ! end main + + + + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + subroutine initialize_mpi +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + + integer ierr + + + call mpi_init( ierr ) + call mpi_comm_rank( mpi_comm_world, me, ierr ) + call mpi_comm_size( mpi_comm_world, nprocs, ierr ) + root = 0 + + + return + end + + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + subroutine setup_proc_info( num_procs, + > num_proc_rows, + > num_proc_cols ) +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + + common / partit_size / naa, nzz, + > npcols, nprows, + > proc_col, proc_row, + > firstrow, + > lastrow, + > firstcol, + > lastcol, + > exch_proc, + > exch_recv_length, + > send_start, + > send_len + integer naa, nzz, + > npcols, nprows, + > proc_col, proc_row, + > firstrow, + > lastrow, + > firstcol, + > lastcol, + > exch_proc, + > exch_recv_length, + > send_start, + > send_len + + integer num_procs, num_proc_cols, num_proc_rows + integer i, ierr + integer log2nprocs + +c--------------------------------------------------------------------- +c num_procs must be a power of 2, and num_procs=num_proc_cols*num_proc_rows +c When num_procs is not square, then num_proc_cols = 2*num_proc_rows +c--------------------------------------------------------------------- +c First, number of procs must be power of two. +c--------------------------------------------------------------------- + if( nprocs .ne. num_procs )then + if( me .eq. root ) write( *,9000 ) nprocs, num_procs + 9000 format( /,'Error: ',/,'num of procs allocated (', + > i4, ' )', + > /,'is not equal to',/, + > 'compiled number of procs (', + > i4, ' )',/ ) + call mpi_finalize(ierr) + stop + endif + + + i = num_proc_cols + 100 continue + if( i .ne. 1 .and. i/2*2 .ne. i )then + if ( me .eq. root ) then + write( *,* ) 'Error: num_proc_cols is ', + > num_proc_cols, + > ' which is not a power of two' + endif + call mpi_finalize(ierr) + stop + endif + i = i / 2 + if( i .ne. 0 )then + goto 100 + endif + + i = num_proc_rows + 200 continue + if( i .ne. 1 .and. i/2*2 .ne. i )then + if ( me .eq. root ) then + write( *,* ) 'Error: num_proc_rows is ', + > num_proc_rows, + > ' which is not a power of two' + endif + call mpi_finalize(ierr) + stop + endif + i = i / 2 + if( i .ne. 0 )then + goto 200 + endif + + log2nprocs = 0 + i = nprocs + 300 continue + if( i .ne. 1 .and. i/2*2 .ne. i )then + write( *,* ) 'Error: nprocs is ', + > nprocs, + > ' which is not a power of two' + call mpi_finalize(ierr) + stop + endif + i = i / 2 + if( i .ne. 0 )then + log2nprocs = log2nprocs + 1 + goto 300 + endif + +CC write( *,* ) 'nprocs, log2nprocs: ',nprocs,log2nprocs + + + npcols = num_proc_cols + nprows = num_proc_rows + + + return + end + + + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + subroutine setup_submatrix_info( l2npcols, + > reduce_exch_proc, + > reduce_send_starts, + > reduce_send_lengths, + > reduce_recv_starts, + > reduce_recv_lengths ) + > +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + + integer col_size, row_size + + common / partit_size / naa, nzz, + > npcols, nprows, + > proc_col, proc_row, + > firstrow, + > lastrow, + > firstcol, + > lastcol, + > exch_proc, + > exch_recv_length, + > send_start, + > send_len + integer naa, nzz, + > npcols, nprows, + > proc_col, proc_row, + > firstrow, + > lastrow, + > firstcol, + > lastcol, + > exch_proc, + > exch_recv_length, + > send_start, + > send_len + + integer reduce_exch_proc(*) + integer reduce_send_starts(*) + integer reduce_send_lengths(*) + integer reduce_recv_starts(*) + integer reduce_recv_lengths(*) + + integer i, j + integer div_factor + integer l2npcols + + + proc_row = me / npcols + proc_col = me - proc_row*npcols + + + +c--------------------------------------------------------------------- +c If naa evenly divisible by npcols, then it is evenly divisible +c by nprows +c--------------------------------------------------------------------- + + if( naa/npcols*npcols .eq. naa )then + col_size = naa/npcols + firstcol = proc_col*col_size + 1 + lastcol = firstcol - 1 + col_size + row_size = naa/nprows + firstrow = proc_row*row_size + 1 + lastrow = firstrow - 1 + row_size +c--------------------------------------------------------------------- +c If naa not evenly divisible by npcols, then first subdivide for nprows +c and then, if npcols not equal to nprows (i.e., not a sq number of procs), +c get col subdivisions by dividing by 2 each row subdivision. +c--------------------------------------------------------------------- + else + if( proc_row .lt. naa - naa/nprows*nprows)then + row_size = naa/nprows+ 1 + firstrow = proc_row*row_size + 1 + lastrow = firstrow - 1 + row_size + else + row_size = naa/nprows + firstrow = (naa - naa/nprows*nprows)*(row_size+1) + > + (proc_row-(naa-naa/nprows*nprows)) + > *row_size + 1 + lastrow = firstrow - 1 + row_size + endif + if( npcols .eq. nprows )then + if( proc_col .lt. naa - naa/npcols*npcols )then + col_size = naa/npcols+ 1 + firstcol = proc_col*col_size + 1 + lastcol = firstcol - 1 + col_size + else + col_size = naa/npcols + firstcol = (naa - naa/npcols*npcols)*(col_size+1) + > + (proc_col-(naa-naa/npcols*npcols)) + > *col_size + 1 + lastcol = firstcol - 1 + col_size + endif + else + if( (proc_col/2) .lt. + > naa - naa/(npcols/2)*(npcols/2) )then + col_size = naa/(npcols/2) + 1 + firstcol = (proc_col/2)*col_size + 1 + lastcol = firstcol - 1 + col_size + else + col_size = naa/(npcols/2) + firstcol = (naa - naa/(npcols/2)*(npcols/2)) + > *(col_size+1) + > + ((proc_col/2)-(naa-naa/(npcols/2)*(npcols/2))) + > *col_size + 1 + lastcol = firstcol - 1 + col_size + endif +CC write( *,* ) col_size,firstcol,lastcol + if( mod( me,2 ) .eq. 0 )then + lastcol = firstcol - 1 + (col_size-1)/2 + 1 + else + firstcol = firstcol + (col_size-1)/2 + 1 + lastcol = firstcol - 1 + col_size/2 +CC write( *,* ) firstcol,lastcol + endif + endif + endif + + + + if( npcols .eq. nprows )then + send_start = 1 + send_len = lastrow - firstrow + 1 + else + if( mod( me,2 ) .eq. 0 )then + send_start = 1 + send_len = (1 + lastrow-firstrow+1)/2 + else + send_start = (1 + lastrow-firstrow+1)/2 + 1 + send_len = (lastrow-firstrow+1)/2 + endif + endif + + + + +c--------------------------------------------------------------------- +c Transpose exchange processor +c--------------------------------------------------------------------- + + if( npcols .eq. nprows )then + exch_proc = mod( me,nprows )*nprows + me/nprows + else + exch_proc = 2*(mod( me/2,nprows )*nprows + me/2/nprows) + > + mod( me,2 ) + endif + + + + i = npcols / 2 + l2npcols = 0 + do while( i .gt. 0 ) + l2npcols = l2npcols + 1 + i = i / 2 + enddo + + +c--------------------------------------------------------------------- +c Set up the reduce phase schedules... +c--------------------------------------------------------------------- + + div_factor = npcols + do i = 1, l2npcols + + j = mod( proc_col+div_factor/2, div_factor ) + > + proc_col / div_factor * div_factor + reduce_exch_proc(i) = proc_row*npcols + j + + div_factor = div_factor / 2 + + enddo + + + do i = l2npcols, 1, -1 + + if( nprows .eq. npcols )then + reduce_send_starts(i) = send_start + reduce_send_lengths(i) = send_len + reduce_recv_lengths(i) = lastrow - firstrow + 1 + else + reduce_recv_lengths(i) = send_len + if( i .eq. l2npcols )then + reduce_send_lengths(i) = lastrow-firstrow+1 - send_len + if( me/2*2 .eq. me )then + reduce_send_starts(i) = send_start + send_len + else + reduce_send_starts(i) = 1 + endif + else + reduce_send_lengths(i) = send_len + reduce_send_starts(i) = send_start + endif + endif + reduce_recv_starts(i) = send_start + + enddo + + + exch_recv_length = lastcol - firstcol + 1 + + + return + end + + + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + subroutine conj_grad ( colidx, + > rowstr, + > x, + > z, + > a, + > p, + > q, + > r, + > w, + > rnorm, + > l2npcols, + > reduce_exch_proc, + > reduce_send_starts, + > reduce_send_lengths, + > reduce_recv_starts, + > reduce_recv_lengths ) +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c Floaging point arrays here are named as in NPB1 spec discussion of +c CG algorithm +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + + integer status(MPI_STATUS_SIZE ), request + + + common / partit_size / naa, nzz, + > npcols, nprows, + > proc_col, proc_row, + > firstrow, + > lastrow, + > firstcol, + > lastcol, + > exch_proc, + > exch_recv_length, + > send_start, + > send_len + integer naa, nzz, + > npcols, nprows, + > proc_col, proc_row, + > firstrow, + > lastrow, + > firstcol, + > lastcol, + > exch_proc, + > exch_recv_length, + > send_start, + > send_len + + + + double precision x(*), + > z(*), + > a(nzz) + integer colidx(nzz), rowstr(naa+1) + + double precision p(*), + > q(*), + > r(*), + > w(*) ! used as work temporary + + integer l2npcols + integer reduce_exch_proc(l2npcols) + integer reduce_send_starts(l2npcols) + integer reduce_send_lengths(l2npcols) + integer reduce_recv_starts(l2npcols) + integer reduce_recv_lengths(l2npcols) + + integer i, j, k, ierr + integer cgit, cgitmax + + double precision d, sum, rho, rho0, alpha, beta, rnorm + + external timer_read + double precision timer_read + + data cgitmax / 25 / + + +c--------------------------------------------------------------------- +c Initialize the CG algorithm: +c--------------------------------------------------------------------- + do j=1,naa/nprows+1 + q(j) = 0.0d0 + z(j) = 0.0d0 + r(j) = x(j) + p(j) = r(j) + w(j) = 0.0d0 + enddo + + +c--------------------------------------------------------------------- +c rho = r.r +c Now, obtain the norm of r: First, sum squares of r elements locally... +c--------------------------------------------------------------------- + sum = 0.0d0 + do j=1, lastcol-firstcol+1 + sum = sum + r(j)*r(j) + enddo + +c--------------------------------------------------------------------- +c Exchange and sum with procs identified in reduce_exch_proc +c (This is equivalent to mpi_allreduce.) +c Sum the partial sums of rho, leaving rho on all processors +c--------------------------------------------------------------------- + do i = 1, l2npcols + call mpi_irecv( rho, + > 1, + > dp_type, + > reduce_exch_proc(i), + > i, + > mpi_comm_world, + > request, + > ierr ) + call mpi_send( sum, + > 1, + > dp_type, + > reduce_exch_proc(i), + > i, + > mpi_comm_world, + > ierr ) + call mpi_wait( request, status, ierr ) + + sum = sum + rho + enddo + rho = sum + + + +c--------------------------------------------------------------------- +c----> +c The conj grad iteration loop +c----> +c--------------------------------------------------------------------- + do cgit = 1, cgitmax + + +c--------------------------------------------------------------------- +c q = A.p +c The partition submatrix-vector multiply: use workspace w +c--------------------------------------------------------------------- + do j=1,lastrow-firstrow+1 + sum = 0.d0 + do k=rowstr(j),rowstr(j+1)-1 + sum = sum + a(k)*p(colidx(k)) + enddo + w(j) = sum + enddo + +c--------------------------------------------------------------------- +c Sum the partition submatrix-vec A.p's across rows +c Exchange and sum piece of w with procs identified in reduce_exch_proc +c--------------------------------------------------------------------- + do i = l2npcols, 1, -1 + call mpi_irecv( q(reduce_recv_starts(i)), + > reduce_recv_lengths(i), + > dp_type, + > reduce_exch_proc(i), + > i, + > mpi_comm_world, + > request, + > ierr ) + call mpi_send( w(reduce_send_starts(i)), + > reduce_send_lengths(i), + > dp_type, + > reduce_exch_proc(i), + > i, + > mpi_comm_world, + > ierr ) + call mpi_wait( request, status, ierr ) + do j=send_start,send_start + reduce_recv_lengths(i) - 1 + w(j) = w(j) + q(j) + enddo + enddo + + +c--------------------------------------------------------------------- +c Exchange piece of q with transpose processor: +c--------------------------------------------------------------------- + if( l2npcols .ne. 0 )then + call mpi_irecv( q, + > exch_recv_length, + > dp_type, + > exch_proc, + > 1, + > mpi_comm_world, + > request, + > ierr ) + + call mpi_send( w(send_start), + > send_len, + > dp_type, + > exch_proc, + > 1, + > mpi_comm_world, + > ierr ) + call mpi_wait( request, status, ierr ) + else + do j=1,exch_recv_length + q(j) = w(j) + enddo + endif + + +c--------------------------------------------------------------------- +c Clear w for reuse... +c--------------------------------------------------------------------- + do j=1, max( lastrow-firstrow+1, lastcol-firstcol+1 ) + w(j) = 0.0d0 + enddo + + +c--------------------------------------------------------------------- +c Obtain p.q +c--------------------------------------------------------------------- + sum = 0.0d0 + do j=1, lastcol-firstcol+1 + sum = sum + p(j)*q(j) + enddo + +c--------------------------------------------------------------------- +c Obtain d with a sum-reduce +c--------------------------------------------------------------------- + do i = 1, l2npcols + call mpi_irecv( d, + > 1, + > dp_type, + > reduce_exch_proc(i), + > i, + > mpi_comm_world, + > request, + > ierr ) + call mpi_send( sum, + > 1, + > dp_type, + > reduce_exch_proc(i), + > i, + > mpi_comm_world, + > ierr ) + + call mpi_wait( request, status, ierr ) + + sum = sum + d + enddo + d = sum + + +c--------------------------------------------------------------------- +c Obtain alpha = rho / (p.q) +c--------------------------------------------------------------------- + alpha = rho / d + +c--------------------------------------------------------------------- +c Save a temporary of rho +c--------------------------------------------------------------------- + rho0 = rho + +c--------------------------------------------------------------------- +c Obtain z = z + alpha*p +c and r = r - alpha*q +c--------------------------------------------------------------------- + do j=1, lastcol-firstcol+1 + z(j) = z(j) + alpha*p(j) + r(j) = r(j) - alpha*q(j) + enddo + +c--------------------------------------------------------------------- +c rho = r.r +c Now, obtain the norm of r: First, sum squares of r elements locally... +c--------------------------------------------------------------------- + sum = 0.0d0 + do j=1, lastcol-firstcol+1 + sum = sum + r(j)*r(j) + enddo + +c--------------------------------------------------------------------- +c Obtain rho with a sum-reduce +c--------------------------------------------------------------------- + do i = 1, l2npcols + call mpi_irecv( rho, + > 1, + > dp_type, + > reduce_exch_proc(i), + > i, + > mpi_comm_world, + > request, + > ierr ) + call mpi_send( sum, + > 1, + > dp_type, + > reduce_exch_proc(i), + > i, + > mpi_comm_world, + > ierr ) + call mpi_wait( request, status, ierr ) + + sum = sum + rho + enddo + rho = sum + +c--------------------------------------------------------------------- +c Obtain beta: +c--------------------------------------------------------------------- + beta = rho / rho0 + +c--------------------------------------------------------------------- +c p = r + beta*p +c--------------------------------------------------------------------- + do j=1, lastcol-firstcol+1 + p(j) = r(j) + beta*p(j) + enddo + + + + enddo ! end of do cgit=1,cgitmax + + + +c--------------------------------------------------------------------- +c Compute residual norm explicitly: ||r|| = ||x - A.z|| +c First, form A.z +c The partition submatrix-vector multiply +c--------------------------------------------------------------------- + do j=1,lastrow-firstrow+1 + sum = 0.d0 + do k=rowstr(j),rowstr(j+1)-1 + sum = sum + a(k)*z(colidx(k)) + enddo + w(j) = sum + enddo + + + +c--------------------------------------------------------------------- +c Sum the partition submatrix-vec A.z's across rows +c--------------------------------------------------------------------- + do i = l2npcols, 1, -1 + call mpi_irecv( r(reduce_recv_starts(i)), + > reduce_recv_lengths(i), + > dp_type, + > reduce_exch_proc(i), + > i, + > mpi_comm_world, + > request, + > ierr ) + call mpi_send( w(reduce_send_starts(i)), + > reduce_send_lengths(i), + > dp_type, + > reduce_exch_proc(i), + > i, + > mpi_comm_world, + > ierr ) + call mpi_wait( request, status, ierr ) + + do j=send_start,send_start + reduce_recv_lengths(i) - 1 + w(j) = w(j) + r(j) + enddo + enddo + + +c--------------------------------------------------------------------- +c Exchange piece of q with transpose processor: +c--------------------------------------------------------------------- + if( l2npcols .ne. 0 )then + call mpi_irecv( r, + > exch_recv_length, + > dp_type, + > exch_proc, + > 1, + > mpi_comm_world, + > request, + > ierr ) + + call mpi_send( w(send_start), + > send_len, + > dp_type, + > exch_proc, + > 1, + > mpi_comm_world, + > ierr ) + call mpi_wait( request, status, ierr ) + else + do j=1,exch_recv_length + r(j) = w(j) + enddo + endif + + +c--------------------------------------------------------------------- +c At this point, r contains A.z +c--------------------------------------------------------------------- + sum = 0.0d0 + do j=1, lastcol-firstcol+1 + d = x(j) - r(j) + sum = sum + d*d + enddo + +c--------------------------------------------------------------------- +c Obtain d with a sum-reduce +c--------------------------------------------------------------------- + do i = 1, l2npcols + call mpi_irecv( d, + > 1, + > dp_type, + > reduce_exch_proc(i), + > i, + > mpi_comm_world, + > request, + > ierr ) + call mpi_send( sum, + > 1, + > dp_type, + > reduce_exch_proc(i), + > i, + > mpi_comm_world, + > ierr ) + call mpi_wait( request, status, ierr ) + + sum = sum + d + enddo + d = sum + + + if( me .eq. root ) rnorm = sqrt( d ) + + + + return + end ! end of routine conj_grad + + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + subroutine makea( n, nz, a, colidx, rowstr, nonzer, + > firstrow, lastrow, firstcol, lastcol, + > rcond, arow, acol, aelt, v, iv, shift ) +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + integer n, nz + integer firstrow, lastrow, firstcol, lastcol + integer colidx(nz), rowstr(n+1) + integer iv(2*n+1), arow(nz), acol(nz) + double precision v(n+1), aelt(nz) + double precision rcond, a(nz), shift + +c--------------------------------------------------------------------- +c generate the test problem for benchmark 6 +c makea generates a sparse matrix with a +c prescribed sparsity distribution +c +c parameter type usage +c +c input +c +c n i number of cols/rows of matrix +c nz i nonzeros as declared array size +c rcond r*8 condition number +c shift r*8 main diagonal shift +c +c output +c +c a r*8 array for nonzeros +c colidx i col indices +c rowstr i row pointers +c +c workspace +c +c iv, arow, acol i +c v, aelt r*8 +c--------------------------------------------------------------------- + + integer i, nnza, iouter, ivelt, ivelt1, irow, nzv, NONZER + +c--------------------------------------------------------------------- +c nonzer is approximately (int(sqrt(nnza /n))); +c--------------------------------------------------------------------- + + double precision size, ratio, scale + external sparse, sprnvc, vecset + + size = 1.0D0 + ratio = rcond ** (1.0D0 / dfloat(n)) + nnza = 0 + +c--------------------------------------------------------------------- +c Initialize iv(n+1 .. 2n) to zero. +c Used by sprnvc to mark nonzero positions +c--------------------------------------------------------------------- + + do i = 1, n + iv(n+i) = 0 + enddo + do iouter = 1, n + nzv = nonzer + call sprnvc( n, nzv, v, colidx, iv(1), iv(n+1) ) + call vecset( n, v, colidx, nzv, iouter, .5D0 ) + do ivelt = 1, nzv + jcol = colidx(ivelt) + if (jcol.ge.firstcol .and. jcol.le.lastcol) then + scale = size * v(ivelt) + do ivelt1 = 1, nzv + irow = colidx(ivelt1) + if (irow.ge.firstrow .and. irow.le.lastrow) then + nnza = nnza + 1 + if (nnza .gt. nz) goto 9999 + acol(nnza) = jcol + arow(nnza) = irow + aelt(nnza) = v(ivelt1) * scale + endif + enddo + endif + enddo + size = size * ratio + enddo + + +c--------------------------------------------------------------------- +c ... add the identity * rcond to the generated matrix to bound +c the smallest eigenvalue from below by rcond +c--------------------------------------------------------------------- + do i = firstrow, lastrow + if (i.ge.firstcol .and. i.le.lastcol) then + iouter = n + i + nnza = nnza + 1 + if (nnza .gt. nz) goto 9999 + acol(nnza) = i + arow(nnza) = i + aelt(nnza) = rcond - shift + endif + enddo + + +c--------------------------------------------------------------------- +c ... make the sparse matrix from list of elements with duplicates +c (v and iv are used as workspace) +c--------------------------------------------------------------------- + call sparse( a, colidx, rowstr, n, arow, acol, aelt, + > firstrow, lastrow, + > v, iv(1), iv(n+1), nnza ) + return + + 9999 continue + write(*,*) 'Space for matrix elements exceeded in makea' + write(*,*) 'nnza, nzmax = ',nnza, nz + write(*,*) ' iouter = ',iouter + + stop + end +c-------end of makea------------------------------ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + subroutine sparse( a, colidx, rowstr, n, arow, acol, aelt, + > firstrow, lastrow, + > x, mark, nzloc, nnza ) +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit logical (a-z) + integer colidx(*), rowstr(*) + integer firstrow, lastrow + integer n, arow(*), acol(*), nnza + double precision a(*), aelt(*) + +c--------------------------------------------------------------------- +c rows range from firstrow to lastrow +c the rowstr pointers are defined for nrows = lastrow-firstrow+1 values +c--------------------------------------------------------------------- + integer nzloc(n), nrows + double precision x(n) + logical mark(n) + +c--------------------------------------------------- +c generate a sparse matrix from a list of +c [col, row, element] tri +c--------------------------------------------------- + + integer i, j, jajp1, nza, k, nzrow + double precision xi + +c--------------------------------------------------------------------- +c how many rows of result +c--------------------------------------------------------------------- + nrows = lastrow - firstrow + 1 + +c--------------------------------------------------------------------- +c ...count the number of triples in each row +c--------------------------------------------------------------------- + do j = 1, n + rowstr(j) = 0 + mark(j) = .false. + enddo + rowstr(n+1) = 0 + + do nza = 1, nnza + j = (arow(nza) - firstrow + 1) + 1 + rowstr(j) = rowstr(j) + 1 + enddo + + rowstr(1) = 1 + do j = 2, nrows+1 + rowstr(j) = rowstr(j) + rowstr(j-1) + enddo + + +c--------------------------------------------------------------------- +c ... rowstr(j) now is the location of the first nonzero +c of row j of a +c--------------------------------------------------------------------- + + +c--------------------------------------------------------------------- +c ... do a bucket sort of the triples on the row index +c--------------------------------------------------------------------- + do nza = 1, nnza + j = arow(nza) - firstrow + 1 + k = rowstr(j) + a(k) = aelt(nza) + colidx(k) = acol(nza) + rowstr(j) = rowstr(j) + 1 + enddo + + +c--------------------------------------------------------------------- +c ... rowstr(j) now points to the first element of row j+1 +c--------------------------------------------------------------------- + do j = nrows, 1, -1 + rowstr(j+1) = rowstr(j) + enddo + rowstr(1) = 1 + + +c--------------------------------------------------------------------- +c ... generate the actual output rows by adding elements +c--------------------------------------------------------------------- + nza = 0 + do i = 1, n + x(i) = 0.0 + mark(i) = .false. + enddo + + jajp1 = rowstr(1) + do j = 1, nrows + nzrow = 0 + +c--------------------------------------------------------------------- +c ...loop over the jth row of a +c--------------------------------------------------------------------- + do k = jajp1 , rowstr(j+1)-1 + i = colidx(k) + x(i) = x(i) + a(k) + if ( (.not. mark(i)) .and. (x(i) .ne. 0.D0)) then + mark(i) = .true. + nzrow = nzrow + 1 + nzloc(nzrow) = i + endif + enddo + +c--------------------------------------------------------------------- +c ... extract the nonzeros of this row +c--------------------------------------------------------------------- + do k = 1, nzrow + i = nzloc(k) + mark(i) = .false. + xi = x(i) + x(i) = 0.D0 + if (xi .ne. 0.D0) then + nza = nza + 1 + a(nza) = xi + colidx(nza) = i + endif + enddo + jajp1 = rowstr(j+1) + rowstr(j+1) = nza + rowstr(1) + enddo +CC write (*, 11000) nza + return +11000 format ( //,'final nonzero count in sparse ', + 1 /,'number of nonzeros = ', i16 ) + end +c-------end of sparse----------------------------- + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + subroutine sprnvc( n, nz, v, iv, nzloc, mark ) +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit logical (a-z) + double precision v(*) + integer n, nz, iv(*), nzloc(n), nn1 + integer mark(n) + common /urando/ amult, tran + double precision amult, tran + + +c--------------------------------------------------------------------- +c generate a sparse n-vector (v, iv) +c having nzv nonzeros +c +c mark(i) is set to 1 if position i is nonzero. +c mark is all zero on entry and is reset to all zero before exit +c this corrects a performance bug found by John G. Lewis, caused by +c reinitialization of mark on every one of the n calls to sprnvc +c--------------------------------------------------------------------- + + integer nzrow, nzv, ii, i, icnvrt + + external randlc, icnvrt + double precision randlc, vecelt, vecloc + + + nzv = 0 + nzrow = 0 + nn1 = 1 + 50 continue + nn1 = 2 * nn1 + if (nn1 .lt. n) goto 50 + +c--------------------------------------------------------------------- +c nn1 is the smallest power of two not less than n +c--------------------------------------------------------------------- + +100 continue + if (nzv .ge. nz) goto 110 + vecelt = randlc( tran, amult ) + +c--------------------------------------------------------------------- +c generate an integer between 1 and n in a portable manner +c--------------------------------------------------------------------- + vecloc = randlc(tran, amult) + i = icnvrt(vecloc, nn1) + 1 + if (i .gt. n) goto 100 + +c--------------------------------------------------------------------- +c was this integer generated already? +c--------------------------------------------------------------------- + if (mark(i) .eq. 0) then + mark(i) = 1 + nzrow = nzrow + 1 + nzloc(nzrow) = i + nzv = nzv + 1 + v(nzv) = vecelt + iv(nzv) = i + endif + goto 100 +110 continue + do ii = 1, nzrow + i = nzloc(ii) + mark(i) = 0 + enddo + return + end +c-------end of sprnvc----------------------------- + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + function icnvrt(x, ipwr2) +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit logical (a-z) + double precision x + integer ipwr2, icnvrt + +c--------------------------------------------------------------------- +c scale a double precision number x in (0,1) by a power of 2 and chop it +c--------------------------------------------------------------------- + icnvrt = int(ipwr2 * x) + + return + end +c-------end of icnvrt----------------------------- + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + subroutine vecset(n, v, iv, nzv, i, val) +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit logical (a-z) + integer n, iv(*), nzv, i, k + double precision v(*), val + +c--------------------------------------------------------------------- +c set ith element of sparse vector (v, iv) with +c nzv nonzeros to val +c--------------------------------------------------------------------- + + logical set + + set = .false. + do k = 1, nzv + if (iv(k) .eq. i) then + v(k) = val + set = .true. + endif + enddo + if (.not. set) then + nzv = nzv + 1 + v(nzv) = val + iv(nzv) = i + endif + return + end +c-------end of vecset----------------------------- + diff --git a/examples/smpi/NAS/CG/mpinpb.h b/examples/smpi/NAS/CG/mpinpb.h new file mode 100644 index 0000000000..1f0368c0b7 --- /dev/null +++ b/examples/smpi/NAS/CG/mpinpb.h @@ -0,0 +1,9 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'mpif.h' + + integer me, nprocs, root, dp_type + common /mpistuff/ me, nprocs, root, dp_type + diff --git a/examples/smpi/NAS/DT/DGraph.c b/examples/smpi/NAS/DT/DGraph.c new file mode 100644 index 0000000000..5d5839df19 --- /dev/null +++ b/examples/smpi/NAS/DT/DGraph.c @@ -0,0 +1,184 @@ +#include +#include +#include + +#include "DGraph.h" + +DGArc *newArc(DGNode *tl,DGNode *hd){ + DGArc *ar=(DGArc *)malloc(sizeof(DGArc)); + ar->tail=tl; + ar->head=hd; + return ar; +} +void arcShow(DGArc *ar){ + DGNode *tl=(DGNode *)ar->tail, + *hd=(DGNode *)ar->head; + fprintf(stderr,"%d. |%s ->%s\n",ar->id,tl->name,hd->name); +} + +DGNode *newNode(char *nm){ + DGNode *nd=(DGNode *)malloc(sizeof(DGNode)); + nd->attribute=0; + nd->color=0; + nd->inDegree=0; + nd->outDegree=0; + nd->maxInDegree=SMALL_BLOCK_SIZE; + nd->maxOutDegree=SMALL_BLOCK_SIZE; + nd->inArc=(DGArc **)malloc(nd->maxInDegree*sizeof(DGArc*)); + nd->outArc=(DGArc **)malloc(nd->maxOutDegree*sizeof(DGArc*)); + nd->name=strdup(nm); + nd->feat=NULL; + return nd; +} +void nodeShow(DGNode* nd){ + fprintf( stderr,"%3d.%s: (%d,%d)\n", + nd->id,nd->name,nd->inDegree,nd->outDegree); +/* + if(nd->verified==1) fprintf(stderr,"%ld.%s\t: usable.",nd->id,nd->name); + else if(nd->verified==0) fprintf(stderr,"%ld.%s\t: unusable.",nd->id,nd->name); + else fprintf(stderr,"%ld.%s\t: notverified.",nd->id,nd->name); +*/ +} + +DGraph* newDGraph(char* nm){ + DGraph *dg=(DGraph *)malloc(sizeof(DGraph)); + dg->numNodes=0; + dg->numArcs=0; + dg->maxNodes=BLOCK_SIZE; + dg->maxArcs=BLOCK_SIZE; + dg->node=(DGNode **)malloc(dg->maxNodes*sizeof(DGNode*)); + dg->arc=(DGArc **)malloc(dg->maxArcs*sizeof(DGArc*)); + dg->name=strdup(nm); + return dg; +} +int AttachNode(DGraph* dg, DGNode* nd) { + int i=0,j,len=0; + DGNode **nds =NULL, *tmpnd=NULL; + DGArc **ar=NULL; + + if (dg->numNodes == dg->maxNodes-1 ) { + dg->maxNodes += BLOCK_SIZE; + nds =(DGNode **) calloc(dg->maxNodes,sizeof(DGNode*)); + memcpy(nds,dg->node,(dg->maxNodes-BLOCK_SIZE)*sizeof(DGNode*)); + free(dg->node); + dg->node=nds; + } + + len = strlen( nd->name); + for (i = 0; i < dg->numNodes; i++) { + tmpnd =dg->node[ i]; + ar=NULL; + if ( strlen( tmpnd->name) != len ) continue; + if ( strncmp( nd->name, tmpnd->name, len) ) continue; + if ( nd->inDegree > 0 ) { + tmpnd->maxInDegree += nd->maxInDegree; + ar =(DGArc **) calloc(tmpnd->maxInDegree,sizeof(DGArc*)); + memcpy(ar,tmpnd->inArc,(tmpnd->inDegree)*sizeof(DGArc*)); + free(tmpnd->inArc); + tmpnd->inArc=ar; + for (j = 0; j < nd->inDegree; j++ ) { + nd->inArc[ j]->head = tmpnd; + } + memcpy( &(tmpnd->inArc[ tmpnd->inDegree]), nd->inArc, nd->inDegree*sizeof( DGArc *)); + tmpnd->inDegree += nd->inDegree; + } + if ( nd->outDegree > 0 ) { + tmpnd->maxOutDegree += nd->maxOutDegree; + ar =(DGArc **) calloc(tmpnd->maxOutDegree,sizeof(DGArc*)); + memcpy(ar,tmpnd->outArc,(tmpnd->outDegree)*sizeof(DGArc*)); + free(tmpnd->outArc); + tmpnd->outArc=ar; + for (j = 0; j < nd->outDegree; j++ ) { + nd->outArc[ j]->tail = tmpnd; + } + memcpy( &(tmpnd->outArc[tmpnd->outDegree]),nd->outArc,nd->outDegree*sizeof( DGArc *)); + tmpnd->outDegree += nd->outDegree; + } + free(nd); + return i; + } + nd->id = dg->numNodes; + dg->node[dg->numNodes] = nd; + dg->numNodes++; +return nd->id; +} +int AttachArc(DGraph *dg,DGArc* nar){ +int arcId = -1; +int i=0,newNumber=0; +DGNode *head = nar->head, + *tail = nar->tail; +DGArc **ars=NULL,*probe=NULL; +/*fprintf(stderr,"AttachArc %ld\n",dg->numArcs); */ + if ( !tail || !head ) return arcId; + if ( dg->numArcs == dg->maxArcs-1 ) { + dg->maxArcs += BLOCK_SIZE; + ars =(DGArc **) calloc(dg->maxArcs,sizeof(DGArc*)); + memcpy(ars,dg->arc,(dg->maxArcs-BLOCK_SIZE)*sizeof(DGArc*)); + free(dg->arc); + dg->arc=ars; + } + for(i = 0; i < tail->outDegree; i++ ) { /* parallel arc */ + probe = tail->outArc[ i]; + if(probe->head == head + && + probe->length == nar->length + ){ + free(nar); + return probe->id; + } + } + + nar->id = dg->numArcs; + arcId=dg->numArcs; + dg->arc[dg->numArcs] = nar; + dg->numArcs++; + + head->inArc[ head->inDegree] = nar; + head->inDegree++; + if ( head->inDegree >= head->maxInDegree ) { + newNumber = head->maxInDegree + SMALL_BLOCK_SIZE; + ars =(DGArc **) calloc(newNumber,sizeof(DGArc*)); + memcpy(ars,head->inArc,(head->inDegree)*sizeof(DGArc*)); + free(head->inArc); + head->inArc=ars; + head->maxInDegree = newNumber; + } + tail->outArc[ tail->outDegree] = nar; + tail->outDegree++; + if(tail->outDegree >= tail->maxOutDegree ) { + newNumber = tail->maxOutDegree + SMALL_BLOCK_SIZE; + ars =(DGArc **) calloc(newNumber,sizeof(DGArc*)); + memcpy(ars,tail->outArc,(tail->outDegree)*sizeof(DGArc*)); + free(tail->outArc); + tail->outArc=ars; + tail->maxOutDegree = newNumber; + } +/*fprintf(stderr,"AttachArc: head->in=%d tail->out=%ld\n",head->inDegree,tail->outDegree);*/ +return arcId; +} +void graphShow(DGraph *dg,int DetailsLevel){ + int i=0,j=0; + fprintf(stderr,"%d.%s: (%d,%d)\n",dg->id,dg->name,dg->numNodes,dg->numArcs); + if ( DetailsLevel < 1) return; + for (i = 0; i < dg->numNodes; i++ ) { + DGNode *focusNode = dg->node[ i]; + if(DetailsLevel >= 2) { + for (j = 0; j < focusNode->inDegree; j++ ) { + fprintf(stderr,"\t "); + nodeShow(focusNode->inArc[ j]->tail); + } + } + nodeShow(focusNode); + if ( DetailsLevel < 2) continue; + for (j = 0; j < focusNode->outDegree; j++ ) { + fprintf(stderr, "\t "); + nodeShow(focusNode->outArc[ j]->head); + } + fprintf(stderr, "---\n"); + } + fprintf(stderr,"----------------------------------------\n"); + if ( DetailsLevel < 3) return; +} + + + diff --git a/examples/smpi/NAS/DT/DGraph.h b/examples/smpi/NAS/DT/DGraph.h new file mode 100644 index 0000000000..f38f898b24 --- /dev/null +++ b/examples/smpi/NAS/DT/DGraph.h @@ -0,0 +1,43 @@ +#ifndef _DGRAPH +#define _DGRAPH + +#define BLOCK_SIZE 128 +#define SMALL_BLOCK_SIZE 32 + +typedef struct{ + int id; + void *tail,*head; + int length,width,attribute,maxWidth; +}DGArc; + +typedef struct{ + int maxInDegree,maxOutDegree; + int inDegree,outDegree; + int id; + char *name; + DGArc **inArc,**outArc; + int depth,height,width; + int color,attribute,address,verified; + void *feat; +}DGNode; + +typedef struct{ + int maxNodes,maxArcs; + int id; + char *name; + int numNodes,numArcs; + DGNode **node; + DGArc **arc; +} DGraph; + +DGArc *newArc(DGNode *tl,DGNode *hd); +void arcShow(DGArc *ar); +DGNode *newNode(char *nm); +void nodeShow(DGNode* nd); + +DGraph* newDGraph(char *nm); +int AttachNode(DGraph *dg,DGNode *nd); +int AttachArc(DGraph *dg,DGArc* nar); +void graphShow(DGraph *dg,int DetailsLevel); + +#endif diff --git a/examples/smpi/NAS/DT/Makefile b/examples/smpi/NAS/DT/Makefile new file mode 100644 index 0000000000..28d9502fe2 --- /dev/null +++ b/examples/smpi/NAS/DT/Makefile @@ -0,0 +1,26 @@ +SHELL=/bin/sh +BENCHMARK=dt +BENCHMARKU=DT + +include ../config/make.def + +include ../sys/make.common +#Override PROGRAM +DTPROGRAM = $(BINDIR)/$(BENCHMARK).$(CLASS) + +OBJS = dt.o DGraph.o \ + ${COMMON}/c_print_results.o ${COMMON}/c_timers.o ${COMMON}/c_randdp.o + + +${PROGRAM}: config ${OBJS} + ${CLINK} ${CLINKFLAGS} -o ${DTPROGRAM} ${OBJS} ${CMPI_LIB} + +.c.o: + ${CCOMPILE} $< + +dt.o: dt.c npbparams.h +DGraph.o: DGraph.c DGraph.h + +clean: + - rm -f *.o *~ mputil* + - rm -f dt npbparams.h core diff --git a/examples/smpi/NAS/DT/README b/examples/smpi/NAS/DT/README new file mode 100644 index 0000000000..873e3ae6f2 --- /dev/null +++ b/examples/smpi/NAS/DT/README @@ -0,0 +1,22 @@ +Data Traffic benchmark DT is new in the NPB suite +(released as part of NPB3.x-MPI package). +---------------------------------------------------- + +DT is written in C and same executable can run on any number of processors, +provided this number is not less than the number of nodes in the communication +graph. DT benchmark takes one argument: BH, WH, or SH. This argument +specifies the communication graph Black Hole, White Hole, or SHuffle +respectively. The current release contains verification numbers for +CLASSES S, W, A, and B only. Classes C and D are defined, but verification +numbers are not provided in this release. + +The following table summarizes the number of nodes in the communication +graph based on CLASS and graph TYPE. + +CLASS N_Source N_Nodes(BH,WH) N_Nodes(SH) + S 4 5 12 + W 8 11 32 + A 16 21 80 + B 32 43 192 + C 64 85 448 + D 128 171 1024 diff --git a/examples/smpi/NAS/DT/dt.c b/examples/smpi/NAS/DT/dt.c new file mode 100644 index 0000000000..ed8cfeb55d --- /dev/null +++ b/examples/smpi/NAS/DT/dt.c @@ -0,0 +1,759 @@ +/************************************************************************* + * * + * N A S P A R A L L E L B E N C H M A R K S 3.3 * + * * + * D T * + * * + ************************************************************************* + * * + * This benchmark is part of the NAS Parallel Benchmark 3.3 suite. * + * * + * Permission to use, copy, distribute and modify this software * + * for any purpose with or without fee is hereby granted. We * + * request, however, that all derived work reference the NAS * + * Parallel Benchmarks 3.3. This software is provided "as is" * + * without express or implied warranty. * + * * + * Information on NPB 3.3, including the technical report, the * + * original specifications, source code, results and information * + * on how to submit new results, is available at: * + * * + * http: www.nas.nasa.gov/Software/NPB * + * * + * Send comments or suggestions to npb@nas.nasa.gov * + * Send bug reports to npb-bugs@nas.nasa.gov * + * * + * NAS Parallel Benchmarks Group * + * NASA Ames Research Center * + * Mail Stop: T27A-1 * + * Moffett Field, CA 94035-1000 * + * * + * E-mail: npb@nas.nasa.gov * + * Fax: (650) 604-3957 * + * * + ************************************************************************* + * * + * Author: M. Frumkin * * + * * + *************************************************************************/ + +#include +#include +#include + +#include "mpi.h" +#include "npbparams.h" + +#ifndef CLASS +#define CLASS 'S' +#define NUM_PROCS 1 +#endif + +//int passed_verification; +extern double randlc( double *X, double *A ); +extern +void c_print_results( char *name, + char class, + int n1, + int n2, + int n3, + int niter, + int nprocs_compiled, + int nprocs_total, + double t, + double mops, + char *optype, + int passed_verification, + char *npbversion, + char *compiletime, + char *mpicc, + char *clink, + char *cmpi_lib, + char *cmpi_inc, + char *cflags, + char *clinkflags ); + +void timer_clear( int n ); +void timer_start( int n ); +void timer_stop( int n ); +double timer_read( int n ); +int timer_on=0,timers_tot=64; + +int verify(char *bmname,double rnm2){ + double verify_value=0.0; + double epsilon=1.0E-8; + char cls=CLASS; + int verified=-1; + if (cls != 'U') { + if(cls=='S') { + if(strstr(bmname,"BH")){ + verify_value=30892725.0; + }else if(strstr(bmname,"WH")){ + verify_value=67349758.0; + }else if(strstr(bmname,"SH")){ + verify_value=58875767.0; + }else{ + fprintf(stderr,"No such benchmark as %s.\n",bmname); + } + verified = 0; + }else if(cls=='W') { + if(strstr(bmname,"BH")){ + verify_value = 4102461.0; + }else if(strstr(bmname,"WH")){ + verify_value = 204280762.0; + }else if(strstr(bmname,"SH")){ + verify_value = 186944764.0; + }else{ + fprintf(stderr,"No such benchmark as %s.\n",bmname); + } + verified = 0; + }else if(cls=='A') { + if(strstr(bmname,"BH")){ + verify_value = 17809491.0; + }else if(strstr(bmname,"WH")){ + verify_value = 1289925229.0; + }else if(strstr(bmname,"SH")){ + verify_value = 610856482.0; + }else{ + fprintf(stderr,"No such benchmark as %s.\n",bmname); + } + verified = 0; + }else if(cls=='B') { + if(strstr(bmname,"BH")){ + verify_value = 4317114.0; + }else if(strstr(bmname,"WH")){ + verify_value = 7877279917.0; + }else if(strstr(bmname,"SH")){ + verify_value = 1836863082.0; + }else{ + fprintf(stderr,"No such benchmark as %s.\n",bmname); + verified = 0; + } + }else if(cls=='C') { + if(strstr(bmname,"BH")){ + verify_value = 0.0; + }else if(strstr(bmname,"WH")){ + verify_value = 0.0; + }else if(strstr(bmname,"SH")){ + verify_value = 0.0; + }else{ + fprintf(stderr,"No such benchmark as %s.\n",bmname); + verified = -1; + } + }else if(cls=='D') { + if(strstr(bmname,"BH")){ + verify_value = 0.0; + }else if(strstr(bmname,"WH")){ + verify_value = 0.0; + }else if(strstr(bmname,"SH")){ + verify_value = 0.0; + }else{ + fprintf(stderr,"No such benchmark as %s.\n",bmname); + } + verified = -1; + }else{ + fprintf(stderr,"No such class as %c.\n",cls); + } + fprintf(stderr," %s L2 Norm = %f\n",bmname,rnm2); + if(verified==-1){ + fprintf(stderr," No verification was performed.\n"); + }else if( rnm2 - verify_value < epsilon && + rnm2 - verify_value > -epsilon) { /* abs here does not work on ALTIX */ + verified = 1; + fprintf(stderr," Deviation = %f\n",(rnm2 - verify_value)); + }else{ + verified = 0; + fprintf(stderr," The correct verification value = %f\n",verify_value); + fprintf(stderr," Got value = %f\n",rnm2); + } + }else{ + verified = -1; + } + return verified; + } + +int ipowMod(int a,long long int n,int md){ + int seed=1,q=a,r=1; + if(n<0){ + fprintf(stderr,"ipowMod: exponent must be nonnegative exp=%lld\n",n); + n=-n; /* temp fix */ +/* return 1; */ + } + if(md<=0){ + fprintf(stderr,"ipowMod: module must be positive mod=%d",md); + return 1; + } + if(n==0) return 1; + while(n>1){ + int n2 = n/2; + if (n2*2==n){ + seed = (q*q)%md; + q=seed; + n = n2; + }else{ + seed = (r*q)%md; + r=seed; + n = n-1; + } + } + seed = (r*q)%md; + return seed; +} + +#include "DGraph.h" +DGraph *buildSH(char cls){ +/* + Nodes of the graph must be topologically sorted + to avoid MPI deadlock. +*/ + DGraph *dg; + int numSources=NUM_SOURCES; /* must be power of 2 */ + int numOfLayers=0,tmpS=numSources>>1; + int firstLayerNode=0; + DGArc *ar=NULL; + DGNode *nd=NULL; + int mask=0x0,ndid=0,ndoff=0; + int i=0,j=0; + char nm[BLOCK_SIZE]; + + sprintf(nm,"DT_SH.%c",cls); + dg=newDGraph(nm); + + while(tmpS>1){ + numOfLayers++; + tmpS>>=1; + } + for(i=0;inode[ndid],nd); + AttachArc(dg,ar); + ndoff+=mask; + ndid=firstLayerNode+ndoff; + ar=newArc(dg->node[ndid],nd); + AttachArc(dg,ar); + } + firstLayerNode+=numSources; + } + mask=0x00000001<node[ndid],nd); + AttachArc(dg,ar); + ndoff+=mask; + ndid=firstLayerNode+ndoff; + ar=newArc(dg->node[ndid],nd); + AttachArc(dg,ar); + } +return dg; +} +DGraph *buildWH(char cls){ +/* + Nodes of the graph must be topologically sorted + to avoid MPI deadlock. +*/ + int i=0,j=0; + int numSources=NUM_SOURCES,maxInDeg=4; + int numLayerNodes=numSources,firstLayerNode=0; + int totComparators=0; + int numPrevLayerNodes=numLayerNodes; + int id=0,sid=0; + DGraph *dg; + DGNode *nd=NULL,*source=NULL,*tmp=NULL,*snd=NULL; + DGArc *ar=NULL; + char nm[BLOCK_SIZE]; + + sprintf(nm,"DT_WH.%c",cls); + dg=newDGraph(nm); + + for(i=0;imaxInDeg){ + numLayerNodes=numLayerNodes/maxInDeg; + if(numLayerNodes*maxInDeg=numPrevLayerNodes) break; + snd=dg->node[firstLayerNode+sid]; + ar=newArc(dg->node[id],snd); + AttachArc(dg,ar); + } + } + firstLayerNode+=numPrevLayerNodes; + numPrevLayerNodes=numLayerNodes; + } + source=newNode("Source"); + AttachNode(dg,source); + for(i=0;inode[firstLayerNode+i]; + ar=newArc(source,nd); + AttachArc(dg,ar); + } + + for(i=0;inumNodes/2;i++){ /* Topological sorting */ + tmp=dg->node[i]; + dg->node[i]=dg->node[dg->numNodes-1-i]; + dg->node[i]->id=i; + dg->node[dg->numNodes-1-i]=tmp; + dg->node[dg->numNodes-1-i]->id=dg->numNodes-1-i; + } +return dg; +} +DGraph *buildBH(char cls){ +/* + Nodes of the graph must be topologically sorted + to avoid MPI deadlock. +*/ + int i=0,j=0; + int numSources=NUM_SOURCES,maxInDeg=4; + int numLayerNodes=numSources,firstLayerNode=0; + DGraph *dg; + DGNode *nd=NULL, *snd=NULL, *sink=NULL; + DGArc *ar=NULL; + int totComparators=0; + int numPrevLayerNodes=numLayerNodes; + int id=0, sid=0; + char nm[BLOCK_SIZE]; + + sprintf(nm,"DT_BH.%c",cls); + dg=newDGraph(nm); + + for(i=0;imaxInDeg){ + numLayerNodes=numLayerNodes/maxInDeg; + if(numLayerNodes*maxInDeg=numPrevLayerNodes) break; + snd=dg->node[firstLayerNode+sid]; + ar=newArc(snd,dg->node[id]); + AttachArc(dg,ar); + } + } + firstLayerNode+=numPrevLayerNodes; + numPrevLayerNodes=numLayerNodes; + } + sink=newNode("Sink"); + AttachNode(dg,sink); + for(i=0;inode[firstLayerNode+i]; + ar=newArc(nd,sink); + AttachArc(dg,ar); + } +return dg; +} + +typedef struct{ + int len; + double* val; +} Arr; +Arr *newArr(int len){ + Arr *arr=(Arr *)malloc(sizeof(Arr)); + arr->len=len; + arr->val=(double *)malloc(len*sizeof(double)); + return arr; +} +void arrShow(Arr* a){ + if(!a) fprintf(stderr,"-- NULL array\n"); + else{ + fprintf(stderr,"-- length=%d\n",a->len); + } +} +double CheckVal(Arr *feat){ + double csum=0.0; + int i=0; + for(i=0;ilen;i++){ + csum+=feat->val[i]*feat->val[i]/feat->len; /* The truncation does not work since + result will be 0 for large len */ + } + return csum; +} +int GetFNumDPar(int* mean, int* stdev){ + *mean=NUM_SAMPLES; + *stdev=STD_DEVIATION; + return 0; +} +int GetFeatureNum(char *mbname,int id){ + double tran=314159265.0; + double A=2*id+1; + double denom=randlc(&tran,&A); + char cval='S'; + int mean=NUM_SAMPLES,stdev=128; + int rtfs=0,len=0; + GetFNumDPar(&mean,&stdev); + rtfs=ipowMod((int)(1/denom)*(int)cval,(long long int) (2*id+1),2*stdev); + if(rtfs<0) rtfs=-rtfs; + len=mean-stdev+rtfs; + return len; +} +Arr* RandomFeatures(char *bmname,int fdim,int id){ + int len=GetFeatureNum(bmname,id)*fdim; + Arr* feat=newArr(len); + int nxg=2,nyg=2,nzg=2,nfg=5; + int nx=421,ny=419,nz=1427,nf=3527; + long long int expon=(len*(id+1))%3141592; + int seedx=ipowMod(nxg,expon,nx), + seedy=ipowMod(nyg,expon,ny), + seedz=ipowMod(nzg,expon,nz), + seedf=ipowMod(nfg,expon,nf); + int i=0; + if(timer_on){ + timer_clear(id+1); + timer_start(id+1); + } + for(i=0;ival[i]=seedx; + feat->val[i+1]=seedy; + feat->val[i+2]=seedz; + feat->val[i+3]=seedf; + } + if(timer_on){ + timer_stop(id+1); + fprintf(stderr,"** RandomFeatures time in node %d = %f\n",id,timer_read(id+1)); + } + return feat; +} +void Resample(Arr *a,int blen){ + long long int i=0,j=0,jlo=0,jhi=0; + double avval=0.0; + double *nval=(double *)malloc(blen*sizeof(double)); + Arr *tmp=newArr(10); + for(i=0;ilen-1;i++){ + jlo=(int)(0.5*(2*i-1)*(blen/a->len)); + jhi=(int)(0.5*(2*i+1)*(blen/a->len)); + + avval=a->val[i]/(jhi-jlo+1); + for(j=jlo;j<=jhi;j++){ + nval[j]+=avval; + } + } + nval[0]=a->val[0]; + nval[blen-1]=a->val[a->len-1]; + free(a->val); + a->val=nval; + a->len=blen; +} +#define fielddim 4 +Arr* WindowFilter(Arr *a, Arr* b,int w){ + int i=0,j=0,k=0; + double rms0=0.0,rms1=0.0,rmsm1=0.0; + double weight=((double) (w+1))/(w+2); + + w+=1; + if(timer_on){ + timer_clear(w); + timer_start(w); + } + if(a->lenlen) Resample(a,b->len); + if(a->len>b->len) Resample(b,a->len); + for(i=fielddim;ilen-fielddim;i+=fielddim){ + rms0=(a->val[i]-b->val[i])*(a->val[i]-b->val[i]) + +(a->val[i+1]-b->val[i+1])*(a->val[i+1]-b->val[i+1]) + +(a->val[i+2]-b->val[i+2])*(a->val[i+2]-b->val[i+2]) + +(a->val[i+3]-b->val[i+3])*(a->val[i+3]-b->val[i+3]); + j=i+fielddim; + rms1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j]) + +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1]) + +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2]) + +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]); + j=i-fielddim; + rmsm1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j]) + +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1]) + +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2]) + +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]); + k=0; + if(rms1val[i]=weight*b->val[i]; + a->val[i+1]=weight*b->val[i+1]; + a->val[i+2]=weight*b->val[i+2]; + a->val[i+3]=weight*b->val[i+3]; + }else if(k==1){ + j=i+fielddim; + a->val[i]=weight*b->val[j]; + a->val[i+1]=weight*b->val[j+1]; + a->val[i+2]=weight*b->val[j+2]; + a->val[i+3]=weight*b->val[j+3]; + }else { /*if(k==-1)*/ + j=i-fielddim; + a->val[i]=weight*b->val[j]; + a->val[i+1]=weight*b->val[j+1]; + a->val[i+2]=weight*b->val[j+2]; + a->val[i+3]=weight*b->val[j+3]; + } + } + if(timer_on){ + timer_stop(w); + fprintf(stderr,"** WindowFilter time in node %d = %f\n",(w-1),timer_read(w)); + } + return a; +} + +int SendResults(DGraph *dg,DGNode *nd,Arr *feat){ + int i=0,tag=0; + DGArc *ar=NULL; + DGNode *head=NULL; + if(!feat) return 0; + for(i=0;ioutDegree;i++){ + ar=nd->outArc[i]; + if(ar->tail!=nd) continue; + head=ar->head; + tag=ar->id; + if(head->address!=nd->address){ + MPI_Send(&feat->len,1,MPI_INT,head->address,tag,MPI_COMM_WORLD); + MPI_Send(feat->val,feat->len,MPI_DOUBLE,head->address,tag,MPI_COMM_WORLD); + } + } + return 1; +} +Arr* CombineStreams(DGraph *dg,DGNode *nd){ + Arr *resfeat=newArr(NUM_SAMPLES*fielddim); + int i=0,len=0,tag=0; + DGArc *ar=NULL; + DGNode *tail=NULL; + MPI_Status status; + Arr *feat=NULL,*featp=NULL; + + if(nd->inDegree==0) return NULL; + for(i=0;iinDegree;i++){ + ar=nd->inArc[i]; + if(ar->head!=nd) continue; + tail=ar->tail; + if(tail->address!=nd->address){ + len=0; + tag=ar->id; + MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status); + feat=newArr(len); + MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status); + resfeat=WindowFilter(resfeat,feat,nd->id); + free(feat); + }else{ + featp=(Arr *)tail->feat; + feat=newArr(featp->len); + memcpy(feat->val,featp->val,featp->len*sizeof(double)); + resfeat=WindowFilter(resfeat,feat,nd->id); + free(feat); + } + } + for(i=0;ilen;i++) resfeat->val[i]=((int)resfeat->val[i])/nd->inDegree; + nd->feat=resfeat; + return nd->feat; +} +double Reduce(Arr *a,int w){ + double retv=0.0; + if(timer_on){ + timer_clear(w); + timer_start(w); + } + retv=(int)(w*CheckVal(a));/* The casting needed for node + and array dependent verifcation */ + if(timer_on){ + timer_stop(w); + fprintf(stderr,"** Reduce time in node %d = %f\n",(w-1),timer_read(w)); + } + return retv; +} + +double ReduceStreams(DGraph *dg,DGNode *nd){ + double csum=0.0; + int i=0,len=0,tag=0; + DGArc *ar=NULL; + DGNode *tail=NULL; + Arr *feat=NULL; + double retv=0.0; + + for(i=0;iinDegree;i++){ + ar=nd->inArc[i]; + if(ar->head!=nd) continue; + tail=ar->tail; + if(tail->address!=nd->address){ + MPI_Status status; + len=0; + tag=ar->id; + MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status); + feat=newArr(len); + MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status); + csum+=Reduce(feat,(nd->id+1)); + free(feat); + }else{ + csum+=Reduce(tail->feat,(nd->id+1)); + } + } + if(nd->inDegree>0)csum=(((long long int)csum)/nd->inDegree); + retv=(nd->id+1)*csum; + return retv; +} + +int ProcessNodes(DGraph *dg,int me){ + double chksum=0.0; + Arr *feat=NULL; + int i=0,verified=0,tag; + DGNode *nd=NULL; + double rchksum=0.0; + MPI_Status status; + + for(i=0;inumNodes;i++){ + nd=dg->node[i]; + if(nd->address!=me) continue; + if(strstr(nd->name,"Source")){ + nd->feat=RandomFeatures(dg->name,fielddim,nd->id); + SendResults(dg,nd,nd->feat); + }else if(strstr(nd->name,"Sink")){ + chksum=ReduceStreams(dg,nd); + tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */ + MPI_Send(&chksum,1,MPI_DOUBLE,0,tag,MPI_COMM_WORLD); + }else{ + feat=CombineStreams(dg,nd); + SendResults(dg,nd,feat); + } + } + if(me==0){ /* Report node */ + rchksum=0.0; + chksum=0.0; + for(i=0;inumNodes;i++){ + nd=dg->node[i]; + if(!strstr(nd->name,"Sink")) continue; + tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */ + MPI_Recv(&rchksum,1,MPI_DOUBLE,nd->address,tag,MPI_COMM_WORLD,&status); + chksum+=rchksum; + } + verified=verify(dg->name,chksum); + } +return verified; +} + +int main(int argc,char **argv ){ + int my_rank,comm_size; + int i; + DGraph *dg=NULL; + int verified=0, featnum=0; + double bytes_sent=2.0,tot_time=0.0; + + MPI_Init( &argc, &argv ); + MPI_Comm_rank( MPI_COMM_WORLD, &my_rank ); + MPI_Comm_size( MPI_COMM_WORLD, &comm_size ); + + if(argc!=2|| + ( strncmp(argv[1],"BH",2)!=0 + &&strncmp(argv[1],"WH",2)!=0 + &&strncmp(argv[1],"SH",2)!=0 + ) + ){ + if(my_rank==0){ + fprintf(stderr,"** Usage: mpirun -np N ../bin/dt.S GraphName\n"); + fprintf(stderr,"** Where \n - N is integer number of MPI processes\n"); + fprintf(stderr," - S is the class S, W, or A \n"); + fprintf(stderr," - GraphName is the communication graph name BH, WH, or SH.\n"); + fprintf(stderr," - the number of MPI processes N should not be be less than \n"); + fprintf(stderr," the number of nodes in the graph\n"); + } + MPI_Finalize(); + exit(0); + } + if(strncmp(argv[1],"BH",2)==0){ + dg=buildBH(CLASS); + }else if(strncmp(argv[1],"WH",2)==0){ + dg=buildWH(CLASS); + }else if(strncmp(argv[1],"SH",2)==0){ + dg=buildSH(CLASS); + } + + if(timer_on&&dg->numNodes+1>timers_tot){ + timer_on=0; + if(my_rank==0) + fprintf(stderr,"Not enough timers. Node timeing is off. \n"); + } + if(dg->numNodes>comm_size){ + if(my_rank==0){ + fprintf(stderr,"** The number of MPI processes should not be less than \n"); + fprintf(stderr,"** the number of nodes in the graph\n"); + fprintf(stderr,"** Number of MPI processes = %d\n",comm_size); + fprintf(stderr,"** Number nodes in the graph = %d\n",dg->numNodes); + } + MPI_Finalize(); + exit(0); + } + for(i=0;inumNodes;i++){ + dg->node[i]->address=i; + } + if( my_rank == 0 ){ + printf( "\n\n NAS Parallel Benchmarks 3.3 -- DT Benchmark\n\n" ); + graphShow(dg,0); + timer_clear(0); + timer_start(0); + } + verified=ProcessNodes(dg,my_rank); + + featnum=NUM_SAMPLES*fielddim; + bytes_sent=featnum*dg->numArcs; + bytes_sent/=1048576; + if(my_rank==0){ + timer_stop(0); + tot_time=timer_read(0); + c_print_results( dg->name, + CLASS, + featnum, + 0, + 0, + dg->numNodes, + 0, + comm_size, + tot_time, + bytes_sent/tot_time, + "bytes transmitted", + verified, + NPBVERSION, + COMPILETIME, + MPICC, + CLINK, + CMPI_LIB, + CMPI_INC, + CFLAGS, + CLINKFLAGS ); + } + MPI_Finalize(); + return 1; +} diff --git a/examples/smpi/NAS/EP/Makefile b/examples/smpi/NAS/EP/Makefile new file mode 100644 index 0000000000..2014329288 --- /dev/null +++ b/examples/smpi/NAS/EP/Makefile @@ -0,0 +1,28 @@ +SHELL=/bin/sh +BENCHMARK=ep +BENCHMARKU=EP + +include ../config/make.def + +#OBJS = ep.o ${COMMON}/print_results.o ${COMMON}/${RAND}.o ${COMMON}/timers.o +OBJS = ep.o randlc.o + +include ../sys/make.common + +${PROGRAM}: config ${OBJS} +# ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB} + ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${CMPI_LIB} + + +#ep.o: ep.f mpinpb.h npbparams.h +# ${FCOMPILE} ep.f + +ep.o: ep.c randlc.c mpinpb.h npbparams.h + ${CCOMPILE} ep.c + +clean: + - rm -f *.o *~ + - rm -f npbparams.h core + + + diff --git a/examples/smpi/NAS/EP/README b/examples/smpi/NAS/EP/README new file mode 100644 index 0000000000..6eb36571af --- /dev/null +++ b/examples/smpi/NAS/EP/README @@ -0,0 +1,6 @@ +This code implements the random-number generator described in the +NAS Parallel Benchmark document RNR Technical Report RNR-94-007. +The code is "embarrassingly" parallel in that no communication is +required for the generation of the random numbers itself. There is +no special requirement on the number of processors used for running +the benchmark. diff --git a/examples/smpi/NAS/EP/ep.c b/examples/smpi/NAS/EP/ep.c new file mode 100644 index 0000000000..753fd431a3 --- /dev/null +++ b/examples/smpi/NAS/EP/ep.c @@ -0,0 +1,440 @@ +#include +#include +#include +#include + +#include "mpi.h" +#include "npbparams.h" + +#include "randlc.h" + +#ifndef CLASS +#define CLASS 'S' +#define NUM_PROCS 1 +#endif +#define true 1 +#define false 0 + + +//---NOTE : all the timers function have been modified to +// avoid global timers (privatize these). + // ----------------------- timers --------------------- + void timer_clear(double *onetimer) { + //elapsed[n] = 0.0; + *onetimer = 0.0; + } + + void timer_start(double *onetimer) { + *onetimer = MPI_Wtime(); + } + + void timer_stop(int n,double *elapsed,double *start) { + double t, now; + + now = MPI_Wtime(); + t = now - start[n]; + elapsed[n] += t; + } + + double timer_read(int n, double *elapsed) { /* ok, useless, but jsut to keep function call */ + return(elapsed[n]); + } + /******************************************************************** + ***************** V R A N L C ****************** + ***************** *****************/ + double vranlc(int n, double x, double a, double *y) + { + int i; + long i246m1=0x00003FFFFFFFFFFF; + long LLx, Lx, La; + double d2m46; + +// This doesn't work, because the compiler does the calculation in 32 +// bits and overflows. No standard way (without f90 stuff) to specify +// that the rhs should be done in 64 bit arithmetic. +// parameter(i246m1=2**46-1) + + d2m46=pow(0.5,46); + +// c Note that the v6 compiler on an R8000 does something stupid with +// c the above. Using the following instead (or various other things) +// c makes the calculation run almost 10 times as fast. +// +// c save d2m46 +// c data d2m46/0.0d0/ +// c if (d2m46 .eq. 0.0d0) then +// c d2m46 = 0.5d0**46 +// c endif + + Lx = (long)x; + La = (long)a; + //fprintf(stdout,("================== Vranlc ================"); + //fprintf(stdout,("Before Loop: Lx = " + Lx + ", La = " + La); + LLx = Lx; + for (i=0; i< n; i++) { + Lx = Lx*La & i246m1 ; + LLx = Lx; + y[i] = d2m46 * (double)LLx; + /* + if(i == 0) { + fprintf(stdout,("After loop 0:"); + fprintf(stdout,("Lx = " + Lx + ", La = " + La); + fprintf(stdout,("d2m46 = " + d2m46); + fprintf(stdout,("LLX(Lx) = " + LLX.doubleValue()); + fprintf(stdout,("Y[0]" + y[0]); + } + */ + } + + x = (double)LLx; + /* + fprintf(stdout,("Change: Lx = " + Lx); + fprintf(stdout,("=============End Vranlc ================"); + */ + return x; + } + + + +//-------------- the core (unique function) ----------- + void doTest(int argc, char **argv) { + double dum[3] = {1.,1.,1.}; + double x1, x2, sx, sy, tm, an, tt, gc; + double Mops; + double epsilon=1.0E-8, a = 1220703125., s=271828183.; + double t1, t2, t3, t4; + double sx_verify_value, sy_verify_value, sx_err, sy_err; + +#include "npbparams.h" + int mk=16, + // --> set by make : in npbparams.h + //m=28, // for CLASS=A + //m=30, // for CLASS=B + //npm=2, // NPROCS + mm = m-mk, + nn = (int)(pow(2,mm)), + nk = (int)(pow(2,mk)), + nq=10, + np, + node, + no_nodes, + i, + ik, + kk, + l, + k, nit, no_large_nodes, + np_add, k_offset, j; + int me, nprocs, root=0, dp_type; + int verified, + timers_enabled=true; + char size[500]; // mind the size of the string to represent a big number + + //Use in randlc.. + int KS = 0; + double R23, R46, T23, T46; + + double *qq = (double *) malloc (10000*sizeof(double)); + double *start = (double *) malloc (64*sizeof(double)); + double *elapsed = (double *) malloc (64*sizeof(double)); + + double *x = (double *) malloc (2*nk*sizeof(double)); + double *q = (double *) malloc (nq*sizeof(double)); + + MPI_Init( &argc, &argv ); + MPI_Comm_size( MPI_COMM_WORLD, &no_nodes); + MPI_Comm_rank( MPI_COMM_WORLD, &node); + +#ifdef USE_MPE + MPE_Init_log(); +#endif + root = 0; + if (node == root ) { + + /* Because the size of the problem is too large to store in a 32-bit + * integer for some classes, we put it into a string (for printing). + * Have to strip off the decimal point put in there by the floating + * point print statement (internal file) + */ + fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark"); + sprintf(size,"%d",pow(2,m+1)); + //size = size.replace('.', ' '); + fprintf(stdout," Number of random numbers generated: %s\n",size); + fprintf(stdout," Number of active processes: %d\n",no_nodes); + + } + verified = false; + + /* c Compute the number of "batches" of random number pairs generated + c per processor. Adjust if the number of processors does not evenly + c divide the total number +*/ + + np = nn / no_nodes; + no_large_nodes = nn % no_nodes; + if (node < no_large_nodes) np_add = 1; + else np_add = 0; + np = np + np_add; + + if (np == 0) { + fprintf(stdout,"Too many nodes: %d %d",no_nodes,nn); + MPI_Abort(MPI_COMM_WORLD,1); + exit(0); + } + +/* c Call the random number generator functions and initialize + c the x-array to reduce the effects of paging on the timings. + c Also, call all mathematical functions that are used. Make + c sure these initializations cannot be eliminated as dead code. +*/ + + //call vranlc(0, dum[1], dum[2], dum[3]); + // Array indexes start at 1 in Fortran, 0 in Java + vranlc(0, dum[0], dum[1], &(dum[2])); + + dum[0] = randlc(&(dum[1]),&(dum[2])); + ///////////////////////////////// + for (i=0;i<2*nk;i++) { + x[i] = -1e99; + } + Mops = log(sqrt(abs(1))); + + /* + c--------------------------------------------------------------------- + c Synchronize before placing time stamp + c--------------------------------------------------------------------- + */ + MPI_Barrier( MPI_COMM_WORLD ); + + timer_clear(&(elapsed[1])); + timer_clear(&(elapsed[2])); + timer_clear(&(elapsed[3])); + timer_start(&(start[1])); + + t1 = a; + //fprintf(stdout,("(ep.f:160) t1 = " + t1); + t1 = vranlc(0, t1, a, x); + //fprintf(stdout,("(ep.f:161) t1 = " + t1); + + +/* c Compute AN = A ^ (2 * NK) (mod 2^46). */ + + t1 = a; + //fprintf(stdout,("(ep.f:165) t1 = " + t1); + for (i=1; i <= mk+1; i++) { + t2 = randlc(&t1, &t1); + //fprintf(stdout,("(ep.f:168)[loop i=" + i +"] t1 = " + t1); + } + an = t1; + //fprintf(stdout,("(ep.f:172) s = " + s); + tt = s; + gc = 0.; + sx = 0.; + sy = 0.; + for (i=0; i < nq ; i++) { + q[i] = 0.; + } + +/* + Each instance of this loop may be performed independently. We compute + the k offsets separately to take into account the fact that some nodes + have more numbers to generate than others +*/ + + if (np_add == 1) + k_offset = node * np -1; + else + k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1; + + int stop = false; + for(k = 1; k <= np; k++) { + stop = false; + kk = k_offset + k ; + t1 = s; + //fprintf(stdout,("(ep.f:193) t1 = " + t1); + t2 = an; + +// Find starting seed t1 for this kk. + + for (i=1;i<=100 && !stop;i++) { + ik = kk / 2; + //fprintf(stdout,("(ep.f:199) ik = " +ik+", kk = " + kk); + if (2 * ik != kk) { + t3 = randlc(&t1, &t2); + //fprintf(stdout,("(ep.f:200) t1= " +t1 ); + } + if (ik==0) + stop = true; + else { + t3 = randlc(&t2, &t2); + kk = ik; + } + } +// Compute uniform pseudorandom numbers. + + //if (timers_enabled) timer_start(3); + timer_start(&(start[3])); + //call vranlc(2 * nk, t1, a, x) --> t1 and y are modified + + //fprintf(stdout,">>>>>>>>>>>Before vranlc(l.210)<<<<<<<<<<<<<"); + //fprintf(stdout,"2*nk = " + (2*nk)); + //fprintf(stdout,"t1 = " + t1); + //fprintf(stdout,"a = " + a); + //fprintf(stdout,"x[0] = " + x[0]); + //fprintf(stdout,">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<"); + + t1 = vranlc(2 * nk, t1, a, x); + + //fprintf(stdout,(">>>>>>>>>>>After Enter vranlc (l.210)<<<<<<"); + //fprintf(stdout,("2*nk = " + (2*nk)); + //fprintf(stdout,("t1 = " + t1); + //fprintf(stdout,("a = " + a); + //fprintf(stdout,("x[0] = " + x[0]); + //fprintf(stdout,(">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<"); + + //if (timers_enabled) timer_stop(3); + timer_stop(3,elapsed,start); + +/* Compute Gaussian deviates by acceptance-rejection method and + * tally counts in concentric square annuli. This loop is not + * vectorizable. + */ + //if (timers_enabled) timer_start(2); + timer_start(&(start[2])); + for(i=1; i<=nk;i++) { + x1 = 2. * x[2*i-2] -1.0; + x2 = 2. * x[2*i-1] - 1.0; + t1 = x1*x1 + x2*x2; + if (t1 <= 1.) { + t2 = sqrt(-2. * log(t1) / t1); + t3 = (x1 * t2); + t4 = (x2 * t2); + l = (int)(abs(t3) > abs(t4) ? abs(t3) : abs(t4)); + q[l] = q[l] + 1.; + sx = sx + t3; + sy = sy + t4; + } + /* + if(i == 1) { + fprintf(stdout,"x1 = " + x1); + fprintf(stdout,"x2 = " + x2); + fprintf(stdout,"t1 = " + t1); + fprintf(stdout,"t2 = " + t2); + fprintf(stdout,"t3 = " + t3); + fprintf(stdout,"t4 = " + t4); + fprintf(stdout,"l = " + l); + fprintf(stdout,"q[l] = " + q[l]); + fprintf(stdout,"sx = " + sx); + fprintf(stdout,"sy = " + sy); + } + */ + } + //if (timers_enabled) timer_stop(2); + timer_stop(2,elapsed,start); + } + + //int MPI_Allreduce(void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm) + MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + sx = x[0]; //FIXME : x[0] or x[1] => x[0] because fortran starts with 1 + MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + sy = x[0]; + MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + + for(i = 0; i < nq; i++) { + q[i] = x[i]; + } + for(i = 0; i < nq; i++) { + gc += q[i]; + } + + timer_stop(1,elapsed,start); + tm = timer_read(1,elapsed); + MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + tm = x[0]; + + if(node == root) { + nit = 0; + verified = true; + + if(m == 24) { + sx_verify_value = -3.247834652034740E3; + sy_verify_value = -6.958407078382297E3; + } else if(m == 25) { + sx_verify_value = -2.863319731645753E3; + sy_verify_value = -6.320053679109499E3; + } else if(m == 28) { + sx_verify_value = -4.295875165629892E3; + sy_verify_value = -1.580732573678431E4; + } else if(m == 30) { + sx_verify_value = 4.033815542441498E4; + sy_verify_value = -2.660669192809235E4; + } else if(m == 32) { + sx_verify_value = 4.764367927995374E4; + sy_verify_value = -8.084072988043731E4; + } else if(m == 36) { + sx_verify_value = 1.982481200946593E5; + sy_verify_value = -1.020596636361769E5; + } else { + verified = false; + } + + /* + fprintf(stdout,("sx = " + sx); + fprintf(stdout,("sx_verify = " + sx_verify_value); + fprintf(stdout,("sy = " + sy); + fprintf(stdout,("sy_verify = " + sy_verify_value); + */ + if(verified) { + sx_err = abs((sx - sx_verify_value)/sx_verify_value); + sy_err = abs((sy - sy_verify_value)/sy_verify_value); + /* + fprintf(stdout,("sx_err = " + sx_err); + fprintf(stdout,("sy_err = " + sx_err); + fprintf(stdout,("epsilon= " + epsilon); + */ + verified = ((sx_err < epsilon) && (sy_err < epsilon)); + } + + Mops = (pow(2.0, m+1))/tm/1000; + + fprintf(stdout,"EP Benchmark Results:\n"); + fprintf(stdout,"CPU Time=%d\n",tm); + fprintf(stdout,"N = 2^%d\n",m); + fprintf(stdout,"No. Gaussain Pairs =%d\n",gc); + fprintf(stdout,"Sum = %lf %ld\n",sx,sy); + fprintf(stdout,"Count:"); + for(i = 0; i < nq; i++) { + fprintf(stdout,"%d\t %ld\n",i,q[i]); + } + + /* + print_results("EP", _class, m+1, 0, 0, nit, npm, no_nodes, tm, Mops, + "Random numbers generated", verified, npbversion, + compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) */ + fprintf(stdout,"\nEP Benchmark Completed\n"); + fprintf(stdout,"Class = %s\n", _class); + fprintf(stdout,"Size = %s\n", size); + fprintf(stdout,"Iteration = %d\n", nit); + fprintf(stdout,"Time in seconds = %lf\n",(tm/1000)); + fprintf(stdout,"Total processes = %d\n",no_nodes); + fprintf(stdout,"Mops/s total = %lf\n",Mops); + fprintf(stdout,"Mops/s/process = %lf\n", Mops/no_nodes); + fprintf(stdout,"Operation type = Random number generated\n"); + if(verified) { + fprintf(stdout,"Verification = SUCCESSFUL\n"); + } else { + fprintf(stdout,"Verification = UNSUCCESSFUL\n"); + } + fprintf(stdout,"Total time: %lf\n",(timer_read(1,elapsed)/1000)); + fprintf(stdout,"Gaussian pairs: %lf\n",(timer_read(2,elapsed)/1000)); + fprintf(stdout,"Random numbers: %lf\n",(timer_read(3,elapsed)/1000)); + } +#ifdef USE_MPE + MPE_Finish_log(argv[0]); +#endif + + MPI_Finalize(); + } + + int main(int argc, char **argv) { + doTest(argc,argv); + } diff --git a/examples/smpi/NAS/EP/ep.f b/examples/smpi/NAS/EP/ep.f new file mode 100644 index 0000000000..ca7cc24620 --- /dev/null +++ b/examples/smpi/NAS/EP/ep.f @@ -0,0 +1,316 @@ +!-------------------------------------------------------------------------! +! ! +! N A S P A R A L L E L B E N C H M A R K S 3.3 ! +! ! +! E P ! +! ! +!-------------------------------------------------------------------------! +! ! +! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. ! +! It is described in NAS Technical Reports 95-020 and 02-007 ! +! ! +! Permission to use, copy, distribute and modify this software ! +! for any purpose with or without fee is hereby granted. We ! +! request, however, that all derived work reference the NAS ! +! Parallel Benchmarks 3.3. This software is provided "as is" ! +! without express or implied warranty. ! +! ! +! Information on NPB 3.3, including the technical report, the ! +! original specifications, source code, results and information ! +! on how to submit new results, is available at: ! +! ! +! http://www.nas.nasa.gov/Software/NPB/ ! +! ! +! Send comments or suggestions to npb@nas.nasa.gov ! +! ! +! NAS Parallel Benchmarks Group ! +! NASA Ames Research Center ! +! Mail Stop: T27A-1 ! +! Moffett Field, CA 94035-1000 ! +! ! +! E-mail: npb@nas.nasa.gov ! +! Fax: (650) 604-3957 ! +! ! +!-------------------------------------------------------------------------! + + +c--------------------------------------------------------------------- +c +c Authors: P. O. Frederickson +c D. H. Bailey +c A. C. Woo +c R. F. Van der Wijngaart +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- + program EMBAR +c--------------------------------------------------------------------- +C +c This is the MPI version of the APP Benchmark 1, +c the "embarassingly parallel" benchmark. +c +c +c M is the Log_2 of the number of complex pairs of uniform (0, 1) random +c numbers. MK is the Log_2 of the size of each batch of uniform random +c numbers. MK can be set for convenience on a given system, since it does +c not affect the results. + + implicit none + + include 'npbparams.h' + include 'mpinpb.h' + + double precision Mops, epsilon, a, s, t1, t2, t3, t4, x, x1, + > x2, q, sx, sy, tm, an, tt, gc, dum(3), + > timer_read + double precision sx_verify_value, sy_verify_value, sx_err, sy_err + integer mk, mm, nn, nk, nq, np, ierr, node, no_nodes, + > i, ik, kk, l, k, nit, ierrcode, no_large_nodes, + > np_add, k_offset, j + logical verified, timers_enabled + parameter (timers_enabled = .false.) + external randlc, timer_read + double precision randlc, qq + character*15 size + + parameter (mk = 16, mm = m - mk, nn = 2 ** mm, + > nk = 2 ** mk, nq = 10, epsilon=1.d-8, + > a = 1220703125.d0, s = 271828183.d0) + + common/storage/ x(2*nk), q(0:nq-1), qq(10000) + data dum /1.d0, 1.d0, 1.d0/ + + call mpi_init(ierr) + call mpi_comm_rank(MPI_COMM_WORLD,node,ierr) + call mpi_comm_size(MPI_COMM_WORLD,no_nodes,ierr) + + root = 0 + + if (.not. convertdouble) then + dp_type = MPI_DOUBLE_PRECISION + else + dp_type = MPI_REAL + endif + + if (node.eq.root) then + +c Because the size of the problem is too large to store in a 32-bit +c integer for some classes, we put it into a string (for printing). +c Have to strip off the decimal point put in there by the floating +c point print statement (internal file) + + write(*, 1000) + write(size, '(f15.0)' ) 2.d0**(m+1) + j = 15 + if (size(j:j) .eq. '.') j = j - 1 + write (*,1001) size(1:j) + write(*, 1003) no_nodes + + 1000 format(/,' NAS Parallel Benchmarks 3.3 -- EP Benchmark',/) + 1001 format(' Number of random numbers generated: ', a15) + 1003 format(' Number of active processes: ', 2x, i13, /) + + endif + + verified = .false. + +c Compute the number of "batches" of random number pairs generated +c per processor. Adjust if the number of processors does not evenly +c divide the total number + + np = nn / no_nodes + no_large_nodes = mod(nn, no_nodes) + if (node .lt. no_large_nodes) then + np_add = 1 + else + np_add = 0 + endif + np = np + np_add + + if (np .eq. 0) then + write (6, 1) no_nodes, nn + 1 format ('Too many nodes:',2i6) + call mpi_abort(MPI_COMM_WORLD,ierrcode,ierr) + stop + endif + +c Call the random number generator functions and initialize +c the x-array to reduce the effects of paging on the timings. +c Also, call all mathematical functions that are used. Make +c sure these initializations cannot be eliminated as dead code. + + call vranlc(0, dum(1), dum(2), dum(3)) + dum(1) = randlc(dum(2), dum(3)) + do 5 i = 1, 2*nk + x(i) = -1.d99 + 5 continue + Mops = log(sqrt(abs(max(1.d0,1.d0)))) + +c--------------------------------------------------------------------- +c Synchronize before placing time stamp +c--------------------------------------------------------------------- + call mpi_barrier(MPI_COMM_WORLD, ierr) + + call timer_clear(1) + call timer_clear(2) + call timer_clear(3) + call timer_start(1) + + t1 = a + call vranlc(0, t1, a, x) + +c Compute AN = A ^ (2 * NK) (mod 2^46). + + t1 = a + + do 100 i = 1, mk + 1 + t2 = randlc(t1, t1) + 100 continue + + an = t1 + tt = s + gc = 0.d0 + sx = 0.d0 + sy = 0.d0 + + do 110 i = 0, nq - 1 + q(i) = 0.d0 + 110 continue + +c Each instance of this loop may be performed independently. We compute +c the k offsets separately to take into account the fact that some nodes +c have more numbers to generate than others + + if (np_add .eq. 1) then + k_offset = node * np -1 + else + k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1 + endif + + do 150 k = 1, np + kk = k_offset + k + t1 = s + t2 = an + +c Find starting seed t1 for this kk. + + do 120 i = 1, 100 + ik = kk / 2 + if (2 * ik .ne. kk) t3 = randlc(t1, t2) + if (ik .eq. 0) goto 130 + t3 = randlc(t2, t2) + kk = ik + 120 continue + +c Compute uniform pseudorandom numbers. + 130 continue + + if (timers_enabled) call timer_start(3) + call vranlc(2 * nk, t1, a, x) + if (timers_enabled) call timer_stop(3) + +c Compute Gaussian deviates by acceptance-rejection method and +c tally counts in concentric square annuli. This loop is not +c vectorizable. + + if (timers_enabled) call timer_start(2) + + do 140 i = 1, nk + x1 = 2.d0 * x(2*i-1) - 1.d0 + x2 = 2.d0 * x(2*i) - 1.d0 + t1 = x1 ** 2 + x2 ** 2 + if (t1 .le. 1.d0) then + t2 = sqrt(-2.d0 * log(t1) / t1) + t3 = (x1 * t2) + t4 = (x2 * t2) + l = max(abs(t3), abs(t4)) + q(l) = q(l) + 1.d0 + sx = sx + t3 + sy = sy + t4 + endif + 140 continue + + if (timers_enabled) call timer_stop(2) + + 150 continue + + call mpi_allreduce(sx, x, 1, dp_type, + > MPI_SUM, MPI_COMM_WORLD, ierr) + sx = x(1) + call mpi_allreduce(sy, x, 1, dp_type, + > MPI_SUM, MPI_COMM_WORLD, ierr) + sy = x(1) + call mpi_allreduce(q, x, nq, dp_type, + > MPI_SUM, MPI_COMM_WORLD, ierr) + + do i = 1, nq + q(i-1) = x(i) + enddo + + do 160 i = 0, nq - 1 + gc = gc + q(i) + 160 continue + + call timer_stop(1) + tm = timer_read(1) + + call mpi_allreduce(tm, x, 1, dp_type, + > MPI_MAX, MPI_COMM_WORLD, ierr) + tm = x(1) + + if (node.eq.root) then + nit=0 + verified = .true. + if (m.eq.24) then + sx_verify_value = -3.247834652034740D+3 + sy_verify_value = -6.958407078382297D+3 + elseif (m.eq.25) then + sx_verify_value = -2.863319731645753D+3 + sy_verify_value = -6.320053679109499D+3 + elseif (m.eq.28) then + sx_verify_value = -4.295875165629892D+3 + sy_verify_value = -1.580732573678431D+4 + elseif (m.eq.30) then + sx_verify_value = 4.033815542441498D+4 + sy_verify_value = -2.660669192809235D+4 + elseif (m.eq.32) then + sx_verify_value = 4.764367927995374D+4 + sy_verify_value = -8.084072988043731D+4 + elseif (m.eq.36) then + sx_verify_value = 1.982481200946593D+5 + sy_verify_value = -1.020596636361769D+5 + elseif (m.eq.40) then + sx_verify_value = -5.319717441530D+05 + sy_verify_value = -3.688834557731D+05 + else + verified = .false. + endif + if (verified) then + sx_err = abs((sx - sx_verify_value)/sx_verify_value) + sy_err = abs((sy - sy_verify_value)/sy_verify_value) + verified = ((sx_err.le.epsilon) .and. (sy_err.le.epsilon)) + endif + Mops = 2.d0**(m+1)/tm/1000000.d0 + + write (6,11) tm, m, gc, sx, sy, (i, q(i), i = 0, nq - 1) + 11 format ('EP Benchmark Results:'//'CPU Time =',f10.4/'N = 2^', + > i5/'No. Gaussian Pairs =',f15.0/'Sums = ',1p,2d25.15/ + > 'Counts:'/(i3,0p,f15.0)) + + call print_results('EP', class, m+1, 0, 0, nit, npm, + > no_nodes, tm, Mops, + > 'Random numbers generated', + > verified, npbversion, compiletime, cs1, + > cs2, cs3, cs4, cs5, cs6, cs7) + + endif + + if (timers_enabled .and. (node .eq. root)) then + print *, 'Total time: ', timer_read(1) + print *, 'Gaussian pairs: ', timer_read(2) + print *, 'Random numbers: ', timer_read(3) + endif + + call mpi_finalize(ierr) + + end diff --git a/examples/smpi/NAS/EP/mpinpb.h b/examples/smpi/NAS/EP/mpinpb.h new file mode 100644 index 0000000000..1f13637ae4 --- /dev/null +++ b/examples/smpi/NAS/EP/mpinpb.h @@ -0,0 +1,9 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'mpif.h' + + integer me, nprocs, root, dp_type + common /mpistuff/ me, nprocs, root, dp_type + diff --git a/examples/smpi/NAS/EP/randlc.c b/examples/smpi/NAS/EP/randlc.c new file mode 100644 index 0000000000..624b800df8 --- /dev/null +++ b/examples/smpi/NAS/EP/randlc.c @@ -0,0 +1,107 @@ + +/* + * FUNCTION RANDLC (X, A) + * + * This routine returns a uniform pseudorandom double precision number in the + * range (0, 1) by using the linear congruential generator + * + * x_{k+1} = a x_k (mod 2^46) + * + * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers + * before repeating. The argument A is the same as 'a' in the above formula, + * and X is the same as x_0. A and X must be odd double precision integers + * in the range (1, 2^46). The returned value RANDLC is normalized to be + * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain + * the new seed x_1, so that subsequent calls to RANDLC using the same + * arguments will generate a continuous sequence. + * + * This routine should produce the same results on any computer with at least + * 48 mantissa bits in double precision floating point data. On Cray systems, + * double precision should be disabled. + * + * David H. Bailey October 26, 1990 + * + * IMPLICIT DOUBLE PRECISION (A-H, O-Z) + * SAVE KS, R23, R46, T23, T46 + * DATA KS/0/ + * + * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46, + * T23 = 2 ^ 23, and T46 = 2 ^ 46. These are computed in loops, rather than + * by merely using the ** operator, in order to insure that the results are + * exact on all systems. This code assumes that 0.5D0 is represented exactly. + */ + + +/*****************************************************************/ +/************* R A N D L C ************/ +/************* ************/ +/************* portable random number generator ************/ +/*****************************************************************/ + +double randlc( double *X, double *A ) +{ + static int KS=0; + static double R23, R46, T23, T46; + double T1, T2, T3, T4; + double A1; + double A2; + double X1; + double X2; + double Z; + int i, j; + + if (KS == 0) + { + R23 = 1.0; + R46 = 1.0; + T23 = 1.0; + T46 = 1.0; + + for (i=1; i<=23; i++) + { + R23 = 0.50 * R23; + T23 = 2.0 * T23; + } + for (i=1; i<=46; i++) + { + R46 = 0.50 * R46; + T46 = 2.0 * T46; + } + KS = 1; + } + +/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */ + + T1 = R23 * *A; + j = T1; + A1 = j; + A2 = *A - T23 * A1; + +/* Break X into two parts such that X = 2^23 * X1 + X2, compute + Z = A1 * X2 + A2 * X1 (mod 2^23), and then + X = 2^23 * Z + A2 * X2 (mod 2^46). */ + + T1 = R23 * *X; + j = T1; + X1 = j; + X2 = *X - T23 * X1; + T1 = A1 * X2 + A2 * X1; + + j = R23 * T1; + T2 = j; + Z = T1 - T23 * T2; + T3 = T23 * Z + A2 * X2; + j = R46 * T3; + T4 = j; + *X = T3 - T46 * T4; + return(R46 * *X); +} + + + +/*****************************************************************/ +/************ F I N D _ M Y _ S E E D ************/ +/************ ************/ +/************ returns parallel random number seq seed ************/ +/*****************************************************************/ + diff --git a/examples/smpi/NAS/EP/randlc.h b/examples/smpi/NAS/EP/randlc.h new file mode 100644 index 0000000000..aff84d341a --- /dev/null +++ b/examples/smpi/NAS/EP/randlc.h @@ -0,0 +1,3 @@ + +double randlc( double *X, double *A ); + diff --git a/examples/smpi/NAS/FT/Makefile b/examples/smpi/NAS/FT/Makefile new file mode 100644 index 0000000000..1cc6e1416b --- /dev/null +++ b/examples/smpi/NAS/FT/Makefile @@ -0,0 +1,23 @@ +SHELL=/bin/sh +BENCHMARK=ft +BENCHMARKU=FT + +include ../config/make.def + +include ../sys/make.common + +OBJS = ft.o ${COMMON}/${RAND}.o ${COMMON}/print_results.o ${COMMON}/timers.o + +${PROGRAM}: config ${OBJS} + ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB} + + + +.f.o: + ${FCOMPILE} $< + +ft.o: ft.f global.h mpinpb.h npbparams.h + +clean: + - rm -f *.o *~ mputil* + - rm -f ft npbparams.h core diff --git a/examples/smpi/NAS/FT/README b/examples/smpi/NAS/FT/README new file mode 100644 index 0000000000..ab08b363b2 --- /dev/null +++ b/examples/smpi/NAS/FT/README @@ -0,0 +1,5 @@ +This code implements the time integration of a three-dimensional +partial differential equation using the Fast Fourier Transform. +Some of the dimension statements are not F77 conforming and will +not work using the g77 compiler. All dimension statements, +however, are legal F90. \ No newline at end of file diff --git a/examples/smpi/NAS/FT/ft.f b/examples/smpi/NAS/FT/ft.f new file mode 100644 index 0000000000..8ab25b9780 --- /dev/null +++ b/examples/smpi/NAS/FT/ft.f @@ -0,0 +1,1993 @@ +!-------------------------------------------------------------------------! +! ! +! N A S P A R A L L E L B E N C H M A R K S 3.3 ! +! ! +! F T ! +! ! +!-------------------------------------------------------------------------! +! ! +! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. ! +! It is described in NAS Technical Reports 95-020 and 02-007 ! +! ! +! Permission to use, copy, distribute and modify this software ! +! for any purpose with or without fee is hereby granted. We ! +! request, however, that all derived work reference the NAS ! +! Parallel Benchmarks 3.3. This software is provided "as is" ! +! without express or implied warranty. ! +! ! +! Information on NPB 3.3, including the technical report, the ! +! original specifications, source code, results and information ! +! on how to submit new results, is available at: ! +! ! +! http://www.nas.nasa.gov/Software/NPB/ ! +! ! +! Send comments or suggestions to npb@nas.nasa.gov ! +! ! +! NAS Parallel Benchmarks Group ! +! NASA Ames Research Center ! +! Mail Stop: T27A-1 ! +! Moffett Field, CA 94035-1000 ! +! ! +! E-mail: npb@nas.nasa.gov ! +! Fax: (650) 604-3957 ! +! ! +!-------------------------------------------------------------------------! + +!TO REDUCE THE AMOUNT OF MEMORY REQUIRED BY THE BENCHMARK WE NO LONGER +!STORE THE ENTIRE TIME EVOLUTION ARRAY "EX" FOR ALL TIME STEPS, BUT +!JUST FOR THE FIRST. ALSO, IT IS STORED ONLY FOR THE PART OF THE GRID +!FOR WHICH THE CALLING PROCESSOR IS RESPONSIBLE, SO THAT THE MEMORY +!USAGE BECOMES SCALABLE. THIS NEW ARRAY IS CALLED "TWIDDLE" (SEE +!NPB3.0-SER) + +!TO AVOID PROBLEMS WITH VERY LARGE ARRAY SIZES THAT ARE COMPUTED BY +!MULTIPLYING GRID DIMENSIONS (CAUSING INTEGER OVERFLOW IN THE VARIABLE +!NTOTAL) AND SUBSEQUENTLY DIVIDING BY THE NUMBER OF PROCESSORS, WE +!COMPUTE THE SIZE OF ARRAY PARTITIONS MORE CONSERVATIVELY AS +!((NX*NY)/NP)*NZ, WHERE NX, NY, AND NZ ARE GRID DIMENSIONS AND NP IS +!THE NUMBER OF PROCESSORS, THE RESULT IS STORED IN "NTDIVNP". FOR THE +!PERFORMANCE CALCULATION WE STORE THE TOTAL NUMBER OF GRID POINTS IN A +!FLOATING POINT NUMBER "NTOTAL_F" INSTEAD OF AN INTEGER. +!THIS FIX WILL FAIL IF THE NUMBER OF PROCESSORS IS SMALL. + +!UGLY HACK OF SUBROUTINE IPOW46: FOR VERY LARGE GRIDS THE SINGLE EXPONENT +!FROM NPB2.3 MAY NOT FIT IN A 32-BIT INTEGER. HOWEVER, WE KNOW THAT THE +!"EXPONENT" ARGUMENT OF THIS ROUTINE CAN ALWAYS BE FACTORED INTO A TERM +!DIVISIBLE BY NX (EXP_1) AND ANOTHER TERM (EXP_2). NX IS USUALLY A POWER +!OF TWO, SO WE CAN KEEP HALVING IT UNTIL THE PRODUCT OF EXP_1 +!AND EXP_2 IS SMALL ENOUGH (NAMELY EXP_2 ITSELF). THIS UPDATED VERSION +!OF IPWO46, WHICH NOW TAKES THE TWO FACTORS OF "EXPONENT" AS SEPARATE +!ARGUMENTS, MAY BREAK DOWN IF EXP_1 DOES NOT CONTAIN A LARGE POWER OF TWO. + +c--------------------------------------------------------------------- +c +c Authors: D. Bailey +c W. Saphir +c R. F. Van der Wijngaart +c +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c FT benchmark +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + program ft + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'mpif.h' + include 'global.h' + integer i, ierr + +c--------------------------------------------------------------------- +c u0, u1, u2 are the main arrays in the problem. +c Depending on the decomposition, these arrays will have different +c dimensions. To accomodate all possibilities, we allocate them as +c one-dimensional arrays and pass them to subroutines for different +c views +c - u0 contains the initial (transformed) initial condition +c - u1 and u2 are working arrays +c--------------------------------------------------------------------- + + double complex u0(ntdivnp), + > u1(ntdivnp), + > u2(ntdivnp) + double precision twiddle(ntdivnp) +c--------------------------------------------------------------------- +c Large arrays are in common so that they are allocated on the +c heap rather than the stack. This common block is not +c referenced directly anywhere else. Padding is to avoid accidental +c cache problems, since all array sizes are powers of two. +c--------------------------------------------------------------------- + + double complex pad1(3), pad2(3), pad3(3) + common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle + + integer iter + double precision total_time, mflops + logical verified + character class + + call MPI_Init(ierr) + +c--------------------------------------------------------------------- +c Run the entire problem once to make sure all data is touched. +c This reduces variable startup costs, which is important for such a +c short benchmark. The other NPB 2 implementations are similar. +c--------------------------------------------------------------------- + do i = 1, t_max + call timer_clear(i) + end do + + call setup() + call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3)) + call compute_initial_conditions(u1, dims(1,1), dims(2,1), + > dims(3,1)) + call fft_init (dims(1,1)) + call fft(1, u1, u0) + +c--------------------------------------------------------------------- +c Start over from the beginning. Note that all operations must +c be timed, in contrast to other benchmarks. +c--------------------------------------------------------------------- + do i = 1, t_max + call timer_clear(i) + end do + call MPI_Barrier(MPI_COMM_WORLD, ierr) + + call timer_start(T_total) + if (timers_enabled) call timer_start(T_setup) + + call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3)) + call compute_initial_conditions(u1, dims(1,1), dims(2,1), + > dims(3,1)) + call fft_init (dims(1,1)) + + if (timers_enabled) call synchup() + if (timers_enabled) call timer_stop(T_setup) + + if (timers_enabled) call timer_start(T_fft) + call fft(1, u1, u0) + if (timers_enabled) call timer_stop(T_fft) + + do iter = 1, niter + if (timers_enabled) call timer_start(T_evolve) + call evolve(u0, u1, twiddle, dims(1,1), dims(2,1), dims(3,1)) + if (timers_enabled) call timer_stop(T_evolve) + if (timers_enabled) call timer_start(T_fft) + call fft(-1, u1, u2) + if (timers_enabled) call timer_stop(T_fft) + if (timers_enabled) call synchup() + if (timers_enabled) call timer_start(T_checksum) + call checksum(iter, u2, dims(1,1), dims(2,1), dims(3,1)) + if (timers_enabled) call timer_stop(T_checksum) + end do + + call verify(nx, ny, nz, niter, verified, class) + call timer_stop(t_total) + if (np .ne. np_min) verified = .false. + total_time = timer_read(t_total) + + if( total_time .ne. 0. ) then + mflops = 1.0d-6*ntotal_f * + > (14.8157+7.19641*log(ntotal_f) + > + (5.23518+7.21113*log(ntotal_f))*niter) + > /total_time + else + mflops = 0.0 + endif + if (me .eq. 0) then + call print_results('FT', class, nx, ny, nz, niter, np_min, np, + > total_time, mflops, ' floating point', verified, + > npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) + endif + if (timers_enabled) call print_timers() + call MPI_Finalize(ierr) + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine evolve(u0, u1, twiddle, d1, d2, d3) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c evolve u0 -> u1 (t time steps) in fourier space +c--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer d1, d2, d3 + double precision exi + double complex u0(d1,d2,d3) + double complex u1(d1,d2,d3) + double precision twiddle(d1,d2,d3) + integer i, j, k + + do k = 1, d3 + do j = 1, d2 + do i = 1, d1 + u0(i,j,k) = u0(i,j,k)*(twiddle(i,j,k)) + u1(i,j,k) = u0(i,j,k) + end do + end do + end do + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine compute_initial_conditions(u0, d1, d2, d3) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c Fill in array u0 with initial conditions from +c random number generator +c--------------------------------------------------------------------- + implicit none + include 'global.h' + integer d1, d2, d3 + double complex u0(d1, d2, d3) + integer k + double precision x0, start, an, dummy + +c--------------------------------------------------------------------- +c 0-D and 1-D layouts are easy because each processor gets a contiguous +c chunk of the array, in the Fortran ordering sense. +c For a 2-D layout, it's a bit more complicated. We always +c have entire x-lines (contiguous) in processor. +c We can do ny/np1 of them at a time since we have +c ny/np1 contiguous in y-direction. But then we jump +c by z-planes (nz/np2 of them, total). +c For the 0-D and 1-D layouts we could do larger chunks, but +c this turns out to have no measurable impact on performance. +c--------------------------------------------------------------------- + + + start = seed +c--------------------------------------------------------------------- +c Jump to the starting element for our first plane. +c--------------------------------------------------------------------- + call ipow46(a, 2*nx, (zstart(1)-1)*ny + (ystart(1)-1), an) + dummy = randlc(start, an) + call ipow46(a, 2*nx, ny, an) + +c--------------------------------------------------------------------- +c Go through by z planes filling in one square at a time. +c--------------------------------------------------------------------- + do k = 1, dims(3, 1) ! nz/np2 + x0 = start + call vranlc(2*nx*dims(2, 1), x0, a, u0(1, 1, k)) + if (k .ne. dims(3, 1)) dummy = randlc(start, an) + end do + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine ipow46(a, exp_1, exp_2, result) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c compute a^exponent mod 2^46 +c--------------------------------------------------------------------- + + implicit none + double precision a, result, dummy, q, r + integer exp_1, exp_2, n, n2, ierr + external randlc + double precision randlc + logical two_pow +c--------------------------------------------------------------------- +c Use +c a^n = a^(n/2)*a^(n/2) if n even else +c a^n = a*a^(n-1) if n odd +c--------------------------------------------------------------------- + result = 1 + if (exp_2 .eq. 0 .or. exp_1 .eq. 0) return + q = a + r = 1 + n = exp_1 + two_pow = .true. + + do while (two_pow) + n2 = n/2 + if (n2 * 2 .eq. n) then + dummy = randlc(q, q) + n = n2 + else + n = n * exp_2 + two_pow = .false. + endif + end do + + do while (n .gt. 1) + n2 = n/2 + if (n2 * 2 .eq. n) then + dummy = randlc(q, q) + n = n2 + else + dummy = randlc(r, q) + n = n-1 + endif + end do + dummy = randlc(r, q) + result = r + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine setup + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + include 'mpinpb.h' + include 'global.h' + + integer ierr, i, j, fstatus + debug = .FALSE. + + call MPI_Comm_size(MPI_COMM_WORLD, np, ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, me, ierr) + + if (.not. convertdouble) then + dc_type = MPI_DOUBLE_COMPLEX + else + dc_type = MPI_COMPLEX + endif + + + if (me .eq. 0) then + write(*, 1000) + open (unit=2,file='inputft.data',status='old', iostat=fstatus) + + if (fstatus .eq. 0) then + write(*,233) + 233 format(' Reading from input file inputft.data') + read (2,*) niter + read (2,*) layout_type + read (2,*) np1, np2 + close(2) + +c--------------------------------------------------------------------- +c check to make sure input data is consistent +c--------------------------------------------------------------------- + + +c--------------------------------------------------------------------- +c 1. product of processor grid dims must equal number of processors +c--------------------------------------------------------------------- + + if (np1 * np2 .ne. np) then + write(*, 238) + 238 format(' np1 and np2 given in input file are not valid.') + write(*, 239) np1*np2, np + 239 format(' Product is ', i5, ' and should be ', i5) + call MPI_Abort(MPI_COMM_WORLD, 1, ierr) + endif + +c--------------------------------------------------------------------- +c 2. layout type must be valid +c--------------------------------------------------------------------- + + if (layout_type .ne. layout_0D .and. + > layout_type .ne. layout_1D .and. + > layout_type .ne. layout_2D) then + write(*, 240) + 240 format(' Layout type specified in inputft.data is + > invalid ') + call MPI_Abort(MPI_COMM_WORLD, 1, ierr) + endif + +c--------------------------------------------------------------------- +c 3. 0D layout must be 1x1 grid +c--------------------------------------------------------------------- + + if (layout_type .eq. layout_0D .and. + > (np1 .ne.1 .or. np2 .ne. 1)) then + write(*, 241) + 241 format(' For 0D layout, both np1 and np2 must be 1 ') + call MPI_Abort(MPI_COMM_WORLD, 1, ierr) + endif +c--------------------------------------------------------------------- +c 4. 1D layout must be 1xN grid +c--------------------------------------------------------------------- + + if (layout_type .eq. layout_1D .and. np1 .ne. 1) then + write(*, 242) + 242 format(' For 1D layout, np1 must be 1 ') + call MPI_Abort(MPI_COMM_WORLD, 1, ierr) + endif + + else + write(*,234) + niter = niter_default + if (np .eq. 1) then + np1 = 1 + np2 = 1 + layout_type = layout_0D + else if (np .le. nz) then + np1 = 1 + np2 = np + layout_type = layout_1D + else + np1 = nz + np2 = np/nz + layout_type = layout_2D + endif + endif + + if (np .lt. np_min) then + write(*, 10) np_min + 10 format(' Error: Compiled for ', I5, ' processors. ') + write(*, 11) np + 11 format(' Only ', i5, ' processors found ') + call MPI_Abort(MPI_COMM_WORLD, 1, ierr) + endif + + 234 format(' No input file inputft.data. Using compiled defaults') + write(*, 1001) nx, ny, nz + write(*, 1002) niter + write(*, 1004) np + write(*, 1005) np1, np2 + if (np .ne. np_min) write(*, 1006) np_min + + if (layout_type .eq. layout_0D) then + write(*, 1010) '0D' + else if (layout_type .eq. layout_1D) then + write(*, 1010) '1D' + else + write(*, 1010) '2D' + endif + + 1000 format(//,' NAS Parallel Benchmarks 3.3 -- FT Benchmark',/) + 1001 format(' Size : ', i4, 'x', i4, 'x', i4) + 1002 format(' Iterations : ', 7x, i7) + 1004 format(' Number of processes : ', 7x, i7) + 1005 format(' Processor array : ', 5x, i4, 'x', i4) + 1006 format(' WARNING: compiled for ', i5, ' processes. ', + > ' Will not verify. ') + 1010 format(' Layout type : ', 9x, A5) + endif + + +c--------------------------------------------------------------------- +c Since np1, np2 and layout_type are in a common block, +c this sends all three. +c--------------------------------------------------------------------- + call MPI_BCAST(np1, 3, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(niter, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) + + if (np1 .eq. 1 .and. np2 .eq. 1) then + layout_type = layout_0D + else if (np1 .eq. 1) then + layout_type = layout_1D + else + layout_type = layout_2D + endif + + if (layout_type .eq. layout_0D) then + do i = 1, 3 + dims(1, i) = nx + dims(2, i) = ny + dims(3, i) = nz + end do + else if (layout_type .eq. layout_1D) then + dims(1, 1) = nx + dims(2, 1) = ny + dims(3, 1) = nz + + dims(1, 2) = nx + dims(2, 2) = ny + dims(3, 2) = nz + + dims(1, 3) = nz + dims(2, 3) = nx + dims(3, 3) = ny + else if (layout_type .eq. layout_2D) then + dims(1, 1) = nx + dims(2, 1) = ny + dims(3, 1) = nz + + dims(1, 2) = ny + dims(2, 2) = nx + dims(3, 2) = nz + + dims(1, 3) = nz + dims(2, 3) = nx + dims(3, 3) = ny + + endif + do i = 1, 3 + dims(2, i) = dims(2, i) / np1 + dims(3, i) = dims(3, i) / np2 + end do + + +c--------------------------------------------------------------------- +c Determine processor coordinates of this processor +c Processor grid is np1xnp2. +c Arrays are always (n1, n2/np1, n3/np2) +c Processor coords are zero-based. +c--------------------------------------------------------------------- + me2 = mod(me, np2) ! goes from 0...np2-1 + me1 = me/np2 ! goes from 0...np1-1 +c--------------------------------------------------------------------- +c Communicators for rows/columns of processor grid. +c commslice1 is communicator of all procs with same me1, ranked as me2 +c commslice2 is communicator of all procs with same me2, ranked as me1 +c mpi_comm_split(comm, color, key, ...) +c--------------------------------------------------------------------- + call MPI_Comm_split(MPI_COMM_WORLD, me1, me2, commslice1, ierr) + call MPI_Comm_split(MPI_COMM_WORLD, me2, me1, commslice2, ierr) + if (timers_enabled) call synchup() + + if (debug) print *, 'proc coords: ', me, me1, me2 + +c--------------------------------------------------------------------- +c Determine which section of the grid is owned by this +c processor. +c--------------------------------------------------------------------- + if (layout_type .eq. layout_0d) then + + do i = 1, 3 + xstart(i) = 1 + xend(i) = nx + ystart(i) = 1 + yend(i) = ny + zstart(i) = 1 + zend(i) = nz + end do + + else if (layout_type .eq. layout_1d) then + + xstart(1) = 1 + xend(1) = nx + ystart(1) = 1 + yend(1) = ny + zstart(1) = 1 + me2 * nz/np2 + zend(1) = (me2+1) * nz/np2 + + xstart(2) = 1 + xend(2) = nx + ystart(2) = 1 + yend(2) = ny + zstart(2) = 1 + me2 * nz/np2 + zend(2) = (me2+1) * nz/np2 + + xstart(3) = 1 + xend(3) = nx + ystart(3) = 1 + me2 * ny/np2 + yend(3) = (me2+1) * ny/np2 + zstart(3) = 1 + zend(3) = nz + + else if (layout_type .eq. layout_2d) then + + xstart(1) = 1 + xend(1) = nx + ystart(1) = 1 + me1 * ny/np1 + yend(1) = (me1+1) * ny/np1 + zstart(1) = 1 + me2 * nz/np2 + zend(1) = (me2+1) * nz/np2 + + xstart(2) = 1 + me1 * nx/np1 + xend(2) = (me1+1)*nx/np1 + ystart(2) = 1 + yend(2) = ny + zstart(2) = zstart(1) + zend(2) = zend(1) + + xstart(3) = xstart(2) + xend(3) = xend(2) + ystart(3) = 1 + me2 *ny/np2 + yend(3) = (me2+1)*ny/np2 + zstart(3) = 1 + zend(3) = nz + endif + +c--------------------------------------------------------------------- +c Set up info for blocking of ffts and transposes. This improves +c performance on cache-based systems. Blocking involves +c working on a chunk of the problem at a time, taking chunks +c along the first, second, or third dimension. +c +c - In cffts1 blocking is on 2nd dimension (with fft on 1st dim) +c - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims) + +c Since 1st dim is always in processor, we'll assume it's long enough +c (default blocking factor is 16 so min size for 1st dim is 16) +c The only case we have to worry about is cffts1 in a 2d decomposition. +c so the blocking factor should not be larger than the 2nd dimension. +c--------------------------------------------------------------------- + + fftblock = fftblock_default + fftblockpad = fftblockpad_default + + if (layout_type .eq. layout_2d) then + if (dims(2, 1) .lt. fftblock) fftblock = dims(2, 1) + if (dims(2, 2) .lt. fftblock) fftblock = dims(2, 2) + if (dims(2, 3) .lt. fftblock) fftblock = dims(2, 3) + endif + + if (fftblock .ne. fftblock_default) fftblockpad = fftblock+3 + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine compute_indexmap(twiddle, d1, d2, d3) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 +c for time evolution exponent. +c--------------------------------------------------------------------- + + implicit none + include 'mpinpb.h' + include 'global.h' + integer d1, d2, d3 + integer i, j, k, ii, ii2, jj, ij2, kk + double precision ap, twiddle(d1, d2, d3) + +c--------------------------------------------------------------------- +c this function is very different depending on whether +c we are in the 0d, 1d or 2d layout. Compute separately. +c basically we want to convert the fortran indices +c 1 2 3 4 5 6 7 8 +c to +c 0 1 2 3 -4 -3 -2 -1 +c The following magic formula does the trick: +c mod(i-1+n/2, n) - n/2 +c--------------------------------------------------------------------- + + ap = - 4.d0 * alpha * pi *pi + + if (layout_type .eq. layout_0d) then ! xyz layout + do i = 1, dims(1,3) + ii = mod(i+xstart(3)-2+nx/2, nx) - nx/2 + ii2 = ii*ii + do j = 1, dims(2,3) + jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2 + ij2 = jj*jj+ii2 + do k = 1, dims(3,3) + kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2 + twiddle(i,j,k) = dexp(ap*dfloat(kk*kk+ij2)) + end do + end do + end do + else if (layout_type .eq. layout_1d) then ! zxy layout + do i = 1,dims(2,3) + ii = mod(i+xstart(3)-2+nx/2, nx) - nx/2 + ii2 = ii*ii + do j = 1,dims(3,3) + jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2 + ij2 = jj*jj+ii2 + do k = 1,dims(1,3) + kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2 + twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2)) + end do + end do + end do + else if (layout_type .eq. layout_2d) then ! zxy layout + do i = 1,dims(2,3) + ii = mod(i+xstart(3)-2+nx/2, nx) - nx/2 + ii2 = ii*ii + do j = 1, dims(3,3) + jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2 + ij2 = jj*jj+ii2 + do k =1,dims(1,3) + kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2 + twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2)) + end do + end do + end do + else + print *, ' Unknown layout type ', layout_type + stop + endif + + return + end + + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine print_timers() + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + integer i + include 'global.h' + character*25 tstrings(T_max) + data tstrings / ' total ', + > ' setup ', + > ' fft ', + > ' evolve ', + > ' checksum ', + > ' fftlow ', + > ' fftcopy ', + > ' transpose ', + > ' transpose1_loc ', + > ' transpose1_glo ', + > ' transpose1_fin ', + > ' transpose2_loc ', + > ' transpose2_glo ', + > ' transpose2_fin ', + > ' sync ' / + + if (me .ne. 0) return + do i = 1, t_max + if (timer_read(i) .ne. 0.0d0) then + write(*, 100) i, tstrings(i), timer_read(i) + endif + end do + 100 format(' timer ', i2, '(', A16, ') :', F10.6) + return + end + + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine fft(dir, x1, x2) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer dir + double complex x1(ntdivnp), x2(ntdivnp) + + double complex scratch(fftblockpad_default*maxdim*2) + +c--------------------------------------------------------------------- +c note: args x1, x2 must be different arrays +c note: args for cfftsx are (direction, layout, xin, xout, scratch) +c xin/xout may be the same and it can be somewhat faster +c if they are +c note: args for transpose are (layout1, layout2, xin, xout) +c xin/xout must be different +c--------------------------------------------------------------------- + + if (dir .eq. 1) then + if (layout_type .eq. layout_0d) then + call cffts1(1, dims(1,1), dims(2,1), dims(3,1), + > x1, x1, scratch) + call cffts2(1, dims(1,2), dims(2,2), dims(3,2), + > x1, x1, scratch) + call cffts3(1, dims(1,3), dims(2,3), dims(3,3), + > x1, x2, scratch) + else if (layout_type .eq. layout_1d) then + call cffts1(1, dims(1,1), dims(2,1), dims(3,1), + > x1, x1, scratch) + call cffts2(1, dims(1,2), dims(2,2), dims(3,2), + > x1, x1, scratch) + if (timers_enabled) call timer_start(T_transpose) + call transpose_xy_z(2, 3, x1, x2) + if (timers_enabled) call timer_stop(T_transpose) + call cffts1(1, dims(1,3), dims(2,3), dims(3,3), + > x2, x2, scratch) + else if (layout_type .eq. layout_2d) then + call cffts1(1, dims(1,1), dims(2,1), dims(3,1), + > x1, x1, scratch) + if (timers_enabled) call timer_start(T_transpose) + call transpose_x_y(1, 2, x1, x2) + if (timers_enabled) call timer_stop(T_transpose) + call cffts1(1, dims(1,2), dims(2,2), dims(3,2), + > x2, x2, scratch) + if (timers_enabled) call timer_start(T_transpose) + call transpose_x_z(2, 3, x2, x1) + if (timers_enabled) call timer_stop(T_transpose) + call cffts1(1, dims(1,3), dims(2,3), dims(3,3), + > x1, x2, scratch) + endif + else + if (layout_type .eq. layout_0d) then + call cffts3(-1, dims(1,3), dims(2,3), dims(3,3), + > x1, x1, scratch) + call cffts2(-1, dims(1,2), dims(2,2), dims(3,2), + > x1, x1, scratch) + call cffts1(-1, dims(1,1), dims(2,1), dims(3,1), + > x1, x2, scratch) + else if (layout_type .eq. layout_1d) then + call cffts1(-1, dims(1,3), dims(2,3), dims(3,3), + > x1, x1, scratch) + if (timers_enabled) call timer_start(T_transpose) + call transpose_x_yz(3, 2, x1, x2) + if (timers_enabled) call timer_stop(T_transpose) + call cffts2(-1, dims(1,2), dims(2,2), dims(3,2), + > x2, x2, scratch) + call cffts1(-1, dims(1,1), dims(2,1), dims(3,1), + > x2, x2, scratch) + else if (layout_type .eq. layout_2d) then + call cffts1(-1, dims(1,3), dims(2,3), dims(3,3), + > x1, x1, scratch) + if (timers_enabled) call timer_start(T_transpose) + call transpose_x_z(3, 2, x1, x2) + if (timers_enabled) call timer_stop(T_transpose) + call cffts1(-1, dims(1,2), dims(2,2), dims(3,2), + > x2, x2, scratch) + if (timers_enabled) call timer_start(T_transpose) + call transpose_x_y(2, 1, x2, x1) + if (timers_enabled) call timer_stop(T_transpose) + call cffts1(-1, dims(1,1), dims(2,1), dims(3,1), + > x1, x2, scratch) + endif + endif + return + end + + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine cffts1(is, d1, d2, d3, x, xout, y) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'global.h' + integer is, d1, d2, d3, logd1 + double complex x(d1,d2,d3) + double complex xout(d1,d2,d3) + double complex y(fftblockpad, d1, 2) + integer i, j, k, jj + + logd1 = ilog2(d1) + + do k = 1, d3 + do jj = 0, d2 - fftblock, fftblock + if (timers_enabled) call timer_start(T_fftcopy) + do j = 1, fftblock + do i = 1, d1 + y(j,i,1) = x(i,j+jj,k) + enddo + enddo + if (timers_enabled) call timer_stop(T_fftcopy) + + if (timers_enabled) call timer_start(T_fftlow) + call cfftz (is, logd1, d1, y, y(1,1,2)) + if (timers_enabled) call timer_stop(T_fftlow) + + if (timers_enabled) call timer_start(T_fftcopy) + do j = 1, fftblock + do i = 1, d1 + xout(i,j+jj,k) = y(j,i,1) + enddo + enddo + if (timers_enabled) call timer_stop(T_fftcopy) + enddo + enddo + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine cffts2(is, d1, d2, d3, x, xout, y) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'global.h' + integer is, d1, d2, d3, logd2 + double complex x(d1,d2,d3) + double complex xout(d1,d2,d3) + double complex y(fftblockpad, d2, 2) + integer i, j, k, ii + + logd2 = ilog2(d2) + + do k = 1, d3 + do ii = 0, d1 - fftblock, fftblock + if (timers_enabled) call timer_start(T_fftcopy) + do j = 1, d2 + do i = 1, fftblock + y(i,j,1) = x(i+ii,j,k) + enddo + enddo + if (timers_enabled) call timer_stop(T_fftcopy) + + if (timers_enabled) call timer_start(T_fftlow) + call cfftz (is, logd2, d2, y, y(1, 1, 2)) + if (timers_enabled) call timer_stop(T_fftlow) + + if (timers_enabled) call timer_start(T_fftcopy) + do j = 1, d2 + do i = 1, fftblock + xout(i+ii,j,k) = y(i,j,1) + enddo + enddo + if (timers_enabled) call timer_stop(T_fftcopy) + enddo + enddo + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine cffts3(is, d1, d2, d3, x, xout, y) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'global.h' + integer is, d1, d2, d3, logd3 + double complex x(d1,d2,d3) + double complex xout(d1,d2,d3) + double complex y(fftblockpad, d3, 2) + integer i, j, k, ii + + logd3 = ilog2(d3) + + do j = 1, d2 + do ii = 0, d1 - fftblock, fftblock + if (timers_enabled) call timer_start(T_fftcopy) + do k = 1, d3 + do i = 1, fftblock + y(i,k,1) = x(i+ii,j,k) + enddo + enddo + if (timers_enabled) call timer_stop(T_fftcopy) + + if (timers_enabled) call timer_start(T_fftlow) + call cfftz (is, logd3, d3, y, y(1, 1, 2)) + if (timers_enabled) call timer_stop(T_fftlow) + + if (timers_enabled) call timer_start(T_fftcopy) + do k = 1, d3 + do i = 1, fftblock + xout(i+ii,j,k) = y(i,k,1) + enddo + enddo + if (timers_enabled) call timer_stop(T_fftcopy) + enddo + enddo + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine fft_init (n) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c compute the roots-of-unity array that will be used for subsequent FFTs. +c--------------------------------------------------------------------- + + implicit none + include 'global.h' + + integer m,n,nu,ku,i,j,ln + double precision t, ti + + +c--------------------------------------------------------------------- +c Initialize the U array with sines and cosines in a manner that permits +c stride one access at each FFT iteration. +c--------------------------------------------------------------------- + nu = n + m = ilog2(n) + u(1) = m + ku = 2 + ln = 1 + + do j = 1, m + t = pi / ln + + do i = 0, ln - 1 + ti = i * t + u(i+ku) = dcmplx (cos (ti), sin(ti)) + enddo + + ku = ku + ln + ln = 2 * ln + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine cfftz (is, m, n, x, y) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c Computes NY N-point complex-to-complex FFTs of X using an algorithm due +c to Swarztrauber. X is both the input and the output array, while Y is a +c scratch array. It is assumed that N = 2^M. Before calling CFFTZ to +c perform FFTs, the array U must be initialized by calling CFFTZ with IS +c set to 0 and M set to MX, where MX is the maximum value of M for any +c subsequent call. +c--------------------------------------------------------------------- + + implicit none + include 'global.h' + + integer is,m,n,i,j,l,mx + double complex x, y + + dimension x(fftblockpad,n), y(fftblockpad,n) + +c--------------------------------------------------------------------- +c Check if input parameters are invalid. +c--------------------------------------------------------------------- + mx = u(1) + if ((is .ne. 1 .and. is .ne. -1) .or. m .lt. 1 .or. m .gt. mx) + > then + write (*, 1) is, m, mx + 1 format ('CFFTZ: Either U has not been initialized, or else'/ + > 'one of the input parameters is invalid', 3I5) + stop + endif + +c--------------------------------------------------------------------- +c Perform one variant of the Stockham FFT. +c--------------------------------------------------------------------- + do l = 1, m, 2 + call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y) + if (l .eq. m) goto 160 + call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x) + enddo + + goto 180 + +c--------------------------------------------------------------------- +c Copy Y to X. +c--------------------------------------------------------------------- + 160 do j = 1, n + do i = 1, fftblock + x(i,j) = y(i,j) + enddo + enddo + + 180 continue + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c Performs the L-th iteration of the second variant of the Stockham FFT. +c--------------------------------------------------------------------- + + implicit none + + integer is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22 + double complex u,x,y,u1,x11,x21 + dimension u(n), x(ny1,n), y(ny1,n) + + +c--------------------------------------------------------------------- +c Set initial parameters. +c--------------------------------------------------------------------- + + n1 = n / 2 + lk = 2 ** (l - 1) + li = 2 ** (m - l) + lj = 2 * lk + ku = li + 1 + + do i = 0, li - 1 + i11 = i * lk + 1 + i12 = i11 + n1 + i21 = i * lj + 1 + i22 = i21 + lk + if (is .ge. 1) then + u1 = u(ku+i) + else + u1 = dconjg (u(ku+i)) + endif + +c--------------------------------------------------------------------- +c This loop is vectorizable. +c--------------------------------------------------------------------- + do k = 0, lk - 1 + do j = 1, ny + x11 = x(j,i11+k) + x21 = x(j,i12+k) + y(j,i21+k) = x11 + x21 + y(j,i22+k) = u1 * (x11 - x21) + enddo + enddo + enddo + + return + end + +c--------------------------------------------------------------------- + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + integer function ilog2(n) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + integer n, nn, lg + if (n .eq. 1) then + ilog2=0 + return + endif + lg = 1 + nn = 2 + do while (nn .lt. n) + nn = nn*2 + lg = lg+1 + end do + ilog2 = lg + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine transpose_x_yz(l1, l2, xin, xout) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer l1, l2 + double complex xin(ntdivnp), xout(ntdivnp) + + call transpose2_local(dims(1,l1),dims(2, l1)*dims(3, l1), + > xin, xout) + + call transpose2_global(xout, xin) + + call transpose2_finish(dims(1,l1),dims(2, l1)*dims(3, l1), + > xin, xout) + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine transpose_xy_z(l1, l2, xin, xout) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer l1, l2 + double complex xin(ntdivnp), xout(ntdivnp) + + call transpose2_local(dims(1,l1)*dims(2, l1),dims(3, l1), + > xin, xout) + call transpose2_global(xout, xin) + call transpose2_finish(dims(1,l1)*dims(2, l1),dims(3, l1), + > xin, xout) + + return + end + + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine transpose2_local(n1, n2, xin, xout) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + include 'mpinpb.h' + include 'global.h' + integer n1, n2 + double complex xin(n1, n2), xout(n2, n1) + + double complex z(transblockpad, transblock) + + integer i, j, ii, jj + + if (timers_enabled) call timer_start(T_transxzloc) + +c--------------------------------------------------------------------- +c If possible, block the transpose for cache memory systems. +c How much does this help? Example: R8000 Power Challenge (90 MHz) +c Blocked version decreases time spend in this routine +c from 14 seconds to 5.2 seconds on 8 nodes class A. +c--------------------------------------------------------------------- + + if (n1 .lt. transblock .or. n2 .lt. transblock) then + if (n1 .ge. n2) then + do j = 1, n2 + do i = 1, n1 + xout(j, i) = xin(i, j) + end do + end do + else + do i = 1, n1 + do j = 1, n2 + xout(j, i) = xin(i, j) + end do + end do + endif + else + do j = 0, n2-1, transblock + do i = 0, n1-1, transblock + +c--------------------------------------------------------------------- +c Note: compiler should be able to take j+jj out of inner loop +c--------------------------------------------------------------------- + do jj = 1, transblock + do ii = 1, transblock + z(jj,ii) = xin(i+ii, j+jj) + end do + end do + + do ii = 1, transblock + do jj = 1, transblock + xout(j+jj, i+ii) = z(jj,ii) + end do + end do + + end do + end do + endif + if (timers_enabled) call timer_stop(T_transxzloc) + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine transpose2_global(xin, xout) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + include 'global.h' + include 'mpinpb.h' + double complex xin(ntdivnp) + double complex xout(ntdivnp) + integer ierr + + if (timers_enabled) call synchup() + + if (timers_enabled) call timer_start(T_transxzglo) + call mpi_alltoall(xin, ntdivnp/np, dc_type, + > xout, ntdivnp/np, dc_type, + > commslice1, ierr) + if (timers_enabled) call timer_stop(T_transxzglo) + + return + end + + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine transpose2_finish(n1, n2, xin, xout) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer n1, n2, ioff + double complex xin(n2, n1/np2, 0:np2-1), xout(n2*np2, n1/np2) + + integer i, j, p + + if (timers_enabled) call timer_start(T_transxzfin) + do p = 0, np2-1 + ioff = p*n2 + do j = 1, n1/np2 + do i = 1, n2 + xout(i+ioff, j) = xin(i, j, p) + end do + end do + end do + if (timers_enabled) call timer_stop(T_transxzfin) + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine transpose_x_z(l1, l2, xin, xout) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer l1, l2 + double complex xin(ntdivnp), xout(ntdivnp) + + call transpose_x_z_local(dims(1,l1),dims(2,l1),dims(3,l1), + > xin, xout) + call transpose_x_z_global(dims(1,l1),dims(2,l1),dims(3,l1), + > xout, xin) + call transpose_x_z_finish(dims(1,l2),dims(2,l2),dims(3,l2), + > xin, xout) + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine transpose_x_z_local(d1, d2, d3, xin, xout) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer d1, d2, d3 + double complex xin(d1,d2,d3) + double complex xout(d3,d2,d1) + integer block1, block3 + integer i, j, k, kk, ii, i1, k1 + + double complex buf(transblockpad, maxdim) + if (timers_enabled) call timer_start(T_transxzloc) + if (d1 .lt. 32) goto 100 + block3 = d3 + if (block3 .eq. 1) goto 100 + if (block3 .gt. transblock) block3 = transblock + block1 = d1 + if (block1*block3 .gt. transblock*transblock) + > block1 = transblock*transblock/block3 +c--------------------------------------------------------------------- +c blocked transpose +c--------------------------------------------------------------------- + do j = 1, d2 + do kk = 0, d3-block3, block3 + do ii = 0, d1-block1, block1 + + do k = 1, block3 + k1 = k + kk + do i = 1, block1 + buf(k, i) = xin(i+ii, j, k1) + end do + end do + + do i = 1, block1 + i1 = i + ii + do k = 1, block3 + xout(k+kk, j, i1) = buf(k, i) + end do + end do + + end do + end do + end do + goto 200 + + +c--------------------------------------------------------------------- +c basic transpose +c--------------------------------------------------------------------- + 100 continue + + do j = 1, d2 + do k = 1, d3 + do i = 1, d1 + xout(k, j, i) = xin(i, j, k) + end do + end do + end do + +c--------------------------------------------------------------------- +c all done +c--------------------------------------------------------------------- + 200 continue + + if (timers_enabled) call timer_stop(T_transxzloc) + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine transpose_x_z_global(d1, d2, d3, xin, xout) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + include 'global.h' + include 'mpinpb.h' + integer d1, d2, d3 + double complex xin(d3,d2,d1) + double complex xout(d3,d2,d1) ! not real layout, but right size + integer ierr + + if (timers_enabled) call synchup() + +c--------------------------------------------------------------------- +c do transpose among all processes with same 1-coord (me1) +c--------------------------------------------------------------------- + if (timers_enabled)call timer_start(T_transxzglo) + call mpi_alltoall(xin, d1*d2*d3/np2, dc_type, + > xout, d1*d2*d3/np2, dc_type, + > commslice1, ierr) + if (timers_enabled) call timer_stop(T_transxzglo) + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine transpose_x_z_finish(d1, d2, d3, xin, xout) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer d1, d2, d3 + double complex xin(d1/np2, d2, d3, 0:np2-1) + double complex xout(d1,d2,d3) + integer i, j, k, p, ioff + if (timers_enabled) call timer_start(T_transxzfin) +c--------------------------------------------------------------------- +c this is the most straightforward way of doing it. the +c calculation in the inner loop doesn't help. +c do i = 1, d1/np2 +c do j = 1, d2 +c do k = 1, d3 +c do p = 0, np2-1 +c ii = i + p*d1/np2 +c xout(ii, j, k) = xin(i, j, k, p) +c end do +c end do +c end do +c end do +c--------------------------------------------------------------------- + + do p = 0, np2-1 + ioff = p*d1/np2 + do k = 1, d3 + do j = 1, d2 + do i = 1, d1/np2 + xout(i+ioff, j, k) = xin(i, j, k, p) + end do + end do + end do + end do + if (timers_enabled) call timer_stop(T_transxzfin) + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine transpose_x_y(l1, l2, xin, xout) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer l1, l2 + double complex xin(ntdivnp), xout(ntdivnp) + +c--------------------------------------------------------------------- +c xy transpose is a little tricky, since we don't want +c to touch 3rd axis. But alltoall must involve 3rd axis (most +c slowly varying) to be efficient. So we do +c (nx, ny/np1, nz/np2) -> (ny/np1, nz/np2, nx) (local) +c (ny/np1, nz/np2, nx) -> ((ny/np1*nz/np2)*np1, nx/np1) (global) +c then local finish. +c--------------------------------------------------------------------- + + + call transpose_x_y_local(dims(1,l1),dims(2,l1),dims(3,l1), + > xin, xout) + call transpose_x_y_global(dims(1,l1),dims(2,l1),dims(3,l1), + > xout, xin) + call transpose_x_y_finish(dims(1,l2),dims(2,l2),dims(3,l2), + > xin, xout) + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine transpose_x_y_local(d1, d2, d3, xin, xout) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer d1, d2, d3 + double complex xin(d1, d2, d3) + double complex xout(d2, d3, d1) + integer i, j, k + if (timers_enabled) call timer_start(T_transxyloc) + + do k = 1, d3 + do i = 1, d1 + do j = 1, d2 + xout(j,k,i)=xin(i,j,k) + end do + end do + end do + if (timers_enabled) call timer_stop(T_transxyloc) + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine transpose_x_y_global(d1, d2, d3, xin, xout) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + include 'global.h' + include 'mpinpb.h' + integer d1, d2, d3 +c--------------------------------------------------------------------- +c array is in form (ny/np1, nz/np2, nx) +c--------------------------------------------------------------------- + double complex xin(d2,d3,d1) + double complex xout(d2,d3,d1) ! not real layout but right size + integer ierr + + if (timers_enabled) call synchup() + +c--------------------------------------------------------------------- +c do transpose among all processes with same 1-coord (me1) +c--------------------------------------------------------------------- + if (timers_enabled) call timer_start(T_transxyglo) + call mpi_alltoall(xin, d1*d2*d3/np1, dc_type, + > xout, d1*d2*d3/np1, dc_type, + > commslice2, ierr) + if (timers_enabled) call timer_stop(T_transxyglo) + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine transpose_x_y_finish(d1, d2, d3, xin, xout) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer d1, d2, d3 + double complex xin(d1/np1, d3, d2, 0:np1-1) + double complex xout(d1,d2,d3) + integer i, j, k, p, ioff + if (timers_enabled) call timer_start(T_transxyfin) +c--------------------------------------------------------------------- +c this is the most straightforward way of doing it. the +c calculation in the inner loop doesn't help. +c do i = 1, d1/np1 +c do j = 1, d2 +c do k = 1, d3 +c do p = 0, np1-1 +c ii = i + p*d1/np1 +c note order is screwy bcz we have (ny/np1, nz/np2, nx) -> (ny, nx/np1, nz/np2) +c xout(ii, j, k) = xin(i, k, j, p) +c end do +c end do +c end do +c end do +c--------------------------------------------------------------------- + + do p = 0, np1-1 + ioff = p*d1/np1 + do k = 1, d3 + do j = 1, d2 + do i = 1, d1/np1 + xout(i+ioff, j, k) = xin(i, k, j, p) + end do + end do + end do + end do + if (timers_enabled) call timer_stop(T_transxyfin) + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine checksum(i, u1, d1, d2, d3) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + include 'global.h' + include 'mpinpb.h' + integer i, d1, d2, d3 + double complex u1(d1, d2, d3) + integer j, q,r,s, ierr + double complex chk,allchk + chk = (0.0,0.0) + + do j=1,1024 + q = mod(j, nx)+1 + if (q .ge. xstart(1) .and. q .le. xend(1)) then + r = mod(3*j,ny)+1 + if (r .ge. ystart(1) .and. r .le. yend(1)) then + s = mod(5*j,nz)+1 + if (s .ge. zstart(1) .and. s .le. zend(1)) then + chk=chk+u1(q-xstart(1)+1,r-ystart(1)+1,s-zstart(1)+1) + end if + end if + end if + end do + chk = chk/ntotal_f + + call MPI_Reduce(chk, allchk, 1, dc_type, MPI_SUM, + > 0, MPI_COMM_WORLD, ierr) + if (me .eq. 0) then + write (*, 30) i, allchk + 30 format (' T =',I5,5X,'Checksum =',1P2D22.12) + endif + +c sums(i) = allchk +c If we compute the checksum for diagnostic purposes, we let i be +c negative, so the result will not be stored in an array + if (i .gt. 0) sums(i) = allchk + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine synchup + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + include 'global.h' + include 'mpinpb.h' + integer ierr + call timer_start(T_synch) + call mpi_barrier(MPI_COMM_WORLD, ierr) + call timer_stop(T_synch) + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine verify (d1, d2, d3, nt, verified, class) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + include 'global.h' + include 'mpinpb.h' + integer d1, d2, d3, nt + character class + logical verified + integer ierr, size, i + double precision err, epsilon + +c--------------------------------------------------------------------- +c Reference checksums +c--------------------------------------------------------------------- + double complex csum_ref(25) + + + class = 'U' + + if (me .ne. 0) return + + epsilon = 1.0d-12 + verified = .FALSE. + + if (d1 .eq. 64 .and. + > d2 .eq. 64 .and. + > d3 .eq. 64 .and. + > nt .eq. 6) then +c--------------------------------------------------------------------- +c Sample size reference checksums +c--------------------------------------------------------------------- + class = 'S' + csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02) + csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02) + csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02) + csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02) + csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02) + csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02) + + else if (d1 .eq. 128 .and. + > d2 .eq. 128 .and. + > d3 .eq. 32 .and. + > nt .eq. 6) then +c--------------------------------------------------------------------- +c Class W size reference checksums +c--------------------------------------------------------------------- + class = 'W' + csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02) + csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02) + csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02) + csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02) + csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02) + csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02) + + else if (d1 .eq. 256 .and. + > d2 .eq. 256 .and. + > d3 .eq. 128 .and. + > nt .eq. 6) then +c--------------------------------------------------------------------- +c Class A size reference checksums +c--------------------------------------------------------------------- + class = 'A' + csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02) + csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02) + csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02) + csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02) + csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02) + csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02) + + else if (d1 .eq. 512 .and. + > d2 .eq. 256 .and. + > d3 .eq. 256 .and. + > nt .eq. 20) then +c--------------------------------------------------------------------- +c Class B size reference checksums +c--------------------------------------------------------------------- + class = 'B' + csum_ref(1) = dcmplx(5.177643571579D+02, 5.077803458597D+02) + csum_ref(2) = dcmplx(5.154521291263D+02, 5.088249431599D+02) + csum_ref(3) = dcmplx(5.146409228649D+02, 5.096208912659D+02) + csum_ref(4) = dcmplx(5.142378756213D+02, 5.101023387619D+02) + csum_ref(5) = dcmplx(5.139626667737D+02, 5.103976610617D+02) + csum_ref(6) = dcmplx(5.137423460082D+02, 5.105948019802D+02) + csum_ref(7) = dcmplx(5.135547056878D+02, 5.107404165783D+02) + csum_ref(8) = dcmplx(5.133910925466D+02, 5.108576573661D+02) + csum_ref(9) = dcmplx(5.132470705390D+02, 5.109577278523D+02) + csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02) + csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02) + csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02) + csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02) + csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02) + csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02) + csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02) + csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02) + csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02) + csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02) + csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02) + + else if (d1 .eq. 512 .and. + > d2 .eq. 512 .and. + > d3 .eq. 512 .and. + > nt .eq. 20) then +c--------------------------------------------------------------------- +c Class C size reference checksums +c--------------------------------------------------------------------- + class = 'C' + csum_ref(1) = dcmplx(5.195078707457D+02, 5.149019699238D+02) + csum_ref(2) = dcmplx(5.155422171134D+02, 5.127578201997D+02) + csum_ref(3) = dcmplx(5.144678022222D+02, 5.122251847514D+02) + csum_ref(4) = dcmplx(5.140150594328D+02, 5.121090289018D+02) + csum_ref(5) = dcmplx(5.137550426810D+02, 5.121143685824D+02) + csum_ref(6) = dcmplx(5.135811056728D+02, 5.121496764568D+02) + csum_ref(7) = dcmplx(5.134569343165D+02, 5.121870921893D+02) + csum_ref(8) = dcmplx(5.133651975661D+02, 5.122193250322D+02) + csum_ref(9) = dcmplx(5.132955192805D+02, 5.122454735794D+02) + csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02) + csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02) + csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02) + csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02) + csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02) + csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02) + csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02) + csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02) + csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02) + csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02) + csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02) + + else if (d1 .eq. 2048 .and. + > d2 .eq. 1024 .and. + > d3 .eq. 1024 .and. + > nt .eq. 25) then +c--------------------------------------------------------------------- +c Class D size reference checksums +c--------------------------------------------------------------------- + class = 'D' + csum_ref(1) = dcmplx(5.122230065252D+02, 5.118534037109D+02) + csum_ref(2) = dcmplx(5.120463975765D+02, 5.117061181082D+02) + csum_ref(3) = dcmplx(5.119865766760D+02, 5.117096364601D+02) + csum_ref(4) = dcmplx(5.119518799488D+02, 5.117373863950D+02) + csum_ref(5) = dcmplx(5.119269088223D+02, 5.117680347632D+02) + csum_ref(6) = dcmplx(5.119082416858D+02, 5.117967875532D+02) + csum_ref(7) = dcmplx(5.118943814638D+02, 5.118225281841D+02) + csum_ref(8) = dcmplx(5.118842385057D+02, 5.118451629348D+02) + csum_ref(9) = dcmplx(5.118769435632D+02, 5.118649119387D+02) + csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02) + csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02) + csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02) + csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02) + csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02) + csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02) + csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02) + csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02) + csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02) + csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02) + csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02) + csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02) + csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02) + csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02) + csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02) + csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02) + + else if (d1 .eq. 4096 .and. + > d2 .eq. 2048 .and. + > d3 .eq. 2048 .and. + > nt .eq. 25) then +c--------------------------------------------------------------------- +c Class E size reference checksums +c--------------------------------------------------------------------- + class = 'E' + csum_ref(1) = dcmplx(5.121601045346D+02, 5.117395998266D+02) + csum_ref(2) = dcmplx(5.120905403678D+02, 5.118614716182D+02) + csum_ref(3) = dcmplx(5.120623229306D+02, 5.119074203747D+02) + csum_ref(4) = dcmplx(5.120438418997D+02, 5.119345900733D+02) + csum_ref(5) = dcmplx(5.120311521872D+02, 5.119551325550D+02) + csum_ref(6) = dcmplx(5.120226088809D+02, 5.119720179919D+02) + csum_ref(7) = dcmplx(5.120169296534D+02, 5.119861371665D+02) + csum_ref(8) = dcmplx(5.120131225172D+02, 5.119979364402D+02) + csum_ref(9) = dcmplx(5.120104767108D+02, 5.120077674092D+02) + csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02) + csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02) + csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02) + csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02) + csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02) + csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02) + csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02) + csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02) + csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02) + csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02) + csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02) + csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02) + csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02) + csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02) + csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02) + csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02) + + endif + + + if (class .ne. 'U') then + + do i = 1, nt + err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) ) + if (.not.(err .le. epsilon)) goto 100 + end do + verified = .TRUE. + 100 continue + + endif + + call MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr) + if (size .ne. np) then + write(*, 4010) np + write(*, 4011) + write(*, 4012) +c--------------------------------------------------------------------- +c multiple statements because some Fortran compilers have +c problems with long strings. +c--------------------------------------------------------------------- + 4010 format( ' Warning: benchmark was compiled for ', i5, + > 'processors') + 4011 format( ' Must be run on this many processors for official', + > ' verification') + 4012 format( ' so memory access is repeatable') + verified = .false. + endif + + if (class .ne. 'U') then + if (verified) then + write(*,2000) + 2000 format(' Result verification successful') + else + write(*,2001) + 2001 format(' Result verification failed') + endif + endif + print *, 'class = ', class + + return + end + + diff --git a/examples/smpi/NAS/FT/global.h b/examples/smpi/NAS/FT/global.h new file mode 100644 index 0000000000..3e534bb48d --- /dev/null +++ b/examples/smpi/NAS/FT/global.h @@ -0,0 +1,134 @@ + include 'npbparams.h' + +c 2D processor array -> 2D grid decomposition (by pencils) +c If processor array is 1xN or -> 1D grid decomposition (by planes) +c If processor array is 1x1 -> 0D grid decomposition +c For simplicity, do not treat Nx1 (np2 = 1) specially + integer np1, np2, np + +c basic decomposition strategy + integer layout_type + integer layout_0D, layout_1D, layout_2D + parameter (layout_0D = 0, layout_1D = 1, layout_2D = 2) + + common /procgrid/ np1, np2, layout_type, np + + +c Cache blocking params. These values are good for most +c RISC processors. +c FFT parameters: +c fftblock controls how many ffts are done at a time. +c The default is appropriate for most cache-based machines +c On vector machines, the FFT can be vectorized with vector +c length equal to the block size, so the block size should +c be as large as possible. This is the size of the smallest +c dimension of the problem: 128 for class A, 256 for class B and +c 512 for class C. +c Transpose parameters: +c transblock is the blocking factor for the transposes when there +c is a 1-D layout. On vector machines it should probably be +c large (largest dimension of the problem). + + + integer fftblock_default, fftblockpad_default + parameter (fftblock_default=16, fftblockpad_default=18) + integer transblock, transblockpad + parameter(transblock=32, transblockpad=34) + + integer fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +c we need a bunch of logic to keep track of how +c arrays are laid out. +c coords of this processor + integer me, me1, me2 + common /coords/ me, me1, me2 +c need a communicator for row/col in processor grid + integer commslice1, commslice2 + common /comms/ commslice1, commslice2 + + + +c There are basically three stages +c 1: x-y-z layout +c 2: after x-transform (before y) +c 3: after y-transform (before z) +c The computation proceeds logically as + +c set up initial conditions +c fftx(1) +c transpose (1->2) +c ffty(2) +c transpose (2->3) +c fftz(3) +c time evolution +c fftz(3) +c transpose (3->2) +c ffty(2) +c transpose (2->1) +c fftx(1) +c compute residual(1) + +c for the 0D, 1D, 2D strategies, the layouts look like xxx +c +c 0D 1D 2D +c 1: xyz xyz xyz +c 2: xyz xyz yxz +c 3: xyz zyx zxy + +c the array dimensions are stored in dims(coord, phase) + integer dims(3, 3) + integer xstart(3), ystart(3), zstart(3) + integer xend(3), yend(3), zend(3) + common /layout/ dims, + > xstart, ystart, zstart, + > xend, yend, zend + + integer T_total, T_setup, T_fft, T_evolve, T_checksum, + > T_fftlow, T_fftcopy, T_transpose, + > T_transxzloc, T_transxzglo, T_transxzfin, + > T_transxyloc, T_transxyglo, T_transxyfin, + > T_synch, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, + > T_evolve = 4, T_checksum = 5, + > T_fftlow = 6, T_fftcopy = 7, T_transpose = 8, + > T_transxzloc = 9, T_transxzglo = 10, T_transxzfin = 11, + > T_transxyloc = 12, T_transxyglo = 13, + > T_transxyfin = 14, T_synch = 15, T_max = 15) + + + + logical timers_enabled + parameter (timers_enabled = .false.) + + + external timer_read + double precision timer_read + external ilog2 + integer ilog2 + + external randlc + double precision randlc + + +c other stuff + logical debug, debugsynch + common /dbg/ debug, debugsynch + + double precision seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, + > pi = 3.141592653589793238d0, alpha=1.0d-6) + +c roots of unity array +c relies on x being largest dimension? + double complex u(nx) + common /ucomm/ u + + +c for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +c number of iterations + integer niter + common /iter/ niter diff --git a/examples/smpi/NAS/FT/inputft.data.sample b/examples/smpi/NAS/FT/inputft.data.sample new file mode 100644 index 0000000000..448ac42bc0 --- /dev/null +++ b/examples/smpi/NAS/FT/inputft.data.sample @@ -0,0 +1,3 @@ +6 ! number of iterations +2 ! layout type. 0 = 0d, 1 = 1d, 2 = 2d +2 4 ! processor layout. 0d must be "1 1"; 1d must be "1 N" diff --git a/examples/smpi/NAS/FT/mpinpb.h b/examples/smpi/NAS/FT/mpinpb.h new file mode 100644 index 0000000000..e43e552a84 --- /dev/null +++ b/examples/smpi/NAS/FT/mpinpb.h @@ -0,0 +1,4 @@ + include 'mpif.h' +c mpi data types + integer dc_type + common /mpistuff/ dc_type diff --git a/examples/smpi/NAS/IS/Makefile b/examples/smpi/NAS/IS/Makefile new file mode 100644 index 0000000000..26d35e8343 --- /dev/null +++ b/examples/smpi/NAS/IS/Makefile @@ -0,0 +1,23 @@ +SHELL=/bin/sh +BENCHMARK=is +BENCHMARKU=IS + +include ../config/make.def + +include ../sys/make.common + +OBJS = is.o ${COMMON}/c_print_results.o + + +${PROGRAM}: config ${OBJS} + ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${CMPI_LIB} + +.c.o: + ${CCOMPILE} $< + +is.o: is.c npbparams.h + + +clean: + - rm -f *.o *~ mputil* + - rm -f is npbparams.h core diff --git a/examples/smpi/NAS/IS/is.c b/examples/smpi/NAS/IS/is.c new file mode 100644 index 0000000000..01bd9ddceb --- /dev/null +++ b/examples/smpi/NAS/IS/is.c @@ -0,0 +1,1150 @@ +/************************************************************************* + * * + * N A S P A R A L L E L B E N C H M A R K S 3.3 * + * * + * I S * + * * + ************************************************************************* + * * + * This benchmark is part of the NAS Parallel Benchmark 3.3 suite. * + * It is described in NAS Technical Report 95-020. * + * * + * Permission to use, copy, distribute and modify this software * + * for any purpose with or without fee is hereby granted. We * + * request, however, that all derived work reference the NAS * + * Parallel Benchmarks 3.3. This software is provided "as is" * + * without express or implied warranty. * + * * + * Information on NPB 3.3, including the technical report, the * + * original specifications, source code, results and information * + * on how to submit new results, is available at: * + * * + * http://www.nas.nasa.gov/Software/NPB * + * * + * Send comments or suggestions to npb@nas.nasa.gov * + * Send bug reports to npb-bugs@nas.nasa.gov * + * * + * NAS Parallel Benchmarks Group * + * NASA Ames Research Center * + * Mail Stop: T27A-1 * + * Moffett Field, CA 94035-1000 * + * * + * E-mail: npb@nas.nasa.gov * + * Fax: (650) 604-3957 * + * * + ************************************************************************* + * * + * Author: M. Yarrow * + * H. Jin * + * * + *************************************************************************/ + +#include "mpi.h" +#include "npbparams.h" +#include +#include + +/******************/ +/* default values */ +/******************/ +#ifndef CLASS +#define CLASS 'S' +#define NUM_PROCS 1 +#endif +#define MIN_PROCS 1 + + +/*************/ +/* CLASS S */ +/*************/ +#if CLASS == 'S' +#define TOTAL_KEYS_LOG_2 16 +#define MAX_KEY_LOG_2 11 +#define NUM_BUCKETS_LOG_2 9 +#endif + + +/*************/ +/* CLASS W */ +/*************/ +#if CLASS == 'W' +#define TOTAL_KEYS_LOG_2 20 +#define MAX_KEY_LOG_2 16 +#define NUM_BUCKETS_LOG_2 10 +#endif + +/*************/ +/* CLASS A */ +/*************/ +#if CLASS == 'A' +#define TOTAL_KEYS_LOG_2 23 +#define MAX_KEY_LOG_2 19 +#define NUM_BUCKETS_LOG_2 10 +#endif + + +/*************/ +/* CLASS B */ +/*************/ +#if CLASS == 'B' +#define TOTAL_KEYS_LOG_2 25 +#define MAX_KEY_LOG_2 21 +#define NUM_BUCKETS_LOG_2 10 +#endif + + +/*************/ +/* CLASS C */ +/*************/ +#if CLASS == 'C' +#define TOTAL_KEYS_LOG_2 27 +#define MAX_KEY_LOG_2 23 +#define NUM_BUCKETS_LOG_2 10 +#endif + + +/*************/ +/* CLASS D */ +/*************/ +#if CLASS == 'D' +#define TOTAL_KEYS_LOG_2 29 +#define MAX_KEY_LOG_2 27 +#define NUM_BUCKETS_LOG_2 10 +#undef MIN_PROCS +#define MIN_PROCS 4 +#endif + + +#define TOTAL_KEYS (1 << TOTAL_KEYS_LOG_2) +#define MAX_KEY (1 << MAX_KEY_LOG_2) +#define NUM_BUCKETS (1 << NUM_BUCKETS_LOG_2) +#define NUM_KEYS (TOTAL_KEYS/NUM_PROCS*MIN_PROCS) + +/*****************************************************************/ +/* On larger number of processors, since the keys are (roughly) */ +/* gaussian distributed, the first and last processor sort keys */ +/* in a large interval, requiring array sizes to be larger. Note */ +/* that for large NUM_PROCS, NUM_KEYS is, however, a small number*/ +/* The required array size also depends on the bucket size used. */ +/* The following values are validated for the 1024-bucket setup. */ +/*****************************************************************/ +#if NUM_PROCS < 256 +#define SIZE_OF_BUFFERS 3*NUM_KEYS/2 +#elif NUM_PROCS < 512 +#define SIZE_OF_BUFFERS 5*NUM_KEYS/2 +#elif NUM_PROCS < 1024 +#define SIZE_OF_BUFFERS 4*NUM_KEYS +#else +#define SIZE_OF_BUFFERS 13*NUM_KEYS/2 +#endif + +/*****************************************************************/ +/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */ +/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE */ +/* MAX_PROCS AT YOUR PERIL */ +/*****************************************************************/ +#if CLASS == 'S' +#define MAX_PROCS 128 +#else +#define MAX_PROCS 1024 +#endif + +#define MAX_ITERATIONS 10 +#define TEST_ARRAY_SIZE 5 + + +/***********************************/ +/* Enable separate communication, */ +/* computation timing and printout */ +/***********************************/ +/* #define TIMING_ENABLED */ + + +/*************************************/ +/* Typedef: if necessary, change the */ +/* size of int here by changing the */ +/* int type to, say, long */ +/*************************************/ +typedef int INT_TYPE; +typedef long INT_TYPE2; +#define MP_KEY_TYPE MPI_INT + + +typedef struct { + +/********************/ +/* MPI properties: */ +/********************/ +int my_rank, + comm_size; + + +/********************/ +/* Some global info */ +/********************/ +INT_TYPE *key_buff_ptr_global, /* used by full_verify to get */ + total_local_keys, /* copies of rank info */ + total_lesser_keys; + + +int passed_verification; + + + +/************************************/ +/* These are the three main arrays. */ +/* See SIZE_OF_BUFFERS def above */ +/************************************/ +INT_TYPE key_array[SIZE_OF_BUFFERS], + key_buff1[SIZE_OF_BUFFERS], + key_buff2[SIZE_OF_BUFFERS], + bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE], /* Top 5 elements for */ + bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals */ + bucket_ptrs[NUM_BUCKETS], + process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE], + process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE]; +int send_count[MAX_PROCS], recv_count[MAX_PROCS], + send_displ[MAX_PROCS], recv_displ[MAX_PROCS]; + + +/**********************/ +/* Partial verif info */ +/**********************/ +INT_TYPE2 test_index_array[TEST_ARRAY_SIZE], + test_rank_array[TEST_ARRAY_SIZE]; + +/**********/ +/* Timers */ +/**********/ +double start[64], elapsed[64]; + +} global_data; + + +const INT_TYPE2 + S_test_index_array[TEST_ARRAY_SIZE] = + {48427,17148,23627,62548,4431}, + S_test_rank_array[TEST_ARRAY_SIZE] = + {0,18,346,64917,65463}, + + W_test_index_array[TEST_ARRAY_SIZE] = + {357773,934767,875723,898999,404505}, + W_test_rank_array[TEST_ARRAY_SIZE] = + {1249,11698,1039987,1043896,1048018}, + + A_test_index_array[TEST_ARRAY_SIZE] = + {2112377,662041,5336171,3642833,4250760}, + A_test_rank_array[TEST_ARRAY_SIZE] = + {104,17523,123928,8288932,8388264}, + + B_test_index_array[TEST_ARRAY_SIZE] = + {41869,812306,5102857,18232239,26860214}, + B_test_rank_array[TEST_ARRAY_SIZE] = + {33422937,10244,59149,33135281,99}, + + C_test_index_array[TEST_ARRAY_SIZE] = + {44172927,72999161,74326391,129606274,21736814}, + C_test_rank_array[TEST_ARRAY_SIZE] = + {61147,882988,266290,133997595,133525895}, + + D_test_index_array[TEST_ARRAY_SIZE] = + {1317351170,995930646,1157283250,1503301535,1453734525}, + D_test_rank_array[TEST_ARRAY_SIZE] = + {1,36538729,1978098519,2145192618,2147425337}; + + + +/***********************/ +/* function prototypes */ +/***********************/ +double randlc( double *X, double *A ); + +void full_verify( global_data* gd ); + +void c_print_results( char *name, + char class, + int n1, + int n2, + int n3, + int niter, + int nprocs_compiled, + int nprocs_total, + double t, + double mops, + char *optype, + int passed_verification, + char *npbversion, + char *compiletime, + char *mpicc, + char *clink, + char *cmpi_lib, + char *cmpi_inc, + char *cflags, + char *clinkflags ); + +void timer_clear(global_data* gd, int n ); +void timer_start(global_data* gd, int n ); +void timer_stop(global_data* gd, int n ); +double timer_read(global_data* gd, int n ); + +void timer_clear(global_data* gd, int n ) { + gd->elapsed[n] = 0.0; +} + +void timer_start(global_data* gd, int n ) { + gd->start[n] = MPI_Wtime(); +} + +void timer_stop(global_data* gd, int n ) { + gd->elapsed[n] += MPI_Wtime() - gd->start[n]; +} + +double timer_read(global_data* gd, int n ) { + return gd->elapsed[n]; +} + + +/* + * FUNCTION RANDLC (X, A) + * + * This routine returns a uniform pseudorandom double precision number in the + * range (0, 1) by using the linear congruential generator + * + * x_{k+1} = a x_k (mod 2^46) + * + * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers + * before repeating. The argument A is the same as 'a' in the above formula, + * and X is the same as x_0. A and X must be odd double precision integers + * in the range (1, 2^46). The returned value RANDLC is normalized to be + * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain + * the new seed x_1, so that subsequent calls to RANDLC using the same + * arguments will generate a continuous sequence. + * + * This routine should produce the same results on any computer with at least + * 48 mantissa bits in double precision floating point data. On Cray systems, + * double precision should be disabled. + * + * David H. Bailey October 26, 1990 + * + * IMPLICIT DOUBLE PRECISION (A-H, O-Z) + * SAVE KS, R23, R46, T23, T46 + * DATA KS/0/ + * + * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46, + * T23 = 2 ^ 23, and T46 = 2 ^ 46. These are computed in loops, rather than + * by merely using the ** operator, in order to insure that the results are + * exact on all systems. This code assumes that 0.5D0 is represented exactly. + */ + + +/*****************************************************************/ +/************* R A N D L C ************/ +/************* ************/ +/************* portable random number generator ************/ +/*****************************************************************/ + +double randlc( double *X, double *A ) +{ + static int KS=0; + static double R23, R46, T23, T46; + double T1, T2, T3, T4; + double A1; + double A2; + double X1; + double X2; + double Z; + int i, j; + + if (KS == 0) + { + R23 = 1.0; + R46 = 1.0; + T23 = 1.0; + T46 = 1.0; + + for (i=1; i<=23; i++) + { + R23 = 0.50 * R23; + T23 = 2.0 * T23; + } + for (i=1; i<=46; i++) + { + R46 = 0.50 * R46; + T46 = 2.0 * T46; + } + KS = 1; + } + +/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */ + + T1 = R23 * *A; + j = T1; + A1 = j; + A2 = *A - T23 * A1; + +/* Break X into two parts such that X = 2^23 * X1 + X2, compute + Z = A1 * X2 + A2 * X1 (mod 2^23), and then + X = 2^23 * Z + A2 * X2 (mod 2^46). */ + + T1 = R23 * *X; + j = T1; + X1 = j; + X2 = *X - T23 * X1; + T1 = A1 * X2 + A2 * X1; + + j = R23 * T1; + T2 = j; + Z = T1 - T23 * T2; + T3 = T23 * Z + A2 * X2; + j = R46 * T3; + T4 = j; + *X = T3 - T46 * T4; + return(R46 * *X); +} + + + +/*****************************************************************/ +/************ F I N D _ M Y _ S E E D ************/ +/************ ************/ +/************ returns parallel random number seq seed ************/ +/*****************************************************************/ + +/* + * Create a random number sequence of total length nn residing + * on np number of processors. Each processor will therefore have a + * subsequence of length nn/np. This routine returns that random + * number which is the first random number for the subsequence belonging + * to processor rank kn, and which is used as seed for proc kn ran # gen. + */ + +double find_my_seed( int kn, /* my processor rank, 0<=kn<=num procs */ + int np, /* np = num procs */ + long nn, /* total num of ran numbers, all procs */ + double s, /* Ran num seed, for ex.: 314159265.00 */ + double a ) /* Ran num gen mult, try 1220703125.00 */ +{ + + long i; + + double t1,t2,t3,an; + long mq,nq,kk,ik; + + + + nq = nn / np; + + for( mq=0; nq>1; mq++,nq/=2 ) + ; + + t1 = a; + + for( i=1; i<=mq; i++ ) + t2 = randlc( &t1, &t1 ); + + an = t1; + + kk = kn; + t1 = s; + t2 = an; + + for( i=1; i<=100; i++ ) + { + ik = kk / 2; + if( 2 * ik != kk ) + t3 = randlc( &t1, &t2 ); + if( ik == 0 ) + break; + t3 = randlc( &t2, &t2 ); + kk = ik; + } + + return( t1 ); + +} + + + + +/*****************************************************************/ +/************* C R E A T E _ S E Q ************/ +/*****************************************************************/ + +void create_seq( global_data* gd, double seed, double a ) +{ + double x; + int i, k; + + k = MAX_KEY/4; + + for (i=0; ikey_array[i] = k*x; + } +} + + + + +/*****************************************************************/ +/************* F U L L _ V E R I F Y ************/ +/*****************************************************************/ + + +void full_verify( global_data* gd ) +{ + MPI_Status status; + MPI_Request request; + + INT_TYPE i, j; + INT_TYPE k, last_local_key; + + +/* Now, finally, sort the keys: */ + for( i=0; itotal_local_keys; i++ ) + gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]- + gd->total_lesser_keys] = gd->key_buff2[i]; + last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1); + +/* Send largest key value to next processor */ + if( gd->my_rank > 0 ) + MPI_Irecv( &k, + 1, + MP_KEY_TYPE, + gd->my_rank-1, + 1000, + MPI_COMM_WORLD, + &request ); + if( gd->my_rank < gd->comm_size-1 ) + MPI_Send( &gd->key_array[last_local_key], + 1, + MP_KEY_TYPE, + gd->my_rank+1, + 1000, + MPI_COMM_WORLD ); + if( gd->my_rank > 0 ) + MPI_Wait( &request, &status ); + +/* Confirm that neighbor's greatest key value + is not greater than my least key value */ + j = 0; + if( gd->my_rank > 0 && gd->total_local_keys > 0 ) + if( k > gd->key_array[0] ) + j++; + + +/* Confirm keys correctly sorted: count incorrectly sorted keys, if any */ + for( i=1; itotal_local_keys; i++ ) + if( gd->key_array[i-1] > gd->key_array[i] ) + j++; + + + if( j != 0 ) + { + printf( "Processor %d: Full_verify: number of keys out of sort: %d\n", + gd->my_rank, j ); + } + else + gd->passed_verification++; + + +} + + + + +/*****************************************************************/ +/************* R A N K ****************/ +/*****************************************************************/ + + +void rank( global_data* gd, int iteration ) +{ + + INT_TYPE i, k; + + INT_TYPE shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2; + INT_TYPE key; + INT_TYPE2 bucket_sum_accumulator, j, m; + INT_TYPE local_bucket_sum_accumulator; + INT_TYPE min_key_val, max_key_val; + INT_TYPE *key_buff_ptr; + + + + +/* Iteration alteration of keys */ + if(gd->my_rank == 0 ) + { + gd->key_array[iteration] = iteration; + gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration; + } + + +/* Initialize */ + for( i=0; ibucket_size[i] = 0; + gd->bucket_size_totals[i] = 0; + gd->process_bucket_distrib_ptr1[i] = 0; + gd->process_bucket_distrib_ptr2[i] = 0; + } + + +/* Determine where the partial verify test keys are, load into */ +/* top of array bucket_size */ + for( i=0; itest_index_array[i]/NUM_KEYS) == gd->my_rank ) + gd->bucket_size[NUM_BUCKETS+i] = + gd->key_array[gd->test_index_array[i] % NUM_KEYS]; + + +/* Determine the number of keys in each bucket */ + for( i=0; ibucket_size[gd->key_array[i] >> shift]++; + + +/* Accumulative bucket sizes are the bucket pointers */ + gd->bucket_ptrs[0] = 0; + for( i=1; i< NUM_BUCKETS; i++ ) + gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1]; + + +/* Sort into appropriate bucket */ + for( i=0; ikey_array[i]; + gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key; + } + +#ifdef TIMING_ENABLED + timer_stop(gd, 2 ); + timer_start(gd, 3 ); +#endif + +/* Get the bucket size totals for the entire problem. These + will be used to determine the redistribution of keys */ + MPI_Allreduce( gd->bucket_size, + gd->bucket_size_totals, + NUM_BUCKETS+TEST_ARRAY_SIZE, + MP_KEY_TYPE, + MPI_SUM, + MPI_COMM_WORLD ); + +#ifdef TIMING_ENABLED + timer_stop(gd, 3 ); + timer_start(gd, 2 ); +#endif + +/* Determine Redistibution of keys: accumulate the bucket size totals + till this number surpasses NUM_KEYS (which the average number of keys + per processor). Then all keys in these buckets go to processor 0. + Continue accumulating again until supassing 2*NUM_KEYS. All keys + in these buckets go to processor 1, etc. This algorithm guarantees + that all processors have work ranking; no processors are left idle. + The optimum number of buckets, however, does not result in as high + a degree of load balancing (as even a distribution of keys as is + possible) as is obtained from increasing the number of buckets, but + more buckets results in more computation per processor so that the + optimum number of buckets turns out to be 1024 for machines tested. + Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket + number of first and last bucket which each processor will have after + the redistribution is done. */ + + bucket_sum_accumulator = 0; + local_bucket_sum_accumulator = 0; + gd->send_displ[0] = 0; + gd->process_bucket_distrib_ptr1[0] = 0; + for( i=0, j=0; ibucket_size_totals[i]; + local_bucket_sum_accumulator += gd->bucket_size[i]; + if( bucket_sum_accumulator >= (j+1)*NUM_KEYS ) + { + gd->send_count[j] = local_bucket_sum_accumulator; + if( j != 0 ) + { + gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1]; + gd->process_bucket_distrib_ptr1[j] = + gd->process_bucket_distrib_ptr2[j-1]+1; + } + gd->process_bucket_distrib_ptr2[j++] = i; + local_bucket_sum_accumulator = 0; + } + } + +/* When NUM_PROCS approaching NUM_BUCKETS, it is highly possible + that the last few processors don't get any buckets. So, we + need to set counts properly in this case to avoid any fallouts. */ + while( j < gd->comm_size ) + { + gd->send_count[j] = 0; + gd->process_bucket_distrib_ptr1[j] = 1; + j++; + } + +#ifdef TIMING_ENABLED + timer_stop(gd, 2 ); + timer_start(gd, 3 ); +#endif + +/* This is the redistribution section: first find out how many keys + each processor will send to every other processor: */ + MPI_Alltoall( gd->send_count, + 1, + MPI_INT, + gd->recv_count, + 1, + MPI_INT, + MPI_COMM_WORLD ); + MPI_Wtime(); + +/* Determine the receive array displacements for the buckets */ + gd->recv_displ[0] = 0; + for( i=1; icomm_size; i++ ) + gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1]; + + + MPI_Wtime(); +/* Now send the keys to respective processors */ + MPI_Alltoallv( gd->key_buff1, + gd->send_count, + gd->send_displ, + MP_KEY_TYPE, + gd->key_buff2, + gd->recv_count, + gd->recv_displ, + MP_KEY_TYPE, + MPI_COMM_WORLD ); + +#ifdef TIMING_ENABLED + timer_stop(gd, 3 ); + timer_start(gd, 2 ); +#endif + +/* The starting and ending bucket numbers on each processor are + multiplied by the interval size of the buckets to obtain the + smallest possible min and greatest possible max value of any + key on each processor */ + min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift; + max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1; + +/* Clear the work array */ + for( i=0; ikey_buff1[i] = 0; + +/* Determine the total number of keys on all other + processors holding keys of lesser value */ + m = 0; + for( k=0; kmy_rank; k++ ) + for( i= gd->process_bucket_distrib_ptr1[k]; + i<=gd->process_bucket_distrib_ptr2[k]; + i++ ) + m += gd->bucket_size_totals[i]; /* m has total # of lesser keys */ + +/* Determine total number of keys on this processor */ + j = 0; + for( i= gd->process_bucket_distrib_ptr1[gd->my_rank]; + i<=gd->process_bucket_distrib_ptr2[gd->my_rank]; + i++ ) + j += gd->bucket_size_totals[i]; /* j has total # of local keys */ + + +/* Ranking of all keys occurs in this section: */ +/* shift it backwards so no subtractions are necessary in loop */ + key_buff_ptr = gd->key_buff1 - min_key_val; + +/* In this section, the keys themselves are used as their + own indexes to determine how many of each there are: their + individual population */ + for( i=0; ikey_buff2[i]]++; /* Now they have individual key */ + /* population */ + +/* To obtain ranks of each key, successively add the individual key + population, not forgetting the total of lesser keys, m. + NOTE: Since the total of lesser keys would be subtracted later + in verification, it is no longer added to the first key population + here, but still needed during the partial verify test. This is to + ensure that 32-bit key_buff can still be used for class D. */ +/* key_buff_ptr[min_key_val] += m; */ + for( i=min_key_val; ibucket_size_totals[i+NUM_BUCKETS]; /* Keys were hidden here */ + if( min_key_val <= k && k <= max_key_val ) + { + /* Add the total of lesser keys, m, here */ + INT_TYPE2 key_rank = key_buff_ptr[k-1] + m; + int failed = 0; + + switch( CLASS ) + { + case 'S': + if( i <= 2 ) + { + if( key_rank != gd->test_rank_array[i]+iteration ) + failed = 1; + else + gd->passed_verification++; + } + else + { + if( key_rank != gd->test_rank_array[i]-iteration ) + failed = 1; + else + gd->passed_verification++; + } + break; + case 'W': + if( i < 2 ) + { + if( key_rank != gd->test_rank_array[i]+(iteration-2) ) + failed = 1; + else + gd->passed_verification++; + } + else + { + if( key_rank != gd->test_rank_array[i]-iteration ) + failed = 1; + else + gd->passed_verification++; + } + break; + case 'A': + if( i <= 2 ) + { + if( key_rank != gd->test_rank_array[i]+(iteration-1) ) + failed = 1; + else + gd->passed_verification++; + } + else + { + if( key_rank != gd->test_rank_array[i]-(iteration-1) ) + failed = 1; + else + gd->passed_verification++; + } + break; + case 'B': + if( i == 1 || i == 2 || i == 4 ) + { + if( key_rank != gd->test_rank_array[i]+iteration ) + failed = 1; + else + gd->passed_verification++; + } + else + { + if( key_rank != gd->test_rank_array[i]-iteration ) + failed = 1; + else + gd->passed_verification++; + } + break; + case 'C': + if( i <= 2 ) + { + if( key_rank != gd->test_rank_array[i]+iteration ) + failed = 1; + else + gd->passed_verification++; + } + else + { + if( key_rank != gd->test_rank_array[i]-iteration ) + failed = 1; + else + gd->passed_verification++; + } + break; + case 'D': + if( i < 2 ) + { + if( key_rank != gd->test_rank_array[i]+iteration ) + failed = 1; + else + gd->passed_verification++; + } + else + { + if( key_rank != gd->test_rank_array[i]-iteration ) + failed = 1; + else + gd->passed_verification++; + } + break; + } + if( failed == 1 ) + printf( "Failed partial verification: " + "iteration %d, processor %d, test key %d\n", + iteration, gd->my_rank, (int)i ); + } + } + + + + +/* Make copies of rank info for use by full_verify: these variables + in rank are local; making them global slows down the code, probably + since they cannot be made register by compiler */ + + if( iteration == MAX_ITERATIONS ) + { + gd->key_buff_ptr_global = key_buff_ptr; + gd->total_local_keys = j; + gd->total_lesser_keys = 0; /* no longer set to 'm', see note above */ + } + +} + + +/*****************************************************************/ +/************* M A I N ****************/ +/*****************************************************************/ + +int main( int argc, char **argv ) +{ + + int i, iteration, itemp; + + double timecounter, maxtime; + + global_data* gd = malloc(sizeof(global_data)); +/* Initialize MPI */ + MPI_Init( &argc, &argv ); + MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank ); + MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size ); + +/* Initialize the verification arrays if a valid class */ + for( i=0; itest_index_array[i] = S_test_index_array[i]; + gd->test_rank_array[i] = S_test_rank_array[i]; + break; + case 'A': + gd->test_index_array[i] = A_test_index_array[i]; + gd->test_rank_array[i] = A_test_rank_array[i]; + break; + case 'W': + gd->test_index_array[i] = W_test_index_array[i]; + gd->test_rank_array[i] = W_test_rank_array[i]; + break; + case 'B': + gd->test_index_array[i] = B_test_index_array[i]; + gd->test_rank_array[i] = B_test_rank_array[i]; + break; + case 'C': + gd->test_index_array[i] = C_test_index_array[i]; + gd->test_rank_array[i] = C_test_rank_array[i]; + break; + case 'D': + gd->test_index_array[i] = D_test_index_array[i]; + gd->test_rank_array[i] = D_test_rank_array[i]; + break; + }; + + + +/* Printout initial NPB info */ + if( gd->my_rank == 0 ) + { + printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" ); + printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS ); + printf( " Iterations: %d\n", MAX_ITERATIONS ); + printf( " Number of processes: %d\n",gd->comm_size ); + } + +/* Check that actual and compiled number of processors agree */ + if( gd->comm_size != NUM_PROCS ) + { + if( gd->my_rank == 0 ) + printf( "\n ERROR: compiled for %d processes\n" + " Number of active processes: %d\n" + " Exiting program!\n\n", NUM_PROCS, gd->comm_size ); + MPI_Finalize(); + exit( 1 ); + } + +/* Check to see whether total number of processes is within bounds. + This could in principle be checked in setparams.c, but it is more + convenient to do it here */ + if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS) + { + if( gd->my_rank == 0 ) + printf( "\n ERROR: number of processes %d not within range %d-%d" + "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS); + MPI_Finalize(); + exit( 1 ); + } + + +/* Generate random number sequence and subsequent keys on all procs */ + create_seq(gd, find_my_seed( gd->my_rank, + gd->comm_size, + 4*(long)TOTAL_KEYS*MIN_PROCS, + 314159265.00, /* Random number gen seed */ + 1220703125.00 ), /* Random number gen mult */ + 1220703125.00 ); /* Random number gen mult */ + +/* Do one interation for free (i.e., untimed) to guarantee initialization of + all data and code pages and respective tables */ + rank(gd, 1 ); + +/* Start verification counter */ + gd->passed_verification = 0; + + if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n iteration\n" ); + +/* Initialize timer */ + timer_clear(gd, 0 ); + +/* Initialize separate communication, computation timing */ +#ifdef TIMING_ENABLED + for( i=1; i<=3; i++ ) timer_clear(gd, i ); +#endif + +/* Start timer */ + timer_start(gd, 0 ); + +#ifdef TIMING_ENABLED + timer_start(gd, 1 ); + timer_start(gd, 2 ); +#endif + +/* This is the main iteration */ + for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) + { + if( gd->my_rank == 0 && CLASS != 'S' ) printf( " %d\n", iteration ); + rank(gd, iteration ); + } + + +#ifdef TIMING_ENABLED + timer_stop(gd, 2 ); + timer_stop(gd, 1 ); +#endif + +/* Stop timer, obtain time for processors */ + timer_stop(gd, 0 ); + + timecounter = timer_read(gd, 0 ); + +/* End of timing, obtain maximum time of all processors */ + MPI_Reduce( &timecounter, + &maxtime, + 1, + MPI_DOUBLE, + MPI_MAX, + 0, + MPI_COMM_WORLD ); + +#ifdef TIMING_ENABLED + { + double tmin, tsum, tmax; + + if( my_rank == 0 ) + { + printf( "\ntimer 1/2/3 = total/computation/communication time\n"); + printf( " min avg max\n" ); + } + for( i=1; i<=3; i++ ) + { + timecounter = timer_read(gd, i ); + MPI_Reduce( &timecounter, + &tmin, + 1, + MPI_DOUBLE, + MPI_MIN, + 0, + MPI_COMM_WORLD ); + MPI_Reduce( &timecounter, + &tsum, + 1, + MPI_DOUBLE, + MPI_SUM, + 0, + MPI_COMM_WORLD ); + MPI_Reduce( &timecounter, + &tmax, + 1, + MPI_DOUBLE, + MPI_MAX, + 0, + MPI_COMM_WORLD ); + if( my_rank == 0 ) + printf( "timer %d: %f %f %f\n", + i, tmin, tsum/((double) comm_size), tmax ); + } + if( my_rank == 0 ) + printf( "\n" ); + } +#endif + +/* This tests that keys are in sequence: sorting of last ranked key seq + occurs here, but is an untimed operation */ + full_verify(gd); + + +/* Obtain verification counter sum */ + itemp =gd->passed_verification; + MPI_Reduce( &itemp, + &gd->passed_verification, + 1, + MPI_INT, + MPI_SUM, + 0, + MPI_COMM_WORLD ); + + + +/* The final printout */ + if( gd->my_rank == 0 ) + { + if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size ) + gd->passed_verification = 0; + c_print_results( "IS", + CLASS, + (int)(TOTAL_KEYS), + MIN_PROCS, + 0, + MAX_ITERATIONS, + NUM_PROCS, + gd->comm_size, + maxtime, + ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS) + /maxtime/1000000., + "keys ranked", + gd->passed_verification, + NPBVERSION, + COMPILETIME, + MPICC, + CLINK, + CMPI_LIB, + CMPI_INC, + CFLAGS, + CLINKFLAGS ); + } + + MPI_Finalize(); + free(gd); + + return 0; + /**************************/ +} /* E N D P R O G R A M */ + /**************************/ diff --git a/examples/smpi/NAS/LU/Makefile b/examples/smpi/NAS/LU/Makefile new file mode 100644 index 0000000000..a05c94dc4a --- /dev/null +++ b/examples/smpi/NAS/LU/Makefile @@ -0,0 +1,74 @@ +SHELL=/bin/sh +BENCHMARK=lu +BENCHMARKU=LU +VEC= + +include ../config/make.def + +OBJS = lu.o init_comm.o read_input.o bcast_inputs.o proc_grid.o neighbors.o \ + nodedim.o subdomain.o setcoeff.o sethyper.o setbv.o exact.o setiv.o \ + erhs.o ssor.o exchange_1.o exchange_3.o exchange_4.o exchange_5.o \ + exchange_6.o rhs.o l2norm.o jacld.o blts$(VEC).o jacu.o buts$(VEC).o \ + error.o pintgr.o verify.o ${COMMON}/print_results.o ${COMMON}/timers.o + +include ../sys/make.common + + +# npbparams.h is included by applu.incl +# The following rule should do the trick but many make programs (not gmake) +# will do the wrong thing and rebuild the world every time (because the +# mod time on header.h is not changed. One solution would be to +# touch header.h but this might cause confusion if someone has +# accidentally deleted it. Instead, make the dependency on npbparams.h +# explicit in all the lines below (even though dependence is indirect). + +# applu.incl: npbparams.h + +${PROGRAM}: config + @if [ x$(VERSION) = xvec ] ; then \ + ${MAKE} VEC=_vec exec; \ + elif [ x$(VERSION) = xVEC ] ; then \ + ${MAKE} VEC=_vec exec; \ + else \ + ${MAKE} exec; \ + fi + +exec: $(OBJS) + ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB} + +.f.o : + ${FCOMPILE} $< + +lu.o: lu.f applu.incl npbparams.h +bcast_inputs.o: bcast_inputs.f applu.incl npbparams.h mpinpb.h +blts$(VEC).o: blts$(VEC).f +buts$(VEC).o: buts$(VEC).f +erhs.o: erhs.f applu.incl npbparams.h +error.o: error.f applu.incl npbparams.h mpinpb.h +exact.o: exact.f applu.incl npbparams.h +exchange_1.o: exchange_1.f applu.incl npbparams.h mpinpb.h +exchange_3.o: exchange_3.f applu.incl npbparams.h mpinpb.h +exchange_4.o: exchange_4.f applu.incl npbparams.h mpinpb.h +exchange_5.o: exchange_5.f applu.incl npbparams.h mpinpb.h +exchange_6.o: exchange_6.f applu.incl npbparams.h mpinpb.h +init_comm.o: init_comm.f applu.incl npbparams.h mpinpb.h +jacld.o: jacld.f applu.incl npbparams.h +jacu.o: jacu.f applu.incl npbparams.h +l2norm.o: l2norm.f mpinpb.h +neighbors.o: neighbors.f applu.incl npbparams.h +nodedim.o: nodedim.f +pintgr.o: pintgr.f applu.incl npbparams.h mpinpb.h +proc_grid.o: proc_grid.f applu.incl npbparams.h +read_input.o: read_input.f applu.incl npbparams.h mpinpb.h +rhs.o: rhs.f applu.incl npbparams.h +setbv.o: setbv.f applu.incl npbparams.h +setiv.o: setiv.f applu.incl npbparams.h +setcoeff.o: setcoeff.f applu.incl npbparams.h +sethyper.o: sethyper.f applu.incl npbparams.h +ssor.o: ssor.f applu.incl npbparams.h mpinpb.h +subdomain.o: subdomain.f applu.incl npbparams.h mpinpb.h +verify.o: verify.f applu.incl npbparams.h + +clean: + - /bin/rm -f npbparams.h + - /bin/rm -f *.o *~ diff --git a/examples/smpi/NAS/LU/applu.incl b/examples/smpi/NAS/LU/applu.incl new file mode 100644 index 0000000000..413fc834e3 --- /dev/null +++ b/examples/smpi/NAS/LU/applu.incl @@ -0,0 +1,153 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- +c--- applu.incl +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c npbparams.h defines parameters that depend on the class and +c number of nodes +c--------------------------------------------------------------------- + + include 'npbparams.h' + +c--------------------------------------------------------------------- +c parameters which can be overridden in runtime config file +c (in addition to size of problem - isiz01,02,03 give the maximum size) +c ipr = 1 to print out verbose information +c omega = 2.0 is correct for all classes +c tolrsd is tolerance levels for steady state residuals +c--------------------------------------------------------------------- + integer ipr_default + parameter (ipr_default = 1) + double precision omega_default + parameter (omega_default = 1.2d0) + double precision tolrsd1_def, tolrsd2_def, tolrsd3_def, + > tolrsd4_def, tolrsd5_def + parameter (tolrsd1_def=1.0e-08, + > tolrsd2_def=1.0e-08, tolrsd3_def=1.0e-08, + > tolrsd4_def=1.0e-08, tolrsd5_def=1.0e-08) + + double precision c1, c2, c3, c4, c5 + parameter( c1 = 1.40d+00, c2 = 0.40d+00, + > c3 = 1.00d-01, c4 = 1.00d+00, + > c5 = 1.40d+00 ) + +c--------------------------------------------------------------------- +c grid +c--------------------------------------------------------------------- + integer nx, ny, nz + integer nx0, ny0, nz0 + integer ipt, ist, iend + integer jpt, jst, jend + integer ii1, ii2 + integer ji1, ji2 + integer ki1, ki2 + double precision dxi, deta, dzeta + double precision tx1, tx2, tx3 + double precision ty1, ty2, ty3 + double precision tz1, tz2, tz3 + + common/cgcon/ dxi, deta, dzeta, + > tx1, tx2, tx3, + > ty1, ty2, ty3, + > tz1, tz2, tz3, + > nx, ny, nz, + > nx0, ny0, nz0, + > ipt, ist, iend, + > jpt, jst, jend, + > ii1, ii2, + > ji1, ji2, + > ki1, ki2 + +c--------------------------------------------------------------------- +c dissipation +c--------------------------------------------------------------------- + double precision dx1, dx2, dx3, dx4, dx5 + double precision dy1, dy2, dy3, dy4, dy5 + double precision dz1, dz2, dz3, dz4, dz5 + double precision dssp + + common/disp/ dx1,dx2,dx3,dx4,dx5, + > dy1,dy2,dy3,dy4,dy5, + > dz1,dz2,dz3,dz4,dz5, + > dssp + +c--------------------------------------------------------------------- +c field variables and residuals +c--------------------------------------------------------------------- + double precision u(5,-1:isiz1+2,-1:isiz2+2,isiz3), + > rsd(5,-1:isiz1+2,-1:isiz2+2,isiz3), + > frct(5,-1:isiz1+2,-1:isiz2+2,isiz3), + > flux(5,0:isiz1+1,0:isiz2+1,isiz3) + + common/cvar/ u, + > rsd, + > frct, + > flux + + +c--------------------------------------------------------------------- +c output control parameters +c--------------------------------------------------------------------- + integer ipr, inorm + + common/cprcon/ ipr, inorm + +c--------------------------------------------------------------------- +c newton-raphson iteration control parameters +c--------------------------------------------------------------------- + integer itmax, invert + double precision dt, omega, tolrsd(5), + > rsdnm(5), errnm(5), frc, ttotal + + common/ctscon/ dt, omega, tolrsd, + > rsdnm, errnm, frc, ttotal, + > itmax, invert + + double precision a(5,5,isiz1,isiz2), + > b(5,5,isiz1,isiz2), + > c(5,5,isiz1,isiz2), + > d(5,5,isiz1,isiz2) + + common/cjac/ a, b, c, d + +c--------------------------------------------------------------------- +c coefficients of the exact solution +c--------------------------------------------------------------------- + double precision ce(5,13) + + common/cexact/ ce + +c--------------------------------------------------------------------- +c multi-processor common blocks +c--------------------------------------------------------------------- + integer id, ndim, num, xdim, ydim, row, col + common/dim/ id,ndim,num,xdim,ydim,row,col + + integer north,south,east,west + common/neigh/ north,south,east, west + + integer from_s,from_n,from_e,from_w + parameter (from_s=1,from_n=2,from_e=3,from_w=4) + + integer npmax + parameter (npmax=isiz01+isiz02) + + logical icommn(npmax+1),icomms(npmax+1), + > icomme(npmax+1),icommw(npmax+1) + double precision buf(5,2*isiz2*isiz3), + > buf1(5,2*isiz2*isiz3) + + common/comm/ buf, buf1, + > icommn,icomms, + > icomme,icommw + + double precision maxtime + common/timer/maxtime + + +c--------------------------------------------------------------------- +c end of include file +c--------------------------------------------------------------------- diff --git a/examples/smpi/NAS/LU/bcast_inputs.f b/examples/smpi/NAS/LU/bcast_inputs.f new file mode 100644 index 0000000000..c606724bec --- /dev/null +++ b/examples/smpi/NAS/LU/bcast_inputs.f @@ -0,0 +1,41 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine bcast_inputs + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + include 'applu.incl' + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer ierr + +c--------------------------------------------------------------------- +c root broadcasts the data +c The data isn't contiguous or of the same type, so it's not +c clear how to send it in the "MPI" way. +c We could pack the info into a buffer or we could create +c an obscene datatype to handle it all at once. Since we only +c broadcast the data once, just use a separate broadcast for +c each piece. +c--------------------------------------------------------------------- + call MPI_BCAST(ipr, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr) + call MPI_BCAST(inorm, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr) + call MPI_BCAST(itmax, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr) + call MPI_BCAST(dt, 1, dp_type, root, MPI_COMM_WORLD, ierr) + call MPI_BCAST(omega, 1, dp_type, root, MPI_COMM_WORLD, ierr) + call MPI_BCAST(tolrsd, 5, dp_type, root, MPI_COMM_WORLD, ierr) + call MPI_BCAST(nx0, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr) + call MPI_BCAST(ny0, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr) + call MPI_BCAST(nz0, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr) + + return + end + + diff --git a/examples/smpi/NAS/LU/blts.f b/examples/smpi/NAS/LU/blts.f new file mode 100644 index 0000000000..9861261b03 --- /dev/null +++ b/examples/smpi/NAS/LU/blts.f @@ -0,0 +1,261 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine blts ( ldmx, ldmy, ldmz, + > nx, ny, nz, k, + > omega, + > v, + > ldz, ldy, ldx, d, + > ist, iend, jst, jend, + > nx0, ny0, ipt, jpt) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c compute the regular-sparse, block lower triangular solution: +c +c v <-- ( L-inv ) * v +c +c--------------------------------------------------------------------- + + implicit none + +c--------------------------------------------------------------------- +c input parameters +c--------------------------------------------------------------------- + integer ldmx, ldmy, ldmz + integer nx, ny, nz + integer k + double precision omega + double precision v( 5, -1:ldmx+2, -1:ldmy+2, *), + > ldz( 5, 5, ldmx, ldmy), + > ldy( 5, 5, ldmx, ldmy), + > ldx( 5, 5, ldmx, ldmy), + > d( 5, 5, ldmx, ldmy) + integer ist, iend + integer jst, jend + integer nx0, ny0 + integer ipt, jpt + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer i, j, m + integer iex + double precision tmp, tmp1 + double precision tmat(5,5) + + +c--------------------------------------------------------------------- +c receive data from north and west +c--------------------------------------------------------------------- + iex = 0 + call exchange_1( v,k,iex ) + + + do j = jst, jend + do i = ist, iend + do m = 1, 5 + + v( m, i, j, k ) = v( m, i, j, k ) + > - omega * ( ldz( m, 1, i, j ) * v( 1, i, j, k-1 ) + > + ldz( m, 2, i, j ) * v( 2, i, j, k-1 ) + > + ldz( m, 3, i, j ) * v( 3, i, j, k-1 ) + > + ldz( m, 4, i, j ) * v( 4, i, j, k-1 ) + > + ldz( m, 5, i, j ) * v( 5, i, j, k-1 ) ) + + end do + end do + end do + + + do j=jst,jend + do i = ist, iend + + do m = 1, 5 + + v( m, i, j, k ) = v( m, i, j, k ) + > - omega * ( ldy( m, 1, i, j ) * v( 1, i, j-1, k ) + > + ldx( m, 1, i, j ) * v( 1, i-1, j, k ) + > + ldy( m, 2, i, j ) * v( 2, i, j-1, k ) + > + ldx( m, 2, i, j ) * v( 2, i-1, j, k ) + > + ldy( m, 3, i, j ) * v( 3, i, j-1, k ) + > + ldx( m, 3, i, j ) * v( 3, i-1, j, k ) + > + ldy( m, 4, i, j ) * v( 4, i, j-1, k ) + > + ldx( m, 4, i, j ) * v( 4, i-1, j, k ) + > + ldy( m, 5, i, j ) * v( 5, i, j-1, k ) + > + ldx( m, 5, i, j ) * v( 5, i-1, j, k ) ) + + end do + +c--------------------------------------------------------------------- +c diagonal block inversion +c +c forward elimination +c--------------------------------------------------------------------- + do m = 1, 5 + tmat( m, 1 ) = d( m, 1, i, j ) + tmat( m, 2 ) = d( m, 2, i, j ) + tmat( m, 3 ) = d( m, 3, i, j ) + tmat( m, 4 ) = d( m, 4, i, j ) + tmat( m, 5 ) = d( m, 5, i, j ) + end do + + tmp1 = 1.0d+00 / tmat( 1, 1 ) + tmp = tmp1 * tmat( 2, 1 ) + tmat( 2, 2 ) = tmat( 2, 2 ) + > - tmp * tmat( 1, 2 ) + tmat( 2, 3 ) = tmat( 2, 3 ) + > - tmp * tmat( 1, 3 ) + tmat( 2, 4 ) = tmat( 2, 4 ) + > - tmp * tmat( 1, 4 ) + tmat( 2, 5 ) = tmat( 2, 5 ) + > - tmp * tmat( 1, 5 ) + v( 2, i, j, k ) = v( 2, i, j, k ) + > - v( 1, i, j, k ) * tmp + + tmp = tmp1 * tmat( 3, 1 ) + tmat( 3, 2 ) = tmat( 3, 2 ) + > - tmp * tmat( 1, 2 ) + tmat( 3, 3 ) = tmat( 3, 3 ) + > - tmp * tmat( 1, 3 ) + tmat( 3, 4 ) = tmat( 3, 4 ) + > - tmp * tmat( 1, 4 ) + tmat( 3, 5 ) = tmat( 3, 5 ) + > - tmp * tmat( 1, 5 ) + v( 3, i, j, k ) = v( 3, i, j, k ) + > - v( 1, i, j, k ) * tmp + + tmp = tmp1 * tmat( 4, 1 ) + tmat( 4, 2 ) = tmat( 4, 2 ) + > - tmp * tmat( 1, 2 ) + tmat( 4, 3 ) = tmat( 4, 3 ) + > - tmp * tmat( 1, 3 ) + tmat( 4, 4 ) = tmat( 4, 4 ) + > - tmp * tmat( 1, 4 ) + tmat( 4, 5 ) = tmat( 4, 5 ) + > - tmp * tmat( 1, 5 ) + v( 4, i, j, k ) = v( 4, i, j, k ) + > - v( 1, i, j, k ) * tmp + + tmp = tmp1 * tmat( 5, 1 ) + tmat( 5, 2 ) = tmat( 5, 2 ) + > - tmp * tmat( 1, 2 ) + tmat( 5, 3 ) = tmat( 5, 3 ) + > - tmp * tmat( 1, 3 ) + tmat( 5, 4 ) = tmat( 5, 4 ) + > - tmp * tmat( 1, 4 ) + tmat( 5, 5 ) = tmat( 5, 5 ) + > - tmp * tmat( 1, 5 ) + v( 5, i, j, k ) = v( 5, i, j, k ) + > - v( 1, i, j, k ) * tmp + + + + tmp1 = 1.0d+00 / tmat( 2, 2 ) + tmp = tmp1 * tmat( 3, 2 ) + tmat( 3, 3 ) = tmat( 3, 3 ) + > - tmp * tmat( 2, 3 ) + tmat( 3, 4 ) = tmat( 3, 4 ) + > - tmp * tmat( 2, 4 ) + tmat( 3, 5 ) = tmat( 3, 5 ) + > - tmp * tmat( 2, 5 ) + v( 3, i, j, k ) = v( 3, i, j, k ) + > - v( 2, i, j, k ) * tmp + + tmp = tmp1 * tmat( 4, 2 ) + tmat( 4, 3 ) = tmat( 4, 3 ) + > - tmp * tmat( 2, 3 ) + tmat( 4, 4 ) = tmat( 4, 4 ) + > - tmp * tmat( 2, 4 ) + tmat( 4, 5 ) = tmat( 4, 5 ) + > - tmp * tmat( 2, 5 ) + v( 4, i, j, k ) = v( 4, i, j, k ) + > - v( 2, i, j, k ) * tmp + + tmp = tmp1 * tmat( 5, 2 ) + tmat( 5, 3 ) = tmat( 5, 3 ) + > - tmp * tmat( 2, 3 ) + tmat( 5, 4 ) = tmat( 5, 4 ) + > - tmp * tmat( 2, 4 ) + tmat( 5, 5 ) = tmat( 5, 5 ) + > - tmp * tmat( 2, 5 ) + v( 5, i, j, k ) = v( 5, i, j, k ) + > - v( 2, i, j, k ) * tmp + + + + tmp1 = 1.0d+00 / tmat( 3, 3 ) + tmp = tmp1 * tmat( 4, 3 ) + tmat( 4, 4 ) = tmat( 4, 4 ) + > - tmp * tmat( 3, 4 ) + tmat( 4, 5 ) = tmat( 4, 5 ) + > - tmp * tmat( 3, 5 ) + v( 4, i, j, k ) = v( 4, i, j, k ) + > - v( 3, i, j, k ) * tmp + + tmp = tmp1 * tmat( 5, 3 ) + tmat( 5, 4 ) = tmat( 5, 4 ) + > - tmp * tmat( 3, 4 ) + tmat( 5, 5 ) = tmat( 5, 5 ) + > - tmp * tmat( 3, 5 ) + v( 5, i, j, k ) = v( 5, i, j, k ) + > - v( 3, i, j, k ) * tmp + + + + tmp1 = 1.0d+00 / tmat( 4, 4 ) + tmp = tmp1 * tmat( 5, 4 ) + tmat( 5, 5 ) = tmat( 5, 5 ) + > - tmp * tmat( 4, 5 ) + v( 5, i, j, k ) = v( 5, i, j, k ) + > - v( 4, i, j, k ) * tmp + +c--------------------------------------------------------------------- +c back substitution +c--------------------------------------------------------------------- + v( 5, i, j, k ) = v( 5, i, j, k ) + > / tmat( 5, 5 ) + + v( 4, i, j, k ) = v( 4, i, j, k ) + > - tmat( 4, 5 ) * v( 5, i, j, k ) + v( 4, i, j, k ) = v( 4, i, j, k ) + > / tmat( 4, 4 ) + + v( 3, i, j, k ) = v( 3, i, j, k ) + > - tmat( 3, 4 ) * v( 4, i, j, k ) + > - tmat( 3, 5 ) * v( 5, i, j, k ) + v( 3, i, j, k ) = v( 3, i, j, k ) + > / tmat( 3, 3 ) + + v( 2, i, j, k ) = v( 2, i, j, k ) + > - tmat( 2, 3 ) * v( 3, i, j, k ) + > - tmat( 2, 4 ) * v( 4, i, j, k ) + > - tmat( 2, 5 ) * v( 5, i, j, k ) + v( 2, i, j, k ) = v( 2, i, j, k ) + > / tmat( 2, 2 ) + + v( 1, i, j, k ) = v( 1, i, j, k ) + > - tmat( 1, 2 ) * v( 2, i, j, k ) + > - tmat( 1, 3 ) * v( 3, i, j, k ) + > - tmat( 1, 4 ) * v( 4, i, j, k ) + > - tmat( 1, 5 ) * v( 5, i, j, k ) + v( 1, i, j, k ) = v( 1, i, j, k ) + > / tmat( 1, 1 ) + + + enddo + enddo + +c--------------------------------------------------------------------- +c send data to east and south +c--------------------------------------------------------------------- + iex = 2 + call exchange_1( v,k,iex ) + + return + end + + diff --git a/examples/smpi/NAS/LU/blts_vec.f b/examples/smpi/NAS/LU/blts_vec.f new file mode 100644 index 0000000000..f90ea84560 --- /dev/null +++ b/examples/smpi/NAS/LU/blts_vec.f @@ -0,0 +1,334 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine blts ( ldmx, ldmy, ldmz, + > nx, ny, nz, k, + > omega, + > v, + > ldz, ldy, ldx, d, + > ist, iend, jst, jend, + > nx0, ny0, ipt, jpt) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c compute the regular-sparse, block lower triangular solution: +c +c v <-- ( L-inv ) * v +c +c--------------------------------------------------------------------- + + implicit none + +c--------------------------------------------------------------------- +c input parameters +c--------------------------------------------------------------------- + integer ldmx, ldmy, ldmz + integer nx, ny, nz + integer k + double precision omega + double precision v( 5, -1:ldmx+2, -1:ldmy+2, *), + > ldz( 5, 5, ldmx, ldmy), + > ldy( 5, 5, ldmx, ldmy), + > ldx( 5, 5, ldmx, ldmy), + > d( 5, 5, ldmx, ldmy) + integer ist, iend + integer jst, jend + integer nx0, ny0 + integer ipt, jpt + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer i, j, m, l, istp, iendp + integer iex + double precision tmp, tmp1 + double precision tmat(5,5) + + +c--------------------------------------------------------------------- +c receive data from north and west +c--------------------------------------------------------------------- + iex = 0 + call exchange_1( v,k,iex ) + + + do j = jst, jend + do i = ist, iend + do m = 1, 5 + + v( m, i, j, k ) = v( m, i, j, k ) + > - omega * ( ldz( m, 1, i, j ) * v( 1, i, j, k-1 ) + > + ldz( m, 2, i, j ) * v( 2, i, j, k-1 ) + > + ldz( m, 3, i, j ) * v( 3, i, j, k-1 ) + > + ldz( m, 4, i, j ) * v( 4, i, j, k-1 ) + > + ldz( m, 5, i, j ) * v( 5, i, j, k-1 ) ) + + end do + end do + end do + + + do l = ist+jst, iend+jend + istp = max(l - jend, ist) + iendp = min(l - jst, iend) + +!dir$ ivdep + do i = istp, iendp + j = l - i + +!!dir$ unroll 5 +! manually unroll the loop +! do m = 1, 5 + + v( 1, i, j, k ) = v( 1, i, j, k ) + > - omega * ( ldy( 1, 1, i, j ) * v( 1, i, j-1, k ) + > + ldx( 1, 1, i, j ) * v( 1, i-1, j, k ) + > + ldy( 1, 2, i, j ) * v( 2, i, j-1, k ) + > + ldx( 1, 2, i, j ) * v( 2, i-1, j, k ) + > + ldy( 1, 3, i, j ) * v( 3, i, j-1, k ) + > + ldx( 1, 3, i, j ) * v( 3, i-1, j, k ) + > + ldy( 1, 4, i, j ) * v( 4, i, j-1, k ) + > + ldx( 1, 4, i, j ) * v( 4, i-1, j, k ) + > + ldy( 1, 5, i, j ) * v( 5, i, j-1, k ) + > + ldx( 1, 5, i, j ) * v( 5, i-1, j, k ) ) + v( 2, i, j, k ) = v( 2, i, j, k ) + > - omega * ( ldy( 2, 1, i, j ) * v( 1, i, j-1, k ) + > + ldx( 2, 1, i, j ) * v( 1, i-1, j, k ) + > + ldy( 2, 2, i, j ) * v( 2, i, j-1, k ) + > + ldx( 2, 2, i, j ) * v( 2, i-1, j, k ) + > + ldy( 2, 3, i, j ) * v( 3, i, j-1, k ) + > + ldx( 2, 3, i, j ) * v( 3, i-1, j, k ) + > + ldy( 2, 4, i, j ) * v( 4, i, j-1, k ) + > + ldx( 2, 4, i, j ) * v( 4, i-1, j, k ) + > + ldy( 2, 5, i, j ) * v( 5, i, j-1, k ) + > + ldx( 2, 5, i, j ) * v( 5, i-1, j, k ) ) + v( 3, i, j, k ) = v( 3, i, j, k ) + > - omega * ( ldy( 3, 1, i, j ) * v( 1, i, j-1, k ) + > + ldx( 3, 1, i, j ) * v( 1, i-1, j, k ) + > + ldy( 3, 2, i, j ) * v( 2, i, j-1, k ) + > + ldx( 3, 2, i, j ) * v( 2, i-1, j, k ) + > + ldy( 3, 3, i, j ) * v( 3, i, j-1, k ) + > + ldx( 3, 3, i, j ) * v( 3, i-1, j, k ) + > + ldy( 3, 4, i, j ) * v( 4, i, j-1, k ) + > + ldx( 3, 4, i, j ) * v( 4, i-1, j, k ) + > + ldy( 3, 5, i, j ) * v( 5, i, j-1, k ) + > + ldx( 3, 5, i, j ) * v( 5, i-1, j, k ) ) + v( 4, i, j, k ) = v( 4, i, j, k ) + > - omega * ( ldy( 4, 1, i, j ) * v( 1, i, j-1, k ) + > + ldx( 4, 1, i, j ) * v( 1, i-1, j, k ) + > + ldy( 4, 2, i, j ) * v( 2, i, j-1, k ) + > + ldx( 4, 2, i, j ) * v( 2, i-1, j, k ) + > + ldy( 4, 3, i, j ) * v( 3, i, j-1, k ) + > + ldx( 4, 3, i, j ) * v( 3, i-1, j, k ) + > + ldy( 4, 4, i, j ) * v( 4, i, j-1, k ) + > + ldx( 4, 4, i, j ) * v( 4, i-1, j, k ) + > + ldy( 4, 5, i, j ) * v( 5, i, j-1, k ) + > + ldx( 4, 5, i, j ) * v( 5, i-1, j, k ) ) + v( 5, i, j, k ) = v( 5, i, j, k ) + > - omega * ( ldy( 5, 1, i, j ) * v( 1, i, j-1, k ) + > + ldx( 5, 1, i, j ) * v( 1, i-1, j, k ) + > + ldy( 5, 2, i, j ) * v( 2, i, j-1, k ) + > + ldx( 5, 2, i, j ) * v( 2, i-1, j, k ) + > + ldy( 5, 3, i, j ) * v( 3, i, j-1, k ) + > + ldx( 5, 3, i, j ) * v( 3, i-1, j, k ) + > + ldy( 5, 4, i, j ) * v( 4, i, j-1, k ) + > + ldx( 5, 4, i, j ) * v( 4, i-1, j, k ) + > + ldy( 5, 5, i, j ) * v( 5, i, j-1, k ) + > + ldx( 5, 5, i, j ) * v( 5, i-1, j, k ) ) + +! end do + +c--------------------------------------------------------------------- +c diagonal block inversion +c +c forward elimination +c--------------------------------------------------------------------- +!!dir$ unroll 5 +! manually unroll the loop +! do m = 1, 5 + tmat( 1, 1 ) = d( 1, 1, i, j ) + tmat( 1, 2 ) = d( 1, 2, i, j ) + tmat( 1, 3 ) = d( 1, 3, i, j ) + tmat( 1, 4 ) = d( 1, 4, i, j ) + tmat( 1, 5 ) = d( 1, 5, i, j ) + tmat( 2, 1 ) = d( 2, 1, i, j ) + tmat( 2, 2 ) = d( 2, 2, i, j ) + tmat( 2, 3 ) = d( 2, 3, i, j ) + tmat( 2, 4 ) = d( 2, 4, i, j ) + tmat( 2, 5 ) = d( 2, 5, i, j ) + tmat( 3, 1 ) = d( 3, 1, i, j ) + tmat( 3, 2 ) = d( 3, 2, i, j ) + tmat( 3, 3 ) = d( 3, 3, i, j ) + tmat( 3, 4 ) = d( 3, 4, i, j ) + tmat( 3, 5 ) = d( 3, 5, i, j ) + tmat( 4, 1 ) = d( 4, 1, i, j ) + tmat( 4, 2 ) = d( 4, 2, i, j ) + tmat( 4, 3 ) = d( 4, 3, i, j ) + tmat( 4, 4 ) = d( 4, 4, i, j ) + tmat( 4, 5 ) = d( 4, 5, i, j ) + tmat( 5, 1 ) = d( 5, 1, i, j ) + tmat( 5, 2 ) = d( 5, 2, i, j ) + tmat( 5, 3 ) = d( 5, 3, i, j ) + tmat( 5, 4 ) = d( 5, 4, i, j ) + tmat( 5, 5 ) = d( 5, 5, i, j ) +! end do + + tmp1 = 1.0d+00 / tmat( 1, 1 ) + tmp = tmp1 * tmat( 2, 1 ) + tmat( 2, 2 ) = tmat( 2, 2 ) + > - tmp * tmat( 1, 2 ) + tmat( 2, 3 ) = tmat( 2, 3 ) + > - tmp * tmat( 1, 3 ) + tmat( 2, 4 ) = tmat( 2, 4 ) + > - tmp * tmat( 1, 4 ) + tmat( 2, 5 ) = tmat( 2, 5 ) + > - tmp * tmat( 1, 5 ) + v( 2, i, j, k ) = v( 2, i, j, k ) + > - v( 1, i, j, k ) * tmp + + tmp = tmp1 * tmat( 3, 1 ) + tmat( 3, 2 ) = tmat( 3, 2 ) + > - tmp * tmat( 1, 2 ) + tmat( 3, 3 ) = tmat( 3, 3 ) + > - tmp * tmat( 1, 3 ) + tmat( 3, 4 ) = tmat( 3, 4 ) + > - tmp * tmat( 1, 4 ) + tmat( 3, 5 ) = tmat( 3, 5 ) + > - tmp * tmat( 1, 5 ) + v( 3, i, j, k ) = v( 3, i, j, k ) + > - v( 1, i, j, k ) * tmp + + tmp = tmp1 * tmat( 4, 1 ) + tmat( 4, 2 ) = tmat( 4, 2 ) + > - tmp * tmat( 1, 2 ) + tmat( 4, 3 ) = tmat( 4, 3 ) + > - tmp * tmat( 1, 3 ) + tmat( 4, 4 ) = tmat( 4, 4 ) + > - tmp * tmat( 1, 4 ) + tmat( 4, 5 ) = tmat( 4, 5 ) + > - tmp * tmat( 1, 5 ) + v( 4, i, j, k ) = v( 4, i, j, k ) + > - v( 1, i, j, k ) * tmp + + tmp = tmp1 * tmat( 5, 1 ) + tmat( 5, 2 ) = tmat( 5, 2 ) + > - tmp * tmat( 1, 2 ) + tmat( 5, 3 ) = tmat( 5, 3 ) + > - tmp * tmat( 1, 3 ) + tmat( 5, 4 ) = tmat( 5, 4 ) + > - tmp * tmat( 1, 4 ) + tmat( 5, 5 ) = tmat( 5, 5 ) + > - tmp * tmat( 1, 5 ) + v( 5, i, j, k ) = v( 5, i, j, k ) + > - v( 1, i, j, k ) * tmp + + + + tmp1 = 1.0d+00 / tmat( 2, 2 ) + tmp = tmp1 * tmat( 3, 2 ) + tmat( 3, 3 ) = tmat( 3, 3 ) + > - tmp * tmat( 2, 3 ) + tmat( 3, 4 ) = tmat( 3, 4 ) + > - tmp * tmat( 2, 4 ) + tmat( 3, 5 ) = tmat( 3, 5 ) + > - tmp * tmat( 2, 5 ) + v( 3, i, j, k ) = v( 3, i, j, k ) + > - v( 2, i, j, k ) * tmp + + tmp = tmp1 * tmat( 4, 2 ) + tmat( 4, 3 ) = tmat( 4, 3 ) + > - tmp * tmat( 2, 3 ) + tmat( 4, 4 ) = tmat( 4, 4 ) + > - tmp * tmat( 2, 4 ) + tmat( 4, 5 ) = tmat( 4, 5 ) + > - tmp * tmat( 2, 5 ) + v( 4, i, j, k ) = v( 4, i, j, k ) + > - v( 2, i, j, k ) * tmp + + tmp = tmp1 * tmat( 5, 2 ) + tmat( 5, 3 ) = tmat( 5, 3 ) + > - tmp * tmat( 2, 3 ) + tmat( 5, 4 ) = tmat( 5, 4 ) + > - tmp * tmat( 2, 4 ) + tmat( 5, 5 ) = tmat( 5, 5 ) + > - tmp * tmat( 2, 5 ) + v( 5, i, j, k ) = v( 5, i, j, k ) + > - v( 2, i, j, k ) * tmp + + + + tmp1 = 1.0d+00 / tmat( 3, 3 ) + tmp = tmp1 * tmat( 4, 3 ) + tmat( 4, 4 ) = tmat( 4, 4 ) + > - tmp * tmat( 3, 4 ) + tmat( 4, 5 ) = tmat( 4, 5 ) + > - tmp * tmat( 3, 5 ) + v( 4, i, j, k ) = v( 4, i, j, k ) + > - v( 3, i, j, k ) * tmp + + tmp = tmp1 * tmat( 5, 3 ) + tmat( 5, 4 ) = tmat( 5, 4 ) + > - tmp * tmat( 3, 4 ) + tmat( 5, 5 ) = tmat( 5, 5 ) + > - tmp * tmat( 3, 5 ) + v( 5, i, j, k ) = v( 5, i, j, k ) + > - v( 3, i, j, k ) * tmp + + + + tmp1 = 1.0d+00 / tmat( 4, 4 ) + tmp = tmp1 * tmat( 5, 4 ) + tmat( 5, 5 ) = tmat( 5, 5 ) + > - tmp * tmat( 4, 5 ) + v( 5, i, j, k ) = v( 5, i, j, k ) + > - v( 4, i, j, k ) * tmp + +c--------------------------------------------------------------------- +c back substitution +c--------------------------------------------------------------------- + v( 5, i, j, k ) = v( 5, i, j, k ) + > / tmat( 5, 5 ) + + v( 4, i, j, k ) = v( 4, i, j, k ) + > - tmat( 4, 5 ) * v( 5, i, j, k ) + v( 4, i, j, k ) = v( 4, i, j, k ) + > / tmat( 4, 4 ) + + v( 3, i, j, k ) = v( 3, i, j, k ) + > - tmat( 3, 4 ) * v( 4, i, j, k ) + > - tmat( 3, 5 ) * v( 5, i, j, k ) + v( 3, i, j, k ) = v( 3, i, j, k ) + > / tmat( 3, 3 ) + + v( 2, i, j, k ) = v( 2, i, j, k ) + > - tmat( 2, 3 ) * v( 3, i, j, k ) + > - tmat( 2, 4 ) * v( 4, i, j, k ) + > - tmat( 2, 5 ) * v( 5, i, j, k ) + v( 2, i, j, k ) = v( 2, i, j, k ) + > / tmat( 2, 2 ) + + v( 1, i, j, k ) = v( 1, i, j, k ) + > - tmat( 1, 2 ) * v( 2, i, j, k ) + > - tmat( 1, 3 ) * v( 3, i, j, k ) + > - tmat( 1, 4 ) * v( 4, i, j, k ) + > - tmat( 1, 5 ) * v( 5, i, j, k ) + v( 1, i, j, k ) = v( 1, i, j, k ) + > / tmat( 1, 1 ) + + + enddo + enddo + +c--------------------------------------------------------------------- +c send data to east and south +c--------------------------------------------------------------------- + iex = 2 + call exchange_1( v,k,iex ) + + return + end + + diff --git a/examples/smpi/NAS/LU/buts.f b/examples/smpi/NAS/LU/buts.f new file mode 100644 index 0000000000..a6fc3d6217 --- /dev/null +++ b/examples/smpi/NAS/LU/buts.f @@ -0,0 +1,259 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine buts( ldmx, ldmy, ldmz, + > nx, ny, nz, k, + > omega, + > v, tv, + > d, udx, udy, udz, + > ist, iend, jst, jend, + > nx0, ny0, ipt, jpt ) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c compute the regular-sparse, block upper triangular solution: +c +c v <-- ( U-inv ) * v +c +c--------------------------------------------------------------------- + + implicit none + +c--------------------------------------------------------------------- +c input parameters +c--------------------------------------------------------------------- + integer ldmx, ldmy, ldmz + integer nx, ny, nz + integer k + double precision omega + double precision v( 5, -1:ldmx+2, -1:ldmy+2, *), + > tv(5, ldmx, ldmy), + > d( 5, 5, ldmx, ldmy), + > udx( 5, 5, ldmx, ldmy), + > udy( 5, 5, ldmx, ldmy), + > udz( 5, 5, ldmx, ldmy ) + integer ist, iend + integer jst, jend + integer nx0, ny0 + integer ipt, jpt + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer i, j, m + integer iex + double precision tmp, tmp1 + double precision tmat(5,5) + + +c--------------------------------------------------------------------- +c receive data from south and east +c--------------------------------------------------------------------- + iex = 1 + call exchange_1( v,k,iex ) + + do j = jend, jst, -1 + do i = iend, ist, -1 + do m = 1, 5 + tv( m, i, j ) = + > omega * ( udz( m, 1, i, j ) * v( 1, i, j, k+1 ) + > + udz( m, 2, i, j ) * v( 2, i, j, k+1 ) + > + udz( m, 3, i, j ) * v( 3, i, j, k+1 ) + > + udz( m, 4, i, j ) * v( 4, i, j, k+1 ) + > + udz( m, 5, i, j ) * v( 5, i, j, k+1 ) ) + end do + end do + end do + + + do j = jend,jst,-1 + do i = iend,ist,-1 + + do m = 1, 5 + tv( m, i, j ) = tv( m, i, j ) + > + omega * ( udy( m, 1, i, j ) * v( 1, i, j+1, k ) + > + udx( m, 1, i, j ) * v( 1, i+1, j, k ) + > + udy( m, 2, i, j ) * v( 2, i, j+1, k ) + > + udx( m, 2, i, j ) * v( 2, i+1, j, k ) + > + udy( m, 3, i, j ) * v( 3, i, j+1, k ) + > + udx( m, 3, i, j ) * v( 3, i+1, j, k ) + > + udy( m, 4, i, j ) * v( 4, i, j+1, k ) + > + udx( m, 4, i, j ) * v( 4, i+1, j, k ) + > + udy( m, 5, i, j ) * v( 5, i, j+1, k ) + > + udx( m, 5, i, j ) * v( 5, i+1, j, k ) ) + end do + +c--------------------------------------------------------------------- +c diagonal block inversion +c--------------------------------------------------------------------- + do m = 1, 5 + tmat( m, 1 ) = d( m, 1, i, j ) + tmat( m, 2 ) = d( m, 2, i, j ) + tmat( m, 3 ) = d( m, 3, i, j ) + tmat( m, 4 ) = d( m, 4, i, j ) + tmat( m, 5 ) = d( m, 5, i, j ) + end do + + tmp1 = 1.0d+00 / tmat( 1, 1 ) + tmp = tmp1 * tmat( 2, 1 ) + tmat( 2, 2 ) = tmat( 2, 2 ) + > - tmp * tmat( 1, 2 ) + tmat( 2, 3 ) = tmat( 2, 3 ) + > - tmp * tmat( 1, 3 ) + tmat( 2, 4 ) = tmat( 2, 4 ) + > - tmp * tmat( 1, 4 ) + tmat( 2, 5 ) = tmat( 2, 5 ) + > - tmp * tmat( 1, 5 ) + tv( 2, i, j ) = tv( 2, i, j ) + > - tv( 1, i, j ) * tmp + + tmp = tmp1 * tmat( 3, 1 ) + tmat( 3, 2 ) = tmat( 3, 2 ) + > - tmp * tmat( 1, 2 ) + tmat( 3, 3 ) = tmat( 3, 3 ) + > - tmp * tmat( 1, 3 ) + tmat( 3, 4 ) = tmat( 3, 4 ) + > - tmp * tmat( 1, 4 ) + tmat( 3, 5 ) = tmat( 3, 5 ) + > - tmp * tmat( 1, 5 ) + tv( 3, i, j ) = tv( 3, i, j ) + > - tv( 1, i, j ) * tmp + + tmp = tmp1 * tmat( 4, 1 ) + tmat( 4, 2 ) = tmat( 4, 2 ) + > - tmp * tmat( 1, 2 ) + tmat( 4, 3 ) = tmat( 4, 3 ) + > - tmp * tmat( 1, 3 ) + tmat( 4, 4 ) = tmat( 4, 4 ) + > - tmp * tmat( 1, 4 ) + tmat( 4, 5 ) = tmat( 4, 5 ) + > - tmp * tmat( 1, 5 ) + tv( 4, i, j ) = tv( 4, i, j ) + > - tv( 1, i, j ) * tmp + + tmp = tmp1 * tmat( 5, 1 ) + tmat( 5, 2 ) = tmat( 5, 2 ) + > - tmp * tmat( 1, 2 ) + tmat( 5, 3 ) = tmat( 5, 3 ) + > - tmp * tmat( 1, 3 ) + tmat( 5, 4 ) = tmat( 5, 4 ) + > - tmp * tmat( 1, 4 ) + tmat( 5, 5 ) = tmat( 5, 5 ) + > - tmp * tmat( 1, 5 ) + tv( 5, i, j ) = tv( 5, i, j ) + > - tv( 1, i, j ) * tmp + + + + tmp1 = 1.0d+00 / tmat( 2, 2 ) + tmp = tmp1 * tmat( 3, 2 ) + tmat( 3, 3 ) = tmat( 3, 3 ) + > - tmp * tmat( 2, 3 ) + tmat( 3, 4 ) = tmat( 3, 4 ) + > - tmp * tmat( 2, 4 ) + tmat( 3, 5 ) = tmat( 3, 5 ) + > - tmp * tmat( 2, 5 ) + tv( 3, i, j ) = tv( 3, i, j ) + > - tv( 2, i, j ) * tmp + + tmp = tmp1 * tmat( 4, 2 ) + tmat( 4, 3 ) = tmat( 4, 3 ) + > - tmp * tmat( 2, 3 ) + tmat( 4, 4 ) = tmat( 4, 4 ) + > - tmp * tmat( 2, 4 ) + tmat( 4, 5 ) = tmat( 4, 5 ) + > - tmp * tmat( 2, 5 ) + tv( 4, i, j ) = tv( 4, i, j ) + > - tv( 2, i, j ) * tmp + + tmp = tmp1 * tmat( 5, 2 ) + tmat( 5, 3 ) = tmat( 5, 3 ) + > - tmp * tmat( 2, 3 ) + tmat( 5, 4 ) = tmat( 5, 4 ) + > - tmp * tmat( 2, 4 ) + tmat( 5, 5 ) = tmat( 5, 5 ) + > - tmp * tmat( 2, 5 ) + tv( 5, i, j ) = tv( 5, i, j ) + > - tv( 2, i, j ) * tmp + + + + tmp1 = 1.0d+00 / tmat( 3, 3 ) + tmp = tmp1 * tmat( 4, 3 ) + tmat( 4, 4 ) = tmat( 4, 4 ) + > - tmp * tmat( 3, 4 ) + tmat( 4, 5 ) = tmat( 4, 5 ) + > - tmp * tmat( 3, 5 ) + tv( 4, i, j ) = tv( 4, i, j ) + > - tv( 3, i, j ) * tmp + + tmp = tmp1 * tmat( 5, 3 ) + tmat( 5, 4 ) = tmat( 5, 4 ) + > - tmp * tmat( 3, 4 ) + tmat( 5, 5 ) = tmat( 5, 5 ) + > - tmp * tmat( 3, 5 ) + tv( 5, i, j ) = tv( 5, i, j ) + > - tv( 3, i, j ) * tmp + + + + tmp1 = 1.0d+00 / tmat( 4, 4 ) + tmp = tmp1 * tmat( 5, 4 ) + tmat( 5, 5 ) = tmat( 5, 5 ) + > - tmp * tmat( 4, 5 ) + tv( 5, i, j ) = tv( 5, i, j ) + > - tv( 4, i, j ) * tmp + +c--------------------------------------------------------------------- +c back substitution +c--------------------------------------------------------------------- + tv( 5, i, j ) = tv( 5, i, j ) + > / tmat( 5, 5 ) + + tv( 4, i, j ) = tv( 4, i, j ) + > - tmat( 4, 5 ) * tv( 5, i, j ) + tv( 4, i, j ) = tv( 4, i, j ) + > / tmat( 4, 4 ) + + tv( 3, i, j ) = tv( 3, i, j ) + > - tmat( 3, 4 ) * tv( 4, i, j ) + > - tmat( 3, 5 ) * tv( 5, i, j ) + tv( 3, i, j ) = tv( 3, i, j ) + > / tmat( 3, 3 ) + + tv( 2, i, j ) = tv( 2, i, j ) + > - tmat( 2, 3 ) * tv( 3, i, j ) + > - tmat( 2, 4 ) * tv( 4, i, j ) + > - tmat( 2, 5 ) * tv( 5, i, j ) + tv( 2, i, j ) = tv( 2, i, j ) + > / tmat( 2, 2 ) + + tv( 1, i, j ) = tv( 1, i, j ) + > - tmat( 1, 2 ) * tv( 2, i, j ) + > - tmat( 1, 3 ) * tv( 3, i, j ) + > - tmat( 1, 4 ) * tv( 4, i, j ) + > - tmat( 1, 5 ) * tv( 5, i, j ) + tv( 1, i, j ) = tv( 1, i, j ) + > / tmat( 1, 1 ) + + v( 1, i, j, k ) = v( 1, i, j, k ) - tv( 1, i, j ) + v( 2, i, j, k ) = v( 2, i, j, k ) - tv( 2, i, j ) + v( 3, i, j, k ) = v( 3, i, j, k ) - tv( 3, i, j ) + v( 4, i, j, k ) = v( 4, i, j, k ) - tv( 4, i, j ) + v( 5, i, j, k ) = v( 5, i, j, k ) - tv( 5, i, j ) + + + enddo + end do + +c--------------------------------------------------------------------- +c send data to north and west +c--------------------------------------------------------------------- + iex = 3 + call exchange_1( v,k,iex ) + + return + end diff --git a/examples/smpi/NAS/LU/buts_vec.f b/examples/smpi/NAS/LU/buts_vec.f new file mode 100644 index 0000000000..813105d270 --- /dev/null +++ b/examples/smpi/NAS/LU/buts_vec.f @@ -0,0 +1,332 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine buts( ldmx, ldmy, ldmz, + > nx, ny, nz, k, + > omega, + > v, tv, + > d, udx, udy, udz, + > ist, iend, jst, jend, + > nx0, ny0, ipt, jpt ) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c compute the regular-sparse, block upper triangular solution: +c +c v <-- ( U-inv ) * v +c +c--------------------------------------------------------------------- + + implicit none + +c--------------------------------------------------------------------- +c input parameters +c--------------------------------------------------------------------- + integer ldmx, ldmy, ldmz + integer nx, ny, nz + integer k + double precision omega + double precision v( 5, -1:ldmx+2, -1:ldmy+2, *), + > tv(5, ldmx, ldmy), + > d( 5, 5, ldmx, ldmy), + > udx( 5, 5, ldmx, ldmy), + > udy( 5, 5, ldmx, ldmy), + > udz( 5, 5, ldmx, ldmy ) + integer ist, iend + integer jst, jend + integer nx0, ny0 + integer ipt, jpt + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer i, j, m, l, istp, iendp + integer iex + double precision tmp, tmp1 + double precision tmat(5,5) + + +c--------------------------------------------------------------------- +c receive data from south and east +c--------------------------------------------------------------------- + iex = 1 + call exchange_1( v,k,iex ) + + do j = jend, jst, -1 + do i = iend, ist, -1 + do m = 1, 5 + tv( m, i, j ) = + > omega * ( udz( m, 1, i, j ) * v( 1, i, j, k+1 ) + > + udz( m, 2, i, j ) * v( 2, i, j, k+1 ) + > + udz( m, 3, i, j ) * v( 3, i, j, k+1 ) + > + udz( m, 4, i, j ) * v( 4, i, j, k+1 ) + > + udz( m, 5, i, j ) * v( 5, i, j, k+1 ) ) + end do + end do + end do + + + do l = iend+jend, ist+jst, -1 + istp = max(l - jend, ist) + iendp = min(l - jst, iend) + +!dir$ ivdep + do i = istp, iendp + j = l - i + +!!dir$ unroll 5 +! manually unroll the loop +! do m = 1, 5 + tv( 1, i, j ) = tv( 1, i, j ) + > + omega * ( udy( 1, 1, i, j ) * v( 1, i, j+1, k ) + > + udx( 1, 1, i, j ) * v( 1, i+1, j, k ) + > + udy( 1, 2, i, j ) * v( 2, i, j+1, k ) + > + udx( 1, 2, i, j ) * v( 2, i+1, j, k ) + > + udy( 1, 3, i, j ) * v( 3, i, j+1, k ) + > + udx( 1, 3, i, j ) * v( 3, i+1, j, k ) + > + udy( 1, 4, i, j ) * v( 4, i, j+1, k ) + > + udx( 1, 4, i, j ) * v( 4, i+1, j, k ) + > + udy( 1, 5, i, j ) * v( 5, i, j+1, k ) + > + udx( 1, 5, i, j ) * v( 5, i+1, j, k ) ) + tv( 2, i, j ) = tv( 2, i, j ) + > + omega * ( udy( 2, 1, i, j ) * v( 1, i, j+1, k ) + > + udx( 2, 1, i, j ) * v( 1, i+1, j, k ) + > + udy( 2, 2, i, j ) * v( 2, i, j+1, k ) + > + udx( 2, 2, i, j ) * v( 2, i+1, j, k ) + > + udy( 2, 3, i, j ) * v( 3, i, j+1, k ) + > + udx( 2, 3, i, j ) * v( 3, i+1, j, k ) + > + udy( 2, 4, i, j ) * v( 4, i, j+1, k ) + > + udx( 2, 4, i, j ) * v( 4, i+1, j, k ) + > + udy( 2, 5, i, j ) * v( 5, i, j+1, k ) + > + udx( 2, 5, i, j ) * v( 5, i+1, j, k ) ) + tv( 3, i, j ) = tv( 3, i, j ) + > + omega * ( udy( 3, 1, i, j ) * v( 1, i, j+1, k ) + > + udx( 3, 1, i, j ) * v( 1, i+1, j, k ) + > + udy( 3, 2, i, j ) * v( 2, i, j+1, k ) + > + udx( 3, 2, i, j ) * v( 2, i+1, j, k ) + > + udy( 3, 3, i, j ) * v( 3, i, j+1, k ) + > + udx( 3, 3, i, j ) * v( 3, i+1, j, k ) + > + udy( 3, 4, i, j ) * v( 4, i, j+1, k ) + > + udx( 3, 4, i, j ) * v( 4, i+1, j, k ) + > + udy( 3, 5, i, j ) * v( 5, i, j+1, k ) + > + udx( 3, 5, i, j ) * v( 5, i+1, j, k ) ) + tv( 4, i, j ) = tv( 4, i, j ) + > + omega * ( udy( 4, 1, i, j ) * v( 1, i, j+1, k ) + > + udx( 4, 1, i, j ) * v( 1, i+1, j, k ) + > + udy( 4, 2, i, j ) * v( 2, i, j+1, k ) + > + udx( 4, 2, i, j ) * v( 2, i+1, j, k ) + > + udy( 4, 3, i, j ) * v( 3, i, j+1, k ) + > + udx( 4, 3, i, j ) * v( 3, i+1, j, k ) + > + udy( 4, 4, i, j ) * v( 4, i, j+1, k ) + > + udx( 4, 4, i, j ) * v( 4, i+1, j, k ) + > + udy( 4, 5, i, j ) * v( 5, i, j+1, k ) + > + udx( 4, 5, i, j ) * v( 5, i+1, j, k ) ) + tv( 5, i, j ) = tv( 5, i, j ) + > + omega * ( udy( 5, 1, i, j ) * v( 1, i, j+1, k ) + > + udx( 5, 1, i, j ) * v( 1, i+1, j, k ) + > + udy( 5, 2, i, j ) * v( 2, i, j+1, k ) + > + udx( 5, 2, i, j ) * v( 2, i+1, j, k ) + > + udy( 5, 3, i, j ) * v( 3, i, j+1, k ) + > + udx( 5, 3, i, j ) * v( 3, i+1, j, k ) + > + udy( 5, 4, i, j ) * v( 4, i, j+1, k ) + > + udx( 5, 4, i, j ) * v( 4, i+1, j, k ) + > + udy( 5, 5, i, j ) * v( 5, i, j+1, k ) + > + udx( 5, 5, i, j ) * v( 5, i+1, j, k ) ) +! end do + +c--------------------------------------------------------------------- +c diagonal block inversion +c--------------------------------------------------------------------- +!!dir$ unroll 5 +! manually unroll the loop +! do m = 1, 5 + tmat( 1, 1 ) = d( 1, 1, i, j ) + tmat( 1, 2 ) = d( 1, 2, i, j ) + tmat( 1, 3 ) = d( 1, 3, i, j ) + tmat( 1, 4 ) = d( 1, 4, i, j ) + tmat( 1, 5 ) = d( 1, 5, i, j ) + tmat( 2, 1 ) = d( 2, 1, i, j ) + tmat( 2, 2 ) = d( 2, 2, i, j ) + tmat( 2, 3 ) = d( 2, 3, i, j ) + tmat( 2, 4 ) = d( 2, 4, i, j ) + tmat( 2, 5 ) = d( 2, 5, i, j ) + tmat( 3, 1 ) = d( 3, 1, i, j ) + tmat( 3, 2 ) = d( 3, 2, i, j ) + tmat( 3, 3 ) = d( 3, 3, i, j ) + tmat( 3, 4 ) = d( 3, 4, i, j ) + tmat( 3, 5 ) = d( 3, 5, i, j ) + tmat( 4, 1 ) = d( 4, 1, i, j ) + tmat( 4, 2 ) = d( 4, 2, i, j ) + tmat( 4, 3 ) = d( 4, 3, i, j ) + tmat( 4, 4 ) = d( 4, 4, i, j ) + tmat( 4, 5 ) = d( 4, 5, i, j ) + tmat( 5, 1 ) = d( 5, 1, i, j ) + tmat( 5, 2 ) = d( 5, 2, i, j ) + tmat( 5, 3 ) = d( 5, 3, i, j ) + tmat( 5, 4 ) = d( 5, 4, i, j ) + tmat( 5, 5 ) = d( 5, 5, i, j ) +! end do + + tmp1 = 1.0d+00 / tmat( 1, 1 ) + tmp = tmp1 * tmat( 2, 1 ) + tmat( 2, 2 ) = tmat( 2, 2 ) + > - tmp * tmat( 1, 2 ) + tmat( 2, 3 ) = tmat( 2, 3 ) + > - tmp * tmat( 1, 3 ) + tmat( 2, 4 ) = tmat( 2, 4 ) + > - tmp * tmat( 1, 4 ) + tmat( 2, 5 ) = tmat( 2, 5 ) + > - tmp * tmat( 1, 5 ) + tv( 2, i, j ) = tv( 2, i, j ) + > - tv( 1, i, j ) * tmp + + tmp = tmp1 * tmat( 3, 1 ) + tmat( 3, 2 ) = tmat( 3, 2 ) + > - tmp * tmat( 1, 2 ) + tmat( 3, 3 ) = tmat( 3, 3 ) + > - tmp * tmat( 1, 3 ) + tmat( 3, 4 ) = tmat( 3, 4 ) + > - tmp * tmat( 1, 4 ) + tmat( 3, 5 ) = tmat( 3, 5 ) + > - tmp * tmat( 1, 5 ) + tv( 3, i, j ) = tv( 3, i, j ) + > - tv( 1, i, j ) * tmp + + tmp = tmp1 * tmat( 4, 1 ) + tmat( 4, 2 ) = tmat( 4, 2 ) + > - tmp * tmat( 1, 2 ) + tmat( 4, 3 ) = tmat( 4, 3 ) + > - tmp * tmat( 1, 3 ) + tmat( 4, 4 ) = tmat( 4, 4 ) + > - tmp * tmat( 1, 4 ) + tmat( 4, 5 ) = tmat( 4, 5 ) + > - tmp * tmat( 1, 5 ) + tv( 4, i, j ) = tv( 4, i, j ) + > - tv( 1, i, j ) * tmp + + tmp = tmp1 * tmat( 5, 1 ) + tmat( 5, 2 ) = tmat( 5, 2 ) + > - tmp * tmat( 1, 2 ) + tmat( 5, 3 ) = tmat( 5, 3 ) + > - tmp * tmat( 1, 3 ) + tmat( 5, 4 ) = tmat( 5, 4 ) + > - tmp * tmat( 1, 4 ) + tmat( 5, 5 ) = tmat( 5, 5 ) + > - tmp * tmat( 1, 5 ) + tv( 5, i, j ) = tv( 5, i, j ) + > - tv( 1, i, j ) * tmp + + + + tmp1 = 1.0d+00 / tmat( 2, 2 ) + tmp = tmp1 * tmat( 3, 2 ) + tmat( 3, 3 ) = tmat( 3, 3 ) + > - tmp * tmat( 2, 3 ) + tmat( 3, 4 ) = tmat( 3, 4 ) + > - tmp * tmat( 2, 4 ) + tmat( 3, 5 ) = tmat( 3, 5 ) + > - tmp * tmat( 2, 5 ) + tv( 3, i, j ) = tv( 3, i, j ) + > - tv( 2, i, j ) * tmp + + tmp = tmp1 * tmat( 4, 2 ) + tmat( 4, 3 ) = tmat( 4, 3 ) + > - tmp * tmat( 2, 3 ) + tmat( 4, 4 ) = tmat( 4, 4 ) + > - tmp * tmat( 2, 4 ) + tmat( 4, 5 ) = tmat( 4, 5 ) + > - tmp * tmat( 2, 5 ) + tv( 4, i, j ) = tv( 4, i, j ) + > - tv( 2, i, j ) * tmp + + tmp = tmp1 * tmat( 5, 2 ) + tmat( 5, 3 ) = tmat( 5, 3 ) + > - tmp * tmat( 2, 3 ) + tmat( 5, 4 ) = tmat( 5, 4 ) + > - tmp * tmat( 2, 4 ) + tmat( 5, 5 ) = tmat( 5, 5 ) + > - tmp * tmat( 2, 5 ) + tv( 5, i, j ) = tv( 5, i, j ) + > - tv( 2, i, j ) * tmp + + + + tmp1 = 1.0d+00 / tmat( 3, 3 ) + tmp = tmp1 * tmat( 4, 3 ) + tmat( 4, 4 ) = tmat( 4, 4 ) + > - tmp * tmat( 3, 4 ) + tmat( 4, 5 ) = tmat( 4, 5 ) + > - tmp * tmat( 3, 5 ) + tv( 4, i, j ) = tv( 4, i, j ) + > - tv( 3, i, j ) * tmp + + tmp = tmp1 * tmat( 5, 3 ) + tmat( 5, 4 ) = tmat( 5, 4 ) + > - tmp * tmat( 3, 4 ) + tmat( 5, 5 ) = tmat( 5, 5 ) + > - tmp * tmat( 3, 5 ) + tv( 5, i, j ) = tv( 5, i, j ) + > - tv( 3, i, j ) * tmp + + + + tmp1 = 1.0d+00 / tmat( 4, 4 ) + tmp = tmp1 * tmat( 5, 4 ) + tmat( 5, 5 ) = tmat( 5, 5 ) + > - tmp * tmat( 4, 5 ) + tv( 5, i, j ) = tv( 5, i, j ) + > - tv( 4, i, j ) * tmp + +c--------------------------------------------------------------------- +c back substitution +c--------------------------------------------------------------------- + tv( 5, i, j ) = tv( 5, i, j ) + > / tmat( 5, 5 ) + + tv( 4, i, j ) = tv( 4, i, j ) + > - tmat( 4, 5 ) * tv( 5, i, j ) + tv( 4, i, j ) = tv( 4, i, j ) + > / tmat( 4, 4 ) + + tv( 3, i, j ) = tv( 3, i, j ) + > - tmat( 3, 4 ) * tv( 4, i, j ) + > - tmat( 3, 5 ) * tv( 5, i, j ) + tv( 3, i, j ) = tv( 3, i, j ) + > / tmat( 3, 3 ) + + tv( 2, i, j ) = tv( 2, i, j ) + > - tmat( 2, 3 ) * tv( 3, i, j ) + > - tmat( 2, 4 ) * tv( 4, i, j ) + > - tmat( 2, 5 ) * tv( 5, i, j ) + tv( 2, i, j ) = tv( 2, i, j ) + > / tmat( 2, 2 ) + + tv( 1, i, j ) = tv( 1, i, j ) + > - tmat( 1, 2 ) * tv( 2, i, j ) + > - tmat( 1, 3 ) * tv( 3, i, j ) + > - tmat( 1, 4 ) * tv( 4, i, j ) + > - tmat( 1, 5 ) * tv( 5, i, j ) + tv( 1, i, j ) = tv( 1, i, j ) + > / tmat( 1, 1 ) + + v( 1, i, j, k ) = v( 1, i, j, k ) - tv( 1, i, j ) + v( 2, i, j, k ) = v( 2, i, j, k ) - tv( 2, i, j ) + v( 3, i, j, k ) = v( 3, i, j, k ) - tv( 3, i, j ) + v( 4, i, j, k ) = v( 4, i, j, k ) - tv( 4, i, j ) + v( 5, i, j, k ) = v( 5, i, j, k ) - tv( 5, i, j ) + + + enddo + end do + +c--------------------------------------------------------------------- +c send data to north and west +c--------------------------------------------------------------------- + iex = 3 + call exchange_1( v,k,iex ) + + return + end diff --git a/examples/smpi/NAS/LU/erhs.f b/examples/smpi/NAS/LU/erhs.f new file mode 100644 index 0000000000..928e2a9f50 --- /dev/null +++ b/examples/smpi/NAS/LU/erhs.f @@ -0,0 +1,536 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine erhs + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c compute the right hand side based on exact solution +c +c--------------------------------------------------------------------- + + implicit none + + include 'applu.incl' + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer i, j, k, m + integer iglob, jglob + integer iex + integer L1, L2 + integer ist1, iend1 + integer jst1, jend1 + double precision dsspm + double precision xi, eta, zeta + double precision q + double precision u21, u31, u41 + double precision tmp + double precision u21i, u31i, u41i, u51i + double precision u21j, u31j, u41j, u51j + double precision u21k, u31k, u41k, u51k + double precision u21im1, u31im1, u41im1, u51im1 + double precision u21jm1, u31jm1, u41jm1, u51jm1 + double precision u21km1, u31km1, u41km1, u51km1 + + dsspm = dssp + + + do k = 1, nz + do j = 1, ny + do i = 1, nx + do m = 1, 5 + frct( m, i, j, k ) = 0.0d+00 + end do + end do + end do + end do + + do k = 1, nz + zeta = ( dble(k-1) ) / ( nz - 1 ) + do j = 1, ny + jglob = jpt + j + eta = ( dble(jglob-1) ) / ( ny0 - 1 ) + do i = 1, nx + iglob = ipt + i + xi = ( dble(iglob-1) ) / ( nx0 - 1 ) + do m = 1, 5 + rsd(m,i,j,k) = ce(m,1) + > + ce(m,2) * xi + > + ce(m,3) * eta + > + ce(m,4) * zeta + > + ce(m,5) * xi * xi + > + ce(m,6) * eta * eta + > + ce(m,7) * zeta * zeta + > + ce(m,8) * xi * xi * xi + > + ce(m,9) * eta * eta * eta + > + ce(m,10) * zeta * zeta * zeta + > + ce(m,11) * xi * xi * xi * xi + > + ce(m,12) * eta * eta * eta * eta + > + ce(m,13) * zeta * zeta * zeta * zeta + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c xi-direction flux differences +c--------------------------------------------------------------------- +c +c iex = flag : iex = 0 north/south communication +c : iex = 1 east/west communication +c +c--------------------------------------------------------------------- + iex = 0 + +c--------------------------------------------------------------------- +c communicate and receive/send two rows of data +c--------------------------------------------------------------------- + call exchange_3 (rsd,iex) + + L1 = 0 + if (north.eq.-1) L1 = 1 + L2 = nx + 1 + if (south.eq.-1) L2 = nx + + ist1 = 1 + iend1 = nx + if (north.eq.-1) ist1 = 4 + if (south.eq.-1) iend1 = nx - 3 + + do k = 2, nz - 1 + do j = jst, jend + do i = L1, L2 + flux(1,i,j,k) = rsd(2,i,j,k) + u21 = rsd(2,i,j,k) / rsd(1,i,j,k) + q = 0.50d+00 * ( rsd(2,i,j,k) * rsd(2,i,j,k) + > + rsd(3,i,j,k) * rsd(3,i,j,k) + > + rsd(4,i,j,k) * rsd(4,i,j,k) ) + > / rsd(1,i,j,k) + flux(2,i,j,k) = rsd(2,i,j,k) * u21 + c2 * + > ( rsd(5,i,j,k) - q ) + flux(3,i,j,k) = rsd(3,i,j,k) * u21 + flux(4,i,j,k) = rsd(4,i,j,k) * u21 + flux(5,i,j,k) = ( c1 * rsd(5,i,j,k) - c2 * q ) * u21 + end do + end do + end do + + do k = 2, nz - 1 + do j = jst, jend + do i = ist, iend + do m = 1, 5 + frct(m,i,j,k) = frct(m,i,j,k) + > - tx2 * ( flux(m,i+1,j,k) - flux(m,i-1,j,k) ) + end do + end do + do i = ist, L2 + tmp = 1.0d+00 / rsd(1,i,j,k) + + u21i = tmp * rsd(2,i,j,k) + u31i = tmp * rsd(3,i,j,k) + u41i = tmp * rsd(4,i,j,k) + u51i = tmp * rsd(5,i,j,k) + + tmp = 1.0d+00 / rsd(1,i-1,j,k) + + u21im1 = tmp * rsd(2,i-1,j,k) + u31im1 = tmp * rsd(3,i-1,j,k) + u41im1 = tmp * rsd(4,i-1,j,k) + u51im1 = tmp * rsd(5,i-1,j,k) + + flux(2,i,j,k) = (4.0d+00/3.0d+00) * tx3 * + > ( u21i - u21im1 ) + flux(3,i,j,k) = tx3 * ( u31i - u31im1 ) + flux(4,i,j,k) = tx3 * ( u41i - u41im1 ) + flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 ) + > * tx3 * ( ( u21i **2 + u31i **2 + u41i **2 ) + > - ( u21im1**2 + u31im1**2 + u41im1**2 ) ) + > + (1.0d+00/6.0d+00) + > * tx3 * ( u21i**2 - u21im1**2 ) + > + c1 * c5 * tx3 * ( u51i - u51im1 ) + end do + + do i = ist, iend + frct(1,i,j,k) = frct(1,i,j,k) + > + dx1 * tx1 * ( rsd(1,i-1,j,k) + > - 2.0d+00 * rsd(1,i,j,k) + > + rsd(1,i+1,j,k) ) + frct(2,i,j,k) = frct(2,i,j,k) + > + tx3 * c3 * c4 * ( flux(2,i+1,j,k) - flux(2,i,j,k) ) + > + dx2 * tx1 * ( rsd(2,i-1,j,k) + > - 2.0d+00 * rsd(2,i,j,k) + > + rsd(2,i+1,j,k) ) + frct(3,i,j,k) = frct(3,i,j,k) + > + tx3 * c3 * c4 * ( flux(3,i+1,j,k) - flux(3,i,j,k) ) + > + dx3 * tx1 * ( rsd(3,i-1,j,k) + > - 2.0d+00 * rsd(3,i,j,k) + > + rsd(3,i+1,j,k) ) + frct(4,i,j,k) = frct(4,i,j,k) + > + tx3 * c3 * c4 * ( flux(4,i+1,j,k) - flux(4,i,j,k) ) + > + dx4 * tx1 * ( rsd(4,i-1,j,k) + > - 2.0d+00 * rsd(4,i,j,k) + > + rsd(4,i+1,j,k) ) + frct(5,i,j,k) = frct(5,i,j,k) + > + tx3 * c3 * c4 * ( flux(5,i+1,j,k) - flux(5,i,j,k) ) + > + dx5 * tx1 * ( rsd(5,i-1,j,k) + > - 2.0d+00 * rsd(5,i,j,k) + > + rsd(5,i+1,j,k) ) + end do + +c--------------------------------------------------------------------- +c Fourth-order dissipation +c--------------------------------------------------------------------- + IF (north.eq.-1) then + do m = 1, 5 + frct(m,2,j,k) = frct(m,2,j,k) + > - dsspm * ( + 5.0d+00 * rsd(m,2,j,k) + > - 4.0d+00 * rsd(m,3,j,k) + > + rsd(m,4,j,k) ) + frct(m,3,j,k) = frct(m,3,j,k) + > - dsspm * ( - 4.0d+00 * rsd(m,2,j,k) + > + 6.0d+00 * rsd(m,3,j,k) + > - 4.0d+00 * rsd(m,4,j,k) + > + rsd(m,5,j,k) ) + end do + END IF + + do i = ist1,iend1 + do m = 1, 5 + frct(m,i,j,k) = frct(m,i,j,k) + > - dsspm * ( rsd(m,i-2,j,k) + > - 4.0d+00 * rsd(m,i-1,j,k) + > + 6.0d+00 * rsd(m,i,j,k) + > - 4.0d+00 * rsd(m,i+1,j,k) + > + rsd(m,i+2,j,k) ) + end do + end do + + IF (south.eq.-1) then + do m = 1, 5 + frct(m,nx-2,j,k) = frct(m,nx-2,j,k) + > - dsspm * ( rsd(m,nx-4,j,k) + > - 4.0d+00 * rsd(m,nx-3,j,k) + > + 6.0d+00 * rsd(m,nx-2,j,k) + > - 4.0d+00 * rsd(m,nx-1,j,k) ) + frct(m,nx-1,j,k) = frct(m,nx-1,j,k) + > - dsspm * ( rsd(m,nx-3,j,k) + > - 4.0d+00 * rsd(m,nx-2,j,k) + > + 5.0d+00 * rsd(m,nx-1,j,k) ) + end do + END IF + + end do + end do + +c--------------------------------------------------------------------- +c eta-direction flux differences +c--------------------------------------------------------------------- +c +c iex = flag : iex = 0 north/south communication +c : iex = 1 east/west communication +c +c--------------------------------------------------------------------- + iex = 1 + +c--------------------------------------------------------------------- +c communicate and receive/send two rows of data +c--------------------------------------------------------------------- + call exchange_3 (rsd,iex) + + L1 = 0 + if (west.eq.-1) L1 = 1 + L2 = ny + 1 + if (east.eq.-1) L2 = ny + + jst1 = 1 + jend1 = ny + if (west.eq.-1) jst1 = 4 + if (east.eq.-1) jend1 = ny - 3 + + do k = 2, nz - 1 + do j = L1, L2 + do i = ist, iend + flux(1,i,j,k) = rsd(3,i,j,k) + u31 = rsd(3,i,j,k) / rsd(1,i,j,k) + q = 0.50d+00 * ( rsd(2,i,j,k) * rsd(2,i,j,k) + > + rsd(3,i,j,k) * rsd(3,i,j,k) + > + rsd(4,i,j,k) * rsd(4,i,j,k) ) + > / rsd(1,i,j,k) + flux(2,i,j,k) = rsd(2,i,j,k) * u31 + flux(3,i,j,k) = rsd(3,i,j,k) * u31 + c2 * + > ( rsd(5,i,j,k) - q ) + flux(4,i,j,k) = rsd(4,i,j,k) * u31 + flux(5,i,j,k) = ( c1 * rsd(5,i,j,k) - c2 * q ) * u31 + end do + end do + end do + + do k = 2, nz - 1 + do i = ist, iend + do j = jst, jend + do m = 1, 5 + frct(m,i,j,k) = frct(m,i,j,k) + > - ty2 * ( flux(m,i,j+1,k) - flux(m,i,j-1,k) ) + end do + end do + end do + + do j = jst, L2 + do i = ist, iend + tmp = 1.0d+00 / rsd(1,i,j,k) + + u21j = tmp * rsd(2,i,j,k) + u31j = tmp * rsd(3,i,j,k) + u41j = tmp * rsd(4,i,j,k) + u51j = tmp * rsd(5,i,j,k) + + tmp = 1.0d+00 / rsd(1,i,j-1,k) + + u21jm1 = tmp * rsd(2,i,j-1,k) + u31jm1 = tmp * rsd(3,i,j-1,k) + u41jm1 = tmp * rsd(4,i,j-1,k) + u51jm1 = tmp * rsd(5,i,j-1,k) + + flux(2,i,j,k) = ty3 * ( u21j - u21jm1 ) + flux(3,i,j,k) = (4.0d+00/3.0d+00) * ty3 * + > ( u31j - u31jm1 ) + flux(4,i,j,k) = ty3 * ( u41j - u41jm1 ) + flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 ) + > * ty3 * ( ( u21j **2 + u31j **2 + u41j **2 ) + > - ( u21jm1**2 + u31jm1**2 + u41jm1**2 ) ) + > + (1.0d+00/6.0d+00) + > * ty3 * ( u31j**2 - u31jm1**2 ) + > + c1 * c5 * ty3 * ( u51j - u51jm1 ) + end do + end do + + do j = jst, jend + do i = ist, iend + frct(1,i,j,k) = frct(1,i,j,k) + > + dy1 * ty1 * ( rsd(1,i,j-1,k) + > - 2.0d+00 * rsd(1,i,j,k) + > + rsd(1,i,j+1,k) ) + frct(2,i,j,k) = frct(2,i,j,k) + > + ty3 * c3 * c4 * ( flux(2,i,j+1,k) - flux(2,i,j,k) ) + > + dy2 * ty1 * ( rsd(2,i,j-1,k) + > - 2.0d+00 * rsd(2,i,j,k) + > + rsd(2,i,j+1,k) ) + frct(3,i,j,k) = frct(3,i,j,k) + > + ty3 * c3 * c4 * ( flux(3,i,j+1,k) - flux(3,i,j,k) ) + > + dy3 * ty1 * ( rsd(3,i,j-1,k) + > - 2.0d+00 * rsd(3,i,j,k) + > + rsd(3,i,j+1,k) ) + frct(4,i,j,k) = frct(4,i,j,k) + > + ty3 * c3 * c4 * ( flux(4,i,j+1,k) - flux(4,i,j,k) ) + > + dy4 * ty1 * ( rsd(4,i,j-1,k) + > - 2.0d+00 * rsd(4,i,j,k) + > + rsd(4,i,j+1,k) ) + frct(5,i,j,k) = frct(5,i,j,k) + > + ty3 * c3 * c4 * ( flux(5,i,j+1,k) - flux(5,i,j,k) ) + > + dy5 * ty1 * ( rsd(5,i,j-1,k) + > - 2.0d+00 * rsd(5,i,j,k) + > + rsd(5,i,j+1,k) ) + end do + end do + +c--------------------------------------------------------------------- +c fourth-order dissipation +c--------------------------------------------------------------------- + IF (west.eq.-1) then + do i = ist, iend + do m = 1, 5 + frct(m,i,2,k) = frct(m,i,2,k) + > - dsspm * ( + 5.0d+00 * rsd(m,i,2,k) + > - 4.0d+00 * rsd(m,i,3,k) + > + rsd(m,i,4,k) ) + frct(m,i,3,k) = frct(m,i,3,k) + > - dsspm * ( - 4.0d+00 * rsd(m,i,2,k) + > + 6.0d+00 * rsd(m,i,3,k) + > - 4.0d+00 * rsd(m,i,4,k) + > + rsd(m,i,5,k) ) + end do + end do + END IF + + do j = jst1, jend1 + do i = ist, iend + do m = 1, 5 + frct(m,i,j,k) = frct(m,i,j,k) + > - dsspm * ( rsd(m,i,j-2,k) + > - 4.0d+00 * rsd(m,i,j-1,k) + > + 6.0d+00 * rsd(m,i,j,k) + > - 4.0d+00 * rsd(m,i,j+1,k) + > + rsd(m,i,j+2,k) ) + end do + end do + end do + + IF (east.eq.-1) then + do i = ist, iend + do m = 1, 5 + frct(m,i,ny-2,k) = frct(m,i,ny-2,k) + > - dsspm * ( rsd(m,i,ny-4,k) + > - 4.0d+00 * rsd(m,i,ny-3,k) + > + 6.0d+00 * rsd(m,i,ny-2,k) + > - 4.0d+00 * rsd(m,i,ny-1,k) ) + frct(m,i,ny-1,k) = frct(m,i,ny-1,k) + > - dsspm * ( rsd(m,i,ny-3,k) + > - 4.0d+00 * rsd(m,i,ny-2,k) + > + 5.0d+00 * rsd(m,i,ny-1,k) ) + end do + end do + END IF + + end do + +c--------------------------------------------------------------------- +c zeta-direction flux differences +c--------------------------------------------------------------------- + do k = 1, nz + do j = jst, jend + do i = ist, iend + flux(1,i,j,k) = rsd(4,i,j,k) + u41 = rsd(4,i,j,k) / rsd(1,i,j,k) + q = 0.50d+00 * ( rsd(2,i,j,k) * rsd(2,i,j,k) + > + rsd(3,i,j,k) * rsd(3,i,j,k) + > + rsd(4,i,j,k) * rsd(4,i,j,k) ) + > / rsd(1,i,j,k) + flux(2,i,j,k) = rsd(2,i,j,k) * u41 + flux(3,i,j,k) = rsd(3,i,j,k) * u41 + flux(4,i,j,k) = rsd(4,i,j,k) * u41 + c2 * + > ( rsd(5,i,j,k) - q ) + flux(5,i,j,k) = ( c1 * rsd(5,i,j,k) - c2 * q ) * u41 + end do + end do + end do + + do k = 2, nz - 1 + do j = jst, jend + do i = ist, iend + do m = 1, 5 + frct(m,i,j,k) = frct(m,i,j,k) + > - tz2 * ( flux(m,i,j,k+1) - flux(m,i,j,k-1) ) + end do + end do + end do + end do + + do k = 2, nz + do j = jst, jend + do i = ist, iend + tmp = 1.0d+00 / rsd(1,i,j,k) + + u21k = tmp * rsd(2,i,j,k) + u31k = tmp * rsd(3,i,j,k) + u41k = tmp * rsd(4,i,j,k) + u51k = tmp * rsd(5,i,j,k) + + tmp = 1.0d+00 / rsd(1,i,j,k-1) + + u21km1 = tmp * rsd(2,i,j,k-1) + u31km1 = tmp * rsd(3,i,j,k-1) + u41km1 = tmp * rsd(4,i,j,k-1) + u51km1 = tmp * rsd(5,i,j,k-1) + + flux(2,i,j,k) = tz3 * ( u21k - u21km1 ) + flux(3,i,j,k) = tz3 * ( u31k - u31km1 ) + flux(4,i,j,k) = (4.0d+00/3.0d+00) * tz3 * ( u41k + > - u41km1 ) + flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 ) + > * tz3 * ( ( u21k **2 + u31k **2 + u41k **2 ) + > - ( u21km1**2 + u31km1**2 + u41km1**2 ) ) + > + (1.0d+00/6.0d+00) + > * tz3 * ( u41k**2 - u41km1**2 ) + > + c1 * c5 * tz3 * ( u51k - u51km1 ) + end do + end do + end do + + do k = 2, nz - 1 + do j = jst, jend + do i = ist, iend + frct(1,i,j,k) = frct(1,i,j,k) + > + dz1 * tz1 * ( rsd(1,i,j,k+1) + > - 2.0d+00 * rsd(1,i,j,k) + > + rsd(1,i,j,k-1) ) + frct(2,i,j,k) = frct(2,i,j,k) + > + tz3 * c3 * c4 * ( flux(2,i,j,k+1) - flux(2,i,j,k) ) + > + dz2 * tz1 * ( rsd(2,i,j,k+1) + > - 2.0d+00 * rsd(2,i,j,k) + > + rsd(2,i,j,k-1) ) + frct(3,i,j,k) = frct(3,i,j,k) + > + tz3 * c3 * c4 * ( flux(3,i,j,k+1) - flux(3,i,j,k) ) + > + dz3 * tz1 * ( rsd(3,i,j,k+1) + > - 2.0d+00 * rsd(3,i,j,k) + > + rsd(3,i,j,k-1) ) + frct(4,i,j,k) = frct(4,i,j,k) + > + tz3 * c3 * c4 * ( flux(4,i,j,k+1) - flux(4,i,j,k) ) + > + dz4 * tz1 * ( rsd(4,i,j,k+1) + > - 2.0d+00 * rsd(4,i,j,k) + > + rsd(4,i,j,k-1) ) + frct(5,i,j,k) = frct(5,i,j,k) + > + tz3 * c3 * c4 * ( flux(5,i,j,k+1) - flux(5,i,j,k) ) + > + dz5 * tz1 * ( rsd(5,i,j,k+1) + > - 2.0d+00 * rsd(5,i,j,k) + > + rsd(5,i,j,k-1) ) + end do + end do + end do + +c--------------------------------------------------------------------- +c fourth-order dissipation +c--------------------------------------------------------------------- + do j = jst, jend + do i = ist, iend + do m = 1, 5 + frct(m,i,j,2) = frct(m,i,j,2) + > - dsspm * ( + 5.0d+00 * rsd(m,i,j,2) + > - 4.0d+00 * rsd(m,i,j,3) + > + rsd(m,i,j,4) ) + frct(m,i,j,3) = frct(m,i,j,3) + > - dsspm * (- 4.0d+00 * rsd(m,i,j,2) + > + 6.0d+00 * rsd(m,i,j,3) + > - 4.0d+00 * rsd(m,i,j,4) + > + rsd(m,i,j,5) ) + end do + end do + end do + + do k = 4, nz - 3 + do j = jst, jend + do i = ist, iend + do m = 1, 5 + frct(m,i,j,k) = frct(m,i,j,k) + > - dsspm * ( rsd(m,i,j,k-2) + > - 4.0d+00 * rsd(m,i,j,k-1) + > + 6.0d+00 * rsd(m,i,j,k) + > - 4.0d+00 * rsd(m,i,j,k+1) + > + rsd(m,i,j,k+2) ) + end do + end do + end do + end do + + do j = jst, jend + do i = ist, iend + do m = 1, 5 + frct(m,i,j,nz-2) = frct(m,i,j,nz-2) + > - dsspm * ( rsd(m,i,j,nz-4) + > - 4.0d+00 * rsd(m,i,j,nz-3) + > + 6.0d+00 * rsd(m,i,j,nz-2) + > - 4.0d+00 * rsd(m,i,j,nz-1) ) + frct(m,i,j,nz-1) = frct(m,i,j,nz-1) + > - dsspm * ( rsd(m,i,j,nz-3) + > - 4.0d+00 * rsd(m,i,j,nz-2) + > + 5.0d+00 * rsd(m,i,j,nz-1) ) + end do + end do + end do + + return + end diff --git a/examples/smpi/NAS/LU/error.f b/examples/smpi/NAS/LU/error.f new file mode 100644 index 0000000000..e83f74912f --- /dev/null +++ b/examples/smpi/NAS/LU/error.f @@ -0,0 +1,81 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine error + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c compute the solution error +c +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + include 'applu.incl' + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer i, j, k, m + integer iglob, jglob + double precision tmp + double precision u000ijk(5), dummy(5) + + integer IERROR + + + do m = 1, 5 + errnm(m) = 0.0d+00 + dummy(m) = 0.0d+00 + end do + + do k = 2, nz-1 + do j = jst, jend + jglob = jpt + j + do i = ist, iend + iglob = ipt + i + call exact( iglob, jglob, k, u000ijk ) + do m = 1, 5 + tmp = ( u000ijk(m) - u(m,i,j,k) ) + dummy(m) = dummy(m) + tmp ** 2 + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c compute the global sum of individual contributions to dot product. +c--------------------------------------------------------------------- + call MPI_ALLREDUCE( dummy, + > errnm, + > 5, + > dp_type, + > MPI_SUM, + > MPI_COMM_WORLD, + > IERROR ) + + do m = 1, 5 + errnm(m) = sqrt ( errnm(m) / ( (nx0-2)*(ny0-2)*(nz0-2) ) ) + end do + +c if (id.eq.0) then +c write (*,1002) ( errnm(m), m = 1, 5 ) +c end if + + 1002 format (1x/1x,'RMS-norm of error in soln. to ', + > 'first pde = ',1pe12.5/, + > 1x,'RMS-norm of error in soln. to ', + > 'second pde = ',1pe12.5/, + > 1x,'RMS-norm of error in soln. to ', + > 'third pde = ',1pe12.5/, + > 1x,'RMS-norm of error in soln. to ', + > 'fourth pde = ',1pe12.5/, + > 1x,'RMS-norm of error in soln. to ', + > 'fifth pde = ',1pe12.5) + + return + end diff --git a/examples/smpi/NAS/LU/exact.f b/examples/smpi/NAS/LU/exact.f new file mode 100644 index 0000000000..19e14c3d5f --- /dev/null +++ b/examples/smpi/NAS/LU/exact.f @@ -0,0 +1,53 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine exact( i, j, k, u000ijk ) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c compute the exact solution at (i,j,k) +c +c--------------------------------------------------------------------- + + implicit none + + include 'applu.incl' + +c--------------------------------------------------------------------- +c input parameters +c--------------------------------------------------------------------- + integer i, j, k + double precision u000ijk(*) + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer m + double precision xi, eta, zeta + + xi = ( dble ( i - 1 ) ) / ( nx0 - 1 ) + eta = ( dble ( j - 1 ) ) / ( ny0 - 1 ) + zeta = ( dble ( k - 1 ) ) / ( nz - 1 ) + + + do m = 1, 5 + u000ijk(m) = ce(m,1) + > + ce(m,2) * xi + > + ce(m,3) * eta + > + ce(m,4) * zeta + > + ce(m,5) * xi * xi + > + ce(m,6) * eta * eta + > + ce(m,7) * zeta * zeta + > + ce(m,8) * xi * xi * xi + > + ce(m,9) * eta * eta * eta + > + ce(m,10) * zeta * zeta * zeta + > + ce(m,11) * xi * xi * xi * xi + > + ce(m,12) * eta * eta * eta * eta + > + ce(m,13) * zeta * zeta * zeta * zeta + end do + + return + end diff --git a/examples/smpi/NAS/LU/exchange_1.f b/examples/smpi/NAS/LU/exchange_1.f new file mode 100644 index 0000000000..2bf7d28b94 --- /dev/null +++ b/examples/smpi/NAS/LU/exchange_1.f @@ -0,0 +1,180 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine exchange_1( g,k,iex ) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + + implicit none + + include 'mpinpb.h' + include 'applu.incl' + + double precision g(5,-1:isiz1+2,-1:isiz2+2,isiz3) + integer k + integer iex + integer i, j + double precision dum(5,isiz1+isiz2), dum1(5,isiz1+isiz2) + + integer STATUS(MPI_STATUS_SIZE) + integer IERROR + + + + if( iex .eq. 0 ) then + + if( north .ne. -1 ) then + call MPI_RECV( dum1(1,jst), + > 5*(jend-jst+1), + > dp_type, + > north, + > from_n, + > MPI_COMM_WORLD, + > status, + > IERROR ) + do j=jst,jend + g(1,0,j,k) = dum1(1,j) + g(2,0,j,k) = dum1(2,j) + g(3,0,j,k) = dum1(3,j) + g(4,0,j,k) = dum1(4,j) + g(5,0,j,k) = dum1(5,j) + enddo + endif + + if( west .ne. -1 ) then + call MPI_RECV( dum1(1,ist), + > 5*(iend-ist+1), + > dp_type, + > west, + > from_w, + > MPI_COMM_WORLD, + > status, + > IERROR ) + do i=ist,iend + g(1,i,0,k) = dum1(1,i) + g(2,i,0,k) = dum1(2,i) + g(3,i,0,k) = dum1(3,i) + g(4,i,0,k) = dum1(4,i) + g(5,i,0,k) = dum1(5,i) + enddo + endif + + else if( iex .eq. 1 ) then + + if( south .ne. -1 ) then + call MPI_RECV( dum1(1,jst), + > 5*(jend-jst+1), + > dp_type, + > south, + > from_s, + > MPI_COMM_WORLD, + > status, + > IERROR ) + do j=jst,jend + g(1,nx+1,j,k) = dum1(1,j) + g(2,nx+1,j,k) = dum1(2,j) + g(3,nx+1,j,k) = dum1(3,j) + g(4,nx+1,j,k) = dum1(4,j) + g(5,nx+1,j,k) = dum1(5,j) + enddo + endif + + if( east .ne. -1 ) then + call MPI_RECV( dum1(1,ist), + > 5*(iend-ist+1), + > dp_type, + > east, + > from_e, + > MPI_COMM_WORLD, + > status, + > IERROR ) + do i=ist,iend + g(1,i,ny+1,k) = dum1(1,i) + g(2,i,ny+1,k) = dum1(2,i) + g(3,i,ny+1,k) = dum1(3,i) + g(4,i,ny+1,k) = dum1(4,i) + g(5,i,ny+1,k) = dum1(5,i) + enddo + endif + + else if( iex .eq. 2 ) then + + if( south .ne. -1 ) then + do j=jst,jend + dum(1,j) = g(1,nx,j,k) + dum(2,j) = g(2,nx,j,k) + dum(3,j) = g(3,nx,j,k) + dum(4,j) = g(4,nx,j,k) + dum(5,j) = g(5,nx,j,k) + enddo + call MPI_SEND( dum(1,jst), + > 5*(jend-jst+1), + > dp_type, + > south, + > from_n, + > MPI_COMM_WORLD, + > IERROR ) + endif + + if( east .ne. -1 ) then + do i=ist,iend + dum(1,i) = g(1,i,ny,k) + dum(2,i) = g(2,i,ny,k) + dum(3,i) = g(3,i,ny,k) + dum(4,i) = g(4,i,ny,k) + dum(5,i) = g(5,i,ny,k) + enddo + call MPI_SEND( dum(1,ist), + > 5*(iend-ist+1), + > dp_type, + > east, + > from_w, + > MPI_COMM_WORLD, + > IERROR ) + endif + + else + + if( north .ne. -1 ) then + do j=jst,jend + dum(1,j) = g(1,1,j,k) + dum(2,j) = g(2,1,j,k) + dum(3,j) = g(3,1,j,k) + dum(4,j) = g(4,1,j,k) + dum(5,j) = g(5,1,j,k) + enddo + call MPI_SEND( dum(1,jst), + > 5*(jend-jst+1), + > dp_type, + > north, + > from_s, + > MPI_COMM_WORLD, + > IERROR ) + endif + + if( west .ne. -1 ) then + do i=ist,iend + dum(1,i) = g(1,i,1,k) + dum(2,i) = g(2,i,1,k) + dum(3,i) = g(3,i,1,k) + dum(4,i) = g(4,i,1,k) + dum(5,i) = g(5,i,1,k) + enddo + call MPI_SEND( dum(1,ist), + > 5*(iend-ist+1), + > dp_type, + > west, + > from_e, + > MPI_COMM_WORLD, + > IERROR ) + endif + + endif + + end + + + diff --git a/examples/smpi/NAS/LU/exchange_3.f b/examples/smpi/NAS/LU/exchange_3.f new file mode 100644 index 0000000000..d52ae7ef4d --- /dev/null +++ b/examples/smpi/NAS/LU/exchange_3.f @@ -0,0 +1,312 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine exchange_3(g,iex) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c compute the right hand side based on exact solution +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + include 'applu.incl' + +c--------------------------------------------------------------------- +c input parameters +c--------------------------------------------------------------------- + double precision g(5,-1:isiz1+2,-1:isiz2+2,isiz3) + integer iex + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer i, j, k + integer ipos1, ipos2 + + integer mid + integer STATUS(MPI_STATUS_SIZE) + integer IERROR + + + + if (iex.eq.0) then +c--------------------------------------------------------------------- +c communicate in the south and north directions +c--------------------------------------------------------------------- + if (north.ne.-1) then + call MPI_IRECV( buf1, + > 10*ny*nz, + > dp_type, + > MPI_ANY_SOURCE, + > from_n, + > MPI_COMM_WORLD, + > mid, + > IERROR ) + end if + +c--------------------------------------------------------------------- +c send south +c--------------------------------------------------------------------- + if (south.ne.-1) then + do k = 1,nz + do j = 1,ny + ipos1 = (k-1)*ny + j + ipos2 = ipos1 + ny*nz + buf(1,ipos1) = g(1,nx-1,j,k) + buf(2,ipos1) = g(2,nx-1,j,k) + buf(3,ipos1) = g(3,nx-1,j,k) + buf(4,ipos1) = g(4,nx-1,j,k) + buf(5,ipos1) = g(5,nx-1,j,k) + buf(1,ipos2) = g(1,nx,j,k) + buf(2,ipos2) = g(2,nx,j,k) + buf(3,ipos2) = g(3,nx,j,k) + buf(4,ipos2) = g(4,nx,j,k) + buf(5,ipos2) = g(5,nx,j,k) + end do + end do + + call MPI_SEND( buf, + > 10*ny*nz, + > dp_type, + > south, + > from_n, + > MPI_COMM_WORLD, + > IERROR ) + end if + +c--------------------------------------------------------------------- +c receive from north +c--------------------------------------------------------------------- + if (north.ne.-1) then + call MPI_WAIT( mid, STATUS, IERROR ) + + do k = 1,nz + do j = 1,ny + ipos1 = (k-1)*ny + j + ipos2 = ipos1 + ny*nz + g(1,-1,j,k) = buf1(1,ipos1) + g(2,-1,j,k) = buf1(2,ipos1) + g(3,-1,j,k) = buf1(3,ipos1) + g(4,-1,j,k) = buf1(4,ipos1) + g(5,-1,j,k) = buf1(5,ipos1) + g(1,0,j,k) = buf1(1,ipos2) + g(2,0,j,k) = buf1(2,ipos2) + g(3,0,j,k) = buf1(3,ipos2) + g(4,0,j,k) = buf1(4,ipos2) + g(5,0,j,k) = buf1(5,ipos2) + end do + end do + + end if + + if (south.ne.-1) then + call MPI_IRECV( buf1, + > 10*ny*nz, + > dp_type, + > MPI_ANY_SOURCE, + > from_s, + > MPI_COMM_WORLD, + > mid, + > IERROR ) + end if + +c--------------------------------------------------------------------- +c send north +c--------------------------------------------------------------------- + if (north.ne.-1) then + do k = 1,nz + do j = 1,ny + ipos1 = (k-1)*ny + j + ipos2 = ipos1 + ny*nz + buf(1,ipos1) = g(1,2,j,k) + buf(2,ipos1) = g(2,2,j,k) + buf(3,ipos1) = g(3,2,j,k) + buf(4,ipos1) = g(4,2,j,k) + buf(5,ipos1) = g(5,2,j,k) + buf(1,ipos2) = g(1,1,j,k) + buf(2,ipos2) = g(2,1,j,k) + buf(3,ipos2) = g(3,1,j,k) + buf(4,ipos2) = g(4,1,j,k) + buf(5,ipos2) = g(5,1,j,k) + end do + end do + + call MPI_SEND( buf, + > 10*ny*nz, + > dp_type, + > north, + > from_s, + > MPI_COMM_WORLD, + > IERROR ) + end if + +c--------------------------------------------------------------------- +c receive from south +c--------------------------------------------------------------------- + if (south.ne.-1) then + call MPI_WAIT( mid, STATUS, IERROR ) + + do k = 1,nz + do j = 1,ny + ipos1 = (k-1)*ny + j + ipos2 = ipos1 + ny*nz + g(1,nx+2,j,k) = buf1(1,ipos1) + g(2,nx+2,j,k) = buf1(2,ipos1) + g(3,nx+2,j,k) = buf1(3,ipos1) + g(4,nx+2,j,k) = buf1(4,ipos1) + g(5,nx+2,j,k) = buf1(5,ipos1) + g(1,nx+1,j,k) = buf1(1,ipos2) + g(2,nx+1,j,k) = buf1(2,ipos2) + g(3,nx+1,j,k) = buf1(3,ipos2) + g(4,nx+1,j,k) = buf1(4,ipos2) + g(5,nx+1,j,k) = buf1(5,ipos2) + end do + end do + end if + + else + +c--------------------------------------------------------------------- +c communicate in the east and west directions +c--------------------------------------------------------------------- + if (west.ne.-1) then + call MPI_IRECV( buf1, + > 10*nx*nz, + > dp_type, + > MPI_ANY_SOURCE, + > from_w, + > MPI_COMM_WORLD, + > mid, + > IERROR ) + end if + +c--------------------------------------------------------------------- +c send east +c--------------------------------------------------------------------- + if (east.ne.-1) then + do k = 1,nz + do i = 1,nx + ipos1 = (k-1)*nx + i + ipos2 = ipos1 + nx*nz + buf(1,ipos1) = g(1,i,ny-1,k) + buf(2,ipos1) = g(2,i,ny-1,k) + buf(3,ipos1) = g(3,i,ny-1,k) + buf(4,ipos1) = g(4,i,ny-1,k) + buf(5,ipos1) = g(5,i,ny-1,k) + buf(1,ipos2) = g(1,i,ny,k) + buf(2,ipos2) = g(2,i,ny,k) + buf(3,ipos2) = g(3,i,ny,k) + buf(4,ipos2) = g(4,i,ny,k) + buf(5,ipos2) = g(5,i,ny,k) + end do + end do + + call MPI_SEND( buf, + > 10*nx*nz, + > dp_type, + > east, + > from_w, + > MPI_COMM_WORLD, + > IERROR ) + end if + +c--------------------------------------------------------------------- +c receive from west +c--------------------------------------------------------------------- + if (west.ne.-1) then + call MPI_WAIT( mid, STATUS, IERROR ) + + do k = 1,nz + do i = 1,nx + ipos1 = (k-1)*nx + i + ipos2 = ipos1 + nx*nz + g(1,i,-1,k) = buf1(1,ipos1) + g(2,i,-1,k) = buf1(2,ipos1) + g(3,i,-1,k) = buf1(3,ipos1) + g(4,i,-1,k) = buf1(4,ipos1) + g(5,i,-1,k) = buf1(5,ipos1) + g(1,i,0,k) = buf1(1,ipos2) + g(2,i,0,k) = buf1(2,ipos2) + g(3,i,0,k) = buf1(3,ipos2) + g(4,i,0,k) = buf1(4,ipos2) + g(5,i,0,k) = buf1(5,ipos2) + end do + end do + + end if + + if (east.ne.-1) then + call MPI_IRECV( buf1, + > 10*nx*nz, + > dp_type, + > MPI_ANY_SOURCE, + > from_e, + > MPI_COMM_WORLD, + > mid, + > IERROR ) + end if + +c--------------------------------------------------------------------- +c send west +c--------------------------------------------------------------------- + if (west.ne.-1) then + do k = 1,nz + do i = 1,nx + ipos1 = (k-1)*nx + i + ipos2 = ipos1 + nx*nz + buf(1,ipos1) = g(1,i,2,k) + buf(2,ipos1) = g(2,i,2,k) + buf(3,ipos1) = g(3,i,2,k) + buf(4,ipos1) = g(4,i,2,k) + buf(5,ipos1) = g(5,i,2,k) + buf(1,ipos2) = g(1,i,1,k) + buf(2,ipos2) = g(2,i,1,k) + buf(3,ipos2) = g(3,i,1,k) + buf(4,ipos2) = g(4,i,1,k) + buf(5,ipos2) = g(5,i,1,k) + end do + end do + + call MPI_SEND( buf, + > 10*nx*nz, + > dp_type, + > west, + > from_e, + > MPI_COMM_WORLD, + > IERROR ) + end if + +c--------------------------------------------------------------------- +c receive from east +c--------------------------------------------------------------------- + if (east.ne.-1) then + call MPI_WAIT( mid, STATUS, IERROR ) + + do k = 1,nz + do i = 1,nx + ipos1 = (k-1)*nx + i + ipos2 = ipos1 + nx*nz + g(1,i,ny+2,k) = buf1(1,ipos1) + g(2,i,ny+2,k) = buf1(2,ipos1) + g(3,i,ny+2,k) = buf1(3,ipos1) + g(4,i,ny+2,k) = buf1(4,ipos1) + g(5,i,ny+2,k) = buf1(5,ipos1) + g(1,i,ny+1,k) = buf1(1,ipos2) + g(2,i,ny+1,k) = buf1(2,ipos2) + g(3,i,ny+1,k) = buf1(3,ipos2) + g(4,i,ny+1,k) = buf1(4,ipos2) + g(5,i,ny+1,k) = buf1(5,ipos2) + end do + end do + + end if + + end if + + return + end diff --git a/examples/smpi/NAS/LU/exchange_4.f b/examples/smpi/NAS/LU/exchange_4.f new file mode 100644 index 0000000000..1c4c38e218 --- /dev/null +++ b/examples/smpi/NAS/LU/exchange_4.f @@ -0,0 +1,133 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine exchange_4(g,h,ibeg,ifin1,jbeg,jfin1) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c compute the right hand side based on exact solution +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + include 'applu.incl' + +c--------------------------------------------------------------------- +c input parameters +c--------------------------------------------------------------------- + double precision g(0:isiz2+1,0:isiz3+1), + > h(0:isiz2+1,0:isiz3+1) + integer ibeg, ifin1 + integer jbeg, jfin1 + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer i, j + integer ny2 + double precision dum(1024) + + integer msgid1, msgid3 + integer STATUS(MPI_STATUS_SIZE) + integer IERROR + + + + ny2 = ny + 2 + +c--------------------------------------------------------------------- +c communicate in the east and west directions +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c receive from east +c--------------------------------------------------------------------- + if (jfin1.eq.ny) then + call MPI_IRECV( dum, + > 2*nx, + > dp_type, + > MPI_ANY_SOURCE, + > from_e, + > MPI_COMM_WORLD, + > msgid3, + > IERROR ) + + call MPI_WAIT( msgid3, STATUS, IERROR ) + + do i = 1,nx + g(i,ny+1) = dum(i) + h(i,ny+1) = dum(i+nx) + end do + + end if + +c--------------------------------------------------------------------- +c send west +c--------------------------------------------------------------------- + if (jbeg.eq.1) then + do i = 1,nx + dum(i) = g(i,1) + dum(i+nx) = h(i,1) + end do + + call MPI_SEND( dum, + > 2*nx, + > dp_type, + > west, + > from_e, + > MPI_COMM_WORLD, + > IERROR ) + + end if + +c--------------------------------------------------------------------- +c communicate in the south and north directions +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c receive from south +c--------------------------------------------------------------------- + if (ifin1.eq.nx) then + call MPI_IRECV( dum, + > 2*ny2, + > dp_type, + > MPI_ANY_SOURCE, + > from_s, + > MPI_COMM_WORLD, + > msgid1, + > IERROR ) + + call MPI_WAIT( msgid1, STATUS, IERROR ) + + do j = 0,ny+1 + g(nx+1,j) = dum(j+1) + h(nx+1,j) = dum(j+ny2+1) + end do + + end if + +c--------------------------------------------------------------------- +c send north +c--------------------------------------------------------------------- + if (ibeg.eq.1) then + do j = 0,ny+1 + dum(j+1) = g(1,j) + dum(j+ny2+1) = h(1,j) + end do + + call MPI_SEND( dum, + > 2*ny2, + > dp_type, + > north, + > from_s, + > MPI_COMM_WORLD, + > IERROR ) + + end if + + return + end diff --git a/examples/smpi/NAS/LU/exchange_5.f b/examples/smpi/NAS/LU/exchange_5.f new file mode 100644 index 0000000000..e4cc66f5bf --- /dev/null +++ b/examples/smpi/NAS/LU/exchange_5.f @@ -0,0 +1,81 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine exchange_5(g,ibeg,ifin1) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c compute the right hand side based on exact solution +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + include 'applu.incl' + +c--------------------------------------------------------------------- +c input parameters +c--------------------------------------------------------------------- + double precision g(0:isiz2+1,0:isiz3+1) + integer ibeg, ifin1 + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer k + double precision dum(1024) + + integer msgid1 + integer STATUS(MPI_STATUS_SIZE) + integer IERROR + + + +c--------------------------------------------------------------------- +c communicate in the south and north directions +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c receive from south +c--------------------------------------------------------------------- + if (ifin1.eq.nx) then + call MPI_IRECV( dum, + > nz, + > dp_type, + > MPI_ANY_SOURCE, + > from_s, + > MPI_COMM_WORLD, + > msgid1, + > IERROR ) + + call MPI_WAIT( msgid1, STATUS, IERROR ) + + do k = 1,nz + g(nx+1,k) = dum(k) + end do + + end if + +c--------------------------------------------------------------------- +c send north +c--------------------------------------------------------------------- + if (ibeg.eq.1) then + do k = 1,nz + dum(k) = g(1,k) + end do + + call MPI_SEND( dum, + > nz, + > dp_type, + > north, + > from_s, + > MPI_COMM_WORLD, + > IERROR ) + + end if + + return + end diff --git a/examples/smpi/NAS/LU/exchange_6.f b/examples/smpi/NAS/LU/exchange_6.f new file mode 100644 index 0000000000..0626609547 --- /dev/null +++ b/examples/smpi/NAS/LU/exchange_6.f @@ -0,0 +1,81 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine exchange_6(g,jbeg,jfin1) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c compute the right hand side based on exact solution +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + include 'applu.incl' + +c--------------------------------------------------------------------- +c input parameters +c--------------------------------------------------------------------- + double precision g(0:isiz2+1,0:isiz3+1) + integer jbeg, jfin1 + +c--------------------------------------------------------------------- +c local parameters +c--------------------------------------------------------------------- + integer k + double precision dum(1024) + + integer msgid3 + integer STATUS(MPI_STATUS_SIZE) + integer IERROR + + + +c--------------------------------------------------------------------- +c communicate in the east and west directions +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c receive from east +c--------------------------------------------------------------------- + if (jfin1.eq.ny) then + call MPI_IRECV( dum, + > nz, + > dp_type, + > MPI_ANY_SOURCE, + > from_e, + > MPI_COMM_WORLD, + > msgid3, + > IERROR ) + + call MPI_WAIT( msgid3, STATUS, IERROR ) + + do k = 1,nz + g(ny+1,k) = dum(k) + end do + + end if + +c--------------------------------------------------------------------- +c send west +c--------------------------------------------------------------------- + if (jbeg.eq.1) then + do k = 1,nz + dum(k) = g(1,k) + end do + + call MPI_SEND( dum, + > nz, + > dp_type, + > west, + > from_e, + > MPI_COMM_WORLD, + > IERROR ) + + end if + + return + end diff --git a/examples/smpi/NAS/LU/init_comm.f b/examples/smpi/NAS/LU/init_comm.f new file mode 100644 index 0000000000..72ece00ac4 --- /dev/null +++ b/examples/smpi/NAS/LU/init_comm.f @@ -0,0 +1,57 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine init_comm + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c initialize MPI and establish rank and size +c +c This is a module in the MPI implementation of LUSSOR +c pseudo application from the NAS Parallel Benchmarks. +c +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + include 'applu.incl' + + integer nodedim + integer IERROR + + +c--------------------------------------------------------------------- +c initialize MPI communication +c--------------------------------------------------------------------- + call MPI_INIT( IERROR ) + +c--------------------------------------------------------------------- +c establish the global rank of this process +c--------------------------------------------------------------------- + call MPI_COMM_RANK( MPI_COMM_WORLD, + > id, + > IERROR ) + +c--------------------------------------------------------------------- +c establish the size of the global group +c--------------------------------------------------------------------- + call MPI_COMM_SIZE( MPI_COMM_WORLD, + > num, + > IERROR ) + + ndim = nodedim(num) + + if (.not. convertdouble) then + dp_type = MPI_DOUBLE_PRECISION + else + dp_type = MPI_REAL + endif + + + return + end diff --git a/examples/smpi/NAS/LU/inputlu.data.sample b/examples/smpi/NAS/LU/inputlu.data.sample new file mode 100644 index 0000000000..9ef5a7be00 --- /dev/null +++ b/examples/smpi/NAS/LU/inputlu.data.sample @@ -0,0 +1,24 @@ +c +c***controls printing of the progress of iterations: ipr inorm + 1 250 +c +c***the maximum no. of pseudo-time steps to be performed: nitmax + 250 +c +c***magnitude of the time step: dt + 2.0e+00 +c +c***relaxation factor for SSOR iterations: omega + 1.2 +c +c***tolerance levels for steady-state residuals: tolnwt(m),m=1,5 + 1.0e-08 1.0e-08 1.0e-08 1.0e-08 1.0e-08 +c +c***number of grid points in xi and eta and zeta directions: nx ny nz + 64 64 64 +c + + + + + diff --git a/examples/smpi/NAS/LU/jacld.f b/examples/smpi/NAS/LU/jacld.f new file mode 100644 index 0000000000..9580d080ab --- /dev/null +++ b/examples/smpi/NAS/LU/jacld.f @@ -0,0 +1,384 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine jacld(k) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + +c--------------------------------------------------------------------- +c compute the lower triangular part of the jacobian matrix +c--------------------------------------------------------------------- + + implicit none + + include 'applu.incl' + +c--------------------------------------------------------------------- +c input parameters +c--------------------------------------------------------------------- + integer k + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer i, j + double precision r43 + double precision c1345 + double precision c34 + double precision tmp1, tmp2, tmp3 + + + + r43 = ( 4.0d+00 / 3.0d+00 ) + c1345 = c1 * c3 * c4 * c5 + c34 = c3 * c4 + + do j = jst, jend + do i = ist, iend + +c--------------------------------------------------------------------- +c form the block daigonal +c--------------------------------------------------------------------- + tmp1 = 1.0d+00 / u(1,i,j,k) + tmp2 = tmp1 * tmp1 + tmp3 = tmp1 * tmp2 + + d(1,1,i,j) = 1.0d+00 + > + dt * 2.0d+00 * ( tx1 * dx1 + > + ty1 * dy1 + > + tz1 * dz1 ) + d(1,2,i,j) = 0.0d+00 + d(1,3,i,j) = 0.0d+00 + d(1,4,i,j) = 0.0d+00 + d(1,5,i,j) = 0.0d+00 + + d(2,1,i,j) = dt * 2.0d+00 + > * ( tx1 * ( - r43 * c34 * tmp2 * u(2,i,j,k) ) + > + ty1 * ( - c34 * tmp2 * u(2,i,j,k) ) + > + tz1 * ( - c34 * tmp2 * u(2,i,j,k) ) ) + d(2,2,i,j) = 1.0d+00 + > + dt * 2.0d+00 + > * ( tx1 * r43 * c34 * tmp1 + > + ty1 * c34 * tmp1 + > + tz1 * c34 * tmp1 ) + > + dt * 2.0d+00 * ( tx1 * dx2 + > + ty1 * dy2 + > + tz1 * dz2 ) + d(2,3,i,j) = 0.0d+00 + d(2,4,i,j) = 0.0d+00 + d(2,5,i,j) = 0.0d+00 + + d(3,1,i,j) = dt * 2.0d+00 + > * ( tx1 * ( - c34 * tmp2 * u(3,i,j,k) ) + > + ty1 * ( - r43 * c34 * tmp2 * u(3,i,j,k) ) + > + tz1 * ( - c34 * tmp2 * u(3,i,j,k) ) ) + d(3,2,i,j) = 0.0d+00 + d(3,3,i,j) = 1.0d+00 + > + dt * 2.0d+00 + > * ( tx1 * c34 * tmp1 + > + ty1 * r43 * c34 * tmp1 + > + tz1 * c34 * tmp1 ) + > + dt * 2.0d+00 * ( tx1 * dx3 + > + ty1 * dy3 + > + tz1 * dz3 ) + d(3,4,i,j) = 0.0d+00 + d(3,5,i,j) = 0.0d+00 + + d(4,1,i,j) = dt * 2.0d+00 + > * ( tx1 * ( - c34 * tmp2 * u(4,i,j,k) ) + > + ty1 * ( - c34 * tmp2 * u(4,i,j,k) ) + > + tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k) ) ) + d(4,2,i,j) = 0.0d+00 + d(4,3,i,j) = 0.0d+00 + d(4,4,i,j) = 1.0d+00 + > + dt * 2.0d+00 + > * ( tx1 * c34 * tmp1 + > + ty1 * c34 * tmp1 + > + tz1 * r43 * c34 * tmp1 ) + > + dt * 2.0d+00 * ( tx1 * dx4 + > + ty1 * dy4 + > + tz1 * dz4 ) + d(4,5,i,j) = 0.0d+00 + + d(5,1,i,j) = dt * 2.0d+00 + > * ( tx1 * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 ) + > - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 ) + > - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 ) + > - ( c1345 ) * tmp2 * u(5,i,j,k) ) + > + ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 ) + > - ( r43*c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 ) + > - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 ) + > - ( c1345 ) * tmp2 * u(5,i,j,k) ) + > + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 ) + > - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 ) + > - ( r43*c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 ) + > - ( c1345 ) * tmp2 * u(5,i,j,k) ) ) + d(5,2,i,j) = dt * 2.0d+00 + > * ( tx1 * ( r43*c34 - c1345 ) * tmp2 * u(2,i,j,k) + > + ty1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k) + > + tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k) ) + d(5,3,i,j) = dt * 2.0d+00 + > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k) + > + ty1 * ( r43*c34 -c1345 ) * tmp2 * u(3,i,j,k) + > + tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k) ) + d(5,4,i,j) = dt * 2.0d+00 + > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k) + > + ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k) + > + tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k) ) + d(5,5,i,j) = 1.0d+00 + > + dt * 2.0d+00 * ( tx1 * c1345 * tmp1 + > + ty1 * c1345 * tmp1 + > + tz1 * c1345 * tmp1 ) + > + dt * 2.0d+00 * ( tx1 * dx5 + > + ty1 * dy5 + > + tz1 * dz5 ) + +c--------------------------------------------------------------------- +c form the first block sub-diagonal +c--------------------------------------------------------------------- + tmp1 = 1.0d+00 / u(1,i,j,k-1) + tmp2 = tmp1 * tmp1 + tmp3 = tmp1 * tmp2 + + a(1,1,i,j) = - dt * tz1 * dz1 + a(1,2,i,j) = 0.0d+00 + a(1,3,i,j) = 0.0d+00 + a(1,4,i,j) = - dt * tz2 + a(1,5,i,j) = 0.0d+00 + + a(2,1,i,j) = - dt * tz2 + > * ( - ( u(2,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 ) + > - dt * tz1 * ( - c34 * tmp2 * u(2,i,j,k-1) ) + a(2,2,i,j) = - dt * tz2 * ( u(4,i,j,k-1) * tmp1 ) + > - dt * tz1 * c34 * tmp1 + > - dt * tz1 * dz2 + a(2,3,i,j) = 0.0d+00 + a(2,4,i,j) = - dt * tz2 * ( u(2,i,j,k-1) * tmp1 ) + a(2,5,i,j) = 0.0d+00 + + a(3,1,i,j) = - dt * tz2 + > * ( - ( u(3,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 ) + > - dt * tz1 * ( - c34 * tmp2 * u(3,i,j,k-1) ) + a(3,2,i,j) = 0.0d+00 + a(3,3,i,j) = - dt * tz2 * ( u(4,i,j,k-1) * tmp1 ) + > - dt * tz1 * ( c34 * tmp1 ) + > - dt * tz1 * dz3 + a(3,4,i,j) = - dt * tz2 * ( u(3,i,j,k-1) * tmp1 ) + a(3,5,i,j) = 0.0d+00 + + a(4,1,i,j) = - dt * tz2 + > * ( - ( u(4,i,j,k-1) * tmp1 ) ** 2 + > + 0.50d+00 * c2 + > * ( ( u(2,i,j,k-1) * u(2,i,j,k-1) + > + u(3,i,j,k-1) * u(3,i,j,k-1) + > + u(4,i,j,k-1) * u(4,i,j,k-1) ) * tmp2 ) ) + > - dt * tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k-1) ) + a(4,2,i,j) = - dt * tz2 + > * ( - c2 * ( u(2,i,j,k-1) * tmp1 ) ) + a(4,3,i,j) = - dt * tz2 + > * ( - c2 * ( u(3,i,j,k-1) * tmp1 ) ) + a(4,4,i,j) = - dt * tz2 * ( 2.0d+00 - c2 ) + > * ( u(4,i,j,k-1) * tmp1 ) + > - dt * tz1 * ( r43 * c34 * tmp1 ) + > - dt * tz1 * dz4 + a(4,5,i,j) = - dt * tz2 * c2 + + a(5,1,i,j) = - dt * tz2 + > * ( ( c2 * ( u(2,i,j,k-1) * u(2,i,j,k-1) + > + u(3,i,j,k-1) * u(3,i,j,k-1) + > + u(4,i,j,k-1) * u(4,i,j,k-1) ) * tmp2 + > - c1 * ( u(5,i,j,k-1) * tmp1 ) ) + > * ( u(4,i,j,k-1) * tmp1 ) ) + > - dt * tz1 + > * ( - ( c34 - c1345 ) * tmp3 * (u(2,i,j,k-1)**2) + > - ( c34 - c1345 ) * tmp3 * (u(3,i,j,k-1)**2) + > - ( r43*c34 - c1345 )* tmp3 * (u(4,i,j,k-1)**2) + > - c1345 * tmp2 * u(5,i,j,k-1) ) + a(5,2,i,j) = - dt * tz2 + > * ( - c2 * ( u(2,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 ) + > - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k-1) + a(5,3,i,j) = - dt * tz2 + > * ( - c2 * ( u(3,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 ) + > - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k-1) + a(5,4,i,j) = - dt * tz2 + > * ( c1 * ( u(5,i,j,k-1) * tmp1 ) + > - 0.50d+00 * c2 + > * ( ( u(2,i,j,k-1)*u(2,i,j,k-1) + > + u(3,i,j,k-1)*u(3,i,j,k-1) + > + 3.0d+00*u(4,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 ) ) + > - dt * tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k-1) + a(5,5,i,j) = - dt * tz2 + > * ( c1 * ( u(4,i,j,k-1) * tmp1 ) ) + > - dt * tz1 * c1345 * tmp1 + > - dt * tz1 * dz5 + +c--------------------------------------------------------------------- +c form the second block sub-diagonal +c--------------------------------------------------------------------- + tmp1 = 1.0d+00 / u(1,i,j-1,k) + tmp2 = tmp1 * tmp1 + tmp3 = tmp1 * tmp2 + + b(1,1,i,j) = - dt * ty1 * dy1 + b(1,2,i,j) = 0.0d+00 + b(1,3,i,j) = - dt * ty2 + b(1,4,i,j) = 0.0d+00 + b(1,5,i,j) = 0.0d+00 + + b(2,1,i,j) = - dt * ty2 + > * ( - ( u(2,i,j-1,k)*u(3,i,j-1,k) ) * tmp2 ) + > - dt * ty1 * ( - c34 * tmp2 * u(2,i,j-1,k) ) + b(2,2,i,j) = - dt * ty2 * ( u(3,i,j-1,k) * tmp1 ) + > - dt * ty1 * ( c34 * tmp1 ) + > - dt * ty1 * dy2 + b(2,3,i,j) = - dt * ty2 * ( u(2,i,j-1,k) * tmp1 ) + b(2,4,i,j) = 0.0d+00 + b(2,5,i,j) = 0.0d+00 + + b(3,1,i,j) = - dt * ty2 + > * ( - ( u(3,i,j-1,k) * tmp1 ) ** 2 + > + 0.50d+00 * c2 * ( ( u(2,i,j-1,k) * u(2,i,j-1,k) + > + u(3,i,j-1,k) * u(3,i,j-1,k) + > + u(4,i,j-1,k) * u(4,i,j-1,k) ) + > * tmp2 ) ) + > - dt * ty1 * ( - r43 * c34 * tmp2 * u(3,i,j-1,k) ) + b(3,2,i,j) = - dt * ty2 + > * ( - c2 * ( u(2,i,j-1,k) * tmp1 ) ) + b(3,3,i,j) = - dt * ty2 * ( ( 2.0d+00 - c2 ) + > * ( u(3,i,j-1,k) * tmp1 ) ) + > - dt * ty1 * ( r43 * c34 * tmp1 ) + > - dt * ty1 * dy3 + b(3,4,i,j) = - dt * ty2 + > * ( - c2 * ( u(4,i,j-1,k) * tmp1 ) ) + b(3,5,i,j) = - dt * ty2 * c2 + + b(4,1,i,j) = - dt * ty2 + > * ( - ( u(3,i,j-1,k)*u(4,i,j-1,k) ) * tmp2 ) + > - dt * ty1 * ( - c34 * tmp2 * u(4,i,j-1,k) ) + b(4,2,i,j) = 0.0d+00 + b(4,3,i,j) = - dt * ty2 * ( u(4,i,j-1,k) * tmp1 ) + b(4,4,i,j) = - dt * ty2 * ( u(3,i,j-1,k) * tmp1 ) + > - dt * ty1 * ( c34 * tmp1 ) + > - dt * ty1 * dy4 + b(4,5,i,j) = 0.0d+00 + + b(5,1,i,j) = - dt * ty2 + > * ( ( c2 * ( u(2,i,j-1,k) * u(2,i,j-1,k) + > + u(3,i,j-1,k) * u(3,i,j-1,k) + > + u(4,i,j-1,k) * u(4,i,j-1,k) ) * tmp2 + > - c1 * ( u(5,i,j-1,k) * tmp1 ) ) + > * ( u(3,i,j-1,k) * tmp1 ) ) + > - dt * ty1 + > * ( - ( c34 - c1345 )*tmp3*(u(2,i,j-1,k)**2) + > - ( r43*c34 - c1345 )*tmp3*(u(3,i,j-1,k)**2) + > - ( c34 - c1345 )*tmp3*(u(4,i,j-1,k)**2) + > - c1345*tmp2*u(5,i,j-1,k) ) + b(5,2,i,j) = - dt * ty2 + > * ( - c2 * ( u(2,i,j-1,k)*u(3,i,j-1,k) ) * tmp2 ) + > - dt * ty1 + > * ( c34 - c1345 ) * tmp2 * u(2,i,j-1,k) + b(5,3,i,j) = - dt * ty2 + > * ( c1 * ( u(5,i,j-1,k) * tmp1 ) + > - 0.50d+00 * c2 + > * ( ( u(2,i,j-1,k)*u(2,i,j-1,k) + > + 3.0d+00 * u(3,i,j-1,k)*u(3,i,j-1,k) + > + u(4,i,j-1,k)*u(4,i,j-1,k) ) * tmp2 ) ) + > - dt * ty1 + > * ( r43*c34 - c1345 ) * tmp2 * u(3,i,j-1,k) + b(5,4,i,j) = - dt * ty2 + > * ( - c2 * ( u(3,i,j-1,k)*u(4,i,j-1,k) ) * tmp2 ) + > - dt * ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j-1,k) + b(5,5,i,j) = - dt * ty2 + > * ( c1 * ( u(3,i,j-1,k) * tmp1 ) ) + > - dt * ty1 * c1345 * tmp1 + > - dt * ty1 * dy5 + +c--------------------------------------------------------------------- +c form the third block sub-diagonal +c--------------------------------------------------------------------- + tmp1 = 1.0d+00 / u(1,i-1,j,k) + tmp2 = tmp1 * tmp1 + tmp3 = tmp1 * tmp2 + + c(1,1,i,j) = - dt * tx1 * dx1 + c(1,2,i,j) = - dt * tx2 + c(1,3,i,j) = 0.0d+00 + c(1,4,i,j) = 0.0d+00 + c(1,5,i,j) = 0.0d+00 + + c(2,1,i,j) = - dt * tx2 + > * ( - ( u(2,i-1,j,k) * tmp1 ) ** 2 + > + c2 * 0.50d+00 * ( u(2,i-1,j,k) * u(2,i-1,j,k) + > + u(3,i-1,j,k) * u(3,i-1,j,k) + > + u(4,i-1,j,k) * u(4,i-1,j,k) ) * tmp2 ) + > - dt * tx1 * ( - r43 * c34 * tmp2 * u(2,i-1,j,k) ) + c(2,2,i,j) = - dt * tx2 + > * ( ( 2.0d+00 - c2 ) * ( u(2,i-1,j,k) * tmp1 ) ) + > - dt * tx1 * ( r43 * c34 * tmp1 ) + > - dt * tx1 * dx2 + c(2,3,i,j) = - dt * tx2 + > * ( - c2 * ( u(3,i-1,j,k) * tmp1 ) ) + c(2,4,i,j) = - dt * tx2 + > * ( - c2 * ( u(4,i-1,j,k) * tmp1 ) ) + c(2,5,i,j) = - dt * tx2 * c2 + + c(3,1,i,j) = - dt * tx2 + > * ( - ( u(2,i-1,j,k) * u(3,i-1,j,k) ) * tmp2 ) + > - dt * tx1 * ( - c34 * tmp2 * u(3,i-1,j,k) ) + c(3,2,i,j) = - dt * tx2 * ( u(3,i-1,j,k) * tmp1 ) + c(3,3,i,j) = - dt * tx2 * ( u(2,i-1,j,k) * tmp1 ) + > - dt * tx1 * ( c34 * tmp1 ) + > - dt * tx1 * dx3 + c(3,4,i,j) = 0.0d+00 + c(3,5,i,j) = 0.0d+00 + + c(4,1,i,j) = - dt * tx2 + > * ( - ( u(2,i-1,j,k)*u(4,i-1,j,k) ) * tmp2 ) + > - dt * tx1 * ( - c34 * tmp2 * u(4,i-1,j,k) ) + c(4,2,i,j) = - dt * tx2 * ( u(4,i-1,j,k) * tmp1 ) + c(4,3,i,j) = 0.0d+00 + c(4,4,i,j) = - dt * tx2 * ( u(2,i-1,j,k) * tmp1 ) + > - dt * tx1 * ( c34 * tmp1 ) + > - dt * tx1 * dx4 + c(4,5,i,j) = 0.0d+00 + + c(5,1,i,j) = - dt * tx2 + > * ( ( c2 * ( u(2,i-1,j,k) * u(2,i-1,j,k) + > + u(3,i-1,j,k) * u(3,i-1,j,k) + > + u(4,i-1,j,k) * u(4,i-1,j,k) ) * tmp2 + > - c1 * ( u(5,i-1,j,k) * tmp1 ) ) + > * ( u(2,i-1,j,k) * tmp1 ) ) + > - dt * tx1 + > * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i-1,j,k)**2 ) + > - ( c34 - c1345 ) * tmp3 * ( u(3,i-1,j,k)**2 ) + > - ( c34 - c1345 ) * tmp3 * ( u(4,i-1,j,k)**2 ) + > - c1345 * tmp2 * u(5,i-1,j,k) ) + c(5,2,i,j) = - dt * tx2 + > * ( c1 * ( u(5,i-1,j,k) * tmp1 ) + > - 0.50d+00 * c2 + > * ( ( 3.0d+00*u(2,i-1,j,k)*u(2,i-1,j,k) + > + u(3,i-1,j,k)*u(3,i-1,j,k) + > + u(4,i-1,j,k)*u(4,i-1,j,k) ) * tmp2 ) ) + > - dt * tx1 + > * ( r43*c34 - c1345 ) * tmp2 * u(2,i-1,j,k) + c(5,3,i,j) = - dt * tx2 + > * ( - c2 * ( u(3,i-1,j,k)*u(2,i-1,j,k) ) * tmp2 ) + > - dt * tx1 + > * ( c34 - c1345 ) * tmp2 * u(3,i-1,j,k) + c(5,4,i,j) = - dt * tx2 + > * ( - c2 * ( u(4,i-1,j,k)*u(2,i-1,j,k) ) * tmp2 ) + > - dt * tx1 + > * ( c34 - c1345 ) * tmp2 * u(4,i-1,j,k) + c(5,5,i,j) = - dt * tx2 + > * ( c1 * ( u(2,i-1,j,k) * tmp1 ) ) + > - dt * tx1 * c1345 * tmp1 + > - dt * tx1 * dx5 + + end do + end do + + return + end diff --git a/examples/smpi/NAS/LU/jacu.f b/examples/smpi/NAS/LU/jacu.f new file mode 100644 index 0000000000..6a3c5b8ddb --- /dev/null +++ b/examples/smpi/NAS/LU/jacu.f @@ -0,0 +1,384 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine jacu(k) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c compute the upper triangular part of the jacobian matrix +c--------------------------------------------------------------------- + + + implicit none + + include 'applu.incl' + +c--------------------------------------------------------------------- +c input parameters +c--------------------------------------------------------------------- + integer k + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer i, j + double precision r43 + double precision c1345 + double precision c34 + double precision tmp1, tmp2, tmp3 + + + + r43 = ( 4.0d+00 / 3.0d+00 ) + c1345 = c1 * c3 * c4 * c5 + c34 = c3 * c4 + + do j = jst, jend + do i = ist, iend + +c--------------------------------------------------------------------- +c form the block daigonal +c--------------------------------------------------------------------- + tmp1 = 1.0d+00 / u(1,i,j,k) + tmp2 = tmp1 * tmp1 + tmp3 = tmp1 * tmp2 + + d(1,1,i,j) = 1.0d+00 + > + dt * 2.0d+00 * ( tx1 * dx1 + > + ty1 * dy1 + > + tz1 * dz1 ) + d(1,2,i,j) = 0.0d+00 + d(1,3,i,j) = 0.0d+00 + d(1,4,i,j) = 0.0d+00 + d(1,5,i,j) = 0.0d+00 + + d(2,1,i,j) = dt * 2.0d+00 + > * ( tx1 * ( - r43 * c34 * tmp2 * u(2,i,j,k) ) + > + ty1 * ( - c34 * tmp2 * u(2,i,j,k) ) + > + tz1 * ( - c34 * tmp2 * u(2,i,j,k) ) ) + d(2,2,i,j) = 1.0d+00 + > + dt * 2.0d+00 + > * ( tx1 * r43 * c34 * tmp1 + > + ty1 * c34 * tmp1 + > + tz1 * c34 * tmp1 ) + > + dt * 2.0d+00 * ( tx1 * dx2 + > + ty1 * dy2 + > + tz1 * dz2 ) + d(2,3,i,j) = 0.0d+00 + d(2,4,i,j) = 0.0d+00 + d(2,5,i,j) = 0.0d+00 + + d(3,1,i,j) = dt * 2.0d+00 + > * ( tx1 * ( - c34 * tmp2 * u(3,i,j,k) ) + > + ty1 * ( - r43 * c34 * tmp2 * u(3,i,j,k) ) + > + tz1 * ( - c34 * tmp2 * u(3,i,j,k) ) ) + d(3,2,i,j) = 0.0d+00 + d(3,3,i,j) = 1.0d+00 + > + dt * 2.0d+00 + > * ( tx1 * c34 * tmp1 + > + ty1 * r43 * c34 * tmp1 + > + tz1 * c34 * tmp1 ) + > + dt * 2.0d+00 * ( tx1 * dx3 + > + ty1 * dy3 + > + tz1 * dz3 ) + d(3,4,i,j) = 0.0d+00 + d(3,5,i,j) = 0.0d+00 + + d(4,1,i,j) = dt * 2.0d+00 + > * ( tx1 * ( - c34 * tmp2 * u(4,i,j,k) ) + > + ty1 * ( - c34 * tmp2 * u(4,i,j,k) ) + > + tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k) ) ) + d(4,2,i,j) = 0.0d+00 + d(4,3,i,j) = 0.0d+00 + d(4,4,i,j) = 1.0d+00 + > + dt * 2.0d+00 + > * ( tx1 * c34 * tmp1 + > + ty1 * c34 * tmp1 + > + tz1 * r43 * c34 * tmp1 ) + > + dt * 2.0d+00 * ( tx1 * dx4 + > + ty1 * dy4 + > + tz1 * dz4 ) + d(4,5,i,j) = 0.0d+00 + + d(5,1,i,j) = dt * 2.0d+00 + > * ( tx1 * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 ) + > - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 ) + > - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 ) + > - ( c1345 ) * tmp2 * u(5,i,j,k) ) + > + ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 ) + > - ( r43*c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 ) + > - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 ) + > - ( c1345 ) * tmp2 * u(5,i,j,k) ) + > + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 ) + > - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 ) + > - ( r43*c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 ) + > - ( c1345 ) * tmp2 * u(5,i,j,k) ) ) + d(5,2,i,j) = dt * 2.0d+00 + > * ( tx1 * ( r43*c34 - c1345 ) * tmp2 * u(2,i,j,k) + > + ty1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k) + > + tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k) ) + d(5,3,i,j) = dt * 2.0d+00 + > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k) + > + ty1 * ( r43*c34 -c1345 ) * tmp2 * u(3,i,j,k) + > + tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k) ) + d(5,4,i,j) = dt * 2.0d+00 + > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k) + > + ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k) + > + tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k) ) + d(5,5,i,j) = 1.0d+00 + > + dt * 2.0d+00 * ( tx1 * c1345 * tmp1 + > + ty1 * c1345 * tmp1 + > + tz1 * c1345 * tmp1 ) + > + dt * 2.0d+00 * ( tx1 * dx5 + > + ty1 * dy5 + > + tz1 * dz5 ) + +c--------------------------------------------------------------------- +c form the first block sub-diagonal +c--------------------------------------------------------------------- + tmp1 = 1.0d+00 / u(1,i+1,j,k) + tmp2 = tmp1 * tmp1 + tmp3 = tmp1 * tmp2 + + a(1,1,i,j) = - dt * tx1 * dx1 + a(1,2,i,j) = dt * tx2 + a(1,3,i,j) = 0.0d+00 + a(1,4,i,j) = 0.0d+00 + a(1,5,i,j) = 0.0d+00 + + a(2,1,i,j) = dt * tx2 + > * ( - ( u(2,i+1,j,k) * tmp1 ) ** 2 + > + c2 * 0.50d+00 * ( u(2,i+1,j,k) * u(2,i+1,j,k) + > + u(3,i+1,j,k) * u(3,i+1,j,k) + > + u(4,i+1,j,k) * u(4,i+1,j,k) ) * tmp2 ) + > - dt * tx1 * ( - r43 * c34 * tmp2 * u(2,i+1,j,k) ) + a(2,2,i,j) = dt * tx2 + > * ( ( 2.0d+00 - c2 ) * ( u(2,i+1,j,k) * tmp1 ) ) + > - dt * tx1 * ( r43 * c34 * tmp1 ) + > - dt * tx1 * dx2 + a(2,3,i,j) = dt * tx2 + > * ( - c2 * ( u(3,i+1,j,k) * tmp1 ) ) + a(2,4,i,j) = dt * tx2 + > * ( - c2 * ( u(4,i+1,j,k) * tmp1 ) ) + a(2,5,i,j) = dt * tx2 * c2 + + a(3,1,i,j) = dt * tx2 + > * ( - ( u(2,i+1,j,k) * u(3,i+1,j,k) ) * tmp2 ) + > - dt * tx1 * ( - c34 * tmp2 * u(3,i+1,j,k) ) + a(3,2,i,j) = dt * tx2 * ( u(3,i+1,j,k) * tmp1 ) + a(3,3,i,j) = dt * tx2 * ( u(2,i+1,j,k) * tmp1 ) + > - dt * tx1 * ( c34 * tmp1 ) + > - dt * tx1 * dx3 + a(3,4,i,j) = 0.0d+00 + a(3,5,i,j) = 0.0d+00 + + a(4,1,i,j) = dt * tx2 + > * ( - ( u(2,i+1,j,k)*u(4,i+1,j,k) ) * tmp2 ) + > - dt * tx1 * ( - c34 * tmp2 * u(4,i+1,j,k) ) + a(4,2,i,j) = dt * tx2 * ( u(4,i+1,j,k) * tmp1 ) + a(4,3,i,j) = 0.0d+00 + a(4,4,i,j) = dt * tx2 * ( u(2,i+1,j,k) * tmp1 ) + > - dt * tx1 * ( c34 * tmp1 ) + > - dt * tx1 * dx4 + a(4,5,i,j) = 0.0d+00 + + a(5,1,i,j) = dt * tx2 + > * ( ( c2 * ( u(2,i+1,j,k) * u(2,i+1,j,k) + > + u(3,i+1,j,k) * u(3,i+1,j,k) + > + u(4,i+1,j,k) * u(4,i+1,j,k) ) * tmp2 + > - c1 * ( u(5,i+1,j,k) * tmp1 ) ) + > * ( u(2,i+1,j,k) * tmp1 ) ) + > - dt * tx1 + > * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i+1,j,k)**2 ) + > - ( c34 - c1345 ) * tmp3 * ( u(3,i+1,j,k)**2 ) + > - ( c34 - c1345 ) * tmp3 * ( u(4,i+1,j,k)**2 ) + > - c1345 * tmp2 * u(5,i+1,j,k) ) + a(5,2,i,j) = dt * tx2 + > * ( c1 * ( u(5,i+1,j,k) * tmp1 ) + > - 0.50d+00 * c2 + > * ( ( 3.0d+00*u(2,i+1,j,k)*u(2,i+1,j,k) + > + u(3,i+1,j,k)*u(3,i+1,j,k) + > + u(4,i+1,j,k)*u(4,i+1,j,k) ) * tmp2 ) ) + > - dt * tx1 + > * ( r43*c34 - c1345 ) * tmp2 * u(2,i+1,j,k) + a(5,3,i,j) = dt * tx2 + > * ( - c2 * ( u(3,i+1,j,k)*u(2,i+1,j,k) ) * tmp2 ) + > - dt * tx1 + > * ( c34 - c1345 ) * tmp2 * u(3,i+1,j,k) + a(5,4,i,j) = dt * tx2 + > * ( - c2 * ( u(4,i+1,j,k)*u(2,i+1,j,k) ) * tmp2 ) + > - dt * tx1 + > * ( c34 - c1345 ) * tmp2 * u(4,i+1,j,k) + a(5,5,i,j) = dt * tx2 + > * ( c1 * ( u(2,i+1,j,k) * tmp1 ) ) + > - dt * tx1 * c1345 * tmp1 + > - dt * tx1 * dx5 + +c--------------------------------------------------------------------- +c form the second block sub-diagonal +c--------------------------------------------------------------------- + tmp1 = 1.0d+00 / u(1,i,j+1,k) + tmp2 = tmp1 * tmp1 + tmp3 = tmp1 * tmp2 + + b(1,1,i,j) = - dt * ty1 * dy1 + b(1,2,i,j) = 0.0d+00 + b(1,3,i,j) = dt * ty2 + b(1,4,i,j) = 0.0d+00 + b(1,5,i,j) = 0.0d+00 + + b(2,1,i,j) = dt * ty2 + > * ( - ( u(2,i,j+1,k)*u(3,i,j+1,k) ) * tmp2 ) + > - dt * ty1 * ( - c34 * tmp2 * u(2,i,j+1,k) ) + b(2,2,i,j) = dt * ty2 * ( u(3,i,j+1,k) * tmp1 ) + > - dt * ty1 * ( c34 * tmp1 ) + > - dt * ty1 * dy2 + b(2,3,i,j) = dt * ty2 * ( u(2,i,j+1,k) * tmp1 ) + b(2,4,i,j) = 0.0d+00 + b(2,5,i,j) = 0.0d+00 + + b(3,1,i,j) = dt * ty2 + > * ( - ( u(3,i,j+1,k) * tmp1 ) ** 2 + > + 0.50d+00 * c2 * ( ( u(2,i,j+1,k) * u(2,i,j+1,k) + > + u(3,i,j+1,k) * u(3,i,j+1,k) + > + u(4,i,j+1,k) * u(4,i,j+1,k) ) + > * tmp2 ) ) + > - dt * ty1 * ( - r43 * c34 * tmp2 * u(3,i,j+1,k) ) + b(3,2,i,j) = dt * ty2 + > * ( - c2 * ( u(2,i,j+1,k) * tmp1 ) ) + b(3,3,i,j) = dt * ty2 * ( ( 2.0d+00 - c2 ) + > * ( u(3,i,j+1,k) * tmp1 ) ) + > - dt * ty1 * ( r43 * c34 * tmp1 ) + > - dt * ty1 * dy3 + b(3,4,i,j) = dt * ty2 + > * ( - c2 * ( u(4,i,j+1,k) * tmp1 ) ) + b(3,5,i,j) = dt * ty2 * c2 + + b(4,1,i,j) = dt * ty2 + > * ( - ( u(3,i,j+1,k)*u(4,i,j+1,k) ) * tmp2 ) + > - dt * ty1 * ( - c34 * tmp2 * u(4,i,j+1,k) ) + b(4,2,i,j) = 0.0d+00 + b(4,3,i,j) = dt * ty2 * ( u(4,i,j+1,k) * tmp1 ) + b(4,4,i,j) = dt * ty2 * ( u(3,i,j+1,k) * tmp1 ) + > - dt * ty1 * ( c34 * tmp1 ) + > - dt * ty1 * dy4 + b(4,5,i,j) = 0.0d+00 + + b(5,1,i,j) = dt * ty2 + > * ( ( c2 * ( u(2,i,j+1,k) * u(2,i,j+1,k) + > + u(3,i,j+1,k) * u(3,i,j+1,k) + > + u(4,i,j+1,k) * u(4,i,j+1,k) ) * tmp2 + > - c1 * ( u(5,i,j+1,k) * tmp1 ) ) + > * ( u(3,i,j+1,k) * tmp1 ) ) + > - dt * ty1 + > * ( - ( c34 - c1345 )*tmp3*(u(2,i,j+1,k)**2) + > - ( r43*c34 - c1345 )*tmp3*(u(3,i,j+1,k)**2) + > - ( c34 - c1345 )*tmp3*(u(4,i,j+1,k)**2) + > - c1345*tmp2*u(5,i,j+1,k) ) + b(5,2,i,j) = dt * ty2 + > * ( - c2 * ( u(2,i,j+1,k)*u(3,i,j+1,k) ) * tmp2 ) + > - dt * ty1 + > * ( c34 - c1345 ) * tmp2 * u(2,i,j+1,k) + b(5,3,i,j) = dt * ty2 + > * ( c1 * ( u(5,i,j+1,k) * tmp1 ) + > - 0.50d+00 * c2 + > * ( ( u(2,i,j+1,k)*u(2,i,j+1,k) + > + 3.0d+00 * u(3,i,j+1,k)*u(3,i,j+1,k) + > + u(4,i,j+1,k)*u(4,i,j+1,k) ) * tmp2 ) ) + > - dt * ty1 + > * ( r43*c34 - c1345 ) * tmp2 * u(3,i,j+1,k) + b(5,4,i,j) = dt * ty2 + > * ( - c2 * ( u(3,i,j+1,k)*u(4,i,j+1,k) ) * tmp2 ) + > - dt * ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j+1,k) + b(5,5,i,j) = dt * ty2 + > * ( c1 * ( u(3,i,j+1,k) * tmp1 ) ) + > - dt * ty1 * c1345 * tmp1 + > - dt * ty1 * dy5 + +c--------------------------------------------------------------------- +c form the third block sub-diagonal +c--------------------------------------------------------------------- + tmp1 = 1.0d+00 / u(1,i,j,k+1) + tmp2 = tmp1 * tmp1 + tmp3 = tmp1 * tmp2 + + c(1,1,i,j) = - dt * tz1 * dz1 + c(1,2,i,j) = 0.0d+00 + c(1,3,i,j) = 0.0d+00 + c(1,4,i,j) = dt * tz2 + c(1,5,i,j) = 0.0d+00 + + c(2,1,i,j) = dt * tz2 + > * ( - ( u(2,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 ) + > - dt * tz1 * ( - c34 * tmp2 * u(2,i,j,k+1) ) + c(2,2,i,j) = dt * tz2 * ( u(4,i,j,k+1) * tmp1 ) + > - dt * tz1 * c34 * tmp1 + > - dt * tz1 * dz2 + c(2,3,i,j) = 0.0d+00 + c(2,4,i,j) = dt * tz2 * ( u(2,i,j,k+1) * tmp1 ) + c(2,5,i,j) = 0.0d+00 + + c(3,1,i,j) = dt * tz2 + > * ( - ( u(3,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 ) + > - dt * tz1 * ( - c34 * tmp2 * u(3,i,j,k+1) ) + c(3,2,i,j) = 0.0d+00 + c(3,3,i,j) = dt * tz2 * ( u(4,i,j,k+1) * tmp1 ) + > - dt * tz1 * ( c34 * tmp1 ) + > - dt * tz1 * dz3 + c(3,4,i,j) = dt * tz2 * ( u(3,i,j,k+1) * tmp1 ) + c(3,5,i,j) = 0.0d+00 + + c(4,1,i,j) = dt * tz2 + > * ( - ( u(4,i,j,k+1) * tmp1 ) ** 2 + > + 0.50d+00 * c2 + > * ( ( u(2,i,j,k+1) * u(2,i,j,k+1) + > + u(3,i,j,k+1) * u(3,i,j,k+1) + > + u(4,i,j,k+1) * u(4,i,j,k+1) ) * tmp2 ) ) + > - dt * tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k+1) ) + c(4,2,i,j) = dt * tz2 + > * ( - c2 * ( u(2,i,j,k+1) * tmp1 ) ) + c(4,3,i,j) = dt * tz2 + > * ( - c2 * ( u(3,i,j,k+1) * tmp1 ) ) + c(4,4,i,j) = dt * tz2 * ( 2.0d+00 - c2 ) + > * ( u(4,i,j,k+1) * tmp1 ) + > - dt * tz1 * ( r43 * c34 * tmp1 ) + > - dt * tz1 * dz4 + c(4,5,i,j) = dt * tz2 * c2 + + c(5,1,i,j) = dt * tz2 + > * ( ( c2 * ( u(2,i,j,k+1) * u(2,i,j,k+1) + > + u(3,i,j,k+1) * u(3,i,j,k+1) + > + u(4,i,j,k+1) * u(4,i,j,k+1) ) * tmp2 + > - c1 * ( u(5,i,j,k+1) * tmp1 ) ) + > * ( u(4,i,j,k+1) * tmp1 ) ) + > - dt * tz1 + > * ( - ( c34 - c1345 ) * tmp3 * (u(2,i,j,k+1)**2) + > - ( c34 - c1345 ) * tmp3 * (u(3,i,j,k+1)**2) + > - ( r43*c34 - c1345 )* tmp3 * (u(4,i,j,k+1)**2) + > - c1345 * tmp2 * u(5,i,j,k+1) ) + c(5,2,i,j) = dt * tz2 + > * ( - c2 * ( u(2,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 ) + > - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k+1) + c(5,3,i,j) = dt * tz2 + > * ( - c2 * ( u(3,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 ) + > - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k+1) + c(5,4,i,j) = dt * tz2 + > * ( c1 * ( u(5,i,j,k+1) * tmp1 ) + > - 0.50d+00 * c2 + > * ( ( u(2,i,j,k+1)*u(2,i,j,k+1) + > + u(3,i,j,k+1)*u(3,i,j,k+1) + > + 3.0d+00*u(4,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 ) ) + > - dt * tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k+1) + c(5,5,i,j) = dt * tz2 + > * ( c1 * ( u(4,i,j,k+1) * tmp1 ) ) + > - dt * tz1 * c1345 * tmp1 + > - dt * tz1 * dz5 + + end do + end do + + return + end diff --git a/examples/smpi/NAS/LU/l2norm.f b/examples/smpi/NAS/LU/l2norm.f new file mode 100644 index 0000000000..147b21deb4 --- /dev/null +++ b/examples/smpi/NAS/LU/l2norm.f @@ -0,0 +1,68 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + subroutine l2norm ( ldx, ldy, ldz, + > nx0, ny0, nz0, + > ist, iend, + > jst, jend, + > v, sum ) +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c to compute the l2-norm of vector v. +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + +c--------------------------------------------------------------------- +c input parameters +c--------------------------------------------------------------------- + integer ldx, ldy, ldz + integer nx0, ny0, nz0 + integer ist, iend + integer jst, jend + double precision v(5,-1:ldx+2,-1:ldy+2,*), sum(5) + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer i, j, k, m + double precision dummy(5) + + integer IERROR + + + do m = 1, 5 + dummy(m) = 0.0d+00 + end do + + do k = 2, nz0-1 + do j = jst, jend + do i = ist, iend + do m = 1, 5 + dummy(m) = dummy(m) + v(m,i,j,k) * v(m,i,j,k) + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c compute the global sum of individual contributions to dot product. +c--------------------------------------------------------------------- + call MPI_ALLREDUCE( dummy, + > sum, + > 5, + > dp_type, + > MPI_SUM, + > MPI_COMM_WORLD, + > IERROR ) + + do m = 1, 5 + sum(m) = sqrt ( sum(m) / ( (nx0-2)*(ny0-2)*(nz0-2) ) ) + end do + + return + end diff --git a/examples/smpi/NAS/LU/lu.f b/examples/smpi/NAS/LU/lu.f new file mode 100644 index 0000000000..543463a07b --- /dev/null +++ b/examples/smpi/NAS/LU/lu.f @@ -0,0 +1,164 @@ +!-------------------------------------------------------------------------! +! ! +! N A S P A R A L L E L B E N C H M A R K S 3.3 ! +! ! +! L U ! +! ! +!-------------------------------------------------------------------------! +! ! +! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. ! +! It is described in NAS Technical Reports 95-020 and 02-007 ! +! ! +! Permission to use, copy, distribute and modify this software ! +! for any purpose with or without fee is hereby granted. We ! +! request, however, that all derived work reference the NAS ! +! Parallel Benchmarks 3.3. This software is provided "as is" ! +! without express or implied warranty. ! +! ! +! Information on NPB 3.3, including the technical report, the ! +! original specifications, source code, results and information ! +! on how to submit new results, is available at: ! +! ! +! http://www.nas.nasa.gov/Software/NPB/ ! +! ! +! Send comments or suggestions to npb@nas.nasa.gov ! +! ! +! NAS Parallel Benchmarks Group ! +! NASA Ames Research Center ! +! Mail Stop: T27A-1 ! +! Moffett Field, CA 94035-1000 ! +! ! +! E-mail: npb@nas.nasa.gov ! +! Fax: (650) 604-3957 ! +! ! +!-------------------------------------------------------------------------! + +c--------------------------------------------------------------------- +c +c Authors: S. Weeratunga +c V. Venkatakrishnan +c E. Barszcz +c M. Yarrow +c +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- + program applu +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c driver for the performance evaluation of the solver for +c five coupled parabolic/elliptic partial differential equations. +c +c--------------------------------------------------------------------- + + implicit none + + include 'applu.incl' + character class + logical verified + double precision mflops + integer ierr + +c--------------------------------------------------------------------- +c initialize communications +c--------------------------------------------------------------------- + call init_comm() + +c--------------------------------------------------------------------- +c read input data +c--------------------------------------------------------------------- + call read_input() + +c--------------------------------------------------------------------- +c set up processor grid +c--------------------------------------------------------------------- + call proc_grid() + +c--------------------------------------------------------------------- +c determine the neighbors +c--------------------------------------------------------------------- + call neighbors() + +c--------------------------------------------------------------------- +c set up sub-domain sizes +c--------------------------------------------------------------------- + call subdomain() + +c--------------------------------------------------------------------- +c set up coefficients +c--------------------------------------------------------------------- + call setcoeff() + +c--------------------------------------------------------------------- +c set the masks required for comm +c--------------------------------------------------------------------- + call sethyper() + +c--------------------------------------------------------------------- +c set the boundary values for dependent variables +c--------------------------------------------------------------------- + call setbv() + +c--------------------------------------------------------------------- +c set the initial values for dependent variables +c--------------------------------------------------------------------- + call setiv() + +c--------------------------------------------------------------------- +c compute the forcing term based on prescribed exact solution +c--------------------------------------------------------------------- + call erhs() + +c--------------------------------------------------------------------- +c perform one SSOR iteration to touch all data and program pages +c--------------------------------------------------------------------- + call ssor(1) + +c--------------------------------------------------------------------- +c reset the boundary and initial values +c--------------------------------------------------------------------- + call setbv() + call setiv() + +c--------------------------------------------------------------------- +c perform the SSOR iterations +c--------------------------------------------------------------------- + call ssor(itmax) + +c--------------------------------------------------------------------- +c compute the solution error +c--------------------------------------------------------------------- + call error() + +c--------------------------------------------------------------------- +c compute the surface integral +c--------------------------------------------------------------------- + call pintgr() + +c--------------------------------------------------------------------- +c verification test +c--------------------------------------------------------------------- + IF (id.eq.0) THEN + call verify ( rsdnm, errnm, frc, class, verified ) + mflops = float(itmax)*(1984.77*float( nx0 ) + > *float( ny0 ) + > *float( nz0 ) + > -10923.3*(float( nx0+ny0+nz0 )/3.)**2 + > +27770.9* float( nx0+ny0+nz0 )/3. + > -144010.) + > / (maxtime*1000000.) + + call print_results('LU', class, nx0, + > ny0, nz0, itmax, nnodes_compiled, + > num, maxtime, mflops, ' floating point', verified, + > npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, + > '(none)') + + END IF + + call mpi_finalize(ierr) + end + + diff --git a/examples/smpi/NAS/LU/mpinpb.h b/examples/smpi/NAS/LU/mpinpb.h new file mode 100644 index 0000000000..ddbf1515f0 --- /dev/null +++ b/examples/smpi/NAS/LU/mpinpb.h @@ -0,0 +1,11 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'mpif.h' + + integer node, no_nodes, root, comm_setup, + > comm_solve, comm_rhs, dp_type + common /mpistuff/ node, no_nodes, root, comm_setup, + > comm_solve, comm_rhs, dp_type + diff --git a/examples/smpi/NAS/LU/neighbors.f b/examples/smpi/NAS/LU/neighbors.f new file mode 100644 index 0000000000..ed8a3126af --- /dev/null +++ b/examples/smpi/NAS/LU/neighbors.f @@ -0,0 +1,48 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine neighbors () + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'applu.incl' + +c--------------------------------------------------------------------- +c figure out the neighbors and their wrap numbers for each processor +c--------------------------------------------------------------------- + + south = -1 + east = -1 + north = -1 + west = -1 + + if (row.gt.1) then + north = id -1 + else + north = -1 + end if + + if (row.lt.xdim) then + south = id + 1 + else + south = -1 + end if + + if (col.gt.1) then + west = id- xdim + else + west = -1 + end if + + if (col.lt.ydim) then + east = id + xdim + else + east = -1 + end if + + return + end diff --git a/examples/smpi/NAS/LU/nodedim.f b/examples/smpi/NAS/LU/nodedim.f new file mode 100644 index 0000000000..f4def3a01e --- /dev/null +++ b/examples/smpi/NAS/LU/nodedim.f @@ -0,0 +1,36 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + integer function nodedim(num) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c compute the exponent where num = 2**nodedim +c NOTE: assumes a power-of-two number of nodes +c +c--------------------------------------------------------------------- + + implicit none + +c--------------------------------------------------------------------- +c input parameters +c--------------------------------------------------------------------- + integer num + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + double precision fnum + + + fnum = dble(num) + nodedim = log(fnum)/log(2.0d+0) + 0.00001 + + return + end + + diff --git a/examples/smpi/NAS/LU/pintgr.f b/examples/smpi/NAS/LU/pintgr.f new file mode 100644 index 0000000000..de514cccd2 --- /dev/null +++ b/examples/smpi/NAS/LU/pintgr.f @@ -0,0 +1,288 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine pintgr + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + include 'applu.incl' + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer i, j, k + integer ibeg, ifin, ifin1 + integer jbeg, jfin, jfin1 + integer iglob, iglob1, iglob2 + integer jglob, jglob1, jglob2 + integer ind1, ind2 + double precision phi1(0:isiz2+1,0:isiz3+1), + > phi2(0:isiz2+1,0:isiz3+1) + double precision frc1, frc2, frc3 + double precision dummy + + integer IERROR + + +c--------------------------------------------------------------------- +c set up the sub-domains for integeration in each processor +c--------------------------------------------------------------------- + ibeg = nx + 1 + ifin = 0 + iglob1 = ipt + 1 + iglob2 = ipt + nx + if (iglob1.ge.ii1.and.iglob2.lt.ii2+nx) ibeg = 1 + if (iglob1.gt.ii1-nx.and.iglob2.le.ii2) ifin = nx + if (ii1.ge.iglob1.and.ii1.le.iglob2) ibeg = ii1 - ipt + if (ii2.ge.iglob1.and.ii2.le.iglob2) ifin = ii2 - ipt + jbeg = ny + 1 + jfin = 0 + jglob1 = jpt + 1 + jglob2 = jpt + ny + if (jglob1.ge.ji1.and.jglob2.lt.ji2+ny) jbeg = 1 + if (jglob1.gt.ji1-ny.and.jglob2.le.ji2) jfin = ny + if (ji1.ge.jglob1.and.ji1.le.jglob2) jbeg = ji1 - jpt + if (ji2.ge.jglob1.and.ji2.le.jglob2) jfin = ji2 - jpt + ifin1 = ifin + jfin1 = jfin + if (ipt + ifin1.eq.ii2) ifin1 = ifin -1 + if (jpt + jfin1.eq.ji2) jfin1 = jfin -1 + +c--------------------------------------------------------------------- +c initialize +c--------------------------------------------------------------------- + do i = 0,isiz2+1 + do k = 0,isiz3+1 + phi1(i,k) = 0. + phi2(i,k) = 0. + end do + end do + + do j = jbeg,jfin + jglob = jpt + j + do i = ibeg,ifin + iglob = ipt + i + + k = ki1 + + phi1(i,j) = c2*( u(5,i,j,k) + > - 0.50d+00 * ( u(2,i,j,k) ** 2 + > + u(3,i,j,k) ** 2 + > + u(4,i,j,k) ** 2 ) + > / u(1,i,j,k) ) + + k = ki2 + + phi2(i,j) = c2*( u(5,i,j,k) + > - 0.50d+00 * ( u(2,i,j,k) ** 2 + > + u(3,i,j,k) ** 2 + > + u(4,i,j,k) ** 2 ) + > / u(1,i,j,k) ) + end do + end do + +c--------------------------------------------------------------------- +c communicate in i and j directions +c--------------------------------------------------------------------- + call exchange_4(phi1,phi2,ibeg,ifin1,jbeg,jfin1) + + frc1 = 0.0d+00 + + do j = jbeg,jfin1 + do i = ibeg, ifin1 + frc1 = frc1 + ( phi1(i,j) + > + phi1(i+1,j) + > + phi1(i,j+1) + > + phi1(i+1,j+1) + > + phi2(i,j) + > + phi2(i+1,j) + > + phi2(i,j+1) + > + phi2(i+1,j+1) ) + end do + end do + +c--------------------------------------------------------------------- +c compute the global sum of individual contributions to frc1 +c--------------------------------------------------------------------- + dummy = frc1 + call MPI_ALLREDUCE( dummy, + > frc1, + > 1, + > dp_type, + > MPI_SUM, + > MPI_COMM_WORLD, + > IERROR ) + + frc1 = dxi * deta * frc1 + +c--------------------------------------------------------------------- +c initialize +c--------------------------------------------------------------------- + do i = 0,isiz2+1 + do k = 0,isiz3+1 + phi1(i,k) = 0. + phi2(i,k) = 0. + end do + end do + jglob = jpt + jbeg + ind1 = 0 + if (jglob.eq.ji1) then + ind1 = 1 + do k = ki1, ki2 + do i = ibeg, ifin + iglob = ipt + i + phi1(i,k) = c2*( u(5,i,jbeg,k) + > - 0.50d+00 * ( u(2,i,jbeg,k) ** 2 + > + u(3,i,jbeg,k) ** 2 + > + u(4,i,jbeg,k) ** 2 ) + > / u(1,i,jbeg,k) ) + end do + end do + end if + + jglob = jpt + jfin + ind2 = 0 + if (jglob.eq.ji2) then + ind2 = 1 + do k = ki1, ki2 + do i = ibeg, ifin + iglob = ipt + i + phi2(i,k) = c2*( u(5,i,jfin,k) + > - 0.50d+00 * ( u(2,i,jfin,k) ** 2 + > + u(3,i,jfin,k) ** 2 + > + u(4,i,jfin,k) ** 2 ) + > / u(1,i,jfin,k) ) + end do + end do + end if + +c--------------------------------------------------------------------- +c communicate in i direction +c--------------------------------------------------------------------- + if (ind1.eq.1) then + call exchange_5(phi1,ibeg,ifin1) + end if + if (ind2.eq.1) then + call exchange_5 (phi2,ibeg,ifin1) + end if + + frc2 = 0.0d+00 + do k = ki1, ki2-1 + do i = ibeg, ifin1 + frc2 = frc2 + ( phi1(i,k) + > + phi1(i+1,k) + > + phi1(i,k+1) + > + phi1(i+1,k+1) + > + phi2(i,k) + > + phi2(i+1,k) + > + phi2(i,k+1) + > + phi2(i+1,k+1) ) + end do + end do + +c--------------------------------------------------------------------- +c compute the global sum of individual contributions to frc2 +c--------------------------------------------------------------------- + dummy = frc2 + call MPI_ALLREDUCE( dummy, + > frc2, + > 1, + > dp_type, + > MPI_SUM, + > MPI_COMM_WORLD, + > IERROR ) + + frc2 = dxi * dzeta * frc2 + +c--------------------------------------------------------------------- +c initialize +c--------------------------------------------------------------------- + do i = 0,isiz2+1 + do k = 0,isiz3+1 + phi1(i,k) = 0. + phi2(i,k) = 0. + end do + end do + iglob = ipt + ibeg + ind1 = 0 + if (iglob.eq.ii1) then + ind1 = 1 + do k = ki1, ki2 + do j = jbeg, jfin + jglob = jpt + j + phi1(j,k) = c2*( u(5,ibeg,j,k) + > - 0.50d+00 * ( u(2,ibeg,j,k) ** 2 + > + u(3,ibeg,j,k) ** 2 + > + u(4,ibeg,j,k) ** 2 ) + > / u(1,ibeg,j,k) ) + end do + end do + end if + + iglob = ipt + ifin + ind2 = 0 + if (iglob.eq.ii2) then + ind2 = 1 + do k = ki1, ki2 + do j = jbeg, jfin + jglob = jpt + j + phi2(j,k) = c2*( u(5,ifin,j,k) + > - 0.50d+00 * ( u(2,ifin,j,k) ** 2 + > + u(3,ifin,j,k) ** 2 + > + u(4,ifin,j,k) ** 2 ) + > / u(1,ifin,j,k) ) + end do + end do + end if + +c--------------------------------------------------------------------- +c communicate in j direction +c--------------------------------------------------------------------- + if (ind1.eq.1) then + call exchange_6(phi1,jbeg,jfin1) + end if + if (ind2.eq.1) then + call exchange_6(phi2,jbeg,jfin1) + end if + + frc3 = 0.0d+00 + + do k = ki1, ki2-1 + do j = jbeg, jfin1 + frc3 = frc3 + ( phi1(j,k) + > + phi1(j+1,k) + > + phi1(j,k+1) + > + phi1(j+1,k+1) + > + phi2(j,k) + > + phi2(j+1,k) + > + phi2(j,k+1) + > + phi2(j+1,k+1) ) + end do + end do + +c--------------------------------------------------------------------- +c compute the global sum of individual contributions to frc3 +c--------------------------------------------------------------------- + dummy = frc3 + call MPI_ALLREDUCE( dummy, + > frc3, + > 1, + > dp_type, + > MPI_SUM, + > MPI_COMM_WORLD, + > IERROR ) + + frc3 = deta * dzeta * frc3 + frc = 0.25d+00 * ( frc1 + frc2 + frc3 ) +c if (id.eq.0) write (*,1001) frc + + return + + 1001 format (//5x,'surface integral = ',1pe12.5//) + + end diff --git a/examples/smpi/NAS/LU/proc_grid.f b/examples/smpi/NAS/LU/proc_grid.f new file mode 100644 index 0000000000..40271c135e --- /dev/null +++ b/examples/smpi/NAS/LU/proc_grid.f @@ -0,0 +1,36 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine proc_grid + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'applu.incl' + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c set up a two-d grid for processors: column-major ordering of unknowns +c NOTE: assumes a power-of-two number of processors +c +c--------------------------------------------------------------------- + + xdim = 2**(ndim/2) + if (mod(ndim,2).eq.1) xdim = xdim + xdim + ydim = num/xdim + + row = mod(id,xdim) + 1 + col = id/xdim + 1 + + + return + end + + diff --git a/examples/smpi/NAS/LU/read_input.f b/examples/smpi/NAS/LU/read_input.f new file mode 100644 index 0000000000..b2e5ff1e79 --- /dev/null +++ b/examples/smpi/NAS/LU/read_input.f @@ -0,0 +1,127 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine read_input + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + include 'applu.incl' + + integer IERROR, fstatus, nnodes + + +c--------------------------------------------------------------------- +c only root reads the input file +c if input file does not exist, it uses defaults +c ipr = 1 for detailed progress output +c inorm = how often the norm is printed (once every inorm iterations) +c itmax = number of pseudo time steps +c dt = time step +c omega 1 over-relaxation factor for SSOR +c tolrsd = steady state residual tolerance levels +c nx, ny, nz = number of grid points in x, y, z directions +c--------------------------------------------------------------------- + ROOT = 0 + if (id .eq. ROOT) then + + write(*, 1000) + + open (unit=3,file='inputlu.data',status='old', + > access='sequential',form='formatted', iostat=fstatus) + if (fstatus .eq. 0) then + + write(*, *) 'Reading from input file inputlu.data' + + read (3,*) + read (3,*) + read (3,*) ipr, inorm + read (3,*) + read (3,*) + read (3,*) itmax + read (3,*) + read (3,*) + read (3,*) dt + read (3,*) + read (3,*) + read (3,*) omega + read (3,*) + read (3,*) + read (3,*) tolrsd(1),tolrsd(2),tolrsd(3),tolrsd(4),tolrsd(5) + read (3,*) + read (3,*) + read (3,*) nx0, ny0, nz0 + close(3) + else + ipr = ipr_default + inorm = inorm_default + itmax = itmax_default + dt = dt_default + omega = omega_default + tolrsd(1) = tolrsd1_def + tolrsd(2) = tolrsd2_def + tolrsd(3) = tolrsd3_def + tolrsd(4) = tolrsd4_def + tolrsd(5) = tolrsd5_def + nx0 = isiz01 + ny0 = isiz02 + nz0 = isiz03 + endif + +c--------------------------------------------------------------------- +c check problem size +c--------------------------------------------------------------------- + call MPI_COMM_SIZE(MPI_COMM_WORLD, nnodes, ierror) + if (nnodes .ne. nnodes_compiled) then + write (*, 2000) nnodes, nnodes_compiled + 2000 format (5x,'Warning: program is running on',i3,' processors' + > /5x,'but was compiled for ', i3) + endif + + if ( ( nx0 .lt. 4 ) .or. + > ( ny0 .lt. 4 ) .or. + > ( nz0 .lt. 4 ) ) then + + write (*,2001) + 2001 format (5x,'PROBLEM SIZE IS TOO SMALL - ', + > /5x,'SET EACH OF NX, NY AND NZ AT LEAST EQUAL TO 5') + CALL MPI_ABORT( MPI_COMM_WORLD, MPI_ERR_OTHER, IERROR ) + + end if + + if ( ( nx0 .gt. isiz01 ) .or. + > ( ny0 .gt. isiz02 ) .or. + > ( nz0 .gt. isiz03 ) ) then + + write (*,2002) + 2002 format (5x,'PROBLEM SIZE IS TOO LARGE - ', + > /5x,'NX, NY AND NZ SHOULD BE LESS THAN OR EQUAL TO ', + > /5x,'ISIZ01, ISIZ02 AND ISIZ03 RESPECTIVELY') + CALL MPI_ABORT( MPI_COMM_WORLD, MPI_ERR_OTHER, IERROR ) + + end if + + + write(*, 1001) nx0, ny0, nz0 + write(*, 1002) itmax + write(*, 1003) nnodes + + 1000 format(//, ' NAS Parallel Benchmarks 3.3 -- LU Benchmark',/) + 1001 format(' Size: ', i4, 'x', i4, 'x', i4) + 1002 format(' Iterations: ', i4) + 1003 format(' Number of processes: ', i5, /) + + + + end if + + call bcast_inputs + + return + end + + diff --git a/examples/smpi/NAS/LU/rhs.f b/examples/smpi/NAS/LU/rhs.f new file mode 100644 index 0000000000..3da39117a4 --- /dev/null +++ b/examples/smpi/NAS/LU/rhs.f @@ -0,0 +1,504 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine rhs + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c compute the right hand sides +c--------------------------------------------------------------------- + + implicit none + + include 'applu.incl' + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer i, j, k, m + integer iex + integer L1, L2 + integer ist1, iend1 + integer jst1, jend1 + double precision q + double precision u21, u31, u41 + double precision tmp + double precision u21i, u31i, u41i, u51i + double precision u21j, u31j, u41j, u51j + double precision u21k, u31k, u41k, u51k + double precision u21im1, u31im1, u41im1, u51im1 + double precision u21jm1, u31jm1, u41jm1, u51jm1 + double precision u21km1, u31km1, u41km1, u51km1 + + + + do k = 1, nz + do j = 1, ny + do i = 1, nx + do m = 1, 5 + rsd(m,i,j,k) = - frct(m,i,j,k) + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c xi-direction flux differences +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c iex = flag : iex = 0 north/south communication +c : iex = 1 east/west communication +c--------------------------------------------------------------------- + iex = 0 + +c--------------------------------------------------------------------- +c communicate and receive/send two rows of data +c--------------------------------------------------------------------- + call exchange_3(u,iex) + + L1 = 0 + if (north.eq.-1) L1 = 1 + L2 = nx + 1 + if (south.eq.-1) L2 = nx + + ist1 = 1 + iend1 = nx + if (north.eq.-1) ist1 = 4 + if (south.eq.-1) iend1 = nx - 3 + + do k = 2, nz - 1 + do j = jst, jend + do i = L1, L2 + flux(1,i,j,k) = u(2,i,j,k) + u21 = u(2,i,j,k) / u(1,i,j,k) + + q = 0.50d+00 * ( u(2,i,j,k) * u(2,i,j,k) + > + u(3,i,j,k) * u(3,i,j,k) + > + u(4,i,j,k) * u(4,i,j,k) ) + > / u(1,i,j,k) + + flux(2,i,j,k) = u(2,i,j,k) * u21 + c2 * + > ( u(5,i,j,k) - q ) + flux(3,i,j,k) = u(3,i,j,k) * u21 + flux(4,i,j,k) = u(4,i,j,k) * u21 + flux(5,i,j,k) = ( c1 * u(5,i,j,k) - c2 * q ) * u21 + end do + + do i = ist, iend + do m = 1, 5 + rsd(m,i,j,k) = rsd(m,i,j,k) + > - tx2 * ( flux(m,i+1,j,k) - flux(m,i-1,j,k) ) + end do + end do + + do i = ist, L2 + tmp = 1.0d+00 / u(1,i,j,k) + + u21i = tmp * u(2,i,j,k) + u31i = tmp * u(3,i,j,k) + u41i = tmp * u(4,i,j,k) + u51i = tmp * u(5,i,j,k) + + tmp = 1.0d+00 / u(1,i-1,j,k) + + u21im1 = tmp * u(2,i-1,j,k) + u31im1 = tmp * u(3,i-1,j,k) + u41im1 = tmp * u(4,i-1,j,k) + u51im1 = tmp * u(5,i-1,j,k) + + flux(2,i,j,k) = (4.0d+00/3.0d+00) * tx3 * (u21i-u21im1) + flux(3,i,j,k) = tx3 * ( u31i - u31im1 ) + flux(4,i,j,k) = tx3 * ( u41i - u41im1 ) + flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 ) + > * tx3 * ( ( u21i **2 + u31i **2 + u41i **2 ) + > - ( u21im1**2 + u31im1**2 + u41im1**2 ) ) + > + (1.0d+00/6.0d+00) + > * tx3 * ( u21i**2 - u21im1**2 ) + > + c1 * c5 * tx3 * ( u51i - u51im1 ) + end do + + do i = ist, iend + rsd(1,i,j,k) = rsd(1,i,j,k) + > + dx1 * tx1 * ( u(1,i-1,j,k) + > - 2.0d+00 * u(1,i,j,k) + > + u(1,i+1,j,k) ) + rsd(2,i,j,k) = rsd(2,i,j,k) + > + tx3 * c3 * c4 * ( flux(2,i+1,j,k) - flux(2,i,j,k) ) + > + dx2 * tx1 * ( u(2,i-1,j,k) + > - 2.0d+00 * u(2,i,j,k) + > + u(2,i+1,j,k) ) + rsd(3,i,j,k) = rsd(3,i,j,k) + > + tx3 * c3 * c4 * ( flux(3,i+1,j,k) - flux(3,i,j,k) ) + > + dx3 * tx1 * ( u(3,i-1,j,k) + > - 2.0d+00 * u(3,i,j,k) + > + u(3,i+1,j,k) ) + rsd(4,i,j,k) = rsd(4,i,j,k) + > + tx3 * c3 * c4 * ( flux(4,i+1,j,k) - flux(4,i,j,k) ) + > + dx4 * tx1 * ( u(4,i-1,j,k) + > - 2.0d+00 * u(4,i,j,k) + > + u(4,i+1,j,k) ) + rsd(5,i,j,k) = rsd(5,i,j,k) + > + tx3 * c3 * c4 * ( flux(5,i+1,j,k) - flux(5,i,j,k) ) + > + dx5 * tx1 * ( u(5,i-1,j,k) + > - 2.0d+00 * u(5,i,j,k) + > + u(5,i+1,j,k) ) + end do + +c--------------------------------------------------------------------- +c Fourth-order dissipation +c--------------------------------------------------------------------- + IF (north.eq.-1) then + do m = 1, 5 + rsd(m,2,j,k) = rsd(m,2,j,k) + > - dssp * ( + 5.0d+00 * u(m,2,j,k) + > - 4.0d+00 * u(m,3,j,k) + > + u(m,4,j,k) ) + rsd(m,3,j,k) = rsd(m,3,j,k) + > - dssp * ( - 4.0d+00 * u(m,2,j,k) + > + 6.0d+00 * u(m,3,j,k) + > - 4.0d+00 * u(m,4,j,k) + > + u(m,5,j,k) ) + end do + END IF + + do i = ist1,iend1 + do m = 1, 5 + rsd(m,i,j,k) = rsd(m,i,j,k) + > - dssp * ( u(m,i-2,j,k) + > - 4.0d+00 * u(m,i-1,j,k) + > + 6.0d+00 * u(m,i,j,k) + > - 4.0d+00 * u(m,i+1,j,k) + > + u(m,i+2,j,k) ) + end do + end do + + IF (south.eq.-1) then + do m = 1, 5 + rsd(m,nx-2,j,k) = rsd(m,nx-2,j,k) + > - dssp * ( u(m,nx-4,j,k) + > - 4.0d+00 * u(m,nx-3,j,k) + > + 6.0d+00 * u(m,nx-2,j,k) + > - 4.0d+00 * u(m,nx-1,j,k) ) + rsd(m,nx-1,j,k) = rsd(m,nx-1,j,k) + > - dssp * ( u(m,nx-3,j,k) + > - 4.0d+00 * u(m,nx-2,j,k) + > + 5.0d+00 * u(m,nx-1,j,k) ) + end do + END IF + + end do + end do + +c--------------------------------------------------------------------- +c eta-direction flux differences +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c iex = flag : iex = 0 north/south communication +c--------------------------------------------------------------------- + iex = 1 + +c--------------------------------------------------------------------- +c communicate and receive/send two rows of data +c--------------------------------------------------------------------- + call exchange_3(u,iex) + + L1 = 0 + if (west.eq.-1) L1 = 1 + L2 = ny + 1 + if (east.eq.-1) L2 = ny + + jst1 = 1 + jend1 = ny + if (west.eq.-1) jst1 = 4 + if (east.eq.-1) jend1 = ny - 3 + + do k = 2, nz - 1 + do j = L1, L2 + do i = ist, iend + flux(1,i,j,k) = u(3,i,j,k) + u31 = u(3,i,j,k) / u(1,i,j,k) + + q = 0.50d+00 * ( u(2,i,j,k) * u(2,i,j,k) + > + u(3,i,j,k) * u(3,i,j,k) + > + u(4,i,j,k) * u(4,i,j,k) ) + > / u(1,i,j,k) + + flux(2,i,j,k) = u(2,i,j,k) * u31 + flux(3,i,j,k) = u(3,i,j,k) * u31 + c2 * (u(5,i,j,k)-q) + flux(4,i,j,k) = u(4,i,j,k) * u31 + flux(5,i,j,k) = ( c1 * u(5,i,j,k) - c2 * q ) * u31 + end do + end do + + do j = jst, jend + do i = ist, iend + do m = 1, 5 + rsd(m,i,j,k) = rsd(m,i,j,k) + > - ty2 * ( flux(m,i,j+1,k) - flux(m,i,j-1,k) ) + end do + end do + end do + + do j = jst, L2 + do i = ist, iend + tmp = 1.0d+00 / u(1,i,j,k) + + u21j = tmp * u(2,i,j,k) + u31j = tmp * u(3,i,j,k) + u41j = tmp * u(4,i,j,k) + u51j = tmp * u(5,i,j,k) + + tmp = 1.0d+00 / u(1,i,j-1,k) + u21jm1 = tmp * u(2,i,j-1,k) + u31jm1 = tmp * u(3,i,j-1,k) + u41jm1 = tmp * u(4,i,j-1,k) + u51jm1 = tmp * u(5,i,j-1,k) + + flux(2,i,j,k) = ty3 * ( u21j - u21jm1 ) + flux(3,i,j,k) = (4.0d+00/3.0d+00) * ty3 * (u31j-u31jm1) + flux(4,i,j,k) = ty3 * ( u41j - u41jm1 ) + flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 ) + > * ty3 * ( ( u21j **2 + u31j **2 + u41j **2 ) + > - ( u21jm1**2 + u31jm1**2 + u41jm1**2 ) ) + > + (1.0d+00/6.0d+00) + > * ty3 * ( u31j**2 - u31jm1**2 ) + > + c1 * c5 * ty3 * ( u51j - u51jm1 ) + end do + end do + + do j = jst, jend + do i = ist, iend + + rsd(1,i,j,k) = rsd(1,i,j,k) + > + dy1 * ty1 * ( u(1,i,j-1,k) + > - 2.0d+00 * u(1,i,j,k) + > + u(1,i,j+1,k) ) + + rsd(2,i,j,k) = rsd(2,i,j,k) + > + ty3 * c3 * c4 * ( flux(2,i,j+1,k) - flux(2,i,j,k) ) + > + dy2 * ty1 * ( u(2,i,j-1,k) + > - 2.0d+00 * u(2,i,j,k) + > + u(2,i,j+1,k) ) + + rsd(3,i,j,k) = rsd(3,i,j,k) + > + ty3 * c3 * c4 * ( flux(3,i,j+1,k) - flux(3,i,j,k) ) + > + dy3 * ty1 * ( u(3,i,j-1,k) + > - 2.0d+00 * u(3,i,j,k) + > + u(3,i,j+1,k) ) + + rsd(4,i,j,k) = rsd(4,i,j,k) + > + ty3 * c3 * c4 * ( flux(4,i,j+1,k) - flux(4,i,j,k) ) + > + dy4 * ty1 * ( u(4,i,j-1,k) + > - 2.0d+00 * u(4,i,j,k) + > + u(4,i,j+1,k) ) + + rsd(5,i,j,k) = rsd(5,i,j,k) + > + ty3 * c3 * c4 * ( flux(5,i,j+1,k) - flux(5,i,j,k) ) + > + dy5 * ty1 * ( u(5,i,j-1,k) + > - 2.0d+00 * u(5,i,j,k) + > + u(5,i,j+1,k) ) + + end do + end do + +c--------------------------------------------------------------------- +c fourth-order dissipation +c--------------------------------------------------------------------- + IF (west.eq.-1) then + do i = ist, iend + do m = 1, 5 + rsd(m,i,2,k) = rsd(m,i,2,k) + > - dssp * ( + 5.0d+00 * u(m,i,2,k) + > - 4.0d+00 * u(m,i,3,k) + > + u(m,i,4,k) ) + rsd(m,i,3,k) = rsd(m,i,3,k) + > - dssp * ( - 4.0d+00 * u(m,i,2,k) + > + 6.0d+00 * u(m,i,3,k) + > - 4.0d+00 * u(m,i,4,k) + > + u(m,i,5,k) ) + end do + end do + END IF + + do j = jst1, jend1 + do i = ist, iend + do m = 1, 5 + rsd(m,i,j,k) = rsd(m,i,j,k) + > - dssp * ( u(m,i,j-2,k) + > - 4.0d+00 * u(m,i,j-1,k) + > + 6.0d+00 * u(m,i,j,k) + > - 4.0d+00 * u(m,i,j+1,k) + > + u(m,i,j+2,k) ) + end do + end do + end do + + IF (east.eq.-1) then + do i = ist, iend + do m = 1, 5 + rsd(m,i,ny-2,k) = rsd(m,i,ny-2,k) + > - dssp * ( u(m,i,ny-4,k) + > - 4.0d+00 * u(m,i,ny-3,k) + > + 6.0d+00 * u(m,i,ny-2,k) + > - 4.0d+00 * u(m,i,ny-1,k) ) + rsd(m,i,ny-1,k) = rsd(m,i,ny-1,k) + > - dssp * ( u(m,i,ny-3,k) + > - 4.0d+00 * u(m,i,ny-2,k) + > + 5.0d+00 * u(m,i,ny-1,k) ) + end do + end do + END IF + + end do + +c--------------------------------------------------------------------- +c zeta-direction flux differences +c--------------------------------------------------------------------- + do k = 1, nz + do j = jst, jend + do i = ist, iend + flux(1,i,j,k) = u(4,i,j,k) + u41 = u(4,i,j,k) / u(1,i,j,k) + + q = 0.50d+00 * ( u(2,i,j,k) * u(2,i,j,k) + > + u(3,i,j,k) * u(3,i,j,k) + > + u(4,i,j,k) * u(4,i,j,k) ) + > / u(1,i,j,k) + + flux(2,i,j,k) = u(2,i,j,k) * u41 + flux(3,i,j,k) = u(3,i,j,k) * u41 + flux(4,i,j,k) = u(4,i,j,k) * u41 + c2 * (u(5,i,j,k)-q) + flux(5,i,j,k) = ( c1 * u(5,i,j,k) - c2 * q ) * u41 + end do + end do + end do + + do k = 2, nz - 1 + do j = jst, jend + do i = ist, iend + do m = 1, 5 + rsd(m,i,j,k) = rsd(m,i,j,k) + > - tz2 * ( flux(m,i,j,k+1) - flux(m,i,j,k-1) ) + end do + end do + end do + end do + + do k = 2, nz + do j = jst, jend + do i = ist, iend + tmp = 1.0d+00 / u(1,i,j,k) + + u21k = tmp * u(2,i,j,k) + u31k = tmp * u(3,i,j,k) + u41k = tmp * u(4,i,j,k) + u51k = tmp * u(5,i,j,k) + + tmp = 1.0d+00 / u(1,i,j,k-1) + + u21km1 = tmp * u(2,i,j,k-1) + u31km1 = tmp * u(3,i,j,k-1) + u41km1 = tmp * u(4,i,j,k-1) + u51km1 = tmp * u(5,i,j,k-1) + + flux(2,i,j,k) = tz3 * ( u21k - u21km1 ) + flux(3,i,j,k) = tz3 * ( u31k - u31km1 ) + flux(4,i,j,k) = (4.0d+00/3.0d+00) * tz3 * (u41k-u41km1) + flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 ) + > * tz3 * ( ( u21k **2 + u31k **2 + u41k **2 ) + > - ( u21km1**2 + u31km1**2 + u41km1**2 ) ) + > + (1.0d+00/6.0d+00) + > * tz3 * ( u41k**2 - u41km1**2 ) + > + c1 * c5 * tz3 * ( u51k - u51km1 ) + end do + end do + end do + + do k = 2, nz - 1 + do j = jst, jend + do i = ist, iend + rsd(1,i,j,k) = rsd(1,i,j,k) + > + dz1 * tz1 * ( u(1,i,j,k-1) + > - 2.0d+00 * u(1,i,j,k) + > + u(1,i,j,k+1) ) + rsd(2,i,j,k) = rsd(2,i,j,k) + > + tz3 * c3 * c4 * ( flux(2,i,j,k+1) - flux(2,i,j,k) ) + > + dz2 * tz1 * ( u(2,i,j,k-1) + > - 2.0d+00 * u(2,i,j,k) + > + u(2,i,j,k+1) ) + rsd(3,i,j,k) = rsd(3,i,j,k) + > + tz3 * c3 * c4 * ( flux(3,i,j,k+1) - flux(3,i,j,k) ) + > + dz3 * tz1 * ( u(3,i,j,k-1) + > - 2.0d+00 * u(3,i,j,k) + > + u(3,i,j,k+1) ) + rsd(4,i,j,k) = rsd(4,i,j,k) + > + tz3 * c3 * c4 * ( flux(4,i,j,k+1) - flux(4,i,j,k) ) + > + dz4 * tz1 * ( u(4,i,j,k-1) + > - 2.0d+00 * u(4,i,j,k) + > + u(4,i,j,k+1) ) + rsd(5,i,j,k) = rsd(5,i,j,k) + > + tz3 * c3 * c4 * ( flux(5,i,j,k+1) - flux(5,i,j,k) ) + > + dz5 * tz1 * ( u(5,i,j,k-1) + > - 2.0d+00 * u(5,i,j,k) + > + u(5,i,j,k+1) ) + end do + end do + end do + +c--------------------------------------------------------------------- +c fourth-order dissipation +c--------------------------------------------------------------------- + do j = jst, jend + do i = ist, iend + do m = 1, 5 + rsd(m,i,j,2) = rsd(m,i,j,2) + > - dssp * ( + 5.0d+00 * u(m,i,j,2) + > - 4.0d+00 * u(m,i,j,3) + > + u(m,i,j,4) ) + rsd(m,i,j,3) = rsd(m,i,j,3) + > - dssp * ( - 4.0d+00 * u(m,i,j,2) + > + 6.0d+00 * u(m,i,j,3) + > - 4.0d+00 * u(m,i,j,4) + > + u(m,i,j,5) ) + end do + end do + end do + + do k = 4, nz - 3 + do j = jst, jend + do i = ist, iend + do m = 1, 5 + rsd(m,i,j,k) = rsd(m,i,j,k) + > - dssp * ( u(m,i,j,k-2) + > - 4.0d+00 * u(m,i,j,k-1) + > + 6.0d+00 * u(m,i,j,k) + > - 4.0d+00 * u(m,i,j,k+1) + > + u(m,i,j,k+2) ) + end do + end do + end do + end do + + do j = jst, jend + do i = ist, iend + do m = 1, 5 + rsd(m,i,j,nz-2) = rsd(m,i,j,nz-2) + > - dssp * ( u(m,i,j,nz-4) + > - 4.0d+00 * u(m,i,j,nz-3) + > + 6.0d+00 * u(m,i,j,nz-2) + > - 4.0d+00 * u(m,i,j,nz-1) ) + rsd(m,i,j,nz-1) = rsd(m,i,j,nz-1) + > - dssp * ( u(m,i,j,nz-3) + > - 4.0d+00 * u(m,i,j,nz-2) + > + 5.0d+00 * u(m,i,j,nz-1) ) + end do + end do + end do + + return + end diff --git a/examples/smpi/NAS/LU/setbv.f b/examples/smpi/NAS/LU/setbv.f new file mode 100644 index 0000000000..56b0edf967 --- /dev/null +++ b/examples/smpi/NAS/LU/setbv.f @@ -0,0 +1,79 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine setbv + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c set the boundary values of dependent variables +c--------------------------------------------------------------------- + + implicit none + + include 'applu.incl' + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer i, j, k + integer iglob, jglob + +c--------------------------------------------------------------------- +c set the dependent variable values along the top and bottom faces +c--------------------------------------------------------------------- + do j = 1, ny + jglob = jpt + j + do i = 1, nx + iglob = ipt + i + call exact( iglob, jglob, 1, u( 1, i, j, 1 ) ) + call exact( iglob, jglob, nz, u( 1, i, j, nz ) ) + end do + end do + +c--------------------------------------------------------------------- +c set the dependent variable values along north and south faces +c--------------------------------------------------------------------- + IF (west.eq.-1) then + do k = 1, nz + do i = 1, nx + iglob = ipt + i + call exact( iglob, 1, k, u( 1, i, 1, k ) ) + end do + end do + END IF + + IF (east.eq.-1) then + do k = 1, nz + do i = 1, nx + iglob = ipt + i + call exact( iglob, ny0, k, u( 1, i, ny, k ) ) + end do + end do + END IF + +c--------------------------------------------------------------------- +c set the dependent variable values along east and west faces +c--------------------------------------------------------------------- + IF (north.eq.-1) then + do k = 1, nz + do j = 1, ny + jglob = jpt + j + call exact( 1, jglob, k, u( 1, 1, j, k ) ) + end do + end do + END IF + + IF (south.eq.-1) then + do k = 1, nz + do j = 1, ny + jglob = jpt + j + call exact( nx0, jglob, k, u( 1, nx, j, k ) ) + end do + end do + END IF + + return + end diff --git a/examples/smpi/NAS/LU/setcoeff.f b/examples/smpi/NAS/LU/setcoeff.f new file mode 100644 index 0000000000..8fc5c1840c --- /dev/null +++ b/examples/smpi/NAS/LU/setcoeff.f @@ -0,0 +1,159 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine setcoeff + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'applu.incl' + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + + +c--------------------------------------------------------------------- +c set up coefficients +c--------------------------------------------------------------------- + dxi = 1.0d+00 / ( nx0 - 1 ) + deta = 1.0d+00 / ( ny0 - 1 ) + dzeta = 1.0d+00 / ( nz0 - 1 ) + + tx1 = 1.0d+00 / ( dxi * dxi ) + tx2 = 1.0d+00 / ( 2.0d+00 * dxi ) + tx3 = 1.0d+00 / dxi + + ty1 = 1.0d+00 / ( deta * deta ) + ty2 = 1.0d+00 / ( 2.0d+00 * deta ) + ty3 = 1.0d+00 / deta + + tz1 = 1.0d+00 / ( dzeta * dzeta ) + tz2 = 1.0d+00 / ( 2.0d+00 * dzeta ) + tz3 = 1.0d+00 / dzeta + + ii1 = 2 + ii2 = nx0 - 1 + ji1 = 2 + ji2 = ny0 - 2 + ki1 = 3 + ki2 = nz0 - 1 + +c--------------------------------------------------------------------- +c diffusion coefficients +c--------------------------------------------------------------------- + dx1 = 0.75d+00 + dx2 = dx1 + dx3 = dx1 + dx4 = dx1 + dx5 = dx1 + + dy1 = 0.75d+00 + dy2 = dy1 + dy3 = dy1 + dy4 = dy1 + dy5 = dy1 + + dz1 = 1.00d+00 + dz2 = dz1 + dz3 = dz1 + dz4 = dz1 + dz5 = dz1 + +c--------------------------------------------------------------------- +c fourth difference dissipation +c--------------------------------------------------------------------- + dssp = ( max (dx1, dy1, dz1 ) ) / 4.0d+00 + +c--------------------------------------------------------------------- +c coefficients of the exact solution to the first pde +c--------------------------------------------------------------------- + ce(1,1) = 2.0d+00 + ce(1,2) = 0.0d+00 + ce(1,3) = 0.0d+00 + ce(1,4) = 4.0d+00 + ce(1,5) = 5.0d+00 + ce(1,6) = 3.0d+00 + ce(1,7) = 5.0d-01 + ce(1,8) = 2.0d-02 + ce(1,9) = 1.0d-02 + ce(1,10) = 3.0d-02 + ce(1,11) = 5.0d-01 + ce(1,12) = 4.0d-01 + ce(1,13) = 3.0d-01 + +c--------------------------------------------------------------------- +c coefficients of the exact solution to the second pde +c--------------------------------------------------------------------- + ce(2,1) = 1.0d+00 + ce(2,2) = 0.0d+00 + ce(2,3) = 0.0d+00 + ce(2,4) = 0.0d+00 + ce(2,5) = 1.0d+00 + ce(2,6) = 2.0d+00 + ce(2,7) = 3.0d+00 + ce(2,8) = 1.0d-02 + ce(2,9) = 3.0d-02 + ce(2,10) = 2.0d-02 + ce(2,11) = 4.0d-01 + ce(2,12) = 3.0d-01 + ce(2,13) = 5.0d-01 + +c--------------------------------------------------------------------- +c coefficients of the exact solution to the third pde +c--------------------------------------------------------------------- + ce(3,1) = 2.0d+00 + ce(3,2) = 2.0d+00 + ce(3,3) = 0.0d+00 + ce(3,4) = 0.0d+00 + ce(3,5) = 0.0d+00 + ce(3,6) = 2.0d+00 + ce(3,7) = 3.0d+00 + ce(3,8) = 4.0d-02 + ce(3,9) = 3.0d-02 + ce(3,10) = 5.0d-02 + ce(3,11) = 3.0d-01 + ce(3,12) = 5.0d-01 + ce(3,13) = 4.0d-01 + +c--------------------------------------------------------------------- +c coefficients of the exact solution to the fourth pde +c--------------------------------------------------------------------- + ce(4,1) = 2.0d+00 + ce(4,2) = 2.0d+00 + ce(4,3) = 0.0d+00 + ce(4,4) = 0.0d+00 + ce(4,5) = 0.0d+00 + ce(4,6) = 2.0d+00 + ce(4,7) = 3.0d+00 + ce(4,8) = 3.0d-02 + ce(4,9) = 5.0d-02 + ce(4,10) = 4.0d-02 + ce(4,11) = 2.0d-01 + ce(4,12) = 1.0d-01 + ce(4,13) = 3.0d-01 + +c--------------------------------------------------------------------- +c coefficients of the exact solution to the fifth pde +c--------------------------------------------------------------------- + ce(5,1) = 5.0d+00 + ce(5,2) = 4.0d+00 + ce(5,3) = 3.0d+00 + ce(5,4) = 2.0d+00 + ce(5,5) = 1.0d-01 + ce(5,6) = 4.0d-01 + ce(5,7) = 3.0d-01 + ce(5,8) = 5.0d-02 + ce(5,9) = 4.0d-02 + ce(5,10) = 3.0d-02 + ce(5,11) = 1.0d-01 + ce(5,12) = 3.0d-01 + ce(5,13) = 2.0d-01 + + return + end + + diff --git a/examples/smpi/NAS/LU/sethyper.f b/examples/smpi/NAS/LU/sethyper.f new file mode 100644 index 0000000000..15245d4c21 --- /dev/null +++ b/examples/smpi/NAS/LU/sethyper.f @@ -0,0 +1,94 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine sethyper + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c for each column in a hyperplane, istart = first row, +c--------------------------------------------------------------------- + + implicit none + + include 'applu.incl' + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer i, j + integer iglob, jglob + integer kp + +c--------------------------------------------------------------------- +c compute the pointers for hyperplanes +c--------------------------------------------------------------------- + do kp = 2,nx0+ny0 + icomms(kp) = .false. + icommn(kp) = .false. + icomme(kp) = .false. + icommw(kp) = .false. + +c--------------------------------------------------------------------- +c check to see if comm. to south is required +c--------------------------------------------------------------------- + if (south.ne.-1) then + i = iend + iglob = ipt + i + jglob = kp - iglob + j = jglob - jpt + if (jglob.ge.2.and.jglob.le.ny0-1.and.j.ge.jst.and. + > j.le.jend) icomms(kp) = .true. + end if + +c--------------------------------------------------------------------- +c check to see if comm. to north is required +c--------------------------------------------------------------------- + if (north.ne.-1) then + i = ist + iglob = ipt + i + jglob = kp - iglob + j = jglob - jpt + if (jglob.ge.2.and.jglob.le.ny0-1.and.j.ge.jst.and. + > j.le.jend) icommn(kp) = .true. + end if + +c--------------------------------------------------------------------- +c check to see if comm. to east is required +c--------------------------------------------------------------------- + if (east.ne.-1) then + j = jend + jglob = jpt + j + iglob = kp - jglob + i = iglob - ipt + if (iglob.ge.2.and.iglob.le.nx0-1.and.i.ge.ist.and. + > i.le.iend) icomme(kp) = .true. + end if + +c--------------------------------------------------------------------- +c check to see if comm. to west is required +c--------------------------------------------------------------------- + if (west.ne.-1) then + j = jst + jglob = jpt + j + iglob = kp - jglob + i = iglob - ipt + if (iglob.ge.2.and.iglob.le.nx0-1.and.i.ge.ist.and. + > i.le.iend) icommw(kp) = .true. + end if + + end do + + icomms(1) = .false. + icommn(1) = .false. + icomme(1) = .false. + icommw(1) = .false. + icomms(nx0+ny0+1) = .false. + icommn(nx0+ny0+1) = .false. + icomme(nx0+ny0+1) = .false. + icommw(nx0+ny0+1) = .false. + + return + end diff --git a/examples/smpi/NAS/LU/setiv.f b/examples/smpi/NAS/LU/setiv.f new file mode 100644 index 0000000000..73725cbe5c --- /dev/null +++ b/examples/smpi/NAS/LU/setiv.f @@ -0,0 +1,67 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + subroutine setiv + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c set the initial values of independent variables based on tri-linear +c interpolation of boundary values in the computational space. +c +c--------------------------------------------------------------------- + + implicit none + + include 'applu.incl' + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer i, j, k, m + integer iglob, jglob + double precision xi, eta, zeta + double precision pxi, peta, pzeta + double precision ue_1jk(5),ue_nx0jk(5),ue_i1k(5), + > ue_iny0k(5),ue_ij1(5),ue_ijnz(5) + + + do k = 2, nz - 1 + zeta = ( dble (k-1) ) / (nz-1) + do j = 1, ny + jglob = jpt + j + IF (jglob.ne.1.and.jglob.ne.ny0) then + eta = ( dble (jglob-1) ) / (ny0-1) + do i = 1, nx + iglob = ipt + i + IF (iglob.ne.1.and.iglob.ne.nx0) then + xi = ( dble (iglob-1) ) / (nx0-1) + call exact (1,jglob,k,ue_1jk) + call exact (nx0,jglob,k,ue_nx0jk) + call exact (iglob,1,k,ue_i1k) + call exact (iglob,ny0,k,ue_iny0k) + call exact (iglob,jglob,1,ue_ij1) + call exact (iglob,jglob,nz,ue_ijnz) + do m = 1, 5 + pxi = ( 1.0d+00 - xi ) * ue_1jk(m) + > + xi * ue_nx0jk(m) + peta = ( 1.0d+00 - eta ) * ue_i1k(m) + > + eta * ue_iny0k(m) + pzeta = ( 1.0d+00 - zeta ) * ue_ij1(m) + > + zeta * ue_ijnz(m) + + u( m, i, j, k ) = pxi + peta + pzeta + > - pxi * peta - peta * pzeta - pzeta * pxi + > + pxi * peta * pzeta + + end do + END IF + end do + END IF + end do + end do + + return + end diff --git a/examples/smpi/NAS/LU/ssor.f b/examples/smpi/NAS/LU/ssor.f new file mode 100644 index 0000000000..cf4eed0eb7 --- /dev/null +++ b/examples/smpi/NAS/LU/ssor.f @@ -0,0 +1,241 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine ssor(niter) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c to perform pseudo-time stepping SSOR iterations +c for five nonlinear pde's. +c--------------------------------------------------------------------- + + implicit none + integer niter + + include 'mpinpb.h' + include 'applu.incl' + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer i, j, k, m + integer istep + double precision tmp + double precision delunm(5), tv(5,isiz1,isiz2) + + external timer_read + double precision wtime, timer_read + + integer IERROR + + + ROOT = 0 + +c--------------------------------------------------------------------- +c begin pseudo-time stepping iterations +c--------------------------------------------------------------------- + tmp = 1.0d+00 / ( omega * ( 2.0d+00 - omega ) ) + +c--------------------------------------------------------------------- +c initialize a,b,c,d to zero (guarantees that page tables have been +c formed, if applicable on given architecture, before timestepping). +c--------------------------------------------------------------------- + do m=1,isiz2 + do k=1,isiz1 + do j=1,5 + do i=1,5 + a(i,j,k,m) = 0.d0 + b(i,j,k,m) = 0.d0 + c(i,j,k,m) = 0.d0 + d(i,j,k,m) = 0.d0 + enddo + enddo + enddo + enddo + +c--------------------------------------------------------------------- +c compute the steady-state residuals +c--------------------------------------------------------------------- + call rhs + +c--------------------------------------------------------------------- +c compute the L2 norms of newton iteration residuals +c--------------------------------------------------------------------- + call l2norm( isiz1, isiz2, isiz3, nx0, ny0, nz0, + > ist, iend, jst, jend, + > rsd, rsdnm ) + + call MPI_BARRIER( MPI_COMM_WORLD, IERROR ) + + call timer_clear(1) + call timer_start(1) + +c--------------------------------------------------------------------- +c the timestep loop +c--------------------------------------------------------------------- + do istep = 1, niter + + if (id .eq. 0) then + if (mod ( istep, 20) .eq. 0 .or. + > istep .eq. itmax .or. + > istep .eq. 1) then + if (niter .gt. 1) write( *, 200) istep + 200 format(' Time step ', i4) + endif + endif + +c--------------------------------------------------------------------- +c perform SSOR iteration +c--------------------------------------------------------------------- + do k = 2, nz - 1 + do j = jst, jend + do i = ist, iend + do m = 1, 5 + rsd(m,i,j,k) = dt * rsd(m,i,j,k) + end do + end do + end do + end do + + DO k = 2, nz -1 +c--------------------------------------------------------------------- +c form the lower triangular part of the jacobian matrix +c--------------------------------------------------------------------- + call jacld(k) + +c--------------------------------------------------------------------- +c perform the lower triangular solution +c--------------------------------------------------------------------- + call blts( isiz1, isiz2, isiz3, + > nx, ny, nz, k, + > omega, + > rsd, + > a, b, c, d, + > ist, iend, jst, jend, + > nx0, ny0, ipt, jpt) + END DO + + DO k = nz - 1, 2, -1 +c--------------------------------------------------------------------- +c form the strictly upper triangular part of the jacobian matrix +c--------------------------------------------------------------------- + call jacu(k) + +c--------------------------------------------------------------------- +c perform the upper triangular solution +c--------------------------------------------------------------------- + call buts( isiz1, isiz2, isiz3, + > nx, ny, nz, k, + > omega, + > rsd, tv, + > d, a, b, c, + > ist, iend, jst, jend, + > nx0, ny0, ipt, jpt) + END DO + +c--------------------------------------------------------------------- +c update the variables +c--------------------------------------------------------------------- + + do k = 2, nz-1 + do j = jst, jend + do i = ist, iend + do m = 1, 5 + u( m, i, j, k ) = u( m, i, j, k ) + > + tmp * rsd( m, i, j, k ) + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c compute the max-norms of newton iteration corrections +c--------------------------------------------------------------------- + if ( mod ( istep, inorm ) .eq. 0 ) then + call l2norm( isiz1, isiz2, isiz3, nx0, ny0, nz0, + > ist, iend, jst, jend, + > rsd, delunm ) +c if ( ipr .eq. 1 .and. id .eq. 0 ) then +c write (*,1006) ( delunm(m), m = 1, 5 ) +c else if ( ipr .eq. 2 .and. id .eq. 0 ) then +c write (*,'(i5,f15.6)') istep,delunm(5) +c end if + end if + +c--------------------------------------------------------------------- +c compute the steady-state residuals +c--------------------------------------------------------------------- + call rhs + +c--------------------------------------------------------------------- +c compute the max-norms of newton iteration residuals +c--------------------------------------------------------------------- + if ( ( mod ( istep, inorm ) .eq. 0 ) .or. + > ( istep .eq. itmax ) ) then + call l2norm( isiz1, isiz2, isiz3, nx0, ny0, nz0, + > ist, iend, jst, jend, + > rsd, rsdnm ) +c if ( ipr .eq. 1.and.id.eq.0 ) then +c write (*,1007) ( rsdnm(m), m = 1, 5 ) +c end if + end if + +c--------------------------------------------------------------------- +c check the newton-iteration residuals against the tolerance levels +c--------------------------------------------------------------------- + if ( ( rsdnm(1) .lt. tolrsd(1) ) .and. + > ( rsdnm(2) .lt. tolrsd(2) ) .and. + > ( rsdnm(3) .lt. tolrsd(3) ) .and. + > ( rsdnm(4) .lt. tolrsd(4) ) .and. + > ( rsdnm(5) .lt. tolrsd(5) ) ) then +c if (ipr .eq. 1 .and. id.eq.0) then +c write (*,1004) istep +c end if + return + end if + + end do + + call timer_stop(1) + wtime = timer_read(1) + + + call MPI_ALLREDUCE( wtime, + > maxtime, + > 1, + > MPI_DOUBLE_PRECISION, + > MPI_MAX, + > MPI_COMM_WORLD, + > IERROR ) + + + + return + + 1001 format (1x/5x,'pseudo-time SSOR iteration no.=',i4/) + 1004 format (1x/1x,'convergence was achieved after ',i4, + > ' pseudo-time steps' ) + 1006 format (1x/1x,'RMS-norm of SSOR-iteration correction ', + > 'for first pde = ',1pe12.5/, + > 1x,'RMS-norm of SSOR-iteration correction ', + > 'for second pde = ',1pe12.5/, + > 1x,'RMS-norm of SSOR-iteration correction ', + > 'for third pde = ',1pe12.5/, + > 1x,'RMS-norm of SSOR-iteration correction ', + > 'for fourth pde = ',1pe12.5/, + > 1x,'RMS-norm of SSOR-iteration correction ', + > 'for fifth pde = ',1pe12.5) + 1007 format (1x/1x,'RMS-norm of steady-state residual for ', + > 'first pde = ',1pe12.5/, + > 1x,'RMS-norm of steady-state residual for ', + > 'second pde = ',1pe12.5/, + > 1x,'RMS-norm of steady-state residual for ', + > 'third pde = ',1pe12.5/, + > 1x,'RMS-norm of steady-state residual for ', + > 'fourth pde = ',1pe12.5/, + > 1x,'RMS-norm of steady-state residual for ', + > 'fifth pde = ',1pe12.5) + + end diff --git a/examples/smpi/NAS/LU/subdomain.f b/examples/smpi/NAS/LU/subdomain.f new file mode 100644 index 0000000000..388bbf4494 --- /dev/null +++ b/examples/smpi/NAS/LU/subdomain.f @@ -0,0 +1,103 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine subdomain + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + include 'applu.incl' + +c--------------------------------------------------------------------- +c local variables +c--------------------------------------------------------------------- + integer mm, ierror, errorcode + + +c--------------------------------------------------------------------- +c +c set up the sub-domain sizes +c +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c x dimension +c--------------------------------------------------------------------- + mm = mod(nx0,xdim) + if (row.le.mm) then + nx = nx0/xdim + 1 + ipt = (row-1)*nx + else + nx = nx0/xdim + ipt = (row-1)*nx + mm + end if + +c--------------------------------------------------------------------- +c y dimension +c--------------------------------------------------------------------- + mm = mod(ny0,ydim) + if (col.le.mm) then + ny = ny0/ydim + 1 + jpt = (col-1)*ny + else + ny = ny0/ydim + jpt = (col-1)*ny + mm + end if + +c--------------------------------------------------------------------- +c z dimension +c--------------------------------------------------------------------- + nz = nz0 + +c--------------------------------------------------------------------- +c check the sub-domain size +c--------------------------------------------------------------------- + if ( ( nx .lt. 4 ) .or. + > ( ny .lt. 4 ) .or. + > ( nz .lt. 4 ) ) then + write (*,2001) nx, ny, nz + 2001 format (5x,'SUBDOMAIN SIZE IS TOO SMALL - ', + > /5x,'ADJUST PROBLEM SIZE OR NUMBER OF PROCESSORS', + > /5x,'SO THAT NX, NY AND NZ ARE GREATER THAN OR EQUAL', + > /5x,'TO 4 THEY ARE CURRENTLY', 3I3) + CALL MPI_ABORT( MPI_COMM_WORLD, + > ERRORCODE, + > IERROR ) + end if + + if ( ( nx .gt. isiz1 ) .or. + > ( ny .gt. isiz2 ) .or. + > ( nz .gt. isiz3 ) ) then + write (*,2002) nx, ny, nz + 2002 format (5x,'SUBDOMAIN SIZE IS TOO LARGE - ', + > /5x,'ADJUST PROBLEM SIZE OR NUMBER OF PROCESSORS', + > /5x,'SO THAT NX, NY AND NZ ARE LESS THAN OR EQUAL TO ', + > /5x,'ISIZ1, ISIZ2 AND ISIZ3 RESPECTIVELY. THEY ARE', + > /5x,'CURRENTLY', 3I4) + CALL MPI_ABORT( MPI_COMM_WORLD, + > ERRORCODE, + > IERROR ) + end if + + +c--------------------------------------------------------------------- +c set up the start and end in i and j extents for all processors +c--------------------------------------------------------------------- + ist = 1 + iend = nx + if (north.eq.-1) ist = 2 + if (south.eq.-1) iend = nx - 1 + + jst = 1 + jend = ny + if (west.eq.-1) jst = 2 + if (east.eq.-1) jend = ny - 1 + + return + end + + diff --git a/examples/smpi/NAS/LU/verify.f b/examples/smpi/NAS/LU/verify.f new file mode 100644 index 0000000000..2572441a44 --- /dev/null +++ b/examples/smpi/NAS/LU/verify.f @@ -0,0 +1,403 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine verify(xcr, xce, xci, class, verified) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c verification routine +c--------------------------------------------------------------------- + + implicit none + include 'mpinpb.h' + include 'applu.incl' + + double precision xcr(5), xce(5), xci + double precision xcrref(5),xceref(5),xciref, + > xcrdif(5),xcedif(5),xcidif, + > epsilon, dtref + integer m + character class + logical verified + +c--------------------------------------------------------------------- +c tolerance level +c--------------------------------------------------------------------- + epsilon = 1.0d-08 + + class = 'U' + verified = .true. + + do m = 1,5 + xcrref(m) = 1.0 + xceref(m) = 1.0 + end do + xciref = 1.0 + + if ( (nx0 .eq. 12 ) .and. + > (ny0 .eq. 12 ) .and. + > (nz0 .eq. 12 ) .and. + > (itmax .eq. 50 )) then + + class = 'S' + dtref = 5.0d-1 +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual, for the (12X12X12) grid, +c after 50 time steps, with DT = 5.0d-01 +c--------------------------------------------------------------------- + xcrref(1) = 1.6196343210976702d-02 + xcrref(2) = 2.1976745164821318d-03 + xcrref(3) = 1.5179927653399185d-03 + xcrref(4) = 1.5029584435994323d-03 + xcrref(5) = 3.4264073155896461d-02 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error, for the (12X12X12) grid, +c after 50 time steps, with DT = 5.0d-01 +c--------------------------------------------------------------------- + xceref(1) = 6.4223319957960924d-04 + xceref(2) = 8.4144342047347926d-05 + xceref(3) = 5.8588269616485186d-05 + xceref(4) = 5.8474222595157350d-05 + xceref(5) = 1.3103347914111294d-03 + +c--------------------------------------------------------------------- +c Reference value of surface integral, for the (12X12X12) grid, +c after 50 time steps, with DT = 5.0d-01 +c--------------------------------------------------------------------- + xciref = 7.8418928865937083d+00 + + + elseif ( (nx0 .eq. 33) .and. + > (ny0 .eq. 33) .and. + > (nz0 .eq. 33) .and. + > (itmax . eq. 300) ) then + + class = 'W' !SPEC95fp size + dtref = 1.5d-3 +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual, for the (33x33x33) grid, +c after 300 time steps, with DT = 1.5d-3 +c--------------------------------------------------------------------- + xcrref(1) = 0.1236511638192d+02 + xcrref(2) = 0.1317228477799d+01 + xcrref(3) = 0.2550120713095d+01 + xcrref(4) = 0.2326187750252d+01 + xcrref(5) = 0.2826799444189d+02 + + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error, for the (33X33X33) grid, +c--------------------------------------------------------------------- + xceref(1) = 0.4867877144216d+00 + xceref(2) = 0.5064652880982d-01 + xceref(3) = 0.9281818101960d-01 + xceref(4) = 0.8570126542733d-01 + xceref(5) = 0.1084277417792d+01 + + +c--------------------------------------------------------------------- +c Reference value of surface integral, for the (33X33X33) grid, +c after 300 time steps, with DT = 1.5d-3 +c--------------------------------------------------------------------- + xciref = 0.1161399311023d+02 + + elseif ( (nx0 .eq. 64) .and. + > (ny0 .eq. 64) .and. + > (nz0 .eq. 64) .and. + > (itmax . eq. 250) ) then + + class = 'A' + dtref = 2.0d+0 +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual, for the (64X64X64) grid, +c after 250 time steps, with DT = 2.0d+00 +c--------------------------------------------------------------------- + xcrref(1) = 7.7902107606689367d+02 + xcrref(2) = 6.3402765259692870d+01 + xcrref(3) = 1.9499249727292479d+02 + xcrref(4) = 1.7845301160418537d+02 + xcrref(5) = 1.8384760349464247d+03 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error, for the (64X64X64) grid, +c after 250 time steps, with DT = 2.0d+00 +c--------------------------------------------------------------------- + xceref(1) = 2.9964085685471943d+01 + xceref(2) = 2.8194576365003349d+00 + xceref(3) = 7.3473412698774742d+00 + xceref(4) = 6.7139225687777051d+00 + xceref(5) = 7.0715315688392578d+01 + +c--------------------------------------------------------------------- +c Reference value of surface integral, for the (64X64X64) grid, +c after 250 time steps, with DT = 2.0d+00 +c--------------------------------------------------------------------- + xciref = 2.6030925604886277d+01 + + + elseif ( (nx0 .eq. 102) .and. + > (ny0 .eq. 102) .and. + > (nz0 .eq. 102) .and. + > (itmax . eq. 250) ) then + + class = 'B' + dtref = 2.0d+0 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual, for the (102X102X102) grid, +c after 250 time steps, with DT = 2.0d+00 +c--------------------------------------------------------------------- + xcrref(1) = 3.5532672969982736d+03 + xcrref(2) = 2.6214750795310692d+02 + xcrref(3) = 8.8333721850952190d+02 + xcrref(4) = 7.7812774739425265d+02 + xcrref(5) = 7.3087969592545314d+03 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error, for the (102X102X102) +c grid, after 250 time steps, with DT = 2.0d+00 +c--------------------------------------------------------------------- + xceref(1) = 1.1401176380212709d+02 + xceref(2) = 8.1098963655421574d+00 + xceref(3) = 2.8480597317698308d+01 + xceref(4) = 2.5905394567832939d+01 + xceref(5) = 2.6054907504857413d+02 + +c--------------------------------------------------------------------- +c Reference value of surface integral, for the (102X102X102) grid, +c after 250 time steps, with DT = 2.0d+00 +c--------------------------------------------------------------------- + xciref = 4.7887162703308227d+01 + + elseif ( (nx0 .eq. 162) .and. + > (ny0 .eq. 162) .and. + > (nz0 .eq. 162) .and. + > (itmax . eq. 250) ) then + + class = 'C' + dtref = 2.0d+0 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual, for the (162X162X162) grid, +c after 250 time steps, with DT = 2.0d+00 +c--------------------------------------------------------------------- + xcrref(1) = 1.03766980323537846d+04 + xcrref(2) = 8.92212458801008552d+02 + xcrref(3) = 2.56238814582660871d+03 + xcrref(4) = 2.19194343857831427d+03 + xcrref(5) = 1.78078057261061185d+04 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error, for the (162X162X162) +c grid, after 250 time steps, with DT = 2.0d+00 +c--------------------------------------------------------------------- + xceref(1) = 2.15986399716949279d+02 + xceref(2) = 1.55789559239863600d+01 + xceref(3) = 5.41318863077207766d+01 + xceref(4) = 4.82262643154045421d+01 + xceref(5) = 4.55902910043250358d+02 + +c--------------------------------------------------------------------- +c Reference value of surface integral, for the (162X162X162) grid, +c after 250 time steps, with DT = 2.0d+00 +c--------------------------------------------------------------------- + xciref = 6.66404553572181300d+01 + + elseif ( (nx0 .eq. 408) .and. + > (ny0 .eq. 408) .and. + > (nz0 .eq. 408) .and. + > (itmax . eq. 300) ) then + + class = 'D' + dtref = 1.0d+0 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual, for the (408X408X408) grid, +c after 300 time steps, with DT = 1.0d+00 +c--------------------------------------------------------------------- + xcrref(1) = 0.4868417937025d+05 + xcrref(2) = 0.4696371050071d+04 + xcrref(3) = 0.1218114549776d+05 + xcrref(4) = 0.1033801493461d+05 + xcrref(5) = 0.7142398413817d+05 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error, for the (408X408X408) +c grid, after 300 time steps, with DT = 1.0d+00 +c--------------------------------------------------------------------- + xceref(1) = 0.3752393004482d+03 + xceref(2) = 0.3084128893659d+02 + xceref(3) = 0.9434276905469d+02 + xceref(4) = 0.8230686681928d+02 + xceref(5) = 0.7002620636210d+03 + +c--------------------------------------------------------------------- +c Reference value of surface integral, for the (408X408X408) grid, +c after 300 time steps, with DT = 1.0d+00 +c--------------------------------------------------------------------- + xciref = 0.8334101392503d+02 + + elseif ( (nx0 .eq. 1020) .and. + > (ny0 .eq. 1020) .and. + > (nz0 .eq. 1020) .and. + > (itmax . eq. 300) ) then + + class = 'E' + dtref = 0.5d+0 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual, for the (1020X1020X1020) grid, +c after 300 time steps, with DT = 0.5d+00 +c--------------------------------------------------------------------- + xcrref(1) = 0.2099641687874d+06 + xcrref(2) = 0.2130403143165d+05 + xcrref(3) = 0.5319228789371d+05 + xcrref(4) = 0.4509761639833d+05 + xcrref(5) = 0.2932360006590d+06 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error, for the (1020X1020X1020) +c grid, after 300 time steps, with DT = 0.5d+00 +c--------------------------------------------------------------------- + xceref(1) = 0.4800572578333d+03 + xceref(2) = 0.4221993400184d+02 + xceref(3) = 0.1210851906824d+03 + xceref(4) = 0.1047888986770d+03 + xceref(5) = 0.8363028257389d+03 + +c--------------------------------------------------------------------- +c Reference value of surface integral, for the (1020X1020X1020) grid, +c after 300 time steps, with DT = 0.5d+00 +c--------------------------------------------------------------------- + xciref = 0.9512163272273d+02 + + else + verified = .FALSE. + endif + +c--------------------------------------------------------------------- +c verification test for residuals if gridsize is one of +c the defined grid sizes above (class .ne. 'U') +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c Compute the difference of solution values and the known reference values. +c--------------------------------------------------------------------- + do m = 1, 5 + + xcrdif(m) = dabs((xcr(m)-xcrref(m))/xcrref(m)) + xcedif(m) = dabs((xce(m)-xceref(m))/xceref(m)) + + enddo + xcidif = dabs((xci - xciref)/xciref) + + +c--------------------------------------------------------------------- +c Output the comparison of computed results to known cases. +c--------------------------------------------------------------------- + + if (class .ne. 'U') then + write(*, 1990) class + 1990 format(/, ' Verification being performed for class ', a) + write (*,2000) epsilon + 2000 format(' Accuracy setting for epsilon = ', E20.13) + verified = (dabs(dt-dtref) .le. epsilon) + if (.not.verified) then + class = 'U' + write (*,1000) dtref + 1000 format(' DT does not match the reference value of ', + > E15.8) + endif + else + write(*, 1995) + 1995 format(' Unknown class') + endif + + + if (class .ne. 'U') then + write (*,2001) + else + write (*, 2005) + endif + + 2001 format(' Comparison of RMS-norms of residual') + 2005 format(' RMS-norms of residual') + do m = 1, 5 + if (class .eq. 'U') then + write(*, 2015) m, xcr(m) + else if (xcrdif(m) .le. epsilon) then + write (*,2011) m,xcr(m),xcrref(m),xcrdif(m) + else + verified = .false. + write (*,2010) m,xcr(m),xcrref(m),xcrdif(m) + endif + enddo + + if (class .ne. 'U') then + write (*,2002) + else + write (*,2006) + endif + 2002 format(' Comparison of RMS-norms of solution error') + 2006 format(' RMS-norms of solution error') + + do m = 1, 5 + if (class .eq. 'U') then + write(*, 2015) m, xce(m) + else if (xcedif(m) .le. epsilon) then + write (*,2011) m,xce(m),xceref(m),xcedif(m) + else + verified = .false. + write (*,2010) m,xce(m),xceref(m),xcedif(m) + endif + enddo + + 2010 format(' FAILURE: ', i2, 2x, E20.13, E20.13, E20.13) + 2011 format(' ', i2, 2x, E20.13, E20.13, E20.13) + 2015 format(' ', i2, 2x, E20.13) + + if (class .ne. 'U') then + write (*,2025) + else + write (*,2026) + endif + 2025 format(' Comparison of surface integral') + 2026 format(' Surface integral') + + + if (class .eq. 'U') then + write(*, 2030) xci + else if (xcidif .le. epsilon) then + write(*, 2032) xci, xciref, xcidif + else + verified = .false. + write(*, 2031) xci, xciref, xcidif + endif + + 2030 format(' ', 4x, E20.13) + 2031 format(' FAILURE: ', 4x, E20.13, E20.13, E20.13) + 2032 format(' ', 4x, E20.13, E20.13, E20.13) + + + + if (class .eq. 'U') then + write(*, 2022) + write(*, 2023) + 2022 format(' No reference values provided') + 2023 format(' No verification performed') + else if (verified) then + write(*, 2020) + 2020 format(' Verification Successful') + else + write(*, 2021) + 2021 format(' Verification failed') + endif + + return + + + end diff --git a/examples/smpi/NAS/MG/Makefile b/examples/smpi/NAS/MG/Makefile new file mode 100644 index 0000000000..1554bedeea --- /dev/null +++ b/examples/smpi/NAS/MG/Makefile @@ -0,0 +1,23 @@ +SHELL=/bin/sh +BENCHMARK=mg +BENCHMARKU=MG + +include ../config/make.def + +OBJS = mg.o ${COMMON}/print_results.o \ + ${COMMON}/${RAND}.o ${COMMON}/timers.o + +include ../sys/make.common + +${PROGRAM}: config ${OBJS} + ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB} + +mg.o: mg.f globals.h mpinpb.h npbparams.h + ${FCOMPILE} mg.f + +clean: + - rm -f *.o *~ + - rm -f npbparams.h core + + + diff --git a/examples/smpi/NAS/MG/README b/examples/smpi/NAS/MG/README new file mode 100644 index 0000000000..6c03f78527 --- /dev/null +++ b/examples/smpi/NAS/MG/README @@ -0,0 +1,138 @@ +Some info about the MG benchmark +================================ + +'mg_demo' demonstrates the capabilities of a very simple multigrid +solver in computing a three dimensional potential field. This is +a simplified multigrid solver in two important respects: + + (1) it solves only a constant coefficient equation, + and that only on a uniform cubical grid, + + (2) it solves only a single equation, representing + a scalar field rather than a vector field. + +We chose it for its portability and simplicity, and expect that a +supercomputer which can run it effectively will also be able to +run more complex multigrid programs at least as well. + + Eric Barszcz Paul Frederickson + RIACS + NASA Ames Research Center NASA Ames Research Center + +======================================================================== +Running the program: (Note: also see parameter lm information in the + two sections immediately below this section) + +The program may be run with or without an input deck (called "mg.input"). +The following describes a few things about the input deck if you want to +use one. + +The four lines below are the "mg.input" file required to run a +problem of total size 256x256x256, for 4 iterations (Class "A"), +and presumes the use of 8 processors: + + 8 = top level + 256 256 256 = nx ny nz + 4 = nit + 0 0 0 0 0 0 0 0 = debug_vec + +The first line of input indicates how many levels of multi-grid +cycle will be applied to a particular subpartition. Presuming that +8 processors are solving this problem (recall that the number of +processors is specified to MPI as a run parameter, and MPI subsequently +determines this for the code via an MPI subroutine call), a 2x2x2 +processor grid is formed, and thus each partition on a processor is +of size 128x128x128. Therefore, a maximum of 8 multi-grid levels may +be used. These are of size 128,64,32,16,8,4,2,1, with the coarsest +level being a single point on a given processor. + + +Next, consider the same size problem but running on 1 processor. The +following "mg.input" file is appropriate: + + 9 = top level + 256 256 256 = nx ny nz + 4 = nit + 0 0 0 0 0 0 0 0 = debug_vec + +Since this processor must solve the full 256x256x256 problem, this +permits 9 multi-grid levels (256,128,64,32,16,8,4,2,1), resulting in +a coarsest multi-grid level of a single point on the processor + + +Next, consider the same size problem but running on 2 processors. The +following "mg.input" file is required: + + 8 = top level + 256 256 256 = nx ny nz + 4 = nit + 0 0 0 0 0 0 0 0 = debug_vec + +The algorithm for partitioning the full grid onto some power of 2 number +of processors is to start by splitting the last dimension of the grid +(z dimension) in 2: the problem is now partitioned onto 2 processors. +Next the middle dimension (y dimension) is split in 2: the problem is now +partitioned onto 4 processors. Next, first dimension (x dimension) is +split in 2: the problem is now partitioned onto 8 processors. Next, the +last dimension (z dimension) is split again in 2: the problem is now +partitioned onto 16 processors. This partitioning is repeated until all +of the power of 2 processors have been allocated. + +Thus to run the above problem on 2 processors, the grid partitioning +algorithm will allocate the two processors across the last dimension, +creating two partitions each of size 256x256x128. The coarsest level of +multi-grid must be a single point surrounded by a cubic number of grid +points. Therefore, each of the two processor partitions will contain 4 +coarsest multi-grid level points, each surrounded by a cube of grid points +of size 128x128x128, indicated by a top level of 8. + + +Next, consider the same size problem but running on 4 processors. The +following "mg.input" file is required: + + 8 = top level + 256 256 256 = nx ny nz + 4 = nit + 0 0 0 0 0 0 0 0 = debug_vec + +The partitioning algorithm will create 4 partitions, each of size +256x128x128. Each partition will contain 2 coarsest multi-grid level +points each surrounded by a cube of grid points of size 128x128x128, +indicated by a top level of 8. + + +Next, consider the same size problem but running on 16 processors. The +following "mg.input" file is required: + + 7 = top level + 256 256 256 = nx ny nz + 4 = nit + 0 0 0 0 0 0 0 0 = debug_vec + +On each node a partition of size 128x128x64 will be created. A maximum +of 7 multi-grid levels (64,32,16,8,4,2,1) may be used, resulting in each +partions containing 4 coarsest multi-grid level points, each surrounded +by a cube of grid points of size 64x64x64, indicated by a top level of 7. + + + + +Note that non-cubic problem sizes may also be considered: + +The four lines below are the "mg.input" file appropriate for running a +problem of total size 256x512x512, for 20 iterations and presumes the +use of 32 processors (note: this is NOT a class C problem): + + 8 = top level + 256 512 512 = nx ny nz + 20 = nit + 0 0 0 0 0 0 0 0 = debug_vec + +The first line of input indicates how many levels of multi-grid +cycle will be applied to a particular subpartition. Presuming that +32 processors are solving this problem, a 2x4x4 processor grid is +formed, and thus each partition on a processor is of size 128x128x128. +Therefore, a maximum of 8 multi-grid levels may be used. These are of +size 128,64,32,16,8,4,2,1, with the coarsest level being a single +point on a given processor. + diff --git a/examples/smpi/NAS/MG/globals.h b/examples/smpi/NAS/MG/globals.h new file mode 100644 index 0000000000..99573e3c20 --- /dev/null +++ b/examples/smpi/NAS/MG/globals.h @@ -0,0 +1,55 @@ +c--------------------------------------------------------------------- +c Parameter lm (declared and set in "npbparams.h") is the log-base2 of +c the edge size max for the partition on a given node, so must be changed +c either to save space (if running a small case) or made bigger for larger +c cases, for example, 512^3. Thus lm=7 means that the largest dimension +c of a partition that can be solved on a node is 2^7 = 128. lm is set +c automatically in npbparams.h +c Parameters ndim1, ndim2, ndim3 are the local problem dimensions. +c--------------------------------------------------------------------- + + include 'npbparams.h' + + integer nm ! actual dimension including ghost cells for communications + > , nv ! size of rhs array + > , nr ! size of residual array + > , nm2 ! size of communication buffer + > , maxlevel! maximum number of levels + + parameter( nm=2+2**lm, nv=(2+2**ndim1)*(2+2**ndim2)*(2+2**ndim3) ) + parameter( nm2=2*nm*nm, maxlevel=(lt_default+1) ) + parameter( nr = (8*(nv+nm**2+5*nm+14*lt_default-7*lm))/7 ) + integer maxprocs + parameter( maxprocs = 131072 ) ! this is the upper proc limit that + ! the current "nr" parameter can handle +c--------------------------------------------------------------------- + integer nbr(3,-1:1,maxlevel), msg_type(3,-1:1) + integer msg_id(3,-1:1,2),nx(maxlevel),ny(maxlevel),nz(maxlevel) + common /mg3/ nbr,msg_type,msg_id,nx,ny,nz + + character class + common /ClassType/class + + integer debug_vec(0:7) + common /my_debug/ debug_vec + + integer ir(maxlevel), m1(maxlevel), m2(maxlevel), m3(maxlevel) + integer lt, lb + common /fap/ ir,m1,m2,m3,lt,lb + + logical dead(maxlevel), give_ex(3,maxlevel), take_ex(3,maxlevel) + common /comm_ex/ dead, give_ex, take_ex + +c--------------------------------------------------------------------- +c Set at m=1024, can handle cases up to 1024^3 case +c--------------------------------------------------------------------- + integer m +c parameter( m=1037 ) + parameter( m=nm+1 ) + + double precision buff(nm2,4) + common /buffer/ buff + + + + diff --git a/examples/smpi/NAS/MG/mg.f b/examples/smpi/NAS/MG/mg.f new file mode 100644 index 0000000000..b0352ae2df --- /dev/null +++ b/examples/smpi/NAS/MG/mg.f @@ -0,0 +1,2479 @@ +!-------------------------------------------------------------------------! +! ! +! N A S P A R A L L E L B E N C H M A R K S 3.3 ! +! ! +! M G ! +! ! +!-------------------------------------------------------------------------! +! ! +! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. ! +! It is described in NAS Technical Reports 95-020 and 02-007 ! +! ! +! Permission to use, copy, distribute and modify this software ! +! for any purpose with or without fee is hereby granted. We ! +! request, however, that all derived work reference the NAS ! +! Parallel Benchmarks 3.3. This software is provided "as is" ! +! without express or implied warranty. ! +! ! +! Information on NPB 3.3, including the technical report, the ! +! original specifications, source code, results and information ! +! on how to submit new results, is available at: ! +! ! +! http://www.nas.nasa.gov/Software/NPB/ ! +! ! +! Send comments or suggestions to npb@nas.nasa.gov ! +! ! +! NAS Parallel Benchmarks Group ! +! NASA Ames Research Center ! +! Mail Stop: T27A-1 ! +! Moffett Field, CA 94035-1000 ! +! ! +! E-mail: npb@nas.nasa.gov ! +! Fax: (650) 604-3957 ! +! ! +!-------------------------------------------------------------------------! + + +c--------------------------------------------------------------------- +c +c Authors: E. Barszcz +c P. Frederickson +c A. Woo +c M. Yarrow +c R. F. Van der Wijngaart +c +c--------------------------------------------------------------------- + + +c--------------------------------------------------------------------- + program mg_mpi +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + include 'globals.h' + +c---------------------------------------------------------------------------c +c k is the current level. It is passed down through subroutine args +c and is NOT global. it is the current iteration +c---------------------------------------------------------------------------c + + integer k, it + + external timer_read + double precision t, t0, tinit, mflops, timer_read + +c---------------------------------------------------------------------------c +c These arrays are in common because they are quite large +c and probably shouldn't be allocated on the stack. They +c are always passed as subroutine args. +c---------------------------------------------------------------------------c + + double precision u(nr),v(nv),r(nr),a(0:3),c(0:3) + common /noautom/ u,v,r + + double precision rnm2, rnmu, old2, oldu, epsilon + integer n1, n2, n3, nit + double precision nn, verify_value, err + logical verified + + integer ierr,i, fstatus + integer T_bench, T_init + parameter (T_bench=1, T_init=2) + + call mpi_init(ierr) + call mpi_comm_rank(mpi_comm_world, me, ierr) + call mpi_comm_size(mpi_comm_world, nprocs, ierr) + + root = 0 + if (nprocs_compiled .gt. maxprocs) then + if (me .eq. root) write(*,20) nprocs_compiled, maxprocs + 20 format(' ERROR: compiled for ',i8,' processes'// + & ' The maximum size allowed for this benchmark is ',i6) + call mpi_abort(MPI_COMM_WORLD, ierr) + stop + endif + + if (.not. convertdouble) then + dp_type = MPI_DOUBLE_PRECISION + else + dp_type = MPI_REAL + endif + + + call timer_clear(T_bench) + call timer_clear(T_init) + + call mpi_barrier(MPI_COMM_WORLD, ierr) + + call timer_start(T_init) + + +c--------------------------------------------------------------------- +c Read in and broadcast input data +c--------------------------------------------------------------------- + + if( me .eq. root )then + write (*, 1000) + + open(unit=7,file="mg.input", status="old", iostat=fstatus) + if (fstatus .eq. 0) then + write(*,50) + 50 format(' Reading from input file mg.input') + read(7,*) lt + read(7,*) nx(lt), ny(lt), nz(lt) + read(7,*) nit + read(7,*) (debug_vec(i),i=0,7) + else + write(*,51) + 51 format(' No input file. Using compiled defaults ') + lt = lt_default + nit = nit_default + nx(lt) = nx_default + ny(lt) = ny_default + nz(lt) = nz_default + do i = 0,7 + debug_vec(i) = debug_default + end do + endif + endif + + call mpi_bcast(lt, 1, MPI_INTEGER, 0, mpi_comm_world, ierr) + call mpi_bcast(nit, 1, MPI_INTEGER, 0, mpi_comm_world, ierr) + call mpi_bcast(nx(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr) + call mpi_bcast(ny(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr) + call mpi_bcast(nz(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr) + call mpi_bcast(debug_vec(0), 8, MPI_INTEGER, 0, + > mpi_comm_world, ierr) + + if ( (nx(lt) .ne. ny(lt)) .or. (nx(lt) .ne. nz(lt)) ) then + Class = 'U' + else if( nx(lt) .eq. 32 .and. nit .eq. 4 ) then + Class = 'S' + else if( nx(lt) .eq. 128 .and. nit .eq. 4 ) then + Class = 'W' + else if( nx(lt) .eq. 256 .and. nit .eq. 4 ) then + Class = 'A' + else if( nx(lt) .eq. 256 .and. nit .eq. 20 ) then + Class = 'B' + else if( nx(lt) .eq. 512 .and. nit .eq. 20 ) then + Class = 'C' + else if( nx(lt) .eq. 1024 .and. nit .eq. 50 ) then + Class = 'D' + else if( nx(lt) .eq. 2048 .and. nit .eq. 50 ) then + Class = 'E' + else + Class = 'U' + endif + +c--------------------------------------------------------------------- +c Use these for debug info: +c--------------------------------------------------------------------- +c debug_vec(0) = 1 !=> report all norms +c debug_vec(1) = 1 !=> some setup information +c debug_vec(1) = 2 !=> more setup information +c debug_vec(2) = k => at level k or below, show result of resid +c debug_vec(3) = k => at level k or below, show result of psinv +c debug_vec(4) = k => at level k or below, show result of rprj +c debug_vec(5) = k => at level k or below, show result of interp +c debug_vec(6) = 1 => (unused) +c debug_vec(7) = 1 => (unused) +c--------------------------------------------------------------------- + a(0) = -8.0D0/3.0D0 + a(1) = 0.0D0 + a(2) = 1.0D0/6.0D0 + a(3) = 1.0D0/12.0D0 + + if(Class .eq. 'A' .or. Class .eq. 'S'.or. Class .eq.'W') then +c--------------------------------------------------------------------- +c Coefficients for the S(a) smoother +c--------------------------------------------------------------------- + c(0) = -3.0D0/8.0D0 + c(1) = +1.0D0/32.0D0 + c(2) = -1.0D0/64.0D0 + c(3) = 0.0D0 + else +c--------------------------------------------------------------------- +c Coefficients for the S(b) smoother +c--------------------------------------------------------------------- + c(0) = -3.0D0/17.0D0 + c(1) = +1.0D0/33.0D0 + c(2) = -1.0D0/61.0D0 + c(3) = 0.0D0 + endif + lb = 1 + k = lt + + call setup(n1,n2,n3,k) + call zero3(u,n1,n2,n3) + call zran3(v,n1,n2,n3,nx(lt),ny(lt),k) + + call norm2u3(v,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt)) + + if( me .eq. root )then + write (*, 1001) nx(lt),ny(lt),nz(lt), Class + write (*, 1002) nit + + 1000 format(//,' NAS Parallel Benchmarks 3.3 -- MG Benchmark', /) + 1001 format(' Size: ', i4, 'x', i4, 'x', i4, ' (class ', A, ')' ) + 1002 format(' Iterations: ', i4) + 1003 format(' Number of processes: ', i6) + if (nprocs .ne. nprocs_compiled) then + write (*, 1004) nprocs_compiled + write (*, 1005) nprocs + 1004 format(' WARNING: compiled for ', i6, ' processes ') + 1005 format(' Number of active processes: ', i6, /) + else + write (*, 1003) nprocs + endif + endif + + call resid(u,v,r,n1,n2,n3,a,k) + call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt)) + old2 = rnm2 + oldu = rnmu + +c--------------------------------------------------------------------- +c One iteration for startup +c--------------------------------------------------------------------- + call mg3P(u,v,r,a,c,n1,n2,n3,k) + call resid(u,v,r,n1,n2,n3,a,k) + call setup(n1,n2,n3,k) + call zero3(u,n1,n2,n3) + call zran3(v,n1,n2,n3,nx(lt),ny(lt),k) + + call timer_stop(T_init) + if( me .eq. root )then + tinit = timer_read(T_init) + write( *,'(/A,F15.3,A/)' ) + > ' Initialization time: ',tinit, ' seconds' + endif + + call mpi_barrier(mpi_comm_world,ierr) + + call timer_start(T_bench) + + call resid(u,v,r,n1,n2,n3,a,k) + call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt)) + old2 = rnm2 + oldu = rnmu + + do it=1,nit + if (it.eq.1 .or. it.eq.nit .or. mod(it,5).eq.0) then + if (me .eq. root) write(*,80) it + 80 format(' iter ',i4) + endif + call mg3P(u,v,r,a,c,n1,n2,n3,k) + call resid(u,v,r,n1,n2,n3,a,k) + enddo + + + call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt)) + + call timer_stop(T_bench) + + t0 = timer_read(T_bench) + + call mpi_reduce(t0,t,1,dp_type, + > mpi_max,root,mpi_comm_world,ierr) + verified = .FALSE. + verify_value = 0.0 + if( me .eq. root )then + write(*,100) + 100 format(/' Benchmark completed ') + + epsilon = 1.d-8 + if (Class .ne. 'U') then + if(Class.eq.'S') then + verify_value = 0.5307707005734d-04 + elseif(Class.eq.'W') then + verify_value = 0.6467329375339d-05 + elseif(Class.eq.'A') then + verify_value = 0.2433365309069d-05 + elseif(Class.eq.'B') then + verify_value = 0.1800564401355d-05 + elseif(Class.eq.'C') then + verify_value = 0.5706732285740d-06 + elseif(Class.eq.'D') then + verify_value = 0.1583275060440d-09 + elseif(Class.eq.'E') then + verify_value = 0.5630442584711d-10 + endif + + err = abs( rnm2 - verify_value ) / verify_value + if( err .le. epsilon ) then + verified = .TRUE. + write(*, 200) + write(*, 201) rnm2 + write(*, 202) err + 200 format(' VERIFICATION SUCCESSFUL ') + 201 format(' L2 Norm is ', E20.13) + 202 format(' Error is ', E20.13) + else + verified = .FALSE. + write(*, 300) + write(*, 301) rnm2 + write(*, 302) verify_value + 300 format(' VERIFICATION FAILED') + 301 format(' L2 Norm is ', E20.13) + 302 format(' The correct L2 Norm is ', E20.13) + endif + else + verified = .FALSE. + write (*, 400) + write (*, 401) + write (*, 201) rnm2 + 400 format(' Problem size unknown') + 401 format(' NO VERIFICATION PERFORMED') + endif + + nn = 1.0d0*nx(lt)*ny(lt)*nz(lt) + + if( t .ne. 0. ) then + mflops = 58.*1.0D-6*nit*nn / t + else + mflops = 0.0 + endif + + call print_results('MG', class, nx(lt), ny(lt), nz(lt), + > nit, nprocs_compiled, nprocs, t, + > mflops, ' floating point', + > verified, npbversion, compiletime, + > cs1, cs2, cs3, cs4, cs5, cs6, cs7) + + + endif + + + call mpi_finalize(ierr) + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine setup(n1,n2,n3,k) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + include 'globals.h' + + integer is1, is2, is3, ie1, ie2, ie3 + common /grid/ is1,is2,is3,ie1,ie2,ie3 + + integer n1,n2,n3,k + integer dx, dy, log_p, d, i, j + + integer ax, next(3),mi(3,maxlevel),mip(3,maxlevel) + integer ng(3,maxlevel) + integer idi(3), pi(3), idin(3,-1:1) + integer s, dir,ierr + + do j=-1,1,1 + do d=1,3 + msg_type(d,j) = 100*(j+2+10*d) + enddo + enddo + + ng(1,lt) = nx(lt) + ng(2,lt) = ny(lt) + ng(3,lt) = nz(lt) + do ax=1,3 + next(ax) = 1 + do k=lt-1,1,-1 + ng(ax,k) = ng(ax,k+1)/2 + enddo + enddo + 61 format(10i4) + do k=lt,1,-1 + nx(k) = ng(1,k) + ny(k) = ng(2,k) + nz(k) = ng(3,k) + enddo + + log_p = log(float(nprocs)+0.0001)/log(2.0) + dx = log_p/3 + pi(1) = 2**dx + idi(1) = mod(me,pi(1)) + + dy = (log_p-dx)/2 + pi(2) = 2**dy + idi(2) = mod((me/pi(1)),pi(2)) + + pi(3) = nprocs/(pi(1)*pi(2)) + idi(3) = me/(pi(1)*pi(2)) + + do k = lt,1,-1 + dead(k) = .false. + do ax = 1,3 + take_ex(ax,k) = .false. + give_ex(ax,k) = .false. + + mi(ax,k) = 2 + + > ((idi(ax)+1)*ng(ax,k))/pi(ax) - + > ((idi(ax)+0)*ng(ax,k))/pi(ax) + mip(ax,k) = 2 + + > ((next(ax)+idi(ax)+1)*ng(ax,k))/pi(ax) - + > ((next(ax)+idi(ax)+0)*ng(ax,k))/pi(ax) + + if(mip(ax,k).eq.2.or.mi(ax,k).eq.2)then + next(ax) = 2*next(ax) + endif + + if( k+1 .le. lt )then + if((mip(ax,k).eq.2).and.(mi(ax,k).eq.3))then + give_ex(ax,k+1) = .true. + endif + if((mip(ax,k).eq.3).and.(mi(ax,k).eq.2))then + take_ex(ax,k+1) = .true. + endif + endif + enddo + + if( mi(1,k).eq.2 .or. + > mi(2,k).eq.2 .or. + > mi(3,k).eq.2 )then + dead(k) = .true. + endif + m1(k) = mi(1,k) + m2(k) = mi(2,k) + m3(k) = mi(3,k) + + do ax=1,3 + idin(ax,+1) = mod( idi(ax) + next(ax) + pi(ax) , pi(ax) ) + idin(ax,-1) = mod( idi(ax) - next(ax) + pi(ax) , pi(ax) ) + enddo + do dir = 1,-1,-2 + nbr(1,dir,k) = idin(1,dir) + pi(1) + > *(idi(2) + pi(2) + > * idi(3)) + nbr(2,dir,k) = idi(1) + pi(1) + > *(idin(2,dir) + pi(2) + > * idi(3)) + nbr(3,dir,k) = idi(1) + pi(1) + > *(idi(2) + pi(2) + > * idin(3,dir)) + enddo + enddo + + k = lt + is1 = 2 + ng(1,k) - ((pi(1) -idi(1))*ng(1,lt))/pi(1) + ie1 = 1 + ng(1,k) - ((pi(1)-1-idi(1))*ng(1,lt))/pi(1) + n1 = 3 + ie1 - is1 + is2 = 2 + ng(2,k) - ((pi(2) -idi(2))*ng(2,lt))/pi(2) + ie2 = 1 + ng(2,k) - ((pi(2)-1-idi(2))*ng(2,lt))/pi(2) + n2 = 3 + ie2 - is2 + is3 = 2 + ng(3,k) - ((pi(3) -idi(3))*ng(3,lt))/pi(3) + ie3 = 1 + ng(3,k) - ((pi(3)-1-idi(3))*ng(3,lt))/pi(3) + n3 = 3 + ie3 - is3 + + + ir(lt)=1 + do j = lt-1, 1, -1 + ir(j)=ir(j+1)+m1(j+1)*m2(j+1)*m3(j+1) + enddo + + + if( debug_vec(1) .ge. 1 )then + if( me .eq. root )write(*,*)' in setup, ' + if( me .eq. root )write(*,*)' me k lt nx ny nz ', + > ' n1 n2 n3 is1 is2 is3 ie1 ie2 ie3' + do i=0,nprocs-1 + if( me .eq. i )then + write(*,9) me,k,lt,ng(1,k),ng(2,k),ng(3,k), + > n1,n2,n3,is1,is2,is3,ie1,ie2,ie3 + 9 format(15i4) + endif + call mpi_barrier(mpi_comm_world,ierr) + enddo + endif + if( debug_vec(1) .ge. 2 )then + do i=0,nprocs-1 + if( me .eq. i )then + write(*,*)' ' + write(*,*)' processor =',me + do k=lt,1,-1 + write(*,7)k,idi(1),idi(2),idi(3), + > ((nbr(d,j,k),j=-1,1,2),d=1,3), + > (mi(d,k),d=1,3) + enddo + 7 format(i4,'idi=',3i4,'nbr=',3(2i4,' '),'mi=',3i4,' ') + write(*,*)'idi(s) = ',(idi(s),s=1,3) + write(*,*)'dead(2), dead(1) = ',dead(2),dead(1) + do ax=1,3 + write(*,*)'give_ex(ax,2)= ',give_ex(ax,2) + write(*,*)'take_ex(ax,2)= ',take_ex(ax,2) + enddo + endif + call mpi_barrier(mpi_comm_world,ierr) + enddo + endif + + k = lt + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine mg3P(u,v,r,a,c,n1,n2,n3,k) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c multigrid V-cycle routine +c--------------------------------------------------------------------- + implicit none + + include 'mpinpb.h' + include 'globals.h' + + integer n1, n2, n3, k + double precision u(nr),v(nv),r(nr) + double precision a(0:3),c(0:3) + + integer j + +c--------------------------------------------------------------------- +c down cycle. +c restrict the residual from the find grid to the coarse +c--------------------------------------------------------------------- + + do k= lt, lb+1 , -1 + j = k-1 + call rprj3(r(ir(k)),m1(k),m2(k),m3(k), + > r(ir(j)),m1(j),m2(j),m3(j),k) + enddo + + k = lb +c--------------------------------------------------------------------- +c compute an approximate solution on the coarsest grid +c--------------------------------------------------------------------- + call zero3(u(ir(k)),m1(k),m2(k),m3(k)) + call psinv(r(ir(k)),u(ir(k)),m1(k),m2(k),m3(k),c,k) + + do k = lb+1, lt-1 + j = k-1 +c--------------------------------------------------------------------- +c prolongate from level k-1 to k +c--------------------------------------------------------------------- + call zero3(u(ir(k)),m1(k),m2(k),m3(k)) + call interp(u(ir(j)),m1(j),m2(j),m3(j), + > u(ir(k)),m1(k),m2(k),m3(k),k) +c--------------------------------------------------------------------- +c compute residual for level k +c--------------------------------------------------------------------- + call resid(u(ir(k)),r(ir(k)),r(ir(k)),m1(k),m2(k),m3(k),a,k) +c--------------------------------------------------------------------- +c apply smoother +c--------------------------------------------------------------------- + call psinv(r(ir(k)),u(ir(k)),m1(k),m2(k),m3(k),c,k) + enddo + 200 continue + j = lt - 1 + k = lt + call interp(u(ir(j)),m1(j),m2(j),m3(j),u,n1,n2,n3,k) + call resid(u,v,r,n1,n2,n3,a,k) + call psinv(r,u,n1,n2,n3,c,k) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine psinv( r,u,n1,n2,n3,c,k) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c psinv applies an approximate inverse as smoother: u = u + Cr +c +c This implementation costs 15A + 4M per result, where +c A and M denote the costs of Addition and Multiplication. +c Presuming coefficient c(3) is zero (the NPB assumes this, +c but it is thus not a general case), 2A + 1M may be eliminated, +c resulting in 13A + 3M. +c Note that this vectorizes, and is also fine for cache +c based machines. +c--------------------------------------------------------------------- + implicit none + + include 'globals.h' + + integer n1,n2,n3,k + double precision u(n1,n2,n3),r(n1,n2,n3),c(0:3) + integer i3, i2, i1 + + double precision r1(m), r2(m) + + do i3=2,n3-1 + do i2=2,n2-1 + do i1=1,n1 + r1(i1) = r(i1,i2-1,i3) + r(i1,i2+1,i3) + > + r(i1,i2,i3-1) + r(i1,i2,i3+1) + r2(i1) = r(i1,i2-1,i3-1) + r(i1,i2+1,i3-1) + > + r(i1,i2-1,i3+1) + r(i1,i2+1,i3+1) + enddo + do i1=2,n1-1 + u(i1,i2,i3) = u(i1,i2,i3) + > + c(0) * r(i1,i2,i3) + > + c(1) * ( r(i1-1,i2,i3) + r(i1+1,i2,i3) + > + r1(i1) ) + > + c(2) * ( r2(i1) + r1(i1-1) + r1(i1+1) ) +c--------------------------------------------------------------------- +c Assume c(3) = 0 (Enable line below if c(3) not= 0) +c--------------------------------------------------------------------- +c > + c(3) * ( r2(i1-1) + r2(i1+1) ) +c--------------------------------------------------------------------- + enddo + enddo + enddo + +c--------------------------------------------------------------------- +c exchange boundary points +c--------------------------------------------------------------------- + call comm3(u,n1,n2,n3,k) + + if( debug_vec(0) .ge. 1 )then + call rep_nrm(u,n1,n2,n3,' psinv',k) + endif + + if( debug_vec(3) .ge. k )then + call showall(u,n1,n2,n3) + endif + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine resid( u,v,r,n1,n2,n3,a,k ) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c resid computes the residual: r = v - Au +c +c This implementation costs 15A + 4M per result, where +c A and M denote the costs of Addition (or Subtraction) and +c Multiplication, respectively. +c Presuming coefficient a(1) is zero (the NPB assumes this, +c but it is thus not a general case), 3A + 1M may be eliminated, +c resulting in 12A + 3M. +c Note that this vectorizes, and is also fine for cache +c based machines. +c--------------------------------------------------------------------- + implicit none + + include 'globals.h' + + integer n1,n2,n3,k + double precision u(n1,n2,n3),v(n1,n2,n3),r(n1,n2,n3),a(0:3) + integer i3, i2, i1 + double precision u1(m), u2(m) + + do i3=2,n3-1 + do i2=2,n2-1 + do i1=1,n1 + u1(i1) = u(i1,i2-1,i3) + u(i1,i2+1,i3) + > + u(i1,i2,i3-1) + u(i1,i2,i3+1) + u2(i1) = u(i1,i2-1,i3-1) + u(i1,i2+1,i3-1) + > + u(i1,i2-1,i3+1) + u(i1,i2+1,i3+1) + enddo + do i1=2,n1-1 + r(i1,i2,i3) = v(i1,i2,i3) + > - a(0) * u(i1,i2,i3) +c--------------------------------------------------------------------- +c Assume a(1) = 0 (Enable 2 lines below if a(1) not= 0) +c--------------------------------------------------------------------- +c > - a(1) * ( u(i1-1,i2,i3) + u(i1+1,i2,i3) +c > + u1(i1) ) +c--------------------------------------------------------------------- + > - a(2) * ( u2(i1) + u1(i1-1) + u1(i1+1) ) + > - a(3) * ( u2(i1-1) + u2(i1+1) ) + enddo + enddo + enddo + +c--------------------------------------------------------------------- +c exchange boundary data +c--------------------------------------------------------------------- + call comm3(r,n1,n2,n3,k) + + if( debug_vec(0) .ge. 1 )then + call rep_nrm(r,n1,n2,n3,' resid',k) + endif + + if( debug_vec(2) .ge. k )then + call showall(r,n1,n2,n3) + endif + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine rprj3( r,m1k,m2k,m3k,s,m1j,m2j,m3j,k ) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c rprj3 projects onto the next coarser grid, +c using a trilinear Finite Element projection: s = r' = P r +c +c This implementation costs 20A + 4M per result, where +c A and M denote the costs of Addition and Multiplication. +c Note that this vectorizes, and is also fine for cache +c based machines. +c--------------------------------------------------------------------- + implicit none + + include 'mpinpb.h' + include 'globals.h' + + integer m1k, m2k, m3k, m1j, m2j, m3j,k + double precision r(m1k,m2k,m3k), s(m1j,m2j,m3j) + integer j3, j2, j1, i3, i2, i1, d1, d2, d3, j + + double precision x1(m), y1(m), x2,y2 + + + if(m1k.eq.3)then + d1 = 2 + else + d1 = 1 + endif + + if(m2k.eq.3)then + d2 = 2 + else + d2 = 1 + endif + + if(m3k.eq.3)then + d3 = 2 + else + d3 = 1 + endif + + do j3=2,m3j-1 + i3 = 2*j3-d3 +C i3 = 2*j3-1 + do j2=2,m2j-1 + i2 = 2*j2-d2 +C i2 = 2*j2-1 + + do j1=2,m1j + i1 = 2*j1-d1 +C i1 = 2*j1-1 + x1(i1-1) = r(i1-1,i2-1,i3 ) + r(i1-1,i2+1,i3 ) + > + r(i1-1,i2, i3-1) + r(i1-1,i2, i3+1) + y1(i1-1) = r(i1-1,i2-1,i3-1) + r(i1-1,i2-1,i3+1) + > + r(i1-1,i2+1,i3-1) + r(i1-1,i2+1,i3+1) + enddo + + do j1=2,m1j-1 + i1 = 2*j1-d1 +C i1 = 2*j1-1 + y2 = r(i1, i2-1,i3-1) + r(i1, i2-1,i3+1) + > + r(i1, i2+1,i3-1) + r(i1, i2+1,i3+1) + x2 = r(i1, i2-1,i3 ) + r(i1, i2+1,i3 ) + > + r(i1, i2, i3-1) + r(i1, i2, i3+1) + s(j1,j2,j3) = + > 0.5D0 * r(i1,i2,i3) + > + 0.25D0 * ( r(i1-1,i2,i3) + r(i1+1,i2,i3) + x2) + > + 0.125D0 * ( x1(i1-1) + x1(i1+1) + y2) + > + 0.0625D0 * ( y1(i1-1) + y1(i1+1) ) + enddo + + enddo + enddo + + + j = k-1 + call comm3(s,m1j,m2j,m3j,j) + + if( debug_vec(0) .ge. 1 )then + call rep_nrm(s,m1j,m2j,m3j,' rprj3',k-1) + endif + + if( debug_vec(4) .ge. k )then + call showall(s,m1j,m2j,m3j) + endif + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine interp( z,mm1,mm2,mm3,u,n1,n2,n3,k ) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c interp adds the trilinear interpolation of the correction +c from the coarser grid to the current approximation: u = u + Qu' +c +c Observe that this implementation costs 16A + 4M, where +c A and M denote the costs of Addition and Multiplication. +c Note that this vectorizes, and is also fine for cache +c based machines. Vector machines may get slightly better +c performance however, with 8 separate "do i1" loops, rather than 4. +c--------------------------------------------------------------------- + implicit none + + include 'mpinpb.h' + include 'globals.h' + + integer mm1, mm2, mm3, n1, n2, n3,k + double precision z(mm1,mm2,mm3),u(n1,n2,n3) + integer i3, i2, i1, d1, d2, d3, t1, t2, t3 + +c note that m = 1037 in globals.h but for this only need to be +c 535 to handle up to 1024^3 +c integer m +c parameter( m=535 ) + double precision z1(m),z2(m),z3(m) + + + if( n1 .ne. 3 .and. n2 .ne. 3 .and. n3 .ne. 3 ) then + + do i3=1,mm3-1 + do i2=1,mm2-1 + + do i1=1,mm1 + z1(i1) = z(i1,i2+1,i3) + z(i1,i2,i3) + z2(i1) = z(i1,i2,i3+1) + z(i1,i2,i3) + z3(i1) = z(i1,i2+1,i3+1) + z(i1,i2,i3+1) + z1(i1) + enddo + + do i1=1,mm1-1 + u(2*i1-1,2*i2-1,2*i3-1)=u(2*i1-1,2*i2-1,2*i3-1) + > +z(i1,i2,i3) + u(2*i1,2*i2-1,2*i3-1)=u(2*i1,2*i2-1,2*i3-1) + > +0.5d0*(z(i1+1,i2,i3)+z(i1,i2,i3)) + enddo + do i1=1,mm1-1 + u(2*i1-1,2*i2,2*i3-1)=u(2*i1-1,2*i2,2*i3-1) + > +0.5d0 * z1(i1) + u(2*i1,2*i2,2*i3-1)=u(2*i1,2*i2,2*i3-1) + > +0.25d0*( z1(i1) + z1(i1+1) ) + enddo + do i1=1,mm1-1 + u(2*i1-1,2*i2-1,2*i3)=u(2*i1-1,2*i2-1,2*i3) + > +0.5d0 * z2(i1) + u(2*i1,2*i2-1,2*i3)=u(2*i1,2*i2-1,2*i3) + > +0.25d0*( z2(i1) + z2(i1+1) ) + enddo + do i1=1,mm1-1 + u(2*i1-1,2*i2,2*i3)=u(2*i1-1,2*i2,2*i3) + > +0.25d0* z3(i1) + u(2*i1,2*i2,2*i3)=u(2*i1,2*i2,2*i3) + > +0.125d0*( z3(i1) + z3(i1+1) ) + enddo + enddo + enddo + + else + + if(n1.eq.3)then + d1 = 2 + t1 = 1 + else + d1 = 1 + t1 = 0 + endif + + if(n2.eq.3)then + d2 = 2 + t2 = 1 + else + d2 = 1 + t2 = 0 + endif + + if(n3.eq.3)then + d3 = 2 + t3 = 1 + else + d3 = 1 + t3 = 0 + endif + + do i3=d3,mm3-1 + do i2=d2,mm2-1 + do i1=d1,mm1-1 + u(2*i1-d1,2*i2-d2,2*i3-d3)=u(2*i1-d1,2*i2-d2,2*i3-d3) + > +z(i1,i2,i3) + enddo + do i1=1,mm1-1 + u(2*i1-t1,2*i2-d2,2*i3-d3)=u(2*i1-t1,2*i2-d2,2*i3-d3) + > +0.5D0*(z(i1+1,i2,i3)+z(i1,i2,i3)) + enddo + enddo + do i2=1,mm2-1 + do i1=d1,mm1-1 + u(2*i1-d1,2*i2-t2,2*i3-d3)=u(2*i1-d1,2*i2-t2,2*i3-d3) + > +0.5D0*(z(i1,i2+1,i3)+z(i1,i2,i3)) + enddo + do i1=1,mm1-1 + u(2*i1-t1,2*i2-t2,2*i3-d3)=u(2*i1-t1,2*i2-t2,2*i3-d3) + > +0.25D0*(z(i1+1,i2+1,i3)+z(i1+1,i2,i3) + > +z(i1, i2+1,i3)+z(i1, i2,i3)) + enddo + enddo + enddo + + do i3=1,mm3-1 + do i2=d2,mm2-1 + do i1=d1,mm1-1 + u(2*i1-d1,2*i2-d2,2*i3-t3)=u(2*i1-d1,2*i2-d2,2*i3-t3) + > +0.5D0*(z(i1,i2,i3+1)+z(i1,i2,i3)) + enddo + do i1=1,mm1-1 + u(2*i1-t1,2*i2-d2,2*i3-t3)=u(2*i1-t1,2*i2-d2,2*i3-t3) + > +0.25D0*(z(i1+1,i2,i3+1)+z(i1,i2,i3+1) + > +z(i1+1,i2,i3 )+z(i1,i2,i3 )) + enddo + enddo + do i2=1,mm2-1 + do i1=d1,mm1-1 + u(2*i1-d1,2*i2-t2,2*i3-t3)=u(2*i1-d1,2*i2-t2,2*i3-t3) + > +0.25D0*(z(i1,i2+1,i3+1)+z(i1,i2,i3+1) + > +z(i1,i2+1,i3 )+z(i1,i2,i3 )) + enddo + do i1=1,mm1-1 + u(2*i1-t1,2*i2-t2,2*i3-t3)=u(2*i1-t1,2*i2-t2,2*i3-t3) + > +0.125D0*(z(i1+1,i2+1,i3+1)+z(i1+1,i2,i3+1) + > +z(i1 ,i2+1,i3+1)+z(i1 ,i2,i3+1) + > +z(i1+1,i2+1,i3 )+z(i1+1,i2,i3 ) + > +z(i1 ,i2+1,i3 )+z(i1 ,i2,i3 )) + enddo + enddo + enddo + + endif + + call comm3_ex(u,n1,n2,n3,k) + + if( debug_vec(0) .ge. 1 )then + call rep_nrm(z,mm1,mm2,mm3,'z: inter',k-1) + call rep_nrm(u,n1,n2,n3,'u: inter',k) + endif + + if( debug_vec(5) .ge. k )then + call showall(z,mm1,mm2,mm3) + call showall(u,n1,n2,n3) + endif + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine norm2u3(r,n1,n2,n3,rnm2,rnmu,nx,ny,nz) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c norm2u3 evaluates approximations to the L2 norm and the +c uniform (or L-infinity or Chebyshev) norm, under the +c assumption that the boundaries are periodic or zero. Add the +c boundaries in with half weight (quarter weight on the edges +c and eighth weight at the corners) for inhomogeneous boundaries. +c--------------------------------------------------------------------- + implicit none + + include 'mpinpb.h' + + integer n1, n2, n3, nx, ny, nz + double precision rnm2, rnmu, r(n1,n2,n3) + double precision s, a, ss + integer i3, i2, i1, ierr + + double precision dn + + dn = 1.0d0*nx*ny*nz + + s=0.0D0 + rnmu = 0.0D0 + do i3=2,n3-1 + do i2=2,n2-1 + do i1=2,n1-1 + s=s+r(i1,i2,i3)**2 + a=abs(r(i1,i2,i3)) + if(a.gt.rnmu)rnmu=a + enddo + enddo + enddo + + call mpi_allreduce(rnmu,ss,1,dp_type, + > mpi_max,mpi_comm_world,ierr) + rnmu = ss + call mpi_allreduce(s, ss, 1, dp_type, + > mpi_sum,mpi_comm_world,ierr) + s = ss + rnm2=sqrt( s / dn ) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine rep_nrm(u,n1,n2,n3,title,kk) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c report on norm +c--------------------------------------------------------------------- + implicit none + + include 'mpinpb.h' + include 'globals.h' + + integer n1, n2, n3, kk + double precision u(n1,n2,n3) + character*8 title + + double precision rnm2, rnmu + + + call norm2u3(u,n1,n2,n3,rnm2,rnmu,nx(kk),ny(kk),nz(kk)) + if( me .eq. root )then + write(*,7)kk,title,rnm2,rnmu + 7 format(' Level',i2,' in ',a8,': norms =',D21.14,D21.14) + endif + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine comm3(u,n1,n2,n3,kk) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c comm3 organizes the communication on all borders +c--------------------------------------------------------------------- + implicit none + + include 'mpinpb.h' + include 'globals.h' + + integer n1, n2, n3, kk + double precision u(n1,n2,n3) + integer axis + + if( .not. dead(kk) )then + do axis = 1, 3 + if( nprocs .ne. 1) then + + call ready( axis, -1, kk ) + call ready( axis, +1, kk ) + + call give3( axis, +1, u, n1, n2, n3, kk ) + call give3( axis, -1, u, n1, n2, n3, kk ) + + call take3( axis, -1, u, n1, n2, n3 ) + call take3( axis, +1, u, n1, n2, n3 ) + + else + call comm1p( axis, u, n1, n2, n3, kk ) + endif + enddo + else + call zero3(u,n1,n2,n3) + endif + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine comm3_ex(u,n1,n2,n3,kk) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c comm3_ex communicates to expand the number of processors +c--------------------------------------------------------------------- + implicit none + + include 'mpinpb.h' + include 'globals.h' + + integer n1, n2, n3, kk + double precision u(n1,n2,n3) + integer axis + + do axis = 1, 3 + if( nprocs .ne. 1 ) then + if( take_ex( axis, kk ) )then + call ready( axis, -1, kk ) + call ready( axis, +1, kk ) + call take3_ex( axis, -1, u, n1, n2, n3 ) + call take3_ex( axis, +1, u, n1, n2, n3 ) + endif + + if( give_ex( axis, kk ) )then + call give3_ex( axis, +1, u, n1, n2, n3, kk ) + call give3_ex( axis, -1, u, n1, n2, n3, kk ) + endif + else + call comm1p_ex( axis, u, n1, n2, n3, kk ) + endif + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine ready( axis, dir, k ) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c ready allocates a buffer to take in a message +c--------------------------------------------------------------------- + implicit none + + include 'mpinpb.h' + include 'globals.h' + + integer axis, dir, k + integer buff_id,buff_len,i,ierr + + buff_id = 3 + dir + buff_len = nm2 + + do i=1,nm2 + buff(i,buff_id) = 0.0D0 + enddo + + +c--------------------------------------------------------------------- +c fake message request type +c--------------------------------------------------------------------- + msg_id(axis,dir,1) = msg_type(axis,dir) +1000*me + + call mpi_irecv( buff(1,buff_id), buff_len, + > dp_type, nbr(axis,-dir,k), msg_type(axis,dir), + > mpi_comm_world, msg_id(axis,dir,1), ierr) + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine give3( axis, dir, u, n1, n2, n3, k ) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c give3 sends border data out in the requested direction +c--------------------------------------------------------------------- + implicit none + + include 'mpinpb.h' + include 'globals.h' + + integer axis, dir, n1, n2, n3, k, ierr + double precision u( n1, n2, n3 ) + + integer i3, i2, i1, buff_len,buff_id + + buff_id = 2 + dir + buff_len = 0 + + if( axis .eq. 1 )then + if( dir .eq. -1 )then + + do i3=2,n3-1 + do i2=2,n2-1 + buff_len = buff_len + 1 + buff(buff_len,buff_id ) = u( 2, i2,i3) + enddo + enddo + + call mpi_send( + > buff(1, buff_id ), buff_len,dp_type, + > nbr( axis, dir, k ), msg_type(axis,dir), + > mpi_comm_world, ierr) + + else if( dir .eq. +1 ) then + + do i3=2,n3-1 + do i2=2,n2-1 + buff_len = buff_len + 1 + buff(buff_len, buff_id ) = u( n1-1, i2,i3) + enddo + enddo + + call mpi_send( + > buff(1, buff_id ), buff_len,dp_type, + > nbr( axis, dir, k ), msg_type(axis,dir), + > mpi_comm_world, ierr) + + endif + endif + + if( axis .eq. 2 )then + if( dir .eq. -1 )then + + do i3=2,n3-1 + do i1=1,n1 + buff_len = buff_len + 1 + buff(buff_len, buff_id ) = u( i1, 2,i3) + enddo + enddo + + call mpi_send( + > buff(1, buff_id ), buff_len,dp_type, + > nbr( axis, dir, k ), msg_type(axis,dir), + > mpi_comm_world, ierr) + + else if( dir .eq. +1 ) then + + do i3=2,n3-1 + do i1=1,n1 + buff_len = buff_len + 1 + buff(buff_len, buff_id )= u( i1,n2-1,i3) + enddo + enddo + + call mpi_send( + > buff(1, buff_id ), buff_len,dp_type, + > nbr( axis, dir, k ), msg_type(axis,dir), + > mpi_comm_world, ierr) + + endif + endif + + if( axis .eq. 3 )then + if( dir .eq. -1 )then + + do i2=1,n2 + do i1=1,n1 + buff_len = buff_len + 1 + buff(buff_len, buff_id ) = u( i1,i2,2) + enddo + enddo + + call mpi_send( + > buff(1, buff_id ), buff_len,dp_type, + > nbr( axis, dir, k ), msg_type(axis,dir), + > mpi_comm_world, ierr) + + else if( dir .eq. +1 ) then + + do i2=1,n2 + do i1=1,n1 + buff_len = buff_len + 1 + buff(buff_len, buff_id ) = u( i1,i2,n3-1) + enddo + enddo + + call mpi_send( + > buff(1, buff_id ), buff_len,dp_type, + > nbr( axis, dir, k ), msg_type(axis,dir), + > mpi_comm_world, ierr) + + endif + endif + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine take3( axis, dir, u, n1, n2, n3 ) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c take3 copies in border data from the requested direction +c--------------------------------------------------------------------- + implicit none + + include 'mpinpb.h' + include 'globals.h' + + integer axis, dir, n1, n2, n3 + double precision u( n1, n2, n3 ) + + integer buff_id, indx + + integer status(mpi_status_size), ierr + + integer i3, i2, i1 + + call mpi_wait( msg_id( axis, dir, 1 ),status,ierr) + buff_id = 3 + dir + indx = 0 + + if( axis .eq. 1 )then + if( dir .eq. -1 )then + + do i3=2,n3-1 + do i2=2,n2-1 + indx = indx + 1 + u(n1,i2,i3) = buff(indx, buff_id ) + enddo + enddo + + else if( dir .eq. +1 ) then + + do i3=2,n3-1 + do i2=2,n2-1 + indx = indx + 1 + u(1,i2,i3) = buff(indx, buff_id ) + enddo + enddo + + endif + endif + + if( axis .eq. 2 )then + if( dir .eq. -1 )then + + do i3=2,n3-1 + do i1=1,n1 + indx = indx + 1 + u(i1,n2,i3) = buff(indx, buff_id ) + enddo + enddo + + else if( dir .eq. +1 ) then + + do i3=2,n3-1 + do i1=1,n1 + indx = indx + 1 + u(i1,1,i3) = buff(indx, buff_id ) + enddo + enddo + + endif + endif + + if( axis .eq. 3 )then + if( dir .eq. -1 )then + + do i2=1,n2 + do i1=1,n1 + indx = indx + 1 + u(i1,i2,n3) = buff(indx, buff_id ) + enddo + enddo + + else if( dir .eq. +1 ) then + + do i2=1,n2 + do i1=1,n1 + indx = indx + 1 + u(i1,i2,1) = buff(indx, buff_id ) + enddo + enddo + + endif + endif + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine give3_ex( axis, dir, u, n1, n2, n3, k ) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c give3_ex sends border data out to expand number of processors +c--------------------------------------------------------------------- + implicit none + + include 'mpinpb.h' + include 'globals.h' + + integer axis, dir, n1, n2, n3, k, ierr + double precision u( n1, n2, n3 ) + + integer i3, i2, i1, buff_len, buff_id + + buff_id = 2 + dir + buff_len = 0 + + if( axis .eq. 1 )then + if( dir .eq. -1 )then + + do i3=1,n3 + do i2=1,n2 + buff_len = buff_len + 1 + buff(buff_len,buff_id ) = u( 2, i2,i3) + enddo + enddo + + call mpi_send( + > buff(1, buff_id ), buff_len,dp_type, + > nbr( axis, dir, k ), msg_type(axis,dir), + > mpi_comm_world, ierr) + + else if( dir .eq. +1 ) then + + do i3=1,n3 + do i2=1,n2 + do i1=n1-1,n1 + buff_len = buff_len + 1 + buff(buff_len,buff_id)= u(i1,i2,i3) + enddo + enddo + enddo + + call mpi_send( + > buff(1, buff_id ), buff_len,dp_type, + > nbr( axis, dir, k ), msg_type(axis,dir), + > mpi_comm_world, ierr) + + endif + endif + + if( axis .eq. 2 )then + if( dir .eq. -1 )then + + do i3=1,n3 + do i1=1,n1 + buff_len = buff_len + 1 + buff(buff_len, buff_id ) = u( i1, 2,i3) + enddo + enddo + + call mpi_send( + > buff(1, buff_id ), buff_len,dp_type, + > nbr( axis, dir, k ), msg_type(axis,dir), + > mpi_comm_world, ierr) + + else if( dir .eq. +1 ) then + + do i3=1,n3 + do i2=n2-1,n2 + do i1=1,n1 + buff_len = buff_len + 1 + buff(buff_len,buff_id )= u(i1,i2,i3) + enddo + enddo + enddo + + call mpi_send( + > buff(1, buff_id ), buff_len,dp_type, + > nbr( axis, dir, k ), msg_type(axis,dir), + > mpi_comm_world, ierr) + + endif + endif + + if( axis .eq. 3 )then + if( dir .eq. -1 )then + + do i2=1,n2 + do i1=1,n1 + buff_len = buff_len + 1 + buff(buff_len, buff_id ) = u( i1,i2,2) + enddo + enddo + + call mpi_send( + > buff(1, buff_id ), buff_len,dp_type, + > nbr( axis, dir, k ), msg_type(axis,dir), + > mpi_comm_world, ierr) + + else if( dir .eq. +1 ) then + + do i3=n3-1,n3 + do i2=1,n2 + do i1=1,n1 + buff_len = buff_len + 1 + buff(buff_len, buff_id ) = u( i1,i2,i3) + enddo + enddo + enddo + + call mpi_send( + > buff(1, buff_id ), buff_len,dp_type, + > nbr( axis, dir, k ), msg_type(axis,dir), + > mpi_comm_world, ierr) + + endif + endif + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine take3_ex( axis, dir, u, n1, n2, n3 ) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c take3_ex copies in border data to expand number of processors +c--------------------------------------------------------------------- + implicit none + + include 'mpinpb.h' + include 'globals.h' + + integer axis, dir, n1, n2, n3 + double precision u( n1, n2, n3 ) + + integer buff_id, indx + + integer status(mpi_status_size) , ierr + + integer i3, i2, i1 + + call mpi_wait( msg_id( axis, dir, 1 ),status,ierr) + buff_id = 3 + dir + indx = 0 + + if( axis .eq. 1 )then + if( dir .eq. -1 )then + + do i3=1,n3 + do i2=1,n2 + indx = indx + 1 + u(n1,i2,i3) = buff(indx, buff_id ) + enddo + enddo + + else if( dir .eq. +1 ) then + + do i3=1,n3 + do i2=1,n2 + do i1=1,2 + indx = indx + 1 + u(i1,i2,i3) = buff(indx,buff_id) + enddo + enddo + enddo + + endif + endif + + if( axis .eq. 2 )then + if( dir .eq. -1 )then + + do i3=1,n3 + do i1=1,n1 + indx = indx + 1 + u(i1,n2,i3) = buff(indx, buff_id ) + enddo + enddo + + else if( dir .eq. +1 ) then + + do i3=1,n3 + do i2=1,2 + do i1=1,n1 + indx = indx + 1 + u(i1,i2,i3) = buff(indx,buff_id) + enddo + enddo + enddo + + endif + endif + + if( axis .eq. 3 )then + if( dir .eq. -1 )then + + do i2=1,n2 + do i1=1,n1 + indx = indx + 1 + u(i1,i2,n3) = buff(indx, buff_id ) + enddo + enddo + + else if( dir .eq. +1 ) then + + do i3=1,2 + do i2=1,n2 + do i1=1,n1 + indx = indx + 1 + u(i1,i2,i3) = buff(indx,buff_id) + enddo + enddo + enddo + + endif + endif + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine comm1p( axis, u, n1, n2, n3, kk ) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + include 'globals.h' + + integer axis, dir, n1, n2, n3 + double precision u( n1, n2, n3 ) + + integer i3, i2, i1, buff_len,buff_id + integer i, kk, indx + + dir = -1 + + buff_id = 3 + dir + buff_len = nm2 + + do i=1,nm2 + buff(i,buff_id) = 0.0D0 + enddo + + + dir = +1 + + buff_id = 3 + dir + buff_len = nm2 + + do i=1,nm2 + buff(i,buff_id) = 0.0D0 + enddo + + dir = +1 + + buff_id = 2 + dir + buff_len = 0 + + if( axis .eq. 1 )then + do i3=2,n3-1 + do i2=2,n2-1 + buff_len = buff_len + 1 + buff(buff_len, buff_id ) = u( n1-1, i2,i3) + enddo + enddo + endif + + if( axis .eq. 2 )then + do i3=2,n3-1 + do i1=1,n1 + buff_len = buff_len + 1 + buff(buff_len, buff_id )= u( i1,n2-1,i3) + enddo + enddo + endif + + if( axis .eq. 3 )then + do i2=1,n2 + do i1=1,n1 + buff_len = buff_len + 1 + buff(buff_len, buff_id ) = u( i1,i2,n3-1) + enddo + enddo + endif + + dir = -1 + + buff_id = 2 + dir + buff_len = 0 + + if( axis .eq. 1 )then + do i3=2,n3-1 + do i2=2,n2-1 + buff_len = buff_len + 1 + buff(buff_len,buff_id ) = u( 2, i2,i3) + enddo + enddo + endif + + if( axis .eq. 2 )then + do i3=2,n3-1 + do i1=1,n1 + buff_len = buff_len + 1 + buff(buff_len, buff_id ) = u( i1, 2,i3) + enddo + enddo + endif + + if( axis .eq. 3 )then + do i2=1,n2 + do i1=1,n1 + buff_len = buff_len + 1 + buff(buff_len, buff_id ) = u( i1,i2,2) + enddo + enddo + endif + + do i=1,nm2 + buff(i,4) = buff(i,3) + buff(i,2) = buff(i,1) + enddo + + dir = -1 + + buff_id = 3 + dir + indx = 0 + + if( axis .eq. 1 )then + do i3=2,n3-1 + do i2=2,n2-1 + indx = indx + 1 + u(n1,i2,i3) = buff(indx, buff_id ) + enddo + enddo + endif + + if( axis .eq. 2 )then + do i3=2,n3-1 + do i1=1,n1 + indx = indx + 1 + u(i1,n2,i3) = buff(indx, buff_id ) + enddo + enddo + endif + + if( axis .eq. 3 )then + do i2=1,n2 + do i1=1,n1 + indx = indx + 1 + u(i1,i2,n3) = buff(indx, buff_id ) + enddo + enddo + endif + + + dir = +1 + + buff_id = 3 + dir + indx = 0 + + if( axis .eq. 1 )then + do i3=2,n3-1 + do i2=2,n2-1 + indx = indx + 1 + u(1,i2,i3) = buff(indx, buff_id ) + enddo + enddo + endif + + if( axis .eq. 2 )then + do i3=2,n3-1 + do i1=1,n1 + indx = indx + 1 + u(i1,1,i3) = buff(indx, buff_id ) + enddo + enddo + endif + + if( axis .eq. 3 )then + do i2=1,n2 + do i1=1,n1 + indx = indx + 1 + u(i1,i2,1) = buff(indx, buff_id ) + enddo + enddo + endif + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine comm1p_ex( axis, u, n1, n2, n3, kk ) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + include 'globals.h' + + integer axis, dir, n1, n2, n3 + double precision u( n1, n2, n3 ) + + integer i3, i2, i1, buff_len,buff_id + integer i, kk, indx + + if( take_ex( axis, kk ) ) then + + dir = -1 + + buff_id = 3 + dir + buff_len = nm2 + + do i=1,nm2 + buff(i,buff_id) = 0.0D0 + enddo + + + dir = +1 + + buff_id = 3 + dir + buff_len = nm2 + + do i=1,nm2 + buff(i,buff_id) = 0.0D0 + enddo + + + dir = -1 + + buff_id = 3 + dir + indx = 0 + + if( axis .eq. 1 )then + do i3=1,n3 + do i2=1,n2 + indx = indx + 1 + u(n1,i2,i3) = buff(indx, buff_id ) + enddo + enddo + endif + + if( axis .eq. 2 )then + do i3=1,n3 + do i1=1,n1 + indx = indx + 1 + u(i1,n2,i3) = buff(indx, buff_id ) + enddo + enddo + endif + + if( axis .eq. 3 )then + do i2=1,n2 + do i1=1,n1 + indx = indx + 1 + u(i1,i2,n3) = buff(indx, buff_id ) + enddo + enddo + endif + + dir = +1 + + buff_id = 3 + dir + indx = 0 + + if( axis .eq. 1 )then + do i3=1,n3 + do i2=1,n2 + do i1=1,2 + indx = indx + 1 + u(i1,i2,i3) = buff(indx,buff_id) + enddo + enddo + enddo + endif + + if( axis .eq. 2 )then + do i3=1,n3 + do i2=1,2 + do i1=1,n1 + indx = indx + 1 + u(i1,i2,i3) = buff(indx,buff_id) + enddo + enddo + enddo + endif + + if( axis .eq. 3 )then + do i3=1,2 + do i2=1,n2 + do i1=1,n1 + indx = indx + 1 + u(i1,i2,i3) = buff(indx,buff_id) + enddo + enddo + enddo + endif + + endif + + if( give_ex( axis, kk ) )then + + dir = +1 + + buff_id = 2 + dir + buff_len = 0 + + if( axis .eq. 1 )then + do i3=1,n3 + do i2=1,n2 + do i1=n1-1,n1 + buff_len = buff_len + 1 + buff(buff_len,buff_id)= u(i1,i2,i3) + enddo + enddo + enddo + endif + + if( axis .eq. 2 )then + do i3=1,n3 + do i2=n2-1,n2 + do i1=1,n1 + buff_len = buff_len + 1 + buff(buff_len,buff_id )= u(i1,i2,i3) + enddo + enddo + enddo + endif + + if( axis .eq. 3 )then + do i3=n3-1,n3 + do i2=1,n2 + do i1=1,n1 + buff_len = buff_len + 1 + buff(buff_len, buff_id ) = u( i1,i2,i3) + enddo + enddo + enddo + endif + + dir = -1 + + buff_id = 2 + dir + buff_len = 0 + + if( axis .eq. 1 )then + do i3=1,n3 + do i2=1,n2 + buff_len = buff_len + 1 + buff(buff_len,buff_id ) = u( 2, i2,i3) + enddo + enddo + endif + + if( axis .eq. 2 )then + do i3=1,n3 + do i1=1,n1 + buff_len = buff_len + 1 + buff(buff_len, buff_id ) = u( i1, 2,i3) + enddo + enddo + endif + + if( axis .eq. 3 )then + do i2=1,n2 + do i1=1,n1 + buff_len = buff_len + 1 + buff(buff_len, buff_id ) = u( i1,i2,2) + enddo + enddo + endif + + endif + + do i=1,nm2 + buff(i,4) = buff(i,3) + buff(i,2) = buff(i,1) + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine zran3(z,n1,n2,n3,nx,ny,k) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c zran3 loads +1 at ten randomly chosen points, +c loads -1 at a different ten random points, +c and zero elsewhere. +c--------------------------------------------------------------------- + implicit none + + include 'mpinpb.h' + + integer is1, is2, is3, ie1, ie2, ie3 + common /grid/ is1,is2,is3,ie1,ie2,ie3 + + integer n1, n2, n3, k, nx, ny, ierr, i0, m0, m1 + double precision z(n1,n2,n3) + + integer mm, i1, i2, i3, d1, e1, e2, e3 + double precision x, a + double precision xx, x0, x1, a1, a2, ai, power + parameter( mm = 10, a = 5.D0 ** 13, x = 314159265.D0) + double precision ten( mm, 0:1 ), temp, best + integer i, j1( mm, 0:1 ), j2( mm, 0:1 ), j3( mm, 0:1 ) + integer jg( 0:3, mm, 0:1 ), jg_temp(4) + + external randlc + double precision randlc, rdummy + + a1 = power( a, nx, 1, 0 ) + a2 = power( a, nx, ny, 0 ) + + call zero3(z,n1,n2,n3) + +c i = is1-2+nx*(is2-2+ny*(is3-2)) + + ai = power( a, nx, is2-2+ny*(is3-2), is1-2 ) + d1 = ie1 - is1 + 1 + e1 = ie1 - is1 + 2 + e2 = ie2 - is2 + 2 + e3 = ie3 - is3 + 2 + x0 = x + rdummy = randlc( x0, ai ) + do i3 = 2, e3 + x1 = x0 + do i2 = 2, e2 + xx = x1 + call vranlc( d1, xx, a, z( 2, i2, i3 )) + rdummy = randlc( x1, a1 ) + enddo + rdummy = randlc( x0, a2 ) + enddo + +c--------------------------------------------------------------------- +c call comm3(z,n1,n2,n3) +c call showall(z,n1,n2,n3) +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c each processor looks for twenty candidates +c--------------------------------------------------------------------- + do i=1,mm + ten( i, 1 ) = 0.0D0 + j1( i, 1 ) = 0 + j2( i, 1 ) = 0 + j3( i, 1 ) = 0 + ten( i, 0 ) = 1.0D0 + j1( i, 0 ) = 0 + j2( i, 0 ) = 0 + j3( i, 0 ) = 0 + enddo + + do i3=2,n3-1 + do i2=2,n2-1 + do i1=2,n1-1 + if( z(i1,i2,i3) .gt. ten( 1, 1 ) )then + ten(1,1) = z(i1,i2,i3) + j1(1,1) = i1 + j2(1,1) = i2 + j3(1,1) = i3 + call bubble( ten, j1, j2, j3, mm, 1 ) + endif + if( z(i1,i2,i3) .lt. ten( 1, 0 ) )then + ten(1,0) = z(i1,i2,i3) + j1(1,0) = i1 + j2(1,0) = i2 + j3(1,0) = i3 + call bubble( ten, j1, j2, j3, mm, 0 ) + endif + enddo + enddo + enddo + + call mpi_barrier(mpi_comm_world,ierr) + +c--------------------------------------------------------------------- +c Now which of these are globally best? +c--------------------------------------------------------------------- + i1 = mm + i0 = mm + do i=mm,1,-1 + + best = z( j1(i1,1), j2(i1,1), j3(i1,1) ) + call mpi_allreduce(best,temp,1,dp_type, + > mpi_max,mpi_comm_world,ierr) + best = temp + if(best.eq.z(j1(i1,1),j2(i1,1),j3(i1,1)))then + jg( 0, i, 1) = me + jg( 1, i, 1) = is1 - 2 + j1( i1, 1 ) + jg( 2, i, 1) = is2 - 2 + j2( i1, 1 ) + jg( 3, i, 1) = is3 - 2 + j3( i1, 1 ) + i1 = i1-1 + else + jg( 0, i, 1) = 0 + jg( 1, i, 1) = 0 + jg( 2, i, 1) = 0 + jg( 3, i, 1) = 0 + endif + ten( i, 1 ) = best + call mpi_allreduce(jg(0,i,1), jg_temp,4,MPI_INTEGER, + > mpi_max,mpi_comm_world,ierr) + jg( 0, i, 1) = jg_temp(1) + jg( 1, i, 1) = jg_temp(2) + jg( 2, i, 1) = jg_temp(3) + jg( 3, i, 1) = jg_temp(4) + + best = z( j1(i0,0), j2(i0,0), j3(i0,0) ) + call mpi_allreduce(best,temp,1,dp_type, + > mpi_min,mpi_comm_world,ierr) + best = temp + if(best.eq.z(j1(i0,0),j2(i0,0),j3(i0,0)))then + jg( 0, i, 0) = me + jg( 1, i, 0) = is1 - 2 + j1( i0, 0 ) + jg( 2, i, 0) = is2 - 2 + j2( i0, 0 ) + jg( 3, i, 0) = is3 - 2 + j3( i0, 0 ) + i0 = i0-1 + else + jg( 0, i, 0) = 0 + jg( 1, i, 0) = 0 + jg( 2, i, 0) = 0 + jg( 3, i, 0) = 0 + endif + ten( i, 0 ) = best + call mpi_allreduce(jg(0,i,0), jg_temp,4,MPI_INTEGER, + > mpi_max,mpi_comm_world,ierr) + jg( 0, i, 0) = jg_temp(1) + jg( 1, i, 0) = jg_temp(2) + jg( 2, i, 0) = jg_temp(3) + jg( 3, i, 0) = jg_temp(4) + + enddo + m1 = i1+1 + m0 = i0+1 + +c if( me .eq. root) then +c write(*,*)' ' +c write(*,*)' negative charges at' +c write(*,9)(jg(1,i,0),jg(2,i,0),jg(3,i,0),i=1,mm) +c write(*,*)' positive charges at' +c write(*,9)(jg(1,i,1),jg(2,i,1),jg(3,i,1),i=1,mm) +c write(*,*)' small random numbers were' +c write(*,8)(ten( i,0),i=mm,1,-1) +c write(*,*)' and they were found on processor number' +c write(*,7)(jg(0,i,0),i=mm,1,-1) +c write(*,*)' large random numbers were' +c write(*,8)(ten( i,1),i=mm,1,-1) +c write(*,*)' and they were found on processor number' +c write(*,7)(jg(0,i,1),i=mm,1,-1) +c endif +c 9 format(5(' (',i3,2(',',i3),')')) +c 8 format(5D15.8) +c 7 format(10i4) + call mpi_barrier(mpi_comm_world,ierr) + do i3=1,n3 + do i2=1,n2 + do i1=1,n1 + z(i1,i2,i3) = 0.0D0 + enddo + enddo + enddo + do i=mm,m0,-1 + z( j1(i,0), j2(i,0), j3(i,0) ) = -1.0D0 + enddo + do i=mm,m1,-1 + z( j1(i,1), j2(i,1), j3(i,1) ) = +1.0D0 + enddo + call comm3(z,n1,n2,n3,k) + +c--------------------------------------------------------------------- +c call showall(z,n1,n2,n3) +c--------------------------------------------------------------------- + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine show_l(z,n1,n2,n3) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + + integer n1,n2,n3,i1,i2,i3,ierr + double precision z(n1,n2,n3) + integer m1, m2, m3,i + + m1 = min(n1,18) + m2 = min(n2,14) + m3 = min(n3,18) + + write(*,*)' ' + do i=0,nprocs-1 + if( me .eq. i )then + write(*,*)' id = ', me + do i3=1,m3 + do i1=1,m1 + write(*,6)(z(i1,i2,i3),i2=1,m2) + enddo + write(*,*)' - - - - - - - ' + enddo + write(*,*)' ' + 6 format(6f15.11) + endif + call mpi_barrier(mpi_comm_world,ierr) + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine showall(z,n1,n2,n3) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + + integer n1,n2,n3,i1,i2,i3,i,ierr + double precision z(n1,n2,n3) + integer m1, m2, m3 + + m1 = min(n1,18) + m2 = min(n2,14) + m3 = min(n3,18) + + write(*,*)' ' + do i=0,nprocs-1 + if( me .eq. i )then + write(*,*)' id = ', me + do i3=1,m3 + do i1=1,m1 + write(*,6)(z(i1,i2,i3),i2=1,m2) + enddo + write(*,*)' - - - - - - - ' + enddo + write(*,*)' ' + 6 format(15f6.3) + endif + call mpi_barrier(mpi_comm_world,ierr) + enddo + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine show(z,n1,n2,n3) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + integer n1,n2,n3,i1,i2,i3,ierr,i + double precision z(n1,n2,n3) + + write(*,*)' ' + do i=0,nprocs-1 + if( me .eq. i )then + write(*,*)' id = ', me + do i3=2,n3-1 + do i1=2,n1-1 + write(*,6)(z(i1,i2,i3),i2=2,n1-1) + enddo + write(*,*)' - - - - - - - ' + enddo + write(*,*)' ' + 6 format(8D10.3) + endif + call mpi_barrier(mpi_comm_world,ierr) + enddo + +c call comm3(z,n1,n2,n3) + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + double precision function power( a, n1, n2, n3 ) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c power raises an integer, disguised as a double +c precision real, to an integer power. +c This version tries to avoid integer overflow by treating +c it as expressed in a form of "n1*n2+n3". +c--------------------------------------------------------------------- + implicit none + + double precision a, aj + integer n1, n2, n3 + + integer n1j, n2j, nj + external randlc + double precision randlc, rdummy + + power = 1.0d0 + aj = a + nj = n3 + n1j = n1 + n2j = n2 + 100 continue + + if( n2j .gt. 0 ) then + if( mod(n2j,2) .eq. 1 ) nj = nj + n1j + n2j = n2j/2 + else if( nj .eq. 0 ) then + go to 200 + endif + if( mod(nj,2) .eq. 1 ) rdummy = randlc( power, aj ) + rdummy = randlc( aj, aj ) + nj = nj/2 + go to 100 + + 200 continue + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine bubble( ten, j1, j2, j3, m, ind ) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c bubble does a bubble sort in direction dir +c--------------------------------------------------------------------- + implicit none + + include 'mpinpb.h' + + integer m, ind, j1( m, 0:1 ), j2( m, 0:1 ), j3( m, 0:1 ) + double precision ten( m, 0:1 ) + double precision temp + integer i, j_temp + + if( ind .eq. 1 )then + + do i=1,m-1 + if( ten(i,ind) .gt. ten(i+1,ind) )then + + temp = ten( i+1, ind ) + ten( i+1, ind ) = ten( i, ind ) + ten( i, ind ) = temp + + j_temp = j1( i+1, ind ) + j1( i+1, ind ) = j1( i, ind ) + j1( i, ind ) = j_temp + + j_temp = j2( i+1, ind ) + j2( i+1, ind ) = j2( i, ind ) + j2( i, ind ) = j_temp + + j_temp = j3( i+1, ind ) + j3( i+1, ind ) = j3( i, ind ) + j3( i, ind ) = j_temp + + else + return + endif + enddo + + else + + do i=1,m-1 + if( ten(i,ind) .lt. ten(i+1,ind) )then + + temp = ten( i+1, ind ) + ten( i+1, ind ) = ten( i, ind ) + ten( i, ind ) = temp + + j_temp = j1( i+1, ind ) + j1( i+1, ind ) = j1( i, ind ) + j1( i, ind ) = j_temp + + j_temp = j2( i+1, ind ) + j2( i+1, ind ) = j2( i, ind ) + j2( i, ind ) = j_temp + + j_temp = j3( i+1, ind ) + j3( i+1, ind ) = j3( i, ind ) + j3( i, ind ) = j_temp + + else + return + endif + enddo + + endif + + return + end + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine zero3(z,n1,n2,n3) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + + include 'mpinpb.h' + + integer n1, n2, n3 + double precision z(n1,n2,n3) + integer i1, i2, i3 + + do i3=1,n3 + do i2=1,n2 + do i1=1,n1 + z(i1,i2,i3)=0.0D0 + enddo + enddo + enddo + + return + end + + +c----- end of program ------------------------------------------------ diff --git a/examples/smpi/NAS/MG/mg.input.sample b/examples/smpi/NAS/MG/mg.input.sample new file mode 100644 index 0000000000..a4dcf81275 --- /dev/null +++ b/examples/smpi/NAS/MG/mg.input.sample @@ -0,0 +1,4 @@ + 8 = top level + 256 256 256 = nx ny nz + 20 = nit + 0 0 0 0 0 0 0 0 = debug_vec diff --git a/examples/smpi/NAS/MG/mpinpb.h b/examples/smpi/NAS/MG/mpinpb.h new file mode 100644 index 0000000000..1f0368c0b7 --- /dev/null +++ b/examples/smpi/NAS/MG/mpinpb.h @@ -0,0 +1,9 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'mpif.h' + + integer me, nprocs, root, dp_type + common /mpistuff/ me, nprocs, root, dp_type + diff --git a/examples/smpi/NAS/MPI_dummy/Makefile b/examples/smpi/NAS/MPI_dummy/Makefile new file mode 100644 index 0000000000..86288d7a1d --- /dev/null +++ b/examples/smpi/NAS/MPI_dummy/Makefile @@ -0,0 +1,38 @@ +# Makefile for MPI dummy library. +# Must be edited for a specific machine. Does NOT read in +# the make.def file of NPB 2.3 +F77 = f77 +CC = cc +AR = ar + +# Enable if either Cray or IBM: (no such flag for most machines: see wtime.h) +# MACHINE = -DCRAY +# MACHINE = -DIBM + +libmpi.a: mpi_dummy.o mpi_dummy_c.o wtime.o + $(AR) r libmpi.a mpi_dummy.o mpi_dummy_c.o wtime.o + +mpi_dummy.o: mpi_dummy.f mpif.h + $(F77) -c mpi_dummy.f +# For a Cray C90, try: +# cf77 -dp -c mpi_dummy.f +# For an IBM 590, try: +# xlf -c mpi_dummy.f + +mpi_dummy_c.o: mpi_dummy.c mpi.h + $(CC) -c ${MACHINE} -o mpi_dummy_c.o mpi_dummy.c + +wtime.o: wtime.c +# For most machines or CRAY or IBM + $(CC) -c ${MACHINE} wtime.c +# For a precise timer on an SGI Power Challenge, try: +# $(CC) -o wtime.o -c wtime_sgi64.c + +test: test.f + $(F77) -o test -I. test.f -L. -lmpi + + + +clean: + - rm -f *~ *.o + - rm -f test libmpi.a diff --git a/examples/smpi/NAS/MPI_dummy/README b/examples/smpi/NAS/MPI_dummy/README new file mode 100644 index 0000000000..9096a0b350 --- /dev/null +++ b/examples/smpi/NAS/MPI_dummy/README @@ -0,0 +1,52 @@ +########################################### +# NAS Parallel Benchmarks 2&3 # +# MPI/F77/C # +# Revision 3.3 # +# NASA Ames Research Center # +# npb@nas.nasa.gov # +# http://www.nas.nasa.gov/Software/NPB/ # +########################################### + +MPI Dummy Library + + +The MPI dummy library is supplied as a convenience for people who do +not have an MPI library but would like to try running on one processor +anyway. The NPB 2.x/3.x benchmarks are designed so that they do not +actually try to do any message passing when run on one node. The MPI +dummy library is just that - a set of dummy MPI routines which don't +do anything, but allow you to link the benchmarks. Actually they do a +few things, but nothing important. Note that the dummy library is +sufficient only for the NPB 2.x/3.x benchmarks. It probably won't be +useful for anything else because it implements only a handful of +functions. + +Because the dummy library is just an extra goody, and since we don't +have an infinite amount of time, it may be a bit trickier to configure +than the rest of the benchmarks. You need to: + +1. Find out how C and Fortran interact on your machine. On most machines, +the fortran functon foo(x) is declared in C as foo_(xp) where xp is +a pointer, not a value. On IBMs, it's just foo(xp). On Cray C90s, its +FOO(xp). You can define CRAY or IBM to get these, or you need to +edit wtime.c if you've got something else. + +2. Edit the Makefile to compile mpi_dummy.f and wtime.c correctly +for your machine (including -DCRAY or -DIBM if necessary). + +3. The substitute MPI timer gives wall clock time, not CPU time. +If you're running on a timeshared machine, you may want to +use a CPU timer. Edit the function mpi_wtime() in mpi_dummy.f +to change this timer. (NOTE: for official benchmark results, +ONLY wall clock times are valid. Using a CPU timer is ok +if you want to get things running, but don't report any results +measured with a CPU timer. ) + +TROUBLESHOOTING + +o Compiling or linking of the benchmark aborts because the dummy MPI + header file or the dummy MPI library cannot be found. + - the file make.dummy in subdirectory config relies on the use + of the -I"path" and -L"path" -l"library" constructs to pass + information to the compilers and linkers. Edit this file to conform + to your system. diff --git a/examples/smpi/NAS/MPI_dummy/mpi.h b/examples/smpi/NAS/MPI_dummy/mpi.h new file mode 100644 index 0000000000..70eb3138b4 --- /dev/null +++ b/examples/smpi/NAS/MPI_dummy/mpi.h @@ -0,0 +1,112 @@ +#define MPI_DOUBLE 1 +#define MPI_INT 2 +#define MPI_BYTE 3 +#define MPI_FLOAT 4 +#define MPI_LONG 5 + +#define MPI_COMM_WORLD 0 + +#define MPI_MAX 1 +#define MPI_SUM 2 +#define MPI_MIN 3 + +#define MPI_SUCCESS 0 +#define MPI_ANY_SOURCE -1 +#define MPI_ERR_OTHER -1 +#define MPI_STATUS_SIZE 3 + + +/* + Status object. It is the only user-visible MPI data-structure + The "count" field is PRIVATE; use MPI_Get_count to access it. + */ +typedef struct { + int count; + int MPI_SOURCE; + int MPI_TAG; + int MPI_ERROR; +} MPI_Status; + + +/* MPI request objects */ +typedef int MPI_Request; + +/* MPI datatype */ +typedef int MPI_Datatype; + +/* MPI comm */ +typedef int MPI_Comm; + +/* MPI operation */ +typedef int MPI_Op; + + + +/* Prototypes: */ +void mpi_error( void ); + +int MPI_Irecv( void *buf, + int count, + MPI_Datatype datatype, + int source, + int tag, + MPI_Comm comm, + MPI_Request *request ); + +int MPI_Send( void *buf, + int count, + MPI_Datatype datatype, + int dest, + int tag, + MPI_Comm comm ); + +int MPI_Wait( MPI_Request *request, + MPI_Status *status ); + +int MPI_Init( int *argc, + char ***argv ); + +int MPI_Comm_rank( MPI_Comm comm, + int *rank ); + +int MPI_Comm_size( MPI_Comm comm, + int *size ); + +double MPI_Wtime( void ); + +int MPI_Barrier( MPI_Comm comm ); + +int MPI_Finalize( void ); + +int MPI_Allreduce( void *sendbuf, + void *recvbuf, + int nitems, + MPI_Datatype type, + MPI_Op op, + MPI_Comm comm ); + +int MPI_Reduce( void *sendbuf, + void *recvbuf, + int nitems, + MPI_Datatype type, + MPI_Op op, + int root, + MPI_Comm comm ); + +int MPI_Alltoall( void *sendbuf, + int sendcount, + MPI_Datatype sendtype, + void *recvbuf, + int recvcount, + MPI_Datatype recvtype, + MPI_Comm comm ); + +int MPI_Alltoallv( void *sendbuf, + int *sendcounts, + int *senddispl, + MPI_Datatype sendtype, + void *recvbuf, + int *recvcounts, + int *recvdispl, + MPI_Datatype recvtype, + MPI_Comm comm ); diff --git a/examples/smpi/NAS/MPI_dummy/mpi_dummy.c b/examples/smpi/NAS/MPI_dummy/mpi_dummy.c new file mode 100644 index 0000000000..d2cbfb84bc --- /dev/null +++ b/examples/smpi/NAS/MPI_dummy/mpi_dummy.c @@ -0,0 +1,267 @@ +#include "mpi.h" +#include "wtime.h" +#include + + + +void mpi_error( void ) +{ + printf( "mpi_error called\n" ); + abort(); +} + + + + +int MPI_Irecv( void *buf, + int count, + MPI_Datatype datatype, + int source, + int tag, + MPI_Comm comm, + MPI_Request *request ) +{ + mpi_error(); + return( MPI_ERR_OTHER ); +} + + + + +int MPI_Recv( void *buf, + int count, + MPI_Datatype datatype, + int source, + int tag, + MPI_Comm comm, + MPI_Status *status ) +{ + mpi_error(); + return( MPI_ERR_OTHER ); +} + + + + +int MPI_Send( void *buf, + int count, + MPI_Datatype datatype, + int dest, + int tag, + MPI_Comm comm ) +{ + mpi_error(); + return( MPI_ERR_OTHER ); +} + + + + +int MPI_Wait( MPI_Request *request, + MPI_Status *status ) +{ + mpi_error(); + return( MPI_ERR_OTHER ); +} + + + + +int MPI_Init( int *argc, + char ***argv ) +{ + return( MPI_SUCCESS ); +} + + + + +int MPI_Comm_rank( MPI_Comm comm, + int *rank ) +{ + *rank = 0; + return( MPI_SUCCESS ); +} + + + + +int MPI_Comm_size( MPI_Comm comm, + int *size ) +{ + *size = 1; + return( MPI_SUCCESS ); +} + + + + +double MPI_Wtime( void ) +{ + void wtime(); + + double t; + wtime( &t ); + return( t ); +} + + + + +int MPI_Barrier( MPI_Comm comm ) +{ + return( MPI_SUCCESS ); +} + + + + +int MPI_Finalize( void ) +{ + return( MPI_SUCCESS ); +} + + + + +int MPI_Allreduce( void *sendbuf, + void *recvbuf, + int nitems, + MPI_Datatype type, + MPI_Op op, + MPI_Comm comm ) +{ + int i; + if( type == MPI_INT ) + { + int *pd_sendbuf, *pd_recvbuf; + pd_sendbuf = (int *) sendbuf; + pd_recvbuf = (int *) recvbuf; + for( i=0; i mpi_double_precision, mpi_complex, + > mpi_double_complex + parameter (mpi_double_precision = 1, + $ mpi_integer = 2, + $ mpi_byte = 3, + $ mpi_real= 4, + $ mpi_complex = 5, + $ mpi_double_complex = 6) + + integer mpi_any_source + parameter (mpi_any_source = -1) + + integer mpi_err_other + parameter (mpi_err_other = -1) + + double precision mpi_wtime + external mpi_wtime + + integer mpi_status_size + parameter (mpi_status_size=3) diff --git a/examples/smpi/NAS/MPI_dummy/test.f b/examples/smpi/NAS/MPI_dummy/test.f new file mode 100644 index 0000000000..081c73c72f --- /dev/null +++ b/examples/smpi/NAS/MPI_dummy/test.f @@ -0,0 +1,10 @@ + program + implicit none + double precision t, mpi_wtime + external mpi_wtime + t = 0.0 + t = mpi_wtime() + print *, t + t = mpi_wtime() + print *, t + end diff --git a/examples/smpi/NAS/MPI_dummy/wtime.c b/examples/smpi/NAS/MPI_dummy/wtime.c new file mode 100644 index 0000000000..221d2225ae --- /dev/null +++ b/examples/smpi/NAS/MPI_dummy/wtime.c @@ -0,0 +1,13 @@ +#include "wtime.h" +#include + +void wtime(double *t) +{ + static int sec = -1; + struct timeval tv; + gettimeofday(&tv, (void *)0); + if (sec < 0) sec = tv.tv_sec; + *t = (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec; +} + + diff --git a/examples/smpi/NAS/MPI_dummy/wtime.f b/examples/smpi/NAS/MPI_dummy/wtime.f new file mode 100644 index 0000000000..a1cfde9aa3 --- /dev/null +++ b/examples/smpi/NAS/MPI_dummy/wtime.f @@ -0,0 +1,12 @@ + subroutine wtime(tim) + real*8 tim + dimension tarray(2) + call etime(tarray) + tim = tarray(1) + return + end + + + + + diff --git a/examples/smpi/NAS/MPI_dummy/wtime.h b/examples/smpi/NAS/MPI_dummy/wtime.h new file mode 100644 index 0000000000..12eb0cb0ee --- /dev/null +++ b/examples/smpi/NAS/MPI_dummy/wtime.h @@ -0,0 +1,12 @@ +/* C/Fortran interface is different on different machines. + * You may need to tweak this. + */ + + +#if defined(IBM) +#define wtime wtime +#elif defined(CRAY) +#define wtime WTIME +#else +#define wtime wtime_ +#endif diff --git a/examples/smpi/NAS/MPI_dummy/wtime_sgi64.c b/examples/smpi/NAS/MPI_dummy/wtime_sgi64.c new file mode 100644 index 0000000000..d08d50cd34 --- /dev/null +++ b/examples/smpi/NAS/MPI_dummy/wtime_sgi64.c @@ -0,0 +1,74 @@ +#include +#include +#include +#include +#include +#include +#include + +/* The following works on SGI Power Challenge systems */ + +typedef unsigned long iotimer_t; + +unsigned int cycleval; +volatile iotimer_t *iotimer_addr, base_counter; +double resolution; + +/* address_t is an integer type big enough to hold an address */ +typedef unsigned long address_t; + + + +void timer_init() +{ + + int fd; + char *virt_addr; + address_t phys_addr, page_offset, pagemask, pagebase_addr; + + pagemask = getpagesize() - 1; + errno = 0; + phys_addr = syssgi(SGI_QUERY_CYCLECNTR, &cycleval); + if (errno != 0) { + perror("SGI_QUERY_CYCLECNTR"); + exit(1); + } + /* rel_addr = page offset of physical address */ + page_offset = phys_addr & pagemask; + pagebase_addr = phys_addr - page_offset; + fd = open("/dev/mmem", O_RDONLY); + + virt_addr = mmap(0, pagemask, PROT_READ, MAP_PRIVATE, fd, pagebase_addr); + virt_addr = virt_addr + page_offset; + iotimer_addr = (iotimer_t *)virt_addr; + /* cycleval in picoseconds to this gives resolution in seconds */ + resolution = 1.0e-12*cycleval; + base_counter = *iotimer_addr; +} + +void wtime_(double *time) +{ + static int initialized = 0; + volatile iotimer_t counter_value; + if (!initialized) { + timer_init(); + initialized = 1; + } + counter_value = *iotimer_addr - base_counter; + *time = (double)counter_value * resolution; +} + + +void wtime(double *time) +{ + static int initialized = 0; + volatile iotimer_t counter_value; + if (!initialized) { + timer_init(); + initialized = 1; + } + counter_value = *iotimer_addr - base_counter; + *time = (double)counter_value * resolution; +} + + diff --git a/examples/smpi/NAS/Makefile b/examples/smpi/NAS/Makefile new file mode 100644 index 0000000000..8f356aa71d --- /dev/null +++ b/examples/smpi/NAS/Makefile @@ -0,0 +1,69 @@ +SHELL=/bin/sh +CLASS=U +NPROCS=1 +SUBTYPE= +VERSION= +SFILE=config/suite.def + +default: header + @ sys/print_instructions + +BT: bt +bt: header + cd BT; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) SUBTYPE=$(SUBTYPE) VERSION=$(VERSION) + +SP: sp +sp: header + cd SP; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) + +LU: lu +lu: header + cd LU; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) VERSION=$(VERSION) + +MG: mg +mg: header + cd MG; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) + +FT: ft +ft: header + cd FT; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) + +IS: is +is: header + cd IS; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) + +CG: cg +cg: header + cd CG; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) + +EP: ep +ep: header + cd EP; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) + +DT: dt +dt: header + cd DT; $(MAKE) CLASS=$(CLASS) + +# Awk script courtesy cmg@cray.com, modified by Haoqiang Jin +suite: + @ awk -f sys/suite.awk SMAKE=$(MAKE) $(SFILE) | $(SHELL) + + +# It would be nice to make clean in each subdirectory (the targets +# are defined) but on a really clean system this will won't work +# because those makefiles need config/make.def +clean: + - rm -f core + - rm -f *~ */core */*~ */*.o */npbparams.h */*.obj */*.exe + - rm -f MPI_dummy/test MPI_dummy/libmpi.a + - rm -f sys/setparams sys/makesuite sys/setparams.h + - rm -f btio.*.out* + +veryclean: clean + - rm -f config/make.def config/suite.def + - rm -f bin/sp.* bin/lu.* bin/mg.* bin/ft.* bin/bt.* bin/is.* + - rm -f bin/ep.* bin/cg.* bin/dt.* + +header: + @ sys/print_header + diff --git a/examples/smpi/NAS/README b/examples/smpi/NAS/README new file mode 100644 index 0000000000..a80f5d6d70 --- /dev/null +++ b/examples/smpi/NAS/README @@ -0,0 +1,52 @@ +*** Warning *** + +This version of benchmarks IS, DT and EP contain special +tweaks to work with SMPI. + + + +The MPI implementation of NPB 3.3 (NPB3.3-MPI) +-------------------------------------------------- + +For problem reports and suggestions on the implementation, +please contact: + + NAS Parallel Benchmark Team + npb@nas.nasa.gov + + http://www.nas.nasa.gov/Software/NPB + + +This directory contains the MPI implementation of the NAS +Parallel Benchmarks, Version 3.3 (NPB3.3-MPI). A brief +summary of the new features introduced in this version is +given below. + +For changes from different versions, see the Changes.log file +included in the upper directory of this distribution. + +For explanation of compilation and running of the benchmarks, +please refer to README.install. For a special note on DT, please +see the README file in the DT subdirectory. + + +New features in NPB3.3-MPI: + * NPB3.3-MPI introduces a new problem size (class E) to seven of + the benchmarks (BT, SP, LU, CG, MG, FT, and EP). The version + also includes a new problem size (class D) for the IS benchmark, + which was not present in the previous releases. + + * The release is merged with the vector codes for the BT and LU + benchmarks, which can be selected with the VERSION=VEC option + during compilation. However, it should be noted that successful + vectorization highly depends on the compiler used. Some changes + to compiler directives for vectorization in the current codes + (see *_vec.f files) may be required. + + * New improvements to BTIO (BT with IO subtypes): + - added I/O stats (I/O timing, data size written, I/O data rate) + - added an option for interleaving reads between writes through + the inputbt.data file. Although the data file size would be + smaller as a result, the total amount of data written is still + the same. + diff --git a/examples/smpi/NAS/README.install b/examples/smpi/NAS/README.install new file mode 100644 index 0000000000..47a3139562 --- /dev/null +++ b/examples/smpi/NAS/README.install @@ -0,0 +1,156 @@ +Some explanations on the MPI implementation of NPB 3.3 (NPB3.3-MPI) +---------------------------------------------------------------------- + +NPB-MPI is a sample MPI implementation based on NPB2.4 and NPB3.0-SER. +This implementation contains all eight original benchmarks: +Seven in Fortran: BT, SP, LU, FT, CG, MG, and EP; one in C: IS, +as well as the DT benchmark, written in C, introduced in NPB3.2-MPI. + +For changes from different versions, see the Changes.log file +included in the upper directory of this distribution. + +This version has been tested, among others, on an SGI Origin3000 and +an SGI Altix. For problem reports and suggestions on the implementation, +please contact + + NAS Parallel Benchmark Team + npb@nas.nasa.gov + + +CAUTION ********************************* +When running the I/O benchmark, one or more data files will be written +in the directory from which the executable is invoked. They are not +deleted at the end of the program. A new run will overwrite the old +file(s). If not enough space is available in the user partition, the +program will fail. For classes C and D the disk space required is +3 GB and 135 GB, respectively. +***************************************** + + +1. Compilation + + NPB3-MPI uses the same directory tree as NPB3-SER (and NPB2.x) does. + Before compilation, one needs to check the configuration file + 'make.def' in the config directory and modify the file if necessary. + If it does not (yet) exist, copy 'make.def.template' or one of the + sample files in the NAS.samples subdirectory to 'make.def' and + edit the content for site- and machine-specific data. Then + + make NPROCS= CLASS= \ + [SUBTYPE=] [VERSION=VEC] + + where is "bt", "cg", "dt", "ep", "ft", "is", + "lu", "mg", or "sp" + is the number of processes + is "S", "W", "A", "B", "C", "D", or "E" + + Classes C, D and E are not available for DT. + Class E is not available for IS. + + The "VERSION=VEC" option is used for selecting the vectorized + versions of BT and LU. + + Only when making the I/O benchmark: + is "bt" + , as above + is "full", "simple", "fortran", or "epio" + + Three parameters not used in the original BT benchmark are present in + the I/O benchmark. Two are set by default in the file BT/bt.f. + Changing them is optional. + One is set in make.def. It must be specified. + + bt.f: collbuf_nodes: number of processes used to buffer data before + writing to file in the collective buffering mode + ( is "full"). + collbuf_size: size of buffer (in bytes) per process used in + collective buffering + + make.def: -DFORTRAN_REC_SIZE: Fortran I/O record length in bytes. This + is a system-specific value. It is part of the + definition string of variable CONVERTFLAG. Syntax: + "CONVERTFLAG = -DFORTRAN_REC_SIZE=n", where n is + the record length. + + When is "full" or "simple", the code must be linked with an + MPI library that contains the subset of IO routines defined in MPI 2. + + + Class D for IS (Integer Sort) requires a compiler/system that + supports the "long" type in C to be 64-bit. As examples, the SGI + MIPS compiler for the SGI Origin using the "-64" compilation flag and + the Intel compiler for IA64 are known to work. + + + The above procedure allows you to build one benchmark + at a time. To build a whole suite, you can type "make suite" + Make will look in file "config/suite.def" for a list of + executables to build. The file contains one line per specification, + with comments preceded by "#". Each line contains the name + of a benchmark, the class, and the number of processors, separated + by spaces or tabs. config/suite.def.template contains an example + of such a file. + + + The benchmarks have been designed so that they can be run + on a single processor without an MPI library. A few "dummy" + MPI routines are still required for linking. For convenience + such a library is supplied in the "MPI_dummy" subdirectory of + the distribution. It contains an mpif.h and mpi.f include files + which must be used as well. The dummy library is built and + linked automatically and paths to the include files are defined + by inserting the line "include ../config/make.dummy" into the + make.def file (see example in make.def.template). Make sure to + read the warnings in the README file in "MPI_dummy".The use of + the library is fragile and can produce unexpected errors. + + + ================================ + + The "RAND" variable in make.def + -------------------------------- + + Most of the NPBs use a random number generator. In two of the NPBs (FT + and EP) the computation of random numbers is included in the timed + part of the calculation, and it is important that the random number + generator be efficient. The default random number generator package + provided is called "randi8" and should be used where possible. It has + the following requirements: + + randi8: + 1. Uses integer*8 arithmetic. Compiler must support integer*8 + 2. Uses the Fortran 90 IAND intrinsic. Compiler must support IAND. + 3. Assumes overflow bits are discarded by the hardware. In particular, + that the lowest 46 bits of a*b are always correct, even if the + result a*b is larger than 2^64. + + Since randi8 may not work on all machines, we supply the following + alternatives: + + randi8_safe + 1. Uses integer*8 arithmetic + 2. Uses the Fortran 90 IBITS intrinsic. + 3. Does not make any assumptions about overflow. Should always + work correctly if compiler supports integer*8 and IBITS. + + randdp + 1. Uses double precision arithmetic (to simulate integer*8 operations). + Should work with any system with support for 64-bit floating + point arithmetic. + + randdpvec + 1. Similar to randdp but written to be easier to vectorize. + + +2. Execution + + The executable is named ..[.], + where is "fortran_io", "mpi_io_simple", "ep_io", or + "mpi_io_full" + The executable is placed in the bin subdirectory (or in the directory + BINDIR specified in make.def, if you've defined it). The method for + running the MPI program depends on your local system. + When any of the I/O benchmarks is run (non-empty subtype), one or + more output files are created, and placed in the directory from which + the program was started. These are not removed automatically, and + will be overwritten the next time an IO benchmark is run. diff --git a/examples/smpi/NAS/SP/Makefile b/examples/smpi/NAS/SP/Makefile new file mode 100644 index 0000000000..01508aa935 --- /dev/null +++ b/examples/smpi/NAS/SP/Makefile @@ -0,0 +1,60 @@ +SHELL=/bin/sh +BENCHMARK=sp +BENCHMARKU=SP + +include ../config/make.def + + +OBJS = sp.o make_set.o initialize.o exact_solution.o exact_rhs.o \ + set_constants.o adi.o define.o copy_faces.o rhs.o \ + lhsx.o lhsy.o lhsz.o x_solve.o ninvr.o y_solve.o pinvr.o \ + z_solve.o tzetar.o add.o txinvr.o error.o verify.o setup_mpi.o \ + ${COMMON}/print_results.o ${COMMON}/timers.o + +include ../sys/make.common + +# npbparams.h is included by header.h +# The following rule should do the trick but many make programs (not gmake) +# will do the wrong thing and rebuild the world every time (because the +# mod time on header.h is not changed. One solution would be to +# touch header.h but this might cause confusion if someone has +# accidentally deleted it. Instead, make the dependency on npbparams.h +# explicit in all the lines below (even though dependence is indirect). + +# header.h: npbparams.h + +${PROGRAM}: config ${OBJS} + ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB} + +.f.o: + ${FCOMPILE} $< + +sp.o: sp.f header.h npbparams.h mpinpb.h +make_set.o: make_set.f header.h npbparams.h mpinpb.h +initialize.o: initialize.f header.h npbparams.h +exact_solution.o: exact_solution.f header.h npbparams.h +exact_rhs.o: exact_rhs.f header.h npbparams.h +set_constants.o: set_constants.f header.h npbparams.h +adi.o: adi.f header.h npbparams.h +define.o: define.f header.h npbparams.h +copy_faces.o: copy_faces.f header.h npbparams.h mpinpb.h +rhs.o: rhs.f header.h npbparams.h +lhsx.o: lhsx.f header.h npbparams.h +lhsy.o: lhsy.f header.h npbparams.h +lhsz.o: lhsz.f header.h npbparams.h +x_solve.o: x_solve.f header.h npbparams.h mpinpb.h +ninvr.o: ninvr.f header.h npbparams.h +y_solve.o: y_solve.f header.h npbparams.h mpinpb.h +pinvr.o: pinvr.f header.h npbparams.h +z_solve.o: z_solve.f header.h npbparams.h mpinpb.h +tzetar.o: tzetar.f header.h npbparams.h +add.o: add.f header.h npbparams.h +txinvr.o: txinvr.f header.h npbparams.h +error.o: error.f header.h npbparams.h mpinpb.h +verify.o: verify.f header.h npbparams.h mpinpb.h +setup_mpi.o: setup_mpi.f mpinpb.h npbparams.h + + +clean: + - rm -f *.o *~ mputil* + - rm -f npbparams.h core diff --git a/examples/smpi/NAS/SP/README b/examples/smpi/NAS/SP/README new file mode 100644 index 0000000000..fe423db43f --- /dev/null +++ b/examples/smpi/NAS/SP/README @@ -0,0 +1,17 @@ + +This code implements a 3D Multi-partition algorithm for the solution +of the uncoupled systems of linear equations resulting from +Beam-Warming approximate factorization. Consequently, the program +must be run on a square number of processors. The included file +"npbparams.h" contains a parameter statement which sets "maxcells" +and "problem_size". The parameter maxcells must be set to the +square root of the number of processors. For example, if running +on 25 processors, then set max_cells=5. The standard problem sizes +are problem_size=64 for class A, 102 for class B, and 162 for class C. + +The number of time steps and the time step size dt are set in the +npbparams.h but may be overridden in the input deck "inputsp.data". +The number of time steps is 400 for all three +standard problems, and the appropriate time step sizes "dt" are +0.0015d0 for class A, 0.001d0 for class B, and 0.00067 for class C. + diff --git a/examples/smpi/NAS/SP/add.f b/examples/smpi/NAS/SP/add.f new file mode 100644 index 0000000000..cdc4765cbf --- /dev/null +++ b/examples/smpi/NAS/SP/add.f @@ -0,0 +1,31 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine add + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c addition of update to the vector u +c--------------------------------------------------------------------- + + include 'header.h' + + integer c, i, j, k, m + + do c = 1, ncells + do m = 1, 5 + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + u(i,j,k,m,c) = u(i,j,k,m,c) + rhs(i,j,k,m,c) + end do + end do + end do + end do + end do + + return + end diff --git a/examples/smpi/NAS/SP/adi.f b/examples/smpi/NAS/SP/adi.f new file mode 100644 index 0000000000..e55cfd60da --- /dev/null +++ b/examples/smpi/NAS/SP/adi.f @@ -0,0 +1,24 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine adi + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + call copy_faces + + call txinvr + + call x_solve + + call y_solve + + call z_solve + + call add + + return + end + diff --git a/examples/smpi/NAS/SP/copy_faces.f b/examples/smpi/NAS/SP/copy_faces.f new file mode 100644 index 0000000000..41824d2198 --- /dev/null +++ b/examples/smpi/NAS/SP/copy_faces.f @@ -0,0 +1,306 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine copy_faces + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c this function copies the face values of a variable defined on a set +c of cells to the overlap locations of the adjacent sets of cells. +c Because a set of cells interfaces in each direction with exactly one +c other set, we only need to fill six different buffers. We could try to +c overlap communication with computation, by computing +c some internal values while communicating boundary values, but this +c adds so much overhead that it's not clearly useful. +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer i, j, k, c, m, requests(0:11), p0, p1, + > p2, p3, p4, p5, b_size(0:5), ss(0:5), + > sr(0:5), error, statuses(MPI_STATUS_SIZE, 0:11) + +c--------------------------------------------------------------------- +c exit immediately if there are no faces to be copied +c--------------------------------------------------------------------- + if (no_nodes .eq. 1) then + call compute_rhs + return + endif + + + ss(0) = start_send_east + ss(1) = start_send_west + ss(2) = start_send_north + ss(3) = start_send_south + ss(4) = start_send_top + ss(5) = start_send_bottom + + sr(0) = start_recv_east + sr(1) = start_recv_west + sr(2) = start_recv_north + sr(3) = start_recv_south + sr(4) = start_recv_top + sr(5) = start_recv_bottom + + b_size(0) = east_size + b_size(1) = west_size + b_size(2) = north_size + b_size(3) = south_size + b_size(4) = top_size + b_size(5) = bottom_size + +c--------------------------------------------------------------------- +c because the difference stencil for the diagonalized scheme is +c orthogonal, we do not have to perform the staged copying of faces, +c but can send all face information simultaneously to the neighboring +c cells in all directions +c--------------------------------------------------------------------- + p0 = 0 + p1 = 0 + p2 = 0 + p3 = 0 + p4 = 0 + p5 = 0 + + do c = 1, ncells + do m = 1, 5 + +c--------------------------------------------------------------------- +c fill the buffer to be sent to eastern neighbors (i-dir) +c--------------------------------------------------------------------- + if (cell_coord(1,c) .ne. ncells) then + do k = 0, cell_size(3,c)-1 + do j = 0, cell_size(2,c)-1 + do i = cell_size(1,c)-2, cell_size(1,c)-1 + out_buffer(ss(0)+p0) = u(i,j,k,m,c) + p0 = p0 + 1 + end do + end do + end do + endif + +c--------------------------------------------------------------------- +c fill the buffer to be sent to western neighbors +c--------------------------------------------------------------------- + if (cell_coord(1,c) .ne. 1) then + do k = 0, cell_size(3,c)-1 + do j = 0, cell_size(2,c)-1 + do i = 0, 1 + out_buffer(ss(1)+p1) = u(i,j,k,m,c) + p1 = p1 + 1 + end do + end do + end do + + + endif + +c--------------------------------------------------------------------- +c fill the buffer to be sent to northern neighbors (j_dir) +c--------------------------------------------------------------------- + if (cell_coord(2,c) .ne. ncells) then + do k = 0, cell_size(3,c)-1 + do j = cell_size(2,c)-2, cell_size(2,c)-1 + do i = 0, cell_size(1,c)-1 + out_buffer(ss(2)+p2) = u(i,j,k,m,c) + p2 = p2 + 1 + end do + end do + end do + endif + +c--------------------------------------------------------------------- +c fill the buffer to be sent to southern neighbors +c--------------------------------------------------------------------- + if (cell_coord(2,c).ne. 1) then + do k = 0, cell_size(3,c)-1 + do j = 0, 1 + do i = 0, cell_size(1,c)-1 + out_buffer(ss(3)+p3) = u(i,j,k,m,c) + p3 = p3 + 1 + end do + end do + end do + endif + +c--------------------------------------------------------------------- +c fill the buffer to be sent to top neighbors (k-dir) +c--------------------------------------------------------------------- + if (cell_coord(3,c) .ne. ncells) then + do k = cell_size(3,c)-2, cell_size(3,c)-1 + do j = 0, cell_size(2,c)-1 + do i = 0, cell_size(1,c)-1 + out_buffer(ss(4)+p4) = u(i,j,k,m,c) + p4 = p4 + 1 + end do + end do + end do + endif + +c--------------------------------------------------------------------- +c fill the buffer to be sent to bottom neighbors +c--------------------------------------------------------------------- + if (cell_coord(3,c).ne. 1) then + do k=0, 1 + do j = 0, cell_size(2,c)-1 + do i = 0, cell_size(1,c)-1 + out_buffer(ss(5)+p5) = u(i,j,k,m,c) + p5 = p5 + 1 + end do + end do + end do + endif + +c--------------------------------------------------------------------- +c m loop +c--------------------------------------------------------------------- + end do + +c--------------------------------------------------------------------- +c cell loop +c--------------------------------------------------------------------- + end do + + call mpi_irecv(in_buffer(sr(0)), b_size(0), + > dp_type, successor(1), WEST, + > comm_rhs, requests(0), error) + call mpi_irecv(in_buffer(sr(1)), b_size(1), + > dp_type, predecessor(1), EAST, + > comm_rhs, requests(1), error) + call mpi_irecv(in_buffer(sr(2)), b_size(2), + > dp_type, successor(2), SOUTH, + > comm_rhs, requests(2), error) + call mpi_irecv(in_buffer(sr(3)), b_size(3), + > dp_type, predecessor(2), NORTH, + > comm_rhs, requests(3), error) + call mpi_irecv(in_buffer(sr(4)), b_size(4), + > dp_type, successor(3), BOTTOM, + > comm_rhs, requests(4), error) + call mpi_irecv(in_buffer(sr(5)), b_size(5), + > dp_type, predecessor(3), TOP, + > comm_rhs, requests(5), error) + + call mpi_isend(out_buffer(ss(0)), b_size(0), + > dp_type, successor(1), EAST, + > comm_rhs, requests(6), error) + call mpi_isend(out_buffer(ss(1)), b_size(1), + > dp_type, predecessor(1), WEST, + > comm_rhs, requests(7), error) + call mpi_isend(out_buffer(ss(2)), b_size(2), + > dp_type,successor(2), NORTH, + > comm_rhs, requests(8), error) + call mpi_isend(out_buffer(ss(3)), b_size(3), + > dp_type,predecessor(2), SOUTH, + > comm_rhs, requests(9), error) + call mpi_isend(out_buffer(ss(4)), b_size(4), + > dp_type,successor(3), TOP, + > comm_rhs, requests(10), error) + call mpi_isend(out_buffer(ss(5)), b_size(5), + > dp_type,predecessor(3), BOTTOM, + > comm_rhs,requests(11), error) + + + call mpi_waitall(12, requests, statuses, error) + +c--------------------------------------------------------------------- +c unpack the data that has just been received; +c--------------------------------------------------------------------- + p0 = 0 + p1 = 0 + p2 = 0 + p3 = 0 + p4 = 0 + p5 = 0 + + do c = 1, ncells + do m = 1, 5 + + if (cell_coord(1,c) .ne. 1) then + do k = 0, cell_size(3,c)-1 + do j = 0, cell_size(2,c)-1 + do i = -2, -1 + u(i,j,k,m,c) = in_buffer(sr(1)+p0) + p0 = p0 + 1 + end do + end do + end do + endif + + if (cell_coord(1,c) .ne. ncells) then + do k = 0, cell_size(3,c)-1 + do j = 0, cell_size(2,c)-1 + do i = cell_size(1,c), cell_size(1,c)+1 + u(i,j,k,m,c) = in_buffer(sr(0)+p1) + p1 = p1 + 1 + end do + end do + end do + end if + + if (cell_coord(2,c) .ne. 1) then + do k = 0, cell_size(3,c)-1 + do j = -2, -1 + do i = 0, cell_size(1,c)-1 + u(i,j,k,m,c) = in_buffer(sr(3)+p2) + p2 = p2 + 1 + end do + end do + end do + + endif + + if (cell_coord(2,c) .ne. ncells) then + do k = 0, cell_size(3,c)-1 + do j = cell_size(2,c), cell_size(2,c)+1 + do i = 0, cell_size(1,c)-1 + u(i,j,k,m,c) = in_buffer(sr(2)+p3) + p3 = p3 + 1 + end do + end do + end do + endif + + if (cell_coord(3,c) .ne. 1) then + do k = -2, -1 + do j = 0, cell_size(2,c)-1 + do i = 0, cell_size(1,c)-1 + u(i,j,k,m,c) = in_buffer(sr(5)+p4) + p4 = p4 + 1 + end do + end do + end do + endif + + if (cell_coord(3,c) .ne. ncells) then + do k = cell_size(3,c), cell_size(3,c)+1 + do j = 0, cell_size(2,c)-1 + do i = 0, cell_size(1,c)-1 + u(i,j,k,m,c) = in_buffer(sr(4)+p5) + p5 = p5 + 1 + end do + end do + end do + endif + +c--------------------------------------------------------------------- +c m loop +c--------------------------------------------------------------------- + end do + +c--------------------------------------------------------------------- +c cells loop +c--------------------------------------------------------------------- + end do + +c--------------------------------------------------------------------- +c now that we have all the data, compute the rhs +c--------------------------------------------------------------------- + call compute_rhs + + return + end diff --git a/examples/smpi/NAS/SP/define.f b/examples/smpi/NAS/SP/define.f new file mode 100644 index 0000000000..c465533f9a --- /dev/null +++ b/examples/smpi/NAS/SP/define.f @@ -0,0 +1,66 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine compute_buffer_size(dim) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'header.h' + + integer c, dim, face_size + + if (ncells .eq. 1) return + +c--------------------------------------------------------------------- +c compute the actual sizes of the buffers; note that there is +c always one cell face that doesn't need buffer space, because it +c is at the boundary of the grid +c--------------------------------------------------------------------- + + west_size = 0 + east_size = 0 + + do c = 1, ncells + face_size = cell_size(2,c) * cell_size(3,c) * dim * 2 + if (cell_coord(1,c).ne.1) west_size = west_size + face_size + if (cell_coord(1,c).ne.ncells) east_size = east_size + + > face_size + end do + + north_size = 0 + south_size = 0 + do c = 1, ncells + face_size = cell_size(1,c)*cell_size(3,c) * dim * 2 + if (cell_coord(2,c).ne.1) south_size = south_size + face_size + if (cell_coord(2,c).ne.ncells) north_size = north_size + + > face_size + end do + + top_size = 0 + bottom_size = 0 + do c = 1, ncells + face_size = cell_size(1,c) * cell_size(2,c) * dim * 2 + if (cell_coord(3,c).ne.1) bottom_size = bottom_size + + > face_size + if (cell_coord(3,c).ne.ncells) top_size = top_size + + > face_size + end do + + start_send_west = 1 + start_send_east = start_send_west + west_size + start_send_south = start_send_east + east_size + start_send_north = start_send_south + south_size + start_send_bottom = start_send_north + north_size + start_send_top = start_send_bottom + bottom_size + start_recv_west = 1 + start_recv_east = start_recv_west + west_size + start_recv_south = start_recv_east + east_size + start_recv_north = start_recv_south + south_size + start_recv_bottom = start_recv_north + north_size + start_recv_top = start_recv_bottom + bottom_size + + return + end + diff --git a/examples/smpi/NAS/SP/error.f b/examples/smpi/NAS/SP/error.f new file mode 100644 index 0000000000..fd9aab37b3 --- /dev/null +++ b/examples/smpi/NAS/SP/error.f @@ -0,0 +1,105 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine error_norm(rms) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c this function computes the norm of the difference between the +c computed solution and the exact solution +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer c, i, j, k, m, ii, jj, kk, d, error + double precision xi, eta, zeta, u_exact(5), rms(5), rms_work(5), + > add + + do m = 1, 5 + rms_work(m) = 0.0d0 + end do + + do c = 1, ncells + kk = 0 + do k = cell_low(3,c), cell_high(3,c) + zeta = dble(k) * dnzm1 + jj = 0 + do j = cell_low(2,c), cell_high(2,c) + eta = dble(j) * dnym1 + ii = 0 + do i = cell_low(1,c), cell_high(1,c) + xi = dble(i) * dnxm1 + call exact_solution(xi, eta, zeta, u_exact) + + do m = 1, 5 + add = u(ii,jj,kk,m,c)-u_exact(m) + rms_work(m) = rms_work(m) + add*add + end do + ii = ii + 1 + end do + jj = jj + 1 + end do + kk = kk + 1 + end do + end do + + call mpi_allreduce(rms_work, rms, 5, dp_type, + > MPI_SUM, comm_setup, error) + + do m = 1, 5 + do d = 1, 3 + rms(m) = rms(m) / dble(grid_points(d)-2) + end do + rms(m) = dsqrt(rms(m)) + end do + + return + end + + + + subroutine rhs_norm(rms) + + include 'header.h' + include 'mpinpb.h' + + integer c, i, j, k, d, m, error + double precision rms(5), rms_work(5), add + + do m = 1, 5 + rms_work(m) = 0.0d0 + end do + + do c = 1, ncells + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + do m = 1, 5 + add = rhs(i,j,k,m,c) + rms_work(m) = rms_work(m) + add*add + end do + end do + end do + end do + end do + + + + call mpi_allreduce(rms_work, rms, 5, dp_type, + > MPI_SUM, comm_setup, error) + + do m = 1, 5 + do d = 1, 3 + rms(m) = rms(m) / dble(grid_points(d)-2) + end do + rms(m) = dsqrt(rms(m)) + end do + + return + end + + diff --git a/examples/smpi/NAS/SP/exact_rhs.f b/examples/smpi/NAS/SP/exact_rhs.f new file mode 100644 index 0000000000..b589668126 --- /dev/null +++ b/examples/smpi/NAS/SP/exact_rhs.f @@ -0,0 +1,363 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine exact_rhs + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c compute the right hand side based on exact solution +c--------------------------------------------------------------------- + + include 'header.h' + + double precision dtemp(5), xi, eta, zeta, dtpp + integer c, m, i, j, k, ip1, im1, jp1, + > jm1, km1, kp1 + +c--------------------------------------------------------------------- +c loop over all cells owned by this node +c--------------------------------------------------------------------- + do c = 1, ncells + +c--------------------------------------------------------------------- +c initialize +c--------------------------------------------------------------------- + do m = 1, 5 + do k= 0, cell_size(3,c)-1 + do j = 0, cell_size(2,c)-1 + do i = 0, cell_size(1,c)-1 + forcing(i,j,k,m,c) = 0.0d0 + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c xi-direction flux differences +c--------------------------------------------------------------------- + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + zeta = dble(k+cell_low(3,c)) * dnzm1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + eta = dble(j+cell_low(2,c)) * dnym1 + + do i=-2*(1-start(1,c)), cell_size(1,c)+1-2*end(1,c) + xi = dble(i+cell_low(1,c)) * dnxm1 + + call exact_solution(xi, eta, zeta, dtemp) + do m = 1, 5 + ue(i,m) = dtemp(m) + end do + + dtpp = 1.0d0 / dtemp(1) + + do m = 2, 5 + buf(i,m) = dtpp * dtemp(m) + end do + + cuf(i) = buf(i,2) * buf(i,2) + buf(i,1) = cuf(i) + buf(i,3) * buf(i,3) + + > buf(i,4) * buf(i,4) + q(i) = 0.5d0*(buf(i,2)*ue(i,2) + buf(i,3)*ue(i,3) + + > buf(i,4)*ue(i,4)) + + end do + + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + im1 = i-1 + ip1 = i+1 + + forcing(i,j,k,1,c) = forcing(i,j,k,1,c) - + > tx2*( ue(ip1,2)-ue(im1,2) )+ + > dx1tx1*(ue(ip1,1)-2.0d0*ue(i,1)+ue(im1,1)) + + forcing(i,j,k,2,c) = forcing(i,j,k,2,c) - tx2 * ( + > (ue(ip1,2)*buf(ip1,2)+c2*(ue(ip1,5)-q(ip1)))- + > (ue(im1,2)*buf(im1,2)+c2*(ue(im1,5)-q(im1))))+ + > xxcon1*(buf(ip1,2)-2.0d0*buf(i,2)+buf(im1,2))+ + > dx2tx1*( ue(ip1,2)-2.0d0* ue(i,2)+ue(im1,2)) + + forcing(i,j,k,3,c) = forcing(i,j,k,3,c) - tx2 * ( + > ue(ip1,3)*buf(ip1,2)-ue(im1,3)*buf(im1,2))+ + > xxcon2*(buf(ip1,3)-2.0d0*buf(i,3)+buf(im1,3))+ + > dx3tx1*( ue(ip1,3)-2.0d0*ue(i,3) +ue(im1,3)) + + forcing(i,j,k,4,c) = forcing(i,j,k,4,c) - tx2*( + > ue(ip1,4)*buf(ip1,2)-ue(im1,4)*buf(im1,2))+ + > xxcon2*(buf(ip1,4)-2.0d0*buf(i,4)+buf(im1,4))+ + > dx4tx1*( ue(ip1,4)-2.0d0* ue(i,4)+ ue(im1,4)) + + forcing(i,j,k,5,c) = forcing(i,j,k,5,c) - tx2*( + > buf(ip1,2)*(c1*ue(ip1,5)-c2*q(ip1))- + > buf(im1,2)*(c1*ue(im1,5)-c2*q(im1)))+ + > 0.5d0*xxcon3*(buf(ip1,1)-2.0d0*buf(i,1)+ + > buf(im1,1))+ + > xxcon4*(cuf(ip1)-2.0d0*cuf(i)+cuf(im1))+ + > xxcon5*(buf(ip1,5)-2.0d0*buf(i,5)+buf(im1,5))+ + > dx5tx1*( ue(ip1,5)-2.0d0* ue(i,5)+ ue(im1,5)) + end do + +c--------------------------------------------------------------------- +c Fourth-order dissipation +c--------------------------------------------------------------------- + if (start(1,c) .gt. 0) then + do m = 1, 5 + i = 1 + forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * + > (5.0d0*ue(i,m) - 4.0d0*ue(i+1,m) +ue(i+2,m)) + i = 2 + forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * + > (-4.0d0*ue(i-1,m) + 6.0d0*ue(i,m) - + > 4.0d0*ue(i+1,m) + ue(i+2,m)) + end do + endif + + do m = 1, 5 + do i = start(1,c)*3, cell_size(1,c)-3*end(1,c)-1 + forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp* + > (ue(i-2,m) - 4.0d0*ue(i-1,m) + + > 6.0d0*ue(i,m) - 4.0d0*ue(i+1,m) + ue(i+2,m)) + end do + end do + + if (end(1,c) .gt. 0) then + do m = 1, 5 + i = cell_size(1,c)-3 + forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * + > (ue(i-2,m) - 4.0d0*ue(i-1,m) + + > 6.0d0*ue(i,m) - 4.0d0*ue(i+1,m)) + i = cell_size(1,c)-2 + forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * + > (ue(i-2,m) - 4.0d0*ue(i-1,m) + 5.0d0*ue(i,m)) + end do + endif + + end do + end do +c--------------------------------------------------------------------- +c eta-direction flux differences +c--------------------------------------------------------------------- + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + zeta = dble(k+cell_low(3,c)) * dnzm1 + do i=start(1,c), cell_size(1,c)-end(1,c)-1 + xi = dble(i+cell_low(1,c)) * dnxm1 + + do j=-2*(1-start(2,c)), cell_size(2,c)+1-2*end(2,c) + eta = dble(j+cell_low(2,c)) * dnym1 + + call exact_solution(xi, eta, zeta, dtemp) + do m = 1, 5 + ue(j,m) = dtemp(m) + end do + dtpp = 1.0d0/dtemp(1) + + do m = 2, 5 + buf(j,m) = dtpp * dtemp(m) + end do + + cuf(j) = buf(j,3) * buf(j,3) + buf(j,1) = cuf(j) + buf(j,2) * buf(j,2) + + > buf(j,4) * buf(j,4) + q(j) = 0.5d0*(buf(j,2)*ue(j,2) + buf(j,3)*ue(j,3) + + > buf(j,4)*ue(j,4)) + end do + + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + jm1 = j-1 + jp1 = j+1 + + forcing(i,j,k,1,c) = forcing(i,j,k,1,c) - + > ty2*( ue(jp1,3)-ue(jm1,3) )+ + > dy1ty1*(ue(jp1,1)-2.0d0*ue(j,1)+ue(jm1,1)) + + forcing(i,j,k,2,c) = forcing(i,j,k,2,c) - ty2*( + > ue(jp1,2)*buf(jp1,3)-ue(jm1,2)*buf(jm1,3))+ + > yycon2*(buf(jp1,2)-2.0d0*buf(j,2)+buf(jm1,2))+ + > dy2ty1*( ue(jp1,2)-2.0* ue(j,2)+ ue(jm1,2)) + + forcing(i,j,k,3,c) = forcing(i,j,k,3,c) - ty2*( + > (ue(jp1,3)*buf(jp1,3)+c2*(ue(jp1,5)-q(jp1)))- + > (ue(jm1,3)*buf(jm1,3)+c2*(ue(jm1,5)-q(jm1))))+ + > yycon1*(buf(jp1,3)-2.0d0*buf(j,3)+buf(jm1,3))+ + > dy3ty1*( ue(jp1,3)-2.0d0*ue(j,3) +ue(jm1,3)) + + forcing(i,j,k,4,c) = forcing(i,j,k,4,c) - ty2*( + > ue(jp1,4)*buf(jp1,3)-ue(jm1,4)*buf(jm1,3))+ + > yycon2*(buf(jp1,4)-2.0d0*buf(j,4)+buf(jm1,4))+ + > dy4ty1*( ue(jp1,4)-2.0d0*ue(j,4)+ ue(jm1,4)) + + forcing(i,j,k,5,c) = forcing(i,j,k,5,c) - ty2*( + > buf(jp1,3)*(c1*ue(jp1,5)-c2*q(jp1))- + > buf(jm1,3)*(c1*ue(jm1,5)-c2*q(jm1)))+ + > 0.5d0*yycon3*(buf(jp1,1)-2.0d0*buf(j,1)+ + > buf(jm1,1))+ + > yycon4*(cuf(jp1)-2.0d0*cuf(j)+cuf(jm1))+ + > yycon5*(buf(jp1,5)-2.0d0*buf(j,5)+buf(jm1,5))+ + > dy5ty1*(ue(jp1,5)-2.0d0*ue(j,5)+ue(jm1,5)) + end do + +c--------------------------------------------------------------------- +c Fourth-order dissipation +c--------------------------------------------------------------------- + if (start(2,c) .gt. 0) then + do m = 1, 5 + j = 1 + forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * + > (5.0d0*ue(j,m) - 4.0d0*ue(j+1,m) +ue(j+2,m)) + j = 2 + forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * + > (-4.0d0*ue(j-1,m) + 6.0d0*ue(j,m) - + > 4.0d0*ue(j+1,m) + ue(j+2,m)) + end do + endif + + do m = 1, 5 + do j = start(2,c)*3, cell_size(2,c)-3*end(2,c)-1 + forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp* + > (ue(j-2,m) - 4.0d0*ue(j-1,m) + + > 6.0d0*ue(j,m) - 4.0d0*ue(j+1,m) + ue(j+2,m)) + end do + end do + if (end(2,c) .gt. 0) then + do m = 1, 5 + j = cell_size(2,c)-3 + forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * + > (ue(j-2,m) - 4.0d0*ue(j-1,m) + + > 6.0d0*ue(j,m) - 4.0d0*ue(j+1,m)) + j = cell_size(2,c)-2 + forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * + > (ue(j-2,m) - 4.0d0*ue(j-1,m) + 5.0d0*ue(j,m)) + + end do + endif + + end do + end do + +c--------------------------------------------------------------------- +c zeta-direction flux differences +c--------------------------------------------------------------------- + do j=start(2,c), cell_size(2,c)-end(2,c)-1 + eta = dble(j+cell_low(2,c)) * dnym1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + xi = dble(i+cell_low(1,c)) * dnxm1 + + do k=-2*(1-start(3,c)), cell_size(3,c)+1-2*end(3,c) + zeta = dble(k+cell_low(3,c)) * dnzm1 + + call exact_solution(xi, eta, zeta, dtemp) + do m = 1, 5 + ue(k,m) = dtemp(m) + end do + + dtpp = 1.0d0/dtemp(1) + + do m = 2, 5 + buf(k,m) = dtpp * dtemp(m) + end do + + cuf(k) = buf(k,4) * buf(k,4) + buf(k,1) = cuf(k) + buf(k,2) * buf(k,2) + + > buf(k,3) * buf(k,3) + q(k) = 0.5d0*(buf(k,2)*ue(k,2) + buf(k,3)*ue(k,3) + + > buf(k,4)*ue(k,4)) + end do + + do k=start(3,c), cell_size(3,c)-end(3,c)-1 + km1 = k-1 + kp1 = k+1 + + forcing(i,j,k,1,c) = forcing(i,j,k,1,c) - + > tz2*( ue(kp1,4)-ue(km1,4) )+ + > dz1tz1*(ue(kp1,1)-2.0d0*ue(k,1)+ue(km1,1)) + + forcing(i,j,k,2,c) = forcing(i,j,k,2,c) - tz2 * ( + > ue(kp1,2)*buf(kp1,4)-ue(km1,2)*buf(km1,4))+ + > zzcon2*(buf(kp1,2)-2.0d0*buf(k,2)+buf(km1,2))+ + > dz2tz1*( ue(kp1,2)-2.0d0* ue(k,2)+ ue(km1,2)) + + forcing(i,j,k,3,c) = forcing(i,j,k,3,c) - tz2 * ( + > ue(kp1,3)*buf(kp1,4)-ue(km1,3)*buf(km1,4))+ + > zzcon2*(buf(kp1,3)-2.0d0*buf(k,3)+buf(km1,3))+ + > dz3tz1*(ue(kp1,3)-2.0d0*ue(k,3)+ue(km1,3)) + + forcing(i,j,k,4,c) = forcing(i,j,k,4,c) - tz2 * ( + > (ue(kp1,4)*buf(kp1,4)+c2*(ue(kp1,5)-q(kp1)))- + > (ue(km1,4)*buf(km1,4)+c2*(ue(km1,5)-q(km1))))+ + > zzcon1*(buf(kp1,4)-2.0d0*buf(k,4)+buf(km1,4))+ + > dz4tz1*( ue(kp1,4)-2.0d0*ue(k,4) +ue(km1,4)) + + forcing(i,j,k,5,c) = forcing(i,j,k,5,c) - tz2 * ( + > buf(kp1,4)*(c1*ue(kp1,5)-c2*q(kp1))- + > buf(km1,4)*(c1*ue(km1,5)-c2*q(km1)))+ + > 0.5d0*zzcon3*(buf(kp1,1)-2.0d0*buf(k,1) + > +buf(km1,1))+ + > zzcon4*(cuf(kp1)-2.0d0*cuf(k)+cuf(km1))+ + > zzcon5*(buf(kp1,5)-2.0d0*buf(k,5)+buf(km1,5))+ + > dz5tz1*( ue(kp1,5)-2.0d0*ue(k,5)+ ue(km1,5)) + end do + +c--------------------------------------------------------------------- +c Fourth-order dissipation +c--------------------------------------------------------------------- + if (start(3,c) .gt. 0) then + do m = 1, 5 + k = 1 + forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * + > (5.0d0*ue(k,m) - 4.0d0*ue(k+1,m) +ue(k+2,m)) + k = 2 + forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * + > (-4.0d0*ue(k-1,m) + 6.0d0*ue(k,m) - + > 4.0d0*ue(k+1,m) + ue(k+2,m)) + end do + endif + + do m = 1, 5 + do k = start(3,c)*3, cell_size(3,c)-3*end(3,c)-1 + forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp* + > (ue(k-2,m) - 4.0d0*ue(k-1,m) + + > 6.0d0*ue(k,m) - 4.0d0*ue(k+1,m) + ue(k+2,m)) + end do + end do + + if (end(3,c) .gt. 0) then + do m = 1, 5 + k = cell_size(3,c)-3 + forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * + > (ue(k-2,m) - 4.0d0*ue(k-1,m) + + > 6.0d0*ue(k,m) - 4.0d0*ue(k+1,m)) + k = cell_size(3,c)-2 + forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * + > (ue(k-2,m) - 4.0d0*ue(k-1,m) + 5.0d0*ue(k,m)) + end do + endif + + end do + end do +c--------------------------------------------------------------------- +c now change the sign of the forcing function, +c--------------------------------------------------------------------- + do m = 1, 5 + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + forcing(i,j,k,m,c) = -1.d0 * forcing(i,j,k,m,c) + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c cell loop +c--------------------------------------------------------------------- + end do + + return + end + + + + + diff --git a/examples/smpi/NAS/SP/exact_solution.f b/examples/smpi/NAS/SP/exact_solution.f new file mode 100644 index 0000000000..2644f0b8f9 --- /dev/null +++ b/examples/smpi/NAS/SP/exact_solution.f @@ -0,0 +1,30 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine exact_solution(xi,eta,zeta,dtemp) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c this function returns the exact solution at point xi, eta, zeta +c--------------------------------------------------------------------- + + include 'header.h' + + double precision xi, eta, zeta, dtemp(5) + integer m + + do m = 1, 5 + dtemp(m) = ce(m,1) + + > xi*(ce(m,2) + xi*(ce(m,5) + xi*(ce(m,8) + xi*ce(m,11)))) + + > eta*(ce(m,3) + eta*(ce(m,6) + eta*(ce(m,9) + eta*ce(m,12))))+ + > zeta*(ce(m,4) + zeta*(ce(m,7) + zeta*(ce(m,10) + + > zeta*ce(m,13)))) + end do + + return + end + + diff --git a/examples/smpi/NAS/SP/header.h b/examples/smpi/NAS/SP/header.h new file mode 100644 index 0000000000..663515a104 --- /dev/null +++ b/examples/smpi/NAS/SP/header.h @@ -0,0 +1,113 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + +c--------------------------------------------------------------------- +c The following include file is generated automatically by the +c "setparams" utility. It defines +c maxcells: the square root of the maximum number of processors +c problem_size: 12, 64, 102, 162 (for class T, A, B, C) +c dt_default: default time step for this problem size if no +c config file +c niter_default: default number of iterations for this problem size +c--------------------------------------------------------------------- + + include 'npbparams.h' + + integer ncells, grid_points(3) + common /global/ ncells, grid_points + + double precision tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, + > dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, + > dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, + > ce(5,13), dxmax, dymax, dzmax, xxcon1, xxcon2, + > xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1, + > dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4, + > yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1, + > zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, + > dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, + > dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, + > c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt, + > dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, + > c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, + > c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16 + + common /constants/ tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, + > dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, + > dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, + > ce, dxmax, dymax, dzmax, xxcon1, xxcon2, + > xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1, + > dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4, + > yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1, + > zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, + > dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, + > dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, + > c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt, + > dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, + > c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, + > c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16 + + integer EAST, WEST, NORTH, SOUTH, + > BOTTOM, TOP + + parameter (EAST=2000, WEST=3000, NORTH=4000, SOUTH=5000, + > BOTTOM=6000, TOP=7000) + + integer cell_coord (3,maxcells), cell_low (3,maxcells), + > cell_high (3,maxcells), cell_size(3,maxcells), + > predecessor(3), slice (3,maxcells), + > grid_size (3), successor(3), + > start (3,maxcells), end (3,maxcells) + common /partition/ cell_coord, cell_low, cell_high, cell_size, + > grid_size, successor, predecessor, slice, + > start, end + + integer IMAX, JMAX, KMAX, MAX_CELL_DIM, BUF_SIZE, IMAXP, JMAXP + + parameter (MAX_CELL_DIM = (problem_size/maxcells)+1) + + parameter (IMAX=MAX_CELL_DIM,JMAX=MAX_CELL_DIM,KMAX=MAX_CELL_DIM) + parameter (IMAXP=IMAX/2*2+1,JMAXP=JMAX/2*2+1) + +c--------------------------------------------------------------------- +c +1 at end to avoid zero length arrays for 1 node +c--------------------------------------------------------------------- + parameter (BUF_SIZE=MAX_CELL_DIM*MAX_CELL_DIM*(maxcells-1)*60*2+1) + + double precision + > u (-2:IMAXP+1,-2:JMAXP+1,-2:KMAX+1, 5,maxcells), + > us (-1:IMAX, -1:JMAX, -1:KMAX, maxcells), + > vs (-1:IMAX, -1:JMAX, -1:KMAX, maxcells), + > ws (-1:IMAX, -1:JMAX, -1:KMAX, maxcells), + > qs (-1:IMAX, -1:JMAX, -1:KMAX, maxcells), + > ainv (-1:IMAX, -1:JMAX, -1:KMAX, maxcells), + > rho_i (-1:IMAX, -1:JMAX, -1:KMAX, maxcells), + > speed (-1:IMAX, -1:JMAX, -1:KMAX, maxcells), + > square (-1:IMAX, -1:JMAX, -1:KMAX, maxcells), + > rhs ( 0:IMAXP-1, 0:JMAXP-1, 0:KMAX-1, 5,maxcells), + > forcing ( 0:IMAXP-1, 0:JMAXP-1, 0:KMAX-1, 5,maxcells), + > lhs ( 0:IMAXP-1, 0:JMAXP-1, 0:KMAX-1,15,maxcells), + > in_buffer(BUF_SIZE), out_buffer(BUF_SIZE) + common /fields/ u, us, vs, ws, qs, ainv, rho_i, speed, square, + > rhs, forcing, lhs, in_buffer, out_buffer + + double precision cv(-2:MAX_CELL_DIM+1), rhon(-2:MAX_CELL_DIM+1), + > rhos(-2:MAX_CELL_DIM+1), rhoq(-2:MAX_CELL_DIM+1), + > cuf(-2:MAX_CELL_DIM+1), q(-2:MAX_CELL_DIM+1), + > ue(-2:MAX_CELL_DIM+1,5), buf(-2:MAX_CELL_DIM+1,5) + common /work_1d/ cv, rhon, rhos, rhoq, cuf, q, ue, buf + + integer west_size, east_size, bottom_size, top_size, + > north_size, south_size, start_send_west, + > start_send_east, start_send_south, start_send_north, + > start_send_bottom, start_send_top, start_recv_west, + > start_recv_east, start_recv_south, start_recv_north, + > start_recv_bottom, start_recv_top + common /box/ west_size, east_size, bottom_size, + > top_size, north_size, south_size, + > start_send_west, start_send_east, start_send_south, + > start_send_north, start_send_bottom, start_send_top, + > start_recv_west, start_recv_east, start_recv_south, + > start_recv_north, start_recv_bottom, start_recv_top diff --git a/examples/smpi/NAS/SP/initialize.f b/examples/smpi/NAS/SP/initialize.f new file mode 100644 index 0000000000..655c8d9369 --- /dev/null +++ b/examples/smpi/NAS/SP/initialize.f @@ -0,0 +1,286 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine initialize + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c This subroutine initializes the field variable u using +c tri-linear transfinite interpolation of the boundary values +c--------------------------------------------------------------------- + + include 'header.h' + + integer c, i, j, k, m, ii, jj, kk, ix, iy, iz + double precision xi, eta, zeta, Pface(5,3,2), Pxi, Peta, + > Pzeta, temp(5) + + +c--------------------------------------------------------------------- +c Later (in compute_rhs) we compute 1/u for every element. A few of +c the corner elements are not used, but it convenient (and faster) +c to compute the whole thing with a simple loop. Make sure those +c values are nonzero by initializing the whole thing here. +c--------------------------------------------------------------------- + do c = 1, ncells + do kk = -1, IMAX + do jj = -1, IMAX + do ii = -1, IMAX + u(ii, jj, kk, 1, c) = 1.0 + u(ii, jj, kk, 2, c) = 0.0 + u(ii, jj, kk, 3, c) = 0.0 + u(ii, jj, kk, 4, c) = 0.0 + u(ii, jj, kk, 5, c) = 1.0 + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c first store the "interpolated" values everywhere on the grid +c--------------------------------------------------------------------- + do c=1, ncells + kk = 0 + do k = cell_low(3,c), cell_high(3,c) + zeta = dble(k) * dnzm1 + jj = 0 + do j = cell_low(2,c), cell_high(2,c) + eta = dble(j) * dnym1 + ii = 0 + do i = cell_low(1,c), cell_high(1,c) + xi = dble(i) * dnxm1 + + do ix = 1, 2 + call exact_solution(dble(ix-1), eta, zeta, + > Pface(1,1,ix)) + end do + + do iy = 1, 2 + call exact_solution(xi, dble(iy-1) , zeta, + > Pface(1,2,iy)) + end do + + do iz = 1, 2 + call exact_solution(xi, eta, dble(iz-1), + > Pface(1,3,iz)) + end do + + do m = 1, 5 + Pxi = xi * Pface(m,1,2) + + > (1.0d0-xi) * Pface(m,1,1) + Peta = eta * Pface(m,2,2) + + > (1.0d0-eta) * Pface(m,2,1) + Pzeta = zeta * Pface(m,3,2) + + > (1.0d0-zeta) * Pface(m,3,1) + + u(ii,jj,kk,m,c) = Pxi + Peta + Pzeta - + > Pxi*Peta - Pxi*Pzeta - Peta*Pzeta + + > Pxi*Peta*Pzeta + + end do + ii = ii + 1 + end do + jj = jj + 1 + end do + kk = kk+1 + end do + end do + +c--------------------------------------------------------------------- +c now store the exact values on the boundaries +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c west face +c--------------------------------------------------------------------- + c = slice(1,1) + ii = 0 + xi = 0.0d0 + kk = 0 + do k = cell_low(3,c), cell_high(3,c) + zeta = dble(k) * dnzm1 + jj = 0 + do j = cell_low(2,c), cell_high(2,c) + eta = dble(j) * dnym1 + call exact_solution(xi, eta, zeta, temp) + do m = 1, 5 + u(ii,jj,kk,m,c) = temp(m) + end do + jj = jj + 1 + end do + kk = kk + 1 + end do + +c--------------------------------------------------------------------- +c east face +c--------------------------------------------------------------------- + c = slice(1,ncells) + ii = cell_size(1,c)-1 + xi = 1.0d0 + kk = 0 + do k = cell_low(3,c), cell_high(3,c) + zeta = dble(k) * dnzm1 + jj = 0 + do j = cell_low(2,c), cell_high(2,c) + eta = dble(j) * dnym1 + call exact_solution(xi, eta, zeta, temp) + do m = 1, 5 + u(ii,jj,kk,m,c) = temp(m) + end do + jj = jj + 1 + end do + kk = kk + 1 + end do + +c--------------------------------------------------------------------- +c south face +c--------------------------------------------------------------------- + c = slice(2,1) + jj = 0 + eta = 0.0d0 + kk = 0 + do k = cell_low(3,c), cell_high(3,c) + zeta = dble(k) * dnzm1 + ii = 0 + do i = cell_low(1,c), cell_high(1,c) + xi = dble(i) * dnxm1 + call exact_solution(xi, eta, zeta, temp) + do m = 1, 5 + u(ii,jj,kk,m,c) = temp(m) + end do + ii = ii + 1 + end do + kk = kk + 1 + end do + + +c--------------------------------------------------------------------- +c north face +c--------------------------------------------------------------------- + c = slice(2,ncells) + jj = cell_size(2,c)-1 + eta = 1.0d0 + kk = 0 + do k = cell_low(3,c), cell_high(3,c) + zeta = dble(k) * dnzm1 + ii = 0 + do i = cell_low(1,c), cell_high(1,c) + xi = dble(i) * dnxm1 + call exact_solution(xi, eta, zeta, temp) + do m = 1, 5 + u(ii,jj,kk,m,c) = temp(m) + end do + ii = ii + 1 + end do + kk = kk + 1 + end do + +c--------------------------------------------------------------------- +c bottom face +c--------------------------------------------------------------------- + c = slice(3,1) + kk = 0 + zeta = 0.0d0 + jj = 0 + do j = cell_low(2,c), cell_high(2,c) + eta = dble(j) * dnym1 + ii = 0 + do i =cell_low(1,c), cell_high(1,c) + xi = dble(i) *dnxm1 + call exact_solution(xi, eta, zeta, temp) + do m = 1, 5 + u(ii,jj,kk,m,c) = temp(m) + end do + ii = ii + 1 + end do + jj = jj + 1 + end do + +c--------------------------------------------------------------------- +c top face +c--------------------------------------------------------------------- + c = slice(3,ncells) + kk = cell_size(3,c)-1 + zeta = 1.0d0 + jj = 0 + do j = cell_low(2,c), cell_high(2,c) + eta = dble(j) * dnym1 + ii = 0 + do i =cell_low(1,c), cell_high(1,c) + xi = dble(i) * dnxm1 + call exact_solution(xi, eta, zeta, temp) + do m = 1, 5 + u(ii,jj,kk,m,c) = temp(m) + end do + ii = ii + 1 + end do + jj = jj + 1 + end do + + return + end + + + subroutine lhsinit + + include 'header.h' + + integer i, j, k, d, c, n + +c--------------------------------------------------------------------- +c loop over all cells +c--------------------------------------------------------------------- + do c = 1, ncells + +c--------------------------------------------------------------------- +c first, initialize the start and end arrays +c--------------------------------------------------------------------- + do d = 1, 3 + if (cell_coord(d,c) .eq. 1) then + start(d,c) = 1 + else + start(d,c) = 0 + endif + if (cell_coord(d,c) .eq. ncells) then + end(d,c) = 1 + else + end(d,c) = 0 + endif + end do + +c--------------------------------------------------------------------- +c zap the whole left hand side for starters +c--------------------------------------------------------------------- + do n = 1, 15 + do k = 0, cell_size(3,c)-1 + do j = 0, cell_size(2,c)-1 + do i = 0, cell_size(1,c)-1 + lhs(i,j,k,n,c) = 0.0d0 + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c next, set all diagonal values to 1. This is overkill, but convenient +c--------------------------------------------------------------------- + do n = 1, 3 + do k = 0, cell_size(3,c)-1 + do j = 0, cell_size(2,c)-1 + do i = 0, cell_size(1,c)-1 + lhs(i,j,k,5*n-2,c) = 1.0d0 + end do + end do + end do + end do + + end do + + return + end + + + diff --git a/examples/smpi/NAS/SP/inputsp.data.sample b/examples/smpi/NAS/SP/inputsp.data.sample new file mode 100644 index 0000000000..ae3801fdb7 --- /dev/null +++ b/examples/smpi/NAS/SP/inputsp.data.sample @@ -0,0 +1,3 @@ +400 number of time steps +0.0015d0 dt for class A = 0.0015d0. class B = 0.001d0 class C = 0.00067d0 +64 64 64 diff --git a/examples/smpi/NAS/SP/lhsx.f b/examples/smpi/NAS/SP/lhsx.f new file mode 100644 index 0000000000..cae7779122 --- /dev/null +++ b/examples/smpi/NAS/SP/lhsx.f @@ -0,0 +1,124 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine lhsx(c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c This function computes the left hand side for the three x-factors +c--------------------------------------------------------------------- + + include 'header.h' + + double precision ru1 + integer i, j, k, c + + +c--------------------------------------------------------------------- +c treat only cell c +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c first fill the lhs for the u-eigenvalue +c--------------------------------------------------------------------- + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c)-1, cell_size(1,c)-end(1,c) + ru1 = c3c4*rho_i(i,j,k,c) + cv(i) = us(i,j,k,c) + rhon(i) = dmax1(dx2+con43*ru1, + > dx5+c1c5*ru1, + > dxmax+ru1, + > dx1) + end do + + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + lhs(i,j,k,1,c) = 0.0d0 + lhs(i,j,k,2,c) = - dttx2 * cv(i-1) - dttx1 * rhon(i-1) + lhs(i,j,k,3,c) = 1.0d0 + c2dttx1 * rhon(i) + lhs(i,j,k,4,c) = dttx2 * cv(i+1) - dttx1 * rhon(i+1) + lhs(i,j,k,5,c) = 0.0d0 + end do + end do + end do + +c--------------------------------------------------------------------- +c add fourth order dissipation +c--------------------------------------------------------------------- + if (start(1,c) .gt. 0) then + i = 1 + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz5 + lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4 + lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1 + + lhs(i+1,j,k,2,c) = lhs(i+1,j,k,2,c) - comz4 + lhs(i+1,j,k,3,c) = lhs(i+1,j,k,3,c) + comz6 + lhs(i+1,j,k,4,c) = lhs(i+1,j,k,4,c) - comz4 + lhs(i+1,j,k,5,c) = lhs(i+1,j,k,5,c) + comz1 + end do + end do + endif + + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i=3*start(1,c), cell_size(1,c)-3*end(1,c)-1 + lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1 + lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4 + lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6 + lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4 + lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1 + end do + end do + end do + + if (end(1,c) .gt. 0) then + i = cell_size(1,c)-3 + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1 + lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4 + lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6 + lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4 + + lhs(i+1,j,k,1,c) = lhs(i+1,j,k,1,c) + comz1 + lhs(i+1,j,k,2,c) = lhs(i+1,j,k,2,c) - comz4 + lhs(i+1,j,k,3,c) = lhs(i+1,j,k,3,c) + comz5 + end do + end do + endif + +c--------------------------------------------------------------------- +c subsequently, fill the other factors (u+c), (u-c) by a4ing to +c the first +c--------------------------------------------------------------------- + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + lhs(i,j,k,1+5,c) = lhs(i,j,k,1,c) + lhs(i,j,k,2+5,c) = lhs(i,j,k,2,c) - + > dttx2 * speed(i-1,j,k,c) + lhs(i,j,k,3+5,c) = lhs(i,j,k,3,c) + lhs(i,j,k,4+5,c) = lhs(i,j,k,4,c) + + > dttx2 * speed(i+1,j,k,c) + lhs(i,j,k,5+5,c) = lhs(i,j,k,5,c) + lhs(i,j,k,1+10,c) = lhs(i,j,k,1,c) + lhs(i,j,k,2+10,c) = lhs(i,j,k,2,c) + + > dttx2 * speed(i-1,j,k,c) + lhs(i,j,k,3+10,c) = lhs(i,j,k,3,c) + lhs(i,j,k,4+10,c) = lhs(i,j,k,4,c) - + > dttx2 * speed(i+1,j,k,c) + lhs(i,j,k,5+10,c) = lhs(i,j,k,5,c) + end do + end do + end do + + return + end + + + diff --git a/examples/smpi/NAS/SP/lhsy.f b/examples/smpi/NAS/SP/lhsy.f new file mode 100644 index 0000000000..9c07a35538 --- /dev/null +++ b/examples/smpi/NAS/SP/lhsy.f @@ -0,0 +1,125 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine lhsy(c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c This function computes the left hand side for the three y-factors +c--------------------------------------------------------------------- + + include 'header.h' + + double precision ru1 + integer i, j, k, c + +c--------------------------------------------------------------------- +c treat only cell c +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c first fill the lhs for the u-eigenvalue +c--------------------------------------------------------------------- + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + + do j = start(2,c)-1, cell_size(2,c)-end(2,c) + ru1 = c3c4*rho_i(i,j,k,c) + cv(j) = vs(i,j,k,c) + rhoq(j) = dmax1( dy3 + con43 * ru1, + > dy5 + c1c5*ru1, + > dymax + ru1, + > dy1) + end do + + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + lhs(i,j,k,1,c) = 0.0d0 + lhs(i,j,k,2,c) = -dtty2 * cv(j-1) - dtty1 * rhoq(j-1) + lhs(i,j,k,3,c) = 1.0 + c2dtty1 * rhoq(j) + lhs(i,j,k,4,c) = dtty2 * cv(j+1) - dtty1 * rhoq(j+1) + lhs(i,j,k,5,c) = 0.0d0 + end do + end do + end do + +c--------------------------------------------------------------------- +c add fourth order dissipation +c--------------------------------------------------------------------- + if (start(2,c) .gt. 0) then + j = 1 + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + + lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz5 + lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4 + lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1 + + lhs(i,j+1,k,2,c) = lhs(i,j+1,k,2,c) - comz4 + lhs(i,j+1,k,3,c) = lhs(i,j+1,k,3,c) + comz6 + lhs(i,j+1,k,4,c) = lhs(i,j+1,k,4,c) - comz4 + lhs(i,j+1,k,5,c) = lhs(i,j+1,k,5,c) + comz1 + end do + end do + endif + + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j=3*start(2,c), cell_size(2,c)-3*end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + + lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1 + lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4 + lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6 + lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4 + lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1 + end do + end do + end do + + if (end(2,c) .gt. 0) then + j = cell_size(2,c)-3 + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1 + lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4 + lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6 + lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4 + + lhs(i,j+1,k,1,c) = lhs(i,j+1,k,1,c) + comz1 + lhs(i,j+1,k,2,c) = lhs(i,j+1,k,2,c) - comz4 + lhs(i,j+1,k,3,c) = lhs(i,j+1,k,3,c) + comz5 + end do + end do + endif + +c--------------------------------------------------------------------- +c subsequently, do the other two factors +c--------------------------------------------------------------------- + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + lhs(i,j,k,1+5,c) = lhs(i,j,k,1,c) + lhs(i,j,k,2+5,c) = lhs(i,j,k,2,c) - + > dtty2 * speed(i,j-1,k,c) + lhs(i,j,k,3+5,c) = lhs(i,j,k,3,c) + lhs(i,j,k,4+5,c) = lhs(i,j,k,4,c) + + > dtty2 * speed(i,j+1,k,c) + lhs(i,j,k,5+5,c) = lhs(i,j,k,5,c) + lhs(i,j,k,1+10,c) = lhs(i,j,k,1,c) + lhs(i,j,k,2+10,c) = lhs(i,j,k,2,c) + + > dtty2 * speed(i,j-1,k,c) + lhs(i,j,k,3+10,c) = lhs(i,j,k,3,c) + lhs(i,j,k,4+10,c) = lhs(i,j,k,4,c) - + > dtty2 * speed(i,j+1,k,c) + lhs(i,j,k,5+10,c) = lhs(i,j,k,5,c) + end do + end do + end do + + return + end + + + diff --git a/examples/smpi/NAS/SP/lhsz.f b/examples/smpi/NAS/SP/lhsz.f new file mode 100644 index 0000000000..08ea0bc24d --- /dev/null +++ b/examples/smpi/NAS/SP/lhsz.f @@ -0,0 +1,123 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine lhsz(c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c This function computes the left hand side for the three z-factors +c--------------------------------------------------------------------- + + include 'header.h' + + double precision ru1 + integer i, j, k, c + +c--------------------------------------------------------------------- +c treat only cell c +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c first fill the lhs for the u-eigenvalue +c--------------------------------------------------------------------- + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + + do k = start(3,c)-1, cell_size(3,c)-end(3,c) + ru1 = c3c4*rho_i(i,j,k,c) + cv(k) = ws(i,j,k,c) + rhos(k) = dmax1(dz4 + con43 * ru1, + > dz5 + c1c5 * ru1, + > dzmax + ru1, + > dz1) + end do + + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + lhs(i,j,k,1,c) = 0.0d0 + lhs(i,j,k,2,c) = -dttz2 * cv(k-1) - dttz1 * rhos(k-1) + lhs(i,j,k,3,c) = 1.0 + c2dttz1 * rhos(k) + lhs(i,j,k,4,c) = dttz2 * cv(k+1) - dttz1 * rhos(k+1) + lhs(i,j,k,5,c) = 0.0d0 + end do + end do + end do + +c--------------------------------------------------------------------- +c add fourth order dissipation +c--------------------------------------------------------------------- + if (start(3,c) .gt. 0) then + k = 1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz5 + lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4 + lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1 + + lhs(i,j,k+1,2,c) = lhs(i,j,k+1,2,c) - comz4 + lhs(i,j,k+1,3,c) = lhs(i,j,k+1,3,c) + comz6 + lhs(i,j,k+1,4,c) = lhs(i,j,k+1,4,c) - comz4 + lhs(i,j,k+1,5,c) = lhs(i,j,k+1,5,c) + comz1 + end do + end do + endif + + do k = 3*start(3,c), cell_size(3,c)-3*end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1 + lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4 + lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6 + lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4 + lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1 + end do + end do + end do + + if (end(3,c) .gt. 0) then + k = cell_size(3,c)-3 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1 + lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4 + lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6 + lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4 + + lhs(i,j,k+1,1,c) = lhs(i,j,k+1,1,c) + comz1 + lhs(i,j,k+1,2,c) = lhs(i,j,k+1,2,c) - comz4 + lhs(i,j,k+1,3,c) = lhs(i,j,k+1,3,c) + comz5 + end do + end do + endif + + +c--------------------------------------------------------------------- +c subsequently, fill the other factors (u+c), (u-c) +c--------------------------------------------------------------------- + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + lhs(i,j,k,1+5,c) = lhs(i,j,k,1,c) + lhs(i,j,k,2+5,c) = lhs(i,j,k,2,c) - + > dttz2 * speed(i,j,k-1,c) + lhs(i,j,k,3+5,c) = lhs(i,j,k,3,c) + lhs(i,j,k,4+5,c) = lhs(i,j,k,4,c) + + > dttz2 * speed(i,j,k+1,c) + lhs(i,j,k,5+5,c) = lhs(i,j,k,5,c) + lhs(i,j,k,1+10,c) = lhs(i,j,k,1,c) + lhs(i,j,k,2+10,c) = lhs(i,j,k,2,c) + + > dttz2 * speed(i,j,k-1,c) + lhs(i,j,k,3+10,c) = lhs(i,j,k,3,c) + lhs(i,j,k,4+10,c) = lhs(i,j,k,4,c) - + > dttz2 * speed(i,j,k+1,c) + lhs(i,j,k,5+10,c) = lhs(i,j,k,5,c) + end do + end do + end do + + return + end + + diff --git a/examples/smpi/NAS/SP/make_set.f b/examples/smpi/NAS/SP/make_set.f new file mode 100644 index 0000000000..7a84e93010 --- /dev/null +++ b/examples/smpi/NAS/SP/make_set.f @@ -0,0 +1,120 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine make_set + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c This function allocates space for a set of cells and fills the set +c such that communication between cells on different nodes is only +c nearest neighbor +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer p, i, j, c, dir, size, excess, ierr,ierrcode + +c--------------------------------------------------------------------- +c compute square root; add small number to allow for roundoff +c (note: this is computed in setup_mpi.f also, but prefer to do +c it twice because of some include file problems). +c--------------------------------------------------------------------- + ncells = dint(dsqrt(dble(no_nodes) + 0.00001d0)) + +c--------------------------------------------------------------------- +c this makes coding easier +c--------------------------------------------------------------------- + p = ncells + +c--------------------------------------------------------------------- +c determine the location of the cell at the bottom of the 3D +c array of cells +c--------------------------------------------------------------------- + cell_coord(1,1) = mod(node,p) + cell_coord(2,1) = node/p + cell_coord(3,1) = 0 + +c--------------------------------------------------------------------- +c set the cell_coords for cells in the rest of the z-layers; +c this comes down to a simple linear numbering in the z-direct- +c ion, and to the doubly-cyclic numbering in the other dirs +c--------------------------------------------------------------------- + do c=2, p + cell_coord(1,c) = mod(cell_coord(1,c-1)+1,p) + cell_coord(2,c) = mod(cell_coord(2,c-1)-1+p,p) + cell_coord(3,c) = c-1 + end do + +c--------------------------------------------------------------------- +c offset all the coordinates by 1 to adjust for Fortran arrays +c--------------------------------------------------------------------- + do dir = 1, 3 + do c = 1, p + cell_coord(dir,c) = cell_coord(dir,c) + 1 + end do + end do + +c--------------------------------------------------------------------- +c slice(dir,n) contains the sequence number of the cell that is in +c coordinate plane n in the dir direction +c--------------------------------------------------------------------- + do dir = 1, 3 + do c = 1, p + slice(dir,cell_coord(dir,c)) = c + end do + end do + + +c--------------------------------------------------------------------- +c fill the predecessor and successor entries, using the indices +c of the bottom cells (they are the same at each level of k +c anyway) acting as if full periodicity pertains; note that p is +c added to those arguments to the mod functions that might +c otherwise return wrong values when using the modulo function +c--------------------------------------------------------------------- + i = cell_coord(1,1)-1 + j = cell_coord(2,1)-1 + + predecessor(1) = mod(i-1+p,p) + p*j + predecessor(2) = i + p*mod(j-1+p,p) + predecessor(3) = mod(i+1,p) + p*mod(j-1+p,p) + successor(1) = mod(i+1,p) + p*j + successor(2) = i + p*mod(j+1,p) + successor(3) = mod(i-1+p,p) + p*mod(j+1,p) + +c--------------------------------------------------------------------- +c now compute the sizes of the cells +c--------------------------------------------------------------------- + do dir= 1, 3 +c--------------------------------------------------------------------- +c set cell_coord range for each direction +c--------------------------------------------------------------------- + size = grid_points(dir)/p + excess = mod(grid_points(dir),p) + do c=1, ncells + if (cell_coord(dir,c) .le. excess) then + cell_size(dir,c) = size+1 + cell_low(dir,c) = (cell_coord(dir,c)-1)*(size+1) + cell_high(dir,c) = cell_low(dir,c)+size + else + cell_size(dir,c) = size + cell_low(dir,c) = excess*(size+1)+ + > (cell_coord(dir,c)-excess-1)*size + cell_high(dir,c) = cell_low(dir,c)+size-1 + endif + if (cell_size(dir, c) .le. 2) then + write(*,50) + 50 format(' Error: Cell size too small. Min size is 3') + call MPI_Abort(mpi_comm_world,ierrcode,ierr) + stop + endif + end do + end do + + return + end + diff --git a/examples/smpi/NAS/SP/mpinpb.h b/examples/smpi/NAS/SP/mpinpb.h new file mode 100644 index 0000000000..439db34f60 --- /dev/null +++ b/examples/smpi/NAS/SP/mpinpb.h @@ -0,0 +1,13 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'mpif.h' + + integer node, no_nodes, total_nodes, root, comm_setup, + > comm_solve, comm_rhs, dp_type + logical active + common /mpistuff/ node, no_nodes, total_nodes, root, comm_setup, + > comm_solve, comm_rhs, dp_type, active + integer DEFAULT_TAG + parameter (DEFAULT_TAG = 0) diff --git a/examples/smpi/NAS/SP/ninvr.f b/examples/smpi/NAS/SP/ninvr.f new file mode 100644 index 0000000000..146d046e8b --- /dev/null +++ b/examples/smpi/NAS/SP/ninvr.f @@ -0,0 +1,45 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine ninvr(c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c block-diagonal matrix-vector multiplication +c--------------------------------------------------------------------- + + include 'header.h' + + integer c, i, j, k + double precision r1, r2, r3, r4, r5, t1, t2 + +c--------------------------------------------------------------------- +c treat only one cell +c--------------------------------------------------------------------- + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + + r1 = rhs(i,j,k,1,c) + r2 = rhs(i,j,k,2,c) + r3 = rhs(i,j,k,3,c) + r4 = rhs(i,j,k,4,c) + r5 = rhs(i,j,k,5,c) + + t1 = bt * r3 + t2 = 0.5d0 * ( r4 + r5 ) + + rhs(i,j,k,1,c) = -r2 + rhs(i,j,k,2,c) = r1 + rhs(i,j,k,3,c) = bt * ( r4 - r5 ) + rhs(i,j,k,4,c) = -t1 + t2 + rhs(i,j,k,5,c) = t1 + t2 + enddo + enddo + enddo + + return + end diff --git a/examples/smpi/NAS/SP/pinvr.f b/examples/smpi/NAS/SP/pinvr.f new file mode 100644 index 0000000000..060f0a57ef --- /dev/null +++ b/examples/smpi/NAS/SP/pinvr.f @@ -0,0 +1,48 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine pinvr(c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c block-diagonal matrix-vector multiplication +c--------------------------------------------------------------------- + + include 'header.h' + + integer i, j, k, c + double precision r1, r2, r3, r4, r5, t1, t2 + +c--------------------------------------------------------------------- +c treat only one cell +c--------------------------------------------------------------------- + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + + r1 = rhs(i,j,k,1,c) + r2 = rhs(i,j,k,2,c) + r3 = rhs(i,j,k,3,c) + r4 = rhs(i,j,k,4,c) + r5 = rhs(i,j,k,5,c) + + t1 = bt * r1 + t2 = 0.5d0 * ( r4 + r5 ) + + rhs(i,j,k,1,c) = bt * ( r4 - r5 ) + rhs(i,j,k,2,c) = -r3 + rhs(i,j,k,3,c) = r2 + rhs(i,j,k,4,c) = -t1 + t2 + rhs(i,j,k,5,c) = t1 + t2 + end do + end do + end do + + return + end + + + diff --git a/examples/smpi/NAS/SP/rhs.f b/examples/smpi/NAS/SP/rhs.f new file mode 100644 index 0000000000..34e562a4e3 --- /dev/null +++ b/examples/smpi/NAS/SP/rhs.f @@ -0,0 +1,446 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine compute_rhs + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'header.h' + + integer c, i, j, k, m + double precision aux, rho_inv, uijk, up1, um1, vijk, vp1, vm1, + > wijk, wp1, wm1 + + +c--------------------------------------------------------------------- +c loop over all cells owned by this node +c--------------------------------------------------------------------- + do c = 1, ncells + +c--------------------------------------------------------------------- +c compute the reciprocal of density, and the kinetic energy, +c and the speed of sound. +c--------------------------------------------------------------------- + + do k = -1, cell_size(3,c) + do j = -1, cell_size(2,c) + do i = -1, cell_size(1,c) + rho_inv = 1.0d0/u(i,j,k,1,c) + rho_i(i,j,k,c) = rho_inv + us(i,j,k,c) = u(i,j,k,2,c) * rho_inv + vs(i,j,k,c) = u(i,j,k,3,c) * rho_inv + ws(i,j,k,c) = u(i,j,k,4,c) * rho_inv + square(i,j,k,c) = 0.5d0* ( + > u(i,j,k,2,c)*u(i,j,k,2,c) + + > u(i,j,k,3,c)*u(i,j,k,3,c) + + > u(i,j,k,4,c)*u(i,j,k,4,c) ) * rho_inv + qs(i,j,k,c) = square(i,j,k,c) * rho_inv +c--------------------------------------------------------------------- +c (don't need speed and ainx until the lhs computation) +c--------------------------------------------------------------------- + aux = c1c2*rho_inv* (u(i,j,k,5,c) - square(i,j,k,c)) + aux = dsqrt(aux) + speed(i,j,k,c) = aux + ainv(i,j,k,c) = 1.0d0/aux + end do + end do + end do + +c--------------------------------------------------------------------- +c copy the exact forcing term to the right hand side; because +c this forcing term is known, we can store it on the whole of every +c cell, including the boundary +c--------------------------------------------------------------------- + + do m = 1, 5 + do k = 0, cell_size(3,c)-1 + do j = 0, cell_size(2,c)-1 + do i = 0, cell_size(1,c)-1 + rhs(i,j,k,m,c) = forcing(i,j,k,m,c) + end do + end do + end do + end do + + +c--------------------------------------------------------------------- +c compute xi-direction fluxes +c--------------------------------------------------------------------- + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + uijk = us(i,j,k,c) + up1 = us(i+1,j,k,c) + um1 = us(i-1,j,k,c) + + rhs(i,j,k,1,c) = rhs(i,j,k,1,c) + dx1tx1 * + > (u(i+1,j,k,1,c) - 2.0d0*u(i,j,k,1,c) + + > u(i-1,j,k,1,c)) - + > tx2 * (u(i+1,j,k,2,c) - u(i-1,j,k,2,c)) + + rhs(i,j,k,2,c) = rhs(i,j,k,2,c) + dx2tx1 * + > (u(i+1,j,k,2,c) - 2.0d0*u(i,j,k,2,c) + + > u(i-1,j,k,2,c)) + + > xxcon2*con43 * (up1 - 2.0d0*uijk + um1) - + > tx2 * (u(i+1,j,k,2,c)*up1 - + > u(i-1,j,k,2,c)*um1 + + > (u(i+1,j,k,5,c)- square(i+1,j,k,c)- + > u(i-1,j,k,5,c)+ square(i-1,j,k,c))* + > c2) + + rhs(i,j,k,3,c) = rhs(i,j,k,3,c) + dx3tx1 * + > (u(i+1,j,k,3,c) - 2.0d0*u(i,j,k,3,c) + + > u(i-1,j,k,3,c)) + + > xxcon2 * (vs(i+1,j,k,c) - 2.0d0*vs(i,j,k,c) + + > vs(i-1,j,k,c)) - + > tx2 * (u(i+1,j,k,3,c)*up1 - + > u(i-1,j,k,3,c)*um1) + + rhs(i,j,k,4,c) = rhs(i,j,k,4,c) + dx4tx1 * + > (u(i+1,j,k,4,c) - 2.0d0*u(i,j,k,4,c) + + > u(i-1,j,k,4,c)) + + > xxcon2 * (ws(i+1,j,k,c) - 2.0d0*ws(i,j,k,c) + + > ws(i-1,j,k,c)) - + > tx2 * (u(i+1,j,k,4,c)*up1 - + > u(i-1,j,k,4,c)*um1) + + rhs(i,j,k,5,c) = rhs(i,j,k,5,c) + dx5tx1 * + > (u(i+1,j,k,5,c) - 2.0d0*u(i,j,k,5,c) + + > u(i-1,j,k,5,c)) + + > xxcon3 * (qs(i+1,j,k,c) - 2.0d0*qs(i,j,k,c) + + > qs(i-1,j,k,c)) + + > xxcon4 * (up1*up1 - 2.0d0*uijk*uijk + + > um1*um1) + + > xxcon5 * (u(i+1,j,k,5,c)*rho_i(i+1,j,k,c) - + > 2.0d0*u(i,j,k,5,c)*rho_i(i,j,k,c) + + > u(i-1,j,k,5,c)*rho_i(i-1,j,k,c)) - + > tx2 * ( (c1*u(i+1,j,k,5,c) - + > c2*square(i+1,j,k,c))*up1 - + > (c1*u(i-1,j,k,5,c) - + > c2*square(i-1,j,k,c))*um1 ) + end do + end do + end do + +c--------------------------------------------------------------------- +c add fourth order xi-direction dissipation +c--------------------------------------------------------------------- + if (start(1,c) .gt. 0) then + i = 1 + do m = 1, 5 + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c)- dssp * + > ( 5.0d0*u(i,j,k,m,c) - 4.0d0*u(i+1,j,k,m,c) + + > u(i+2,j,k,m,c)) + end do + end do + end do + + i = 2 + do m = 1, 5 + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * + > (-4.0d0*u(i-1,j,k,m,c) + 6.0d0*u(i,j,k,m,c) - + > 4.0d0*u(i+1,j,k,m,c) + u(i+2,j,k,m,c)) + end do + end do + end do + endif + + do m = 1, 5 + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = 3*start(1,c),cell_size(1,c)-3*end(1,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * + > ( u(i-2,j,k,m,c) - 4.0d0*u(i-1,j,k,m,c) + + > 6.0*u(i,j,k,m,c) - 4.0d0*u(i+1,j,k,m,c) + + > u(i+2,j,k,m,c) ) + end do + end do + end do + end do + + + if (end(1,c) .gt. 0) then + i = cell_size(1,c)-3 + do m = 1, 5 + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * + > ( u(i-2,j,k,m,c) - 4.0d0*u(i-1,j,k,m,c) + + > 6.0d0*u(i,j,k,m,c) - 4.0d0*u(i+1,j,k,m,c) ) + end do + end do + end do + + i = cell_size(1,c)-2 + do m = 1, 5 + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * + > ( u(i-2,j,k,m,c) - 4.d0*u(i-1,j,k,m,c) + + > 5.d0*u(i,j,k,m,c) ) + end do + end do + end do + endif + +c--------------------------------------------------------------------- +c compute eta-direction fluxes +c--------------------------------------------------------------------- + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + vijk = vs(i,j,k,c) + vp1 = vs(i,j+1,k,c) + vm1 = vs(i,j-1,k,c) + rhs(i,j,k,1,c) = rhs(i,j,k,1,c) + dy1ty1 * + > (u(i,j+1,k,1,c) - 2.0d0*u(i,j,k,1,c) + + > u(i,j-1,k,1,c)) - + > ty2 * (u(i,j+1,k,3,c) - u(i,j-1,k,3,c)) + rhs(i,j,k,2,c) = rhs(i,j,k,2,c) + dy2ty1 * + > (u(i,j+1,k,2,c) - 2.0d0*u(i,j,k,2,c) + + > u(i,j-1,k,2,c)) + + > yycon2 * (us(i,j+1,k,c) - 2.0d0*us(i,j,k,c) + + > us(i,j-1,k,c)) - + > ty2 * (u(i,j+1,k,2,c)*vp1 - + > u(i,j-1,k,2,c)*vm1) + rhs(i,j,k,3,c) = rhs(i,j,k,3,c) + dy3ty1 * + > (u(i,j+1,k,3,c) - 2.0d0*u(i,j,k,3,c) + + > u(i,j-1,k,3,c)) + + > yycon2*con43 * (vp1 - 2.0d0*vijk + vm1) - + > ty2 * (u(i,j+1,k,3,c)*vp1 - + > u(i,j-1,k,3,c)*vm1 + + > (u(i,j+1,k,5,c) - square(i,j+1,k,c) - + > u(i,j-1,k,5,c) + square(i,j-1,k,c)) + > *c2) + rhs(i,j,k,4,c) = rhs(i,j,k,4,c) + dy4ty1 * + > (u(i,j+1,k,4,c) - 2.0d0*u(i,j,k,4,c) + + > u(i,j-1,k,4,c)) + + > yycon2 * (ws(i,j+1,k,c) - 2.0d0*ws(i,j,k,c) + + > ws(i,j-1,k,c)) - + > ty2 * (u(i,j+1,k,4,c)*vp1 - + > u(i,j-1,k,4,c)*vm1) + rhs(i,j,k,5,c) = rhs(i,j,k,5,c) + dy5ty1 * + > (u(i,j+1,k,5,c) - 2.0d0*u(i,j,k,5,c) + + > u(i,j-1,k,5,c)) + + > yycon3 * (qs(i,j+1,k,c) - 2.0d0*qs(i,j,k,c) + + > qs(i,j-1,k,c)) + + > yycon4 * (vp1*vp1 - 2.0d0*vijk*vijk + + > vm1*vm1) + + > yycon5 * (u(i,j+1,k,5,c)*rho_i(i,j+1,k,c) - + > 2.0d0*u(i,j,k,5,c)*rho_i(i,j,k,c) + + > u(i,j-1,k,5,c)*rho_i(i,j-1,k,c)) - + > ty2 * ((c1*u(i,j+1,k,5,c) - + > c2*square(i,j+1,k,c)) * vp1 - + > (c1*u(i,j-1,k,5,c) - + > c2*square(i,j-1,k,c)) * vm1) + end do + end do + end do + +c--------------------------------------------------------------------- +c add fourth order eta-direction dissipation +c--------------------------------------------------------------------- + if (start(2,c) .gt. 0) then + j = 1 + do m = 1, 5 + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c)- dssp * + > ( 5.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j+1,k,m,c) + + > u(i,j+2,k,m,c)) + end do + end do + end do + + j = 2 + do m = 1, 5 + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * + > (-4.0d0*u(i,j-1,k,m,c) + 6.0d0*u(i,j,k,m,c) - + > 4.0d0*u(i,j+1,k,m,c) + u(i,j+2,k,m,c)) + end do + end do + end do + endif + + do m = 1, 5 + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = 3*start(2,c), cell_size(2,c)-3*end(2,c)-1 + do i = start(1,c),cell_size(1,c)-end(1,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * + > ( u(i,j-2,k,m,c) - 4.0d0*u(i,j-1,k,m,c) + + > 6.0*u(i,j,k,m,c) - 4.0d0*u(i,j+1,k,m,c) + + > u(i,j+2,k,m,c) ) + end do + end do + end do + end do + + if (end(2,c) .gt. 0) then + j = cell_size(2,c)-3 + do m = 1, 5 + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * + > ( u(i,j-2,k,m,c) - 4.0d0*u(i,j-1,k,m,c) + + > 6.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j+1,k,m,c) ) + end do + end do + end do + + j = cell_size(2,c)-2 + do m = 1, 5 + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * + > ( u(i,j-2,k,m,c) - 4.d0*u(i,j-1,k,m,c) + + > 5.d0*u(i,j,k,m,c) ) + end do + end do + end do + endif + + +c--------------------------------------------------------------------- +c compute zeta-direction fluxes +c--------------------------------------------------------------------- + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + wijk = ws(i,j,k,c) + wp1 = ws(i,j,k+1,c) + wm1 = ws(i,j,k-1,c) + + rhs(i,j,k,1,c) = rhs(i,j,k,1,c) + dz1tz1 * + > (u(i,j,k+1,1,c) - 2.0d0*u(i,j,k,1,c) + + > u(i,j,k-1,1,c)) - + > tz2 * (u(i,j,k+1,4,c) - u(i,j,k-1,4,c)) + rhs(i,j,k,2,c) = rhs(i,j,k,2,c) + dz2tz1 * + > (u(i,j,k+1,2,c) - 2.0d0*u(i,j,k,2,c) + + > u(i,j,k-1,2,c)) + + > zzcon2 * (us(i,j,k+1,c) - 2.0d0*us(i,j,k,c) + + > us(i,j,k-1,c)) - + > tz2 * (u(i,j,k+1,2,c)*wp1 - + > u(i,j,k-1,2,c)*wm1) + rhs(i,j,k,3,c) = rhs(i,j,k,3,c) + dz3tz1 * + > (u(i,j,k+1,3,c) - 2.0d0*u(i,j,k,3,c) + + > u(i,j,k-1,3,c)) + + > zzcon2 * (vs(i,j,k+1,c) - 2.0d0*vs(i,j,k,c) + + > vs(i,j,k-1,c)) - + > tz2 * (u(i,j,k+1,3,c)*wp1 - + > u(i,j,k-1,3,c)*wm1) + rhs(i,j,k,4,c) = rhs(i,j,k,4,c) + dz4tz1 * + > (u(i,j,k+1,4,c) - 2.0d0*u(i,j,k,4,c) + + > u(i,j,k-1,4,c)) + + > zzcon2*con43 * (wp1 - 2.0d0*wijk + wm1) - + > tz2 * (u(i,j,k+1,4,c)*wp1 - + > u(i,j,k-1,4,c)*wm1 + + > (u(i,j,k+1,5,c) - square(i,j,k+1,c) - + > u(i,j,k-1,5,c) + square(i,j,k-1,c)) + > *c2) + rhs(i,j,k,5,c) = rhs(i,j,k,5,c) + dz5tz1 * + > (u(i,j,k+1,5,c) - 2.0d0*u(i,j,k,5,c) + + > u(i,j,k-1,5,c)) + + > zzcon3 * (qs(i,j,k+1,c) - 2.0d0*qs(i,j,k,c) + + > qs(i,j,k-1,c)) + + > zzcon4 * (wp1*wp1 - 2.0d0*wijk*wijk + + > wm1*wm1) + + > zzcon5 * (u(i,j,k+1,5,c)*rho_i(i,j,k+1,c) - + > 2.0d0*u(i,j,k,5,c)*rho_i(i,j,k,c) + + > u(i,j,k-1,5,c)*rho_i(i,j,k-1,c)) - + > tz2 * ( (c1*u(i,j,k+1,5,c) - + > c2*square(i,j,k+1,c))*wp1 - + > (c1*u(i,j,k-1,5,c) - + > c2*square(i,j,k-1,c))*wm1) + end do + end do + end do + +c--------------------------------------------------------------------- +c add fourth order zeta-direction dissipation +c--------------------------------------------------------------------- + if (start(3,c) .gt. 0) then + k = 1 + do m = 1, 5 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c)- dssp * + > ( 5.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j,k+1,m,c) + + > u(i,j,k+2,m,c)) + end do + end do + end do + + k = 2 + do m = 1, 5 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * + > (-4.0d0*u(i,j,k-1,m,c) + 6.0d0*u(i,j,k,m,c) - + > 4.0d0*u(i,j,k+1,m,c) + u(i,j,k+2,m,c)) + end do + end do + end do + endif + + do m = 1, 5 + do k = 3*start(3,c), cell_size(3,c)-3*end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c),cell_size(1,c)-end(1,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * + > ( u(i,j,k-2,m,c) - 4.0d0*u(i,j,k-1,m,c) + + > 6.0*u(i,j,k,m,c) - 4.0d0*u(i,j,k+1,m,c) + + > u(i,j,k+2,m,c) ) + end do + end do + end do + end do + + if (end(3,c) .gt. 0) then + k = cell_size(3,c)-3 + do m = 1, 5 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * + > ( u(i,j,k-2,m,c) - 4.0d0*u(i,j,k-1,m,c) + + > 6.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j,k+1,m,c) ) + end do + end do + end do + + k = cell_size(3,c)-2 + do m = 1, 5 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * + > ( u(i,j,k-2,m,c) - 4.d0*u(i,j,k-1,m,c) + + > 5.d0*u(i,j,k,m,c) ) + end do + end do + end do + endif + + do m = 1, 5 + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) * dt + end do + end do + end do + end do + + end do + + return + end + + + + diff --git a/examples/smpi/NAS/SP/set_constants.f b/examples/smpi/NAS/SP/set_constants.f new file mode 100644 index 0000000000..63ce72bb9b --- /dev/null +++ b/examples/smpi/NAS/SP/set_constants.f @@ -0,0 +1,203 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine set_constants + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'header.h' + + ce(1,1) = 2.0d0 + ce(1,2) = 0.0d0 + ce(1,3) = 0.0d0 + ce(1,4) = 4.0d0 + ce(1,5) = 5.0d0 + ce(1,6) = 3.0d0 + ce(1,7) = 0.5d0 + ce(1,8) = 0.02d0 + ce(1,9) = 0.01d0 + ce(1,10) = 0.03d0 + ce(1,11) = 0.5d0 + ce(1,12) = 0.4d0 + ce(1,13) = 0.3d0 + + ce(2,1) = 1.0d0 + ce(2,2) = 0.0d0 + ce(2,3) = 0.0d0 + ce(2,4) = 0.0d0 + ce(2,5) = 1.0d0 + ce(2,6) = 2.0d0 + ce(2,7) = 3.0d0 + ce(2,8) = 0.01d0 + ce(2,9) = 0.03d0 + ce(2,10) = 0.02d0 + ce(2,11) = 0.4d0 + ce(2,12) = 0.3d0 + ce(2,13) = 0.5d0 + + ce(3,1) = 2.0d0 + ce(3,2) = 2.0d0 + ce(3,3) = 0.0d0 + ce(3,4) = 0.0d0 + ce(3,5) = 0.0d0 + ce(3,6) = 2.0d0 + ce(3,7) = 3.0d0 + ce(3,8) = 0.04d0 + ce(3,9) = 0.03d0 + ce(3,10) = 0.05d0 + ce(3,11) = 0.3d0 + ce(3,12) = 0.5d0 + ce(3,13) = 0.4d0 + + ce(4,1) = 2.0d0 + ce(4,2) = 2.0d0 + ce(4,3) = 0.0d0 + ce(4,4) = 0.0d0 + ce(4,5) = 0.0d0 + ce(4,6) = 2.0d0 + ce(4,7) = 3.0d0 + ce(4,8) = 0.03d0 + ce(4,9) = 0.05d0 + ce(4,10) = 0.04d0 + ce(4,11) = 0.2d0 + ce(4,12) = 0.1d0 + ce(4,13) = 0.3d0 + + ce(5,1) = 5.0d0 + ce(5,2) = 4.0d0 + ce(5,3) = 3.0d0 + ce(5,4) = 2.0d0 + ce(5,5) = 0.1d0 + ce(5,6) = 0.4d0 + ce(5,7) = 0.3d0 + ce(5,8) = 0.05d0 + ce(5,9) = 0.04d0 + ce(5,10) = 0.03d0 + ce(5,11) = 0.1d0 + ce(5,12) = 0.3d0 + ce(5,13) = 0.2d0 + + c1 = 1.4d0 + c2 = 0.4d0 + c3 = 0.1d0 + c4 = 1.0d0 + c5 = 1.4d0 + + bt = dsqrt(0.5d0) + + dnxm1 = 1.0d0 / dble(grid_points(1)-1) + dnym1 = 1.0d0 / dble(grid_points(2)-1) + dnzm1 = 1.0d0 / dble(grid_points(3)-1) + + c1c2 = c1 * c2 + c1c5 = c1 * c5 + c3c4 = c3 * c4 + c1345 = c1c5 * c3c4 + + conz1 = (1.0d0-c1c5) + + tx1 = 1.0d0 / (dnxm1 * dnxm1) + tx2 = 1.0d0 / (2.0d0 * dnxm1) + tx3 = 1.0d0 / dnxm1 + + ty1 = 1.0d0 / (dnym1 * dnym1) + ty2 = 1.0d0 / (2.0d0 * dnym1) + ty3 = 1.0d0 / dnym1 + + tz1 = 1.0d0 / (dnzm1 * dnzm1) + tz2 = 1.0d0 / (2.0d0 * dnzm1) + tz3 = 1.0d0 / dnzm1 + + dx1 = 0.75d0 + dx2 = 0.75d0 + dx3 = 0.75d0 + dx4 = 0.75d0 + dx5 = 0.75d0 + + dy1 = 0.75d0 + dy2 = 0.75d0 + dy3 = 0.75d0 + dy4 = 0.75d0 + dy5 = 0.75d0 + + dz1 = 1.0d0 + dz2 = 1.0d0 + dz3 = 1.0d0 + dz4 = 1.0d0 + dz5 = 1.0d0 + + dxmax = dmax1(dx3, dx4) + dymax = dmax1(dy2, dy4) + dzmax = dmax1(dz2, dz3) + + dssp = 0.25d0 * dmax1(dx1, dmax1(dy1, dz1) ) + + c4dssp = 4.0d0 * dssp + c5dssp = 5.0d0 * dssp + + dttx1 = dt*tx1 + dttx2 = dt*tx2 + dtty1 = dt*ty1 + dtty2 = dt*ty2 + dttz1 = dt*tz1 + dttz2 = dt*tz2 + + c2dttx1 = 2.0d0*dttx1 + c2dtty1 = 2.0d0*dtty1 + c2dttz1 = 2.0d0*dttz1 + + dtdssp = dt*dssp + + comz1 = dtdssp + comz4 = 4.0d0*dtdssp + comz5 = 5.0d0*dtdssp + comz6 = 6.0d0*dtdssp + + c3c4tx3 = c3c4*tx3 + c3c4ty3 = c3c4*ty3 + c3c4tz3 = c3c4*tz3 + + dx1tx1 = dx1*tx1 + dx2tx1 = dx2*tx1 + dx3tx1 = dx3*tx1 + dx4tx1 = dx4*tx1 + dx5tx1 = dx5*tx1 + + dy1ty1 = dy1*ty1 + dy2ty1 = dy2*ty1 + dy3ty1 = dy3*ty1 + dy4ty1 = dy4*ty1 + dy5ty1 = dy5*ty1 + + dz1tz1 = dz1*tz1 + dz2tz1 = dz2*tz1 + dz3tz1 = dz3*tz1 + dz4tz1 = dz4*tz1 + dz5tz1 = dz5*tz1 + + c2iv = 2.5d0 + con43 = 4.0d0/3.0d0 + con16 = 1.0d0/6.0d0 + + xxcon1 = c3c4tx3*con43*tx3 + xxcon2 = c3c4tx3*tx3 + xxcon3 = c3c4tx3*conz1*tx3 + xxcon4 = c3c4tx3*con16*tx3 + xxcon5 = c3c4tx3*c1c5*tx3 + + yycon1 = c3c4ty3*con43*ty3 + yycon2 = c3c4ty3*ty3 + yycon3 = c3c4ty3*conz1*ty3 + yycon4 = c3c4ty3*con16*ty3 + yycon5 = c3c4ty3*c1c5*ty3 + + zzcon1 = c3c4tz3*con43*tz3 + zzcon2 = c3c4tz3*tz3 + zzcon3 = c3c4tz3*conz1*tz3 + zzcon4 = c3c4tz3*con16*tz3 + zzcon5 = c3c4tz3*c1c5*tz3 + + return + end diff --git a/examples/smpi/NAS/SP/setup_mpi.f b/examples/smpi/NAS/SP/setup_mpi.f new file mode 100644 index 0000000000..2d98f7dd02 --- /dev/null +++ b/examples/smpi/NAS/SP/setup_mpi.f @@ -0,0 +1,65 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine setup_mpi + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c set up MPI stuff +c--------------------------------------------------------------------- + + implicit none + include 'mpinpb.h' + include 'npbparams.h' + integer error, nc, color + + call mpi_init(error) + + call mpi_comm_size(MPI_COMM_WORLD, total_nodes, error) + call mpi_comm_rank(MPI_COMM_WORLD, node, error) + + if (.not. convertdouble) then + dp_type = MPI_DOUBLE_PRECISION + else + dp_type = MPI_REAL + endif + +c--------------------------------------------------------------------- +c compute square root; add small number to allow for roundoff +c--------------------------------------------------------------------- + nc = dint(dsqrt(dble(total_nodes) + 0.00001d0)) + +c--------------------------------------------------------------------- +c We handle a non-square number of nodes by making the excess nodes +c inactive. However, we can never handle more cells than were compiled +c in. +c--------------------------------------------------------------------- + + if (nc .gt. maxcells) nc = maxcells + + if (node .ge. nc*nc) then + active = .false. + color = 1 + else + active = .true. + color = 0 + end if + + call mpi_comm_split(MPI_COMM_WORLD,color,node,comm_setup,error) + if (.not. active) return + + call mpi_comm_size(comm_setup, no_nodes, error) + call mpi_comm_dup(comm_setup, comm_solve, error) + call mpi_comm_dup(comm_setup, comm_rhs, error) + +c--------------------------------------------------------------------- +c let node 0 be the root for the group (there is only one) +c--------------------------------------------------------------------- + root = 0 + + return + end + diff --git a/examples/smpi/NAS/SP/sp.f b/examples/smpi/NAS/SP/sp.f new file mode 100644 index 0000000000..740cadee46 --- /dev/null +++ b/examples/smpi/NAS/SP/sp.f @@ -0,0 +1,194 @@ +!-------------------------------------------------------------------------! +! ! +! N A S P A R A L L E L B E N C H M A R K S 3.3 ! +! ! +! S P ! +! ! +!-------------------------------------------------------------------------! +! ! +! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. ! +! It is described in NAS Technical Reports 95-020 and 02-007 ! +! ! +! Permission to use, copy, distribute and modify this software ! +! for any purpose with or without fee is hereby granted. We ! +! request, however, that all derived work reference the NAS ! +! Parallel Benchmarks 3.3. This software is provided "as is" ! +! without express or implied warranty. ! +! ! +! Information on NPB 3.3, including the technical report, the ! +! original specifications, source code, results and information ! +! on how to submit new results, is available at: ! +! ! +! http://www.nas.nasa.gov/Software/NPB/ ! +! ! +! Send comments or suggestions to npb@nas.nasa.gov ! +! ! +! NAS Parallel Benchmarks Group ! +! NASA Ames Research Center ! +! Mail Stop: T27A-1 ! +! Moffett Field, CA 94035-1000 ! +! ! +! E-mail: npb@nas.nasa.gov ! +! Fax: (650) 604-3957 ! +! ! +!-------------------------------------------------------------------------! + + +c--------------------------------------------------------------------- +c +c Authors: R. F. Van der Wijngaart +c W. Saphir +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- + program MPSP +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer i, niter, step, c, error, fstatus + external timer_read + double precision mflops, t, tmax, timer_read + logical verified + character class + + call setup_mpi + if (.not. active) goto 999 + +c--------------------------------------------------------------------- +c Root node reads input file (if it exists) else takes +c defaults from parameters +c--------------------------------------------------------------------- + if (node .eq. root) then + + write(*, 1000) + open (unit=2,file='inputsp.data',status='old', iostat=fstatus) +c + if (fstatus .eq. 0) then + write(*,233) + 233 format(' Reading from input file inputsp.data') + read (2,*) niter + read (2,*) dt + read (2,*) grid_points(1), grid_points(2), grid_points(3) + close(2) + else + write(*,234) + niter = niter_default + dt = dt_default + grid_points(1) = problem_size + grid_points(2) = problem_size + grid_points(3) = problem_size + endif + 234 format(' No input file inputsp.data. Using compiled defaults') + + write(*, 1001) grid_points(1), grid_points(2), grid_points(3) + write(*, 1002) niter, dt + if (no_nodes .ne. total_nodes) write(*, 1004) total_nodes + if (no_nodes .ne. maxcells*maxcells) + > write(*, 1005) maxcells*maxcells + write(*, 1003) no_nodes + + 1000 format(//,' NAS Parallel Benchmarks 3.3 -- SP Benchmark',/) + 1001 format(' Size: ', i4, 'x', i4, 'x', i4) + 1002 format(' Iterations: ', i4, ' dt: ', F11.7) + 1004 format(' Total number of processes: ', i5) + 1005 format(' WARNING: compiled for ', i5, ' processes ') + 1003 format(' Number of active processes: ', i5, /) + + endif + + call mpi_bcast(niter, 1, MPI_INTEGER, + > root, comm_setup, error) + + call mpi_bcast(dt, 1, dp_type, + > root, comm_setup, error) + + call mpi_bcast(grid_points(1), 3, MPI_INTEGER, + > root, comm_setup, error) + + + call make_set + + do c = 1, ncells + if ( (cell_size(1,c) .gt. IMAX) .or. + > (cell_size(2,c) .gt. JMAX) .or. + > (cell_size(3,c) .gt. KMAX) ) then + print *,node, c, (cell_size(i,c),i=1,3) + print *,' Problem size too big for compiled array sizes' + goto 999 + endif + end do + + call set_constants + + call initialize + +c call mpi_finalize(error) +c stop + + call lhsinit + + call exact_rhs + + call compute_buffer_size(5) + +c--------------------------------------------------------------------- +c do one time step to touch all code, and reinitialize +c--------------------------------------------------------------------- + call adi + call initialize + +c--------------------------------------------------------------------- +c Synchronize before placing time stamp +c--------------------------------------------------------------------- + call mpi_barrier(comm_setup, error) + + call timer_clear(1) + call timer_start(1) + + do step = 1, niter + + if (node .eq. root) then + if (mod(step, 20) .eq. 0 .or. + > step .eq. 1) then + write(*, 200) step + 200 format(' Time step ', i4) + endif + endif + + call adi + + end do + + call timer_stop(1) + t = timer_read(1) + + call verify(niter, class, verified) + + call mpi_reduce(t, tmax, 1, + > dp_type, MPI_MAX, + > root, comm_setup, error) + + if( node .eq. root ) then + if( tmax .ne. 0. ) then + mflops = (881.174*float( problem_size )**3 + > -4683.91*float( problem_size )**2 + > +11484.5*float( problem_size ) + > -19272.4) * float( niter ) / (tmax*1000000.0d0) + else + mflops = 0.0 + endif + + call print_results('SP', class, grid_points(1), + > grid_points(2), grid_points(3), niter, maxcells*maxcells, + > total_nodes, tmax, mflops, ' floating point', + > verified, npbversion,compiletime, cs1, cs2, cs3, cs4, cs5, + > cs6, '(none)') + endif + + 999 continue + call mpi_barrier(MPI_COMM_WORLD, error) + call mpi_finalize(error) + + end diff --git a/examples/smpi/NAS/SP/txinvr.f b/examples/smpi/NAS/SP/txinvr.f new file mode 100644 index 0000000000..b5ca4616f5 --- /dev/null +++ b/examples/smpi/NAS/SP/txinvr.f @@ -0,0 +1,59 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine txinvr + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c block-diagonal matrix-vector multiplication +c--------------------------------------------------------------------- + + include 'header.h' + + integer c, i, j, k + double precision t1, t2, t3, ac, ru1, uu, vv, ww, r1, r2, r3, + > r4, r5, ac2inv + +c--------------------------------------------------------------------- +c loop over all cells owned by this node +c--------------------------------------------------------------------- + do c = 1, ncells + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + + ru1 = rho_i(i,j,k,c) + uu = us(i,j,k,c) + vv = vs(i,j,k,c) + ww = ws(i,j,k,c) + ac = speed(i,j,k,c) + ac2inv = ainv(i,j,k,c)*ainv(i,j,k,c) + + r1 = rhs(i,j,k,1,c) + r2 = rhs(i,j,k,2,c) + r3 = rhs(i,j,k,3,c) + r4 = rhs(i,j,k,4,c) + r5 = rhs(i,j,k,5,c) + + t1 = c2 * ac2inv * ( qs(i,j,k,c)*r1 - uu*r2 - + > vv*r3 - ww*r4 + r5 ) + t2 = bt * ru1 * ( uu * r1 - r2 ) + t3 = ( bt * ru1 * ac ) * t1 + + rhs(i,j,k,1,c) = r1 - t1 + rhs(i,j,k,2,c) = - ru1 * ( ww*r1 - r4 ) + rhs(i,j,k,3,c) = ru1 * ( vv*r1 - r3 ) + rhs(i,j,k,4,c) = - t2 + t3 + rhs(i,j,k,5,c) = t2 + t3 + end do + end do + end do + end do + + return + end + + diff --git a/examples/smpi/NAS/SP/tzetar.f b/examples/smpi/NAS/SP/tzetar.f new file mode 100644 index 0000000000..554066d6fc --- /dev/null +++ b/examples/smpi/NAS/SP/tzetar.f @@ -0,0 +1,60 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine tzetar(c) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c block-diagonal matrix-vector multiplication +c--------------------------------------------------------------------- + + include 'header.h' + + integer i, j, k, c + double precision t1, t2, t3, ac, xvel, yvel, zvel, r1, r2, r3, + > r4, r5, btuz, acinv, ac2u, uzik1 + +c--------------------------------------------------------------------- +c treat only one cell +c--------------------------------------------------------------------- + do k = start(3,c), cell_size(3,c)-end(3,c)-1 + do j = start(2,c), cell_size(2,c)-end(2,c)-1 + do i = start(1,c), cell_size(1,c)-end(1,c)-1 + + xvel = us(i,j,k,c) + yvel = vs(i,j,k,c) + zvel = ws(i,j,k,c) + ac = speed(i,j,k,c) + acinv = ainv(i,j,k,c) + + ac2u = ac*ac + + r1 = rhs(i,j,k,1,c) + r2 = rhs(i,j,k,2,c) + r3 = rhs(i,j,k,3,c) + r4 = rhs(i,j,k,4,c) + r5 = rhs(i,j,k,5,c) + + uzik1 = u(i,j,k,1,c) + btuz = bt * uzik1 + + t1 = btuz*acinv * (r4 + r5) + t2 = r3 + t1 + t3 = btuz * (r4 - r5) + + rhs(i,j,k,1,c) = t2 + rhs(i,j,k,2,c) = -uzik1*r2 + xvel*t2 + rhs(i,j,k,3,c) = uzik1*r1 + yvel*t2 + rhs(i,j,k,4,c) = zvel*t2 + t3 + rhs(i,j,k,5,c) = uzik1*(-xvel*r2 + yvel*r1) + + > qs(i,j,k,c)*t2 + c2iv*ac2u*t1 + zvel*t3 + + end do + end do + end do + + return + end diff --git a/examples/smpi/NAS/SP/verify.f b/examples/smpi/NAS/SP/verify.f new file mode 100644 index 0000000000..08be79c8a6 --- /dev/null +++ b/examples/smpi/NAS/SP/verify.f @@ -0,0 +1,358 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine verify(no_time_steps, class, verified) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c verification routine +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + double precision xcrref(5),xceref(5),xcrdif(5),xcedif(5), + > epsilon, xce(5), xcr(5), dtref + integer m, no_time_steps + character class + logical verified + +c--------------------------------------------------------------------- +c tolerance level +c--------------------------------------------------------------------- + epsilon = 1.0d-08 + + +c--------------------------------------------------------------------- +c compute the error norm and the residual norm, and exit if not printing +c--------------------------------------------------------------------- + call error_norm(xce) + call copy_faces + + call rhs_norm(xcr) + + do m = 1, 5 + xcr(m) = xcr(m) / dt + enddo + + if (node .ne. 0) return + + class = 'U' + verified = .true. + + do m = 1,5 + xcrref(m) = 1.0 + xceref(m) = 1.0 + end do + +c--------------------------------------------------------------------- +c reference data for 12X12X12 grids after 100 time steps, with DT = 1.50d-02 +c--------------------------------------------------------------------- + if ( (grid_points(1) .eq. 12 ) .and. + > (grid_points(2) .eq. 12 ) .and. + > (grid_points(3) .eq. 12 ) .and. + > (no_time_steps .eq. 100 )) then + + class = 'S' + dtref = 1.5d-2 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual. +c--------------------------------------------------------------------- + xcrref(1) = 2.7470315451339479d-02 + xcrref(2) = 1.0360746705285417d-02 + xcrref(3) = 1.6235745065095532d-02 + xcrref(4) = 1.5840557224455615d-02 + xcrref(5) = 3.4849040609362460d-02 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error. +c--------------------------------------------------------------------- + xceref(1) = 2.7289258557377227d-05 + xceref(2) = 1.0364446640837285d-05 + xceref(3) = 1.6154798287166471d-05 + xceref(4) = 1.5750704994480102d-05 + xceref(5) = 3.4177666183390531d-05 + + +c--------------------------------------------------------------------- +c reference data for 36X36X36 grids after 400 time steps, with DT = 1.5d-03 +c--------------------------------------------------------------------- + elseif ( (grid_points(1) .eq. 36) .and. + > (grid_points(2) .eq. 36) .and. + > (grid_points(3) .eq. 36) .and. + > (no_time_steps . eq. 400) ) then + + class = 'W' + dtref = 1.5d-3 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual. +c--------------------------------------------------------------------- + xcrref(1) = 0.1893253733584d-02 + xcrref(2) = 0.1717075447775d-03 + xcrref(3) = 0.2778153350936d-03 + xcrref(4) = 0.2887475409984d-03 + xcrref(5) = 0.3143611161242d-02 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error. +c--------------------------------------------------------------------- + xceref(1) = 0.7542088599534d-04 + xceref(2) = 0.6512852253086d-05 + xceref(3) = 0.1049092285688d-04 + xceref(4) = 0.1128838671535d-04 + xceref(5) = 0.1212845639773d-03 + +c--------------------------------------------------------------------- +c reference data for 64X64X64 grids after 400 time steps, with DT = 1.5d-03 +c--------------------------------------------------------------------- + elseif ( (grid_points(1) .eq. 64) .and. + > (grid_points(2) .eq. 64) .and. + > (grid_points(3) .eq. 64) .and. + > (no_time_steps . eq. 400) ) then + + class = 'A' + dtref = 1.5d-3 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual. +c--------------------------------------------------------------------- + xcrref(1) = 2.4799822399300195d0 + xcrref(2) = 1.1276337964368832d0 + xcrref(3) = 1.5028977888770491d0 + xcrref(4) = 1.4217816211695179d0 + xcrref(5) = 2.1292113035138280d0 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error. +c--------------------------------------------------------------------- + xceref(1) = 1.0900140297820550d-04 + xceref(2) = 3.7343951769282091d-05 + xceref(3) = 5.0092785406541633d-05 + xceref(4) = 4.7671093939528255d-05 + xceref(5) = 1.3621613399213001d-04 + +c--------------------------------------------------------------------- +c reference data for 102X102X102 grids after 400 time steps, +c with DT = 1.0d-03 +c--------------------------------------------------------------------- + elseif ( (grid_points(1) .eq. 102) .and. + > (grid_points(2) .eq. 102) .and. + > (grid_points(3) .eq. 102) .and. + > (no_time_steps . eq. 400) ) then + + class = 'B' + dtref = 1.0d-3 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual. +c--------------------------------------------------------------------- + xcrref(1) = 0.6903293579998d+02 + xcrref(2) = 0.3095134488084d+02 + xcrref(3) = 0.4103336647017d+02 + xcrref(4) = 0.3864769009604d+02 + xcrref(5) = 0.5643482272596d+02 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error. +c--------------------------------------------------------------------- + xceref(1) = 0.9810006190188d-02 + xceref(2) = 0.1022827905670d-02 + xceref(3) = 0.1720597911692d-02 + xceref(4) = 0.1694479428231d-02 + xceref(5) = 0.1847456263981d-01 + +c--------------------------------------------------------------------- +c reference data for 162X162X162 grids after 400 time steps, +c with DT = 0.67d-03 +c--------------------------------------------------------------------- + elseif ( (grid_points(1) .eq. 162) .and. + > (grid_points(2) .eq. 162) .and. + > (grid_points(3) .eq. 162) .and. + > (no_time_steps . eq. 400) ) then + + class = 'C' + dtref = 0.67d-3 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual. +c--------------------------------------------------------------------- + xcrref(1) = 0.5881691581829d+03 + xcrref(2) = 0.2454417603569d+03 + xcrref(3) = 0.3293829191851d+03 + xcrref(4) = 0.3081924971891d+03 + xcrref(5) = 0.4597223799176d+03 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error. +c--------------------------------------------------------------------- + xceref(1) = 0.2598120500183d+00 + xceref(2) = 0.2590888922315d-01 + xceref(3) = 0.5132886416320d-01 + xceref(4) = 0.4806073419454d-01 + xceref(5) = 0.5483377491301d+00 + +c--------------------------------------------------------------------- +c reference data for 408X408X408 grids after 500 time steps, +c with DT = 0.3d-03 +c--------------------------------------------------------------------- + elseif ( (grid_points(1) .eq. 408) .and. + > (grid_points(2) .eq. 408) .and. + > (grid_points(3) .eq. 408) .and. + > (no_time_steps . eq. 500) ) then + + class = 'D' + dtref = 0.30d-3 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual. +c--------------------------------------------------------------------- + xcrref(1) = 0.1044696216887d+05 + xcrref(2) = 0.3204427762578d+04 + xcrref(3) = 0.4648680733032d+04 + xcrref(4) = 0.4238923283697d+04 + xcrref(5) = 0.7588412036136d+04 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error. +c--------------------------------------------------------------------- + xceref(1) = 0.5089471423669d+01 + xceref(2) = 0.5323514855894d+00 + xceref(3) = 0.1187051008971d+01 + xceref(4) = 0.1083734951938d+01 + xceref(5) = 0.1164108338568d+02 + +c--------------------------------------------------------------------- +c reference data for 1020X1020X1020 grids after 500 time steps, +c with DT = 0.1d-03 +c--------------------------------------------------------------------- + elseif ( (grid_points(1) .eq. 1020) .and. + > (grid_points(2) .eq. 1020) .and. + > (grid_points(3) .eq. 1020) .and. + > (no_time_steps . eq. 500) ) then + + class = 'E' + dtref = 0.10d-3 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of residual. +c--------------------------------------------------------------------- + xcrref(1) = 0.6255387422609d+05 + xcrref(2) = 0.1495317020012d+05 + xcrref(3) = 0.2347595750586d+05 + xcrref(4) = 0.2091099783534d+05 + xcrref(5) = 0.4770412841218d+05 + +c--------------------------------------------------------------------- +c Reference values of RMS-norms of solution error. +c--------------------------------------------------------------------- + xceref(1) = 0.6742735164909d+02 + xceref(2) = 0.5390656036938d+01 + xceref(3) = 0.1680647196477d+02 + xceref(4) = 0.1536963126457d+02 + xceref(5) = 0.1575330146156d+03 + + else + verified = .false. + endif + +c--------------------------------------------------------------------- +c verification test for residuals if gridsize is one of +c the defined grid sizes above (class .ne. 'U') +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c Compute the difference of solution values and the known reference values. +c--------------------------------------------------------------------- + do m = 1, 5 + + xcrdif(m) = dabs((xcr(m)-xcrref(m))/xcrref(m)) + xcedif(m) = dabs((xce(m)-xceref(m))/xceref(m)) + + enddo + +c--------------------------------------------------------------------- +c Output the comparison of computed results to known cases. +c--------------------------------------------------------------------- + + if (class .ne. 'U') then + write(*, 1990) class + 1990 format(' Verification being performed for class ', a) + write (*,2000) epsilon + 2000 format(' accuracy setting for epsilon = ', E20.13) + verified = (dabs(dt-dtref) .le. epsilon) + if (.not.verified) then + class = 'U' + write (*,1000) dtref + 1000 format(' DT does not match the reference value of ', + > E15.8) + endif + else + write(*, 1995) + 1995 format(' Unknown class') + endif + + + if (class .ne. 'U') then + write (*,2001) + else + write (*, 2005) + endif + + 2001 format(' Comparison of RMS-norms of residual') + 2005 format(' RMS-norms of residual') + do m = 1, 5 + if (class .eq. 'U') then + write(*, 2015) m, xcr(m) + else if (xcrdif(m) .le. epsilon) then + write (*,2011) m,xcr(m),xcrref(m),xcrdif(m) + else + verified = .false. + write (*,2010) m,xcr(m),xcrref(m),xcrdif(m) + endif + enddo + + if (class .ne. 'U') then + write (*,2002) + else + write (*,2006) + endif + 2002 format(' Comparison of RMS-norms of solution error') + 2006 format(' RMS-norms of solution error') + + do m = 1, 5 + if (class .eq. 'U') then + write(*, 2015) m, xce(m) + else if (xcedif(m) .le. epsilon) then + write (*,2011) m,xce(m),xceref(m),xcedif(m) + else + verified = .false. + write (*,2010) m,xce(m),xceref(m),xcedif(m) + endif + enddo + + 2010 format(' FAILURE: ', i2, E20.13, E20.13, E20.13) + 2011 format(' ', i2, E20.13, E20.13, E20.13) + 2015 format(' ', i2, E20.13) + + if (class .eq. 'U') then + write(*, 2022) + write(*, 2023) + 2022 format(' No reference values provided') + 2023 format(' No verification performed') + else if (verified) then + write(*, 2020) + 2020 format(' Verification Successful') + else + write(*, 2021) + 2021 format(' Verification failed') + endif + + return + + + end diff --git a/examples/smpi/NAS/SP/x_solve.f b/examples/smpi/NAS/SP/x_solve.f new file mode 100644 index 0000000000..cd40756ec3 --- /dev/null +++ b/examples/smpi/NAS/SP/x_solve.f @@ -0,0 +1,545 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine x_solve + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c this function performs the solution of the approximate factorization +c step in the x-direction for all five matrix components +c simultaneously. The Thomas algorithm is employed to solve the +c systems for the x-lines. Boundary conditions are non-periodic +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + + integer i, j, k, jp, kp, n, iend, jsize, ksize, i1, i2, + > buffer_size, c, m, p, istart, stage, error, + > requests(2), statuses(MPI_STATUS_SIZE, 2) + double precision r1, r2, d, e, s(5), sm1, sm2, + > fac1, fac2 + + + +c--------------------------------------------------------------------- +c OK, now we know that there are multiple processors +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells +c on this node in the direction of increasing i for the forward sweep, +c and after that reversing the direction for the backsubstitution. +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c FORWARD ELIMINATION +c--------------------------------------------------------------------- + do stage = 1, ncells + c = slice(1,stage) + + istart = 0 + iend = cell_size(1,c)-1 + + jsize = cell_size(2,c) + ksize = cell_size(3,c) + jp = cell_coord(2,c)-1 + kp = cell_coord(3,c)-1 + + buffer_size = (jsize-start(2,c)-end(2,c)) * + > (ksize-start(3,c)-end(3,c)) + + if ( stage .ne. 1) then + +c--------------------------------------------------------------------- +c if this is not the first processor in this row of cells, +c receive data from predecessor containing the right hand +c sides and the upper diagonal elements of the previous two rows +c--------------------------------------------------------------------- + call mpi_irecv(in_buffer, 22*buffer_size, + > dp_type, predecessor(1), + > DEFAULT_TAG, comm_solve, + > requests(1), error) + + +c--------------------------------------------------------------------- +c communication has already been started. +c compute the left hand side while waiting for the msg +c--------------------------------------------------------------------- + call lhsx(c) + +c--------------------------------------------------------------------- +c wait for pending communication to complete +c This waits on the current receive and on the send +c from the previous stage. They always come in pairs. +c--------------------------------------------------------------------- + + call mpi_waitall(2, requests, statuses, error) + +c--------------------------------------------------------------------- +c unpack the buffer +c--------------------------------------------------------------------- + i = istart + i1 = istart + 1 + n = 0 + +c--------------------------------------------------------------------- +c create a running pointer +c--------------------------------------------------------------------- + p = 0 + do k = start(3,c), ksize-end(3,c)-1 + do j = start(2,c), jsize-end(2,c)-1 + lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) - + > in_buffer(p+1) * lhs(i,j,k,n+1,c) + lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - + > in_buffer(p+2) * lhs(i,j,k,n+1,c) + do m = 1, 3 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > in_buffer(p+2+m) * lhs(i,j,k,n+1,c) + end do + d = in_buffer(p+6) + e = in_buffer(p+7) + do m = 1, 3 + s(m) = in_buffer(p+7+m) + end do + r1 = lhs(i,j,k,n+2,c) + lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1 + lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1 + do m = 1, 3 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1 + end do + r2 = lhs(i1,j,k,n+1,c) + lhs(i1,j,k,n+2,c) = lhs(i1,j,k,n+2,c) - d * r2 + lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - e * r2 + do m = 1, 3 + rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - s(m) * r2 + end do + p = p + 10 + end do + end do + + do m = 4, 5 + n = (m-3)*5 + do k = start(3,c), ksize-end(3,c)-1 + do j = start(2,c), jsize-end(2,c)-1 + lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) - + > in_buffer(p+1) * lhs(i,j,k,n+1,c) + lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - + > in_buffer(p+2) * lhs(i,j,k,n+1,c) + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > in_buffer(p+3) * lhs(i,j,k,n+1,c) + d = in_buffer(p+4) + e = in_buffer(p+5) + s(m) = in_buffer(p+6) + r1 = lhs(i,j,k,n+2,c) + lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1 + lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1 + r2 = lhs(i1,j,k,n+1,c) + lhs(i1,j,k,n+2,c) = lhs(i1,j,k,n+2,c) - d * r2 + lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - e * r2 + rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - s(m) * r2 + p = p + 6 + end do + end do + end do + + else + +c--------------------------------------------------------------------- +c if this IS the first cell, we still compute the lhs +c--------------------------------------------------------------------- + call lhsx(c) + endif + +c--------------------------------------------------------------------- +c perform the Thomas algorithm; first, FORWARD ELIMINATION +c--------------------------------------------------------------------- + n = 0 + + do k = start(3,c), ksize-end(3,c)-1 + do j = start(2,c), jsize-end(2,c)-1 + do i = istart, iend-2 + i1 = i + 1 + i2 = i + 2 + fac1 = 1.d0/lhs(i,j,k,n+3,c) + lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) + lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) + do m = 1, 3 + rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) + end do + lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - + > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c) + lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) - + > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c) + do m = 1, 3 + rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - + > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c) + end do + lhs(i2,j,k,n+2,c) = lhs(i2,j,k,n+2,c) - + > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+4,c) + lhs(i2,j,k,n+3,c) = lhs(i2,j,k,n+3,c) - + > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+5,c) + do m = 1, 3 + rhs(i2,j,k,m,c) = rhs(i2,j,k,m,c) - + > lhs(i2,j,k,n+1,c)*rhs(i,j,k,m,c) + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c The last two rows in this grid block are a bit different, +c since they do not have two more rows available for the +c elimination of off-diagonal entries +c--------------------------------------------------------------------- + + i = iend - 1 + i1 = iend + do k = start(3,c), ksize-end(3,c)-1 + do j = start(2,c), jsize-end(2,c)-1 + fac1 = 1.d0/lhs(i,j,k,n+3,c) + lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) + lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) + do m = 1, 3 + rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) + end do + lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - + > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c) + lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) - + > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c) + do m = 1, 3 + rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - + > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c) + end do +c--------------------------------------------------------------------- +c scale the last row immediately (some of this is +c overkill in case this is the last cell) +c--------------------------------------------------------------------- + fac2 = 1.d0/lhs(i1,j,k,n+3,c) + lhs(i1,j,k,n+4,c) = fac2*lhs(i1,j,k,n+4,c) + lhs(i1,j,k,n+5,c) = fac2*lhs(i1,j,k,n+5,c) + do m = 1, 3 + rhs(i1,j,k,m,c) = fac2*rhs(i1,j,k,m,c) + end do + end do + end do + +c--------------------------------------------------------------------- +c do the u+c and the u-c factors +c--------------------------------------------------------------------- + + do m = 4, 5 + n = (m-3)*5 + do k = start(3,c), ksize-end(3,c)-1 + do j = start(2,c), jsize-end(2,c)-1 + do i = istart, iend-2 + i1 = i + 1 + i2 = i + 2 + fac1 = 1.d0/lhs(i,j,k,n+3,c) + lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) + lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) + rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) + lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - + > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c) + lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) - + > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c) + rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - + > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c) + lhs(i2,j,k,n+2,c) = lhs(i2,j,k,n+2,c) - + > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+4,c) + lhs(i2,j,k,n+3,c) = lhs(i2,j,k,n+3,c) - + > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+5,c) + rhs(i2,j,k,m,c) = rhs(i2,j,k,m,c) - + > lhs(i2,j,k,n+1,c)*rhs(i,j,k,m,c) + end do + end do + end do + +c--------------------------------------------------------------------- +c And again the last two rows separately +c--------------------------------------------------------------------- + i = iend - 1 + i1 = iend + do k = start(3,c), ksize-end(3,c)-1 + do j = start(2,c), jsize-end(2,c)-1 + fac1 = 1.d0/lhs(i,j,k,n+3,c) + lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) + lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) + rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) + lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - + > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c) + lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) - + > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c) + rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - + > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c) +c--------------------------------------------------------------------- +c Scale the last row immediately (some of this is overkill +c if this is the last cell) +c--------------------------------------------------------------------- + fac2 = 1.d0/lhs(i1,j,k,n+3,c) + lhs(i1,j,k,n+4,c) = fac2*lhs(i1,j,k,n+4,c) + lhs(i1,j,k,n+5,c) = fac2*lhs(i1,j,k,n+5,c) + rhs(i1,j,k,m,c) = fac2*rhs(i1,j,k,m,c) + + end do + end do + end do + +c--------------------------------------------------------------------- +c send information to the next processor, except when this +c is the last grid block +c--------------------------------------------------------------------- + if (stage .ne. ncells) then + +c--------------------------------------------------------------------- +c create a running pointer for the send buffer +c--------------------------------------------------------------------- + p = 0 + n = 0 + do k = start(3,c), ksize-end(3,c)-1 + do j = start(2,c), jsize-end(2,c)-1 + do i = iend-1, iend + out_buffer(p+1) = lhs(i,j,k,n+4,c) + out_buffer(p+2) = lhs(i,j,k,n+5,c) + do m = 1, 3 + out_buffer(p+2+m) = rhs(i,j,k,m,c) + end do + p = p+5 + end do + end do + end do + + do m = 4, 5 + n = (m-3)*5 + do k = start(3,c), ksize-end(3,c)-1 + do j = start(2,c), jsize-end(2,c)-1 + do i = iend-1, iend + out_buffer(p+1) = lhs(i,j,k,n+4,c) + out_buffer(p+2) = lhs(i,j,k,n+5,c) + out_buffer(p+3) = rhs(i,j,k,m,c) + p = p + 3 + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c send data to next phase +c can't receive data yet because buffer size will be wrong +c--------------------------------------------------------------------- + call mpi_isend(out_buffer, 22*buffer_size, + > dp_type, successor(1), + > DEFAULT_TAG, comm_solve, + > requests(2), error) + + endif + end do + +c--------------------------------------------------------------------- +c now go in the reverse direction +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c BACKSUBSTITUTION +c--------------------------------------------------------------------- + do stage = ncells, 1, -1 + c = slice(1,stage) + + istart = 0 + iend = cell_size(1,c)-1 + + jsize = cell_size(2,c) + ksize = cell_size(3,c) + jp = cell_coord(2,c)-1 + kp = cell_coord(3,c)-1 + + buffer_size = (jsize-start(2,c)-end(2,c)) * + > (ksize-start(3,c)-end(3,c)) + + if (stage .ne. ncells) then + +c--------------------------------------------------------------------- +c if this is not the starting cell in this row of cells, +c wait for a message to be received, containing the +c solution of the previous two stations +c--------------------------------------------------------------------- + call mpi_irecv(in_buffer, 10*buffer_size, + > dp_type, successor(1), + > DEFAULT_TAG, comm_solve, + > requests(1), error) + + +c--------------------------------------------------------------------- +c communication has already been started +c while waiting, do the block-diagonal inversion for the +c cell that was just finished +c--------------------------------------------------------------------- + + call ninvr(slice(1,stage+1)) + +c--------------------------------------------------------------------- +c wait for pending communication to complete +c--------------------------------------------------------------------- + call mpi_waitall(2, requests, statuses, error) + +c--------------------------------------------------------------------- +c unpack the buffer for the first three factors +c--------------------------------------------------------------------- + n = 0 + p = 0 + i = iend + i1 = i - 1 + do m = 1, 3 + do k = start(3,c), ksize-end(3,c)-1 + do j = start(2,c), jsize-end(2,c)-1 + sm1 = in_buffer(p+1) + sm2 = in_buffer(p+2) + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > lhs(i,j,k,n+4,c)*sm1 - + > lhs(i,j,k,n+5,c)*sm2 + rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - + > lhs(i1,j,k,n+4,c) * rhs(i,j,k,m,c) - + > lhs(i1,j,k,n+5,c) * sm1 + p = p + 2 + end do + end do + end do + +c--------------------------------------------------------------------- +c now unpack the buffer for the remaining two factors +c--------------------------------------------------------------------- + do m = 4, 5 + n = (m-3)*5 + do k = start(3,c), ksize-end(3,c)-1 + do j = start(2,c), jsize-end(2,c)-1 + sm1 = in_buffer(p+1) + sm2 = in_buffer(p+2) + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > lhs(i,j,k,n+4,c)*sm1 - + > lhs(i,j,k,n+5,c)*sm2 + rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - + > lhs(i1,j,k,n+4,c) * rhs(i,j,k,m,c) - + > lhs(i1,j,k,n+5,c) * sm1 + p = p + 2 + end do + end do + end do + + else + +c--------------------------------------------------------------------- +c now we know this is the first grid block on the back sweep, +c so we don't need a message to start the substitution. +c--------------------------------------------------------------------- + i = iend-1 + i1 = iend + n = 0 + do m = 1, 3 + do k = start(3,c), ksize-end(3,c)-1 + do j = start(2,c), jsize-end(2,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) + end do + end do + end do + + do m = 4, 5 + n = (m-3)*5 + do k = start(3,c), ksize-end(3,c)-1 + do j = start(2,c), jsize-end(2,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) + end do + end do + end do + endif + +c--------------------------------------------------------------------- +c Whether or not this is the last processor, we always have +c to complete the back-substitution +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c The first three factors +c--------------------------------------------------------------------- + n = 0 + do m = 1, 3 + do k = start(3,c), ksize-end(3,c)-1 + do j = start(2,c), jsize-end(2,c)-1 + do i = iend-2, istart, -1 + i1 = i + 1 + i2 = i + 2 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) - + > lhs(i,j,k,n+5,c)*rhs(i2,j,k,m,c) + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c And the remaining two +c--------------------------------------------------------------------- + do m = 4, 5 + n = (m-3)*5 + do k = start(3,c), ksize-end(3,c)-1 + do j = start(2,c), jsize-end(2,c)-1 + do i = iend-2, istart, -1 + i1 = i + 1 + i2 = i + 2 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) - + > lhs(i,j,k,n+5,c)*rhs(i2,j,k,m,c) + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c send on information to the previous processor, if needed +c--------------------------------------------------------------------- + if (stage .ne. 1) then + i = istart + i1 = istart+1 + p = 0 + do m = 1, 5 + do k = start(3,c), ksize-end(3,c)-1 + do j = start(2,c), jsize-end(2,c)-1 + out_buffer(p+1) = rhs(i,j,k,m,c) + out_buffer(p+2) = rhs(i1,j,k,m,c) + p = p + 2 + end do + end do + end do + +c--------------------------------------------------------------------- +c pack and send the buffer +c--------------------------------------------------------------------- + call mpi_isend(out_buffer, 10*buffer_size, + > dp_type, predecessor(1), + > DEFAULT_TAG, comm_solve, + > requests(2), error) + + endif + +c--------------------------------------------------------------------- +c If this was the last stage, do the block-diagonal inversion +c--------------------------------------------------------------------- + if (stage .eq. 1) call ninvr(c) + + end do + + return + end + + + + + + + diff --git a/examples/smpi/NAS/SP/y_solve.f b/examples/smpi/NAS/SP/y_solve.f new file mode 100644 index 0000000000..fdcbb4d03f --- /dev/null +++ b/examples/smpi/NAS/SP/y_solve.f @@ -0,0 +1,538 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine y_solve + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c this function performs the solution of the approximate factorization +c step in the y-direction for all five matrix components +c simultaneously. The Thomas algorithm is employed to solve the +c systems for the y-lines. Boundary conditions are non-periodic +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer i, j, k, stage, ip, kp, n, isize, jend, ksize, j1, j2, + > buffer_size, c, m, p, jstart, error, + > requests(2), statuses(MPI_STATUS_SIZE, 2) + double precision r1, r2, d, e, s(5), sm1, sm2, + > fac1, fac2 + + +c--------------------------------------------------------------------- +c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells +c on this node in the direction of increasing i for the forward sweep, +c and after that reversing the direction for the backsubstitution +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c FORWARD ELIMINATION +c--------------------------------------------------------------------- + do stage = 1, ncells + c = slice(2,stage) + + jstart = 0 + jend = cell_size(2,c)-1 + + isize = cell_size(1,c) + ksize = cell_size(3,c) + ip = cell_coord(1,c)-1 + kp = cell_coord(3,c)-1 + + buffer_size = (isize-start(1,c)-end(1,c)) * + > (ksize-start(3,c)-end(3,c)) + + if ( stage .ne. 1) then + +c--------------------------------------------------------------------- +c if this is not the first processor in this row of cells, +c receive data from predecessor containing the right hand +c sides and the upper diagonal elements of the previous two rows +c--------------------------------------------------------------------- + + call mpi_irecv(in_buffer, 22*buffer_size, + > dp_type, predecessor(2), + > DEFAULT_TAG, comm_solve, + > requests(1), error) + +c--------------------------------------------------------------------- +c communication has already been started. +c compute the left hand side while waiting for the msg +c--------------------------------------------------------------------- + call lhsy(c) + +c--------------------------------------------------------------------- +c wait for pending communication to complete +c This waits on the current receive and on the send +c from the previous stage. They always come in pairs. +c--------------------------------------------------------------------- + call mpi_waitall(2, requests, statuses, error) + +c--------------------------------------------------------------------- +c unpack the buffer +c--------------------------------------------------------------------- + j = jstart + j1 = jstart + 1 + n = 0 +c--------------------------------------------------------------------- +c create a running pointer +c--------------------------------------------------------------------- + p = 0 + do k = start(3,c), ksize-end(3,c)-1 + do i = start(1,c), isize-end(1,c)-1 + lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) - + > in_buffer(p+1) * lhs(i,j,k,n+1,c) + lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - + > in_buffer(p+2) * lhs(i,j,k,n+1,c) + do m = 1, 3 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > in_buffer(p+2+m) * lhs(i,j,k,n+1,c) + end do + d = in_buffer(p+6) + e = in_buffer(p+7) + do m = 1, 3 + s(m) = in_buffer(p+7+m) + end do + r1 = lhs(i,j,k,n+2,c) + lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1 + lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1 + do m = 1, 3 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1 + end do + r2 = lhs(i,j1,k,n+1,c) + lhs(i,j1,k,n+2,c) = lhs(i,j1,k,n+2,c) - d * r2 + lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - e * r2 + do m = 1, 3 + rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - s(m) * r2 + end do + p = p + 10 + end do + end do + + do m = 4, 5 + n = (m-3)*5 + do k = start(3,c), ksize-end(3,c)-1 + do i = start(1,c), isize-end(1,c)-1 + lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) - + > in_buffer(p+1) * lhs(i,j,k,n+1,c) + lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - + > in_buffer(p+2) * lhs(i,j,k,n+1,c) + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > in_buffer(p+3) * lhs(i,j,k,n+1,c) + d = in_buffer(p+4) + e = in_buffer(p+5) + s(m) = in_buffer(p+6) + r1 = lhs(i,j,k,n+2,c) + lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1 + lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1 + r2 = lhs(i,j1,k,n+1,c) + lhs(i,j1,k,n+2,c) = lhs(i,j1,k,n+2,c) - d * r2 + lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - e * r2 + rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - s(m) * r2 + p = p + 6 + end do + end do + end do + + else + +c--------------------------------------------------------------------- +c if this IS the first cell, we still compute the lhs +c--------------------------------------------------------------------- + call lhsy(c) + endif + +c--------------------------------------------------------------------- +c perform the Thomas algorithm; first, FORWARD ELIMINATION +c--------------------------------------------------------------------- + n = 0 + + do k = start(3,c), ksize-end(3,c)-1 + do j = jstart, jend-2 + do i = start(1,c), isize-end(1,c)-1 + j1 = j + 1 + j2 = j + 2 + fac1 = 1.d0/lhs(i,j,k,n+3,c) + lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) + lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) + do m = 1, 3 + rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) + end do + lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - + > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c) + lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) - + > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c) + do m = 1, 3 + rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - + > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c) + end do + lhs(i,j2,k,n+2,c) = lhs(i,j2,k,n+2,c) - + > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+4,c) + lhs(i,j2,k,n+3,c) = lhs(i,j2,k,n+3,c) - + > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+5,c) + do m = 1, 3 + rhs(i,j2,k,m,c) = rhs(i,j2,k,m,c) - + > lhs(i,j2,k,n+1,c)*rhs(i,j,k,m,c) + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c The last two rows in this grid block are a bit different, +c since they do not have two more rows available for the +c elimination of off-diagonal entries +c--------------------------------------------------------------------- + + j = jend - 1 + j1 = jend + do k = start(3,c), ksize-end(3,c)-1 + do i = start(1,c), isize-end(1,c)-1 + fac1 = 1.d0/lhs(i,j,k,n+3,c) + lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) + lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) + do m = 1, 3 + rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) + end do + lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - + > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c) + lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) - + > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c) + do m = 1, 3 + rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - + > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c) + end do +c--------------------------------------------------------------------- +c scale the last row immediately (some of this is +c overkill in case this is the last cell) +c--------------------------------------------------------------------- + fac2 = 1.d0/lhs(i,j1,k,n+3,c) + lhs(i,j1,k,n+4,c) = fac2*lhs(i,j1,k,n+4,c) + lhs(i,j1,k,n+5,c) = fac2*lhs(i,j1,k,n+5,c) + do m = 1, 3 + rhs(i,j1,k,m,c) = fac2*rhs(i,j1,k,m,c) + end do + end do + end do + +c--------------------------------------------------------------------- +c do the u+c and the u-c factors +c--------------------------------------------------------------------- + do m = 4, 5 + n = (m-3)*5 + do k = start(3,c), ksize-end(3,c)-1 + do j = jstart, jend-2 + do i = start(1,c), isize-end(1,c)-1 + j1 = j + 1 + j2 = j + 2 + fac1 = 1.d0/lhs(i,j,k,n+3,c) + lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) + lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) + rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) + lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - + > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c) + lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) - + > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c) + rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - + > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c) + lhs(i,j2,k,n+2,c) = lhs(i,j2,k,n+2,c) - + > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+4,c) + lhs(i,j2,k,n+3,c) = lhs(i,j2,k,n+3,c) - + > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+5,c) + rhs(i,j2,k,m,c) = rhs(i,j2,k,m,c) - + > lhs(i,j2,k,n+1,c)*rhs(i,j,k,m,c) + end do + end do + end do + +c--------------------------------------------------------------------- +c And again the last two rows separately +c--------------------------------------------------------------------- + j = jend - 1 + j1 = jend + do k = start(3,c), ksize-end(3,c)-1 + do i = start(1,c), isize-end(1,c)-1 + fac1 = 1.d0/lhs(i,j,k,n+3,c) + lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) + lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) + rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) + lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - + > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c) + lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) - + > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c) + rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - + > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c) +c--------------------------------------------------------------------- +c Scale the last row immediately (some of this is overkill +c if this is the last cell) +c--------------------------------------------------------------------- + fac2 = 1.d0/lhs(i,j1,k,n+3,c) + lhs(i,j1,k,n+4,c) = fac2*lhs(i,j1,k,n+4,c) + lhs(i,j1,k,n+5,c) = fac2*lhs(i,j1,k,n+5,c) + rhs(i,j1,k,m,c) = fac2*rhs(i,j1,k,m,c) + + end do + end do + end do + +c--------------------------------------------------------------------- +c send information to the next processor, except when this +c is the last grid block; +c--------------------------------------------------------------------- + + if (stage .ne. ncells) then + +c--------------------------------------------------------------------- +c create a running pointer for the send buffer +c--------------------------------------------------------------------- + p = 0 + n = 0 + do k = start(3,c), ksize-end(3,c)-1 + do i = start(1,c), isize-end(1,c)-1 + do j = jend-1, jend + out_buffer(p+1) = lhs(i,j,k,n+4,c) + out_buffer(p+2) = lhs(i,j,k,n+5,c) + do m = 1, 3 + out_buffer(p+2+m) = rhs(i,j,k,m,c) + end do + p = p+5 + end do + end do + end do + + do m = 4, 5 + n = (m-3)*5 + do k = start(3,c), ksize-end(3,c)-1 + do i = start(1,c), isize-end(1,c)-1 + do j = jend-1, jend + out_buffer(p+1) = lhs(i,j,k,n+4,c) + out_buffer(p+2) = lhs(i,j,k,n+5,c) + out_buffer(p+3) = rhs(i,j,k,m,c) + p = p + 3 + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c pack and send the buffer +c--------------------------------------------------------------------- + call mpi_isend(out_buffer, 22*buffer_size, + > dp_type, successor(2), + > DEFAULT_TAG, comm_solve, + > requests(2), error) + + endif + end do + +c--------------------------------------------------------------------- +c now go in the reverse direction +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c BACKSUBSTITUTION +c--------------------------------------------------------------------- + do stage = ncells, 1, -1 + c = slice(2,stage) + + jstart = 0 + jend = cell_size(2,c)-1 + + isize = cell_size(1,c) + ksize = cell_size(3,c) + ip = cell_coord(1,c)-1 + kp = cell_coord(3,c)-1 + + buffer_size = (isize-start(1,c)-end(1,c)) * + > (ksize-start(3,c)-end(3,c)) + + if (stage .ne. ncells) then + +c--------------------------------------------------------------------- +c if this is not the starting cell in this row of cells, +c wait for a message to be received, containing the +c solution of the previous two stations +c--------------------------------------------------------------------- + + call mpi_irecv(in_buffer, 10*buffer_size, + > dp_type, successor(2), + > DEFAULT_TAG, comm_solve, + > requests(1), error) + + +c--------------------------------------------------------------------- +c communication has already been started +c while waiting, do the block-diagonal inversion for the +c cell that was just finished +c--------------------------------------------------------------------- + + call pinvr(slice(2,stage+1)) + +c--------------------------------------------------------------------- +c wait for pending communication to complete +c--------------------------------------------------------------------- + call mpi_waitall(2, requests, statuses, error) + +c--------------------------------------------------------------------- +c unpack the buffer for the first three factors +c--------------------------------------------------------------------- + n = 0 + p = 0 + j = jend + j1 = j - 1 + do m = 1, 3 + do k = start(3,c), ksize-end(3,c)-1 + do i = start(1,c), isize-end(1,c)-1 + sm1 = in_buffer(p+1) + sm2 = in_buffer(p+2) + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > lhs(i,j,k,n+4,c)*sm1 - + > lhs(i,j,k,n+5,c)*sm2 + rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - + > lhs(i,j1,k,n+4,c) * rhs(i,j,k,m,c) - + > lhs(i,j1,k,n+5,c) * sm1 + p = p + 2 + end do + end do + end do + +c--------------------------------------------------------------------- +c now unpack the buffer for the remaining two factors +c--------------------------------------------------------------------- + do m = 4, 5 + n = (m-3)*5 + do k = start(3,c), ksize-end(3,c)-1 + do i = start(1,c), isize-end(1,c)-1 + sm1 = in_buffer(p+1) + sm2 = in_buffer(p+2) + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > lhs(i,j,k,n+4,c)*sm1 - + > lhs(i,j,k,n+5,c)*sm2 + rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - + > lhs(i,j1,k,n+4,c) * rhs(i,j,k,m,c) - + > lhs(i,j1,k,n+5,c) * sm1 + p = p + 2 + end do + end do + end do + + else +c--------------------------------------------------------------------- +c now we know this is the first grid block on the back sweep, +c so we don't need a message to start the substitution. +c--------------------------------------------------------------------- + + j = jend - 1 + j1 = jend + n = 0 + do m = 1, 3 + do k = start(3,c), ksize-end(3,c)-1 + do i = start(1,c), isize-end(1,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) + end do + end do + end do + + do m = 4, 5 + n = (m-3)*5 + do k = start(3,c), ksize-end(3,c)-1 + do i = start(1,c), isize-end(1,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) + end do + end do + end do + endif + +c--------------------------------------------------------------------- +c Whether or not this is the last processor, we always have +c to complete the back-substitution +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c The first three factors +c--------------------------------------------------------------------- + n = 0 + do m = 1, 3 + do k = start(3,c), ksize-end(3,c)-1 + do j = jend-2, jstart, -1 + do i = start(1,c), isize-end(1,c)-1 + j1 = j + 1 + j2 = j + 2 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) - + > lhs(i,j,k,n+5,c)*rhs(i,j2,k,m,c) + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c And the remaining two +c--------------------------------------------------------------------- + do m = 4, 5 + n = (m-3)*5 + do k = start(3,c), ksize-end(3,c)-1 + do j = jend-2, jstart, -1 + do i = start(1,c), isize-end(1,c)-1 + j1 = j + 1 + j2 = j1 + 1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) - + > lhs(i,j,k,n+5,c)*rhs(i,j2,k,m,c) + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c send on information to the previous processor, if needed +c--------------------------------------------------------------------- + if (stage .ne. 1) then + j = jstart + j1 = jstart + 1 + p = 0 + do m = 1, 5 + do k = start(3,c), ksize-end(3,c)-1 + do i = start(1,c), isize-end(1,c)-1 + out_buffer(p+1) = rhs(i,j,k,m,c) + out_buffer(p+2) = rhs(i,j1,k,m,c) + p = p + 2 + end do + end do + end do + +c--------------------------------------------------------------------- +c pack and send the buffer +c--------------------------------------------------------------------- + + call mpi_isend(out_buffer, 10*buffer_size, + > dp_type, predecessor(2), + > DEFAULT_TAG, comm_solve, + > requests(2), error) + + endif + +c--------------------------------------------------------------------- +c If this was the last stage, do the block-diagonal inversion +c--------------------------------------------------------------------- + if (stage .eq. 1) call pinvr(c) + + end do + + return + end + + + + + + + diff --git a/examples/smpi/NAS/SP/z_solve.f b/examples/smpi/NAS/SP/z_solve.f new file mode 100644 index 0000000000..ad0dc7e727 --- /dev/null +++ b/examples/smpi/NAS/SP/z_solve.f @@ -0,0 +1,532 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine z_solve + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c this function performs the solution of the approximate factorization +c step in the z-direction for all five matrix components +c simultaneously. The Thomas algorithm is employed to solve the +c systems for the z-lines. Boundary conditions are non-periodic +c--------------------------------------------------------------------- + + include 'header.h' + include 'mpinpb.h' + + integer i, j, k, stage, ip, jp, n, isize, jsize, kend, k1, k2, + > buffer_size, c, m, p, kstart, error, + > requests(2), statuses(MPI_STATUS_SIZE, 2) + double precision r1, r2, d, e, s(5), sm1, sm2, + > fac1, fac2 + +c--------------------------------------------------------------------- +c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells +c on this node in the direction of increasing i for the forward sweep, +c and after that reversing the direction for the backsubstitution +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c FORWARD ELIMINATION +c--------------------------------------------------------------------- + do stage = 1, ncells + c = slice(3,stage) + + kstart = 0 + kend = cell_size(3,c)-1 + + isize = cell_size(1,c) + jsize = cell_size(2,c) + ip = cell_coord(1,c)-1 + jp = cell_coord(2,c)-1 + + buffer_size = (isize-start(1,c)-end(1,c)) * + > (jsize-start(2,c)-end(2,c)) + + if (stage .ne. 1) then + + +c--------------------------------------------------------------------- +c if this is not the first processor in this row of cells, +c receive data from predecessor containing the right hand +c sides and the upper diagonal elements of the previous two rows +c--------------------------------------------------------------------- + + call mpi_irecv(in_buffer, 22*buffer_size, + > dp_type, predecessor(3), + > DEFAULT_TAG, comm_solve, + > requests(1), error) + + +c--------------------------------------------------------------------- +c communication has already been started. +c compute the left hand side while waiting for the msg +c--------------------------------------------------------------------- + call lhsz(c) + +c--------------------------------------------------------------------- +c wait for pending communication to complete +c--------------------------------------------------------------------- + call mpi_waitall(2, requests, statuses, error) + +c--------------------------------------------------------------------- +c unpack the buffer +c--------------------------------------------------------------------- + k = kstart + k1 = kstart + 1 + n = 0 + +c--------------------------------------------------------------------- +c create a running pointer +c--------------------------------------------------------------------- + p = 0 + do j = start(2,c), jsize-end(2,c)-1 + do i = start(1,c), isize-end(1,c)-1 + lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) - + > in_buffer(p+1) * lhs(i,j,k,n+1,c) + lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - + > in_buffer(p+2) * lhs(i,j,k,n+1,c) + do m = 1, 3 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > in_buffer(p+2+m) * lhs(i,j,k,n+1,c) + end do + d = in_buffer(p+6) + e = in_buffer(p+7) + do m = 1, 3 + s(m) = in_buffer(p+7+m) + end do + r1 = lhs(i,j,k,n+2,c) + lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1 + lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1 + do m = 1, 3 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1 + end do + r2 = lhs(i,j,k1,n+1,c) + lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2 + lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2 + do m = 1, 3 + rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - s(m) * r2 + end do + p = p + 10 + end do + end do + + do m = 4, 5 + n = (m-3)*5 + do j = start(2,c), jsize-end(2,c)-1 + do i = start(1,c), isize-end(1,c)-1 + lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) - + > in_buffer(p+1) * lhs(i,j,k,n+1,c) + lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - + > in_buffer(p+2) * lhs(i,j,k,n+1,c) + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > in_buffer(p+3) * lhs(i,j,k,n+1,c) + d = in_buffer(p+4) + e = in_buffer(p+5) + s(m) = in_buffer(p+6) + r1 = lhs(i,j,k,n+2,c) + lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1 + lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1 + r2 = lhs(i,j,k1,n+1,c) + lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2 + lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2 + rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - s(m) * r2 + p = p + 6 + end do + end do + end do + + else + +c--------------------------------------------------------------------- +c if this IS the first cell, we still compute the lhs +c--------------------------------------------------------------------- + call lhsz(c) + endif + +c--------------------------------------------------------------------- +c perform the Thomas algorithm; first, FORWARD ELIMINATION +c--------------------------------------------------------------------- + n = 0 + + do k = kstart, kend-2 + do j = start(2,c), jsize-end(2,c)-1 + do i = start(1,c), isize-end(1,c)-1 + k1 = k + 1 + k2 = k + 2 + fac1 = 1.d0/lhs(i,j,k,n+3,c) + lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) + lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) + do m = 1, 3 + rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) + end do + lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - + > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c) + lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) - + > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c) + do m = 1, 3 + rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - + > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c) + end do + lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) - + > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c) + lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) - + > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c) + do m = 1, 3 + rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) - + > lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c) + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c The last two rows in this grid block are a bit different, +c since they do not have two more rows available for the +c elimination of off-diagonal entries +c--------------------------------------------------------------------- + k = kend - 1 + k1 = kend + do j = start(2,c), jsize-end(2,c)-1 + do i = start(1,c), isize-end(1,c)-1 + fac1 = 1.d0/lhs(i,j,k,n+3,c) + lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) + lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) + do m = 1, 3 + rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) + end do + lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - + > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c) + lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) - + > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c) + do m = 1, 3 + rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - + > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c) + end do +c--------------------------------------------------------------------- +c scale the last row immediately (some of this is +c overkill in case this is the last cell) +c--------------------------------------------------------------------- + fac2 = 1.d0/lhs(i,j,k1,n+3,c) + lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c) + lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c) + do m = 1, 3 + rhs(i,j,k1,m,c) = fac2*rhs(i,j,k1,m,c) + end do + end do + end do + +c--------------------------------------------------------------------- +c do the u+c and the u-c factors +c--------------------------------------------------------------------- + do m = 4, 5 + n = (m-3)*5 + do k = kstart, kend-2 + do j = start(2,c), jsize-end(2,c)-1 + do i = start(1,c), isize-end(1,c)-1 + k1 = k + 1 + k2 = k + 2 + fac1 = 1.d0/lhs(i,j,k,n+3,c) + lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) + lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) + rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) + lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - + > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c) + lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) - + > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c) + rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - + > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c) + lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) - + > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c) + lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) - + > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c) + rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) - + > lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c) + end do + end do + end do + +c--------------------------------------------------------------------- +c And again the last two rows separately +c--------------------------------------------------------------------- + k = kend - 1 + k1 = kend + do j = start(2,c), jsize-end(2,c)-1 + do i = start(1,c), isize-end(1,c)-1 + fac1 = 1.d0/lhs(i,j,k,n+3,c) + lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) + lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) + rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) + lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - + > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c) + lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) - + > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c) + rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - + > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c) +c--------------------------------------------------------------------- +c Scale the last row immediately (some of this is overkill +c if this is the last cell) +c--------------------------------------------------------------------- + fac2 = 1.d0/lhs(i,j,k1,n+3,c) + lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c) + lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c) + rhs(i,j,k1,m,c) = fac2*rhs(i,j,k1,m,c) + + end do + end do + end do + +c--------------------------------------------------------------------- +c send information to the next processor, except when this +c is the last grid block, +c--------------------------------------------------------------------- + + if (stage .ne. ncells) then + +c--------------------------------------------------------------------- +c create a running pointer for the send buffer +c--------------------------------------------------------------------- + p = 0 + n = 0 + do j = start(2,c), jsize-end(2,c)-1 + do i = start(1,c), isize-end(1,c)-1 + do k = kend-1, kend + out_buffer(p+1) = lhs(i,j,k,n+4,c) + out_buffer(p+2) = lhs(i,j,k,n+5,c) + do m = 1, 3 + out_buffer(p+2+m) = rhs(i,j,k,m,c) + end do + p = p+5 + end do + end do + end do + + do m = 4, 5 + n = (m-3)*5 + do j = start(2,c), jsize-end(2,c)-1 + do i = start(1,c), isize-end(1,c)-1 + do k = kend-1, kend + out_buffer(p+1) = lhs(i,j,k,n+4,c) + out_buffer(p+2) = lhs(i,j,k,n+5,c) + out_buffer(p+3) = rhs(i,j,k,m,c) + p = p + 3 + end do + end do + end do + end do + + + call mpi_isend(out_buffer, 22*buffer_size, + > dp_type, successor(3), + > DEFAULT_TAG, comm_solve, + > requests(2), error) + + endif + end do + +c--------------------------------------------------------------------- +c now go in the reverse direction +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c BACKSUBSTITUTION +c--------------------------------------------------------------------- + do stage = ncells, 1, -1 + c = slice(3,stage) + + kstart = 0 + kend = cell_size(3,c)-1 + + isize = cell_size(1,c) + jsize = cell_size(2,c) + ip = cell_coord(1,c)-1 + jp = cell_coord(2,c)-1 + + buffer_size = (isize-start(1,c)-end(1,c)) * + > (jsize-start(2,c)-end(2,c)) + + if (stage .ne. ncells) then + +c--------------------------------------------------------------------- +c if this is not the starting cell in this row of cells, +c wait for a message to be received, containing the +c solution of the previous two stations +c--------------------------------------------------------------------- + + call mpi_irecv(in_buffer, 10*buffer_size, + > dp_type, successor(3), + > DEFAULT_TAG, comm_solve, + > requests(1), error) + + +c--------------------------------------------------------------------- +c communication has already been started +c while waiting, do the block-diagonal inversion for the +c cell that was just finished +c--------------------------------------------------------------------- + + call tzetar(slice(3,stage+1)) + +c--------------------------------------------------------------------- +c wait for pending communication to complete +c--------------------------------------------------------------------- + call mpi_waitall(2, requests, statuses, error) + +c--------------------------------------------------------------------- +c unpack the buffer for the first three factors +c--------------------------------------------------------------------- + n = 0 + p = 0 + k = kend + k1 = k - 1 + do m = 1, 3 + do j = start(2,c), jsize-end(2,c)-1 + do i = start(1,c), isize-end(1,c)-1 + sm1 = in_buffer(p+1) + sm2 = in_buffer(p+2) + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > lhs(i,j,k,n+4,c)*sm1 - + > lhs(i,j,k,n+5,c)*sm2 + rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - + > lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) - + > lhs(i,j,k1,n+5,c) * sm1 + p = p + 2 + end do + end do + end do + +c--------------------------------------------------------------------- +c now unpack the buffer for the remaining two factors +c--------------------------------------------------------------------- + do m = 4, 5 + n = (m-3)*5 + do j = start(2,c), jsize-end(2,c)-1 + do i = start(1,c), isize-end(1,c)-1 + sm1 = in_buffer(p+1) + sm2 = in_buffer(p+2) + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > lhs(i,j,k,n+4,c)*sm1 - + > lhs(i,j,k,n+5,c)*sm2 + rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - + > lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) - + > lhs(i,j,k1,n+5,c) * sm1 + p = p + 2 + end do + end do + end do + + else + +c--------------------------------------------------------------------- +c now we know this is the first grid block on the back sweep, +c so we don't need a message to start the substitution. +c--------------------------------------------------------------------- + + k = kend - 1 + k1 = kend + n = 0 + do m = 1, 3 + do j = start(2,c), jsize-end(2,c)-1 + do i = start(1,c), isize-end(1,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) + end do + end do + end do + + do m = 4, 5 + n = (m-3)*5 + do j = start(2,c), jsize-end(2,c)-1 + do i = start(1,c), isize-end(1,c)-1 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) + end do + end do + end do + endif + +c--------------------------------------------------------------------- +c Whether or not this is the last processor, we always have +c to complete the back-substitution +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c The first three factors +c--------------------------------------------------------------------- + n = 0 + do m = 1, 3 + do k = kend-2, kstart, -1 + do j = start(2,c), jsize-end(2,c)-1 + do i = start(1,c), isize-end(1,c)-1 + k1 = k + 1 + k2 = k + 2 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) - + > lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c) + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c And the remaining two +c--------------------------------------------------------------------- + do m = 4, 5 + n = (m-3)*5 + do k = kend-2, kstart, -1 + do j = start(2,c), jsize-end(2,c)-1 + do i = start(1,c), isize-end(1,c)-1 + k1 = k + 1 + k2 = k + 2 + rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - + > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) - + > lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c) + end do + end do + end do + end do + +c--------------------------------------------------------------------- +c send on information to the previous processor, if needed +c--------------------------------------------------------------------- + if (stage .ne. 1) then + k = kstart + k1 = kstart + 1 + p = 0 + do m = 1, 5 + do j = start(2,c), jsize-end(2,c)-1 + do i = start(1,c), isize-end(1,c)-1 + out_buffer(p+1) = rhs(i,j,k,m,c) + out_buffer(p+2) = rhs(i,j,k1,m,c) + p = p + 2 + end do + end do + end do + + call mpi_isend(out_buffer, 10*buffer_size, + > dp_type, predecessor(3), + > DEFAULT_TAG, comm_solve, + > requests(2), error) + + endif + +c--------------------------------------------------------------------- +c If this was the last stage, do the block-diagonal inversion +c--------------------------------------------------------------------- + if (stage .eq. 1) call tzetar(c) + + end do + + return + end + + + + + + + diff --git a/examples/smpi/NAS/common/c_print_results.c b/examples/smpi/NAS/common/c_print_results.c new file mode 100644 index 0000000000..d7417fba15 --- /dev/null +++ b/examples/smpi/NAS/common/c_print_results.c @@ -0,0 +1,94 @@ +/*****************************************************************/ +/****** C _ P R I N T _ R E S U L T S ******/ +/*****************************************************************/ +#include +#include + +void c_print_results( char *name, + char class, + int n1, + int n2, + int n3, + int niter, + int nprocs_compiled, + int nprocs_total, + double t, + double mops, + char *optype, + int passed_verification, + char *npbversion, + char *compiletime, + char *mpicc, + char *clink, + char *cmpi_lib, + char *cmpi_inc, + char *cflags, + char *clinkflags ) +{ + char *evalue="1000"; + + printf( "\n\n %s Benchmark Completed\n", name ); + + printf( " Class = %c\n", class ); + + if( n3 == 0 ) { + long nn = n1; + if ( n2 != 0 ) nn *= n2; + printf( " Size = %12ld\n", nn ); /* as in IS */ + } + else + printf( " Size = %3dx %3dx %3d\n", n1,n2,n3 ); + + printf( " Iterations = %12d\n", niter ); + + printf( " Time in seconds = %12.2f\n", t ); + + printf( " Total processes = %12d\n", nprocs_total ); + + if ( nprocs_compiled != 0 ) + printf( " Compiled procs = %12d\n", nprocs_compiled ); + + printf( " Mop/s total = %12.2f\n", mops ); + + printf( " Mop/s/process = %12.2f\n", mops/((float) nprocs_total) ); + + printf( " Operation type = %24s\n", optype); + + if( passed_verification ) + printf( " Verification = SUCCESSFUL\n" ); + else + printf( " Verification = UNSUCCESSFUL\n" ); + + printf( " Version = %12s\n", npbversion ); + + printf( " Compile date = %12s\n", compiletime ); + + printf( "\n Compile options:\n" ); + + printf( " MPICC = %s\n", mpicc ); + + printf( " CLINK = %s\n", clink ); + + printf( " CMPI_LIB = %s\n", cmpi_lib ); + + printf( " CMPI_INC = %s\n", cmpi_inc ); + + printf( " CFLAGS = %s\n", cflags ); + + printf( " CLINKFLAGS = %s\n", clinkflags ); +#ifdef SMP + evalue = getenv("MP_SET_NUMTHREADS"); + printf( " MULTICPUS = %s\n", evalue ); +#endif + + printf( "\n\n" ); + printf( " Please send the results of this run to:\n\n" ); + printf( " NPB Development Team\n" ); + printf( " Internet: npb@nas.nasa.gov\n \n" ); + printf( " If email is not available, send this to:\n\n" ); + printf( " MS T27A-1\n" ); + printf( " NASA Ames Research Center\n" ); + printf( " Moffett Field, CA 94035-1000\n\n" ); + printf( " Fax: 650-604-3957\n\n" ); +} + diff --git a/examples/smpi/NAS/common/c_timers.c b/examples/smpi/NAS/common/c_timers.c new file mode 100644 index 0000000000..c8c81e7c7f --- /dev/null +++ b/examples/smpi/NAS/common/c_timers.c @@ -0,0 +1,45 @@ + +#include "mpi.h" + +double start[64], elapsed[64]; + +/*****************************************************************/ +/****** T I M E R _ C L E A R ******/ +/*****************************************************************/ +void timer_clear( int n ) +{ + elapsed[n] = 0.0; +} + + +/*****************************************************************/ +/****** T I M E R _ S T A R T ******/ +/*****************************************************************/ +void timer_start( int n ) +{ + start[n] = MPI_Wtime(); +} + + +/*****************************************************************/ +/****** T I M E R _ S T O P ******/ +/*****************************************************************/ +void timer_stop( int n ) +{ + double t, now; + + now = MPI_Wtime(); + t = now - start[n]; + elapsed[n] += t; + +} + + +/*****************************************************************/ +/****** T I M E R _ R E A D ******/ +/*****************************************************************/ +double timer_read( int n ) +{ + return( elapsed[n] ); +} + diff --git a/examples/smpi/NAS/common/print_results.f b/examples/smpi/NAS/common/print_results.f new file mode 100644 index 0000000000..9feddac482 --- /dev/null +++ b/examples/smpi/NAS/common/print_results.f @@ -0,0 +1,115 @@ + + subroutine print_results(name, class, n1, n2, n3, niter, + > nprocs_compiled, nprocs_total, + > t, mops, optype, verified, npbversion, + > compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) + + implicit none + character*2 name + character*1 class + integer n1, n2, n3, niter, nprocs_compiled, nprocs_total, j + double precision t, mops + character optype*24, size*15 + logical verified + character*(*) npbversion, compiletime, + > cs1, cs2, cs3, cs4, cs5, cs6, cs7 + + write (*, 2) name + 2 format(//, ' ', A2, ' Benchmark Completed.') + + write (*, 3) Class + 3 format(' Class = ', 12x, a12) + +c If this is not a grid-based problem (EP, FT, CG), then +c we only print n1, which contains some measure of the +c problem size. In that case, n2 and n3 are both zero. +c Otherwise, we print the grid size n1xn2xn3 + + if ((n2 .eq. 0) .and. (n3 .eq. 0)) then + if (name(1:2) .eq. 'EP') then + write(size, '(f15.0)' ) 2.d0**n1 + j = 15 + if (size(j:j) .eq. '.') j = j - 1 + write (*,42) size(1:j) + 42 format(' Size = ',9x, a15) + else + write (*,44) n1 + 44 format(' Size = ',12x, i12) + endif + else + write (*, 4) n1,n2,n3 + 4 format(' Size = ',9x, i4,'x',i4,'x',i4) + endif + + write (*, 5) niter + 5 format(' Iterations = ', 12x, i12) + + write (*, 6) t + 6 format(' Time in seconds = ',12x, f12.2) + + write (*,7) nprocs_total + 7 format(' Total processes = ', 12x, i12) + + write (*,8) nprocs_compiled + 8 format(' Compiled procs = ', 12x, i12) + + write (*,9) mops + 9 format(' Mop/s total = ',12x, f12.2) + + write (*,10) mops/float( nprocs_total ) + 10 format(' Mop/s/process = ', 12x, f12.2) + + write(*, 11) optype + 11 format(' Operation type = ', a24) + + if (verified) then + write(*,12) ' SUCCESSFUL' + else + write(*,12) 'UNSUCCESSFUL' + endif + 12 format(' Verification = ', 12x, a) + + write(*,13) npbversion + 13 format(' Version = ', 12x, a12) + + write(*,14) compiletime + 14 format(' Compile date = ', 12x, a12) + + + write (*,121) cs1 + 121 format(/, ' Compile options:', /, + > ' MPIF77 = ', A) + + write (*,122) cs2 + 122 format(' FLINK = ', A) + + write (*,123) cs3 + 123 format(' FMPI_LIB = ', A) + + write (*,124) cs4 + 124 format(' FMPI_INC = ', A) + + write (*,125) cs5 + 125 format(' FFLAGS = ', A) + + write (*,126) cs6 + 126 format(' FLINKFLAGS = ', A) + + write(*, 127) cs7 + 127 format(' RAND = ', A) + + write (*,130) + 130 format(//' Please send the results of this run to:'// + > ' NPB Development Team '/ + > ' Internet: npb@nas.nasa.gov'/ + > ' '/ + > ' If email is not available, send this to:'// + > ' MS T27A-1'/ + > ' NASA Ames Research Center'/ + > ' Moffett Field, CA 94035-1000'// + > ' Fax: 650-604-3957'//) + + + return + end + diff --git a/examples/smpi/NAS/common/randdp.c b/examples/smpi/NAS/common/randdp.c new file mode 100644 index 0000000000..676624795d --- /dev/null +++ b/examples/smpi/NAS/common/randdp.c @@ -0,0 +1,64 @@ +//--------------------------------------------------------------------- +// This function is C verson of random number generator randdp.f +//--------------------------------------------------------------------- + +double randlc(X, A) +double *X; +double *A; +{ + static int KS=0; + static double R23, R46, T23, T46; + double T1, T2, T3, T4; + double A1; + double A2; + double X1; + double X2; + double Z; + int i, j; + + if (KS == 0) + { + R23 = 1.0; + R46 = 1.0; + T23 = 1.0; + T46 = 1.0; + + for (i=1; i<=23; i++) + { + R23 = 0.50 * R23; + T23 = 2.0 * T23; + } + for (i=1; i<=46; i++) + { + R46 = 0.50 * R46; + T46 = 2.0 * T46; + } + KS = 1; + } + +/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */ + + T1 = R23 * *A; + j = T1; + A1 = j; + A2 = *A - T23 * A1; + +/* Break X into two parts such that X = 2^23 * X1 + X2, compute + Z = A1 * X2 + A2 * X1 (mod 2^23), and then + X = 2^23 * Z + A2 * X2 (mod 2^46). */ + + T1 = R23 * *X; + j = T1; + X1 = j; + X2 = *X - T23 * X1; + T1 = A1 * X2 + A2 * X1; + + j = R23 * T1; + T2 = j; + Z = T1 - T23 * T2; + T3 = T23 * Z + A2 * X2; + j = R46 * T3; + T4 = j; + *X = T3 - T46 * T4; + return(R46 * *X); +} diff --git a/examples/smpi/NAS/common/randdp.f b/examples/smpi/NAS/common/randdp.f new file mode 100644 index 0000000000..64860d96d0 --- /dev/null +++ b/examples/smpi/NAS/common/randdp.f @@ -0,0 +1,137 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + double precision function randlc (x, a) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c This routine returns a uniform pseudorandom double precision number in the +c range (0, 1) by using the linear congruential generator +c +c x_{k+1} = a x_k (mod 2^46) +c +c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers +c before repeating. The argument A is the same as 'a' in the above formula, +c and X is the same as x_0. A and X must be odd double precision integers +c in the range (1, 2^46). The returned value RANDLC is normalized to be +c between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain +c the new seed x_1, so that subsequent calls to RANDLC using the same +c arguments will generate a continuous sequence. +c +c This routine should produce the same results on any computer with at least +c 48 mantissa bits in double precision floating point data. On 64 bit +c systems, double precision should be disabled. +c +c David H. Bailey October 26, 1990 +c +c--------------------------------------------------------------------- + + implicit none + + double precision r23,r46,t23,t46,a,x,t1,t2,t3,t4,a1,a2,x1,x2,z + parameter (r23 = 0.5d0 ** 23, r46 = r23 ** 2, t23 = 2.d0 ** 23, + > t46 = t23 ** 2) + +c--------------------------------------------------------------------- +c Break A into two parts such that A = 2^23 * A1 + A2. +c--------------------------------------------------------------------- + t1 = r23 * a + a1 = int (t1) + a2 = a - t23 * a1 + +c--------------------------------------------------------------------- +c Break X into two parts such that X = 2^23 * X1 + X2, compute +c Z = A1 * X2 + A2 * X1 (mod 2^23), and then +c X = 2^23 * Z + A2 * X2 (mod 2^46). +c--------------------------------------------------------------------- + t1 = r23 * x + x1 = int (t1) + x2 = x - t23 * x1 + t1 = a1 * x2 + a2 * x1 + t2 = int (r23 * t1) + z = t1 - t23 * t2 + t3 = t23 * z + a2 * x2 + t4 = int (r46 * t3) + x = t3 - t46 * t4 + randlc = r46 * x + + return + end + + + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine vranlc (n, x, a, y) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c This routine generates N uniform pseudorandom double precision numbers in +c the range (0, 1) by using the linear congruential generator +c +c x_{k+1} = a x_k (mod 2^46) +c +c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers +c before repeating. The argument A is the same as 'a' in the above formula, +c and X is the same as x_0. A and X must be odd double precision integers +c in the range (1, 2^46). The N results are placed in Y and are normalized +c to be between 0 and 1. X is updated to contain the new seed, so that +c subsequent calls to VRANLC using the same arguments will generate a +c continuous sequence. If N is zero, only initialization is performed, and +c the variables X, A and Y are ignored. +c +c This routine is the standard version designed for scalar or RISC systems. +c However, it should produce the same results on any single processor +c computer with at least 48 mantissa bits in double precision floating point +c data. On 64 bit systems, double precision should be disabled. +c +c--------------------------------------------------------------------- + + implicit none + + integer i,n + double precision y,r23,r46,t23,t46,a,x,t1,t2,t3,t4,a1,a2,x1,x2,z + dimension y(*) + parameter (r23 = 0.5d0 ** 23, r46 = r23 ** 2, t23 = 2.d0 ** 23, + > t46 = t23 ** 2) + + +c--------------------------------------------------------------------- +c Break A into two parts such that A = 2^23 * A1 + A2. +c--------------------------------------------------------------------- + t1 = r23 * a + a1 = int (t1) + a2 = a - t23 * a1 + +c--------------------------------------------------------------------- +c Generate N results. This loop is not vectorizable. +c--------------------------------------------------------------------- + do i = 1, n + +c--------------------------------------------------------------------- +c Break X into two parts such that X = 2^23 * X1 + X2, compute +c Z = A1 * X2 + A2 * X1 (mod 2^23), and then +c X = 2^23 * Z + A2 * X2 (mod 2^46). +c--------------------------------------------------------------------- + t1 = r23 * x + x1 = int (t1) + x2 = x - t23 * x1 + t1 = a1 * x2 + a2 * x1 + t2 = int (r23 * t1) + z = t1 - t23 * t2 + t3 = t23 * z + a2 * x2 + t4 = int (r46 * t3) + x = t3 - t46 * t4 + y(i) = r46 * x + enddo + + return + end diff --git a/examples/smpi/NAS/common/randdpvec.f b/examples/smpi/NAS/common/randdpvec.f new file mode 100644 index 0000000000..c7080717ce --- /dev/null +++ b/examples/smpi/NAS/common/randdpvec.f @@ -0,0 +1,186 @@ +c--------------------------------------------------------------------- + double precision function randlc (x, a) +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c This routine returns a uniform pseudorandom double precision number in the +c range (0, 1) by using the linear congruential generator +c +c x_{k+1} = a x_k (mod 2^46) +c +c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers +c before repeating. The argument A is the same as 'a' in the above formula, +c and X is the same as x_0. A and X must be odd double precision integers +c in the range (1, 2^46). The returned value RANDLC is normalized to be +c between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain +c the new seed x_1, so that subsequent calls to RANDLC using the same +c arguments will generate a continuous sequence. +c +c This routine should produce the same results on any computer with at least +c 48 mantissa bits in double precision floating point data. On 64 bit +c systems, double precision should be disabled. +c +c David H. Bailey October 26, 1990 +c +c--------------------------------------------------------------------- + + implicit none + + double precision r23,r46,t23,t46,a,x,t1,t2,t3,t4,a1,a2,x1,x2,z + parameter (r23 = 0.5d0 ** 23, r46 = r23 ** 2, t23 = 2.d0 ** 23, + > t46 = t23 ** 2) + +c--------------------------------------------------------------------- +c Break A into two parts such that A = 2^23 * A1 + A2. +c--------------------------------------------------------------------- + t1 = r23 * a + a1 = int (t1) + a2 = a - t23 * a1 + +c--------------------------------------------------------------------- +c Break X into two parts such that X = 2^23 * X1 + X2, compute +c Z = A1 * X2 + A2 * X1 (mod 2^23), and then +c X = 2^23 * Z + A2 * X2 (mod 2^46). +c--------------------------------------------------------------------- + t1 = r23 * x + x1 = int (t1) + x2 = x - t23 * x1 + + + t1 = a1 * x2 + a2 * x1 + t2 = int (r23 * t1) + z = t1 - t23 * t2 + t3 = t23 * z + a2 * x2 + t4 = int (r46 * t3) + x = t3 - t46 * t4 + randlc = r46 * x + return + end + + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine vranlc (n, x, a, y) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c This routine generates N uniform pseudorandom double precision numbers in +c the range (0, 1) by using the linear congruential generator +c +c x_{k+1} = a x_k (mod 2^46) +c +c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers +c before repeating. The argument A is the same as 'a' in the above formula, +c and X is the same as x_0. A and X must be odd double precision integers +c in the range (1, 2^46). The N results are placed in Y and are normalized +c to be between 0 and 1. X is updated to contain the new seed, so that +c subsequent calls to RANDLC using the same arguments will generate a +c continuous sequence. +c +c This routine generates the output sequence in batches of length NV, for +c convenience on vector computers. This routine should produce the same +c results on any computer with at least 48 mantissa bits in double precision +c floating point data. On Cray systems, double precision should be disabled. +c +c David H. Bailey August 30, 1990 +c--------------------------------------------------------------------- + + integer n + double precision x, a, y(*) + + double precision r23, r46, t23, t46 + integer nv + parameter (r23 = 2.d0 ** (-23), r46 = r23 * r23, t23 = 2.d0 ** 23, + > t46 = t23 * t23, nv = 64) + double precision xv(nv), t1, t2, t3, t4, an, a1, a2, x1, x2, yy + integer n1, i, j + external randlc + double precision randlc + +c--------------------------------------------------------------------- +c Compute the first NV elements of the sequence using RANDLC. +c--------------------------------------------------------------------- + t1 = x + n1 = min (n, nv) + + do i = 1, n1 + xv(i) = t46 * randlc (t1, a) + enddo + +c--------------------------------------------------------------------- +c It is not necessary to compute AN, A1 or A2 unless N is greater than NV. +c--------------------------------------------------------------------- + if (n .gt. nv) then + +c--------------------------------------------------------------------- +c Compute AN = AA ^ NV (mod 2^46) using successive calls to RANDLC. +c--------------------------------------------------------------------- + t1 = a + t2 = r46 * a + + do i = 1, nv - 1 + t2 = randlc (t1, a) + enddo + + an = t46 * t2 + +c--------------------------------------------------------------------- +c Break AN into two parts such that AN = 2^23 * A1 + A2. +c--------------------------------------------------------------------- + t1 = r23 * an + a1 = aint (t1) + a2 = an - t23 * a1 + endif + +c--------------------------------------------------------------------- +c Compute N pseudorandom results in batches of size NV. +c--------------------------------------------------------------------- + do j = 0, n - 1, nv + n1 = min (nv, n - j) + +c--------------------------------------------------------------------- +c Compute up to NV results based on the current seed vector XV. +c--------------------------------------------------------------------- + do i = 1, n1 + y(i+j) = r46 * xv(i) + enddo + +c--------------------------------------------------------------------- +c If this is the last pass through the 140 loop, it is not necessary to +c update the XV vector. +c--------------------------------------------------------------------- + if (j + n1 .eq. n) goto 150 + +c--------------------------------------------------------------------- +c Update the XV vector by multiplying each element by AN (mod 2^46). +c--------------------------------------------------------------------- + do i = 1, nv + t1 = r23 * xv(i) + x1 = aint (t1) + x2 = xv(i) - t23 * x1 + t1 = a1 * x2 + a2 * x1 + t2 = aint (r23 * t1) + yy = t1 - t23 * t2 + t3 = t23 * yy + a2 * x2 + t4 = aint (r46 * t3) + xv(i) = t3 - t46 * t4 + enddo + + enddo + +c--------------------------------------------------------------------- +c Save the last seed in X so that subsequent calls to VRANLC will generate +c a continuous sequence. +c--------------------------------------------------------------------- + 150 x = xv(n1) + + return + end + +c----- end of program ------------------------------------------------ + diff --git a/examples/smpi/NAS/common/randi8.f b/examples/smpi/NAS/common/randi8.f new file mode 100644 index 0000000000..21ab8815db --- /dev/null +++ b/examples/smpi/NAS/common/randi8.f @@ -0,0 +1,79 @@ + double precision function randlc(x, a) + +c--------------------------------------------------------------------- +c +c This routine returns a uniform pseudorandom double precision number in the +c range (0, 1) by using the linear congruential generator +c +c x_{k+1} = a x_k (mod 2^46) +c +c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers +c before repeating. The argument A is the same as 'a' in the above formula, +c and X is the same as x_0. A and X must be odd double precision integers +c in the range (1, 2^46). The returned value RANDLC is normalized to be +c between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain +c the new seed x_1, so that subsequent calls to RANDLC using the same +c arguments will generate a continuous sequence. + + implicit none + double precision x, a + integer*8 i246m1, Lx, La + double precision d2m46 + + parameter(d2m46=0.5d0**46) + + save i246m1 + data i246m1/X'00003FFFFFFFFFFF'/ + + Lx = X + La = A + + Lx = iand(Lx*La,i246m1) + randlc = d2m46*dble(Lx) + x = dble(Lx) + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + + SUBROUTINE VRANLC (N, X, A, Y) + implicit none + integer n, i + double precision x, a, y(*) + integer*8 i246m1, Lx, La + double precision d2m46 + +c This doesn't work, because the compiler does the calculation in 32 +c bits and overflows. No standard way (without f90 stuff) to specify +c that the rhs should be done in 64 bit arithmetic. +c parameter(i246m1=2**46-1) + + parameter(d2m46=0.5d0**46) + + save i246m1 + data i246m1/X'00003FFFFFFFFFFF'/ + +c Note that the v6 compiler on an R8000 does something stupid with +c the above. Using the following instead (or various other things) +c makes the calculation run almost 10 times as fast. +c +c save d2m46 +c data d2m46/0.0d0/ +c if (d2m46 .eq. 0.0d0) then +c d2m46 = 0.5d0**46 +c endif + + Lx = X + La = A + do i = 1, N + Lx = iand(Lx*La,i246m1) + y(i) = d2m46*dble(Lx) + end do + x = dble(Lx) + + return + end + diff --git a/examples/smpi/NAS/common/randi8_safe.f b/examples/smpi/NAS/common/randi8_safe.f new file mode 100644 index 0000000000..f725b6a1fb --- /dev/null +++ b/examples/smpi/NAS/common/randi8_safe.f @@ -0,0 +1,64 @@ + double precision function randlc(x, a) + +c--------------------------------------------------------------------- +c +c This routine returns a uniform pseudorandom double precision number in the +c range (0, 1) by using the linear congruential generator +c +c x_{k+1} = a x_k (mod 2^46) +c +c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers +c before repeating. The argument A is the same as 'a' in the above formula, +c and X is the same as x_0. A and X must be odd double precision integers +c in the range (1, 2^46). The returned value RANDLC is normalized to be +c between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain +c the new seed x_1, so that subsequent calls to RANDLC using the same +c arguments will generate a continuous sequence. + + implicit none + double precision x, a + integer*8 Lx, La, a1, a2, x1, x2, xa + double precision d2m46 + parameter(d2m46=0.5d0**46) + + Lx = x + La = A + a1 = ibits(La, 23, 23) + a2 = ibits(La, 0, 23) + x1 = ibits(Lx, 23, 23) + x2 = ibits(Lx, 0, 23) + xa = ishft(ibits(a1*x2+a2*x1, 0, 23), 23) + a2*x2 + Lx = ibits(xa,0, 46) + x = dble(Lx) + randlc = d2m46*x + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + + SUBROUTINE VRANLC (N, X, A, Y) + implicit none + integer n, i + double precision x, a, y(*) + integer*8 Lx, La, a1, a2, x1, x2, xa + double precision d2m46 + parameter(d2m46=0.5d0**46) + + Lx = X + La = A + a1 = ibits(La, 23, 23) + a2 = ibits(La, 0, 23) + do i = 1, N + x1 = ibits(Lx, 23, 23) + x2 = ibits(Lx, 0, 23) + xa = ishft(ibits(a1*x2+a2*x1, 0, 23), 23) + a2*x2 + Lx = ibits(xa,0, 46) + y(i) = d2m46*dble(Lx) + end do + x = dble(Lx) + return + end + diff --git a/examples/smpi/NAS/common/timers.f b/examples/smpi/NAS/common/timers.f new file mode 100644 index 0000000000..7a19ccf56f --- /dev/null +++ b/examples/smpi/NAS/common/timers.f @@ -0,0 +1,78 @@ +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine timer_clear(n) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + integer n + + double precision start(64), elapsed(64) + common /tt/ start, elapsed + + elapsed(n) = 0.0 + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine timer_start(n) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + integer n + include 'mpif.h' + double precision start(64), elapsed(64) + common /tt/ start, elapsed + + start(n) = MPI_Wtime() + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + subroutine timer_stop(n) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + integer n + include 'mpif.h' + double precision start(64), elapsed(64) + common /tt/ start, elapsed + double precision t, now + now = MPI_Wtime() + t = now - start(n) + elapsed(n) = elapsed(n) + t + + return + end + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + double precision function timer_read(n) + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + integer n + double precision start(64), elapsed(64) + common /tt/ start, elapsed + + timer_read = elapsed(n) + return + end + diff --git a/examples/smpi/NAS/config/NAS.samples/README b/examples/smpi/NAS/config/NAS.samples/README new file mode 100644 index 0000000000..ae535e95cf --- /dev/null +++ b/examples/smpi/NAS/config/NAS.samples/README @@ -0,0 +1,7 @@ +This directory contains examples of make.def files that were used +by the NPB team in testing the benchmarks on different platforms. +They can be used as starting points for make.def files for your +own platform, but you may need to taylor them for best performance +on your installation. A clean template can be found in directory +`config'. +Some examples of suite.def files are also provided. \ No newline at end of file diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.dec_alpha b/examples/smpi/NAS/config/NAS.samples/make.def.dec_alpha new file mode 100644 index 0000000000..44f045354d --- /dev/null +++ b/examples/smpi/NAS/config/NAS.samples/make.def.dec_alpha @@ -0,0 +1,18 @@ +#This is for a DEC Alpha 8400. The code will execute on a +#single processor +#Warning: parallel make does not work properly in general +MPIF77 = f77 +FLINK = f77 +#Optimization -O5 breaks SP; works fine for all other codes +FFLAGS = -O4 + +MPICC = cc +CLINK = cc +CFLAGS = -O5 + +include ../config/make.dummy + +CC = cc -g +BINDIR = ../bin + +RAND = randi8 diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.irix6.2 b/examples/smpi/NAS/config/NAS.samples/make.def.irix6.2 new file mode 100644 index 0000000000..f76404755a --- /dev/null +++ b/examples/smpi/NAS/config/NAS.samples/make.def.irix6.2 @@ -0,0 +1,16 @@ +#This is for a generic single-processor SGI workstation +MPIF77 = f77 +FLINK = f77 +FFLAGS = -O3 + +MPICC = cc +CLINK = cc +CFLAGS = -O3 + +include ../config/make.dummy + +CC = cc -g +BINDIR = ../bin + +RAND = randi8 + diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.origin b/examples/smpi/NAS/config/NAS.samples/make.def.origin new file mode 100644 index 0000000000..11c63c975c --- /dev/null +++ b/examples/smpi/NAS/config/NAS.samples/make.def.origin @@ -0,0 +1,20 @@ +# This is for a an SGI Origin 2000 or 3000 with vendor MPI. The Fortran +# record length is specified, so it can be used for the I/O benchmark. +# as well +MPIF77 = f77 +FMPI_LIB = -lmpi +FLINK = f77 -64 +FFLAGS = -O3 -64 + +MPICC = cc +CMPI_LIB = -lmpi +CLINK = cc +CFLAGS = -O3 + +CC = cc -g +BINDIR = ../bin + +RAND = randi8 + +CONVERTFLAG = -DFORTRAN_REC_SIZE=4 + diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.sgi_powerchallenge b/examples/smpi/NAS/config/NAS.samples/make.def.sgi_powerchallenge new file mode 100644 index 0000000000..379726d057 --- /dev/null +++ b/examples/smpi/NAS/config/NAS.samples/make.def.sgi_powerchallenge @@ -0,0 +1,16 @@ +# This is for the SGI PowerChallenge Array at NASA Ames. mrf77 and +# mrcc are local scripts that invoke the proper MPI library. +MPIF77 = mrf77 +FLINK = mrf77 +FFLAGS = -O3 -OPT:fold_arith_limit=1204 + +MPICC = mrcc +CLINK = mrcc +CFLAGS = -O3 -OPT:fold_arith_limit=1204 + +CC = cc -g +BINDIR = ../bin + +RAND = randi8 + + diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.sp2_babbage b/examples/smpi/NAS/config/NAS.samples/make.def.sp2_babbage new file mode 100644 index 0000000000..7896d56f7d --- /dev/null +++ b/examples/smpi/NAS/config/NAS.samples/make.def.sp2_babbage @@ -0,0 +1,17 @@ +#This is for the IBM SP2 at Ames; mrf77 and mrcc are local scripts +MPIF77 = mrf77 +FLINK = mrf77 +FFLAGS = -O3 +FLINKFLAGS = -bmaxdata:0x60000000 + +MPICC = mrcc +CLINK = mrcc +CFLAGS = -O3 +CLINKFLAGS = -bmaxdata:0x60000000 + +CC = cc -g + +BINDIR = ../bin + +RAND = randi8 + diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.sun_ultra_sparc b/examples/smpi/NAS/config/NAS.samples/make.def.sun_ultra_sparc new file mode 100644 index 0000000000..420dfde821 --- /dev/null +++ b/examples/smpi/NAS/config/NAS.samples/make.def.sun_ultra_sparc @@ -0,0 +1,30 @@ +# This is for a Sun SparcCenter or UltraEnterprise machine +MPIF77 = f77 +FLINK = f77 +FMPI_LIB = -L/lib/solaris/ch_lfshmem -lmpi +FMPI_INC = -I/include +# sparc10,20 SparcCenter{1,2}000 (uname -m returns sun4m) +# and f77 -V returns 4.0 or greater +# FFLAGS = -fast -xtarget=super -xO4 -depend +# Ultra1,2, UltraEnterprise servers (uname -m returns sun4u) +FFLAGS = -fast -xtarget=ultra -xarch=v8plus -xO4 -depend +FLINKFLAGS = -lmopt -lcopt -lsunmath + +MPICC = cc +CLINK = cc +CMPI_LIB = -L/lib/solaris/ch_lfshmem -lmpi +CMPI_INC = -I/include +# sparc10,20 SparcCenter{1,2}000 (uname -m returns sun4m) +# and cc -V returns 4.0 or greater +#CFLAGS = -fast -xtarget=super -xO4 -xdepend +# Ultra1,2, UltraEnterprise servers (uname -m returns sun4u) +CFLAGS = -fast -xtarget=ultra -xarch=v8plus -xO4 -xdepend +CLINKFLAGS = -fast + +CC = cc -g + +BINDIR = ../bin + +# Cannot use randi8 or randi8-safe on a 32-but machine. Use double precision +RAND = randdp + diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.t3d_cosmos b/examples/smpi/NAS/config/NAS.samples/make.def.t3d_cosmos new file mode 100644 index 0000000000..d3b3bbf528 --- /dev/null +++ b/examples/smpi/NAS/config/NAS.samples/make.def.t3d_cosmos @@ -0,0 +1,25 @@ +#This is for the Cray T3D at the Jet Propulsion Laboratory +MPIF77 = cf77 +FLINK = cf77 +FMPI_LIB = -L/usr/local/mpp/lib -lmpi +FMPI_INC = -I/usr/local/mpp/lib/include/mpp +FFLAGS = -dp -Wf-onoieeedivide -C cray-t3d +#The following flags provide more effective optimization, but may +#cause the random number generator randi8(_safe) to break in EP +#FFLAGS = -dp -Wf-oaggress -Wf-onoieeedivide -C cray-t3d +FLINKFLAGS = -Wl-Drdahead=on -C cray-t3d + +MPICC = cc +CLINK = cc +CMPI_LIB = -L/usr/local/mpp/lib -lmpi +CMPI_INC = -I/usr/local/mpp/lib/include/mpp +CFLAGS = -O3 -Tcray-t3d +CLINKFLAGS = -Tcray-t3d + +CC = cc -g -Tcray-ymp +BINDIR = ../bin + +CONVERTFLAG= -DCONVERTDOUBLE + +RAND = randi8 + diff --git a/examples/smpi/NAS/config/NAS.samples/make.def_sun_mpich b/examples/smpi/NAS/config/NAS.samples/make.def_sun_mpich new file mode 100644 index 0000000000..99b0b69041 --- /dev/null +++ b/examples/smpi/NAS/config/NAS.samples/make.def_sun_mpich @@ -0,0 +1,165 @@ +#--------------------------------------------------------------------------- +# +# SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS. +# +#--------------------------------------------------------------------------- + +#--------------------------------------------------------------------------- +# Items in this file will need to be changed for each platform. +# (Note these definitions are inconsistent with NPB2.1.) +#--------------------------------------------------------------------------- + +#--------------------------------------------------------------------------- +# Parallel Fortran: +# +# For CG, EP, FT, MG, LU, SP and BT, which are in Fortran, the following must +# be defined: +# +# MPIF77 - Fortran compiler +# FFLAGS - Fortran compilation arguments +# FMPI_INC - any -I arguments required for compiling MPI/Fortran +# FLINK - Fortran linker +# FLINKFLAGS - Fortran linker arguments +# FMPI_LIB - any -L and -l arguments required for linking MPI/Fortran +# +# compilations are done with $(MPIF77) $(FMPI_INC) $(FFLAGS) or +# $(MPIF77) $(FFLAGS) +# linking is done with $(FLINK) $(FMPI_LIB) $(FLINKFLAGS) +#--------------------------------------------------------------------------- + +#--------------------------------------------------------------------------- +# This is the fortran compiler used for MPI programs +#--------------------------------------------------------------------------- +MPIF77 = mpif77 +# This links MPI fortran programs; usually the same as ${MPIF77} +FLINK = $(MPIF77) + +#--------------------------------------------------------------------------- +# These macros are passed to the linker to help link with MPI correctly +#--------------------------------------------------------------------------- +FMPI_LIB = + +#--------------------------------------------------------------------------- +# These macros are passed to the compiler to help find 'mpif.h' +#--------------------------------------------------------------------------- +FMPI_INC = + +#--------------------------------------------------------------------------- +# Global *compile time* flags for Fortran programs +#--------------------------------------------------------------------------- +FFLAGS = -fast +# FFLAGS = -g + +#--------------------------------------------------------------------------- +# Global *link time* flags. Flags for increasing maximum executable +# size usually go here. +#--------------------------------------------------------------------------- +FLINKFLAGS = -fast + + +#--------------------------------------------------------------------------- +# Parallel C: +# +# For IS, which is in C, the following must be defined: +# +# MPICC - C compiler +# CFLAGS - C compilation arguments +# CMPI_INC - any -I arguments required for compiling MPI/C +# CLINK - C linker +# CLINKFLAGS - C linker flags +# CMPI_LIB - any -L and -l arguments required for linking MPI/C +# +# compilations are done with $(MPICC) $(CMPI_INC) $(CFLAGS) or +# $(MPICC) $(CFLAGS) +# linking is done with $(CLINK) $(CMPI_LIB) $(CLINKFLAGS) +#--------------------------------------------------------------------------- + +#--------------------------------------------------------------------------- +# This is the C compiler used for MPI programs +#--------------------------------------------------------------------------- +MPICC = mpicc +# This links MPI C programs; usually the same as ${MPICC} +CLINK = $(MPICC) + +#--------------------------------------------------------------------------- +# These macros are passed to the linker to help link with MPI correctly +#--------------------------------------------------------------------------- +CMPI_LIB = + +#--------------------------------------------------------------------------- +# These macros are passed to the compiler to help find 'mpi.h' +#--------------------------------------------------------------------------- +CMPI_INC = + +#--------------------------------------------------------------------------- +# Global *compile time* flags for C programs +#--------------------------------------------------------------------------- +CFLAGS = -fast +# CFLAGS = -g + +#--------------------------------------------------------------------------- +# Global *link time* flags. Flags for increasing maximum executable +# size usually go here. +#--------------------------------------------------------------------------- +CLINKFLAGS = -fast + + +#--------------------------------------------------------------------------- +# MPI dummy library: +# +# Uncomment if you want to use the MPI dummy library supplied by NAS instead +# of the true message-passing library. The include file redefines several of +# the above macros. It also invokes make in subdirectory MPI_dummy. Make +# sure that no spaces or tabs precede include. +#--------------------------------------------------------------------------- +# include ../config/make.dummy + + +#--------------------------------------------------------------------------- +# Utilities C: +# +# This is the C compiler used to compile C utilities. Flags required by +# this compiler go here also; typically there are few flags required; hence +# there are no separate macros provided for such flags. +#--------------------------------------------------------------------------- +CC = cc -g + + +#--------------------------------------------------------------------------- +# Destination of executables, relative to subdirs of the main directory. . +#--------------------------------------------------------------------------- +BINDIR = ../bin + + +#--------------------------------------------------------------------------- +# Some machines (e.g. Crays) have 128-bit DOUBLE PRECISION numbers, which +# is twice the precision required for the NPB suite. A compiler flag +# (e.g. -dp) can usually be used to change DOUBLE PRECISION variables to +# 64 bits, but the MPI library may continue to send 128 bits. Short of +# recompiling MPI, the solution is to use MPI_REAL to send these 64-bit +# numbers, and MPI_COMPLEX to send their complex counterparts. Uncomment +# the following line to enable this substitution. +# +# NOTE: IF THE I/O BENCHMARK IS BEING BUILT, WE USE CONVERTFLAG TO +# SPECIFIY THE FORTRAN RECORD LENGTH UNIT. IT IS A SYSTEM-SPECIFIC +# VALUE (USUALLY 1 OR 4). UNCOMMENT THE SECOND LINE AND SUBSTITUTE +# THE CORRECT VALUE FOR "length". +# IF BOTH 128-BIT DOUBLE PRECISION NUMBERS AND I/O ARE TO BE ENABLED, +# UNCOMMENT THE THIRD LINE AND SUBSTITUTE THE CORRECT VALUE FOR +# "length" +#--------------------------------------------------------------------------- +# CONVERTFLAG = -DCONVERTDOUBLE +CONVERTFLAG = -DFORTRAN_REC_SIZE=1 +# CONVERTFLAG = -DCONVERTDOUBLE -DFORTRAN_REC_SIZE=length + + +#--------------------------------------------------------------------------- +# The variable RAND controls which random number generator +# is used. It is described in detail in Doc/README.install. +# Use "randi8" unless there is a reason to use another one. +# Other allowed values are "randi8_safe", "randdp" and "randdpvec" +#--------------------------------------------------------------------------- +RAND = randi8 +# The following is highly reliable but may be slow: +# RAND = randdp + diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.bt b/examples/smpi/NAS/config/NAS.samples/suite.def.bt new file mode 100644 index 0000000000..f330636aa0 --- /dev/null +++ b/examples/smpi/NAS/config/NAS.samples/suite.def.bt @@ -0,0 +1,37 @@ +bt S 1 +bt S 4 +bt S 9 +bt S 16 +bt A 1 +bt A 4 +bt A 9 +bt A 16 +bt A 25 +bt A 36 +bt A 49 +bt A 64 +bt A 81 +bt A 100 +bt A 121 +bt B 1 +bt B 4 +bt B 9 +bt B 16 +bt B 25 +bt B 36 +bt B 49 +bt B 64 +bt B 81 +bt B 100 +bt B 121 +bt C 1 +bt C 4 +bt C 9 +bt C 16 +bt C 25 +bt C 36 +bt C 49 +bt C 64 +bt C 81 +bt C 100 +bt C 121 diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.cg b/examples/smpi/NAS/config/NAS.samples/suite.def.cg new file mode 100644 index 0000000000..393bc508f6 --- /dev/null +++ b/examples/smpi/NAS/config/NAS.samples/suite.def.cg @@ -0,0 +1,29 @@ +cg S 1 +cg S 2 +cg S 4 +cg S 8 +cg S 16 +cg A 1 +cg A 2 +cg A 4 +cg A 8 +cg A 16 +cg A 32 +cg A 64 +cg A 128 +cg B 1 +cg B 2 +cg B 4 +cg B 8 +cg B 16 +cg B 32 +cg B 64 +cg B 128 +cg C 1 +cg C 2 +cg C 4 +cg C 8 +cg C 16 +cg C 32 +cg C 64 +cg C 128 diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.ep b/examples/smpi/NAS/config/NAS.samples/suite.def.ep new file mode 100644 index 0000000000..e2ca3cd007 --- /dev/null +++ b/examples/smpi/NAS/config/NAS.samples/suite.def.ep @@ -0,0 +1,29 @@ +ep S 1 +ep S 2 +ep S 4 +ep S 8 +ep S 16 +ep A 1 +ep A 2 +ep A 4 +ep A 8 +ep A 16 +ep A 32 +ep A 64 +ep A 128 +ep B 1 +ep B 2 +ep B 4 +ep B 8 +ep B 16 +ep B 32 +ep B 64 +ep B 128 +ep C 1 +ep C 2 +ep C 4 +ep C 8 +ep C 16 +ep C 32 +ep C 64 +ep C 128 diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.ft b/examples/smpi/NAS/config/NAS.samples/suite.def.ft new file mode 100644 index 0000000000..6f05189094 --- /dev/null +++ b/examples/smpi/NAS/config/NAS.samples/suite.def.ft @@ -0,0 +1,29 @@ +ft S 1 +ft S 2 +ft S 4 +ft S 8 +ft S 16 +ft A 1 +ft A 2 +ft A 4 +ft A 8 +ft A 16 +ft A 32 +ft A 64 +ft A 128 +ft B 1 +ft B 2 +ft B 4 +ft B 8 +ft B 16 +ft B 32 +ft B 64 +ft B 128 +ft C 1 +ft C 2 +ft C 4 +ft C 8 +ft C 16 +ft C 32 +ft C 64 +ft C 128 diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.is b/examples/smpi/NAS/config/NAS.samples/suite.def.is new file mode 100644 index 0000000000..97e898d132 --- /dev/null +++ b/examples/smpi/NAS/config/NAS.samples/suite.def.is @@ -0,0 +1,29 @@ +is S 1 +is S 2 +is S 4 +is S 8 +is S 16 +is A 1 +is A 2 +is A 4 +is A 8 +is A 16 +is A 32 +is A 64 +is A 128 +is B 1 +is B 2 +is B 4 +is B 8 +is B 16 +is B 32 +is B 64 +is B 128 +is C 1 +is C 2 +is C 4 +is C 8 +is C 16 +is C 32 +is C 64 +is C 128 diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.lu b/examples/smpi/NAS/config/NAS.samples/suite.def.lu new file mode 100644 index 0000000000..442e0b6842 --- /dev/null +++ b/examples/smpi/NAS/config/NAS.samples/suite.def.lu @@ -0,0 +1,29 @@ +lu S 1 +lu S 2 +lu S 4 +lu S 8 +lu S 16 +lu A 1 +lu A 2 +lu A 4 +lu A 8 +lu A 16 +lu A 32 +lu A 64 +lu A 128 +lu B 1 +lu B 2 +lu B 4 +lu B 8 +lu B 16 +lu B 32 +lu B 64 +lu B 128 +lu C 1 +lu C 2 +lu C 4 +lu C 8 +lu C 16 +lu C 32 +lu C 64 +lu C 128 diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.mg b/examples/smpi/NAS/config/NAS.samples/suite.def.mg new file mode 100644 index 0000000000..b5c01d4557 --- /dev/null +++ b/examples/smpi/NAS/config/NAS.samples/suite.def.mg @@ -0,0 +1,29 @@ +mg S 1 +mg S 2 +mg S 4 +mg S 8 +mg S 16 +mg A 1 +mg A 2 +mg A 4 +mg A 8 +mg A 16 +mg A 32 +mg A 64 +mg A 128 +mg B 1 +mg B 2 +mg B 4 +mg B 8 +mg B 16 +mg B 32 +mg B 64 +mg B 128 +mg C 1 +mg C 2 +mg C 4 +mg C 8 +mg C 16 +mg C 32 +mg C 64 +mg C 128 diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.small b/examples/smpi/NAS/config/NAS.samples/suite.def.small new file mode 100644 index 0000000000..5a094042bd --- /dev/null +++ b/examples/smpi/NAS/config/NAS.samples/suite.def.small @@ -0,0 +1,8 @@ +bt S 1 +cg S 1 +ep S 1 +ft S 1 +is S 1 +lu S 1 +mg S 1 +sp S 1 diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.sp b/examples/smpi/NAS/config/NAS.samples/suite.def.sp new file mode 100644 index 0000000000..f8113a256e --- /dev/null +++ b/examples/smpi/NAS/config/NAS.samples/suite.def.sp @@ -0,0 +1,37 @@ +sp S 1 +sp S 4 +sp S 9 +sp S 16 +sp A 1 +sp A 4 +sp A 9 +sp A 16 +sp A 25 +sp A 36 +sp A 49 +sp A 64 +sp A 81 +sp A 100 +sp A 121 +sp B 1 +sp B 4 +sp B 9 +sp B 16 +sp B 25 +sp B 36 +sp B 49 +sp B 64 +sp B 81 +sp B 100 +sp B 121 +sp C 1 +sp C 4 +sp C 9 +sp C 16 +sp C 25 +sp C 36 +sp C 49 +sp C 64 +sp C 81 +sp C 100 +sp C 121 diff --git a/examples/smpi/NAS/config/make.def b/examples/smpi/NAS/config/make.def new file mode 100644 index 0000000000..ffcfda2bef --- /dev/null +++ b/examples/smpi/NAS/config/make.def @@ -0,0 +1,162 @@ +#--------------------------------------------------------------------------- +# +# SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS. +# +#--------------------------------------------------------------------------- + +#--------------------------------------------------------------------------- +# Items in this file will need to be changed for each platform. +#--------------------------------------------------------------------------- + +#--------------------------------------------------------------------------- +# Parallel Fortran: +# +# For CG, EP, FT, MG, LU, SP and BT, which are in Fortran, the following must +# be defined: +# +# MPIF77 - Fortran compiler +# FFLAGS - Fortran compilation arguments +# FMPI_INC - any -I arguments required for compiling MPI/Fortran +# FLINK - Fortran linker +# FLINKFLAGS - Fortran linker arguments +# FMPI_LIB - any -L and -l arguments required for linking MPI/Fortran +# +# compilations are done with $(MPIF77) $(FMPI_INC) $(FFLAGS) or +# $(MPIF77) $(FFLAGS) +# linking is done with $(FLINK) $(FMPI_LIB) $(FLINKFLAGS) +#--------------------------------------------------------------------------- + +#--------------------------------------------------------------------------- +# This is the fortran compiler used for MPI programs +#--------------------------------------------------------------------------- +MPIF77 = smpicc +# This links MPI fortran programs; usually the same as ${MPIF77} +FLINK = $(MPIF77) + +#--------------------------------------------------------------------------- +# These macros are passed to the linker to help link with MPI correctly +#--------------------------------------------------------------------------- +FMPI_LIB = -lgfortran + +#--------------------------------------------------------------------------- +# These macros are passed to the compiler to help find 'mpif.h' +#--------------------------------------------------------------------------- +FMPI_INC = -I/usr/lib/openmpi/include + +#--------------------------------------------------------------------------- +# Global *compile time* flags for Fortran programs +#--------------------------------------------------------------------------- +FFLAGS = -O2 + +#--------------------------------------------------------------------------- +# Global *link time* flags. Flags for increasing maximum executable +# size usually go here. +#--------------------------------------------------------------------------- +FLINKFLAGS = -O2 + + +#--------------------------------------------------------------------------- +# Parallel C: +# +# For IS, which is in C, the following must be defined: +# +# MPICC - C compiler +# CFLAGS - C compilation arguments +# CMPI_INC - any -I arguments required for compiling MPI/C +# CLINK - C linker +# CLINKFLAGS - C linker flags +# CMPI_LIB - any -L and -l arguments required for linking MPI/C +# +# compilations are done with $(MPICC) $(CMPI_INC) $(CFLAGS) or +# $(MPICC) $(CFLAGS) +# linking is done with $(CLINK) $(CMPI_LIB) $(CLINKFLAGS) +#--------------------------------------------------------------------------- + +#--------------------------------------------------------------------------- +# This is the C compiler used for MPI programs +#--------------------------------------------------------------------------- +MPICC = smpicc +# This links MPI C programs; usually the same as ${MPICC} +CLINK = $(MPICC) + +#--------------------------------------------------------------------------- +# These macros are passed to the linker to help link with MPI correctly +#--------------------------------------------------------------------------- +CMPI_LIB = + +#--------------------------------------------------------------------------- +# These macros are passed to the compiler to help find 'mpi.h' +#--------------------------------------------------------------------------- +CMPI_INC = + +#--------------------------------------------------------------------------- +# Global *compile time* flags for C programs +#--------------------------------------------------------------------------- +CFLAGS = -O2 + +#--------------------------------------------------------------------------- +# Global *link time* flags. Flags for increasing maximum executable +# size usually go here. +#--------------------------------------------------------------------------- +CLINKFLAGS = -O2 + + +#--------------------------------------------------------------------------- +# MPI dummy library: +# +# Uncomment if you want to use the MPI dummy library supplied by NAS instead +# of the true message-passing library. The include file redefines several of +# the above macros. It also invokes make in subdirectory MPI_dummy. Make +# sure that no spaces or tabs precede include. +#--------------------------------------------------------------------------- +# include ../config/make.dummy + + +#--------------------------------------------------------------------------- +# Utilities C: +# +# This is the C compiler used to compile C utilities. Flags required by +# this compiler go here also; typically there are few flags required; hence +# there are no separate macros provided for such flags. +#--------------------------------------------------------------------------- +CC = gcc -g + + +#--------------------------------------------------------------------------- +# Destination of executables, relative to subdirs of the main directory. . +#--------------------------------------------------------------------------- +BINDIR = ../bin + + +#--------------------------------------------------------------------------- +# Some machines (e.g. Crays) have 128-bit DOUBLE PRECISION numbers, which +# is twice the precision required for the NPB suite. A compiler flag +# (e.g. -dp) can usually be used to change DOUBLE PRECISION variables to +# 64 bits, but the MPI library may continue to send 128 bits. Short of +# recompiling MPI, the solution is to use MPI_REAL to send these 64-bit +# numbers, and MPI_COMPLEX to send their complex counterparts. Uncomment +# the following line to enable this substitution. +# +# NOTE: IF THE I/O BENCHMARK IS BEING BUILT, WE USE CONVERTFLAG TO +# SPECIFIY THE FORTRAN RECORD LENGTH UNIT. IT IS A SYSTEM-SPECIFIC +# VALUE (USUALLY 1 OR 4). UNCOMMENT THE SECOND LINE AND SUBSTITUTE +# THE CORRECT VALUE FOR "length". +# IF BOTH 128-BIT DOUBLE PRECISION NUMBERS AND I/O ARE TO BE ENABLED, +# UNCOMMENT THE THIRD LINE AND SUBSTITUTE THE CORRECT VALUE FOR +# "length" +#--------------------------------------------------------------------------- +# CONVERTFLAG = -DCONVERTDOUBLE +# CONVERTFLAG = -DFORTRAN_REC_SIZE=length +# CONVERTFLAG = -DCONVERTDOUBLE -DFORTRAN_REC_SIZE=length + + +#--------------------------------------------------------------------------- +# The variable RAND controls which random number generator +# is used. It is described in detail in README.install. +# Use "randi8" unless there is a reason to use another one. +# Other allowed values are "randi8_safe", "randdp" and "randdpvec" +#--------------------------------------------------------------------------- +RAND = randi8 +# The following is highly reliable but may be slow: +# RAND = randdp + diff --git a/examples/smpi/NAS/config/make.def.template b/examples/smpi/NAS/config/make.def.template new file mode 100644 index 0000000000..8cccc29439 --- /dev/null +++ b/examples/smpi/NAS/config/make.def.template @@ -0,0 +1,162 @@ +#--------------------------------------------------------------------------- +# +# SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS. +# +#--------------------------------------------------------------------------- + +#--------------------------------------------------------------------------- +# Items in this file will need to be changed for each platform. +#--------------------------------------------------------------------------- + +#--------------------------------------------------------------------------- +# Parallel Fortran: +# +# For CG, EP, FT, MG, LU, SP and BT, which are in Fortran, the following must +# be defined: +# +# MPIF77 - Fortran compiler +# FFLAGS - Fortran compilation arguments +# FMPI_INC - any -I arguments required for compiling MPI/Fortran +# FLINK - Fortran linker +# FLINKFLAGS - Fortran linker arguments +# FMPI_LIB - any -L and -l arguments required for linking MPI/Fortran +# +# compilations are done with $(MPIF77) $(FMPI_INC) $(FFLAGS) or +# $(MPIF77) $(FFLAGS) +# linking is done with $(FLINK) $(FMPI_LIB) $(FLINKFLAGS) +#--------------------------------------------------------------------------- + +#--------------------------------------------------------------------------- +# This is the fortran compiler used for MPI programs +#--------------------------------------------------------------------------- +MPIF77 = f77 +# This links MPI fortran programs; usually the same as ${MPIF77} +FLINK = $(MPIF77) + +#--------------------------------------------------------------------------- +# These macros are passed to the linker to help link with MPI correctly +#--------------------------------------------------------------------------- +FMPI_LIB = -L/usr/local/lib -lmpi + +#--------------------------------------------------------------------------- +# These macros are passed to the compiler to help find 'mpif.h' +#--------------------------------------------------------------------------- +FMPI_INC = -I/usr/local/include + +#--------------------------------------------------------------------------- +# Global *compile time* flags for Fortran programs +#--------------------------------------------------------------------------- +FFLAGS = -O + +#--------------------------------------------------------------------------- +# Global *link time* flags. Flags for increasing maximum executable +# size usually go here. +#--------------------------------------------------------------------------- +FLINKFLAGS = -O + + +#--------------------------------------------------------------------------- +# Parallel C: +# +# For IS, which is in C, the following must be defined: +# +# MPICC - C compiler +# CFLAGS - C compilation arguments +# CMPI_INC - any -I arguments required for compiling MPI/C +# CLINK - C linker +# CLINKFLAGS - C linker flags +# CMPI_LIB - any -L and -l arguments required for linking MPI/C +# +# compilations are done with $(MPICC) $(CMPI_INC) $(CFLAGS) or +# $(MPICC) $(CFLAGS) +# linking is done with $(CLINK) $(CMPI_LIB) $(CLINKFLAGS) +#--------------------------------------------------------------------------- + +#--------------------------------------------------------------------------- +# This is the C compiler used for MPI programs +#--------------------------------------------------------------------------- +MPICC = cc +# This links MPI C programs; usually the same as ${MPICC} +CLINK = $(MPICC) + +#--------------------------------------------------------------------------- +# These macros are passed to the linker to help link with MPI correctly +#--------------------------------------------------------------------------- +CMPI_LIB = -L/usr/local/lib -lmpi + +#--------------------------------------------------------------------------- +# These macros are passed to the compiler to help find 'mpi.h' +#--------------------------------------------------------------------------- +CMPI_INC = -I/usr/local/include + +#--------------------------------------------------------------------------- +# Global *compile time* flags for C programs +#--------------------------------------------------------------------------- +CFLAGS = -O + +#--------------------------------------------------------------------------- +# Global *link time* flags. Flags for increasing maximum executable +# size usually go here. +#--------------------------------------------------------------------------- +CLINKFLAGS = -O + + +#--------------------------------------------------------------------------- +# MPI dummy library: +# +# Uncomment if you want to use the MPI dummy library supplied by NAS instead +# of the true message-passing library. The include file redefines several of +# the above macros. It also invokes make in subdirectory MPI_dummy. Make +# sure that no spaces or tabs precede include. +#--------------------------------------------------------------------------- +# include ../config/make.dummy + + +#--------------------------------------------------------------------------- +# Utilities C: +# +# This is the C compiler used to compile C utilities. Flags required by +# this compiler go here also; typically there are few flags required; hence +# there are no separate macros provided for such flags. +#--------------------------------------------------------------------------- +CC = cc -g + + +#--------------------------------------------------------------------------- +# Destination of executables, relative to subdirs of the main directory. . +#--------------------------------------------------------------------------- +BINDIR = ../bin + + +#--------------------------------------------------------------------------- +# Some machines (e.g. Crays) have 128-bit DOUBLE PRECISION numbers, which +# is twice the precision required for the NPB suite. A compiler flag +# (e.g. -dp) can usually be used to change DOUBLE PRECISION variables to +# 64 bits, but the MPI library may continue to send 128 bits. Short of +# recompiling MPI, the solution is to use MPI_REAL to send these 64-bit +# numbers, and MPI_COMPLEX to send their complex counterparts. Uncomment +# the following line to enable this substitution. +# +# NOTE: IF THE I/O BENCHMARK IS BEING BUILT, WE USE CONVERTFLAG TO +# SPECIFIY THE FORTRAN RECORD LENGTH UNIT. IT IS A SYSTEM-SPECIFIC +# VALUE (USUALLY 1 OR 4). UNCOMMENT THE SECOND LINE AND SUBSTITUTE +# THE CORRECT VALUE FOR "length". +# IF BOTH 128-BIT DOUBLE PRECISION NUMBERS AND I/O ARE TO BE ENABLED, +# UNCOMMENT THE THIRD LINE AND SUBSTITUTE THE CORRECT VALUE FOR +# "length" +#--------------------------------------------------------------------------- +# CONVERTFLAG = -DCONVERTDOUBLE +# CONVERTFLAG = -DFORTRAN_REC_SIZE=length +# CONVERTFLAG = -DCONVERTDOUBLE -DFORTRAN_REC_SIZE=length + + +#--------------------------------------------------------------------------- +# The variable RAND controls which random number generator +# is used. It is described in detail in README.install. +# Use "randi8" unless there is a reason to use another one. +# Other allowed values are "randi8_safe", "randdp" and "randdpvec" +#--------------------------------------------------------------------------- +RAND = randi8 +# The following is highly reliable but may be slow: +# RAND = randdp + diff --git a/examples/smpi/NAS/config/make.dummy b/examples/smpi/NAS/config/make.dummy new file mode 100644 index 0000000000..16b2350667 --- /dev/null +++ b/examples/smpi/NAS/config/make.dummy @@ -0,0 +1,7 @@ +FMPI_LIB = -L../MPI_dummy -lmpi +FMPI_INC = -I../MPI_dummy +CMPI_LIB = -L../MPI_dummy -lmpi +CMPI_INC = -I../MPI_dummy +default:: ${PROGRAM} libmpi.a +libmpi.a: + cd ../MPI_dummy; $(MAKE) F77=$(MPIF77) CC=$(MPICC) diff --git a/examples/smpi/NAS/config/suite.def.template b/examples/smpi/NAS/config/suite.def.template new file mode 100644 index 0000000000..aea8b2358e --- /dev/null +++ b/examples/smpi/NAS/config/suite.def.template @@ -0,0 +1,24 @@ +# config/suite.def +# This file is used to build several benchmarks with a single command. +# Typing "make suite" in the main directory will build all the benchmarks +# specified in this file. +# Each line of this file contains a benchmark name, class, and number +# of nodes. The name is one of "cg", "is", "ep", mg", "ft", "sp", "bt", +# "lu", and "dt". +# The class is one of "S", "W", "A", "B", "C", "D", and "E" +# (except that no classes C, D and E for DT, and no class E for IS). +# The number of nodes must be a legal number for a particular +# benchmark. The utility which parses this file is primitive, so +# formatting is inflexible. Separate name/class/number by tabs. +# Comments start with "#" as the first character on a line. +# No blank lines. +# The following example builds 1 processor sample sizes of all benchmarks. +ft S 1 +mg S 1 +sp S 1 +lu S 1 +bt S 1 +is S 1 +ep S 1 +cg S 1 +dt S 1 diff --git a/examples/smpi/NAS/sys/Makefile b/examples/smpi/NAS/sys/Makefile new file mode 100644 index 0000000000..56d1c444bd --- /dev/null +++ b/examples/smpi/NAS/sys/Makefile @@ -0,0 +1,22 @@ +include ../config/make.def + +# Note that COMPILE is also defined in make.common and should +# be the same. We can't include make.common because it has a lot +# of other garbage. LINK is not defined in make.common because +# ${MPI_LIB} needs to go at the end of the line. +FCOMPILE = $(MPIF77) -c $(FMPI_INC) $(FFLAGS) + +all: setparams + +# setparams creates an npbparam.h file for each benchmark +# configuration. npbparams.h also contains info about how a benchmark +# was compiled and linked + +setparams: setparams.c ../config/make.def + $(CC) ${CONVERTFLAG} -o setparams setparams.c + + +clean: + -rm -f setparams setparams.h npbparams.h + -rm -f *~ *.o + diff --git a/examples/smpi/NAS/sys/README b/examples/smpi/NAS/sys/README new file mode 100644 index 0000000000..3c97c524c3 --- /dev/null +++ b/examples/smpi/NAS/sys/README @@ -0,0 +1,39 @@ +This directory contains utilities and files used by the +build process. You should not need to change anything +in this directory. + +Original Files +-------------- +setparams.c: + Source for the setparams program. This program is used internally + in the build process to create the file "npbparams.h" for each + benchmark. npbparams.h contains Fortran or C parameters to build a + benchmark for a specific class and number of nodes. The setparams + program is never run directly by a user. Its invocation syntax is + "setparams benchmark-name nprocs class". + It examines the file "npbparams.h" in the current directory. If + the specified parameters are the same as those in the npbparams.h + file, nothing it changed. If the file does not exist or corresponds + to a different class/number of nodes, it is (re)built. + One of the more complicated things in npbparams.h is that it + contains, in a Fortran string, the compiler flags used to build a + benchmark, so that a benchmark can print out how it was compiled. + +make.common + A makefile segment that is included in each individual benchmark + program makefile. It sets up some standard macros (COMPILE, etc) + and makes sure everything is configured correctly (npbparams.h) + +Makefile + Builds setparams + +README + This file. + + +Created files +------------- + +setparams + See descriptions above + diff --git a/examples/smpi/NAS/sys/make.common b/examples/smpi/NAS/sys/make.common new file mode 100644 index 0000000000..4469596990 --- /dev/null +++ b/examples/smpi/NAS/sys/make.common @@ -0,0 +1,54 @@ +PROGRAM = $(BINDIR)/$(BENCHMARK).$(CLASS).$(NPROCS) +FCOMPILE = $(MPIF77) -c $(FMPI_INC) $(FFLAGS) +CCOMPILE = $(MPICC) -c $(CMPI_INC) $(CFLAGS) + +# Class "U" is used internally by the setparams program to mean +# "unknown". This means that if you don't specify CLASS= +# on the command line, you'll get an error. It would be nice +# to be able to avoid this, but we'd have to get information +# from the setparams back to the make program, which isn't easy. +CLASS=U +NPROCS=1 + +default:: ${PROGRAM} + +# This makes sure the configuration utility setparams +# is up to date. +# Note that this must be run every time, which is why the +# target does not exist and is not created. +# If you create a file called "config" you will break things. +config: + @cd ../sys; ${MAKE} all + ../sys/setparams ${BENCHMARK} ${NPROCS} ${CLASS} ${SUBTYPE} + +COMMON=../common +${COMMON}/${RAND}.o: ${COMMON}/${RAND}.f + cd ${COMMON}; ${FCOMPILE} ${RAND}.f +${COMMON}/c_randdp.o: ${COMMON}/randdp.c + cd ${COMMON}; ${CCOMPILE} -o c_randdp.o randdp.c + +${COMMON}/print_results.o: ${COMMON}/print_results.f + cd ${COMMON}; ${FCOMPILE} print_results.f + +${COMMON}/c_print_results.o: ${COMMON}/c_print_results.c + cd ${COMMON}; ${CCOMPILE} c_print_results.c + +${COMMON}/timers.o: ${COMMON}/timers.f + cd ${COMMON}; ${FCOMPILE} timers.f + +${COMMON}/c_timers.o: ${COMMON}/c_timers.c + cd ${COMMON}; ${CCOMPILE} c_timers.c + +# Normally setparams updates npbparams.h only if the settings (CLASS/NPROCS) +# have changed. However, we also want to update if the compile options +# may have changed (set in ../config/make.def). +npbparams.h: ../config/make.def + @ echo make.def modified. Rebuilding npbparams.h just in case + rm -f npbparams.h + ../sys/setparams ${BENCHMARK} ${NPROCS} ${CLASS} ${SUBTYPE} + +# So that "make benchmark-name" works +${BENCHMARK}: default +${BENCHMARKU}: default + + diff --git a/examples/smpi/NAS/sys/print_header b/examples/smpi/NAS/sys/print_header new file mode 100755 index 0000000000..4fdb5785d2 --- /dev/null +++ b/examples/smpi/NAS/sys/print_header @@ -0,0 +1,5 @@ +echo ' =========================================' +echo ' = NAS Parallel Benchmarks 3.3 =' +echo ' = MPI/F77/C =' +echo ' =========================================' +echo '' diff --git a/examples/smpi/NAS/sys/print_instructions b/examples/smpi/NAS/sys/print_instructions new file mode 100755 index 0000000000..d2f1999b7f --- /dev/null +++ b/examples/smpi/NAS/sys/print_instructions @@ -0,0 +1,26 @@ +echo '' +echo ' To make a NAS benchmark type ' +echo '' +echo ' make NPROCS= CLASS= [SUBTYPE=]' +echo '' +echo ' where is "bt", "cg", "ep", "ft", "is", "lu",' +echo ' "mg", or "sp"' +echo ' is the number of processors' +echo ' is "S", "W", "A", "B", "C", or "D"' +echo '' +echo ' Only when making the I/O benchmark:' +echo '' +echo ' is "bt"' +echo ' , as above' +echo ' is "full", "simple", "fortran", or "epio"' +echo '' +echo ' To make a set of benchmarks, create the file config/suite.def' +echo ' according to the instructions in config/suite.def.template and type' +echo '' +echo ' make suite' +echo '' +echo ' ***************************************************************' +echo ' * Remember to edit the file config/make.def for site specific *' +echo ' * information as described in the README file *' +echo ' ***************************************************************' + diff --git a/examples/smpi/NAS/sys/setparams.c b/examples/smpi/NAS/sys/setparams.c new file mode 100644 index 0000000000..9a4fba590c --- /dev/null +++ b/examples/smpi/NAS/sys/setparams.c @@ -0,0 +1,1210 @@ +/* + * This utility configures a NPB to be built for a specific number + * of nodes and a specific class. It creates a file "npbparams.h" + * in the source directory. This file keeps state information about + * which size of benchmark is currently being built (so that nothing + * if unnecessarily rebuilt) and defines (through PARAMETER statements) + * the number of nodes and class for which a benchmark is being built. + + * The utility takes 3 arguments: + * setparams benchmark-name nprocs class + * benchmark-name is "sp", "bt", etc + * nprocs is the number of processors to run on + * class is the size of the benchmark + * These parameters are checked for the current benchmark. If they + * are invalid, this program prints a message and aborts. + * If the parameters are ok, the current npbsize.h (actually just + * the first line) is read in. If the new parameters are the same as + * the old, nothing is done, but an exit code is returned to force the + * user to specify (otherwise the make procedure succeeds but builds a + * binary of the wrong name). Otherwise the file is rewritten. + * Errors write a message (to stdout) and abort. + * + * This program makes use of two extra benchmark "classes" + * class "X" means an invalid specification. It is returned if + * there is an error parsing the config file. + * class "U" is an external specification meaning "unknown class" + * + * Unfortunately everything has to be case sensitive. This is + * because we can always convert lower to upper or v.v. but + * can't feed this information back to the makefile, so typing + * make CLASS=a and make CLASS=A will produce different binaries. + * + * + */ + +#include +#include +#include +#include +#include +#include + +/* + * This is the master version number for this set of + * NPB benchmarks. It is in an obscure place so people + * won't accidentally change it. + */ + +#define VERSION "3.3" + +/* controls verbose output from setparams */ +/* #define VERBOSE */ + +#define FILENAME "npbparams.h" +#define DESC_LINE "c NPROCS = %d CLASS = %c\n" +#define BT_DESC_LINE "c NPROCS = %d CLASS = %c SUBTYPE = %s\n" +#define DEF_CLASS_LINE "#define CLASS '%c'\n" +#define DEF_NUM_PROCS_LINE "#define NUM_PROCS %d\n" +#define FINDENT " " +#define CONTINUE " > " + +#ifdef FORTRAN_REC_SIZE +int fortran_rec_size = FORTRAN_REC_SIZE; +#else +int fortran_rec_size = 4; +#endif + +void get_info(int argc, char *argv[], int *typep, int *nprocsp, char *classp, + int* subtypep); +void check_info(int type, int nprocs, char class); +void read_info(int type, int *nprocsp, char *classp, int *subtypep); +void write_info(int type, int nprocs, char class, int subtype); +void write_sp_info(FILE *fp, int nprocs, char class); +void write_bt_info(FILE *fp, int nprocs, char class, int io); +void write_lu_info(FILE *fp, int nprocs, char class); +void write_mg_info(FILE *fp, int nprocs, char class); +void write_cg_info(FILE *fp, int nprocs, char class); +void write_ft_info(FILE *fp, int nprocs, char class); +void write_ep_info(FILE *fp, int nprocs, char class); +void write_ep_info_C(FILE *fp, int nprocs, char class); /* after C translation */ +void write_is_info(FILE *fp, int nprocs, char class); +void write_dt_info(FILE *fp, int nprocs, char class); +void write_compiler_info(int type, FILE *fp); +void write_convertdouble_info(int type, FILE *fp); +void check_line(char *line, char *label, char *val); +int check_include_line(char *line, char *filename); +void put_string(FILE *fp, char *name, char *val); +void put_def_string(FILE *fp, char *name, char *val); +void put_def_variable(FILE *fp, char *name, char *val); +int isqrt(int i); +int ilog2(int i); +int ipow2(int i); + +enum benchmark_types {SP, BT, LU, MG, FT, IS, DT, EP, CG}; +enum iotypes { NONE = 0, FULL, SIMPLE, EPIO, FORTRAN}; + +int main(int argc, char *argv[]) +{ + int nprocs, nprocs_old, type; + char class, class_old; + int subtype = -1, old_subtype = -1; + + /* Get command line arguments. Make sure they're ok. */ + get_info(argc, argv, &type, &nprocs, &class, &subtype); + if (class != 'U') { +#ifdef VERBOSE + printf("setparams: For benchmark %s: number of processors = %d class = %c\n", + argv[1], nprocs, class); +#endif + check_info(type, nprocs, class); + } + + /* Get old information. */ + read_info(type, &nprocs_old, &class_old, &old_subtype); + if (class != 'U') { + if (class_old != 'X') { +#ifdef VERBOSE + printf("setparams: old settings: number of processors = %d class = %c\n", + nprocs_old, class_old); +#endif + } + } else { + printf("setparams:\n\ + *********************************************************************\n\ + * You must specify NPROCS and CLASS to build this benchmark *\n\ + * For example, to build a class A benchmark for 4 processors, type *\n\ + * make {benchmark-name} NPROCS=4 CLASS=A *\n\ + *********************************************************************\n\n"); + + if (class_old != 'X') { +#ifdef VERBOSE + printf("setparams: Previous settings were CLASS=%c NPROCS=%d\n", + class_old, nprocs_old); +#endif + } + exit(1); /* exit on class==U */ + } + + /* Write out new information if it's different. */ + if (nprocs != nprocs_old || class != class_old || subtype != old_subtype) { +#ifdef VERBOSE + printf("setparams: Writing %s\n", FILENAME); +#endif + write_info(type, nprocs, class, subtype); + } else { +#ifdef VERBOSE + printf("setparams: Settings unchanged. %s unmodified\n", FILENAME); +#endif + } + + return 0; +} + + +/* + * get_info(): Get parameters from command line + */ + +void get_info(int argc, char *argv[], int *typep, int *nprocsp, char *classp, + int *subtypep) +{ + + if (argc < 4) { + printf("Usage: %s (%d) benchmark-name nprocs class\n", argv[0], argc); + exit(1); + } + + *nprocsp = atoi(argv[2]); + + *classp = *argv[3]; + + if (!strcmp(argv[1], "sp") || !strcmp(argv[1], "SP")) *typep = SP; + else if (!strcmp(argv[1], "ft") || !strcmp(argv[1], "FT")) *typep = FT; + else if (!strcmp(argv[1], "lu") || !strcmp(argv[1], "LU")) *typep = LU; + else if (!strcmp(argv[1], "mg") || !strcmp(argv[1], "MG")) *typep = MG; + else if (!strcmp(argv[1], "is") || !strcmp(argv[1], "IS")) *typep = IS; + else if (!strcmp(argv[1], "dt") || !strcmp(argv[1], "DT")) *typep = DT; + else if (!strcmp(argv[1], "ep") || !strcmp(argv[1], "EP")) *typep = EP; + else if (!strcmp(argv[1], "cg") || !strcmp(argv[1], "CG")) *typep = CG; + else if (!strcmp(argv[1], "bt") || !strcmp(argv[1], "BT")) { + *typep = BT; + if (argc != 5) { + /* printf("Usage: %s (%d) benchmark-name nprocs class\n", argv[0], argc); */ + /* exit(1); */ + *subtypep = NONE; + } else { + if (!strcmp(argv[4], "full") || !strcmp(argv[4], "FULL")) { + *subtypep = FULL; + } else if (!strcmp(argv[4], "simple") || !strcmp(argv[4], "SIMPLE")) { + *subtypep = SIMPLE; + } else if (!strcmp(argv[4], "epio") || !strcmp(argv[4], "EPIO")) { + *subtypep = EPIO; + } else if (!strcmp(argv[4], "fortran") || !strcmp(argv[4], "FORTRAN")) { + *subtypep = FORTRAN; + } else if (!strcmp(argv[4], "none") || !strcmp(argv[4], "NONE")) { + *subtypep = NONE; + } else { + printf("setparams: Error: unknown btio type %s\n", argv[4]); + exit(1); + } + } + } else { + printf("setparams: Error: unknown benchmark type %s\n", argv[1]); + exit(1); + } +} + +/* + * check_info(): Make sure command line data is ok for this benchmark + */ + +void check_info(int type, int nprocs, char class) +{ + int rootprocs, logprocs; + + /* check number of processors */ + if (nprocs <= 0) { + printf("setparams: Number of processors must be greater than zero\n"); + exit(1); + } + switch(type) { + + case SP: + case BT: + rootprocs = isqrt(nprocs); + if (rootprocs < 0) { + printf("setparams: Number of processors %d must be a square (1,4,9,...) for this benchmark", + nprocs); + exit(1); + } + if (class == 'S' && nprocs > 16) { + printf("setparams: BT and SP sample sizes cannot be run on more\n"); + printf(" than 16 processors because the cell size would be too small.\n"); + exit(1); + } + break; + + case CG: + case FT: + case MG: + case IS: + case LU: + logprocs = ilog2(nprocs); + if (logprocs < 0) { + printf("setparams: Number of processors must be a power of two (1,2,4,...) for this benchmark\n"); + exit(1); + } + + break; + + case EP: + case DT: + break; + + default: + /* never should have gotten this far with a bad name */ + printf("setparams: (Internal Error) Benchmark type %d unknown to this program\n", type); + exit(1); + } + + /* check class */ + if (class != 'S' && + class != 'W' && + class != 'A' && + class != 'B' && + class != 'C' && + class != 'D' && + class != 'E') { + printf("setparams: Unknown benchmark class %c\n", class); + printf("setparams: Allowed classes are \"S\", \"W\", and \"A\" through \"E\"\n"); + exit(1); + } + + if (class == 'E' && (type == IS || type == DT)) { + printf("setparams: Benchmark class %c not defined for IS or DT\n", class); + exit(1); + } + + if (class == 'D' && type == IS && nprocs < 4) { + printf("setparams: IS class D size cannot be run on less than 4 processors\n"); + exit(1); + } +} + + +/* + * read_info(): Read previous information from file. + * Not an error if file doesn't exist, because this + * may be the first time we're running. + * Assumes the first line of the file is in a special + * format that we understand (since we wrote it). + */ + +void read_info(int type, int *nprocsp, char *classp, int *subtypep) +{ + int nread = 0; + FILE *fp; + fp = fopen(FILENAME, "r"); + if (fp == NULL) { +#ifdef VERBOSE + printf("setparams: INFO: configuration file %s does not exist (yet)\n", FILENAME); +#endif + goto abort; + } + + /* first line of file contains info (fortran), first two lines (C) */ + + switch(type) { + case BT: { + char subtype_str[100]; + nread = fscanf(fp, BT_DESC_LINE, nprocsp, classp, subtype_str); + if (nread != 3) { + if (nread != 2) { + printf("setparams: Error parsing config file %s. Ignoring previous settings\n", FILENAME); + goto abort; + } + *subtypep = 0; + break; + } + if (!strcmp(subtype_str, "full") || !strcmp(subtype_str, "FULL")) { + *subtypep = FULL; + } else if (!strcmp(subtype_str, "simple") || + !strcmp(subtype_str, "SIMPLE")) { + *subtypep = SIMPLE; + } else if (!strcmp(subtype_str, "epio") || !strcmp(subtype_str, "EPIO")) { + *subtypep = EPIO; + } else if (!strcmp(subtype_str, "fortran") || + !strcmp(subtype_str, "FORTRAN")) { + *subtypep = FORTRAN; + } else { + *subtypep = -1; + } + break; + } + + case SP: + case FT: + case MG: + case LU: + //case EP: + case CG: + nread = fscanf(fp, DESC_LINE, nprocsp, classp); + if (nread != 2) { + printf("setparams: Error line %d parsing config file %s. Ignoring previous settings\n", __LINE__,FILENAME); + goto abort; + } + break; + case IS: + case EP: + case DT: + nread = fscanf(fp, DEF_CLASS_LINE, classp); + nread += fscanf(fp, DEF_NUM_PROCS_LINE, nprocsp); + if (nread != 2) { + printf("setparams: Error line %d parsing config file %s. Ignoring previous settings\n", __LINE__,FILENAME); + goto abort; + } + break; + default: + /* never should have gotten this far with a bad name */ + printf("setparams: (Internal Error) Benchmark type %d unknown to this program\n", type); + exit(1); + } + + fclose(fp); + + + return; + + abort: + *nprocsp = -1; + *classp = 'X'; + *subtypep = -1; + return; +} + + +/* + * write_info(): Write new information to config file. + * First line is in a special format so we can read + * it in again. Then comes a warning. The rest is all + * specific to a particular benchmark. + */ + +void write_info(int type, int nprocs, char class, int subtype) +{ + FILE *fp; + char *BT_TYPES[] = {"NONE", "FULL", "SIMPLE", "EPIO", "FORTRAN"}; + + fp = fopen(FILENAME, "w"); + if (fp == NULL) { + printf("setparams: Can't open file %s for writing\n", FILENAME); + exit(1); + } + + switch(type) { + case BT: + /* Write out the header */ + if (subtype == -1 || subtype == 0) { + fprintf(fp, DESC_LINE, nprocs, class); + } else { + fprintf(fp, BT_DESC_LINE, nprocs, class, BT_TYPES[subtype]); + } + /* Print out a warning so bozos don't mess with the file */ + fprintf(fp, "\ +c \n\ +c \n\ +c This file is generated automatically by the setparams utility.\n\ +c It sets the number of processors and the class of the NPB\n\ +c in this directory. Do not modify it by hand.\n\ +c \n"); + + break; + + case SP: + case FT: + case MG: + case LU: + //case EP: + case CG: + /* Write out the header */ + fprintf(fp, DESC_LINE, nprocs, class); + /* Print out a warning so bozos don't mess with the file */ + fprintf(fp, "\ +c \n\ +c \n\ +c This file is generated automatically by the setparams utility.\n\ +c It sets the number of processors and the class of the NPB\n\ +c in this directory. Do not modify it by hand.\n\ +c \n"); + + break; + case EP: + case IS: + case DT: + fprintf(fp, DEF_CLASS_LINE, class); + fprintf(fp, DEF_NUM_PROCS_LINE, nprocs); + fprintf(fp, "\ +/*\n\ + This file is generated automatically by the setparams utility.\n\ + It sets the number of processors and the class of the NPB\n\ + in this directory. Do not modify it by hand. */\n\ + \n"); + break; + default: + printf("setparams: (Internal error): Unknown benchmark type %d\n", + type); + exit(1); + } + + /* Now do benchmark-specific stuff */ + switch(type) { + case SP: + write_sp_info(fp, nprocs, class); + break; + case LU: + write_lu_info(fp, nprocs, class); + break; + case MG: + write_mg_info(fp, nprocs, class); + break; + case IS: + write_is_info(fp, nprocs, class); + break; + case DT: + write_dt_info(fp, nprocs, class); + break; + case FT: + write_ft_info(fp, nprocs, class); + break; + case EP: + //write_ep_info(fp, nprocs, class); + write_ep_info_C(fp, nprocs, class); + break; + case CG: + write_cg_info(fp, nprocs, class); + break; + case BT: + write_bt_info(fp, nprocs, class, subtype); + break; + default: + printf("setparams: (Internal error): Unknown benchmark type %d\n", type); + exit(1); + } + write_convertdouble_info(type, fp); + write_compiler_info(type, fp); + fclose(fp); + return; +} + + +/* + * write_sp_info(): Write SP specific info to config file + */ + +void write_sp_info(FILE *fp, int nprocs, char class) +{ + int maxcells, problem_size, niter; + char *dt; + maxcells = isqrt(nprocs); + if (class == 'S') { problem_size = 12; dt = "0.015d0"; niter = 100; } + else if (class == 'W') { problem_size = 36; dt = "0.0015d0"; niter = 400; } + else if (class == 'A') { problem_size = 64; dt = "0.0015d0"; niter = 400; } + else if (class == 'B') { problem_size = 102; dt = "0.001d0"; niter = 400; } + else if (class == 'C') { problem_size = 162; dt = "0.00067d0"; niter = 400; } + else if (class == 'D') { problem_size = 408; dt = "0.00030d0"; niter = 500; } + else if (class == 'E') { problem_size = 1020; dt = "0.0001d0"; niter = 500; } + else { + printf("setparams: Internal error: invalid class %c\n", class); + exit(1); + } + fprintf(fp, "%sinteger maxcells, problem_size, niter_default\n", FINDENT); + fprintf(fp, "%sparameter (maxcells=%d, problem_size=%d, niter_default=%d)\n", + FINDENT, maxcells, problem_size, niter); + fprintf(fp, "%sdouble precision dt_default\n", FINDENT); + fprintf(fp, "%sparameter (dt_default = %s)\n", FINDENT, dt); +} + +/* + * write_bt_info(): Write BT specific info to config file + */ + +void write_bt_info(FILE *fp, int nprocs, char class, int io) +{ + int maxcells, problem_size, niter, wr_interval; + char *dt; + maxcells = isqrt(nprocs); + if (class == 'S') { problem_size = 12; dt = "0.010d0"; niter = 60; } + else if (class == 'W') { problem_size = 24; dt = "0.0008d0"; niter = 200; } + else if (class == 'A') { problem_size = 64; dt = "0.0008d0"; niter = 200; } + else if (class == 'B') { problem_size = 102; dt = "0.0003d0"; niter = 200; } + else if (class == 'C') { problem_size = 162; dt = "0.0001d0"; niter = 200; } + else if (class == 'D') { problem_size = 408; dt = "0.00002d0"; niter = 250; } + else if (class == 'E') { problem_size = 1020; dt = "0.4d-5"; niter = 250; } + else { + printf("setparams: Internal error: invalid class %c\n", class); + exit(1); + } + wr_interval = 5; + fprintf(fp, "%sinteger maxcells, problem_size, niter_default\n", FINDENT); + fprintf(fp, "%sparameter (maxcells=%d, problem_size=%d, niter_default=%d)\n", + FINDENT, maxcells, problem_size, niter); + fprintf(fp, "%sdouble precision dt_default\n", FINDENT); + fprintf(fp, "%sparameter (dt_default = %s)\n", FINDENT, dt); + fprintf(fp, "%sinteger wr_default\n", FINDENT); + fprintf(fp, "%sparameter (wr_default = %d)\n", FINDENT, wr_interval); + fprintf(fp, "%sinteger iotype\n", FINDENT); + fprintf(fp, "%sparameter (iotype = %d)\n", FINDENT, io); + if (io) { + fprintf(fp, "%scharacter*(*) filenm\n", FINDENT); + switch (io) { + case FULL: + fprintf(fp, "%sparameter (filenm = 'btio.full.out')\n", FINDENT); + break; + case SIMPLE: + fprintf(fp, "%sparameter (filenm = 'btio.simple.out')\n", FINDENT); + break; + case EPIO: + fprintf(fp, "%sparameter (filenm = 'btio.epio.out')\n", FINDENT); + break; + case FORTRAN: + fprintf(fp, "%sparameter (filenm = 'btio.fortran.out')\n", FINDENT); + fprintf(fp, "%sinteger fortran_rec_sz\n", FINDENT); + fprintf(fp, "%sparameter (fortran_rec_sz = %d)\n", + FINDENT, fortran_rec_size); + break; + default: + break; + } + } +} + + + +/* + * write_lu_info(): Write SP specific info to config file + */ + +void write_lu_info(FILE *fp, int nprocs, char class) +{ + int isiz1, isiz2, itmax, inorm, problem_size; + int xdiv, ydiv; /* number of cells in x and y direction */ + char *dt_default; + + if (class == 'S') { problem_size = 12; dt_default = "0.5d0"; itmax = 50; } + else if (class == 'W') { problem_size = 33; dt_default = "1.5d-3"; itmax = 300; } + else if (class == 'A') { problem_size = 64; dt_default = "2.0d0"; itmax = 250; } + else if (class == 'B') { problem_size = 102; dt_default = "2.0d0"; itmax = 250; } + else if (class == 'C') { problem_size = 162; dt_default = "2.0d0"; itmax = 250; } + else if (class == 'D') { problem_size = 408; dt_default = "1.0d0"; itmax = 300; } + else if (class == 'E') { problem_size = 1020; dt_default = "0.5d0"; itmax = 300; } + else { + printf("setparams: Internal error: invalid class %c\n", class); + exit(1); + } + inorm = itmax; + xdiv = ydiv = ilog2(nprocs)/2; + if (xdiv+ydiv != ilog2(nprocs)) xdiv += 1; + xdiv = ipow2(xdiv); ydiv = ipow2(ydiv); + isiz1 = problem_size/xdiv; if (isiz1*xdiv < problem_size) isiz1++; + isiz2 = problem_size/ydiv; if (isiz2*ydiv < problem_size) isiz2++; + + + fprintf(fp, "\nc number of nodes for which this version is compiled\n"); + fprintf(fp, "%sinteger nnodes_compiled\n", FINDENT); + fprintf(fp, "%sparameter (nnodes_compiled = %d)\n", FINDENT, nprocs); + + fprintf(fp, "\nc full problem size\n"); + fprintf(fp, "%sinteger isiz01, isiz02, isiz03\n", FINDENT); + fprintf(fp, "%sparameter (isiz01=%d, isiz02=%d, isiz03=%d)\n", + FINDENT, problem_size, problem_size, problem_size); + + fprintf(fp, "\nc sub-domain array size\n"); + fprintf(fp, "%sinteger isiz1, isiz2, isiz3\n", FINDENT); + fprintf(fp, "%sparameter (isiz1=%d, isiz2=%d, isiz3=isiz03)\n", + FINDENT, isiz1, isiz2); + + fprintf(fp, "\nc number of iterations and how often to print the norm\n"); + fprintf(fp, "%sinteger itmax_default, inorm_default\n", FINDENT); + fprintf(fp, "%sparameter (itmax_default=%d, inorm_default=%d)\n", + FINDENT, itmax, inorm); + + fprintf(fp, "%sdouble precision dt_default\n", FINDENT); + fprintf(fp, "%sparameter (dt_default = %s)\n", FINDENT, dt_default); + +} + +/* + * write_mg_info(): Write MG specific info to config file + */ + +void write_mg_info(FILE *fp, int nprocs, char class) +{ + int problem_size, nit, log2_size, log2_nprocs, lt_default, lm; + int ndim1, ndim2, ndim3; + if (class == 'S') { problem_size = 32; nit = 4; } + else if (class == 'W') { problem_size = 128; nit = 4; } + else if (class == 'A') { problem_size = 256; nit = 4; } + else if (class == 'B') { problem_size = 256; nit = 20; } + else if (class == 'C') { problem_size = 512; nit = 20; } + else if (class == 'D') { problem_size = 1024; nit = 50; } + else if (class == 'E') { problem_size = 2048; nit = 50; } + else { + printf("setparams: Internal error: invalid class type %c\n", class); + exit(1); + } + log2_size = ilog2(problem_size); + log2_nprocs = ilog2(nprocs); + /* lt is log of largest total dimension */ + lt_default = log2_size; + /* log of log of maximum dimension on a node */ + lm = log2_size - log2_nprocs/3; + ndim1 = lm; + ndim3 = log2_size - (log2_nprocs+2)/3; + ndim2 = log2_size - (log2_nprocs+1)/3; + + fprintf(fp, "%sinteger nprocs_compiled\n", FINDENT); + fprintf(fp, "%sparameter (nprocs_compiled = %d)\n", FINDENT, nprocs); + fprintf(fp, "%sinteger nx_default, ny_default, nz_default\n", FINDENT); + fprintf(fp, "%sparameter (nx_default=%d, ny_default=%d, nz_default=%d)\n", + FINDENT, problem_size, problem_size, problem_size); + fprintf(fp, "%sinteger nit_default, lm, lt_default\n", FINDENT); + fprintf(fp, "%sparameter (nit_default=%d, lm = %d, lt_default=%d)\n", + FINDENT, nit, lm, lt_default); + fprintf(fp, "%sinteger debug_default\n", FINDENT); + fprintf(fp, "%sparameter (debug_default=%d)\n", FINDENT, 0); + fprintf(fp, "%sinteger ndim1, ndim2, ndim3\n", FINDENT); + fprintf(fp, "%sparameter (ndim1 = %d, ndim2 = %d, ndim3 = %d)\n", + FINDENT, ndim1, ndim2, ndim3); +} + + +/* + * write_dt_info(): Write DT specific info to config file + */ + +void write_dt_info(FILE *fp, int nprocs, char class) +{ + int num_samples,deviation,num_sources; + if (class == 'S') { num_samples=1728; deviation=128; num_sources=4; } + else if (class == 'W') { num_samples=1728*8; deviation=128*2; num_sources=4*2; } + else if (class == 'A') { num_samples=1728*64; deviation=128*4; num_sources=4*4; } + else if (class == 'B') { num_samples=1728*512; deviation=128*8; num_sources=4*8; } + else if (class == 'C') { num_samples=1728*4096; deviation=128*16; num_sources=4*16; } + else if (class == 'D') { num_samples=1728*4096*8; deviation=128*32; num_sources=4*32; } + else { + printf("setparams: Internal error: invalid class type %c\n", class); + exit(1); + } + fprintf(fp, "#define NUM_SAMPLES %d\n", num_samples); + fprintf(fp, "#define STD_DEVIATION %d\n", deviation); + fprintf(fp, "#define NUM_SOURCES %d\n", num_sources); +} + +/* + * write_is_info(): Write IS specific info to config file + */ + +void write_is_info(FILE *fp, int nprocs, char class) +{ + if( class != 'S' && + class != 'W' && + class != 'A' && + class != 'B' && + class != 'C' && + class != 'D' ) + { + printf("setparams: Internal error: invalid class type %c\n", class); + exit(1); + } +} + +/* + * write_cg_info(): Write CG specific info to config file + */ + +void write_cg_info(FILE *fp, int nprocs, char class) +{ + int na,nonzer,niter; + char *shift,*rcond="1.0d-1"; + char *shiftS="10.", + *shiftW="12.", + *shiftA="20.", + *shiftB="60.", + *shiftC="110.", + *shiftD="500.", + *shiftE="1.5d3"; + + int num_proc_cols, num_proc_rows; + + + if( class == 'S' ) + { na=1400; nonzer=7; niter=15; shift=shiftS; } + else if( class == 'W' ) + { na=7000; nonzer=8; niter=15; shift=shiftW; } + else if( class == 'A' ) + { na=14000; nonzer=11; niter=15; shift=shiftA; } + else if( class == 'B' ) + { na=75000; nonzer=13; niter=75; shift=shiftB; } + else if( class == 'C' ) + { na=150000; nonzer=15; niter=75; shift=shiftC; } + else if( class == 'D' ) + { na=1500000; nonzer=21; niter=100; shift=shiftD; } + else if( class == 'E' ) + { na=9000000; nonzer=26; niter=100; shift=shiftE; } + else + { + printf("setparams: Internal error: invalid class type %c\n", class); + exit(1); + } + fprintf( fp, "%sinteger na, nonzer, niter\n", FINDENT ); + fprintf( fp, "%sdouble precision shift, rcond\n", FINDENT ); + fprintf( fp, "%sparameter( na=%d,\n", FINDENT, na ); + fprintf( fp, "%s nonzer=%d,\n", CONTINUE, nonzer ); + fprintf( fp, "%s niter=%d,\n", CONTINUE, niter ); + fprintf( fp, "%s shift=%s,\n", CONTINUE, shift ); + fprintf( fp, "%s rcond=%s )\n", CONTINUE, rcond ); + + + num_proc_cols = num_proc_rows = ilog2(nprocs)/2; + if (num_proc_cols+num_proc_rows != ilog2(nprocs)) num_proc_cols += 1; + num_proc_cols = ipow2(num_proc_cols); num_proc_rows = ipow2(num_proc_rows); + + fprintf( fp, "\nc number of nodes for which this version is compiled\n" ); + fprintf( fp, "%sinteger nnodes_compiled\n", FINDENT ); + fprintf( fp, "%sparameter( nnodes_compiled = %d)\n", FINDENT, nprocs ); + fprintf( fp, "%sinteger num_proc_cols, num_proc_rows\n", FINDENT ); + fprintf( fp, "%sparameter( num_proc_cols=%d, num_proc_rows=%d )\n", + FINDENT, + num_proc_cols, + num_proc_rows ); +} + + +/* + * write_ft_info(): Write FT specific info to config file + */ + +void write_ft_info(FILE *fp, int nprocs, char class) +{ + /* easiest way (given the way the benchmark is written) + * is to specify log of number of grid points in each + * direction m1, m2, m3. nt is the number of iterations + */ + int nx, ny, nz, maxdim, niter; + if (class == 'S') { nx = 64; ny = 64; nz = 64; niter = 6;} + else if (class == 'W') { nx = 128; ny = 128; nz = 32; niter = 6;} + else if (class == 'A') { nx = 256; ny = 256; nz = 128; niter = 6;} + else if (class == 'B') { nx = 512; ny = 256; nz = 256; niter =20;} + else if (class == 'C') { nx = 512; ny = 512; nz = 512; niter =20;} + else if (class == 'D') { nx = 2048; ny = 1024; nz = 1024; niter =25;} + else if (class == 'E') { nx = 4096; ny = 2048; nz = 2048; niter =25;} + else { + printf("setparams: Internal error: invalid class type %c\n", class); + exit(1); + } + maxdim = nx; + if (ny > maxdim) maxdim = ny; + if (nz > maxdim) maxdim = nz; + fprintf(fp, "%sinteger nx, ny, nz, maxdim, niter_default, ntdivnp, np_min\n", FINDENT); + fprintf(fp, "%sparameter (nx=%d, ny=%d, nz=%d, maxdim=%d)\n", + FINDENT, nx, ny, nz, maxdim); + fprintf(fp, "%sparameter (niter_default=%d)\n", FINDENT, niter); + fprintf(fp, "%sparameter (np_min = %d)\n", FINDENT, nprocs); + fprintf(fp, "%sparameter (ntdivnp=((nx*ny)/np_min)*nz)\n", FINDENT); + fprintf(fp, "%sdouble precision ntotal_f\n", FINDENT); + fprintf(fp, "%sparameter (ntotal_f=1.d0*nx*ny*nz)\n", FINDENT); +} + +/* + * write_ep_info(): Write EP specific info to config file + */ + +void write_ep_info(FILE *fp, int nprocs, char class) +{ + /* easiest way (given the way the benchmark is written) + * is to specify log of number of grid points in each + * direction m1, m2, m3. nt is the number of iterations + */ + int m; + if (class == 'S') { m = 24; } + else if (class == 'W') { m = 25; } + else if (class == 'A') { m = 28; } + else if (class == 'B') { m = 30; } + else if (class == 'C') { m = 32; } + else if (class == 'D') { m = 36; } + else if (class == 'E') { m = 40; } + else { + printf("setparams: Internal error: invalid class type %c\n", class); + exit(1); + } + /* number of processors given by "npm" */ + + + fprintf(fp, "%scharacter class\n",FINDENT); + fprintf(fp, "%sparameter (class =\'%c\')\n", + FINDENT, class); + fprintf(fp, "%sinteger m, npm\n", FINDENT); + fprintf(fp, "%sparameter (m=%d, npm=%d)\n", + FINDENT, m, nprocs); +} +/* + * write_ep_info_C(): Write EP specific info to config file + */ + + +void write_ep_info_C(FILE *fp, int nprocs, char class) +{ + /* easiest way (given the way the benchmark is written) + * is to specify log of number of grid points in each + * direction m1, m2, m3. nt is the number of iterations + */ + int m; + if (class == 'S') { m = 24; } + else if (class == 'W') { m = 25; } + else if (class == 'A') { m = 28; } + else if (class == 'B') { m = 30; } + else if (class == 'C') { m = 32; } + else if (class == 'D') { m = 36; } + else if (class == 'E') { m = 40; } + else { + printf("setparams: Internal error: invalid class type %c\n", class); + exit(1); + } + /* number of processors given by "npm" */ + + + fprintf(fp, "%schar *_class=\"%c\";\n",FINDENT,class); + fprintf(fp, "%sint m=%d;\n", FINDENT,m); + fprintf(fp, "%sint npm=%d;\n", FINDENT,nprocs); +} +/* + * This is a gross hack to allow the benchmarks to + * print out how they were compiled. Various other ways + * of doing this have been tried and they all fail on + * some machine - due to a broken "make" program, or + * F77 limitations, of whatever. Hopefully this will + * always work because it uses very portable C. Unfortunately + * it relies on parsing the make.def file - YUK. + * If your machine doesn't have or , happy hacking! + * + */ + +#define VERBOSE +#define LL 400 +#include +#define DEFFILE "../config/make.def" +#define DEFAULT_MESSAGE "(none)" +FILE *deffile; +void write_compiler_info(int type, FILE *fp) +{ + char line[LL]; + char mpif77[LL], flink[LL], fmpi_lib[LL], fmpi_inc[LL], fflags[LL], flinkflags[LL]; + char compiletime[LL], randfile[LL]; + char mpicc[LL], cflags[LL], clink[LL], clinkflags[LL], + cmpi_lib[LL], cmpi_inc[LL]; + struct tm *tmp; + time_t t; + deffile = fopen(DEFFILE, "r"); + if (deffile == NULL) { + printf("\n\ +setparams: File %s doesn't exist. To build the NAS benchmarks\n\ + you need to create is according to the instructions\n\ + in the README in the main directory and comments in \n\ + the file config/make.def.template\n", DEFFILE); + exit(1); + } + strcpy(mpif77, DEFAULT_MESSAGE); + strcpy(flink, DEFAULT_MESSAGE); + strcpy(fmpi_lib, DEFAULT_MESSAGE); + strcpy(fmpi_inc, DEFAULT_MESSAGE); + strcpy(fflags, DEFAULT_MESSAGE); + strcpy(flinkflags, DEFAULT_MESSAGE); + strcpy(randfile, DEFAULT_MESSAGE); + strcpy(mpicc, DEFAULT_MESSAGE); + strcpy(cflags, DEFAULT_MESSAGE); + strcpy(clink, DEFAULT_MESSAGE); + strcpy(clinkflags, DEFAULT_MESSAGE); + strcpy(cmpi_lib, DEFAULT_MESSAGE); + strcpy(cmpi_inc, DEFAULT_MESSAGE); + + while (fgets(line, LL, deffile) != NULL) { + if (*line == '#') continue; + /* yes, this is inefficient. but it's simple! */ + check_line(line, "MPIF77", mpif77); + check_line(line, "FLINK", flink); + check_line(line, "FMPI_LIB", fmpi_lib); + check_line(line, "FMPI_INC", fmpi_inc); + check_line(line, "FFLAGS", fflags); + check_line(line, "FLINKFLAGS", flinkflags); + check_line(line, "RAND", randfile); + check_line(line, "MPICC", mpicc); + check_line(line, "CFLAGS", cflags); + check_line(line, "CLINK", clink); + check_line(line, "CLINKFLAGS", clinkflags); + check_line(line, "CMPI_LIB", cmpi_lib); + check_line(line, "CMPI_INC", cmpi_inc); + /* if the dummy library is used by including make.dummy, we set the + Fortran and C paths to libraries and headers accordingly */ + if(check_include_line(line, "../config/make.dummy")) { + strcpy(fmpi_lib, "-L../MPI_dummy -lmpi"); + strcpy(fmpi_inc, "-I../MPI_dummy"); + strcpy(cmpi_lib, "-L../MPI_dummy -lmpi"); + strcpy(cmpi_inc, "-I../MPI_dummy"); + } + } + + + (void) time(&t); + tmp = localtime(&t); + (void) strftime(compiletime, (size_t)LL, "%d %b %Y", tmp); + + + switch(type) { + case FT: + case SP: + case BT: + case MG: + case LU: + //case EP: + case CG: + put_string(fp, "compiletime", compiletime); + put_string(fp, "npbversion", VERSION); + put_string(fp, "cs1", mpif77); + put_string(fp, "cs2", flink); + put_string(fp, "cs3", fmpi_lib); + put_string(fp, "cs4", fmpi_inc); + put_string(fp, "cs5", fflags); + put_string(fp, "cs6", flinkflags); + put_string(fp, "cs7", randfile); + break; + case IS: + case EP: + case DT: + put_def_string(fp, "COMPILETIME", compiletime); + put_def_string(fp, "NPBVERSION", VERSION); + put_def_string(fp, "MPICC", mpicc); + put_def_string(fp, "CFLAGS", cflags); + put_def_string(fp, "CLINK", clink); + put_def_string(fp, "CLINKFLAGS", clinkflags); + put_def_string(fp, "CMPI_LIB", cmpi_lib); + put_def_string(fp, "CMPI_INC", cmpi_inc); + break; + default: + printf("setparams: (Internal error): Unknown benchmark type %d\n", + type); + exit(1); + } + +} + +void check_line(char *line, char *label, char *val) +{ + char *original_line; + int n; + original_line = line; + /* compare beginning of line and label */ + while (*label != '\0' && *line == *label) { + line++; label++; + } + /* if *label is not EOS, we must have had a mismatch */ + if (*label != '\0') return; + /* if *line is not a space, actual label is longer than test label */ + if (!isspace(*line) && *line != '=') return ; + /* skip over white space */ + while (isspace(*line)) line++; + /* next char should be '=' */ + if (*line != '=') return; + /* skip over white space */ + while (isspace(*++line)); + /* if EOS, nothing was specified */ + if (*line == '\0') return; + /* finally we've come to the value */ + strcpy(val, line); + /* chop off the newline at the end */ + n = strlen(val)-1; + if (n >= 0 && val[n] == '\n') + val[n--] = '\0'; + if (n >= 0 && val[n] == '\r') + val[n--] = '\0'; + /* treat continuation */ + while (val[n] == '\\' && fgets(original_line, LL, deffile)) { + line = original_line; + while (isspace(*line)) line++; + if (isspace(*original_line)) val[n++] = ' '; + while (*line && *line != '\n' && *line != '\r' && n < LL-1) + val[n++] = *line++; + val[n] = '\0'; + n--; + } +/* if (val[strlen(val) - 1] == '\\') { + printf("\n\ +setparams: Error in file make.def. Because of the way in which\n\ + command line arguments are incorporated into the\n\ + executable benchmark, you can't have any continued\n\ + lines in the file make.def, that is, lines ending\n\ + with the character \"\\\". Although it may be ugly, \n\ + you should be able to reformat without continuation\n\ + lines. The offending line is\n\ + %s\n", original_line); + exit(1); + } */ +} + +int check_include_line(char *line, char *filename) +{ + char *include_string = "include"; + /* compare beginning of line and "include" */ + while (*include_string != '\0' && *line == *include_string) { + line++; include_string++; + } + /* if *include_string is not EOS, we must have had a mismatch */ + if (*include_string != '\0') return(0); + /* if *line is not a space, first word is not "include" */ + if (!isspace(*line)) return(0); + /* skip over white space */ + while (isspace(*++line)); + /* if EOS, nothing was specified */ + if (*line == '\0') return(0); + /* next keyword should be name of include file in *filename */ + while (*filename != '\0' && *line == *filename) { + line++; filename++; + } + if (*filename != '\0' || + (*line != ' ' && *line != '\0' && *line !='\n')) return(0); + else return(1); +} + + +#define MAXL 46 +void put_string(FILE *fp, char *name, char *val) +{ + int len; + len = strlen(val); + if (len > MAXL) { + val[MAXL] = '\0'; + val[MAXL-1] = '.'; + val[MAXL-2] = '.'; + val[MAXL-3] = '.'; + len = MAXL; + } + fprintf(fp, "%scharacter*%d %s\n", FINDENT, len, name); + fprintf(fp, "%sparameter (%s=\'%s\')\n", FINDENT, name, val); +} + +/* NOTE: is the ... stuff necessary in C? */ +void put_def_string(FILE *fp, char *name, char *val) +{ + int len; + len = strlen(val); + if (len > MAXL) { + val[MAXL] = '\0'; + val[MAXL-1] = '.'; + val[MAXL-2] = '.'; + val[MAXL-3] = '.'; + len = MAXL; + } + fprintf(fp, "#define %s \"%s\"\n", name, val); +} + +void put_def_variable(FILE *fp, char *name, char *val) +{ + int len; + len = strlen(val); + if (len > MAXL) { + val[MAXL] = '\0'; + val[MAXL-1] = '.'; + val[MAXL-2] = '.'; + val[MAXL-3] = '.'; + len = MAXL; + } + fprintf(fp, "#define %s %s\n", name, val); +} + + + +#if 0 + +/* this version allows arbitrarily long lines but + * some compilers don't like that and they're rarely + * useful + */ + +#define LINELEN 65 +void put_string(FILE *fp, char *name, char *val) +{ + int len, nlines, pos, i; + char line[100]; + len = strlen(val); + nlines = len/LINELEN; + if (nlines*LINELEN < len) nlines++; + fprintf(fp, "%scharacter*%d %s\n", FINDENT, nlines*LINELEN, name); + fprintf(fp, "%sparameter (%s = \n", FINDENT, name); + for (i = 0; i < nlines; i++) { + pos = i*LINELEN; + if (i == 0) fprintf(fp, "%s\'", CONTINUE); + else fprintf(fp, "%s", CONTINUE); + /* number should be same as LINELEN */ + fprintf(fp, "%.65s", val+pos); + if (i == nlines-1) fprintf(fp, "\')\n"); + else fprintf(fp, "\n"); + } +} + +#endif + + +/* integer square root. Return error if argument isn't + * a perfect square or is less than or equal to zero + */ + +int isqrt(int i) +{ + int root, square; + if (i <= 0) return(-1); + square = 0; + for (root = 1; square <= i; root++) { + square = root*root; + if (square == i) return(root); + } + return(-1); +} + + +/* integer log base two. Return error is argument isn't + * a power of two or is less than or equal to zero + */ + +int ilog2(int i) +{ + int log2; + int exp2 = 1; + if (i <= 0) return(-1); + + for (log2 = 0; log2 < 20; log2++) { + if (exp2 == i) return(log2); + exp2 *= 2; + } + return(-1); +} + +int ipow2(int i) +{ + int pow2 = 1; + if (i < 0) return(-1); + if (i == 0) return(1); + while(i--) pow2 *= 2; + return(pow2); +} + + + +void write_convertdouble_info(int type, FILE *fp) +{ + switch(type) { + case SP: + case BT: + case LU: + case FT: + case MG: + //case EP: + case CG: + fprintf(fp, "%slogical convertdouble\n", FINDENT); +#ifdef CONVERTDOUBLE + fprintf(fp, "%sparameter (convertdouble = .true.)\n", FINDENT); +#else + fprintf(fp, "%sparameter (convertdouble = .false.)\n", FINDENT); +#endif + break; + } +} diff --git a/examples/smpi/NAS/sys/suite.awk b/examples/smpi/NAS/sys/suite.awk new file mode 100644 index 0000000000..2e5fc3137c --- /dev/null +++ b/examples/smpi/NAS/sys/suite.awk @@ -0,0 +1,20 @@ +BEGIN { SMAKE = "make" } { + if ($1 !~ /^#/ && NF > 2) { + printf "cd `echo %s|tr '[a-z]' '[A-Z]'`; %s clean;", $1, SMAKE; + printf "%s CLASS=%s NPROCS=%s", SMAKE, $2, $3; + if ( NF > 3 ) { + if ( $4 ~ /^vec/ || $4 ~ /^VEC/ ) { + printf " VERSION=%s", $4; + if ( NF > 4 ) { + printf " SUBTYPE=%s", $5; + } + } else { + printf " SUBTYPE=%s", $4; + if ( NF > 4 ) { + printf " VERSION=%s", $5; + } + } + } + printf "; cd ..\n"; + } +} -- 2.20.1