From 826897d361add5db3272a9810e70371a40ba1660 Mon Sep 17 00:00:00 2001
From: Augustin Degomme <augustin.degomme@imag.fr>
Date: Mon, 23 Jun 2014 17:39:40 +0200
Subject: [PATCH] add MPICH3 rma tests (15 out of 88 should be passing now)

---
 buildtools/Cmake/AddTests.cmake               |   3 +-
 buildtools/Cmake/DefinePackages.cmake         |   1 +
 buildtools/Cmake/MakeExe.cmake                |   1 +
 teshsuite/smpi/mpich3-test/rma/CMakeLists.txt | 400 ++++++++++++++
 teshsuite/smpi/mpich3-test/rma/accfence1.c    | 103 ++++
 teshsuite/smpi/mpich3-test/rma/accfence2.c    |  91 ++++
 teshsuite/smpi/mpich3-test/rma/accfence2_am.c |  97 ++++
 teshsuite/smpi/mpich3-test/rma/accpscw1.c     | 110 ++++
 teshsuite/smpi/mpich3-test/rma/adlb_mimic1.c  | 169 ++++++
 teshsuite/smpi/mpich3-test/rma/allocmem.c     |  49 ++
 teshsuite/smpi/mpich3-test/rma/attrorderwin.c | 129 +++++
 teshsuite/smpi/mpich3-test/rma/baseattrwin.c  |  80 +++
 .../smpi/mpich3-test/rma/compare_and_swap.c   | 108 ++++
 .../smpi/mpich3-test/rma/contention_put.c     | 105 ++++
 .../smpi/mpich3-test/rma/contention_putget.c  |  99 ++++
 teshsuite/smpi/mpich3-test/rma/contig_displ.c |  98 ++++
 teshsuite/smpi/mpich3-test/rma/epochtest.c    | 191 +++++++
 teshsuite/smpi/mpich3-test/rma/fetch_and_op.c | 311 +++++++++++
 teshsuite/smpi/mpich3-test/rma/fetchandadd.c  | 127 +++++
 .../smpi/mpich3-test/rma/fetchandadd_am.c     | 137 +++++
 .../smpi/mpich3-test/rma/fetchandadd_tree.c   | 176 +++++++
 .../mpich3-test/rma/fetchandadd_tree_am.c     | 188 +++++++
 teshsuite/smpi/mpich3-test/rma/fkeyvalwin.c   |  93 ++++
 teshsuite/smpi/mpich3-test/rma/flush.c        |  89 ++++
 .../smpi/mpich3-test/rma/get_acc_local.c      |  52 ++
 .../smpi/mpich3-test/rma/get_accumulate.c     | 413 +++++++++++++++
 teshsuite/smpi/mpich3-test/rma/getfence1.c    |  99 ++++
 teshsuite/smpi/mpich3-test/rma/getgroup.c     |  52 ++
 teshsuite/smpi/mpich3-test/rma/ircpi.c        |  71 +++
 teshsuite/smpi/mpich3-test/rma/linked_list.c  | 231 +++++++++
 .../rma/linked_list_bench_lock_all.c          | 263 ++++++++++
 .../rma/linked_list_bench_lock_excl.c         | 266 ++++++++++
 .../rma/linked_list_bench_lock_shr.c          | 263 ++++++++++
 .../smpi/mpich3-test/rma/linked_list_fop.c    | 242 +++++++++
 .../mpich3-test/rma/linked_list_lockall.c     | 231 +++++++++
 .../smpi/mpich3-test/rma/lockcontention.c     | 101 ++++
 .../smpi/mpich3-test/rma/lockcontention2.c    | 305 +++++++++++
 .../smpi/mpich3-test/rma/lockcontention3.c    | 487 ++++++++++++++++++
 teshsuite/smpi/mpich3-test/rma/locknull.c     |  66 +++
 teshsuite/smpi/mpich3-test/rma/lockopts.c     | 211 ++++++++
 teshsuite/smpi/mpich3-test/rma/manyrma2.c     | 308 +++++++++++
 teshsuite/smpi/mpich3-test/rma/mcs-mutex.c    | 216 ++++++++
 teshsuite/smpi/mpich3-test/rma/mcs-mutex.h    |  38 ++
 teshsuite/smpi/mpich3-test/rma/mixedsync.c    | 245 +++++++++
 teshsuite/smpi/mpich3-test/rma/mutex_bench.c  |  80 +++
 teshsuite/smpi/mpich3-test/rma/nullpscw.c     |  34 ++
 .../smpi/mpich3-test/rma/pscw_ordering.c      | 139 +++++
 teshsuite/smpi/mpich3-test/rma/put_base.c     | 148 ++++++
 teshsuite/smpi/mpich3-test/rma/put_bottom.c   | 138 +++++
 teshsuite/smpi/mpich3-test/rma/putfence1.c    | 109 ++++
 teshsuite/smpi/mpich3-test/rma/putfidx.c      | 125 +++++
 teshsuite/smpi/mpich3-test/rma/putpscw1.c     | 109 ++++
 teshsuite/smpi/mpich3-test/rma/req_example.c  |  91 ++++
 teshsuite/smpi/mpich3-test/rma/reqops.c       | 286 ++++++++++
 teshsuite/smpi/mpich3-test/rma/rmanull.c      | 231 +++++++++
 teshsuite/smpi/mpich3-test/rma/rmazero.c      | 220 ++++++++
 teshsuite/smpi/mpich3-test/rma/selfrma.c      | 113 ++++
 teshsuite/smpi/mpich3-test/rma/squelch.h      |  16 +
 .../mpich3-test/rma/strided_acc_indexed.c     | 143 +++++
 .../mpich3-test/rma/strided_acc_onelock.c     |  85 +++
 .../mpich3-test/rma/strided_acc_subarray.c    | 136 +++++
 .../mpich3-test/rma/strided_get_indexed.c     | 133 +++++
 .../mpich3-test/rma/strided_getacc_indexed.c  | 141 +++++
 .../rma/strided_getacc_indexed_shared.c       | 151 ++++++
 .../mpich3-test/rma/strided_putget_indexed.c  | 137 +++++
 .../rma/strided_putget_indexed_shared.c       | 147 ++++++
 teshsuite/smpi/mpich3-test/rma/test1.c        |  81 +++
 teshsuite/smpi/mpich3-test/rma/test1_am.c     | 100 ++++
 teshsuite/smpi/mpich3-test/rma/test1_dt.c     |  89 ++++
 teshsuite/smpi/mpich3-test/rma/test2.c        |  82 +++
 teshsuite/smpi/mpich3-test/rma/test2_am.c     |  99 ++++
 teshsuite/smpi/mpich3-test/rma/test3.c        | 100 ++++
 teshsuite/smpi/mpich3-test/rma/test3_am.c     | 100 ++++
 teshsuite/smpi/mpich3-test/rma/test4.c        |  81 +++
 teshsuite/smpi/mpich3-test/rma/test4_am.c     |  95 ++++
 teshsuite/smpi/mpich3-test/rma/test5.c        |  74 +++
 teshsuite/smpi/mpich3-test/rma/test5_am.c     |  92 ++++
 teshsuite/smpi/mpich3-test/rma/testlist       | 125 +++++
 teshsuite/smpi/mpich3-test/rma/transpose1.c   | 109 ++++
 teshsuite/smpi/mpich3-test/rma/transpose2.c   | 107 ++++
 teshsuite/smpi/mpich3-test/rma/transpose3.c   | 107 ++++
 teshsuite/smpi/mpich3-test/rma/transpose4.c   |  84 +++
 teshsuite/smpi/mpich3-test/rma/transpose5.c   | 111 ++++
 teshsuite/smpi/mpich3-test/rma/transpose6.c   |  76 +++
 teshsuite/smpi/mpich3-test/rma/transpose7.c   | 105 ++++
 .../smpi/mpich3-test/rma/win_dynamic_acc.c    |  65 +++
 teshsuite/smpi/mpich3-test/rma/win_flavors.c  | 122 +++++
 teshsuite/smpi/mpich3-test/rma/win_info.c     |  72 +++
 teshsuite/smpi/mpich3-test/rma/win_shared.c   |  88 ++++
 .../mpich3-test/rma/win_shared_noncontig.c    |  87 ++++
 .../rma/win_shared_noncontig_put.c            |  94 ++++
 teshsuite/smpi/mpich3-test/rma/wincall.c      |  65 +++
 .../smpi/mpich3-test/rma/window_creation.c    |  53 ++
 teshsuite/smpi/mpich3-test/rma/winname.c      |  47 ++
 teshsuite/smpi/mpich3-test/rma/wintest.c      |  83 +++
 teshsuite/smpi/mpich3-test/testlist           |   1 +
 96 files changed, 12620 insertions(+), 1 deletion(-)
 create mode 100644 teshsuite/smpi/mpich3-test/rma/CMakeLists.txt
 create mode 100644 teshsuite/smpi/mpich3-test/rma/accfence1.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/accfence2.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/accfence2_am.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/accpscw1.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/adlb_mimic1.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/allocmem.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/attrorderwin.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/baseattrwin.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/compare_and_swap.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/contention_put.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/contention_putget.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/contig_displ.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/epochtest.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/fetch_and_op.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/fetchandadd.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/fetchandadd_am.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/fetchandadd_tree.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/fetchandadd_tree_am.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/fkeyvalwin.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/flush.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/get_acc_local.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/get_accumulate.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/getfence1.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/getgroup.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/ircpi.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/linked_list.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/linked_list_bench_lock_all.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/linked_list_bench_lock_excl.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/linked_list_bench_lock_shr.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/linked_list_fop.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/linked_list_lockall.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/lockcontention.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/lockcontention2.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/lockcontention3.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/locknull.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/lockopts.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/manyrma2.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/mcs-mutex.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/mcs-mutex.h
 create mode 100644 teshsuite/smpi/mpich3-test/rma/mixedsync.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/mutex_bench.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/nullpscw.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/pscw_ordering.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/put_base.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/put_bottom.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/putfence1.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/putfidx.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/putpscw1.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/req_example.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/reqops.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/rmanull.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/rmazero.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/selfrma.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/squelch.h
 create mode 100644 teshsuite/smpi/mpich3-test/rma/strided_acc_indexed.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/strided_acc_onelock.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/strided_acc_subarray.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/strided_get_indexed.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/strided_getacc_indexed.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/strided_getacc_indexed_shared.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/strided_putget_indexed.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/strided_putget_indexed_shared.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/test1.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/test1_am.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/test1_dt.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/test2.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/test2_am.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/test3.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/test3_am.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/test4.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/test4_am.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/test5.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/test5_am.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/testlist
 create mode 100644 teshsuite/smpi/mpich3-test/rma/transpose1.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/transpose2.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/transpose3.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/transpose4.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/transpose5.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/transpose6.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/transpose7.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/win_dynamic_acc.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/win_flavors.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/win_info.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/win_shared.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/win_shared_noncontig.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/win_shared_noncontig_put.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/wincall.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/window_creation.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/winname.c
 create mode 100644 teshsuite/smpi/mpich3-test/rma/wintest.c

diff --git a/buildtools/Cmake/AddTests.cmake b/buildtools/Cmake/AddTests.cmake
index 88f4921547..e902b7eeaf 100644
--- a/buildtools/Cmake/AddTests.cmake
+++ b/buildtools/Cmake/AddTests.cmake
@@ -424,7 +424,8 @@ IF(NOT enable_memcheck)
         ADD_TEST(test-smpi-mpich3-group-raw      ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR}/teshsuite/smpi/mpich3-test/group perl ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/runtests -mpiexec=${CMAKE_BINARY_DIR}/smpi_script/bin/smpirun -srcdir=${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/group -tests=testlist -execarg=--cfg=contexts/factory:raw)
         ADD_TEST(test-smpi-mpich3-pt2pt-raw      ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR}/teshsuite/smpi/mpich3-test/pt2pt perl ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/runtests -mpiexec=${CMAKE_BINARY_DIR}/smpi_script/bin/smpirun -srcdir=${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/pt2pt -tests=testlist -execarg=--cfg=contexts/factory:raw)
         ADD_TEST(test-smpi-mpich3-topo-raw       ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR}/teshsuite/smpi/mpich3-test/topo perl ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/runtests -mpiexec=${CMAKE_BINARY_DIR}/smpi_script/bin/smpirun -srcdir=${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/topo -tests=testlist -execarg=--cfg=contexts/factory:raw)
-        SET_TESTS_PROPERTIES(test-smpi-mpich3-attr-raw test-smpi-mpich3-comm-raw test-smpi-mpich3-init-raw test-smpi-mpich3-datatype-raw test-smpi-mpich3-group-raw test-smpi-mpich3-pt2pt-raw test-smpi-mpich3-topo-raw PROPERTIES PASS_REGULAR_EXPRESSION "tests passed!")
+        ADD_TEST(test-smpi-mpich3-rma-raw       ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR}/teshsuite/smpi/mpich3-test/rma perl ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/runtests -mpiexec=${CMAKE_BINARY_DIR}/smpi_script/bin/smpirun -srcdir=${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/rma -tests=testlist -execarg=--cfg=contexts/factory:raw -execarg=--cfg=smpi/privatize_global_variables:yes)
+        SET_TESTS_PROPERTIES(test-smpi-mpich3-attr-raw test-smpi-mpich3-comm-raw test-smpi-mpich3-init-raw test-smpi-mpich3-datatype-raw test-smpi-mpich3-group-raw test-smpi-mpich3-pt2pt-raw test-smpi-mpich3-topo-raw test-smpi-mpich3-rma-raw PROPERTIES PASS_REGULAR_EXPRESSION "tests passed!")
       ENDIF()
       IF(SMPI_FORTRAN)
         ADD_TEST(test-smpi-mpich3-thread-f77     ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR}/teshsuite/smpi/mpich3-test/f77/ perl ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/runtests -mpiexec=${CMAKE_BINARY_DIR}/smpi_script/bin/smpirun -srcdir=${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/f77/ -tests=testlist -execarg=--cfg=contexts/stack_size:8000 -execarg=--cfg=smpi/privatize_global_variables:yes)
diff --git a/buildtools/Cmake/DefinePackages.cmake b/buildtools/Cmake/DefinePackages.cmake
index 662d2f8c4b..aeefb87cf5 100644
--- a/buildtools/Cmake/DefinePackages.cmake
+++ b/buildtools/Cmake/DefinePackages.cmake
@@ -1074,6 +1074,7 @@ set(TESHSUITE_CMAKEFILES_TXT
   teshsuite/smpi/mpich3-test/init/CMakeLists.txt
   teshsuite/smpi/mpich3-test/pt2pt/CMakeLists.txt
   teshsuite/smpi/mpich3-test/topo/CMakeLists.txt
+  teshsuite/smpi/mpich3-test/rma/CMakeLists.txt
   teshsuite/surf/CMakeLists.txt
   teshsuite/surf/lmm_usage/CMakeLists.txt
   teshsuite/surf/maxmin_bench/CMakeLists.txt
diff --git a/buildtools/Cmake/MakeExe.cmake b/buildtools/Cmake/MakeExe.cmake
index 8c3a10751d..0deddd1389 100644
--- a/buildtools/Cmake/MakeExe.cmake
+++ b/buildtools/Cmake/MakeExe.cmake
@@ -134,6 +134,7 @@ add_subdirectory(${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/group)
 add_subdirectory(${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/topo)
 add_subdirectory(${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/init)
 add_subdirectory(${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/pt2pt)
+add_subdirectory(${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/rma)
 
 #add_subdirectory(${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/f77/attr)
 add_subdirectory(${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/f77/util)
diff --git a/teshsuite/smpi/mpich3-test/rma/CMakeLists.txt b/teshsuite/smpi/mpich3-test/rma/CMakeLists.txt
new file mode 100644
index 0000000000..3a062ced99
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/CMakeLists.txt
@@ -0,0 +1,400 @@
+cmake_minimum_required(VERSION 2.6)
+
+if(enable_smpi AND enable_smpi_MPICH3_testsuite)
+  if(WIN32)
+    set(CMAKE_C_FLAGS "-include ${CMAKE_HOME_DIRECTORY}/include/smpi/smpi_main.h")
+  else()
+    set(CMAKE_C_COMPILER "${CMAKE_BINARY_DIR}/smpi_script/bin/smpicc")
+    set(CMAKE_Fortran_COMPILER "${CMAKE_BINARY_DIR}/smpi_script/bin/smpiff")
+  endif()
+
+  set(EXECUTABLE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}")
+  set(MPICH_FLAGS "-DHAVE_STDLIB_H=1 -DHAVE_UNISTD_H=1 -DHAVE_STRING_H=1 -DUSE_STDARG=1 -DHAVE_LONG_DOUBLE=1 -DHAVE_PROTOTYPES=1 -DHAVE_SIGNAL_H=1 -DHAVE_SIGACTION=1 -DHAVE_SLEEP=1 -DHAVE_SYSCONF=1  -Wno-error=unused-variable")
+  include_directories("${CMAKE_HOME_DIRECTORY}/include/smpi")
+  include_directories("${CMAKE_CURRENT_BINARY_DIR}/../include/")
+
+
+#  add_executable(accfence1 accfence1.c ../util/mtest.c)
+  add_executable(accfence2_am accfence2_am.c ../util/mtest.c)
+  add_executable(accfence2 accfence2.c ../util/mtest.c)
+#  add_executable(accpscw1 accpscw1.c ../util/mtest.c)
+#  add_executable(adlb_mimic1 adlb_mimic1.c ../util/mtest.c)
+  add_executable(allocmem allocmem.c ../util/mtest.c)
+#  add_executable(attrorderwin attrorderwin.c ../util/mtest.c)
+#  add_executable(baseattrwin baseattrwin.c ../util/mtest.c)
+#  add_executable(compare_and_swap compare_and_swap.c ../util/mtest.c)
+#  add_executable(contention_put contention_put.c ../util/mtest.c)
+#  add_executable(contention_putget contention_putget.c ../util/mtest.c)
+#  add_executable(contig_displ contig_displ.c ../util/mtest.c)
+  add_executable(epochtest epochtest.c ../util/mtest.c)
+#  add_executable(fetchandadd_am fetchandadd_am.c ../util/mtest.c)
+#  add_executable(fetchandadd fetchandadd.c ../util/mtest.c)
+#  add_executable(fetchandadd_tree_am fetchandadd_tree_am.c ../util/mtest.c)
+#  add_executable(fetchandadd_tree fetchandadd_tree.c ../util/mtest.c)
+#  add_executable(fetch_and_op fetch_and_op.c ../util/mtest.c)
+#  add_executable(fkeyvalwin fkeyvalwin.c ../util/mtest.c)
+#  add_executable(flush flush.c ../util/mtest.c)
+#  add_executable(get_acc_local get_acc_local.c ../util/mtest.c)
+#  add_executable(get_accumulate get_accumulate.c ../util/mtest.c)
+  add_executable(getfence1 getfence1.c ../util/mtest.c)
+#  add_executable(getgroup getgroup.c ../util/mtest.c)
+#  add_executable(ircpi ircpi.c ../util/mtest.c)
+#  add_executable(linked_list_bench_lock_all linked_list_bench_lock_all.c ../util/mtest.c)
+#  add_executable(linked_list_bench_lock_excl linked_list_bench_lock_excl.c ../util/mtest.c)
+#  add_executable(linked_list_bench_lock_shr linked_list_bench_lock_shr.c ../util/mtest.c)
+#  add_executable(linked_list linked_list.c ../util/mtest.c)
+#  add_executable(linked_list_fop linked_list_fop.c ../util/mtest.c)
+#  add_executable(linked_list_lockall linked_list_lockall.c ../util/mtest.c)
+#  add_executable(lockcontention2 lockcontention2.c ../util/mtest.c)
+#  add_executable(lockcontention3 lockcontention3.c ../util/mtest.c)
+#  add_executable(lockcontention lockcontention.c ../util/mtest.c)
+#  add_executable(locknull locknull.c ../util/mtest.c)
+#  add_executable(lockopts lockopts.c ../util/mtest.c)
+#  add_executable(manyrma2 manyrma2.c ../util/mtest.c)
+#  add_executable(mcs-mutex mcs-mutex.c ../util/mtest.c)
+#  add_executable(mixedsync mixedsync.c ../util/mtest.c)
+#  add_executable(mutex_bench mutex_bench.c ../util/mtest.c)
+#  add_executable(nullpscw nullpscw.c ../util/mtest.c)
+#  add_executable(pscw_ordering pscw_ordering.c ../util/mtest.c)
+#  add_executable(put_base put_base.c ../util/mtest.c)
+#  add_executable(put_bottom put_bottom.c ../util/mtest.c)
+  add_executable(putfence1 putfence1.c ../util/mtest.c)
+  add_executable(putfidx putfidx.c ../util/mtest.c)
+#  add_executable(putpscw1 putpscw1.c ../util/mtest.c)
+#  add_executable(req_example req_example.c ../util/mtest.c)
+#  add_executable(reqops reqops.c ../util/mtest.c)
+#  add_executable(rmanull rmanull.c ../util/mtest.c)
+#  add_executable(rmazero rmazero.c ../util/mtest.c)
+#  add_executable(selfrma selfrma.c ../util/mtest.c)
+#  add_executable(strided_acc_indexed strided_acc_indexed.c ../util/mtest.c)
+#  add_executable(strided_acc_onelock strided_acc_onelock.c ../util/mtest.c)
+#  add_executable(strided_acc_subarray strided_acc_subarray.c ../util/mtest.c)
+#  add_executable(strided_getacc_indexed strided_getacc_indexed.c ../util/mtest.c)
+#  add_executable(strided_getacc_indexed_shared strided_getacc_indexed_shared.c ../util/mtest.c)
+#  add_executable(strided_get_indexed strided_get_indexed.c ../util/mtest.c)
+#  add_executable(strided_putget_indexed strided_putget_indexed.c ../util/mtest.c)
+#  add_executable(strided_putget_indexed_shared strided_putget_indexed_shared.c ../util/mtest.c)
+  add_executable(test1_am test1_am.c ../util/mtest.c)
+  add_executable(test1 test1.c ../util/mtest.c)
+#  add_executable(test1_dt test1_dt.c ../util/mtest.c)
+#  add_executable(test2_am test2_am.c ../util/mtest.c)
+#  add_executable(test2 test2.c ../util/mtest.c)
+#  add_executable(test3_am test3_am.c ../util/mtest.c)
+#  add_executable(test3 test3.c ../util/mtest.c)
+#  add_executable(test4_am test4_am.c ../util/mtest.c)
+#  add_executable(test4 test4.c ../util/mtest.c)
+  add_executable(test5_am test5_am.c ../util/mtest.c)
+  add_executable(test5 test5.c ../util/mtest.c)
+  add_executable(transpose1 transpose1.c ../util/mtest.c)
+  add_executable(transpose2 transpose2.c ../util/mtest.c)
+#  add_executable(transpose3 transpose3.c ../util/mtest.c)
+#  add_executable(transpose4 transpose4.c ../util/mtest.c)
+#  add_executable(transpose5 transpose5.c ../util/mtest.c)
+#  add_executable(transpose6 transpose6.c ../util/mtest.c)
+  add_executable(transpose7 transpose7.c ../util/mtest.c)
+#  add_executable(wincall wincall.c ../util/mtest.c)
+  add_executable(window_creation window_creation.c ../util/mtest.c)
+#  add_executable(win_dynamic_acc win_dynamic_acc.c ../util/mtest.c)
+#  add_executable(win_flavors win_flavors.c ../util/mtest.c)
+#  add_executable(win_info win_info.c ../util/mtest.c)
+#  add_executable(winname winname.c ../util/mtest.c)
+#  add_executable(win_shared win_shared.c ../util/mtest.c)
+#  add_executable(win_shared_noncontig win_shared_noncontig.c ../util/mtest.c)
+#  add_executable(win_shared_noncontig_put win_shared_noncontig_put.c ../util/mtest.c)
+#  add_executable(wintest wintest.c ../util/mtest.c)
+
+
+
+#  target_link_libraries(accfence1  simgrid)
+  target_link_libraries(accfence2_am  simgrid)
+  target_link_libraries(accfence2  simgrid)
+#  target_link_libraries(accpscw1  simgrid)
+#  target_link_libraries(adlb_mimic1  simgrid)
+  target_link_libraries(allocmem  simgrid)
+#  target_link_libraries(attrorderwin  simgrid)
+#  target_link_libraries(baseattrwin  simgrid)
+#  target_link_libraries(compare_and_swap  simgrid)
+#  target_link_libraries(contention_put  simgrid)
+#  target_link_libraries(contention_putget  simgrid)
+#  target_link_libraries(contig_displ  simgrid)
+  target_link_libraries(epochtest  simgrid)
+#  target_link_libraries(fetchandadd_am  simgrid)
+#  target_link_libraries(fetchandadd  simgrid)
+#  target_link_libraries(fetchandadd_tree_am  simgrid)
+#  target_link_libraries(fetchandadd_tree  simgrid)
+#  target_link_libraries(fetch_and_op  simgrid)
+#  target_link_libraries(fkeyvalwin  simgrid)
+#  target_link_libraries(flush  simgrid)
+#  target_link_libraries(get_acc_local  simgrid)
+#  target_link_libraries(get_accumulate  simgrid)
+  target_link_libraries(getfence1  simgrid)
+#  target_link_libraries(getgroup  simgrid)
+#  target_link_libraries(ircpi  simgrid)
+#  target_link_libraries(linked_list_bench_lock_all  simgrid)
+#  target_link_libraries(linked_list_bench_lock_excl  simgrid)
+#  target_link_libraries(linked_list_bench_lock_shr  simgrid)
+#  target_link_libraries(linked_list  simgrid)
+#  target_link_libraries(linked_list_fop  simgrid)
+#  target_link_libraries(linked_list_lockall  simgrid)
+#  target_link_libraries(lockcontention2  simgrid)
+#  target_link_libraries(lockcontention3  simgrid)
+#  target_link_libraries(lockcontention  simgrid)
+#  target_link_libraries(locknull  simgrid)
+#  target_link_libraries(lockopts  simgrid)
+#  target_link_libraries(manyrma2  simgrid)
+#  target_link_libraries(mcs-mutex  simgrid)
+#  target_link_libraries(mixedsync  simgrid)
+#  target_link_libraries(mutex_bench  simgrid)
+#  target_link_libraries(nullpscw  simgrid)
+#  target_link_libraries(pscw_ordering  simgrid)
+#  target_link_libraries(put_base  simgrid)
+#  target_link_libraries(put_bottom  simgrid)
+  target_link_libraries(putfence1  simgrid)
+  target_link_libraries(putfidx  simgrid)
+#  target_link_libraries(putpscw1  simgrid)
+#  target_link_libraries(req_example  simgrid)
+#  target_link_libraries(reqops  simgrid)
+#  target_link_libraries(rmanull  simgrid)
+#  target_link_libraries(rmazero  simgrid)
+#  target_link_libraries(selfrma  simgrid)
+#  target_link_libraries(strided_acc_indexed  simgrid)
+#  target_link_libraries(strided_acc_onelock  simgrid)
+#  target_link_libraries(strided_acc_subarray  simgrid)
+#  target_link_libraries(strided_getacc_indexed  simgrid)
+#  target_link_libraries(strided_getacc_indexed_shared  simgrid)
+#  target_link_libraries(strided_get_indexed  simgrid)
+#  target_link_libraries(strided_putget_indexed  simgrid)
+#  target_link_libraries(strided_putget_indexed_shared  simgrid)
+  target_link_libraries(test1_am  simgrid)
+  target_link_libraries(test1  simgrid)
+#  target_link_libraries(test1_dt  simgrid)
+#  target_link_libraries(test2_am  simgrid)
+#  target_link_libraries(test2  simgrid)
+#  target_link_libraries(test3_am  simgrid)
+#  target_link_libraries(test3  simgrid)
+#  target_link_libraries(test4_am  simgrid)
+#  target_link_libraries(test4  simgrid)
+  target_link_libraries(test5_am  simgrid)
+  target_link_libraries(test5  simgrid)
+  target_link_libraries(transpose1  simgrid)
+  target_link_libraries(transpose2  simgrid)
+#  target_link_libraries(transpose3  simgrid)
+#  target_link_libraries(transpose4  simgrid)
+#  target_link_libraries(transpose5  simgrid)
+#  target_link_libraries(transpose6  simgrid)
+  target_link_libraries(transpose7  simgrid)
+#  target_link_libraries(wincall  simgrid)
+  target_link_libraries(window_creation  simgrid)
+#  target_link_libraries(win_dynamic_acc  simgrid)
+#  target_link_libraries(win_flavors  simgrid)
+#  target_link_libraries(win_info  simgrid)
+#  target_link_libraries(winname  simgrid)
+#  target_link_libraries(win_shared  simgrid)
+#  target_link_libraries(win_shared_noncontig  simgrid)
+#  target_link_libraries(win_shared_noncontig_put  simgrid)
+#  target_link_libraries(wintest  simgrid)
+
+
+
+# set_target_properties(accfence1 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+ set_target_properties(accfence2_am PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+ set_target_properties(accfence2 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(accpscw1 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(adlb_mimic1 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+ set_target_properties(allocmem PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(attrorderwin PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(baseattrwin PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(compare_and_swap PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(contention_put PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(contention_putget PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(contig_displ PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+ set_target_properties(epochtest PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(fetchandadd_am PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(fetchandadd PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(fetchandadd_tree_am PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(fetchandadd_tree PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(fetch_and_op PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(fkeyvalwin PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(flush PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(get_acc_local PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(get_accumulate PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+ set_target_properties(getfence1 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(getgroup PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(ircpi PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(linked_list_bench_lock_all PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(linked_list_bench_lock_excl PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(linked_list_bench_lock_shr PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(linked_list PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(linked_list_fop PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(linked_list_lockall PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(lockcontention2 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(lockcontention3 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(lockcontention PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(locknull PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(lockopts PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(manyrma2 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(mcs-mutex PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(mixedsync PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(mutex_bench PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(nullpscw PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(pscw_ordering PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(put_base PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(put_bottom PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+ set_target_properties(putfence1 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+ set_target_properties(putfidx PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(putpscw1 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(req_example PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(reqops PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(rmanull PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(rmazero PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(selfrma PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(strided_acc_indexed PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(strided_acc_onelock PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(strided_acc_subarray PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(strided_getacc_indexed PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(strided_getacc_indexed_shared PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(strided_get_indexed PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(strided_putget_indexed PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(strided_putget_indexed_shared PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+ set_target_properties(test1_am PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+ set_target_properties(test1 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(test1_dt PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(test2_am PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(test2 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(test3_am PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(test3 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(test4_am PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(test4 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+ set_target_properties(test5_am PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+ set_target_properties(test5 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+ set_target_properties(transpose1 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+ set_target_properties(transpose2 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(transpose3 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(transpose4 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(transpose5 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(transpose6 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+ set_target_properties(transpose7 PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(wincall PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+ set_target_properties(window_creation PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(win_dynamic_acc PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(win_flavors PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(win_info PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(winname PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(win_shared PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(win_shared_noncontig PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(win_shared_noncontig_put PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+# set_target_properties(wintest PROPERTIES COMPILE_FLAGS "${MPICH_FLAGS}")
+
+endif()
+
+set(tesh_files
+  ${tesh_files}
+  PARENT_SCOPE
+  )
+set(xml_files
+  ${xml_files}
+  PARENT_SCOPE
+  )
+set(examples_src
+  ${examples_src}
+ ${CMAKE_CURRENT_SOURCE_DIR}/accfence1.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/accfence2_am.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/accfence2.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/accpscw1.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/adlb_mimic1.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/allocmem.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/attrorderwin.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/baseattrwin.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/compare_and_swap.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/contention_put.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/contention_putget.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/contig_displ.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/epochtest.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/fetchandadd_am.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/fetchandadd.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/fetchandadd_tree_am.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/fetchandadd_tree.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/fetch_and_op.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/fkeyvalwin.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/flush.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/get_acc_local.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/get_accumulate.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/getfence1.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/getgroup.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/ircpi.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/linked_list_bench_lock_all.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/linked_list_bench_lock_excl.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/linked_list_bench_lock_shr.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/linked_list.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/linked_list_fop.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/linked_list_lockall.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/lockcontention2.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/lockcontention3.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/lockcontention.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/locknull.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/lockopts.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/manyrma2.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/mcs-mutex.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/mixedsync.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/mutex_bench.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/nullpscw.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/pscw_ordering.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/put_base.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/put_bottom.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/putfence1.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/putfidx.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/putpscw1.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/req_example.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/reqops.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/rmanull.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/rmazero.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/selfrma.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/strided_acc_indexed.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/strided_acc_onelock.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/strided_acc_subarray.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/strided_getacc_indexed.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/strided_getacc_indexed_shared.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/strided_get_indexed.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/strided_putget_indexed.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/strided_putget_indexed_shared.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/test1_am.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/test1.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/test1_dt.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/test2_am.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/test2.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/test3_am.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/test3.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/test4_am.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/test4.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/test5_am.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/test5.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/transpose1.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/transpose2.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/transpose3.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/transpose4.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/transpose5.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/transpose6.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/transpose7.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/wincall.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/window_creation.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/win_dynamic_acc.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/win_flavors.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/win_info.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/winname.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/win_shared.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/win_shared_noncontig.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/win_shared_noncontig_put.c 
+ ${CMAKE_CURRENT_SOURCE_DIR}/wintest.c 
+  PARENT_SCOPE
+  )
+set(bin_files
+  ${bin_files}
+  PARENT_SCOPE
+  )
+set(txt_files
+  ${txt_files}
+  ${CMAKE_CURRENT_SOURCE_DIR}/testlist
+  ${CMAKE_CURRENT_SOURCE_DIR}/mcs-mutex.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/squelch.h
+  PARENT_SCOPE
+  )
diff --git a/teshsuite/smpi/mpich3-test/rma/accfence1.c b/teshsuite/smpi/mpich3-test/rma/accfence1.c
new file mode 100644
index 0000000000..91d9f439cb
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/accfence1.c
@@ -0,0 +1,103 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include "mpitest.h"
+
+/*
+static char MTEST_Descrip[] = "Accumulate/Replace with Fence";
+*/
+
+int main( int argc, char *argv[] )
+{
+    int errs = 0, err;
+    int rank, size, source, dest;
+    int minsize = 2, count; 
+    MPI_Comm      comm;
+    MPI_Win       win;
+    MPI_Aint      extent;
+    MTestDatatype sendtype, recvtype;
+
+    MTest_Init( &argc, &argv );
+
+    /* The following illustrates the use of the routines to 
+       run through a selection of communicators and datatypes.
+       Use subsets of these for tests that do not involve combinations 
+       of communicators, datatypes, and counts of datatypes */
+    while (MTestGetIntracommGeneral( &comm, minsize, 1 )) {
+	if (comm == MPI_COMM_NULL) continue;
+	/* Determine the sender and receiver */
+	MPI_Comm_rank( comm, &rank );
+	MPI_Comm_size( comm, &size );
+	source = 0;
+	dest   = size - 1;
+	
+	for (count = 1; count < 65000; count = count * 2) {
+	    while (MTestGetDatatypes( &sendtype, &recvtype, count )) {
+		/* Make sure that everyone has a recv buffer */
+		recvtype.InitBuf( &recvtype );
+
+		MPI_Type_extent( recvtype.datatype, &extent );
+		MPI_Win_create( recvtype.buf, recvtype.count * extent, 
+				(int)extent, MPI_INFO_NULL, comm, &win );
+		MPI_Win_fence( 0, win );
+		if (rank == source) {
+		    sendtype.InitBuf( &sendtype );
+
+		    /* To improve reporting of problems about operations, we
+		       change the error handler to errors return */
+		    MPI_Win_set_errhandler( win, MPI_ERRORS_RETURN );
+		    
+		    /* MPI_REPLACE on accumulate is almost the same 
+		       as MPI_Put; the only difference is in the
+		       handling of overlapping accumulate operations,
+		       which are not tested here */
+		    err = MPI_Accumulate( sendtype.buf, sendtype.count, 
+					  sendtype.datatype, dest, 0, 
+					  recvtype.count, recvtype.datatype, 
+					  MPI_REPLACE, win );
+		    if (err) {
+			errs++;
+			if (errs < 10) {
+			    printf( "Accumulate types: send %s, recv %s\n",
+				    MTestGetDatatypeName( &sendtype ),
+				    MTestGetDatatypeName( &recvtype ) );
+			    MTestPrintError( err );
+			}
+		    }
+		    err = MPI_Win_fence( 0, win );
+		    if (err) {
+			errs++;
+			if (errs < 10) {
+			    MTestPrintError( err );
+			}
+		    }
+		}
+		else if (rank == dest) {
+		    MPI_Win_fence( 0, win );
+		    /* This should have the same effect, in terms of
+		       transfering data, as a send/recv pair */
+		    err = MTestCheckRecv( 0, &recvtype );
+		    if (err) {
+			errs += err;
+		    }
+		}
+		else {
+		    MPI_Win_fence( 0, win );
+		}
+		MPI_Win_free( &win );
+		MTestFreeDatatype( &sendtype );
+		MTestFreeDatatype( &recvtype );
+	    }
+	}
+        MTestFreeComm(&comm);
+    }
+
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/accfence2.c b/teshsuite/smpi/mpich3-test/rma/accfence2.c
new file mode 100644
index 0000000000..4bf0f93ce7
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/accfence2.c
@@ -0,0 +1,91 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpitest.h"
+
+#ifndef MAX_INT
+#define MAX_INT 0x7fffffff
+#endif
+
+/*
+static char MTEST_Descrip[] = "Test MPI_Accumulate with fence";
+*/
+
+int main( int argc, char *argv[] )
+{
+    int errs = 0;
+    int rank, size, source, dest;
+    int minsize = 2, count, i; 
+    MPI_Comm      comm;
+    MPI_Win       win;
+    MPI_Datatype  datatype;
+    int           *winbuf, *sbuf;
+
+    MTest_Init( &argc, &argv );
+
+    /* The following illustrates the use of the routines to 
+       run through a selection of communicators and datatypes.
+       Use subsets of these for tests that do not involve combinations 
+       of communicators, datatypes, and counts of datatypes */
+    while (MTestGetIntracommGeneral( &comm, minsize, 1 )) {
+	if (comm == MPI_COMM_NULL) continue;
+	/* Determine the sender and receiver */
+	MPI_Comm_rank( comm, &rank );
+	MPI_Comm_size( comm, &size );
+	source = 0;
+	dest   = size - 1;
+	
+	for (count = 1; count < 65000; count = count * 2) {
+	    datatype = MPI_INT;
+	    /* We compare with an integer value that can be as large as
+	       size * (count * count + (1/2)*(size-1))
+	       For large machines (size large), this can exceed the 
+	       maximum integer for some large values of count.  We check
+	       that in advance and break this loop if the above value 
+	       would exceed MAX_INT.  Specifically,
+
+	       size*count*count + (1/2)*size*(size-1) > MAX_INT
+	       count*count > (MAX_INT/size - (1/2)*(size-1))
+	    */
+	    if (count * count > (MAX_INT/size - (size-1)/2)) break;
+	    winbuf = (int *)malloc( count * sizeof(int) );
+	    sbuf   = (int *)malloc( count * sizeof(int) );
+
+	    for (i=0; i<count; i++) winbuf[i] = 0;
+	    for (i=0; i<count; i++) sbuf[i] = rank + i * count;
+	    MPI_Win_create( winbuf, count * sizeof(int), sizeof(int),
+			    MPI_INFO_NULL, comm, &win );
+	    MPI_Win_fence( 0, win );
+	    MPI_Accumulate( sbuf, count, MPI_INT, source, 0, count, MPI_INT,
+				MPI_SUM, win );
+	    MPI_Win_fence( 0, win );
+	    if (rank == source) {
+		/* Check the results */
+		for (i=0; i<count; i++) {
+		    int result = i * count * size + (size*(size-1))/2;
+		    if (winbuf[i] != result) {
+			if (errs < 10) {
+			    fprintf( stderr, "Winbuf[%d] = %d, expected %d (count = %d, size = %d)\n",
+				     i, winbuf[i], result, count, size );
+			}
+			errs++;
+		    }
+		}
+	    }
+	    free( winbuf );
+	    free( sbuf );
+	    MPI_Win_free( &win );
+	}
+        MTestFreeComm(&comm);
+    }
+
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/accfence2_am.c b/teshsuite/smpi/mpich3-test/rma/accfence2_am.c
new file mode 100644
index 0000000000..11d2fb770a
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/accfence2_am.c
@@ -0,0 +1,97 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpitest.h"
+
+#ifndef MAX_INT
+#define MAX_INT 0x7fffffff
+#endif
+
+/*
+static char MTEST_Descrip[] = "Test MPI_Accumulate with fence";
+*/
+
+/* same as accfence2.c, but uses alloc_mem */
+
+int main( int argc, char *argv[] )
+{
+    int errs = 0;
+    int rank, size, source, dest;
+    int minsize = 2, count, i; 
+    MPI_Comm      comm;
+    MPI_Win       win;
+    MPI_Datatype  datatype;
+    int           *winbuf, *sbuf;
+
+    MTest_Init( &argc, &argv );
+
+    /* The following illustrates the use of the routines to 
+       run through a selection of communicators and datatypes.
+       Use subsets of these for tests that do not involve combinations 
+       of communicators, datatypes, and counts of datatypes */
+    while (MTestGetIntracommGeneral( &comm, minsize, 1 )) {
+	if (comm == MPI_COMM_NULL) continue;
+	/* Determine the sender and receiver */
+	MPI_Comm_rank( comm, &rank );
+	MPI_Comm_size( comm, &size );
+	source = 0;
+	dest   = size - 1;
+	
+	for (count = 32768; count < 65000; count = count * 2) {
+	    datatype = MPI_INT;
+
+	    /* We compare with an integer value that can be as large as
+	       size * (count * count + (1/2)*(size-1))
+	       For large machines (size large), this can exceed the 
+	       maximum integer for some large values of count.  We check
+	       that in advance and break this loop if the above value 
+	       would exceed MAX_INT.  Specifically,
+
+	       size*count*count + (1/2)*size*(size-1) > MAX_INT
+	       count*count > (MAX_INT/size - (1/2)*(size-1))
+	    */
+	    if (count * count > (MAX_INT/size - (size-1)/2)) break;
+
+	    MPI_Alloc_mem( count * sizeof(int), MPI_INFO_NULL, &winbuf );
+	    MPI_Alloc_mem( count * sizeof(int), MPI_INFO_NULL, &sbuf );
+
+	    for (i=0; i<count; i++) winbuf[i] = 0;
+	    for (i=0; i<count; i++) sbuf[i] = rank + i * count;
+	    MPI_Win_create( winbuf, count * sizeof(int), sizeof(int),
+			    MPI_INFO_NULL, comm, &win );
+	    MPI_Win_fence( 0, win );
+	    MPI_Accumulate( sbuf, count, MPI_INT, source, 0, count, MPI_INT,
+				MPI_SUM, win );
+	    MPI_Win_fence( 0, win );
+	    if (rank == source) {
+		/* Check the results */
+		for (i=0; i<count; i++) {
+		    int result = i * count * size + (size*(size-1))/2;
+		    if (winbuf[i] != result) {
+			if (errs < 10) {
+			    fprintf( stderr, "Winbuf[%d] = %d, expected %d (count = %d, size = %d)\n",
+				     i, winbuf[i], result, count, size );
+			}
+			errs++;
+		    }
+		}
+	    }
+
+	    MPI_Win_free( &win );
+
+            MPI_Free_mem(winbuf);
+            MPI_Free_mem(sbuf);
+	}
+        MTestFreeComm(&comm);
+    }
+
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/accpscw1.c b/teshsuite/smpi/mpich3-test/rma/accpscw1.c
new file mode 100644
index 0000000000..4b4976ee9e
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/accpscw1.c
@@ -0,0 +1,110 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include "mpitest.h"
+
+/*
+static char MTEST_Descrip[] = "Accumulate/replace with Post/Start/Complete/Wait";
+*/
+
+int main( int argc, char *argv[] )
+{
+    int errs = 0, err;
+    int rank, size, source, dest;
+    int minsize = 2, count; 
+    MPI_Comm      comm;
+    MPI_Win       win;
+    MPI_Aint      extent;
+    MPI_Group     wingroup, neighbors;
+    MTestDatatype sendtype, recvtype;
+
+    MTest_Init( &argc, &argv );
+
+    /* The following illustrates the use of the routines to 
+       run through a selection of communicators and datatypes.
+       Use subsets of these for tests that do not involve combinations 
+       of communicators, datatypes, and counts of datatypes */
+    while (MTestGetIntracommGeneral( &comm, minsize, 1 )) {
+	if (comm == MPI_COMM_NULL) continue;
+	/* Determine the sender and receiver */
+	MPI_Comm_rank( comm, &rank );
+	MPI_Comm_size( comm, &size );
+	source = 0;
+	dest   = size - 1;
+	
+	for (count = 1; count < 65000; count = count * 2) {
+	    while (MTestGetDatatypes( &sendtype, &recvtype, count )) {
+		/* Make sure that everyone has a recv buffer */
+		recvtype.InitBuf( &recvtype );
+
+		MPI_Type_extent( recvtype.datatype, &extent );
+		MPI_Win_create( recvtype.buf, recvtype.count * extent, 
+				(int)extent, MPI_INFO_NULL, comm, &win );
+		MPI_Win_get_group( win, &wingroup );
+		if (rank == source) {
+		    /* To improve reporting of problems about operations, we
+		       change the error handler to errors return */
+		    MPI_Win_set_errhandler( win, MPI_ERRORS_RETURN );
+		    sendtype.InitBuf( &sendtype );
+		    
+		    /* Neighbor is dest only */
+		    MPI_Group_incl( wingroup, 1, &dest, &neighbors );
+		    err = MPI_Win_start( neighbors, 0, win );
+		    if (err) {
+			errs++;
+			if (errs < 10) {
+			    MTestPrintError( err );
+			}
+		    }
+		    MPI_Group_free( &neighbors );
+		    err = MPI_Accumulate( sendtype.buf, sendtype.count, 
+					  sendtype.datatype, dest, 0, 
+					  recvtype.count, recvtype.datatype, 
+					  MPI_REPLACE, win );
+		    if (err) {
+			errs++;
+			MTestPrintError( err );
+		    }
+		    err = MPI_Win_complete( win );
+		    if (err) {
+			errs++;
+			if (errs < 10) {
+			    MTestPrintError( err );
+			}
+		    }
+		}
+		else if (rank == dest) {
+		    MPI_Group_incl( wingroup, 1, &source, &neighbors );
+		    MPI_Win_post( neighbors, 0, win );
+		    MPI_Group_free( &neighbors );
+		    MPI_Win_wait( win );
+		    /* This should have the same effect, in terms of
+		       transfering data, as a send/recv pair */
+		    err = MTestCheckRecv( 0, &recvtype );
+		    if (err) {
+			errs += errs;
+		    }
+		}
+		else {
+		    /* Nothing; the other processes need not call any 
+		       MPI routines */
+		    ;
+		}
+		MPI_Win_free( &win );
+		MTestFreeDatatype( &sendtype );
+		MTestFreeDatatype( &recvtype );
+		MPI_Group_free( &wingroup );
+	    }
+	}
+	MTestFreeComm( &comm );
+    }
+
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/adlb_mimic1.c b/teshsuite/smpi/mpich3-test/rma/adlb_mimic1.c
new file mode 100644
index 0000000000..22767fb7a7
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/adlb_mimic1.c
@@ -0,0 +1,169 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mpi.h"
+#include "mpitest.h"
+
+#define NUM_TIMES 500
+#define MAX_BUF_SIZE (400 * 1024 * 1024) /* 400 MB */
+#define PUT_SIZE (1024 * 1024) /* 1MB */
+
+/*
+static char MTEST_Descrip[] = "ADLB mimic test";
+*/
+
+/*
+ * ALGORITHM:
+ *    This test uses one server process (S), one target process (T)
+ *    and a bunch of origin processes (O). 'O' PUTs (LOCK/PUT/UNLOCK)
+ *    data to a distinct part of the window, and sends a message to
+ *    'S' once the UNLOCK has completed. The server forwards this
+ *    message to 'T'. 'T' GETS the data from this buffer after it
+ *    receives the message from 'S', to see if it contains the correct
+ *    contents.
+ *
+ *                          -------
+ *                          |  S  |
+ *                          -------
+ *                         ^       \
+ *                step 2  /         \ step 3
+ *                 SEND  /           \ SEND
+ *                      /             v
+ *                  -------  step 1   -------
+ *                  |     | --------> |     |
+ *                  |     |   PUT     |     |
+ *                  |  O  |           |  T  |
+ *                  |     |  step 4   |     |
+ *                  |     | <-------- |     |
+ *                  -------   SEND    -------
+ *
+ */
+
+int main(int argc, char **argv)
+{
+    int comm_size, comm_rank, i, by_rank, errs = 0;
+    int rc;
+    char *rma_win_addr, *local_buf;
+    char check;
+    MPI_Win win;
+    MPI_Status status;
+    int max_buf_size = 0, put_size = PUT_SIZE;
+
+    MTest_Init(&argc, &argv);
+    MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &comm_rank);
+
+    if (comm_size <= 2) {
+	fprintf( stderr, "This test requires at least 3 processes\n" );
+	MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+
+    max_buf_size = comm_size * put_size;
+    if (max_buf_size > MAX_BUF_SIZE) {
+	fprintf( stderr, "Too many processes in COMM_WORLD (max is %d)\n",
+		 MAX_BUF_SIZE / put_size );
+	MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+
+    /* If alloc mem returns an error (because too much memory is requested */
+    MPI_Errhandler_set( MPI_COMM_WORLD, MPI_ERRORS_RETURN );
+
+    rc = MPI_Alloc_mem(max_buf_size, MPI_INFO_NULL, (void *) &rma_win_addr);
+    if (rc) {
+	MTestPrintErrorMsg( "Unable to MPI_Alloc_mem space (not an error)", rc );
+	MPI_Abort( MPI_COMM_WORLD, 0 );
+    }
+
+    memset(rma_win_addr, 0, max_buf_size);
+    MPI_Win_create((void *) rma_win_addr, max_buf_size, 1, MPI_INFO_NULL, 
+		   MPI_COMM_WORLD, &win);
+
+    /* Note that it is not necessary to use MPI_Alloc_mem for the memory that
+       is not part of the MPI_Win.  */
+    rc = MPI_Alloc_mem(put_size, MPI_INFO_NULL, (void *) &local_buf);
+    if (rc) {
+	MTestPrintErrorMsg( "Unable to MPI_Alloc_mem space (not an error)", rc );
+	MPI_Abort( MPI_COMM_WORLD, 0 );
+    }
+
+    for (i = 0; i < put_size; i++)
+        local_buf[i] = 1;
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (comm_rank == 0) { /* target */
+        for (i = 0; i < (NUM_TIMES * (comm_size - 2)); i++) {
+            /* Wait for a message from the server to notify me that
+             * someone put some data in my window */
+            MPI_Recv(&by_rank, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, &status);
+
+            /* Got a message from the server that 'by_rank' put some
+             * data in my local window. Check the last byte to make
+             * sure we got it correctly. */
+            MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, win);
+            MPI_Get((void *) &check, 1, MPI_CHAR, 0, 
+		    ((by_rank + 1) * put_size) - 1, 1,
+                    MPI_CHAR, win);
+            MPI_Win_unlock(0, win);
+
+            /* If this is not the value I expect, count it as an error */
+            if (check != 1)
+                errs++;
+
+            /* Reset the buffer to zero for the next round */
+            memset((void *) (rma_win_addr + (by_rank * put_size)), 0, put_size);
+
+            /* Tell the origin that I am ready for the next round */
+            MPI_Send(NULL, 0, MPI_INT, by_rank, 0, MPI_COMM_WORLD);
+        }
+    }
+
+    else if (comm_rank == 1) { /* server */
+        for (i = 0; i < (NUM_TIMES * (comm_size - 2)); i++) {
+            /* Wait for a message from any of the origin processes
+             * informing me that it has put data to the target
+             * process */
+            MPI_Recv(NULL, 0, MPI_INT, MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, 
+		     &status);
+            by_rank = status.MPI_SOURCE;
+
+            /* Tell the target process that it should be seeing some
+             * data in its local buffer */
+            MPI_Send(&by_rank, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
+        }
+    }
+
+    else { /* origin */
+        for (i = 0; i < NUM_TIMES; i++) {
+            /* Put some data in the target window */
+            MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, win);
+            MPI_Put(local_buf, put_size, MPI_CHAR, 0, comm_rank * put_size, 
+		    put_size, MPI_CHAR, win);
+            MPI_Win_unlock(0, win);
+
+            /* Tell the server that the put has completed */
+            MPI_Send(NULL, 0, MPI_INT, 1, 0, MPI_COMM_WORLD);
+
+            /* Wait for a message from the target that it is ready for
+             * the next round */
+            MPI_Recv(NULL, 0, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
+        }
+    }
+
+    MPI_Win_free(&win);
+
+    MPI_Free_mem(rma_win_addr);
+    MPI_Free_mem(local_buf);
+
+    MTest_Finalize(errs);
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/allocmem.c b/teshsuite/smpi/mpich3-test/rma/allocmem.c
new file mode 100644
index 0000000000..1969941fae
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/allocmem.c
@@ -0,0 +1,49 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include "mpitest.h"
+
+/*
+static char MTEST_Descrip[] = "Simple test that alloc_mem and free_mem work together";
+*/
+
+int main( int argc, char *argv[] )
+{
+    int errs = 0, err;
+    int j, count;
+    char *ap;
+
+    MTest_Init( &argc, &argv );
+
+    MPI_Errhandler_set( MPI_COMM_WORLD, MPI_ERRORS_RETURN );
+    for (count=1; count < 128000; count *= 2) {
+	
+	err = MPI_Alloc_mem( count, MPI_INFO_NULL, &ap );
+	if (err) {
+	    int errclass;
+	    /* An error of  MPI_ERR_NO_MEM is allowed */
+	    MPI_Error_class( err, &errclass );
+	    if (errclass != MPI_ERR_NO_MEM) {
+		errs++;
+		MTestPrintError( err );
+	    }
+	    
+	}
+	else {
+	    /* Access all of this memory */
+	    for (j=0; j<count; j++) {
+		ap[j] = (char)(j & 0x7f);
+	    }
+	    MPI_Free_mem( ap );
+	}
+    }
+
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/attrorderwin.c b/teshsuite/smpi/mpich3-test/rma/attrorderwin.c
new file mode 100644
index 0000000000..972dda719b
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/attrorderwin.c
@@ -0,0 +1,129 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include "mpitest.h"
+
+/*
+static char MTestDescrip[] = "Test creating and inserting attributes in \
+different orders to ensure that the list management code handles all cases.";
+*/
+
+int checkAttrs( MPI_Win win, int n, int key[], int attrval[] );
+int checkNoAttrs( MPI_Win win, int n, int key[] );
+
+int main( int argc, char *argv[] )
+{
+    int errs = 0;
+    int key[3], attrval[3];
+    int i;
+    int buf[1];
+    MPI_Comm comm;
+    MPI_Win  win;
+
+    MTest_Init( &argc, &argv );
+
+    {
+	comm = MPI_COMM_WORLD;
+	MPI_Win_create( buf, sizeof(int), sizeof(int), MPI_INFO_NULL,
+			comm, &win );
+
+	/* Create key values */
+	for (i=0; i<3; i++) {
+	    MPI_Win_create_keyval( MPI_NULL_COPY_FN, MPI_NULL_DELETE_FN,
+			       &key[i], (void *)0 );
+	    attrval[i] = 1024 * i;
+	}
+	
+	/* Insert attribute in several orders.  Test after put with get,
+	 then delete, then confirm delete with get. */
+
+	MPI_Win_set_attr( win, key[2], &attrval[2] );
+	MPI_Win_set_attr( win, key[1], &attrval[1] );
+	MPI_Win_set_attr( win, key[0], &attrval[0] );
+
+	errs += checkAttrs( win, 3, key, attrval );
+	
+	MPI_Win_delete_attr( win, key[0] );
+	MPI_Win_delete_attr( win, key[1] );
+	MPI_Win_delete_attr( win, key[2] );
+
+	errs += checkNoAttrs( win, 3, key );
+	
+	MPI_Win_set_attr( win, key[1], &attrval[1] );
+	MPI_Win_set_attr( win, key[2], &attrval[2] );
+	MPI_Win_set_attr( win, key[0], &attrval[0] );
+
+	errs += checkAttrs( win, 3, key, attrval );
+	
+	MPI_Win_delete_attr( win, key[2] );
+	MPI_Win_delete_attr( win, key[1] );
+	MPI_Win_delete_attr( win, key[0] );
+
+	errs += checkNoAttrs( win, 3, key );
+
+	MPI_Win_set_attr( win, key[0], &attrval[0] );
+	MPI_Win_set_attr( win, key[1], &attrval[1] );
+	MPI_Win_set_attr( win, key[2], &attrval[2] );
+
+	errs += checkAttrs( win, 3, key, attrval );
+	
+	MPI_Win_delete_attr( win, key[1] );
+	MPI_Win_delete_attr( win, key[2] );
+	MPI_Win_delete_attr( win, key[0] );
+
+	errs += checkNoAttrs( win, 3, key );
+	
+	for (i=0; i<3; i++) {
+	    MPI_Win_free_keyval( &key[i] );
+	}
+	MPI_Win_free( &win );
+    }
+    
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return 0;
+  
+}
+
+int checkAttrs( MPI_Win win, int n, int key[], int attrval[] )
+{
+    int errs = 0;
+    int i, flag, *val_p;
+
+    for (i=0; i<n; i++) {
+	MPI_Win_get_attr( win, key[i], &val_p, &flag );
+	if (!flag) {
+	    errs++;
+	    fprintf( stderr, "Attribute for key %d not set\n", i );
+	}
+	else if (val_p != &attrval[i]) {
+	    errs++;
+	    fprintf( stderr, "Atribute value for key %d not correct\n",
+		     i );
+	}
+    }
+
+    return errs;
+}
+
+int checkNoAttrs( MPI_Win win, int n, int key[] )
+{
+    int errs = 0;
+    int i, flag, *val_p;
+
+    for (i=0; i<n; i++) {
+	MPI_Win_get_attr( win, key[i], &val_p, &flag );
+	if (flag) {
+	    errs++;
+	    fprintf( stderr, "Attribute for key %d set but should be deleted\n", i );
+	}
+    }
+
+    return errs;
+}
+	
diff --git a/teshsuite/smpi/mpich3-test/rma/baseattrwin.c b/teshsuite/smpi/mpich3-test/rma/baseattrwin.c
new file mode 100644
index 0000000000..fc1ccc95c2
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/baseattrwin.c
@@ -0,0 +1,80 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include <stdio.h>
+#include "mpi.h"
+#include "mpitest.h"
+
+int main( int argc, char **argv)
+{
+    int    errs = 0;
+    void *v;
+    int  flag;
+    int  rank, size;
+    int base[1024];
+    MPI_Aint n;
+    int     disp;
+    MPI_Win win;
+
+    MTest_Init( &argc, &argv );
+    MPI_Comm_size( MPI_COMM_WORLD, &size );
+    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
+
+    /* Create a window; then extract the values */
+    n    = 1024;
+    disp = 4;
+    MPI_Win_create( base, n, disp, MPI_INFO_NULL, MPI_COMM_WORLD, &win );
+
+    MPI_Win_get_attr( win, MPI_WIN_BASE, &v, &flag );
+    if (!flag) {
+	errs++;
+	fprintf( stderr, "Could not get WIN_BASE\n" );
+    }
+    else {
+	/* MPI 2.1, section 11.2.2.  v must be a pointer to the start of the 
+	 window.  It is not a pointer to a pointer to the start of the window. 
+	*/
+	if ((int*)v != base) {
+	    errs++;
+	    fprintf( stderr, "Got incorrect value for WIN_BASE (%p, should be %p)", 
+		     v, base );
+	}
+    }
+
+    MPI_Win_get_attr( win, MPI_WIN_SIZE, &v, &flag );
+    if (!flag) {
+	errs++;
+	fprintf( stderr, "Could not get WIN_SIZE\n" );
+    }
+    else {
+	MPI_Aint vval = *(MPI_Aint*)v;
+	if (vval != n) {
+	    errs++;
+	    fprintf( stderr, "Got wrong value for WIN_SIZE (%ld, should be %ld)\n", 
+		     (long) vval, (long) n );
+	}
+    }
+
+    MPI_Win_get_attr( win, MPI_WIN_DISP_UNIT, &v, &flag );
+    if (!flag) {
+	errs++;
+	fprintf( stderr, "Could not get WIN_DISP_UNIT\n" );
+    }
+    else {
+	int vval = *(int*)v;
+	if (vval != disp) {
+	    errs++;
+	    fprintf( stderr, "Got wrong value for WIN_DISP_UNIT (%d, should be %d)\n",
+		     vval, disp );
+	}
+    }
+
+    MPI_Win_free(&win);
+    MTest_Finalize( errs );
+    MPI_Finalize( );
+    
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/compare_and_swap.c b/teshsuite/smpi/mpich3-test/rma/compare_and_swap.c
new file mode 100644
index 0000000000..96eace1a9c
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/compare_and_swap.c
@@ -0,0 +1,108 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2012 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <mpi.h>
+#include "mpitest.h"
+#include "squelch.h"
+
+#define ITER 100
+
+int main(int argc, char **argv) {
+    int       i, j, rank, nproc;
+    int       errors = 0, all_errors = 0;
+    int      *val_ptr;
+    MPI_Win   win;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    val_ptr = malloc(sizeof(int));
+
+    *val_ptr = 0;
+
+    MPI_Win_create(val_ptr, sizeof(int), sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD, &win);
+
+    /* Test self communication */
+
+    for (i = 0; i < ITER; i++) {
+        int next = i + 1, result = -1;
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+        MPI_Compare_and_swap(&next, &i, &result, MPI_INT, rank, 0, win);
+        MPI_Win_unlock(rank, win);
+        if (result != i) {
+            SQUELCH( printf("%d->%d -- Error: next=%d compare=%d result=%d val=%d\n", rank,
+                           rank, next, i, result, *val_ptr); );
+            errors++;
+        }
+    }
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    *val_ptr = 0;
+    MPI_Win_unlock(rank, win);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Test neighbor communication */
+
+    for (i = 0; i < ITER; i++) {
+        int next = i + 1, result = -1;
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, (rank+1)%nproc, 0, win);
+        MPI_Compare_and_swap(&next, &i, &result, MPI_INT, (rank+1)%nproc, 0, win);
+        MPI_Win_unlock((rank+1)%nproc, win);
+        if (result != i) {
+            SQUELCH( printf("%d->%d -- Error: next=%d compare=%d result=%d val=%d\n", rank,
+                           (rank+1)%nproc, next, i, result, *val_ptr); );
+            errors++;
+        }
+    }
+
+    fflush(NULL);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    *val_ptr = 0;
+    MPI_Win_unlock(rank, win);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+
+    /* Test contention */
+
+    if (rank != 0) {
+        for (i = 0; i < ITER; i++) {
+            int next = i + 1, result = -1;
+            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, win);
+            MPI_Compare_and_swap(&next, &i, &result, MPI_INT, 0, 0, win);
+            MPI_Win_unlock(0, win);
+        }
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (rank == 0 && nproc > 1) {
+        if (*val_ptr != ITER) {
+            SQUELCH( printf("%d - Error: expected=%d val=%d\n", rank, ITER, *val_ptr); );
+            errors++;
+        }
+    }
+
+    MPI_Win_free(&win);
+
+    MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+
+    if (rank == 0 && all_errors == 0)
+        printf(" No Errors\n");
+
+    free(val_ptr);
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/contention_put.c b/teshsuite/smpi/mpich3-test/rma/contention_put.c
new file mode 100644
index 0000000000..2b2be301ca
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/contention_put.c
@@ -0,0 +1,105 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/** Contended RMA put test -- James Dinan <dinan@mcs.anl.gov>
+  *
+  * Each process issues COUNT put operations to non-overlapping locations on
+  * every other processs.
+  */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "mpi.h"
+#include "mpitest.h"
+
+#define MAXELEMS      6400
+#define COUNT         1000
+
+static int me, nproc;
+static const int verbose = 0;
+
+int test_put(void);
+
+int test_put(void)
+{
+  MPI_Win  dst_win;
+  double  *dst_buf;
+  double   src_buf[MAXELEMS];
+  int      i, j;
+  int      errs = 0;
+
+  MPI_Alloc_mem(sizeof(double)*nproc*MAXELEMS, MPI_INFO_NULL, &dst_buf);
+  MPI_Win_create(dst_buf, sizeof(double)*nproc*MAXELEMS, 1, MPI_INFO_NULL, 
+		 MPI_COMM_WORLD, &dst_win);
+
+  for (i = 0; i < MAXELEMS; i++)
+    src_buf[i] = me + 1.0;
+
+  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, me, 0, dst_win);
+
+  for (i = 0; i < nproc*MAXELEMS; i++)
+    dst_buf[i] = 0.0;
+
+  MPI_Win_unlock(me, dst_win);
+
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  for(i = 0; i < nproc; i++) {
+    /* int target = (me + i) % nproc; */
+    int target = i;
+    for(j = 0; j < COUNT; j++) {
+      if (verbose) printf("%2d -> %2d [%2d]\n", me, target, j); 
+      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, target, 0, dst_win);
+      MPI_Put(&src_buf[j], sizeof(double), MPI_BYTE, target, 
+	      (me*MAXELEMS+j)*sizeof(double), sizeof(double), MPI_BYTE, dst_win);
+      MPI_Win_unlock(target, dst_win);
+    }
+  }
+
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  /* Check that the correct data was returned.  This assumes that the 
+     systems have the same data representations */
+  for (i=0; i<nproc; i++) {
+      for (j=0; j<COUNT; j++) {
+	  if (dst_buf[i*MAXELEMS+j] != 1.0 + i) {
+	      errs++;
+	      printf( "dst_buf[%d] = %e, expected %e\n",
+		      i*MAXELEMS+j, dst_buf[i*MAXELEMS+j], 1.0 + i );
+	  }
+      }
+  }
+
+  MPI_Win_free(&dst_win);
+  MPI_Free_mem(dst_buf);
+
+  return errs;
+}
+
+
+int main(int argc, char* argv[]) {
+    int errs = 0;
+
+    MTest_Init(&argc, &argv);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+    MPI_Comm_rank(MPI_COMM_WORLD, &me);
+    
+    assert(COUNT <= MAXELEMS);
+    
+    if (me == 0 && verbose) {
+	printf("Test starting on %d processes\n", nproc); 
+	fflush(stdout);
+    }
+    
+    errs = test_put();
+    
+    MPI_Barrier(MPI_COMM_WORLD);
+    
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return MTestReturnValue( errs );
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/contention_putget.c b/teshsuite/smpi/mpich3-test/rma/contention_putget.c
new file mode 100644
index 0000000000..6a539a0e4f
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/contention_putget.c
@@ -0,0 +1,99 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/** Contended RMA put/get test -- James Dinan <dinan@mcs.anl.gov>
+  *
+  * Each process issues COUNT put and get operations to non-overlapping
+  * locations on every other processs.
+  */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "mpi.h"
+
+#define MAXELEMS      6400
+#define COUNT         1000
+
+static int me, nproc;
+static const int verbose = 0;
+
+void test_put(void);
+void test_put(void)
+{
+  MPI_Win  dst_win;
+  double  *dst_buf;
+  double   src_buf[MAXELEMS];
+  int      i, j;
+
+  MPI_Alloc_mem(sizeof(double)*nproc*MAXELEMS, MPI_INFO_NULL, &dst_buf);
+  MPI_Win_create(dst_buf, sizeof(double)*nproc*MAXELEMS, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &dst_win);
+
+  for (i = 0; i < MAXELEMS; i++)
+    src_buf[i] = me + 1.0;
+
+  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, me, 0, dst_win);
+
+  for (i = 0; i < nproc*MAXELEMS; i++)
+    dst_buf[i] = 0.0;
+
+  MPI_Win_unlock(me, dst_win);
+
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  for(i = 0; i < nproc; i++) {
+    int target = i;
+
+    for(j = 0; j < COUNT; j++) {
+      if (verbose) printf("%2d -> %2d [%2d]\n", me, target, j); 
+      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, target, 0, dst_win);
+      MPI_Put(&src_buf[j], sizeof(double), MPI_BYTE, target, (me*MAXELEMS+j)*sizeof(double), sizeof(double), MPI_BYTE, dst_win);
+      MPI_Win_unlock(target, dst_win);
+    }
+
+    for(j = 0; j < COUNT; j++) {
+      if (verbose) printf("%2d <- %2d [%2d]\n", me, target, j); 
+      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, target, 0, dst_win);
+      MPI_Get(&src_buf[j], sizeof(double), MPI_BYTE, target, (me*MAXELEMS+j)*sizeof(double), sizeof(double), MPI_BYTE, dst_win);
+      MPI_Win_unlock(target, dst_win);
+    }
+  }
+
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  MPI_Win_free(&dst_win);
+  MPI_Free_mem(dst_buf);
+}
+
+
+int main(int argc, char* argv[]) {
+  MPI_Init(&argc, &argv);
+  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+  MPI_Comm_rank(MPI_COMM_WORLD, &me);
+
+  assert(COUNT <= MAXELEMS);
+
+  if (me == 0 && verbose) {
+    printf("Test starting on %d processes\n", nproc); 
+    fflush(stdout);
+  }
+
+  test_put();
+
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  MPI_Finalize();
+
+  if (me == 0 && verbose) {
+    printf("Test completed.\n");
+    fflush(stdout);
+  }
+
+  if (me == 0)
+    printf(" No Errors\n");
+
+  return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/contig_displ.c b/teshsuite/smpi/mpich3-test/rma/contig_displ.c
new file mode 100644
index 0000000000..9d37da6cc3
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/contig_displ.c
@@ -0,0 +1,98 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <mpi.h>
+#include "mpitest.h"
+
+/* Run with 1 process.
+
+   This program does an MPI_Get with an indexed datatype. The datatype
+   comprises a single integer at an initial displacement of 1 integer. 
+   That is, the first integer in the array is to be skipped.
+
+   This program found a bug in IBM's MPI in which MPI_Get ignored the
+   displacement and got the first integer instead of the second. 
+*/
+
+int main(int argc, char **argv)
+{
+    int rank, nprocs, mpi_err, *array;
+    int getval, disp, errs=0;
+    MPI_Win win;
+    MPI_Datatype type;
+    
+    MTest_Init(&argc,&argv); 
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    if (rank == 0) {
+        /* To improve reporting of problems about operations, we
+           change the error handler to errors return */
+        MPI_Comm_set_errhandler( MPI_COMM_SELF, MPI_ERRORS_RETURN );
+ 
+        /* create an indexed datatype that points to the second integer 
+           in an array (the first integer is skipped). */
+        disp  =  1;
+        mpi_err = MPI_Type_create_indexed_block(1, 1, &disp, MPI_INT, &type);
+        if (mpi_err != MPI_SUCCESS) goto err_return;
+        mpi_err = MPI_Type_commit(&type);
+        if (mpi_err != MPI_SUCCESS) goto err_return;
+ 
+        /* allocate window of size 2 integers*/
+        mpi_err = MPI_Alloc_mem(2*sizeof(int), MPI_INFO_NULL, &array);
+        if (mpi_err != MPI_SUCCESS) goto err_return;
+ 
+        /* create window object */
+        mpi_err = MPI_Win_create(array, 2*sizeof(int), sizeof(int), MPI_INFO_NULL, MPI_COMM_SELF, &win);
+        if (mpi_err != MPI_SUCCESS) goto err_return;
+  
+        /* initialize array */
+        array[0] = 100;
+        array[1] = 200;
+ 
+        getval = 0;
+        
+        /* To improve reporting of problems about operations, we
+           change the error handler to errors return */
+        MPI_Win_set_errhandler( win, MPI_ERRORS_RETURN );
+ 
+        mpi_err = MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, win);
+        if (mpi_err != MPI_SUCCESS) goto err_return;
+ 
+        /* get the current value of element array[1] */
+        mpi_err = MPI_Get(&getval, 1, MPI_INT, 0, 0, 1, type, win);
+        if (mpi_err != MPI_SUCCESS) goto err_return;
+ 
+        mpi_err = MPI_Win_unlock(0, win);
+        if (mpi_err != MPI_SUCCESS) goto err_return;
+ 
+        /* getval should contain the value of array[1] */
+        if (getval != array[1]) {
+            errs++;
+            printf("getval=%d, should be %d\n", getval, array[1]);
+        }
+ 
+        MPI_Free_mem(array);
+        MPI_Win_free(&win);
+        MPI_Type_free(&type);
+    }
+
+    MTest_Finalize(errs);
+    MPI_Finalize();
+    return 0;
+
+ err_return:
+    printf("MPI function error returned an error\n");
+    MTestPrintError( mpi_err );
+    errs++;
+    MTest_Finalize(errs);
+    MPI_Finalize();
+    return 1;
+}
+
+
diff --git a/teshsuite/smpi/mpich3-test/rma/epochtest.c b/teshsuite/smpi/mpich3-test/rma/epochtest.c
new file mode 100644
index 0000000000..7a3222a3fc
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/epochtest.c
@@ -0,0 +1,191 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2009 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/*
+ * This test looks at the behavior of MPI_Win_fence and epochs.  Each 
+ * MPI_Win_fence may both begin and end both the exposure and access epochs.
+ * Thus, it is not necessary to use MPI_Win_fence in pairs.
+ *
+ * The tests have this form:
+ *    Process A             Process B
+ *     fence                 fence
+ *      put,put
+ *     fence                 fence
+ *                            put,put
+ *     fence                 fence
+ *      put,put               put,put
+ *     fence                 fence
+ */
+
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpitest.h"
+
+/*
+static char MTEST_Descrip[] = "Put with Fences used to separate epochs";
+*/
+
+#define MAX_PERR 10
+
+int PrintRecvedError( const char *, MTestDatatype *, MTestDatatype * );
+
+int main( int argc, char **argv )
+{
+    int errs = 0, err;
+    int rank, size, source, dest;
+    int minsize = 2, count; 
+    MPI_Comm      comm;
+    MPI_Win       win;
+    MPI_Aint      extent;
+    MTestDatatype sendtype, recvtype;
+    int           onlyInt = 0;
+
+    MTest_Init( &argc, &argv );
+    /* Check for a simple choice of communicator and datatypes */
+    if (getenv( "MTEST_SIMPLE" )) onlyInt = 1;
+
+    while (MTestGetIntracommGeneral( &comm, minsize, 1 )) {
+	if (comm == MPI_COMM_NULL) continue;
+	/* Determine the sender and receiver */
+	MPI_Comm_rank( comm, &rank );
+	MPI_Comm_size( comm, &size );
+	source = 0;
+	dest   = size - 1;
+	
+	for (count = 1; count < 65000; count = count * 2) {
+	    while (MTestGetDatatypes( &sendtype, &recvtype, count )) {
+
+		MTestPrintfMsg( 1, 
+		       "Putting count = %d of sendtype %s receive type %s\n", 
+				count, MTestGetDatatypeName( &sendtype ),
+				MTestGetDatatypeName( &recvtype ) );
+
+		/* Make sure that everyone has a recv buffer */
+		recvtype.InitBuf( &recvtype );
+
+		MPI_Type_extent( recvtype.datatype, &extent );
+		MPI_Win_create( recvtype.buf, recvtype.count * extent, 
+				extent, MPI_INFO_NULL, comm, &win );
+		/* To improve reporting of problems about operations, we
+		   change the error handler to errors return */
+		MPI_Win_set_errhandler( win, MPI_ERRORS_RETURN );
+
+		/* At this point, we have all of the elements that we 
+		   need to begin the multiple fence and put tests */
+		/* Fence 1 */
+		err = MPI_Win_fence( MPI_MODE_NOPRECEDE, win ); 
+		if (err) { if (errs++ < MAX_PERR) MTestPrintError(err); }
+		/* Source puts */
+		if (rank == source) {
+		    sendtype.InitBuf( &sendtype );
+		    
+		    err = MPI_Put( sendtype.buf, sendtype.count, 
+				   sendtype.datatype, dest, 0, 
+				   recvtype.count, recvtype.datatype, win );
+		    if (err) { if (errs++ < MAX_PERR) MTestPrintError(err); }
+		}
+
+		/* Fence 2 */
+		err = MPI_Win_fence( 0, win );
+		if (err) { if (errs++ < MAX_PERR) MTestPrintError(err); }
+		/* dest checks data, then Dest puts */
+		if (rank == dest) {
+		    err = MTestCheckRecv( 0, &recvtype );
+		    if (err) { if (errs++ < MAX_PERR) { 
+			    PrintRecvedError( "fence 2", &sendtype, &recvtype );
+			}
+		    }
+		    sendtype.InitBuf( &sendtype );
+		    
+		    err = MPI_Put( sendtype.buf, sendtype.count, 
+				   sendtype.datatype, source, 0, 
+				   recvtype.count, recvtype.datatype, win );
+		    if (err) { if (errs++ < MAX_PERR) MTestPrintError(err); }
+		}
+
+		/* Fence 3 */
+		err = MPI_Win_fence( 0, win );
+		if (err) { if (errs++ < MAX_PERR) MTestPrintError(err); }
+		/* src checks data, then Src and dest puts*/
+		if (rank == source) {
+		    err = MTestCheckRecv( 0, &recvtype );
+		    if (err) { if (errs++ < MAX_PERR) { 
+			    PrintRecvedError( "fence 3", &sendtype, &recvtype );
+			}
+		    }
+		    sendtype.InitBuf( &sendtype );
+		    
+		    err = MPI_Put( sendtype.buf, sendtype.count, 
+				   sendtype.datatype, dest, 0, 
+				   recvtype.count, recvtype.datatype, win );
+		    if (err) { if (errs++ < MAX_PERR) MTestPrintError(err); }
+		}
+		if (rank == dest) {
+		    sendtype.InitBuf( &sendtype );
+		    
+		    err = MPI_Put( sendtype.buf, sendtype.count, 
+				   sendtype.datatype, source, 0, 
+				   recvtype.count, recvtype.datatype, win );
+		    if (err) { if (errs++ < MAX_PERR) MTestPrintError(err); }
+		}
+
+		/* Fence 4 */
+		err = MPI_Win_fence( MPI_MODE_NOSUCCEED, win );
+		if (err) { if (errs++ < MAX_PERR) MTestPrintError(err); }
+		/* src and dest checks data */
+		if (rank == source) {
+		    err = MTestCheckRecv( 0, &recvtype );
+		    if (err) { if (errs++ < MAX_PERR) { 
+			    PrintRecvedError( "src fence4", &sendtype, &recvtype );
+			}
+		    }
+		}
+		if (rank == dest) {
+		    err = MTestCheckRecv( 0, &recvtype );
+		    if (err) { if (errs++ < MAX_PERR) { 
+			    PrintRecvedError( "dest fence4", &sendtype, &recvtype );
+			}
+		    }
+		}
+
+		MPI_Win_free( &win );
+		MTestFreeDatatype( &sendtype );
+		MTestFreeDatatype( &recvtype );
+
+		/* Only do one datatype in the simple case */
+		if (onlyInt) break;
+	    }
+	    /* Only do one count in the simple case */
+	    if (onlyInt) break;
+	}
+        MTestFreeComm(&comm);
+	/* Only do one communicator in the simple case */
+	if (onlyInt) break;
+    }
+
+    MTest_Finalize( errs );
+
+    
+    
+    MPI_Finalize();
+    return 0;
+}
+
+
+int PrintRecvedError( const char *msg, 
+		      MTestDatatype *sendtypePtr, MTestDatatype *recvtypePtr )
+{
+    printf( "At step %s, Data in target buffer did not match for destination datatype %s (put with source datatype %s)\n", 
+	    msg, 
+	    MTestGetDatatypeName( recvtypePtr ),
+	    MTestGetDatatypeName( sendtypePtr ) );
+    /* Redo the test, with the errors printed */
+    recvtypePtr->printErrors = 1;
+    (void)MTestCheckRecv( 0, recvtypePtr );
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/fetch_and_op.c b/teshsuite/smpi/mpich3-test/rma/fetch_and_op.c
new file mode 100644
index 0000000000..93503ddfaa
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/fetch_and_op.c
@@ -0,0 +1,311 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2012 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <mpi.h>
+#include "mpitest.h"
+#include "squelch.h"
+
+#define ITER 100
+
+#if defined (FOP_TYPE_CHAR)
+#  define TYPE_C   char
+#  define TYPE_MPI MPI_CHAR
+#  define TYPE_FMT "%d"
+#elif defined (FOP_TYPE_SHORT)
+#  define TYPE_C   short
+#  define TYPE_MPI MPI_SHORT
+#  define TYPE_FMT "%d"
+#elif defined (FOP_TYPE_LONG)
+#  define TYPE_C   long
+#  define TYPE_MPI MPI_LONG
+#  define TYPE_FMT "%ld"
+#elif defined (FOP_TYPE_DOUBLE)
+#  define TYPE_C   double
+#  define TYPE_MPI MPI_DOUBLE
+#  define TYPE_FMT "%f"
+#elif defined (FOP_TYPE_LONG_DOUBLE)
+#  define TYPE_C   long double
+#  define TYPE_MPI MPI_LONG_DOUBLE
+#  define TYPE_FMT "%Lf"
+#else
+#  define TYPE_C   int
+#  define TYPE_MPI MPI_INT
+#  define TYPE_FMT "%d"
+#endif
+
+#define CMP(x, y) ((x - ((TYPE_C) (y))) > 1.0e-9)
+
+void reset_vars(TYPE_C *val_ptr, TYPE_C *res_ptr, MPI_Win win) {
+    int i, rank, nproc;
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    for (i = 0; i < nproc; i++) {
+        val_ptr[i] = 0;
+        res_ptr[i] = -1;
+    }
+    MPI_Win_unlock(rank, win);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+}
+
+int main(int argc, char **argv) {
+    int       i, rank, nproc, mpi_type_size;
+    int       errors = 0, all_errors = 0;
+    TYPE_C   *val_ptr, *res_ptr;
+    MPI_Win   win;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+ 
+    MPI_Type_size(TYPE_MPI, &mpi_type_size);
+    assert(mpi_type_size == sizeof(TYPE_C));
+
+    val_ptr = malloc(sizeof(TYPE_C)*nproc);
+    res_ptr = malloc(sizeof(TYPE_C)*nproc);
+
+    MPI_Win_create(val_ptr, sizeof(TYPE_C)*nproc, sizeof(TYPE_C), MPI_INFO_NULL, MPI_COMM_WORLD, &win);
+
+    /* Test self communication */
+
+    reset_vars(val_ptr, res_ptr, win);
+
+    for (i = 0; i < ITER; i++) {
+        TYPE_C one = 1, result = -1;
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+        MPI_Fetch_and_op(&one, &result, TYPE_MPI, rank, 0, MPI_SUM, win);
+        MPI_Win_unlock(rank, win);
+    }
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    if ( CMP(val_ptr[0], ITER) ) {
+        SQUELCH( printf("%d->%d -- SELF: expected "TYPE_FMT", got "TYPE_FMT"\n", rank, rank, (TYPE_C) ITER, val_ptr[0]); );
+        errors++;
+    }
+    MPI_Win_unlock(rank, win);
+
+    /* Test neighbor communication */
+
+    reset_vars(val_ptr, res_ptr, win);
+
+    for (i = 0; i < ITER; i++) {
+        TYPE_C one = 1, result = -1;
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, (rank+1)%nproc, 0, win);
+        MPI_Fetch_and_op(&one, &result, TYPE_MPI, (rank+1)%nproc, 0, MPI_SUM, win);
+        MPI_Win_unlock((rank+1)%nproc, win);
+        if ( CMP(result, i) ) {
+            SQUELCH( printf("%d->%d -- NEIGHBOR[%d]: expected result "TYPE_FMT", got "TYPE_FMT"\n", (rank+1)%nproc, rank, i, (TYPE_C) i, result); );
+            errors++;
+        }
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    if ( CMP(val_ptr[0], ITER) ) {
+        SQUELCH( printf("%d->%d -- NEIGHBOR: expected "TYPE_FMT", got "TYPE_FMT"\n", (rank+1)%nproc, rank, (TYPE_C) ITER, val_ptr[0]); );
+        errors++;
+    }
+    MPI_Win_unlock(rank, win);
+
+    /* Test contention */
+
+    reset_vars(val_ptr, res_ptr, win);
+
+    if (rank != 0) {
+        for (i = 0; i < ITER; i++) {
+            TYPE_C one = 1, result;
+            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, win);
+            MPI_Fetch_and_op(&one, &result, TYPE_MPI, 0, 0, MPI_SUM, win);
+            MPI_Win_unlock(0, win);
+        }
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    if (rank == 0 && nproc > 1) {
+        if ( CMP(val_ptr[0], ITER*(nproc-1)) ) {
+            SQUELCH( printf("*->%d - CONTENTION: expected="TYPE_FMT" val="TYPE_FMT"\n", rank, (TYPE_C) ITER*(nproc-1), val_ptr[0]); );
+            errors++;
+        }
+    }
+    MPI_Win_unlock(rank, win);
+
+    /* Test all-to-all communication (fence) */
+
+    reset_vars(val_ptr, res_ptr, win);
+
+    for (i = 0; i < ITER; i++) {
+        int j;
+
+        MPI_Win_fence(MPI_MODE_NOPRECEDE, win);
+        for (j = 0; j < nproc; j++) {
+            TYPE_C rank_cnv = (TYPE_C) rank;
+            MPI_Fetch_and_op(&rank_cnv, &res_ptr[j], TYPE_MPI, j, rank, MPI_SUM, win);
+            res_ptr[j] = i*rank;
+        }
+        MPI_Win_fence(MPI_MODE_NOSUCCEED, win);
+        MPI_Barrier(MPI_COMM_WORLD);
+
+        for (j = 0; j < nproc; j++) {
+            if ( CMP(res_ptr[j], i*rank) ) {
+                SQUELCH( printf("%d->%d -- ALL-TO-ALL (FENCE) [%d]: expected result "TYPE_FMT", got "TYPE_FMT"\n", rank, j, i, (TYPE_C) i*rank, res_ptr[j]); );
+                errors++;
+            }
+        }
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    for (i = 0; i < nproc; i++) {
+        if ( CMP(val_ptr[i], ITER*i) ) {
+            SQUELCH( printf("%d->%d -- ALL-TO-ALL (FENCE): expected "TYPE_FMT", got "TYPE_FMT"\n", i, rank, (TYPE_C) ITER*i, val_ptr[i]); );
+            errors++;
+        }
+    }
+    MPI_Win_unlock(rank, win);
+
+    /* Test all-to-all communication (lock-all) */
+
+    reset_vars(val_ptr, res_ptr, win);
+
+    for (i = 0; i < ITER; i++) {
+        int j;
+
+        MPI_Win_lock_all(0, win);
+        for (j = 0; j < nproc; j++) {
+            TYPE_C rank_cnv = (TYPE_C) rank;
+            MPI_Fetch_and_op(&rank_cnv, &res_ptr[j], TYPE_MPI, j, rank, MPI_SUM, win);
+            res_ptr[j] = i*rank;
+        }
+        MPI_Win_unlock_all(win);
+        MPI_Barrier(MPI_COMM_WORLD);
+
+        for (j = 0; j < nproc; j++) {
+            if ( CMP(res_ptr[j], i*rank) ) {
+                SQUELCH( printf("%d->%d -- ALL-TO-ALL (LOCK-ALL) [%d]: expected result "TYPE_FMT", got "TYPE_FMT"\n", rank, j, i, (TYPE_C) i*rank, res_ptr[j]); );
+                errors++;
+            }
+        }
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    for (i = 0; i < nproc; i++) {
+        if ( CMP(val_ptr[i], ITER*i) ) {
+            SQUELCH( printf("%d->%d -- ALL-TO-ALL (LOCK-ALL): expected "TYPE_FMT", got "TYPE_FMT"\n", i, rank, (TYPE_C) ITER*i, val_ptr[i]); );
+            errors++;
+        }
+    }
+    MPI_Win_unlock(rank, win);
+
+    /* Test all-to-all communication (lock-all+flush) */
+
+    reset_vars(val_ptr, res_ptr, win);
+
+    for (i = 0; i < ITER; i++) {
+        int j;
+
+        MPI_Win_lock_all(0, win);
+        for (j = 0; j < nproc; j++) {
+            TYPE_C rank_cnv = (TYPE_C) rank;
+            MPI_Fetch_and_op(&rank_cnv, &res_ptr[j], TYPE_MPI, j, rank, MPI_SUM, win);
+            res_ptr[j] = i*rank;
+            MPI_Win_flush(j, win);
+        }
+        MPI_Win_unlock_all(win);
+        MPI_Barrier(MPI_COMM_WORLD);
+
+        for (j = 0; j < nproc; j++) {
+            if ( CMP(res_ptr[j], i*rank) ) {
+                SQUELCH( printf("%d->%d -- ALL-TO-ALL (LOCK-ALL+FLUSH) [%d]: expected result "TYPE_FMT", got "TYPE_FMT"\n", rank, j, i, (TYPE_C) i*rank, res_ptr[j]); );
+                errors++;
+            }
+        }
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    for (i = 0; i < nproc; i++) {
+        if ( CMP(val_ptr[i], ITER*i) ) {
+            SQUELCH( printf("%d->%d -- ALL-TO-ALL (LOCK-ALL+FLUSH): expected "TYPE_FMT", got "TYPE_FMT"\n", i, rank, (TYPE_C) ITER*i, val_ptr[i]); );
+            errors++;
+        }
+    }
+    MPI_Win_unlock(rank, win);
+
+    /* Test NO_OP (neighbor communication) */
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    reset_vars(val_ptr, res_ptr, win);
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    for (i = 0; i < nproc; i++)
+        val_ptr[i] = (TYPE_C) rank;
+    MPI_Win_unlock(rank, win);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    for (i = 0; i < ITER; i++) {
+        int target = (rank+1) % nproc;
+
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, target, 0, win);
+        MPI_Fetch_and_op(NULL, res_ptr, TYPE_MPI, target, 0, MPI_NO_OP, win);
+        MPI_Win_unlock(target, win);
+
+        if (res_ptr[0] != (TYPE_C) target) {
+            SQUELCH( printf("%d->%d -- NOP[%d]: expected "TYPE_FMT", got "TYPE_FMT"\n",
+                            target, rank, i, (TYPE_C) target, res_ptr[0]); );
+            errors++;
+        }
+    }
+
+    /* Test NO_OP (self communication) */
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    reset_vars(val_ptr, res_ptr, win);
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    for (i = 0; i < nproc; i++)
+        val_ptr[i] = (TYPE_C) rank;
+    MPI_Win_unlock(rank, win);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    for (i = 0; i < ITER; i++) {
+        int target = rank;
+
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, target, 0, win);
+        MPI_Fetch_and_op(NULL, res_ptr, TYPE_MPI, target, 0, MPI_NO_OP, win);
+        MPI_Win_unlock(target, win);
+
+        if (res_ptr[0] != (TYPE_C) target) {
+            SQUELCH( printf("%d->%d -- NOP_SELF[%d]: expected "TYPE_FMT", got "TYPE_FMT"\n",
+                            target, rank, i, (TYPE_C) target, res_ptr[0]); );
+            errors++;
+        }
+    }
+
+    MPI_Win_free(&win);
+
+    MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+
+    if (rank == 0 && all_errors == 0)
+        printf(" No Errors\n");
+
+    free(val_ptr);
+    free(res_ptr);
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/fetchandadd.c b/teshsuite/smpi/mpich3-test/rma/fetchandadd.c
new file mode 100644
index 0000000000..dd12a4c64a
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/fetchandadd.c
@@ -0,0 +1,127 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "stdlib.h"
+#include "mpitest.h"
+
+/* Fetch and add example from Using MPI-2 (the non-scalable version,
+   Fig. 6.12). */ 
+
+
+#define NTIMES 20  /* no of times each process calls the counter
+                      routine */
+
+int localvalue=0;  /* contribution of this process to the counter. We
+                    define it as a global variable because attribute
+                    caching on the window is not enabled yet. */ 
+
+void Get_nextval(MPI_Win win, int *val_array, MPI_Datatype get_type,
+                 int rank, int nprocs, int *value);
+
+int compar(const void *a, const void *b);
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, nprocs, i, blens[2], disps[2], *counter_mem, *val_array,
+        *results, *counter_vals;
+    MPI_Datatype get_type;
+    MPI_Win win;
+    int errs = 0;
+ 
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (rank == 0) {
+        /* allocate counter memory and initialize to 0 */
+        counter_mem = (int *) calloc(nprocs, sizeof(int));
+        MPI_Win_create(counter_mem, nprocs*sizeof(int), sizeof(int),
+                       MPI_INFO_NULL, MPI_COMM_WORLD, &win);
+
+        MPI_Win_free(&win); 
+        free(counter_mem);
+
+        /* gather the results from other processes, sort them, and check 
+           whether they represent a counter being incremented by 1 */
+
+        results = (int *) malloc(NTIMES*nprocs*sizeof(int));
+        for (i=0; i<NTIMES*nprocs; i++)
+            results[i] = -1;
+
+        MPI_Gather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, results, NTIMES, MPI_INT, 
+                   0, MPI_COMM_WORLD);
+
+        qsort(results+NTIMES, NTIMES*(nprocs-1), sizeof(int), compar);
+
+        for (i=NTIMES+1; i<(NTIMES*nprocs); i++)
+            if (results[i] != results[i-1] + 1)
+                errs++;
+        
+        free(results);
+    }
+    else {
+        blens[0] = rank;
+        disps[0] = 0;
+        blens[1] = nprocs - rank - 1;
+        disps[1] = rank + 1;
+
+        MPI_Type_indexed(2, blens, disps, MPI_INT, &get_type);
+        MPI_Type_commit(&get_type);
+
+        val_array = (int *) malloc(nprocs * sizeof(int));
+
+        /* allocate array to store the values obtained from the 
+           fetch-and-add counter */
+        counter_vals = (int *) malloc(NTIMES * sizeof(int));
+
+        MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win); 
+
+        for (i=0; i<NTIMES; i++) {
+            Get_nextval(win, val_array, get_type, rank, nprocs, counter_vals+i);
+            /* printf("Rank %d, counter %d\n", rank, value); */
+        }
+
+        MPI_Win_free(&win);
+
+        free(val_array);
+        MPI_Type_free(&get_type);
+
+        /* gather the results to the root */
+        MPI_Gather(counter_vals, NTIMES, MPI_INT, NULL, 0, MPI_DATATYPE_NULL, 
+                   0, MPI_COMM_WORLD);
+        free(counter_vals);
+    }
+
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return 0; 
+} 
+
+
+void Get_nextval(MPI_Win win, int *val_array, MPI_Datatype get_type,
+                 int rank, int nprocs, int *value) 
+{
+    int one=1, i;
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, win);
+    MPI_Accumulate(&one, 1, MPI_INT, 0, rank, 1, MPI_INT, MPI_SUM, win);
+    MPI_Get(val_array, 1, get_type, 0, 0, 1, get_type, win); 
+    MPI_Win_unlock(0, win);
+
+    *value = 0;
+    val_array[rank] = localvalue;
+    for (i=0; i<nprocs; i++)
+        *value = *value + val_array[i];
+
+    localvalue++;
+}
+
+int compar(const void *a, const void *b)
+{
+    return (*((int *)a) - *((int *)b));
+}
+
diff --git a/teshsuite/smpi/mpich3-test/rma/fetchandadd_am.c b/teshsuite/smpi/mpich3-test/rma/fetchandadd_am.c
new file mode 100644
index 0000000000..0ecde99535
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/fetchandadd_am.c
@@ -0,0 +1,137 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "stdlib.h"
+#include "mpitest.h"
+
+/* Fetch and add example from Using MPI-2 (the non-scalable version,
+   Fig. 6.12). */ 
+
+/* same as fetchandadd.c but uses alloc_mem */
+
+#define NTIMES 20  /* no of times each process calls the counter
+                      routine */
+
+int localvalue=0;  /* contribution of this process to the counter. We
+                    define it as a global variable because attribute
+                    caching on the window is not enabled yet. */ 
+
+void Get_nextval(MPI_Win win, int *val_array, MPI_Datatype get_type,
+                 int rank, int nprocs, int *value);
+
+int compar(const void *a, const void *b);
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, nprocs, i, blens[2], disps[2], *counter_mem, *val_array,
+        *results, *counter_vals;
+    MPI_Datatype get_type;
+    MPI_Win win;
+    int errs = 0;
+ 
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (rank == 0) {
+        /* allocate counter memory and initialize to 0 */
+        /* counter_mem = (int *) calloc(nprocs, sizeof(int)); */
+
+        i = MPI_Alloc_mem(nprocs*sizeof(int), MPI_INFO_NULL, &counter_mem);
+        if (i) {
+            printf("Can't allocate memory in test program\n");
+            MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+
+        for (i=0; i<nprocs; i++) counter_mem[i] = 0;
+
+        MPI_Win_create(counter_mem, nprocs*sizeof(int), sizeof(int),
+                       MPI_INFO_NULL, MPI_COMM_WORLD, &win);
+
+        MPI_Win_free(&win); 
+        MPI_Free_mem(counter_mem);
+
+        /* gather the results from other processes, sort them, and check 
+           whether they represent a counter being incremented by 1 */
+
+        results = (int *) malloc(NTIMES*nprocs*sizeof(int));
+        for (i=0; i<NTIMES*nprocs; i++)
+            results[i] = -1;
+
+        MPI_Gather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, results, NTIMES, MPI_INT, 
+                   0, MPI_COMM_WORLD);
+
+        qsort(results+NTIMES, NTIMES*(nprocs-1), sizeof(int), compar);
+
+        for (i=NTIMES+1; i<(NTIMES*nprocs); i++)
+            if (results[i] != results[i-1] + 1)
+                errs++;
+        
+        free(results);
+    }
+    else {
+        blens[0] = rank;
+        disps[0] = 0;
+        blens[1] = nprocs - rank - 1;
+        disps[1] = rank + 1;
+
+        MPI_Type_indexed(2, blens, disps, MPI_INT, &get_type);
+        MPI_Type_commit(&get_type);
+
+        val_array = (int *) malloc(nprocs * sizeof(int));
+
+        /* allocate array to store the values obtained from the 
+           fetch-and-add counter */
+        counter_vals = (int *) malloc(NTIMES * sizeof(int));
+
+        MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win); 
+
+        for (i=0; i<NTIMES; i++) {
+            Get_nextval(win, val_array, get_type, rank, nprocs, counter_vals+i);
+            /* printf("Rank %d, counter %d\n", rank, value); */
+        }
+
+        MPI_Win_free(&win);
+
+        free(val_array);
+        MPI_Type_free(&get_type);
+
+        /* gather the results to the root */
+        MPI_Gather(counter_vals, NTIMES, MPI_INT, NULL, 0, MPI_DATATYPE_NULL, 
+                   0, MPI_COMM_WORLD);
+        free(counter_vals);
+    }
+
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return 0; 
+} 
+
+
+void Get_nextval(MPI_Win win, int *val_array, MPI_Datatype get_type,
+                 int rank, int nprocs, int *value) 
+{
+    int one=1, i;
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, win);
+    MPI_Accumulate(&one, 1, MPI_INT, 0, rank, 1, MPI_INT, MPI_SUM, win);
+    MPI_Get(val_array, 1, get_type, 0, 0, 1, get_type, win); 
+    MPI_Win_unlock(0, win);
+
+    *value = 0;
+    val_array[rank] = localvalue;
+    for (i=0; i<nprocs; i++)
+        *value = *value + val_array[i];
+
+    localvalue++;
+}
+
+int compar(const void *a, const void *b)
+{
+    return (*((int *)a) - *((int *)b));
+}
+
diff --git a/teshsuite/smpi/mpich3-test/rma/fetchandadd_tree.c b/teshsuite/smpi/mpich3-test/rma/fetchandadd_tree.c
new file mode 100644
index 0000000000..8b3c8f37a4
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/fetchandadd_tree.c
@@ -0,0 +1,176 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "stdlib.h"
+#include "mpitest.h"
+
+/* This is the tree-based scalable version of the fetch-and-add
+   example from Using MPI-2, pg 206-207. The code in the book (Fig
+   6.16) has bugs that are fixed below. */ 
+
+
+#define NTIMES 20  /* no of times each process calls the counter
+                      routine */
+
+int localvalue=0;  /* contribution of this process to the counter. We
+                    define it as a global variable because attribute
+                    caching on the window is not enabled yet. */ 
+
+void Get_nextval_tree(MPI_Win win, int *get_array, MPI_Datatype get_type,
+                 MPI_Datatype acc_type, int nlevels, int *value);
+
+int compar(const void *a, const void *b);
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, nprocs, i, *counter_mem, *get_array, *get_idx, *acc_idx,
+        mask, nlevels, level, idx, tmp_rank, pof2;
+    MPI_Datatype get_type, acc_type;
+    MPI_Win win;
+    int errs = 0, *results, *counter_vals;
+ 
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (rank == 0) {
+        /* allocate counter memory and initialize to 0 */
+
+        /* find the next power-of-two >= nprocs */
+        pof2 = 1;
+        while (pof2 < nprocs) pof2 *= 2;
+
+        counter_mem = (int *) calloc(pof2*2, sizeof(int));
+        MPI_Win_create(counter_mem, pof2*2*sizeof(int), sizeof(int),
+                       MPI_INFO_NULL, MPI_COMM_WORLD, &win);
+
+        MPI_Win_free(&win); 
+        free(counter_mem);
+
+        /* gather the results from other processes, sort them, and check 
+           whether they represent a counter being incremented by 1 */
+
+        results = (int *) malloc(NTIMES*nprocs*sizeof(int));
+        for (i=0; i<NTIMES*nprocs; i++)
+            results[i] = -1;
+
+        MPI_Gather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, results, NTIMES, MPI_INT, 
+                   0, MPI_COMM_WORLD);
+
+        qsort(results+NTIMES, NTIMES*(nprocs-1), sizeof(int), compar);
+
+        for (i=NTIMES+1; i<(NTIMES*nprocs); i++)
+            if (results[i] != results[i-1] + 1)
+                errs++;
+        
+        free(results);
+    }
+    else {
+        /* Get the largest power of two smaller than nprocs */ 
+        mask = 1; 
+        nlevels = 0;
+        while (mask < nprocs) {
+            mask <<= 1; 
+            nlevels++;
+        }
+        mask >>= 1;
+
+        get_array = (int *) malloc(nlevels * sizeof(int));
+        get_idx = (int *) malloc(nlevels * sizeof(int));
+        acc_idx = (int *) malloc(nlevels * sizeof(int));
+
+        level = 0; 
+        idx   = 0; 
+        tmp_rank = rank;
+        while (mask >= 1) { 
+            if (tmp_rank < mask) { 
+                /* go to left for acc_idx, go to right for
+                   get_idx. set idx=acc_idx for next iteration */ 
+                acc_idx[level] = idx + 1; 
+                get_idx[level] = idx + mask*2; 
+                idx            = idx + 1; 
+            } 
+            else { 
+                /* go to right for acc_idx, go to left for
+                   get_idx. set idx=acc_idx for next iteration */ 
+                acc_idx[level] = idx + mask*2; 
+                get_idx[level] = idx + 1; 
+                idx            = idx + mask*2; 
+            } 
+            level++;
+            tmp_rank = tmp_rank % mask;
+            mask >>= 1; 
+        } 
+
+/*        for (i=0; i<nlevels; i++)
+            printf("Rank %d, acc_idx[%d]=%d, get_idx[%d]=%d\n", rank,
+                   i, acc_idx[i], i, get_idx[i]);
+*/
+
+        MPI_Type_create_indexed_block(nlevels, 1, get_idx, MPI_INT, &get_type);
+        MPI_Type_create_indexed_block(nlevels, 1, acc_idx, MPI_INT, &acc_type);
+        MPI_Type_commit(&get_type);
+        MPI_Type_commit(&acc_type);
+
+        /* allocate array to store the values obtained from the 
+           fetch-and-add counter */
+        counter_vals = (int *) malloc(NTIMES * sizeof(int));
+
+        MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win); 
+
+        for (i=0; i<NTIMES; i++) {
+            Get_nextval_tree(win, get_array, get_type, acc_type,
+                             nlevels, counter_vals+i); 
+            /* printf("Rank %d, counter %d\n", rank, value); */
+        }
+
+        MPI_Win_free(&win);
+        free(get_array);
+        free(get_idx);
+        free(acc_idx);
+        MPI_Type_free(&get_type);
+        MPI_Type_free(&acc_type);
+
+         /* gather the results to the root */
+        MPI_Gather(counter_vals, NTIMES, MPI_INT, NULL, 0, MPI_DATATYPE_NULL, 
+                   0, MPI_COMM_WORLD);
+        free(counter_vals);
+   }
+
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return MTestReturnValue( errs );
+} 
+
+
+void Get_nextval_tree(MPI_Win win, int *get_array, MPI_Datatype get_type,
+                      MPI_Datatype acc_type, int nlevels, int *value)
+{
+    int *one, i;
+
+    one = (int *) malloc(nlevels*sizeof(int));
+    for (i=0; i<nlevels; i++) one[i] = 1;
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, win);
+    MPI_Accumulate(one, nlevels, MPI_INT, 0, 0, 1, acc_type,
+                   MPI_SUM, win);
+    MPI_Get(get_array, nlevels, MPI_INT, 0, 0, 1, get_type, win);
+    MPI_Win_unlock(0, win);
+
+    *value = localvalue;
+    for (i=0; i<nlevels; i++)
+        *value = *value + get_array[i];
+
+    localvalue++;
+
+    free(one);
+}
+
+int compar(const void *a, const void *b)
+{
+    return (*((int *)a) - *((int *)b));
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/fetchandadd_tree_am.c b/teshsuite/smpi/mpich3-test/rma/fetchandadd_tree_am.c
new file mode 100644
index 0000000000..f5ca1e0eb8
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/fetchandadd_tree_am.c
@@ -0,0 +1,188 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "stdlib.h"
+#include "mpitest.h"
+
+/* This is the tree-based scalable version of the fetch-and-add
+   example from Using MPI-2, pg 206-207. The code in the book (Fig
+   6.16) has bugs that are fixed below. */ 
+
+/* same as fetchandadd_tree.c but uses alloc_mem */
+
+#define NTIMES 20  /* no of times each process calls the counter
+                      routine */
+
+int localvalue=0;  /* contribution of this process to the counter. We
+                    define it as a global variable because attribute
+                    caching on the window is not enabled yet. */ 
+
+void Get_nextval_tree(MPI_Win win, int *get_array, MPI_Datatype get_type,
+                 MPI_Datatype acc_type, int nlevels, int *value);
+
+int compar(const void *a, const void *b);
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, nprocs, i, *counter_mem, *get_array, *get_idx, *acc_idx,
+        mask, nlevels, level, idx, tmp_rank, pof2;
+    MPI_Datatype get_type, acc_type;
+    MPI_Win win;
+    int errs = 0, *results, *counter_vals;
+ 
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (rank == 0) {
+        /* allocate counter memory and initialize to 0 */
+
+        /* find the next power-of-two >= nprocs */
+        pof2 = 1;
+        while (pof2 < nprocs) pof2 *= 2;
+
+        /* counter_mem = (int *) calloc(pof2*2, sizeof(int)); */
+
+        i = MPI_Alloc_mem(pof2*2*sizeof(int), MPI_INFO_NULL, &counter_mem);
+        if (i) {
+            printf("Can't allocate memory in test program\n");
+            MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+
+        for (i=0; i<(pof2*2); i++) counter_mem[i] = 0;
+
+        MPI_Win_create(counter_mem, pof2*2*sizeof(int), sizeof(int),
+                       MPI_INFO_NULL, MPI_COMM_WORLD, &win);
+
+        MPI_Win_free(&win); 
+
+        /* free(counter_mem) */
+        MPI_Free_mem(counter_mem);
+
+        /* gather the results from other processes, sort them, and check 
+           whether they represent a counter being incremented by 1 */
+
+        results = (int *) malloc(NTIMES*nprocs*sizeof(int));
+        for (i=0; i<NTIMES*nprocs; i++)
+            results[i] = -1;
+
+        MPI_Gather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, results, NTIMES, MPI_INT, 
+                   0, MPI_COMM_WORLD);
+
+        qsort(results+NTIMES, NTIMES*(nprocs-1), sizeof(int), compar);
+
+        for (i=NTIMES+1; i<(NTIMES*nprocs); i++)
+            if (results[i] != results[i-1] + 1)
+                errs++;
+        
+        free(results);
+    }
+    else {
+        /* Get the largest power of two smaller than nprocs */ 
+        mask = 1; 
+        nlevels = 0;
+        while (mask < nprocs) {
+            mask <<= 1; 
+            nlevels++;
+        }
+        mask >>= 1;
+
+        get_array = (int *) malloc(nlevels * sizeof(int));
+        get_idx = (int *) malloc(nlevels * sizeof(int));
+        acc_idx = (int *) malloc(nlevels * sizeof(int));
+
+        level = 0; 
+        idx   = 0; 
+        tmp_rank = rank;
+        while (mask >= 1) { 
+            if (tmp_rank < mask) { 
+                /* go to left for acc_idx, go to right for
+                   get_idx. set idx=acc_idx for next iteration */ 
+                acc_idx[level] = idx + 1; 
+                get_idx[level] = idx + mask*2; 
+                idx            = idx + 1; 
+            } 
+            else { 
+                /* go to right for acc_idx, go to left for
+                   get_idx. set idx=acc_idx for next iteration */ 
+                acc_idx[level] = idx + mask*2; 
+                get_idx[level] = idx + 1; 
+                idx            = idx + mask*2; 
+            } 
+            level++;
+            tmp_rank = tmp_rank % mask;
+            mask >>= 1; 
+        } 
+
+/*        for (i=0; i<nlevels; i++)
+            printf("Rank %d, acc_idx[%d]=%d, get_idx[%d]=%d\n", rank,
+                   i, acc_idx[i], i, get_idx[i]);
+*/
+
+        MPI_Type_create_indexed_block(nlevels, 1, get_idx, MPI_INT, &get_type);
+        MPI_Type_create_indexed_block(nlevels, 1, acc_idx, MPI_INT, &acc_type);
+        MPI_Type_commit(&get_type);
+        MPI_Type_commit(&acc_type);
+
+        /* allocate array to store the values obtained from the 
+           fetch-and-add counter */
+        counter_vals = (int *) malloc(NTIMES * sizeof(int));
+
+        MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win); 
+
+        for (i=0; i<NTIMES; i++) {
+            Get_nextval_tree(win, get_array, get_type, acc_type,
+                             nlevels, counter_vals+i); 
+            /* printf("Rank %d, counter %d\n", rank, value); */
+        }
+
+        MPI_Win_free(&win);
+        free(get_array);
+        free(get_idx);
+        free(acc_idx);
+        MPI_Type_free(&get_type);
+        MPI_Type_free(&acc_type);
+
+         /* gather the results to the root */
+        MPI_Gather(counter_vals, NTIMES, MPI_INT, NULL, 0, MPI_DATATYPE_NULL, 
+                   0, MPI_COMM_WORLD);
+        free(counter_vals);
+   }
+
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return MTestReturnValue( errs );
+} 
+
+
+void Get_nextval_tree(MPI_Win win, int *get_array, MPI_Datatype get_type,
+                      MPI_Datatype acc_type, int nlevels, int *value)
+{
+    int *one, i;
+
+    one = (int *) malloc(nlevels*sizeof(int));
+    for (i=0; i<nlevels; i++) one[i] = 1;
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, win);
+    MPI_Accumulate(one, nlevels, MPI_INT, 0, 0, 1, acc_type,
+                   MPI_SUM, win);
+    MPI_Get(get_array, nlevels, MPI_INT, 0, 0, 1, get_type, win);
+    MPI_Win_unlock(0, win);
+
+    *value = localvalue;
+    for (i=0; i<nlevels; i++)
+        *value = *value + get_array[i];
+
+    localvalue++;
+
+    free(one);
+}
+
+int compar(const void *a, const void *b)
+{
+    return (*((int *)a) - *((int *)b));
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/fkeyvalwin.c b/teshsuite/smpi/mpich3-test/rma/fkeyvalwin.c
new file mode 100644
index 0000000000..7aed170871
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/fkeyvalwin.c
@@ -0,0 +1,93 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpitestconf.h"
+#include "mpitest.h"
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+
+/*
+static char MTestDescrip[] = "Test freeing keyvals while still attached to \
+a win, then make sure that the keyval delete code are still \
+executed";
+*/
+
+/* Copy increments the attribute value */
+/* Note that we can really ignore this because there is no win dup */
+int copy_fn( MPI_Win oldwin, int keyval, void *extra_state,
+	     void *attribute_val_in, void *attribute_val_out, 
+	     int *flag);
+int copy_fn( MPI_Win oldwin, int keyval, void *extra_state,
+	     void *attribute_val_in, void *attribute_val_out, 
+	     int *flag)
+{
+    /* Copy the address of the attribute */
+    *(void **)attribute_val_out = attribute_val_in;
+    /* Change the value */
+    *(int *)attribute_val_in = *(int *)attribute_val_in + 1;
+    *flag = 1;
+    return MPI_SUCCESS;
+}
+
+/* Delete decrements the attribute value */
+int delete_fn( MPI_Win win, int keyval, void *attribute_val, 
+	       void *extra_state);
+int delete_fn( MPI_Win win, int keyval, void *attribute_val, 
+	       void *extra_state)
+{
+    *(int *)attribute_val = *(int *)attribute_val - 1;
+    return MPI_SUCCESS;
+}
+
+int main( int argc, char *argv[] )
+{
+    int errs = 0;
+    int attrval;
+    int i, key[32], keyval, saveKeyval;
+    MPI_Win win;
+    MTest_Init( &argc, &argv );
+
+    while (MTestGetWin( &win, 0 )) {
+	if (win == MPI_WIN_NULL) continue;
+
+	MPI_Win_create_keyval( copy_fn, delete_fn, &keyval, (void *)0 );
+	saveKeyval = keyval;   /* in case we need to free explicitly */
+	attrval = 1;
+	MPI_Win_set_attr( win, keyval, (void*)&attrval );
+	/* See MPI-1, 5.7.1.  Freeing the keyval does not remove it if it
+	   is in use in an attribute */
+	MPI_Win_free_keyval( &keyval );
+	
+	/* We create some dummy keyvals here in case the same keyval
+	   is reused */
+	for (i=0; i<32; i++) {
+	    MPI_Win_create_keyval( MPI_NULL_COPY_FN, MPI_NULL_DELETE_FN,
+			       &key[i], (void *)0 );
+	}
+
+        MTestFreeWin(&win);
+
+	/* Check that the original attribute was freed */
+	if (attrval != 0) {
+	    errs++;
+	    printf( "Attribute not decremented when win %s freed\n",
+		    MTestGetWinName() );
+	}
+	/* Free those other keyvals */
+	for (i=0; i<32; i++) {
+	    MPI_Win_free_keyval( &key[i] );
+	}
+
+    }
+    MTest_Finalize( errs );
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/flush.c b/teshsuite/smpi/mpich3-test/rma/flush.c
new file mode 100644
index 0000000000..1f2147003c
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/flush.c
@@ -0,0 +1,89 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include <mpi.h>
+#include <stdio.h>
+#include "mpitest.h"
+
+#define ITER 100
+
+int main( int argc, char *argv[] )
+{
+    int rank, nproc, i;
+    int errors = 0, all_errors = 0;
+    int *buf;
+    MPI_Win window;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    if (nproc < 2) {
+        if (rank == 0) printf("Error: must be run with two or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    /** Create using MPI_Win_create() **/
+
+    if (rank == 0) {
+      MPI_Alloc_mem(sizeof(int), MPI_INFO_NULL, &buf);
+      *buf = nproc-1;
+    } else
+      buf = NULL;
+
+    MPI_Win_create(buf, sizeof(int)*(rank == 0), 1, MPI_INFO_NULL, MPI_COMM_WORLD, &window);
+
+    /* Test flush of an empty epoch */
+    MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, window);
+    MPI_Win_flush_all(window);
+    MPI_Win_unlock(0, window);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Test third-party communication, through rank 0. */
+    MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, window);
+
+    for (i = 0; i < ITER; i++) {
+        int val = -1, exp = -1;
+
+        /* Processes form a ring.  Process 0 starts first, then passes a token
+         * to the right.  Each process, in turn, performs third-party
+         * communication via process 0's window. */
+        if (rank > 0) {
+            MPI_Recv(NULL, 0, MPI_BYTE, rank-1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        }
+
+        MPI_Get_accumulate(&rank, 1, MPI_INT, &val, 1, MPI_INT, 0, 0, 1, MPI_INT, MPI_REPLACE, window);
+        MPI_Win_flush(0, window);
+
+        exp = (rank + nproc-1) % nproc;
+
+        if (val != exp) {
+            printf("%d - Got %d, expected %d\n", rank, val, exp);
+            errors++;
+        }
+
+        if (rank < nproc-1) {
+            MPI_Send(NULL, 0, MPI_BYTE, rank+1, 0, MPI_COMM_WORLD);
+        }
+
+        MPI_Barrier(MPI_COMM_WORLD);
+    }
+
+    MPI_Win_unlock(0, window);
+
+    MPI_Win_free(&window);
+    if (buf) MPI_Free_mem(buf);
+
+    MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+
+    if (rank == 0 && all_errors == 0)
+        printf(" No Errors\n");
+
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/get_acc_local.c b/teshsuite/smpi/mpich3-test/rma/get_acc_local.c
new file mode 100644
index 0000000000..ecf3258407
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/get_acc_local.c
@@ -0,0 +1,52 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+#include "mpitest.h"
+
+int       errors  = 0;
+const int NITER   = 1000;
+const int acc_val = 3;
+
+int main(int argc, char **argv)
+{
+    int         rank, nproc;
+    int         out_val, i, counter = 0;
+    MPI_Win     win;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    MPI_Win_create(&counter, sizeof(int), sizeof(int), MPI_INFO_NULL,
+                   MPI_COMM_WORLD, &win);
+
+    for (i = 0; i < NITER; i++) {
+        MPI_Win_lock(MPI_LOCK_SHARED, rank, 0, win);
+        MPI_Get_accumulate(&acc_val, 1, MPI_INT, &out_val, 1, MPI_INT,
+                            rank, 0, 1, MPI_INT, MPI_SUM, win);
+        MPI_Win_unlock(rank, win);
+
+        if (out_val != acc_val*i) {
+            errors++;
+            printf("Error: got %d, expected %d at iter %d\n", out_val, acc_val*i, i);
+            break;
+        }
+    }
+
+    MPI_Win_free(&win);
+
+    if (errors == 0 && rank == 0)
+        printf(" No errors\n");
+
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/get_accumulate.c b/teshsuite/smpi/mpich3-test/rma/get_accumulate.c
new file mode 100644
index 0000000000..5a533fe0cc
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/get_accumulate.c
@@ -0,0 +1,413 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *
+ *  (C) 2012 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <mpi.h>
+#include "mpitest.h"
+#include "squelch.h"
+
+#define ITER  100
+#define COUNT 5
+
+#if defined (GACC_TYPE_SHORT)
+#  define TYPE_C   short
+#  define TYPE_MPI_BASE MPI_SHORT
+#  define TYPE_FMT "%d"
+#elif defined (GACC_TYPE_LONG)
+#  define TYPE_C   long
+#  define TYPE_MPI_BASE MPI_LONG
+#  define TYPE_FMT "%ld"
+#elif defined (GACC_TYPE_DOUBLE)
+#  define TYPE_C   double
+#  define TYPE_MPI_BASE MPI_DOUBLE
+#  define TYPE_FMT "%f"
+#else
+#  define TYPE_C   int
+#  define TYPE_MPI_BASE MPI_INT
+#  define TYPE_FMT "%d"
+#endif
+
+#if defined(GACC_TYPE_DERIVED)
+#  define TYPE_MPI derived_type
+#else
+#  define TYPE_MPI TYPE_MPI_BASE
+#endif
+
+void reset_bufs(TYPE_C *win_ptr, TYPE_C *res_ptr, TYPE_C *val_ptr, TYPE_C value, MPI_Win win) {
+    int rank, nproc, i;
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    memset(win_ptr, 0, sizeof(TYPE_C)*nproc*COUNT);
+    MPI_Win_unlock(rank, win);
+
+    memset(res_ptr, -1, sizeof(TYPE_C)*nproc*COUNT);
+
+    for (i = 0; i < COUNT; i++)
+        val_ptr[i] = value;
+
+    MPI_Barrier(MPI_COMM_WORLD);
+}
+
+int main(int argc, char **argv) {
+    int       i, rank, nproc;
+    int       errors = 0, all_errors = 0;
+    TYPE_C   *win_ptr, *res_ptr, *val_ptr;
+    MPI_Win   win;
+#if defined (GACC_TYPE_DERIVED)
+    MPI_Datatype derived_type;
+#endif
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    win_ptr = malloc(sizeof(TYPE_C)*nproc*COUNT);
+    res_ptr = malloc(sizeof(TYPE_C)*nproc*COUNT);
+    val_ptr = malloc(sizeof(TYPE_C)*COUNT);
+
+#if defined (GACC_TYPE_DERIVED)
+    MPI_Type_contiguous(1, TYPE_MPI_BASE, &derived_type);
+    MPI_Type_commit(&derived_type);
+#endif
+
+    MPI_Win_create(win_ptr, sizeof(TYPE_C)*nproc*COUNT, sizeof(TYPE_C),
+                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);
+
+    /* Test self communication */
+
+    reset_bufs(win_ptr, res_ptr, val_ptr, 1, win);
+
+    for (i = 0; i < ITER; i++) {
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+        MPI_Get_accumulate(val_ptr, COUNT, TYPE_MPI, res_ptr, COUNT, TYPE_MPI, 
+                            rank, 0, COUNT, TYPE_MPI, MPI_SUM, win);
+        MPI_Win_unlock(rank, win);
+    }
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    for (i = 0; i < COUNT; i++) {
+        if (win_ptr[i] != ITER) {
+            SQUELCH( printf("%d->%d -- SELF[%d]: expected "TYPE_FMT", got "TYPE_FMT"\n",
+                            rank, rank, i, (TYPE_C) ITER, win_ptr[i]); );
+            errors++;
+        }
+    }
+    MPI_Win_unlock(rank, win);
+
+    /* Test neighbor communication */
+
+    reset_bufs(win_ptr, res_ptr, val_ptr, 1, win);
+
+    for (i = 0; i < ITER; i++) {
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, (rank+1)%nproc, 0, win);
+        MPI_Get_accumulate(val_ptr, COUNT, TYPE_MPI, res_ptr, COUNT, TYPE_MPI, 
+                            (rank+1)%nproc, 0, COUNT, TYPE_MPI, MPI_SUM, win);
+        MPI_Win_unlock((rank+1)%nproc, win);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    for (i = 0; i < COUNT; i++) {
+        if (win_ptr[i] != ITER) {
+            SQUELCH( printf("%d->%d -- NEIGHBOR[%d]: expected "TYPE_FMT", got "TYPE_FMT"\n",
+                            (rank+1)%nproc, rank, i, (TYPE_C) ITER, win_ptr[i]); );
+            errors++;
+        }
+    }
+    MPI_Win_unlock(rank, win);
+
+    /* Test contention */
+
+    reset_bufs(win_ptr, res_ptr, val_ptr, 1, win);
+
+    if (rank != 0) {
+        for (i = 0; i < ITER; i++) {
+            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, win);
+            MPI_Get_accumulate(val_ptr, COUNT, TYPE_MPI, res_ptr, COUNT, TYPE_MPI, 
+                                0, 0, COUNT, TYPE_MPI, MPI_SUM, win);
+            MPI_Win_unlock(0, win);
+        }
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    if (rank == 0 && nproc > 1) {
+        for (i = 0; i < COUNT; i++) {
+            if (win_ptr[i] != ITER*(nproc-1)) {
+                SQUELCH( printf("*->%d - CONTENTION[%d]: expected="TYPE_FMT" val="TYPE_FMT"\n",
+                                rank, i, (TYPE_C) ITER*(nproc-1), win_ptr[i]); );
+                errors++;
+            }
+        }
+    }
+    MPI_Win_unlock(rank, win);
+
+    /* Test all-to-all communication (fence) */
+
+    reset_bufs(win_ptr, res_ptr, val_ptr, rank, win);
+
+    for (i = 0; i < ITER; i++) {
+        int j;
+
+        MPI_Win_fence(MPI_MODE_NOPRECEDE, win);
+        for (j = 0; j < nproc; j++) {
+            MPI_Get_accumulate(val_ptr, COUNT, TYPE_MPI, &res_ptr[j*COUNT], COUNT, TYPE_MPI,
+                                j, rank*COUNT, COUNT, TYPE_MPI, MPI_SUM, win);
+        }
+        MPI_Win_fence(MPI_MODE_NOSUCCEED, win);
+        MPI_Barrier(MPI_COMM_WORLD);
+
+        for (j = 0; j < nproc; j++) {
+            int c;
+            for (c = 0; c < COUNT; c++) {
+                if (res_ptr[j*COUNT+c] != i*rank) {
+                    SQUELCH( printf("%d->%d -- ALL-TO-ALL (FENCE) [%d]: iter %d, expected result "TYPE_FMT", got "TYPE_FMT"\n", 
+                                    rank, j, c, i, (TYPE_C) i*rank, res_ptr[j*COUNT+c]); );
+                    errors++;
+                }
+            }
+        }
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    for (i = 0; i < nproc; i++) {
+        int c;
+        for (c = 0; c < COUNT; c++) {
+            if (win_ptr[i*COUNT+c] != ITER*i) {
+                SQUELCH( printf("%d->%d -- ALL-TO-ALL (FENCE): expected "TYPE_FMT", got "TYPE_FMT"\n", 
+                                i, rank, (TYPE_C) ITER*i, win_ptr[i*COUNT+c]); );
+                errors++;
+            }
+        }
+    }
+    MPI_Win_unlock(rank, win);
+
+    /* Test all-to-all communication (lock-all) */
+
+    reset_bufs(win_ptr, res_ptr, val_ptr, rank, win);
+
+    for (i = 0; i < ITER; i++) {
+        int j;
+
+        MPI_Win_lock_all(0, win);
+        for (j = 0; j < nproc; j++) {
+            MPI_Get_accumulate(val_ptr, COUNT, TYPE_MPI, &res_ptr[j*COUNT], COUNT, TYPE_MPI,
+                                j, rank*COUNT, COUNT, TYPE_MPI, MPI_SUM, win);
+        }
+        MPI_Win_unlock_all(win);
+        MPI_Barrier(MPI_COMM_WORLD);
+
+        for (j = 0; j < nproc; j++) {
+            int c;
+            for (c = 0; c < COUNT; c++) {
+                if (res_ptr[j*COUNT+c] != i*rank) {
+                    SQUELCH( printf("%d->%d -- ALL-TO-ALL (LOCK-ALL) [%d]: iter %d, expected result "TYPE_FMT", got "TYPE_FMT"\n", 
+                                    rank, j, c, i, (TYPE_C) i*rank, res_ptr[j*COUNT+c]); );
+                    errors++;
+                }
+            }
+        }
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    for (i = 0; i < nproc; i++) {
+        int c;
+        for (c = 0; c < COUNT; c++) {
+            if (win_ptr[i*COUNT+c] != ITER*i) {
+                SQUELCH( printf("%d->%d -- ALL-TO-ALL (LOCK-ALL): expected "TYPE_FMT", got "TYPE_FMT"\n", 
+                                i, rank, (TYPE_C) ITER*i, win_ptr[i*COUNT+c]); );
+                errors++;
+            }
+        }
+    }
+    MPI_Win_unlock(rank, win);
+
+    /* Test all-to-all communication (lock-all+flush) */
+
+    reset_bufs(win_ptr, res_ptr, val_ptr, rank, win);
+
+    for (i = 0; i < ITER; i++) {
+        int j;
+
+        MPI_Win_lock_all(0, win);
+        for (j = 0; j < nproc; j++) {
+            MPI_Get_accumulate(val_ptr, COUNT, TYPE_MPI, &res_ptr[j*COUNT], COUNT, TYPE_MPI,
+                                j, rank*COUNT, COUNT, TYPE_MPI, MPI_SUM, win);
+            MPI_Win_flush(j, win);
+        }
+        MPI_Win_unlock_all(win);
+        MPI_Barrier(MPI_COMM_WORLD);
+
+        for (j = 0; j < nproc; j++) {
+            int c;
+            for (c = 0; c < COUNT; c++) {
+                if (res_ptr[j*COUNT+c] != i*rank) {
+                    SQUELCH( printf("%d->%d -- ALL-TO-ALL (LOCK-ALL+FLUSH) [%d]: iter %d, expected result "TYPE_FMT", got "TYPE_FMT"\n", 
+                                    rank, j, c, i, (TYPE_C) i*rank, res_ptr[j*COUNT+c]); );
+                    errors++;
+                }
+            }
+        }
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    for (i = 0; i < nproc; i++) {
+        int c;
+        for (c = 0; c < COUNT; c++) {
+            if (win_ptr[i*COUNT+c] != ITER*i) {
+                SQUELCH( printf("%d->%d -- ALL-TO-ALL (LOCK-ALL+FLUSH): expected "TYPE_FMT", got "TYPE_FMT"\n", 
+                                i, rank, (TYPE_C) ITER*i, win_ptr[i*COUNT+c]); );
+                errors++;
+            }
+        }
+    }
+    MPI_Win_unlock(rank, win);
+
+    /* Test NO_OP (neighbor communication) */
+
+    reset_bufs(win_ptr, res_ptr, val_ptr, 1, win);
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    for (i = 0; i < COUNT*nproc; i++)
+        win_ptr[i] = (TYPE_C) rank;
+    MPI_Win_unlock(rank, win);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    for (i = 0; i < ITER; i++) {
+        int j, target = (rank+1) % nproc;
+
+        /* Test: origin_buf = NULL */
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, target, 0, win);
+        MPI_Get_accumulate(NULL, COUNT, TYPE_MPI, res_ptr, COUNT, TYPE_MPI,
+                            target, 0, COUNT, TYPE_MPI, MPI_NO_OP, win);
+        MPI_Win_unlock(target, win);
+
+        for (j = 0; j < COUNT; j++) {
+            if (res_ptr[j] != (TYPE_C) target) {
+                SQUELCH( printf("%d->%d -- NOP(1)[%d]: expected "TYPE_FMT", got "TYPE_FMT"\n",
+                                target, rank, i, (TYPE_C) target, res_ptr[i]); );
+                errors++;
+            }
+        }
+
+        /* Test: origin_buf = NULL, origin_count = 0 */
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, target, 0, win);
+        MPI_Get_accumulate(NULL, 0, TYPE_MPI, res_ptr, COUNT, TYPE_MPI,
+                            target, 0, COUNT, TYPE_MPI, MPI_NO_OP, win);
+        MPI_Win_unlock(target, win);
+
+        for (j = 0; j < COUNT; j++) {
+            if (res_ptr[j] != (TYPE_C) target) {
+                SQUELCH( printf("%d->%d -- NOP(2)[%d]: expected "TYPE_FMT", got "TYPE_FMT"\n",
+                                target, rank, i, (TYPE_C) target, res_ptr[i]); );
+                errors++;
+            }
+        }
+
+        /* Test: origin_buf = NULL, origin_count = 0, origin_dtype = NULL */
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, target, 0, win);
+        MPI_Get_accumulate(NULL, 0, MPI_DATATYPE_NULL, res_ptr, COUNT, TYPE_MPI,
+                            target, 0, COUNT, TYPE_MPI, MPI_NO_OP, win);
+        MPI_Win_unlock(target, win);
+
+        for (j = 0; j < COUNT; j++) {
+            if (res_ptr[j] != (TYPE_C) target) {
+                SQUELCH( printf("%d->%d -- NOP(2)[%d]: expected "TYPE_FMT", got "TYPE_FMT"\n",
+                                target, rank, i, (TYPE_C) target, res_ptr[i]); );
+                errors++;
+            }
+        }
+    }
+
+    /* Test NO_OP (self communication) */
+
+    reset_bufs(win_ptr, res_ptr, val_ptr, 1, win);
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
+    for (i = 0; i < COUNT*nproc; i++)
+        win_ptr[i] = (TYPE_C) rank;
+    MPI_Win_unlock(rank, win);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    for (i = 0; i < ITER; i++) {
+        int j, target = rank;
+
+        /* Test: origin_buf = NULL */
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, target, 0, win);
+        MPI_Get_accumulate(NULL, COUNT, TYPE_MPI, res_ptr, COUNT, TYPE_MPI,
+                            target, 0, COUNT, TYPE_MPI, MPI_NO_OP, win);
+        MPI_Win_unlock(target, win);
+
+        for (j = 0; j < COUNT; j++) {
+            if (res_ptr[j] != (TYPE_C) target) {
+                SQUELCH( printf("%d->%d -- NOP_SELF(1)[%d]: expected "TYPE_FMT", got "TYPE_FMT"\n",
+                                target, rank, i, (TYPE_C) target, res_ptr[i]); );
+                errors++;
+            }
+        }
+
+        /* Test: origin_buf = NULL, origin_count = 0 */
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, target, 0, win);
+        MPI_Get_accumulate(NULL, 0, TYPE_MPI, res_ptr, COUNT, TYPE_MPI,
+                            target, 0, COUNT, TYPE_MPI, MPI_NO_OP, win);
+        MPI_Win_unlock(target, win);
+
+        for (j = 0; j < COUNT; j++) {
+            if (res_ptr[j] != (TYPE_C) target) {
+                SQUELCH( printf("%d->%d -- NOP_SELF(2)[%d]: expected "TYPE_FMT", got "TYPE_FMT"\n",
+                                target, rank, i, (TYPE_C) target, res_ptr[i]); );
+                errors++;
+            }
+        }
+
+        /* Test: origin_buf = NULL, origin_count = 0, origin_dtype = NULL */
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, target, 0, win);
+        MPI_Get_accumulate(NULL, 0, MPI_DATATYPE_NULL, res_ptr, COUNT, TYPE_MPI,
+                            target, 0, COUNT, TYPE_MPI, MPI_NO_OP, win);
+        MPI_Win_unlock(target, win);
+
+        for (j = 0; j < COUNT; j++) {
+            if (res_ptr[j] != (TYPE_C) target) {
+                SQUELCH( printf("%d->%d -- NOP_SELF(2)[%d]: expected "TYPE_FMT", got "TYPE_FMT"\n",
+                                target, rank, i, (TYPE_C) target, res_ptr[i]); );
+                errors++;
+            }
+        }
+    }
+
+    MPI_Win_free(&win);
+
+    MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+
+    if (rank == 0 && all_errors == 0)
+        printf(" No Errors\n");
+
+#if defined (GACC_TYPE_DERIVED)
+    MPI_Type_free(&derived_type);
+#endif
+
+    free(win_ptr);
+    free(res_ptr);
+    free(val_ptr);
+
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/getfence1.c b/teshsuite/smpi/mpich3-test/rma/getfence1.c
new file mode 100644
index 0000000000..2aaba8229e
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/getfence1.c
@@ -0,0 +1,99 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include "mpitest.h"
+
+/*
+static char MTEST_Descrip[] = "Get with Fence";
+*/
+
+int main( int argc, char *argv[] )
+{
+    int errs = 0, err;
+    int rank, size, source, dest;
+    int minsize = 2, count; 
+    MPI_Comm      comm;
+    MPI_Win       win;
+    MPI_Aint      extent;
+    MTestDatatype sendtype, recvtype;
+
+    MTest_Init( &argc, &argv );
+
+    /* The following illustrates the use of the routines to 
+       run through a selection of communicators and datatypes.
+       Use subsets of these for tests that do not involve combinations 
+       of communicators, datatypes, and counts of datatypes */
+    while (MTestGetIntracommGeneral( &comm, minsize, 1 )) {
+	if (comm == MPI_COMM_NULL) continue;
+	/* Determine the sender and receiver */
+	MPI_Comm_rank( comm, &rank );
+	MPI_Comm_size( comm, &size );
+	source = 0;
+	dest   = size - 1;
+	
+	for (count = 1; count < 65000; count = count * 2) {
+	    while (MTestGetDatatypes( &sendtype, &recvtype, count )) {
+		/* Make sure that everyone has a recv buffer */
+		recvtype.InitBuf( &recvtype );
+		sendtype.InitBuf( &sendtype );
+		/* By default, print information about errors */
+		recvtype.printErrors = 1;
+		sendtype.printErrors = 1;
+
+		MPI_Type_extent( sendtype.datatype, &extent );
+		MPI_Win_create( sendtype.buf, sendtype.count * extent, 
+				(int)extent, MPI_INFO_NULL, comm, &win );
+		MPI_Win_fence( 0, win );
+		if (rank == source) {
+		    /* The source does not need to do anything besides the
+		       fence */
+		    MPI_Win_fence( 0, win );
+		}
+		else if (rank == dest) {
+		    /* To improve reporting of problems about operations, we
+		       change the error handler to errors return */
+		    MPI_Win_set_errhandler( win, MPI_ERRORS_RETURN );
+
+		    /* This should have the same effect, in terms of
+		       transfering data, as a send/recv pair */
+		    err = MPI_Get( recvtype.buf, recvtype.count, 
+				   recvtype.datatype, source, 0, 
+				   sendtype.count, sendtype.datatype, win );
+		    if (err) {
+			errs++;
+			if (errs < 10) {
+			    MTestPrintError( err );
+			}
+		    }
+		    err = MPI_Win_fence( 0, win );
+		    if (err) {
+			errs++;
+			if (errs < 10) {
+			    MTestPrintError( err );
+			}
+		    }
+		    err = MTestCheckRecv( 0, &recvtype );
+		    if (err) {
+			errs += err;
+		    }
+		}
+		else {
+		    MPI_Win_fence( 0, win );
+		}
+		MPI_Win_free( &win );
+		MTestFreeDatatype( &recvtype );
+		MTestFreeDatatype( &sendtype );
+	    }
+	}
+        MTestFreeComm(&comm);
+    }
+
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/getgroup.c b/teshsuite/smpi/mpich3-test/rma/getgroup.c
new file mode 100644
index 0000000000..4dd0c0db93
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/getgroup.c
@@ -0,0 +1,52 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include "mpitest.h"
+
+/*
+static char MTEST_Descrip[] = "Test of Win_get_group";
+*/
+
+int main( int argc, char *argv[] )
+{
+    int errs = 0;
+    int result;
+    int buf[10];
+    MPI_Win   win;
+    MPI_Group group, wingroup;
+    int minsize = 2;
+    MPI_Comm      comm;
+
+    MTest_Init( &argc, &argv );
+
+    /* The following illustrates the use of the routines to 
+       run through a selection of communicators and datatypes.
+       Use subsets of these for tests that do not involve combinations 
+       of communicators, datatypes, and counts of datatypes */
+    while (MTestGetIntracommGeneral( &comm, minsize, 1 )) {
+	if (comm == MPI_COMM_NULL) continue;
+
+	MPI_Win_create( buf, sizeof(int) * 10, sizeof(int), 
+			MPI_INFO_NULL, comm, &win );
+	MPI_Win_get_group( win, &wingroup );
+	MPI_Comm_group( comm, &group );
+	MPI_Group_compare( group, wingroup, &result );
+	if (result != MPI_IDENT) {
+	    errs++;
+	    fprintf( stderr, "Group returned by Win_get_group not the same as the input group\n" );
+	}
+	MPI_Group_free( &wingroup );
+	MPI_Group_free( &group );
+	MPI_Win_free( &win );
+        MTestFreeComm(&comm);
+    }
+
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/ircpi.c b/teshsuite/smpi/mpich3-test/rma/ircpi.c
new file mode 100644
index 0000000000..99a83ed9a4
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/ircpi.c
@@ -0,0 +1,71 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include <math.h> 
+
+/* From Using MPI-2 */
+
+int main(int argc, char *argv[]) 
+{ 
+    int n, myid, numprocs, i, ierr; 
+    double PI25DT = 3.141592653589793238462643; 
+    double mypi, pi, h, sum, x; 
+    MPI_Win nwin, piwin; 
+ 
+    MPI_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&numprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&myid); 
+ 
+    if (myid == 0) { 
+	MPI_Win_create(&n, sizeof(int), 1, MPI_INFO_NULL, 
+		       MPI_COMM_WORLD, &nwin); 
+	MPI_Win_create(&pi, sizeof(double), 1, MPI_INFO_NULL, 
+		       MPI_COMM_WORLD, &piwin);  
+    } 
+    else { 
+	MPI_Win_create(MPI_BOTTOM, 0, 1, MPI_INFO_NULL, 
+		       MPI_COMM_WORLD, &nwin); 
+	MPI_Win_create(MPI_BOTTOM, 0, 1, MPI_INFO_NULL, 
+		       MPI_COMM_WORLD, &piwin); 
+    } 
+    while (1) { 
+        if (myid == 0) { 
+            fprintf(stdout, "Enter the number of intervals: (0 quits) ");
+	    fflush(stdout); 
+            ierr=scanf("%d",&n); 
+	    pi = 0.0;			 
+        } 
+	MPI_Win_fence(0, nwin); 
+	if (myid != 0)  
+	    MPI_Get(&n, 1, MPI_INT, 0, 0, 1, MPI_INT, nwin); 
+	MPI_Win_fence(0, nwin); 
+        if (n == 0) 
+            break; 
+        else { 
+            h   = 1.0 / (double) n; 
+            sum = 0.0; 
+            for (i = myid + 1; i <= n; i += numprocs) { 
+                x = h * ((double)i - 0.5); 
+                sum += (4.0 / (1.0 + x*x)); 
+            } 
+            mypi = h * sum; 
+	    MPI_Win_fence( 0, piwin); 
+	    MPI_Accumulate(&mypi, 1, MPI_DOUBLE, 0, 0, 1, MPI_DOUBLE, 
+			   MPI_SUM, piwin); 
+	    MPI_Win_fence(0, piwin); 
+            if (myid == 0) { 
+                fprintf(stdout, "pi is approximately %.16f, Error is %.16f\n", 
+                       pi, fabs(pi - PI25DT)); 
+		fflush(stdout);
+	    }
+        } 
+    } 
+    MPI_Win_free(&nwin); 
+    MPI_Win_free(&piwin); 
+    MPI_Finalize(); 
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/rma/linked_list.c b/teshsuite/smpi/mpich3-test/rma/linked_list.c
new file mode 100644
index 0000000000..c0dc769871
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/linked_list.c
@@ -0,0 +1,231 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/*            MPI-3 distributed linked list construction example
+ *            --------------------------------------------------
+ * 
+ * Construct a distributed shared linked list using proposed MPI-3 dynamic
+ * windows.  Initially process 0 creates the head of the list, attaches it to
+ * the window, and broadcasts the pointer to all processes.  All processes then
+ * concurrently append N new elements to the list.  When a process attempts to
+ * attach its element to the tail of list it may discover that its tail pointer
+ * is stale and it must chase ahead to the new tail before the element can be
+ * attached.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include <assert.h>
+#include "mpitest.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#define NUM_ELEMS 32
+#define NPROBE    100
+#define ELEM_PER_ROW 16
+
+/* Linked list pointer */
+typedef struct {
+    int      rank;
+    MPI_Aint disp;
+} llist_ptr_t;
+
+/* Linked list element */
+typedef struct {
+    int value;
+    llist_ptr_t next;
+} llist_elem_t;
+
+static const llist_ptr_t nil = { -1, (MPI_Aint) MPI_BOTTOM };
+static const int verbose = 0;
+
+/* List of locally allocated list elements. */
+static llist_elem_t **my_elems = NULL;
+static int my_elems_size  = 0;
+static int my_elems_count = 0;
+
+/* Allocate a new shared linked list element */
+MPI_Aint alloc_elem(int value, MPI_Win win) {
+    MPI_Aint disp;
+    llist_elem_t *elem_ptr;
+
+    /* Allocate the new element and register it with the window */
+    MPI_Alloc_mem(sizeof(llist_elem_t), MPI_INFO_NULL, &elem_ptr);
+    elem_ptr->value = value;
+    elem_ptr->next  = nil;
+    MPI_Win_attach(win, elem_ptr, sizeof(llist_elem_t));
+
+    /* Add the element to the list of local elements so we can free it later. */
+    if (my_elems_size == my_elems_count) {
+        my_elems_size += 100;
+        my_elems = realloc(my_elems, my_elems_size*sizeof(void*));
+    }
+    my_elems[my_elems_count] = elem_ptr;
+    my_elems_count++;
+
+    MPI_Get_address(elem_ptr, &disp);
+    return disp;
+}
+
+int main(int argc, char **argv) {
+    int           procid, nproc, i;
+    MPI_Win       llist_win;
+    llist_ptr_t   head_ptr, tail_ptr;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &procid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &llist_win);
+
+    /* Process 0 creates the head node */
+    if (procid == 0)
+        head_ptr.disp = alloc_elem(-1, llist_win);
+
+    /* Broadcast the head pointer to everyone */
+    head_ptr.rank = 0;
+    MPI_Bcast(&head_ptr.disp, 1, MPI_AINT, 0, MPI_COMM_WORLD);
+    tail_ptr = head_ptr;
+
+    /* All processes concurrently append NUM_ELEMS elements to the list */
+    for (i = 0; i < NUM_ELEMS; i++) {
+        llist_ptr_t new_elem_ptr;
+        int success;
+
+        /* Create a new list element and register it with the window */
+        new_elem_ptr.rank = procid;
+        new_elem_ptr.disp = alloc_elem(procid, llist_win);
+
+        /* Append the new node to the list.  This might take multiple attempts if
+           others have already appended and our tail pointer is stale. */
+        do {
+            llist_ptr_t next_tail_ptr = nil;
+
+            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tail_ptr.rank, 0, llist_win);
+
+            MPI_Compare_and_swap((void*) &new_elem_ptr.rank, (void*) &nil.rank,
+                                  (void*) &next_tail_ptr.rank, MPI_INT, tail_ptr.rank,
+                                  (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next.rank), llist_win);
+
+            MPI_Win_unlock(tail_ptr.rank, llist_win);
+            success = (next_tail_ptr.rank == nil.rank);
+
+            if (success) {
+                int i, flag;
+
+                MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tail_ptr.rank, 0, llist_win);
+
+                MPI_Put(&new_elem_ptr.disp, 1, MPI_AINT, tail_ptr.rank,
+                        (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next.disp), 1,
+                        MPI_AINT, llist_win);
+
+                MPI_Win_unlock(tail_ptr.rank, llist_win);
+                tail_ptr = new_elem_ptr;
+
+                /* For implementations that use pt-to-pt messaging, force progress for other threads'
+                   RMA operations. */
+                for (i = 0; i < NPROBE; i++)
+                    MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE);
+
+            } else {
+                /* Tail pointer is stale, fetch the displacement.  May take multiple tries
+                   if it is being updated. */
+                do {
+                    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tail_ptr.rank, 0, llist_win);
+
+                    MPI_Get( &next_tail_ptr.disp, 1, MPI_AINT, tail_ptr.rank,
+                             (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next.disp),
+                             1, MPI_AINT, llist_win);
+
+                    MPI_Win_unlock(tail_ptr.rank, llist_win);
+                } while (next_tail_ptr.disp == nil.disp);
+                tail_ptr = next_tail_ptr;
+            }
+        } while (!success);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Traverse the list and verify that all processes inserted exactly the correct
+       number of elements. */
+    if (procid == 0) {
+        int  have_root = 0;
+        int  errors    = 0;
+        int *counts, count = 0;
+
+        counts = (int*) malloc(sizeof(int) * nproc);
+        assert(counts != NULL);
+
+        for (i = 0; i < nproc; i++)
+            counts[i] = 0;
+
+        tail_ptr = head_ptr;
+
+        /* Walk the list and tally up the number of elements inserted by each rank */
+        while (tail_ptr.disp != nil.disp) {
+            llist_elem_t elem;
+
+            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tail_ptr.rank, 0, llist_win);
+
+            MPI_Get(&elem, sizeof(llist_elem_t), MPI_BYTE,
+                    tail_ptr.rank, tail_ptr.disp, sizeof(llist_elem_t), MPI_BYTE, llist_win);
+
+            MPI_Win_unlock(tail_ptr.rank, llist_win);
+
+            tail_ptr = elem.next;
+
+            /* This is not the root */
+            if (have_root) {
+                assert(elem.value >= 0 && elem.value < nproc);
+                counts[elem.value]++;
+                count++;
+
+                if (verbose) {
+                    int last_elem = tail_ptr.disp == nil.disp;
+                    printf("%2d%s", elem.value, last_elem ? "" : " -> ");
+                    if (count % ELEM_PER_ROW == 0 && !last_elem)
+                        printf("\n");
+                }
+            }
+
+            /* This is the root */
+            else {
+                assert(elem.value == -1);
+                have_root = 1;
+            }
+        }
+
+        if (verbose)
+          printf("\n\n");
+
+        /* Verify the counts we collected */
+        for (i = 0; i < nproc; i++) {
+            int expected = NUM_ELEMS;
+
+            if (counts[i] != expected) {
+                printf("Error: Rank %d inserted %d elements, expected %d\n", i, counts[i], expected);
+                errors++;
+            }
+        }
+
+        printf("%s\n", errors == 0 ? " No Errors" : "FAIL");
+        free(counts);
+    }
+
+    MPI_Win_free(&llist_win);
+
+    /* Free all the elements in the list */
+    for ( ; my_elems_count > 0; my_elems_count--)
+        MPI_Free_mem(my_elems[my_elems_count-1]);
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/linked_list_bench_lock_all.c b/teshsuite/smpi/mpich3-test/rma/linked_list_bench_lock_all.c
new file mode 100644
index 0000000000..ef6705083c
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/linked_list_bench_lock_all.c
@@ -0,0 +1,263 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/*            MPI-3 distributed linked list construction example
+ *            --------------------------------------------------
+ *
+ * Construct a distributed shared linked list using proposed MPI-3 dynamic
+ * windows.  Initially process 0 creates the head of the list, attaches it to
+ * the window, and broadcasts the pointer to all processes.  Each process p then
+ * appends N new elements to the list when the tail reaches process p-1.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include <assert.h>
+#include "mpitest.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#define NUM_ELEMS 1000
+#define MAX_NPROBE nproc
+#define MIN_NPROBE 1
+#define ELEM_PER_ROW 16
+
+#define MIN(X,Y) ((X < Y) ? (X) : (Y))
+#define MAX(X,Y) ((X > Y) ? (X) : (Y))
+
+/* Linked list pointer */
+typedef struct {
+    int      rank;
+    MPI_Aint disp;
+} llist_ptr_t;
+
+/* Linked list element */
+typedef struct {
+    int value;
+    llist_ptr_t next;
+} llist_elem_t;
+
+static const llist_ptr_t nil = { -1, (MPI_Aint) MPI_BOTTOM };
+static const int verbose = 0;
+static const int print_perf = 0;
+
+/* List of locally allocated list elements. */
+static llist_elem_t **my_elems = NULL;
+static int my_elems_size  = 0;
+static int my_elems_count = 0;
+
+/* Allocate a new shared linked list element */
+MPI_Aint alloc_elem(int value, MPI_Win win) {
+    MPI_Aint disp;
+    llist_elem_t *elem_ptr;
+
+    /* Allocate the new element and register it with the window */
+    MPI_Alloc_mem(sizeof(llist_elem_t), MPI_INFO_NULL, &elem_ptr);
+    elem_ptr->value = value;
+    elem_ptr->next  = nil;
+    MPI_Win_attach(win, elem_ptr, sizeof(llist_elem_t));
+
+    /* Add the element to the list of local elements so we can free it later. */
+    if (my_elems_size == my_elems_count) {
+        my_elems_size += 100;
+        my_elems = realloc(my_elems, my_elems_size*sizeof(void*));
+    }
+    my_elems[my_elems_count] = elem_ptr;
+    my_elems_count++;
+
+    MPI_Get_address(elem_ptr, &disp);
+    return disp;
+}
+
+int main(int argc, char **argv) {
+    int           procid, nproc, i, j, my_nelem;
+    int           pollint = 0;
+    double        time;
+    MPI_Win       llist_win;
+    llist_ptr_t   head_ptr, tail_ptr;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &procid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &llist_win);
+
+    /* Process 0 creates the head node */
+    if (procid == 0)
+        head_ptr.disp = alloc_elem(procid, llist_win);
+
+    /* Broadcast the head pointer to everyone */
+    head_ptr.rank = 0;
+    MPI_Bcast(&head_ptr.disp, 1, MPI_AINT, 0, MPI_COMM_WORLD);
+    tail_ptr = head_ptr;
+
+    /* All processes append NUM_ELEMS elements to the list; rank 0 has already
+     * appended an element. */
+    if (procid == 0)
+        i = 1;
+    else
+        i = 0;
+
+    my_nelem = NUM_ELEMS/nproc;
+    if (procid < NUM_ELEMS % nproc)
+        my_nelem++;
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    time = MPI_Wtime();
+
+    /* Lock the window for shared access to all targets */
+    MPI_Win_lock_all(0, llist_win);
+
+    for ( ; i < my_nelem; i++) {
+        llist_ptr_t new_elem_ptr;
+        int success = 0;
+
+        /* Create a new list element and register it with the window */
+        new_elem_ptr.rank = procid;
+        new_elem_ptr.disp = alloc_elem(procid, llist_win);
+
+        /* Append the new node to the list.  This might take multiple attempts if
+           others have already appended and our tail pointer is stale. */
+        do {
+            int flag;
+
+            /* The tail is at my left neighbor, append my element. */
+            if (tail_ptr.rank == (procid + nproc-1) % nproc)
+            {
+                if (verbose)
+                    printf("%d: Appending to <%d, %p>\n", procid, tail_ptr.rank, (void*) tail_ptr.disp);
+#if 1
+                MPI_Accumulate(&new_elem_ptr, sizeof(llist_ptr_t), MPI_BYTE, tail_ptr.rank,
+                               (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next), sizeof(llist_ptr_t),
+                               MPI_BYTE, MPI_REPLACE, llist_win);
+#else
+                MPI_Put(&new_elem_ptr, sizeof(llist_ptr_t), MPI_BYTE, tail_ptr.rank,
+                        (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next), sizeof(llist_ptr_t),
+                        MPI_BYTE, llist_win);
+#endif
+                MPI_Win_flush(tail_ptr.rank, llist_win);
+
+                success = 1;
+                tail_ptr = new_elem_ptr;
+            }
+
+            /* Otherwise, chase the tail. */
+            else
+            {
+                llist_ptr_t next_tail_ptr;
+
+                MPI_Get_accumulate( NULL, 0, MPI_DATATYPE_NULL, &next_tail_ptr,
+                                    sizeof(llist_ptr_t), MPI_BYTE, tail_ptr.rank,
+                                    (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next),
+                                    sizeof(llist_ptr_t), MPI_BYTE, MPI_NO_OP, llist_win);
+
+                MPI_Win_flush(tail_ptr.rank, llist_win);
+
+                if (next_tail_ptr.rank != nil.rank) {
+                    if (verbose)
+                        printf("%d: Chasing to <%d, %p>\n", procid, next_tail_ptr.rank, (void*) next_tail_ptr.disp);
+                    tail_ptr = next_tail_ptr;
+                    pollint = MAX(MIN_NPROBE, pollint/2);
+                }
+                else {
+                    for (j = 0; j < pollint; j++)
+                        MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE);
+
+                    pollint = MIN(MAX_NPROBE, pollint*2);
+                }
+            }
+        } while (!success);
+    }
+
+    MPI_Win_unlock_all(llist_win);
+    MPI_Barrier(MPI_COMM_WORLD);
+    time = MPI_Wtime() - time;
+
+    /* Traverse the list and verify that all processes inserted exactly the correct
+       number of elements. */
+    if (procid == 0) {
+        int  errors    = 0;
+        int *counts, count = 0;
+
+        counts = (int*) malloc(sizeof(int) * nproc);
+        assert(counts != NULL);
+
+        for (i = 0; i < nproc; i++)
+            counts[i] = 0;
+
+        tail_ptr = head_ptr;
+
+        MPI_Win_lock_all(0, llist_win);
+
+        /* Walk the list and tally up the number of elements inserted by each rank */
+        while (tail_ptr.disp != nil.disp) {
+            llist_elem_t elem;
+
+            MPI_Get(&elem, sizeof(llist_elem_t), MPI_BYTE,
+                    tail_ptr.rank, tail_ptr.disp, sizeof(llist_elem_t), MPI_BYTE, llist_win);
+
+            MPI_Win_flush(tail_ptr.rank, llist_win);
+
+            tail_ptr = elem.next;
+
+            assert(elem.value >= 0 && elem.value < nproc);
+            counts[elem.value]++;
+            count++;
+
+            if (verbose) {
+                int last_elem = tail_ptr.disp == nil.disp;
+                printf("%2d%s", elem.value, last_elem ? "" : " -> ");
+                if (count % ELEM_PER_ROW == 0 && !last_elem)
+                    printf("\n");
+            }
+        }
+
+        MPI_Win_unlock_all(llist_win);
+
+        if (verbose)
+          printf("\n\n");
+
+        /* Verify the counts we collected */
+        for (i = 0; i < nproc; i++) {
+            int expected;
+
+            expected = NUM_ELEMS/nproc;
+            if (i < NUM_ELEMS % nproc)
+                expected++;
+
+            if (counts[i] != expected) {
+                printf("Error: Rank %d inserted %d elements, expected %d\n", i, counts[i], expected);
+                errors++;
+            }
+        }
+
+        printf("%s\n", errors == 0 ? " No Errors" : "FAIL");
+        free(counts);
+    }
+
+    if (print_perf) {
+        double max_time;
+
+        MPI_Reduce(&time, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+
+        if (procid == 0) {
+            printf("Total time = %0.2f sec, elem/sec = %0.2f, sec/elem = %0.2f usec\n", max_time, NUM_ELEMS/max_time, max_time/NUM_ELEMS*1.0e6);
+        }
+    }
+
+    MPI_Win_free(&llist_win);
+
+    /* Free all the elements in the list */
+    for ( ; my_elems_count > 0; my_elems_count--)
+        MPI_Free_mem(my_elems[my_elems_count-1]);
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/linked_list_bench_lock_excl.c b/teshsuite/smpi/mpich3-test/rma/linked_list_bench_lock_excl.c
new file mode 100644
index 0000000000..8bf5427d48
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/linked_list_bench_lock_excl.c
@@ -0,0 +1,266 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/*            MPI-3 distributed linked list construction example
+ *            --------------------------------------------------
+ *
+ * Construct a distributed shared linked list using proposed MPI-3 dynamic
+ * windows.  Initially process 0 creates the head of the list, attaches it to
+ * the window, and broadcasts the pointer to all processes.  Each process p then
+ * appends N new elements to the list when the tail reaches process p-1.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include <assert.h>
+#include "mpitest.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#define NUM_ELEMS 1000
+#define MAX_NPROBE nproc
+#define MIN_NPROBE 1
+#define ELEM_PER_ROW 16
+
+#define MIN(X,Y) ((X < Y) ? (X) : (Y))
+#define MAX(X,Y) ((X > Y) ? (X) : (Y))
+
+/* Linked list pointer */
+typedef struct {
+    int      rank;
+    MPI_Aint disp;
+} llist_ptr_t;
+
+/* Linked list element */
+typedef struct {
+    int value;
+    llist_ptr_t next;
+} llist_elem_t;
+
+static const llist_ptr_t nil = { -1, (MPI_Aint) MPI_BOTTOM };
+static const int verbose = 0;
+static const int print_perf = 0;
+
+/* List of locally allocated list elements. */
+static llist_elem_t **my_elems = NULL;
+static int my_elems_size  = 0;
+static int my_elems_count = 0;
+
+/* Allocate a new shared linked list element */
+MPI_Aint alloc_elem(int value, MPI_Win win) {
+    MPI_Aint disp;
+    llist_elem_t *elem_ptr;
+
+    /* Allocate the new element and register it with the window */
+    MPI_Alloc_mem(sizeof(llist_elem_t), MPI_INFO_NULL, &elem_ptr);
+    elem_ptr->value = value;
+    elem_ptr->next  = nil;
+    MPI_Win_attach(win, elem_ptr, sizeof(llist_elem_t));
+
+    /* Add the element to the list of local elements so we can free it later. */
+    if (my_elems_size == my_elems_count) {
+        my_elems_size += 100;
+        my_elems = realloc(my_elems, my_elems_size*sizeof(void*));
+    }
+    my_elems[my_elems_count] = elem_ptr;
+    my_elems_count++;
+
+    MPI_Get_address(elem_ptr, &disp);
+    return disp;
+}
+
+int main(int argc, char **argv) {
+    int           procid, nproc, i, j, my_nelem;
+    int           pollint = 0;
+    double        time;
+    MPI_Win       llist_win;
+    llist_ptr_t   head_ptr, tail_ptr;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &procid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &llist_win);
+
+    /* Process 0 creates the head node */
+    if (procid == 0)
+        head_ptr.disp = alloc_elem(procid, llist_win);
+
+    /* Broadcast the head pointer to everyone */
+    head_ptr.rank = 0;
+    MPI_Bcast(&head_ptr.disp, 1, MPI_AINT, 0, MPI_COMM_WORLD);
+    tail_ptr = head_ptr;
+
+    /* All processes append NUM_ELEMS elements to the list; rank 0 has already
+     * appended an element. */
+    if (procid == 0)
+        i = 1;
+    else
+        i = 0;
+    my_nelem = NUM_ELEMS/nproc;
+    if (procid < NUM_ELEMS % nproc)
+        my_nelem++;
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    time = MPI_Wtime();
+
+    for ( ; i < my_nelem; i++) {
+        llist_ptr_t new_elem_ptr;
+        int success = 0;
+
+        /* Create a new list element and register it with the window */
+        new_elem_ptr.rank = procid;
+        new_elem_ptr.disp = alloc_elem(procid, llist_win);
+
+        /* Append the new node to the list.  This might take multiple attempts if
+           others have already appended and our tail pointer is stale. */
+        do {
+            int flag;
+
+            /* The tail is at my left neighbor, append my element. */
+            if (tail_ptr.rank == (procid + nproc-1) % nproc)
+            {
+                if (verbose)
+                    printf("%d: Appending to <%d, %p>\n", procid, tail_ptr.rank, (void*) tail_ptr.disp);
+
+                MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tail_ptr.rank, 0, llist_win);
+#if USE_ACC
+                MPI_Accumulate(&new_elem_ptr, sizeof(llist_ptr_t), MPI_BYTE, tail_ptr.rank,
+                               (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next), sizeof(llist_ptr_t),
+                               MPI_BYTE, MPI_REPLACE, llist_win);
+#else
+                MPI_Put(&new_elem_ptr, sizeof(llist_ptr_t), MPI_BYTE, tail_ptr.rank,
+                        (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next), sizeof(llist_ptr_t),
+                        MPI_BYTE, llist_win);
+#endif
+                MPI_Win_unlock(tail_ptr.rank, llist_win);
+
+                success = 1;
+                tail_ptr = new_elem_ptr;
+            }
+
+            /* Otherwise, chase the tail. */
+            else
+            {
+                llist_ptr_t next_tail_ptr;
+
+                MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tail_ptr.rank, 0, llist_win);
+#if USE_ACC
+                MPI_Get_accumulate( NULL, 0, MPI_DATATYPE_NULL, &next_tail_ptr,
+                                    sizeof(llist_ptr_t), MPI_BYTE, tail_ptr.rank,
+                                    (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next),
+                                    sizeof(llist_ptr_t), MPI_BYTE, MPI_NO_OP, llist_win);
+#else
+                MPI_Get(&next_tail_ptr, sizeof(llist_ptr_t), MPI_BYTE, tail_ptr.rank,
+                        (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next),
+                        sizeof(llist_ptr_t), MPI_BYTE, llist_win);
+#endif
+                MPI_Win_unlock(tail_ptr.rank, llist_win);
+
+                if (next_tail_ptr.rank != nil.rank) {
+                    if (verbose)
+                        printf("%d: Chasing to <%d, %p>\n", procid, next_tail_ptr.rank, (void*) next_tail_ptr.disp);
+                    tail_ptr = next_tail_ptr;
+                    pollint = MAX(MIN_NPROBE, pollint/2);
+                }
+                else {
+                    for (j = 0; j < pollint; j++)
+                        MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE);
+
+                    pollint = MIN(MAX_NPROBE, pollint*2);
+                }
+            }
+        } while (!success);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    time = MPI_Wtime() - time;
+
+    /* Traverse the list and verify that all processes inserted exactly the correct
+       number of elements. */
+    if (procid == 0) {
+        int  errors    = 0;
+        int *counts, count = 0;
+
+        counts = (int*) malloc(sizeof(int) * nproc);
+        assert(counts != NULL);
+
+        for (i = 0; i < nproc; i++)
+            counts[i] = 0;
+
+        tail_ptr = head_ptr;
+
+        MPI_Win_lock_all(0, llist_win);
+
+        /* Walk the list and tally up the number of elements inserted by each rank */
+        while (tail_ptr.disp != nil.disp) {
+            llist_elem_t elem;
+
+            MPI_Get(&elem, sizeof(llist_elem_t), MPI_BYTE,
+                    tail_ptr.rank, tail_ptr.disp, sizeof(llist_elem_t), MPI_BYTE, llist_win);
+
+            MPI_Win_flush(tail_ptr.rank, llist_win);
+
+            tail_ptr = elem.next;
+
+            assert(elem.value >= 0 && elem.value < nproc);
+            counts[elem.value]++;
+            count++;
+
+            if (verbose) {
+                int last_elem = tail_ptr.disp == nil.disp;
+                printf("%2d%s", elem.value, last_elem ? "" : " -> ");
+                if (count % ELEM_PER_ROW == 0 && !last_elem)
+                    printf("\n");
+            }
+        }
+
+        MPI_Win_unlock_all(llist_win);
+
+        if (verbose)
+          printf("\n\n");
+
+        /* Verify the counts we collected */
+        for (i = 0; i < nproc; i++) {
+            int expected;
+
+            expected = NUM_ELEMS/nproc;
+            if (i < NUM_ELEMS % nproc)
+                expected++;
+
+            if (counts[i] != expected) {
+                printf("Error: Rank %d inserted %d elements, expected %d\n", i, counts[i], expected);
+                errors++;
+            }
+        }
+
+        printf("%s\n", errors == 0 ? " No Errors" : "FAIL");
+        free(counts);
+    }
+
+    if (print_perf) {
+        double max_time;
+
+        MPI_Reduce(&time, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+
+        if (procid == 0) {
+            printf("Total time = %0.2f sec, elem/sec = %0.2f, sec/elem = %0.2f usec\n", max_time, NUM_ELEMS/max_time, max_time/NUM_ELEMS*1.0e6);
+        }
+    }
+
+    MPI_Win_free(&llist_win);
+
+    /* Free all the elements in the list */
+    for ( ; my_elems_count > 0; my_elems_count--)
+        MPI_Free_mem(my_elems[my_elems_count-1]);
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/linked_list_bench_lock_shr.c b/teshsuite/smpi/mpich3-test/rma/linked_list_bench_lock_shr.c
new file mode 100644
index 0000000000..62315eddbf
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/linked_list_bench_lock_shr.c
@@ -0,0 +1,263 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/*            MPI-3 distributed linked list construction example
+ *            --------------------------------------------------
+ *
+ * Construct a distributed shared linked list using proposed MPI-3 dynamic
+ * windows.  Initially process 0 creates the head of the list, attaches it to
+ * the window, and broadcasts the pointer to all processes.  Each process p then
+ * appends N new elements to the list when the tail reaches process p-1.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include <assert.h>
+#include "mpitest.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#define NUM_ELEMS 1000
+#define MAX_NPROBE nproc
+#define MIN_NPROBE 1
+#define ELEM_PER_ROW 16
+
+#define MIN(X,Y) ((X < Y) ? (X) : (Y))
+#define MAX(X,Y) ((X > Y) ? (X) : (Y))
+
+/* Linked list pointer */
+typedef struct {
+    int      rank;
+    MPI_Aint disp;
+} llist_ptr_t;
+
+/* Linked list element */
+typedef struct {
+    int value;
+    llist_ptr_t next;
+} llist_elem_t;
+
+static const llist_ptr_t nil = { -1, (MPI_Aint) MPI_BOTTOM };
+static const int verbose = 0;
+static const int print_perf = 0;
+
+/* List of locally allocated list elements. */
+static llist_elem_t **my_elems = NULL;
+static int my_elems_size  = 0;
+static int my_elems_count = 0;
+
+/* Allocate a new shared linked list element */
+MPI_Aint alloc_elem(int value, MPI_Win win) {
+    MPI_Aint disp;
+    llist_elem_t *elem_ptr;
+
+    /* Allocate the new element and register it with the window */
+    MPI_Alloc_mem(sizeof(llist_elem_t), MPI_INFO_NULL, &elem_ptr);
+    elem_ptr->value = value;
+    elem_ptr->next  = nil;
+    MPI_Win_attach(win, elem_ptr, sizeof(llist_elem_t));
+
+    /* Add the element to the list of local elements so we can free it later. */
+    if (my_elems_size == my_elems_count) {
+        my_elems_size += 100;
+        my_elems = realloc(my_elems, my_elems_size*sizeof(void*));
+    }
+    my_elems[my_elems_count] = elem_ptr;
+    my_elems_count++;
+
+    MPI_Get_address(elem_ptr, &disp);
+    return disp;
+}
+
+int main(int argc, char **argv) {
+    int           procid, nproc, i, j, my_nelem;
+    int           pollint = 0;
+    double        time;
+    MPI_Win       llist_win;
+    llist_ptr_t   head_ptr, tail_ptr;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &procid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &llist_win);
+
+    /* Process 0 creates the head node */
+    if (procid == 0)
+        head_ptr.disp = alloc_elem(procid, llist_win);
+
+    /* Broadcast the head pointer to everyone */
+    head_ptr.rank = 0;
+    MPI_Bcast(&head_ptr.disp, 1, MPI_AINT, 0, MPI_COMM_WORLD);
+    tail_ptr = head_ptr;
+
+    /* All processes append NUM_ELEMS elements to the list; rank 0 has already
+     * appended an element. */
+    if (procid == 0)
+        i = 1;
+    else
+        i = 0;
+
+    my_nelem = NUM_ELEMS/nproc;
+    if (procid < NUM_ELEMS % nproc)
+        my_nelem++;
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    time = MPI_Wtime();
+
+    for ( ; i < my_nelem; i++) {
+        llist_ptr_t new_elem_ptr;
+        int success = 0;
+
+        /* Create a new list element and register it with the window */
+        new_elem_ptr.rank = procid;
+        new_elem_ptr.disp = alloc_elem(procid, llist_win);
+
+        /* Append the new node to the list.  This might take multiple attempts if
+           others have already appended and our tail pointer is stale. */
+        do {
+            int flag;
+
+            /* The tail is at my left neighbor, append my element. */
+            if (tail_ptr.rank == (procid + nproc-1) % nproc)
+            {
+                if (verbose)
+                    printf("%d: Appending to <%d, %p>\n", procid, tail_ptr.rank, (void*) tail_ptr.disp);
+
+#ifdef USE_MODE_NOCHECK
+                MPI_Win_lock(MPI_LOCK_SHARED, tail_ptr.rank, MPI_MODE_NOCHECK, llist_win);
+#else
+                MPI_Win_lock(MPI_LOCK_SHARED, tail_ptr.rank, 0, llist_win);
+#endif
+                MPI_Accumulate(&new_elem_ptr, sizeof(llist_ptr_t), MPI_BYTE, tail_ptr.rank,
+                               (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next), sizeof(llist_ptr_t),
+                               MPI_BYTE, MPI_REPLACE, llist_win);
+                MPI_Win_unlock(tail_ptr.rank, llist_win);
+
+                success = 1;
+                tail_ptr = new_elem_ptr;
+            }
+
+            /* Otherwise, chase the tail. */
+            else
+            {
+                llist_ptr_t next_tail_ptr;
+
+#ifdef USE_MODE_NOCHECK
+                MPI_Win_lock(MPI_LOCK_SHARED, tail_ptr.rank, MPI_MODE_NOCHECK, llist_win);
+#else
+                MPI_Win_lock(MPI_LOCK_SHARED, tail_ptr.rank, 0, llist_win);
+#endif
+                MPI_Get_accumulate( NULL, 0, MPI_DATATYPE_NULL, &next_tail_ptr,
+                                    sizeof(llist_ptr_t), MPI_BYTE, tail_ptr.rank,
+                                    (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next),
+                                    sizeof(llist_ptr_t), MPI_BYTE, MPI_NO_OP, llist_win);
+                MPI_Win_unlock(tail_ptr.rank, llist_win);
+
+                if (next_tail_ptr.rank != nil.rank) {
+                    if (verbose)
+                        printf("%d: Chasing to <%d, %p>\n", procid, next_tail_ptr.rank, (void*) next_tail_ptr.disp);
+                    tail_ptr = next_tail_ptr;
+                    pollint = MAX(MIN_NPROBE, pollint/2);
+                }
+                else {
+                    for (j = 0; j < pollint; j++)
+                        MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE);
+
+                    pollint = MIN(MAX_NPROBE, pollint*2);
+                }
+            }
+        } while (!success);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    time = MPI_Wtime() - time;
+
+    /* Traverse the list and verify that all processes inserted exactly the correct
+       number of elements. */
+    if (procid == 0) {
+        int  errors    = 0;
+        int *counts, count = 0;
+
+        counts = (int*) malloc(sizeof(int) * nproc);
+        assert(counts != NULL);
+
+        for (i = 0; i < nproc; i++)
+            counts[i] = 0;
+
+        tail_ptr = head_ptr;
+
+        MPI_Win_lock_all(0, llist_win);
+
+        /* Walk the list and tally up the number of elements inserted by each rank */
+        while (tail_ptr.disp != nil.disp) {
+            llist_elem_t elem;
+
+            MPI_Get(&elem, sizeof(llist_elem_t), MPI_BYTE,
+                    tail_ptr.rank, tail_ptr.disp, sizeof(llist_elem_t), MPI_BYTE, llist_win);
+
+            MPI_Win_flush(tail_ptr.rank, llist_win);
+
+            tail_ptr = elem.next;
+
+            assert(elem.value >= 0 && elem.value < nproc);
+            counts[elem.value]++;
+            count++;
+
+            if (verbose) {
+                int last_elem = tail_ptr.disp == nil.disp;
+                printf("%2d%s", elem.value, last_elem ? "" : " -> ");
+                if (count % ELEM_PER_ROW == 0 && !last_elem)
+                    printf("\n");
+            }
+        }
+
+        MPI_Win_unlock_all(llist_win);
+
+        if (verbose)
+          printf("\n\n");
+
+        /* Verify the counts we collected */
+        for (i = 0; i < nproc; i++) {
+            int expected;
+
+            expected = NUM_ELEMS/nproc;
+            if (i < NUM_ELEMS % nproc)
+                expected++;
+
+            if (counts[i] != expected) {
+                printf("Error: Rank %d inserted %d elements, expected %d\n", i, counts[i], expected);
+                errors++;
+            }
+        }
+
+        printf("%s\n", errors == 0 ? " No Errors" : "FAIL");
+        free(counts);
+    }
+
+    if (print_perf) {
+        double max_time;
+
+        MPI_Reduce(&time, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+
+        if (procid == 0) {
+            printf("Total time = %0.2f sec, elem/sec = %0.2f, sec/elem = %0.2f usec\n", max_time, NUM_ELEMS/max_time, max_time/NUM_ELEMS*1.0e6);
+        }
+    }
+
+    MPI_Win_free(&llist_win);
+
+    /* Free all the elements in the list */
+    for ( ; my_elems_count > 0; my_elems_count--)
+        MPI_Free_mem(my_elems[my_elems_count-1]);
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/linked_list_fop.c b/teshsuite/smpi/mpich3-test/rma/linked_list_fop.c
new file mode 100644
index 0000000000..bb95b77c58
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/linked_list_fop.c
@@ -0,0 +1,242 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/*            MPI-3 distributed linked list construction example
+ *            --------------------------------------------------
+ * 
+ * Construct a distributed shared linked list using proposed MPI-3 dynamic
+ * windows.  Initially process 0 creates the head of the list, attaches it to
+ * the window, and broadcasts the pointer to all processes.  All processes then
+ * concurrently append N new elements to the list.  When a process attempts to
+ * attach its element to the tail of list it may discover that its tail pointer
+ * is stale and it must chase ahead to the new tail before the element can be
+ * attached.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include <assert.h>
+#include "mpitest.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#define NUM_ELEMS 32
+#define NPROBE    100
+#define ELEM_PER_ROW 16
+
+/* Linked list pointer */
+typedef struct {
+    int      rank;
+    MPI_Aint disp;
+} llist_ptr_t;
+
+/* Linked list element */
+typedef struct {
+    int value;
+    llist_ptr_t next;
+} llist_elem_t;
+
+static const llist_ptr_t nil = { -1, (MPI_Aint) MPI_BOTTOM };
+static const int verbose = 0;
+
+/* List of locally allocated list elements. */
+static llist_elem_t **my_elems = NULL;
+static int my_elems_size  = 0;
+static int my_elems_count = 0;
+
+/* Allocate a new shared linked list element */
+MPI_Aint alloc_elem(int value, MPI_Win win) {
+    MPI_Aint disp;
+    llist_elem_t *elem_ptr;
+
+    /* Allocate the new element and register it with the window */
+    MPI_Alloc_mem(sizeof(llist_elem_t), MPI_INFO_NULL, &elem_ptr);
+    elem_ptr->value = value;
+    elem_ptr->next  = nil;
+    MPI_Win_attach(win, elem_ptr, sizeof(llist_elem_t));
+
+    /* Add the element to the list of local elements so we can free it later. */
+    if (my_elems_size == my_elems_count) {
+        my_elems_size += 100;
+        my_elems = realloc(my_elems, my_elems_size*sizeof(void*));
+    }
+    my_elems[my_elems_count] = elem_ptr;
+    my_elems_count++;
+
+    MPI_Get_address(elem_ptr, &disp);
+    return disp;
+}
+
+int main(int argc, char **argv) {
+    int           procid, nproc, i;
+    MPI_Win       llist_win;
+    llist_ptr_t   head_ptr, tail_ptr;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &procid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &llist_win);
+
+    /* Process 0 creates the head node */
+    if (procid == 0)
+        head_ptr.disp = alloc_elem(-1, llist_win);
+
+    /* Broadcast the head pointer to everyone */
+    head_ptr.rank = 0;
+    MPI_Bcast(&head_ptr.disp, 1, MPI_AINT, 0, MPI_COMM_WORLD);
+    tail_ptr = head_ptr;
+
+    /* All processes concurrently append NUM_ELEMS elements to the list */
+    for (i = 0; i < NUM_ELEMS; i++) {
+        llist_ptr_t new_elem_ptr;
+        int success;
+
+        /* Create a new list element and register it with the window */
+        new_elem_ptr.rank = procid;
+        new_elem_ptr.disp = alloc_elem(procid, llist_win);
+
+        /* Append the new node to the list.  This might take multiple attempts if
+           others have already appended and our tail pointer is stale. */
+        do {
+            llist_ptr_t next_tail_ptr = nil;
+
+            MPI_Win_lock(MPI_LOCK_SHARED, tail_ptr.rank, MPI_MODE_NOCHECK, llist_win);
+
+            MPI_Compare_and_swap((void*) &new_elem_ptr.rank, (void*) &nil.rank,
+                                  (void*) &next_tail_ptr.rank, MPI_INT, tail_ptr.rank,
+                                  (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next.rank), llist_win);
+
+            MPI_Win_unlock(tail_ptr.rank, llist_win);
+            success = (next_tail_ptr.rank == nil.rank);
+
+            if (success) {
+                int i, flag;
+                MPI_Aint result;
+
+                MPI_Win_lock(MPI_LOCK_SHARED, tail_ptr.rank, MPI_MODE_NOCHECK, llist_win);
+
+                MPI_Fetch_and_op(&new_elem_ptr.disp, &result, MPI_AINT, tail_ptr.rank,
+                                  (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next.disp),
+                                  MPI_REPLACE, llist_win);
+
+                /* Note: accumulate is faster, since we don't need the result.  Replacing with
+                   Fetch_and_op to create a more complete test case. */
+                /*
+                MPI_Accumulate(&new_elem_ptr.disp, 1, MPI_AINT, tail_ptr.rank,
+                               (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next.disp), 1,
+                               MPI_AINT, MPI_REPLACE, llist_win);
+                */
+
+                MPI_Win_unlock(tail_ptr.rank, llist_win);
+                tail_ptr = new_elem_ptr;
+
+                /* For implementations that use pt-to-pt messaging, force progress for other threads'
+                   RMA operations. */
+                for (i = 0; i < NPROBE; i++)
+                    MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE);
+
+            } else {
+                /* Tail pointer is stale, fetch the displacement.  May take multiple tries
+                   if it is being updated. */
+                do {
+                    MPI_Aint junk = 0;
+
+                    MPI_Win_lock(MPI_LOCK_SHARED, tail_ptr.rank, MPI_MODE_NOCHECK, llist_win);
+
+                    MPI_Fetch_and_op(NULL, &next_tail_ptr.disp, MPI_AINT, tail_ptr.rank,
+                                      (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next.disp),
+                                      MPI_NO_OP, llist_win);
+
+                    MPI_Win_unlock(tail_ptr.rank, llist_win);
+                } while (next_tail_ptr.disp == nil.disp);
+                tail_ptr = next_tail_ptr;
+            }
+        } while (!success);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Traverse the list and verify that all processes inserted exactly the correct
+       number of elements. */
+    if (procid == 0) {
+        int  have_root = 0;
+        int  errors    = 0;
+        int *counts, count = 0;
+
+        counts = (int*) malloc(sizeof(int) * nproc);
+        assert(counts != NULL);
+
+        for (i = 0; i < nproc; i++)
+            counts[i] = 0;
+
+        tail_ptr = head_ptr;
+
+        /* Walk the list and tally up the number of elements inserted by each rank */
+        while (tail_ptr.disp != nil.disp) {
+            llist_elem_t elem;
+
+            MPI_Win_lock(MPI_LOCK_SHARED, tail_ptr.rank, MPI_MODE_NOCHECK, llist_win);
+
+            MPI_Get(&elem, sizeof(llist_elem_t), MPI_BYTE,
+                    tail_ptr.rank, tail_ptr.disp, sizeof(llist_elem_t), MPI_BYTE, llist_win);
+
+            MPI_Win_unlock(tail_ptr.rank, llist_win);
+
+            tail_ptr = elem.next;
+
+            /* This is not the root */
+            if (have_root) {
+                assert(elem.value >= 0 && elem.value < nproc);
+                counts[elem.value]++;
+                count++;
+
+                if (verbose) {
+                    int last_elem = tail_ptr.disp == nil.disp;
+                    printf("%2d%s", elem.value, last_elem ? "" : " -> ");
+                    if (count % ELEM_PER_ROW == 0 && !last_elem)
+                        printf("\n");
+                }
+            }
+
+            /* This is the root */
+            else {
+                assert(elem.value == -1);
+                have_root = 1;
+            }
+        }
+
+        if (verbose)
+          printf("\n\n");
+
+        /* Verify the counts we collected */
+        for (i = 0; i < nproc; i++) {
+            int expected = NUM_ELEMS;
+
+            if (counts[i] != expected) {
+                printf("Error: Rank %d inserted %d elements, expected %d\n", i, counts[i], expected);
+                errors++;
+            }
+        }
+
+        printf("%s\n", errors == 0 ? " No Errors" : "FAIL");
+        free(counts);
+    }
+
+    MPI_Win_free(&llist_win);
+
+    /* Free all the elements in the list */
+    for ( ; my_elems_count > 0; my_elems_count--)
+        MPI_Free_mem(my_elems[my_elems_count-1]);
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/linked_list_lockall.c b/teshsuite/smpi/mpich3-test/rma/linked_list_lockall.c
new file mode 100644
index 0000000000..f1373fb490
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/linked_list_lockall.c
@@ -0,0 +1,231 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/*            MPI-3 distributed linked list construction example
+ *            --------------------------------------------------
+ *
+ * Construct a distributed shared linked list using proposed MPI-3 dynamic
+ * windows.  Initially process 0 creates the head of the list, attaches it to
+ * the window, and broadcasts the pointer to all processes.  All processes then
+ * concurrently append N new elements to the list.  When a process attempts to
+ * attach its element to the tail of list it may discover that its tail pointer
+ * is stale and it must chase ahead to the new tail before the element can be
+ * attached.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include <assert.h>
+#include "mpitest.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#define NUM_ELEMS 32
+#define NPROBE    100
+#define ELEM_PER_ROW 16
+
+/* Linked list pointer */
+typedef struct {
+    int      rank;
+    MPI_Aint disp;
+} llist_ptr_t;
+
+/* Linked list element */
+typedef struct {
+    int value;
+    llist_ptr_t next;
+} llist_elem_t;
+
+static const llist_ptr_t nil = { -1, (MPI_Aint) MPI_BOTTOM };
+static const int verbose = 0;
+
+/* List of locally allocated list elements. */
+static llist_elem_t **my_elems = NULL;
+static int my_elems_size  = 0;
+static int my_elems_count = 0;
+
+/* Allocate a new shared linked list element */
+MPI_Aint alloc_elem(int value, MPI_Win win) {
+    MPI_Aint disp;
+    llist_elem_t *elem_ptr;
+
+    /* Allocate the new element and register it with the window */
+    MPI_Alloc_mem(sizeof(llist_elem_t), MPI_INFO_NULL, &elem_ptr);
+    elem_ptr->value = value;
+    elem_ptr->next  = nil;
+    MPI_Win_attach(win, elem_ptr, sizeof(llist_elem_t));
+
+    /* Add the element to the list of local elements so we can free it later. */
+    if (my_elems_size == my_elems_count) {
+        my_elems_size += 100;
+        my_elems = realloc(my_elems, my_elems_size*sizeof(void*));
+    }
+    my_elems[my_elems_count] = elem_ptr;
+    my_elems_count++;
+
+    MPI_Get_address(elem_ptr, &disp);
+    return disp;
+}
+
+int main(int argc, char **argv) {
+    int           procid, nproc, i;
+    MPI_Win       llist_win;
+    llist_ptr_t   head_ptr, tail_ptr;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &procid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &llist_win);
+
+    /* Process 0 creates the head node */
+    if (procid == 0)
+        head_ptr.disp = alloc_elem(-1, llist_win);
+
+    /* Broadcast the head pointer to everyone */
+    head_ptr.rank = 0;
+    MPI_Bcast(&head_ptr.disp, 1, MPI_AINT, 0, MPI_COMM_WORLD);
+    tail_ptr = head_ptr;
+
+    /* Lock the window for shared access to all targets */
+    MPI_Win_lock_all(0, llist_win);
+
+    /* All processes concurrently append NUM_ELEMS elements to the list */
+    for (i = 0; i < NUM_ELEMS; i++) {
+        llist_ptr_t new_elem_ptr;
+        int success;
+
+        /* Create a new list element and register it with the window */
+        new_elem_ptr.rank = procid;
+        new_elem_ptr.disp = alloc_elem(procid, llist_win);
+
+        /* Append the new node to the list.  This might take multiple attempts if
+           others have already appended and our tail pointer is stale. */
+        do {
+            llist_ptr_t next_tail_ptr = nil;
+
+            MPI_Compare_and_swap((void*) &new_elem_ptr.rank, (void*) &nil.rank,
+                                  (void*) &next_tail_ptr.rank, MPI_INT, tail_ptr.rank,
+                                  (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next.rank), llist_win);
+
+            MPI_Win_flush(tail_ptr.rank, llist_win);
+            success = (next_tail_ptr.rank == nil.rank);
+
+            if (success) {
+                int i, flag;
+
+                MPI_Accumulate(&new_elem_ptr.disp, 1, MPI_AINT, tail_ptr.rank,
+                               (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next.disp), 1,
+                               MPI_AINT, MPI_REPLACE, llist_win);
+
+                MPI_Win_flush(tail_ptr.rank, llist_win);
+                tail_ptr = new_elem_ptr;
+
+                /* For implementations that use pt-to-pt messaging, force progress for other threads'
+                   RMA operations. */
+                for (i = 0; i < NPROBE; i++)
+                    MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE);
+
+            } else {
+                /* Tail pointer is stale, fetch the displacement.  May take multiple tries
+                   if it is being updated. */
+                do {
+                    MPI_Get_accumulate( NULL, 0, MPI_DATATYPE_NULL, &next_tail_ptr.disp,
+                                        1, MPI_AINT, tail_ptr.rank,
+                                        (MPI_Aint) &(((llist_elem_t*)tail_ptr.disp)->next.disp),
+                                        1, MPI_AINT, MPI_NO_OP, llist_win);
+                    MPI_Win_flush(tail_ptr.rank, llist_win);
+                } while (next_tail_ptr.disp == nil.disp);
+                tail_ptr = next_tail_ptr;
+            }
+        } while (!success);
+    }
+
+    MPI_Win_unlock_all(llist_win);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Traverse the list and verify that all processes inserted exactly the correct
+       number of elements. */
+    if (procid == 0) {
+        int  have_root = 0;
+        int  errors    = 0;
+        int *counts, count = 0;
+
+        counts = (int*) malloc(sizeof(int) * nproc);
+        assert(counts != NULL);
+
+        for (i = 0; i < nproc; i++)
+            counts[i] = 0;
+
+        tail_ptr = head_ptr;
+
+        MPI_Win_lock_all(0, llist_win);
+
+        /* Walk the list and tally up the number of elements inserted by each rank */
+        while (tail_ptr.disp != nil.disp) {
+            llist_elem_t elem;
+
+            MPI_Get(&elem, sizeof(llist_elem_t), MPI_BYTE,
+                    tail_ptr.rank, tail_ptr.disp, sizeof(llist_elem_t), MPI_BYTE, llist_win);
+
+            MPI_Win_flush(tail_ptr.rank, llist_win);
+
+            tail_ptr = elem.next;
+
+            /* This is not the root */
+            if (have_root) {
+                assert(elem.value >= 0 && elem.value < nproc);
+                counts[elem.value]++;
+                count++;
+
+                if (verbose) {
+                    int last_elem = tail_ptr.disp == nil.disp;
+                    printf("%2d%s", elem.value, last_elem ? "" : " -> ");
+                    if (count % ELEM_PER_ROW == 0 && !last_elem)
+                        printf("\n");
+                }
+            }
+
+            /* This is the root */
+            else {
+                assert(elem.value == -1);
+                have_root = 1;
+            }
+        }
+
+        MPI_Win_unlock_all(llist_win);
+
+        if (verbose)
+          printf("\n\n");
+
+        /* Verify the counts we collected */
+        for (i = 0; i < nproc; i++) {
+            int expected = NUM_ELEMS;
+
+            if (counts[i] != expected) {
+                printf("Error: Rank %d inserted %d elements, expected %d\n", i, counts[i], expected);
+                errors++;
+            }
+        }
+
+        printf("%s\n", errors == 0 ? " No Errors" : "FAIL");
+        free(counts);
+    }
+
+    MPI_Win_free(&llist_win);
+
+    /* Free all the elements in the list */
+    for ( ; my_elems_count > 0; my_elems_count--)
+        MPI_Free_mem(my_elems[my_elems_count-1]);
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/lockcontention.c b/teshsuite/smpi/mpich3-test/rma/lockcontention.c
new file mode 100644
index 0000000000..8fbce4ba70
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/lockcontention.c
@@ -0,0 +1,101 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include "mpitest.h"
+
+/* This is a modified version of test4.c. Sent by Liwei Peng, Microsoft. */
+
+/* tests passive target RMA on 3 processes. tests the lock-single_op-unlock
+   optimization. */
+
+
+#define SIZE1 100
+#define SIZE2 200
+
+int main(int argc, char *argv[])
+{
+    int rank, nprocs, A[SIZE2], B[SIZE2], i;
+    MPI_Comm CommThree;
+    MPI_Win win;
+    int errs = 0;
+    int trank = 1;
+
+    MTest_Init(&argc,&argv);
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+
+    if (nprocs < 3) {
+        fprintf(stderr, "Run this program with 3 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    MPI_Comm_split(MPI_COMM_WORLD, (rank < 3), rank, &CommThree);
+
+    if (rank < 3) {
+        if (rank == 0) {
+            for (i=0; i<SIZE2; i++) {
+                A[i] = B[i] = i;
+            }
+        }
+        else if (rank == 2) {
+            for (i=0; i<SIZE2; i++) {
+                A[i] = B[i] = -1;
+            }
+        }
+        else if (rank == 1) {
+            for (i=0; i<SIZE2; i++) {
+                B[i] = (-4)*i;
+            }
+        }
+
+        MPI_Win_create(B, SIZE2*sizeof(int), sizeof(int), MPI_INFO_NULL, CommThree, &win);
+
+        if (rank == 0) {
+            for (i=0; i<SIZE1; i++) {
+                MPI_Win_lock(MPI_LOCK_EXCLUSIVE, trank, 0, win);
+                MPI_Put(A+i, 1, MPI_INT, trank, i, 1, MPI_INT, win);
+                /*  MPI_Put(A+i, 1, MPI_INT, trank, i, 1, MPI_INT, win);
+                    MPI_Put(A+i, 1, MPI_INT, trank, i, 1, MPI_INT, win); */
+                MPI_Win_unlock(trank, win);
+            }
+
+            MPI_Win_free(&win);
+        }
+        else if (rank == 2) {
+            for (i=0; i<SIZE1; i++) {
+                MPI_Win_lock(MPI_LOCK_EXCLUSIVE, trank, 0, win);
+                MPI_Get(A+i, 1, MPI_INT, trank, SIZE1+i, 1, MPI_INT, win);
+                MPI_Win_unlock(trank, win);
+            }
+
+            MPI_Win_free(&win);
+
+            for (i=0; i<SIZE1; i++)
+                if (A[i] != (-4)*(i+SIZE1)) {
+                    printf("Get Error: A[%d] is %d, should be %d\n", i, A[i], (-4)*(i+SIZE1));
+                    errs++;
+                }
+        }
+
+        else if (rank == 1) { /*target*/
+            MPI_Win_free(&win);
+
+            for (i=0; i<SIZE1; i++) {
+                if (B[i] != i) {
+                    printf("Put Error: B[%d] is %d, should be %d\n", i, B[i], i);
+                    errs++;
+                }
+            }
+        }
+    }
+    MPI_Comm_free(&CommThree);
+
+    MTest_Finalize(errs);
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/lockcontention2.c b/teshsuite/smpi/mpich3-test/rma/lockcontention2.c
new file mode 100644
index 0000000000..bae541f154
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/lockcontention2.c
@@ -0,0 +1,305 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2010 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "stdlib.h"
+#include "mpitest.h"
+
+/* 
+ * Tests for lock contention, including special cases within the MPICH code 
+ * (any MPI implementation should pass these tests; in the MPICH case, our
+ * coverage analysis showed that the lockcontention.c test was not covering
+ * all cases, and in fact, this test revealed a bug in the code).
+ *
+ * In all of these tests, each process writes (or accesses) the values
+ * rank + i*size_of_world for NELM times.
+ *
+ * This test strives to avoid operations not strictly permitted by MPI RMA,
+ * for example, it doesn't target the same locations with multiple put/get
+ * calls in the same access epoch.
+ */
+
+#define NELM 200
+#define NBLOCK 10
+#define MAX_ERRS_REPORT 10
+
+/* 
+ *  Each process writes data into the rmabuf on the process with target rank
+ *  trank.  The final result in rmabuf are the consecutive integers starting
+ *  from 0.  Each process, however, does not write a consecutive block.  
+ *  Instead, they write these locations:
+ *
+ *  for i=0,...,NELM-1
+ *     for j=0,...,NBLOCK-1
+ *         j + NBLOCK * (rank + i * wsize)
+ *  
+ * The value written is the location.
+ *
+ * In many cases, multiple RMA operations are needed.  Where these must not
+ * overlap, the above pattern is replicated at NBLOCK*NELM*wsize.
+ * (NBLOCK is either 1 or NBLOCK in the code below, depending on use) 
+ */
+
+static int toterrs = 0;
+
+int testValues( int, int, int, int *, const char * );
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, wsize, i, j, cnt;
+    int *rmabuf, *localbuf, *localbuf2, *vals;
+    MPI_Win win;
+    int trank = 0;
+    int windowsize;
+
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&wsize); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (wsize < 2) {
+        fprintf(stderr, "Run this program with at least 2 processes\n");
+        MPI_Abort(MPI_COMM_WORLD,1);
+    }
+
+    windowsize = (2*NBLOCK + 2) * NELM * wsize;
+    rmabuf     = (int *)malloc( windowsize * sizeof(int) );
+    localbuf   = (int *)malloc( NELM * sizeof(int) );
+    localbuf2  = (int *)malloc( NELM * NBLOCK * sizeof(int) );
+    vals       = (int *)malloc( NELM*sizeof(int) );
+
+    /* 
+     * Initialize the buffers
+     */
+    for (i=0; i<NELM; i++) {
+      localbuf[i] = rank + i*wsize;
+    }
+    cnt = 0;
+    for (i=0; i<NELM; i++) {
+	for (j=0; j<NBLOCK; j++) {
+	    localbuf2[cnt++] = j + NBLOCK * (rank + i*wsize);
+	}
+    }
+    for (i=0; i<windowsize; i++) {
+      rmabuf[i] = -1;
+    }
+
+    /* Create the window */
+    MPI_Win_create(rmabuf, windowsize*sizeof(int), sizeof(int), MPI_INFO_NULL, 
+		   MPI_COMM_WORLD, &win); 
+
+    /* Multiple puts, with contention at trank */
+    MPI_Barrier( MPI_COMM_WORLD );
+    for (i=0; i<NELM; i++) {
+	MPI_Win_lock( MPI_LOCK_EXCLUSIVE, trank, 0, win );
+	MPI_Put( &localbuf[i], 1, MPI_INT, trank, 
+		 rank + i*wsize, 1, MPI_INT, win );
+	MPI_Put( &localbuf[i], 1, MPI_INT, trank, 
+		 rank + (i+NELM)*wsize, 1, MPI_INT, win );
+	MPI_Win_unlock( trank, win );
+    }
+    MPI_Barrier( MPI_COMM_WORLD );
+    if (rank == trank) {
+	MPI_Win_lock( MPI_LOCK_EXCLUSIVE, trank, 0, win );
+	toterrs += testValues( 1, NELM, wsize, rmabuf, "Multiple puts (1)" );
+	toterrs += testValues( 1, NELM, wsize, rmabuf + wsize*NELM,
+			       "Multiple puts (2)" );
+	MPI_Win_unlock( trank, win );
+    }
+    MPI_Barrier( MPI_COMM_WORLD );
+    /* Reinit the rmabuf */
+    for (i=0; i<windowsize; i++) {
+      rmabuf[i] = -1;
+    }
+    MPI_Barrier( MPI_COMM_WORLD );
+
+    /* Single put with contention */
+    trank = 0;
+    for (i=0; i<NELM; i++) {
+	MPI_Win_lock( MPI_LOCK_EXCLUSIVE, trank, 0, win );
+	MPI_Put( &localbuf[i], 1, MPI_INT, trank, rank + i*wsize, 1, MPI_INT, 
+		 win );
+	MPI_Win_unlock( trank, win );
+    }
+    MPI_Barrier( MPI_COMM_WORLD );
+    if (rank == trank) {
+	MPI_Win_lock( MPI_LOCK_EXCLUSIVE, trank, 0, win );
+	toterrs += testValues( 1, NELM, wsize, rmabuf, "Single put" );
+	MPI_Win_unlock( trank, win );
+    }
+
+    /* Reinit the rmabuf */
+    for (i=0; i<windowsize; i++) {
+	rmabuf[i] = -1;
+    }
+    /* Longer puts with contention at trank */
+    MPI_Barrier( MPI_COMM_WORLD );
+    for (i=0; i<NELM; i++) {
+	MPI_Win_lock( MPI_LOCK_EXCLUSIVE, trank, 0, win );
+	if (rank != trank) {
+	    MPI_Put( &localbuf2[i*NBLOCK], NBLOCK, MPI_INT, trank, 
+		     NELM * wsize + NBLOCK*(rank+i*wsize), NBLOCK, 
+		     MPI_INT, win );
+	    MPI_Put( &localbuf2[i*NBLOCK], NBLOCK, MPI_INT, trank, 
+		     NELM * wsize + NBLOCK*(rank+(i+NELM)*wsize), NBLOCK, 
+		     MPI_INT, win );
+	}
+	MPI_Put( &localbuf[i], 1, MPI_INT, trank, rank+i*wsize, 1, MPI_INT, 
+		 win );
+	MPI_Win_unlock( trank, win );
+    }
+    MPI_Barrier( MPI_COMM_WORLD );
+    if (rank == trank) {
+	/* For simplicity in testing, set the values that rank==trank
+	   would have set. */
+	for (i=0; i<NELM; i++) {
+	    for (j=0; j<NBLOCK; j++) {
+		rmabuf[NELM*wsize + NBLOCK*(trank+i*wsize) + j] = 
+		    j + NBLOCK*(trank +i*wsize);
+		rmabuf[NELM*wsize + NBLOCK*(trank+(i+NELM)*wsize) + j] = 
+		    j + NBLOCK*(trank + i*wsize);
+	    }
+	}
+	MPI_Win_lock( MPI_LOCK_EXCLUSIVE, trank, 0, win );
+	toterrs += testValues( 1, NELM, wsize, rmabuf, "Long puts (1)" );
+	toterrs += testValues( NBLOCK, NELM, wsize, rmabuf + NELM * wsize,
+			       "Long puts(2)" );
+	toterrs += testValues( NBLOCK, NELM, wsize, 
+			       rmabuf + NELM * wsize * (1 + NBLOCK),
+			       "Long puts(3)" );
+	MPI_Win_unlock( trank, win );
+    }
+    
+    /* Reinit the rmabuf */
+    for (i=0; i<windowsize; i++) {
+	rmabuf[i] = -1;
+    }
+    for (i=0; i< NELM; i++) 
+	vals[i] = -2;
+    
+    /* Put mixed with Get */
+    MPI_Barrier( MPI_COMM_WORLD );
+    for (i=0; i<NELM; i++) {
+	MPI_Win_lock( MPI_LOCK_EXCLUSIVE, trank, 0, win );
+	if (rank != trank) {
+	    MPI_Put( &localbuf2[i], NBLOCK, MPI_INT, trank, 
+		     NELM*wsize + NBLOCK*(rank + i*wsize), NBLOCK, MPI_INT, 
+		     win );
+	    MPI_Put( &localbuf[i], 1, MPI_INT, trank, 
+		     rank + i*wsize, 1, MPI_INT, win );
+	}
+	else {
+	    MPI_Get( &vals[i], 1, MPI_INT, trank, i, 1, MPI_INT, win );
+	}
+	MPI_Win_unlock( trank, win );
+    }
+    MPI_Barrier( MPI_COMM_WORLD );
+    if (rank == trank) {
+	/* Just test the Get */
+	for (i=0; i<wsize; i++) {
+	    if (i == trank) {
+		if (vals[i] != -1) {
+		    toterrs++;
+		    if (toterrs < MAX_ERRS_REPORT) {
+			printf( "put/get: vals[%d] = %d, expected -1\n",
+				i, vals[i] );
+		    }
+		}
+	    }
+	    else if (vals[i] != i && vals[i] != -1) {
+		toterrs++;
+		if (toterrs < MAX_ERRS_REPORT) {
+		    printf( "put/get: vals[%d] = %d, expected -1 or %d\n",
+			    i, vals[i], i );
+		}
+	    }
+	}
+    }
+
+    /* Contention only with get */
+    for (i=0; i<windowsize; i++) {
+	rmabuf[i] = -i;
+    }
+    for (i=0; i<NELM; i++)
+	vals[i] = -2;
+
+    MPI_Barrier( MPI_COMM_WORLD );
+    for (i=0; i<NELM; i++) {
+	MPI_Win_lock( MPI_LOCK_EXCLUSIVE, trank, 0, win );
+	MPI_Get( &vals[i], 1, MPI_INT, trank, i, 1, MPI_INT, win );
+	MPI_Win_unlock( trank, win );
+    }
+    MPI_Barrier( MPI_COMM_WORLD );
+    if (rank == trank) {
+	for (i=0; i<NELM; i++) {
+	    if (vals[i] != -i) {
+		toterrs++;
+		if (toterrs < MAX_ERRS_REPORT) {
+		    printf( "single get: vals[%d] = %d, expected %d\n",
+			    i, vals[i], -i );
+		}
+	    }
+	}
+    }
+
+    /* Contention with accumulate */
+    MPI_Barrier( MPI_COMM_WORLD );
+    for (i=0; i<NELM*wsize; i++) {
+	rmabuf[i] = 0;
+    }
+    MPI_Barrier( MPI_COMM_WORLD );
+    for (i=0; i<NELM; i++) {
+	MPI_Win_lock( MPI_LOCK_EXCLUSIVE, trank, 0, win );
+	MPI_Accumulate( &localbuf[i], 1, MPI_INT, trank, rank+i*wsize, 
+			1, MPI_INT, MPI_SUM, win );
+	MPI_Accumulate( &localbuf[i], 1, MPI_INT, trank, rank+i*wsize, 
+			1, MPI_INT, MPI_SUM, win );
+	MPI_Win_unlock( trank, win );
+    }
+    MPI_Barrier( MPI_COMM_WORLD );
+    if (rank == trank) {
+	MPI_Win_lock( MPI_LOCK_EXCLUSIVE, trank, 0, win );
+	for (i=0; i<NELM * wsize; i++) {
+	    if (rmabuf[i] != 2*i) {
+		toterrs++;
+		if (toterrs < MAX_ERRS_REPORT) {
+		    printf( "2 accumulate: rmabuf[%d] = %d, expected %d\n",
+			    i, rmabuf[i], 2*i );
+		}
+	    }
+	}
+	MPI_Win_unlock( trank, win );
+    }
+
+    MPI_Win_free( &win );
+
+    free( rmabuf );
+    free( localbuf );
+    free( localbuf2 );
+    free( vals );
+   
+    MTest_Finalize(toterrs);
+    MPI_Finalize(); 
+    return 0; 
+} 
+
+/* Test the values in the rmabuf against the expected values.  Return the 
+   number of errors */
+int testValues( int nb, int nelm, int wsize, int *rmabuf, const char *msg )
+{
+    int i, errs = 0;
+		
+    for (i=0; i<nb * nelm * wsize; i++) {
+	if (rmabuf[i] != i) {
+	    if (toterrs + errs < MAX_ERRS_REPORT) {
+		printf( "%s:rmabuf[%d] = %d expected %d\n", 
+			msg, i, rmabuf[i], i );
+	    }
+	    errs++;
+	}
+    }
+
+    return errs;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/lockcontention3.c b/teshsuite/smpi/mpich3-test/rma/lockcontention3.c
new file mode 100644
index 0000000000..d70e26c2a0
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/lockcontention3.c
@@ -0,0 +1,487 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2012 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "stdlib.h"
+#include "mpitest.h"
+#include <assert.h>
+#include <string.h>
+
+#define LAST_TEST 14
+#define RMA_SIZE  2048
+#define OFFSET_1  7
+#define OFFSET_2  83
+#define OFFSET_3  157
+
+#define PUT_VAL 0xdcba97
+#define ACC_VAL 10771134
+
+/* 
+ * Additional tests for lock contention.  These are designed to exercise
+ * some of the optimizations within MPICH, but all are valid MPI programs.
+ * Tests structure includes
+ *    lock local (must happen at this time since application can use load
+ *                store after the lock)
+ *    send message to partner
+ *                                  receive message
+ *                                  send ack
+ *    receive ack
+ *    Provide a delay so that
+ *      the partner will see the
+ *      conflict
+ *                                  partner executes:
+ *                                  lock         // Note: this may block
+ *                                     rma operations (see below)
+ *                                  unlock
+ *    
+ *    unlock                        send back to partner
+ *    receive from partner 
+ *    check for correct data
+ *
+ * The delay may be implemented as a ring of message communication; this
+ * is likely to automatically scale the time to what is needed
+ */
+
+/* Define a datatype to be used with */
+int stride    = 11;
+int veccount  = 7;
+MPI_Datatype  vectype;
+/* Define long RMA ops size */
+int longcount = 512;
+int medcount  = 127;
+int mednum    = 4;
+
+void RMATest( int i, MPI_Win win, int master, int *srcbuf, int srcbufsize, int *getbuf, int getbufsize );
+int  RMACheck( int i, int *buf, MPI_Aint bufsize );
+int  RMACheckGet( int i, MPI_Win win, int *getbuf, MPI_Aint getsize);
+void RMATestInit( int i, int *buf, MPI_Aint bufsize );
+
+int main( int argc, char *argv[] )
+{
+    int      errs = 0;
+    MPI_Win  win;
+    int  *rmabuffer=0, *getbuf=0;
+    MPI_Aint bufsize=0, getbufsize=0;
+    int      master, partner, next, wrank, wsize, i;
+    int      ntest = LAST_TEST;
+    int *srcbuf;
+
+    MTest_Init( &argc, &argv );
+
+    /* Determine who is responsible for each part of the test */
+    MPI_Comm_rank( MPI_COMM_WORLD, &wrank );
+    MPI_Comm_size( MPI_COMM_WORLD, &wsize );
+    if (wsize < 3) {
+	fprintf( stderr, "This test requires at least 3 processes\n" );
+	MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+
+    master  = 0;
+    partner = 1;
+    next = wrank + 1;
+    if (next == partner) next++;
+    if (next >= wsize) {
+	next = 0;
+	if (next == partner) next++;
+    }
+
+    /* Determine the last test to run (by default, run them all) */
+    for (i=1; i<argc; i++) {
+	if (strcmp( "-ntest", argv[i] ) == 0) { 
+	    i++;
+	    if (i < argc) {
+		ntest = atoi( argv[i] );
+	    }
+	    else {
+		fprintf( stderr, "Missing value for -ntest\n" );
+		MPI_Abort( MPI_COMM_WORLD, 1 );
+	    }
+	}
+    }
+
+    MPI_Type_vector( veccount, 1, stride, MPI_INT, &vectype );
+    MPI_Type_commit( &vectype );
+
+    /* Create the RMA window */
+    bufsize = 0;
+    if (wrank == master) {
+	bufsize = RMA_SIZE;
+	MPI_Alloc_mem( bufsize*sizeof(int), MPI_INFO_NULL, &rmabuffer );
+    }
+    else if (wrank == partner) {
+	getbufsize = RMA_SIZE;
+	getbuf = (int *)malloc( getbufsize*sizeof(int) );
+	if (!getbuf) {
+	    fprintf( stderr, "Unable to allocated %d bytes for getbuf\n", 
+		    (int)getbufsize );
+	    MPI_Abort( MPI_COMM_WORLD, 1 );
+	}
+    }
+    srcbuf = malloc(RMA_SIZE*sizeof(*srcbuf));
+    assert(srcbuf);
+
+    MPI_Win_create( rmabuffer, bufsize, sizeof(int), MPI_INFO_NULL,
+		    MPI_COMM_WORLD, &win );
+    
+    /* Run a sequence of tests */
+    for (i=0; i<=ntest; i++) {
+	if (wrank == master) {
+	    MTestPrintfMsg( 0, "Test %d\n", i );
+	    /* Because this lock is local, it must return only when the
+	     lock is acquired */
+	    MPI_Win_lock( MPI_LOCK_EXCLUSIVE, 0, master, win );
+	    RMATestInit( i, rmabuffer, bufsize );
+	    MPI_Send( MPI_BOTTOM, 0, MPI_INT, partner, i, MPI_COMM_WORLD );
+	    MPI_Send( MPI_BOTTOM, 0, MPI_INT, next, i, MPI_COMM_WORLD );
+	    MPI_Recv( MPI_BOTTOM, 0, MPI_INT, MPI_ANY_SOURCE, i, 
+		      MPI_COMM_WORLD, MPI_STATUS_IGNORE );
+	    MPI_Win_unlock( master, win );
+	    MPI_Recv( MPI_BOTTOM, 0, MPI_INT, partner, i, MPI_COMM_WORLD, 
+		      MPI_STATUS_IGNORE );
+	    errs += RMACheck( i, rmabuffer, bufsize );
+	}
+	else if (wrank == partner) {
+	    MPI_Recv( MPI_BOTTOM, 0, MPI_INT, master, i, MPI_COMM_WORLD,
+		      MPI_STATUS_IGNORE );
+	    MPI_Win_lock( MPI_LOCK_EXCLUSIVE, 0, master, win );
+	    RMATest( i, win, master, srcbuf, RMA_SIZE, getbuf, getbufsize );
+	    MPI_Win_unlock( master, win );
+	    errs += RMACheckGet( i, win, getbuf, getbufsize );
+	    MPI_Send( MPI_BOTTOM, 0, MPI_INT, master, i, MPI_COMM_WORLD );
+	}
+	else {
+	    MPI_Recv( MPI_BOTTOM, 0, MPI_INT, MPI_ANY_SOURCE, i, 
+		      MPI_COMM_WORLD, MPI_STATUS_IGNORE );
+	    MPI_Send( MPI_BOTTOM, 0, MPI_INT, next, i, MPI_COMM_WORLD );
+	}
+    }
+
+    if (rmabuffer) {
+	MPI_Free_mem( rmabuffer );
+    }
+    if (getbuf) {
+	free( getbuf );
+    }
+    MPI_Win_free( &win );
+    MPI_Type_free( &vectype );
+
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return MTestReturnValue( errs );
+}
+
+/* Perform the tests.
+ *
+ * The srcbuf must be passed in because the buffer must remain valid
+ * until the subsequent unlock call. */
+void RMATest( int i, MPI_Win win, int master, int *srcbuf, int srcbufsize, int *getbuf, int getbufsize )
+{
+    int j, k;
+    int *source = srcbuf;
+    assert(srcbufsize == RMA_SIZE);
+
+    for (j=0; j<srcbufsize; j++) source[j] = -j;
+
+    switch (i) {
+    case 0: /* Single short put (1 word at OFFSET_1) */
+	source[0] = PUT_VAL;
+	MPI_Put( source, 1, MPI_INT, master, OFFSET_1, 1, MPI_INT, win );
+	break;
+    case 1: /* Single short accumulate (1 word of value 17 at OFFSET_2) */
+	source[0] = ACC_VAL;
+	MPI_Accumulate( source, 1, MPI_INT, master, 
+			OFFSET_2, 1, MPI_INT, MPI_SUM, win );
+	break;
+    case 2: /* Single short get (1 word at OFFSET_3) */
+	getbuf[0] = -1;
+	MPI_Get( getbuf, 1, MPI_INT, master, OFFSET_3, 1, MPI_INT, win );
+	break;
+    case 3: /* Datatype single put (strided put) */
+	for (j=0; j<veccount; j++) {
+	    source[j*stride] = PUT_VAL + j;
+	}
+	MPI_Put( source, 1, vectype, master, OFFSET_1, 1, vectype, win );
+	break;
+    case 4: /* Datatype single accumulate (strided acc) */
+	for (j=0; j<veccount; j++) {
+	    source[j*stride] = ACC_VAL + j;
+	}
+	MPI_Accumulate( source, 1, vectype, master, 
+			OFFSET_2, 1, vectype, MPI_SUM, win );
+	break;
+    case 5: /* Datatype single get (strided get) */
+	for (j=0; j<veccount; j++) {
+	    getbuf[j] = -j;
+	}
+	MPI_Get( getbuf, 1, vectype, master,
+		 OFFSET_3, 1, vectype, win );
+	break;
+    case 6: /* a few small puts (like strided put, but 1 word at a time) */
+	for (j=0; j<veccount; j++) {
+	    source[j*stride] = PUT_VAL + j;
+	}
+	for (j=0; j<veccount; j++) {
+	    MPI_Put( source + j*stride, 1, MPI_INT, master, 
+		     OFFSET_1+j*stride, 1, MPI_INT, win );
+	}
+	break;
+    case 7: /* a few small accumulates (like strided acc, but 1 word at a time )*/
+	for (j=0; j<veccount; j++) {
+	    source[j*stride] = ACC_VAL + j;
+	}
+	for (j=0; j<veccount; j++) {
+	    MPI_Accumulate( source + j*stride, 1, MPI_INT, master, 
+			    OFFSET_2+j*stride, 1, MPI_INT, MPI_SUM, win );
+	}
+	break;
+    case 8: /* a few small gets (like strided get, but 1 word at a time) */
+	for (j=0; j<veccount; j++) {
+	    getbuf[j*stride] = -j;
+	}
+	for (j=0; j<veccount; j++) {
+	    MPI_Get( getbuf + j*stride, 1, MPI_INT, master, 
+		     OFFSET_3+j*stride, 1, MPI_INT, win );
+	}
+	break;
+    case 9: /* Single long put (OFFSET_1) */
+	for (j=0; j<longcount; j++) source[j] = j;
+	MPI_Put( source, longcount, MPI_INT, master,
+		 OFFSET_1, longcount, MPI_INT, win ); 
+	break;
+    case 10: /* Single long accumulate (OFFSET_2) */
+	for (j=0; j<longcount; j++) source[j] = j;
+	MPI_Accumulate( source, longcount, MPI_INT, master, 
+			OFFSET_2, longcount, MPI_INT, MPI_SUM, win );
+	break;
+    case 11: /* Single long get (OFFSET_3) */
+	for (j=0; j<longcount; j++) getbuf[j] = -j;
+	MPI_Get( getbuf, longcount, MPI_INT, master,
+		 OFFSET_3, longcount, MPI_INT, win ); 
+	break;
+    case 12: /* a few long puts (start at OFFSET_1, medcount ) */
+	for (j=0; j<mednum; j++) {
+	    for (k=0; k<medcount; k++) {
+		source[j*medcount+k] = j*2*medcount+k;
+	    }
+	    MPI_Put( source + j*medcount, medcount, MPI_INT, master,
+		     OFFSET_1 + j*2*medcount, medcount, MPI_INT, win );
+	}
+	break;
+    case 13: /* a few long accumulates (start at OFFSET_2, medcount) */
+	for (j=0; j<mednum; j++) {
+	    for (k=0; k<medcount; k++) {
+		source[j*medcount+k] = ACC_VAL + j*2*medcount+k;
+	    }
+	    MPI_Accumulate( source + j*medcount, medcount, MPI_INT, master,
+			    OFFSET_2 + j*2*medcount, medcount, MPI_INT, 
+			    MPI_SUM, win );
+	}
+	break;
+    case 14: /* a few long gets (start at OFFSET_3, medcount) */
+	for (j=0; j<mednum; j++) {
+	    for (k=0; k<medcount; k++) {
+		getbuf[j*medcount+k] = -(j*medcount+k);
+	    }
+	    MPI_Get( getbuf + j*medcount, medcount, MPI_INT, master,
+			    OFFSET_3 + j*2*medcount, medcount, MPI_INT, win );
+	}
+	break;
+    }
+}
+
+int RMACheck( int i, int *buf, MPI_Aint bufsize )
+{
+    int j, k;
+    int errs = 0;
+
+    switch (i) {
+    case 0: /* Single short put (1 word at OFFSET_1) */
+	if (buf[OFFSET_1] != PUT_VAL) {
+	    errs++;
+	    printf( "case 0: value is %d should be %d\n", 
+		    buf[OFFSET_1], PUT_VAL );
+	}
+	break;
+    case 1: /* Single short accumulate (1 word of value 17 at OFFSET_2) */
+	if (buf[OFFSET_2] != ACC_VAL + OFFSET_2) {
+	    errs++;
+	    printf( "case 1: value is %d should be %d\n", 
+		    buf[OFFSET_2], ACC_VAL + OFFSET_2 );
+	}
+	break;
+    case 2: /* Single short get (1 word at OFFSET_3) */
+	/* See RMACheckGet */
+	break;
+    case 3: /* Datatype single put (strided put) */
+    case 6: /* a few small puts (like strided put, but 1 word at a time) */
+        /* FIXME: The conditional and increment are reversed below.  This looks
+         * like a bug, and currently prevents the following test from running. */
+	for (j=0; j++; j<veccount) {
+	    if (buf[j*stride] != PUT_VAL + j) {
+		errs++;
+		printf( "case %d: value is %d should be %d\n", i,
+			buf[j*stride], PUT_VAL+j );
+	    }
+	}
+	break;
+    case 4: /* Datatype single accumulate (strided acc) */
+    case 7: /* a few small accumulates (like strided acc, but 1 word at a time )*/
+        /* FIXME: The conditional and increment are reversed below.  This looks
+         * like a bug, and currently prevents the following test from running. */
+	for (j=0; j++; j<veccount) {
+	    if (buf[j*stride] != ACC_VAL + j + OFFSET_2 + j*stride) {
+		errs++;
+		printf( "case %d: value is %d should be %d\n", i,
+			buf[j*stride], ACC_VAL+j+OFFSET_2+j*stride );
+	    }
+	}
+	break;
+    case 5: /* Datatype single get (strided get) */
+    case 8: /* a few small gets (like strided get, but 1 word at a time) */
+	/* See RMACheckGet */
+	break;
+    case 9: /* Single long put (OFFSET_1) */
+	for (j=0; j<longcount; j++) {
+	    if (buf[OFFSET_1+j] != j) {
+		errs++;
+		printf( "case 9: value is %d should be %d\n",
+			buf[OFFSET_1+j], OFFSET_1 + j );
+	    }
+	}
+	break;
+    case 10: /* Single long accumulate (OFFSET_2) */
+	for (j=0; j<longcount; j++) {
+	    if (buf[OFFSET_2+j] != OFFSET_2 + j + j) {
+		errs++;
+		printf( "case 10: value is %d should be %d\n", 
+			buf[OFFSET_2+j], OFFSET_2 + j + j );
+	    }
+	}
+	break;
+    case 11: /* Single long get (OFFSET_3) */
+	/* See RMACheckGet */
+	break;
+    case 12: /* a few long puts (start at OFFSET_1, medcount ) */
+	for (j=0; j<mednum; j++) {
+	    for (k=0; k<medcount; k++) {
+		if (buf[OFFSET_1 + j*2*medcount + k] != 
+		    j*2*medcount+k ) {
+		    errs++;
+		    printf( "case 12: value is %d should be %d\n", 
+			    buf[OFFSET_1+j*2*medcount + k], j*2*medcount + k );
+		}
+	    }
+	}
+	break;
+    case 13: /* a few long accumulates (start at OFFSET_2, medcount) */
+	for (j=0; j<mednum; j++) {
+	    for (k=0; k<medcount; k++) {
+		if (buf[OFFSET_2 + j*2*medcount + k] != 
+		    OFFSET_2 + 2*j*2*medcount+2*k + ACC_VAL ) {
+		    errs++;
+		    printf( "case 13: value is %d should be %d\n", 
+			    buf[OFFSET_2+j*2*medcount + k], 
+			    OFFSET_2 + 2*j*2*medcount + k +ACC_VAL);
+		}
+	    }
+	}
+	break;
+    case 14: /* a few long gets (start at OFFSET_3, medcount) */
+	/* See RMACheckGet */
+	break;
+    default:
+	fprintf( stderr, "Unrecognized case %d\n", i );
+	errs++;
+	break;
+    }
+    return errs;
+}
+
+int RMACheckGet( int i, MPI_Win win, int *getbuf, MPI_Aint getsize)
+{
+    int errs = 0;
+    int j, k;
+
+    /* */
+    switch (i) {
+    case 0: /* Single short put (1 word at OFFSET_1) */
+	break;
+    case 1: /* Single short accumulate (1 word of value 17 at OFFSET_2) */
+	break;
+    case 2: /* Single short get (1 word at OFFSET_3) */
+	if (getbuf[0] != OFFSET_3) {
+	    errs++;
+	    printf( "case 2: value is %d should be %d\n", 
+		    getbuf[0], OFFSET_3 );
+	}
+	break;
+    case 3: /* Datatype single put (strided put) */
+	break;
+    case 4: /* Datatype single accumulate (strided acc) */
+	break;
+    case 5: /* Datatype single get (strided get) */
+    case 8: /* a few small gets (like strided get, but 1 word at a time) */
+	for (j=0; j<veccount; j++) {
+	    if (getbuf[j*stride] != OFFSET_3 + j*stride) {
+		errs++;
+		printf( "case %d: value is %d should be %d\n", i,
+			getbuf[j*stride], OFFSET_3 + j*stride );
+	    }
+	}
+	
+	break;
+    case 6: /* a few small puts (like strided put, but 1 word at a time) */
+	break;
+    case 7: /* a few small accumulates (like strided acc, but 1 word at a time )*/
+	break;
+    case 9: /* Single long put (OFFSET_1) */
+	break;
+    case 10: /* Single long accumulate (OFFSET_2) */
+	break;
+    case 11: /* Single long get (OFFSET_3) */
+	for (j=0; j<longcount; j++) {
+	    if (getbuf[j] != OFFSET_3 + j) {
+		errs++;
+		printf( "case 11: value is %d should be %d\n",
+			getbuf[j], OFFSET_3 + j );
+	    }
+	}
+	break;
+    case 12: /* a few long puts (start at OFFSET_1, medcount ) */
+	break;
+    case 13: /* a few long accumulates (start at OFFSET_2, medcount) */
+	break;
+    case 14: /* a few long gets (start at OFFSET_3, medcount) */
+	for (j=0; j<mednum; j++) {
+	    for (k=0; k<medcount; k++) {
+		if (getbuf[j*medcount + k] != 
+		    OFFSET_3 + j*2*medcount+k ) {
+		    errs++;
+		    printf( "case 14: buf[%d] value is %d should be %d\n", 
+			    j*medcount + k,
+			    getbuf[j*medcount + k], 
+			    OFFSET_3 + j*2*medcount + k );
+		}
+	    }
+	}
+	break;
+    default:
+	fprintf( stderr, "Unrecognized case %d\n", i );
+	errs++;
+	break;
+    }
+    return errs;
+}
+
+
+void RMATestInit( int i, int *buf, MPI_Aint bufsize )
+{
+    int j;
+    for (j=0; j<bufsize; j++) {
+	buf[j] = j;
+    }
+}
+
diff --git a/teshsuite/smpi/mpich3-test/rma/locknull.c b/teshsuite/smpi/mpich3-test/rma/locknull.c
new file mode 100644
index 0000000000..8b04ce6825
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/locknull.c
@@ -0,0 +1,66 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2008 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpitest.h"
+#include <string.h>
+
+/*
+static char MTEST_Descrip[] = "Locks with no RMA operations";
+*/
+
+int main( int argc, char *argv[] )
+{
+    int errs = 0;
+    int rank, size, i;
+    MPI_Comm      comm;
+    MPI_Win       win;
+    int           *winbuf, count;
+
+    MTest_Init( &argc, &argv );
+
+    comm = MPI_COMM_WORLD;
+
+    MPI_Comm_rank( comm, &rank );
+    MPI_Comm_size( comm, &size );
+
+    /* Allocate and initialize buf */
+    count  = 1000;
+
+    MPI_Alloc_mem( count*sizeof(int), MPI_INFO_NULL, &winbuf );
+
+    MPI_Win_create( winbuf, count * sizeof(int), sizeof(int), MPI_INFO_NULL, 
+		    comm, &win );
+
+    /* Clear winbuf */
+    memset( winbuf, 0, count*sizeof(int) );
+
+    /* Note that for i == rank, this is a useful operation - it allows 
+       the programmer to use direct loads and stores, rather than 
+       put/get/accumulate, to access the local memory window. */
+    for (i=0; i<size; i++) {
+	MPI_Win_lock( MPI_LOCK_EXCLUSIVE, i, 0, win );
+	MPI_Win_unlock( i, win );
+    }
+
+    for (i=0; i<size; i++) {
+	MPI_Win_lock( MPI_LOCK_SHARED, i, 0, win );
+	MPI_Win_unlock( i, win );
+    }
+
+    MPI_Win_free( &win );
+    MPI_Free_mem( winbuf );
+
+    /* If this test completes, no error has been found */
+    /* A more complete test may ensure that local locks in fact block
+       remote, exclusive locks */
+    MTest_Finalize( errs );
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/lockopts.c b/teshsuite/smpi/mpich3-test/rma/lockopts.c
new file mode 100644
index 0000000000..6f962e3eab
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/lockopts.c
@@ -0,0 +1,211 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2012 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "stdlib.h"
+#include "mpitest.h"
+
+/* tests passive target RMA on 2 processes. tests the lock-single_op-unlock 
+   optimization for less common cases:
+
+   origin datatype derived, target datatype predefined
+
+*/
+int main(int argc, char *argv[]) 
+{ 
+    int          wrank, nprocs, *srcbuf, *rmabuf, i;
+    int          memsize;
+    MPI_Datatype vectype;
+    MPI_Win      win;
+    int          errs = 0;
+
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&wrank); 
+
+    if (nprocs < 2) {
+        printf("Run this program with 2 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    memsize = 10 * 4 * nprocs;
+    /* Create and initialize data areas */
+    srcbuf = (int *)malloc( sizeof(int) * memsize );
+    MPI_Alloc_mem( sizeof(int) * memsize, MPI_INFO_NULL, &rmabuf );
+    if (!srcbuf || !rmabuf) {
+	printf( "Unable to allocate srcbuf and rmabuf of size %d\n", memsize );
+	MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+    for (i=0; i<memsize; i++) {
+      rmabuf[i] = -i;
+      srcbuf[i] = i;
+    }
+
+    MPI_Win_create( rmabuf, memsize*sizeof(int), sizeof(int), MPI_INFO_NULL, 
+		    MPI_COMM_WORLD, &win );
+
+    /* Vector of 10 elements, separated by 4 */
+    MPI_Type_vector( 10, 1, 4, MPI_INT, &vectype );
+    MPI_Type_commit( &vectype );
+
+    /* Accumulate with a derived origin type and target predefined type*/
+    if (wrank == 0) {
+	MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_lock( MPI_LOCK_EXCLUSIVE, 0, 0, win );
+	for (i=0; i<10; i++) {
+	    if (rmabuf[i] != -i + 4*i) {
+		errs++;
+		printf( "Acc: expected rmabuf[%d] = %d but saw %d\n", 
+			i, -i + 4*i, rmabuf[i] );
+	    }
+	    rmabuf[i] = -i;
+	}
+	for (i=10; i<memsize; i++) {
+	    if (rmabuf[i] != -i) {
+		errs++;
+		printf( "Acc: expected rmabuf[%d] = %d but saw %d\n", 
+			i, -i, rmabuf[i] );
+		rmabuf[i] = -i;
+	    }
+	}
+	MPI_Win_unlock( 0, win );
+    }
+    else if (wrank == 1) {
+	MPI_Win_lock( MPI_LOCK_SHARED, 0, 0, win );
+	MPI_Accumulate( srcbuf, 1, vectype, 0, 0, 10, MPI_INT, MPI_SUM, win );
+	MPI_Win_unlock( 0, win );
+	MPI_Barrier( MPI_COMM_WORLD );
+    }
+    else {
+	MPI_Barrier( MPI_COMM_WORLD );
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Put with a derived origin type and target predefined type*/
+    if (wrank == 0) {
+	MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_lock( MPI_LOCK_EXCLUSIVE, 0, 0, win );
+	for (i=0; i<10; i++) {
+	    if (rmabuf[i] != 4*i) {
+		errs++;
+		printf( "Put: expected rmabuf[%d] = %d but saw %d\n", 
+			i, 4*i, rmabuf[i] );
+	    }
+	    rmabuf[i] = -i;
+	}
+	for (i=10; i<memsize; i++) {
+	    if (rmabuf[i] != -i) {
+		errs++;
+		printf( "Put: expected rmabuf[%d] = %d but saw %d\n", 
+			i, -i, rmabuf[i] );
+		rmabuf[i] = -i;
+	    }
+	}
+	MPI_Win_unlock( 0, win );
+    }
+    else if (wrank == 1) {
+	MPI_Win_lock( MPI_LOCK_SHARED, 0, 0, win );
+	MPI_Put( srcbuf, 1, vectype, 0, 0, 10, MPI_INT, win );
+	MPI_Win_unlock( 0, win );
+	MPI_Barrier( MPI_COMM_WORLD );
+    }
+    else {
+	MPI_Barrier( MPI_COMM_WORLD );
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Put with a derived origin type and target predefined type, with 
+       a get (see the move-to-end optimization) */
+    if (wrank == 0) {
+	MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_lock( MPI_LOCK_EXCLUSIVE, 0, 0, win );
+	for (i=0; i<10; i++) {
+	    if (rmabuf[i] != 4*i) {
+		errs++;
+		printf( "Put: expected rmabuf[%d] = %d but saw %d\n", 
+			i, 4*i, rmabuf[i] );
+	    }
+	    rmabuf[i] = -i;
+	}
+	for (i=10; i<memsize; i++) {
+	    if (rmabuf[i] != -i) {
+		errs++;
+		printf( "Put: expected rmabuf[%d] = %d but saw %d\n", 
+			i, -i, rmabuf[i] );
+		rmabuf[i] = -i;
+	    }
+	}
+	MPI_Win_unlock( 0, win );
+    }
+    else if (wrank == 1) {
+	int val;
+	MPI_Win_lock( MPI_LOCK_SHARED, 0, 0, win );
+	MPI_Get( &val, 1, MPI_INT, 0, 10, 1, MPI_INT, win );
+	MPI_Put( srcbuf, 1, vectype, 0, 0, 10, MPI_INT, win );
+	MPI_Win_unlock( 0, win );
+	MPI_Barrier( MPI_COMM_WORLD );
+	if (val != -10) {
+	    errs++;
+	    printf( "Get: Expected -10, got %d\n", val );
+	}
+    }
+    else {
+	MPI_Barrier( MPI_COMM_WORLD );
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Put with a derived origin type and target predefined type, with 
+       a get already at the end (see the move-to-end optimization) */
+    if (wrank == 0) {
+	MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_lock( MPI_LOCK_EXCLUSIVE, 0, 0, win );
+	for (i=0; i<10; i++) {
+	    if (rmabuf[i] != 4*i) {
+		errs++;
+		printf( "Put: expected rmabuf[%d] = %d but saw %d\n", 
+			i, 4*i, rmabuf[i] );
+	    }
+	    rmabuf[i] = -i;
+	}
+	for (i=10; i<memsize; i++) {
+	    if (rmabuf[i] != -i) {
+		errs++;
+		printf( "Put: expected rmabuf[%d] = %d but saw %d\n", 
+			i, -i, rmabuf[i] );
+		rmabuf[i] = -i;
+	    }
+	}
+	MPI_Win_unlock( 0, win );
+    }
+    else if (wrank == 1) {
+	int val;
+	MPI_Win_lock( MPI_LOCK_SHARED, 0, 0, win );
+	MPI_Put( srcbuf, 1, vectype, 0, 0, 10, MPI_INT, win );
+	MPI_Get( &val, 1, MPI_INT, 0, 10, 1, MPI_INT, win );
+	MPI_Win_unlock( 0, win );
+	MPI_Barrier( MPI_COMM_WORLD );
+	if (val != -10) {
+	    errs++;
+	    printf( "Get: Expected -10, got %d\n", val );
+	}
+    }
+    else {
+	MPI_Barrier( MPI_COMM_WORLD );
+    }
+
+    MPI_Win_free( &win );
+    MPI_Free_mem( rmabuf );
+    free( srcbuf );
+    MPI_Type_free( &vectype );
+
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return 0; 
+} 
+
diff --git a/teshsuite/smpi/mpich3-test/rma/manyrma2.c b/teshsuite/smpi/mpich3-test/rma/manyrma2.c
new file mode 100644
index 0000000000..91d9518a29
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/manyrma2.c
@@ -0,0 +1,308 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2010 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/* This test is a simplification of the one in perf/manyrma.c that tests
+   for correct handling of the case where many RMA operations occur between
+   synchronization events.
+   This is one of the ways that RMA may be used, and is used in the 
+   reference implementation of the graph500 benchmark.
+*/
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define MAX_COUNT 65536*4/16
+#define MAX_RMA_SIZE 2 /* 16 in manyrma performance test */
+#define MAX_RUNS 10
+
+typedef enum { SYNC_NONE=0, 
+	       SYNC_ALL=-1, SYNC_FENCE=1, SYNC_LOCK=2, SYNC_PSCW=4 } sync_t;
+typedef enum { RMA_NONE=0, RMA_ALL=-1, RMA_PUT=1, RMA_ACC=2, RMA_GET=4 } rma_t;
+/* Note GET not yet implemented */
+/* By default, run only a subset of the available tests, to keep the
+   total runtime reasonably short.  Command line arguments may be used
+   to run other tests. */
+sync_t syncChoice = SYNC_FENCE;
+rma_t rmaChoice = RMA_ACC;
+
+static int verbose = 0;
+
+void RunAccFence( MPI_Win win, int destRank, int cnt, int sz );
+void RunAccLock( MPI_Win win, int destRank, int cnt, int sz );
+void RunPutFence( MPI_Win win, int destRank, int cnt, int sz );
+void RunPutLock( MPI_Win win, int destRank, int cnt, int sz );
+void RunAccPSCW( MPI_Win win, int destRank, int cnt, int sz, 
+		 MPI_Group exposureGroup, MPI_Group accessGroup );
+void RunPutPSCW( MPI_Win win, int destRank, int cnt, int sz, 
+		 MPI_Group exposureGroup, MPI_Group accessGroup );
+
+int main( int argc, char *argv[] )
+{
+    int arraysize, i, cnt, sz, maxCount=MAX_COUNT, *arraybuffer;
+    int wrank, wsize, destRank, srcRank;
+    MPI_Win win;
+    MPI_Group wgroup, accessGroup, exposureGroup;
+    int    maxSz = MAX_RMA_SIZE;
+
+    MPI_Init( &argc, &argv );
+
+    for (i=1; i<argc; i++) {
+	if (strcmp( argv[i], "-put" ) == 0) {
+	    if (rmaChoice == RMA_ALL) rmaChoice = RMA_NONE;
+	    rmaChoice  |= RMA_PUT;
+	}
+	else if (strcmp( argv[i], "-acc" ) == 0) {
+	    if (rmaChoice == RMA_ALL) rmaChoice = RMA_NONE;
+	    rmaChoice  |= RMA_ACC;
+	}
+	else if (strcmp( argv[i], "-fence" ) == 0) {
+	    if (syncChoice == SYNC_ALL) syncChoice = SYNC_NONE;
+	    syncChoice |= SYNC_FENCE;
+	}
+	else if (strcmp( argv[i], "-lock" ) == 0) {
+	    if (syncChoice == SYNC_ALL) syncChoice = SYNC_NONE;
+	    syncChoice |= SYNC_LOCK;
+	}
+	else if (strcmp( argv[i], "-pscw" ) == 0) {
+	    if (syncChoice == SYNC_ALL) syncChoice = SYNC_NONE;
+	    syncChoice |= SYNC_PSCW;
+	}
+	else if (strcmp( argv[i], "-maxsz" ) == 0) {
+	    i++;
+	    maxSz = atoi( argv[i] );
+	}
+	else if (strcmp( argv[i], "-maxcount" ) == 0) {
+	    i++;
+	    maxCount = atoi( argv[i] );
+	}
+	else {
+	    fprintf( stderr, "Unrecognized argument %s\n", argv[i] );
+	    fprintf( stderr, "%s [ -put ] [ -acc ] [ -lock ] [ -fence ] [ -pscw ] [ -maxsz msgsize ]\n", argv[0] );
+	    MPI_Abort( MPI_COMM_WORLD, 1 );
+	}
+    }
+    
+    MPI_Comm_rank( MPI_COMM_WORLD, &wrank );
+    MPI_Comm_size( MPI_COMM_WORLD, &wsize );
+    destRank = wrank + 1;
+    while (destRank >= wsize) destRank = destRank - wsize;
+    srcRank = wrank - 1;
+    if (srcRank < 0) srcRank += wsize;
+
+    /* Create groups for PSCW */
+    MPI_Comm_group( MPI_COMM_WORLD, &wgroup );
+    MPI_Group_incl( wgroup, 1, &destRank, &accessGroup );
+    MPI_Group_incl( wgroup, 1, &srcRank, &exposureGroup );
+    MPI_Group_free( &wgroup );
+
+    arraysize = maxSz * MAX_COUNT;
+    arraybuffer = (int*)malloc( arraysize * sizeof(int) );
+    if (!arraybuffer) {
+	fprintf( stderr, "Unable to allocate %d words\n", arraysize );
+	MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+
+    MPI_Win_create( arraybuffer, arraysize*sizeof(int), (int)sizeof(int),
+		    MPI_INFO_NULL, MPI_COMM_WORLD, &win );
+
+    if (maxCount > MAX_COUNT) {
+	fprintf( stderr, "MaxCount must not exceed %d\n", MAX_COUNT );
+	MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+
+    if ((syncChoice & SYNC_FENCE) && (rmaChoice & RMA_ACC)) {
+	for (sz=1; sz<=maxSz; sz = sz + sz) {
+	    if (wrank == 0 && verbose) 
+		printf( "Accumulate with fence, %d elements\n", sz );
+	    cnt = 1;
+	    while (cnt <= maxCount) {
+		RunAccFence( win, destRank, cnt, sz );
+		cnt = 2 * cnt;
+	    }
+	}
+    }
+
+    if ((syncChoice & SYNC_LOCK) && (rmaChoice & RMA_ACC)) {
+	for (sz=1; sz<=maxSz; sz = sz + sz) {
+	    if (wrank == 0 && verbose) 
+		printf( "Accumulate with lock, %d elements\n", sz );
+	    cnt = 1;
+	    while (cnt <= maxCount) {
+		RunAccLock( win, destRank, cnt, sz );
+		cnt = 2 * cnt;
+	    }
+	}
+    }
+
+    if ((syncChoice & SYNC_FENCE) && (rmaChoice & RMA_PUT)) {
+	for (sz=1; sz<=maxSz; sz = sz + sz) {
+	    if (wrank == 0 && verbose) 
+		printf( "Put with fence, %d elements\n", sz );
+	    cnt = 1;
+	    while (cnt <= maxCount) {
+		RunPutFence( win, destRank, cnt, sz );
+		cnt = 2 * cnt;
+	    }
+	}
+    }
+
+    if ((syncChoice & SYNC_LOCK) && (rmaChoice & RMA_PUT)) {
+	for (sz=1; sz<=maxSz; sz = sz + sz) {
+	    if (wrank == 0 && verbose) 
+		printf( "Put with lock, %d elements\n", sz );
+	    cnt = 1;
+	    while (cnt <= maxCount) {
+		RunPutLock( win, destRank, cnt, sz );
+		cnt = 2 * cnt;
+	    }
+	}
+    }
+
+    if ((syncChoice & SYNC_PSCW) && (rmaChoice & RMA_PUT)) {
+	for (sz=1; sz<=maxSz; sz = sz + sz) {
+	    if (wrank == 0 && verbose) 
+		printf( "Put with pscw, %d elements\n", sz );
+	    cnt = 1;
+	    while (cnt <= maxCount) {
+		RunPutPSCW( win, destRank, cnt, sz, 
+			    exposureGroup, accessGroup );
+		cnt = 2 * cnt;
+	    }
+	}
+    }
+
+    if ((syncChoice & SYNC_PSCW) && (rmaChoice & RMA_ACC)) {
+	for (sz=1; sz<=maxSz; sz = sz + sz) {
+	    if (wrank == 0 && verbose) 
+		printf( "Accumulate with pscw, %d elements\n", sz );
+	    cnt = 1;
+	    while (cnt <= maxCount) {
+		RunAccPSCW( win, destRank, cnt, sz, 
+			    exposureGroup, accessGroup );
+		cnt = 2 * cnt;
+	    }
+	}
+    }
+
+    MPI_Win_free( &win );
+
+    MPI_Group_free( &accessGroup );
+    MPI_Group_free( &exposureGroup );
+
+    /* If we get here without timing out or failing, we succeeded */
+    if (wrank == 0) printf( " No Errors\n" );
+    
+    MPI_Finalize();
+    return 0;
+}
+
+
+void RunAccFence( MPI_Win win, int destRank, int cnt, int sz )
+{
+    int k, i, j, one = 1;
+
+    for (k=0; k<MAX_RUNS; k++) {
+	MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_fence( 0, win );
+	j = 0;
+	for (i=0; i<cnt; i++) {
+	    MPI_Accumulate( &one, sz, MPI_INT, destRank, 
+			    j, sz, MPI_INT, MPI_SUM, win );
+	    j += sz;
+	}
+	MPI_Win_fence( 0, win );
+    }
+}
+
+void RunAccLock( MPI_Win win, int destRank, int cnt, int sz )
+{
+    int k, i, j, one = 1;
+
+    for (k=0; k<MAX_RUNS; k++) {
+	MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_lock( MPI_LOCK_SHARED, destRank, 0, win );
+	j = 0;
+	for (i=0; i<cnt; i++) {
+	    MPI_Accumulate( &one, sz, MPI_INT, destRank, 
+			    j, sz, MPI_INT, MPI_SUM, win );
+	    j += sz;
+	}
+	MPI_Win_unlock( destRank, win );
+    }
+}
+
+void RunPutFence( MPI_Win win, int destRank, int cnt, int sz )
+{
+    int k, i, j, one = 1;
+
+    for (k=0; k<MAX_RUNS; k++) {
+	MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_fence( 0, win );
+	j = 0;
+	for (i=0; i<cnt; i++) {
+	    MPI_Put( &one, sz, MPI_INT, destRank, 
+			    j, sz, MPI_INT, win );
+	    j += sz;
+	}
+	MPI_Win_fence( 0, win );
+    }
+}
+
+void RunPutLock( MPI_Win win, int destRank, int cnt, int sz )
+{
+    int k, i, j, one = 1;
+
+    for (k=0; k<MAX_RUNS; k++) {
+	MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_lock( MPI_LOCK_SHARED, destRank, 0, win );
+	j = 0;
+	for (i=0; i<cnt; i++) {
+	    MPI_Put( &one, sz, MPI_INT, destRank, j, sz, MPI_INT, win );
+	    j += sz;
+	}
+	MPI_Win_unlock( destRank, win );
+    }
+}
+
+void RunPutPSCW( MPI_Win win, int destRank, int cnt, int sz, 
+		 MPI_Group exposureGroup, MPI_Group accessGroup )
+{
+    int k, i, j, one = 1;
+
+    for (k=0; k<MAX_RUNS; k++) {
+	MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_post( exposureGroup, 0, win );
+	MPI_Win_start( accessGroup, 0, win );
+	j = 0;
+	for (i=0; i<cnt; i++) {
+	    MPI_Put( &one, sz, MPI_INT, destRank, j, sz, MPI_INT, win );
+	    j += sz;
+	}
+	MPI_Win_complete( win );
+	MPI_Win_wait( win );
+    }
+}
+
+void RunAccPSCW( MPI_Win win, int destRank, int cnt, int sz, 
+		 MPI_Group exposureGroup, MPI_Group accessGroup )
+{
+    int k, i, j, one = 1;
+
+    for (k=0; k<MAX_RUNS; k++) {
+	MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_post( exposureGroup, 0, win );
+	MPI_Win_start( accessGroup, 0, win );
+	j = 0;
+	for (i=0; i<cnt; i++) {
+	    MPI_Accumulate( &one, sz, MPI_INT, destRank, 
+			    j, sz, MPI_INT, MPI_SUM, win );
+	    j += sz;
+	}
+	MPI_Win_complete( win );
+	MPI_Win_wait( win );
+    }
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/mcs-mutex.c b/teshsuite/smpi/mpich3-test/rma/mcs-mutex.c
new file mode 100644
index 0000000000..533fbdb882
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/mcs-mutex.c
@@ -0,0 +1,216 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2013 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <strings.h>
+
+#include <mpi.h>
+#include "mcs-mutex.h"
+
+/* TODO: Make these mutex operations no-ops for sequential runs */
+
+/** Create an MCS mutex.  Collective on comm.
+  *
+  * @param[out] comm communicator containing all processes that will use the
+  *                  mutex
+  * @param[out] tail_rank rank of the process in comm that holds the tail
+  *                  pointer
+  * @param[out] hdl  handle to the mutex
+  * @return          MPI status
+  */
+int MCS_Mutex_create(int tail_rank, MPI_Comm comm, MCS_Mutex * hdl_out)
+{
+    int rank, nproc;
+    MCS_Mutex hdl;
+
+    hdl = malloc(sizeof(struct mcs_mutex_s));
+    assert(hdl != NULL);
+
+    MPI_Comm_dup(comm, &hdl->comm);
+
+    MPI_Comm_rank(hdl->comm, &rank);
+    MPI_Comm_size(hdl->comm, &nproc);
+
+    hdl->tail_rank = tail_rank;
+
+#ifdef USE_WIN_SHARED
+    MPI_Win_allocate_shared(2*sizeof(int), sizeof(int), MPI_INFO_NULL,
+                            hdl->comm, &hdl->base, &hdl->window);
+#else
+    MPI_Win_allocate(2*sizeof(int), sizeof(int), MPI_INFO_NULL, hdl->comm,
+                     &hdl->base, &hdl->window);
+#endif
+
+    MPI_Win_lock_all(0, hdl->window);
+
+    hdl->base[0] = MPI_PROC_NULL;
+    hdl->base[1] = MPI_PROC_NULL;
+
+    MPI_Win_sync(hdl->window);
+    MPI_Barrier(hdl->comm);
+
+    *hdl_out = hdl;
+    return MPI_SUCCESS;
+}
+
+
+/** Free an MCS mutex.  Collective on ranks in the communicator used at the
+  * time of creation.
+  *
+  * @param[in] hdl handle to the group that will be freed
+  * @return        MPI status
+  */
+int MCS_Mutex_free(MCS_Mutex * hdl_ptr)
+{
+    MCS_Mutex hdl = *hdl_ptr;
+
+    MPI_Win_unlock_all(hdl->window);
+
+    MPI_Win_free(&hdl->window);
+    MPI_Comm_free(&hdl->comm);
+
+    free(hdl);
+    hdl_ptr = NULL;
+
+    return MPI_SUCCESS;
+}
+
+
+/** Lock a mutex.
+  *
+  * @param[in] hdl   Handle to the mutex
+  * @return          MPI status
+  */
+int MCS_Mutex_lock(MCS_Mutex hdl)
+{
+    int rank, nproc;
+    int prev;
+
+    MPI_Comm_rank(hdl->comm, &rank);
+    MPI_Comm_size(hdl->comm, &nproc);
+
+    /* This store is safe, since it cannot happen concurrently with a remote
+     * write */
+    hdl->base[MCS_MTX_ELEM_DISP] = MPI_PROC_NULL;
+    MPI_Win_sync(hdl->window);
+
+    MPI_Fetch_and_op(&rank, &prev, MPI_INT, hdl->tail_rank, MCS_MTX_TAIL_DISP,
+                     MPI_REPLACE, hdl->window);
+    MPI_Win_flush(hdl->tail_rank, hdl->window);
+
+    /* If there was a previous tail, update their next pointer and wait for
+     * notification.  Otherwise, the mutex was successfully acquired. */
+    if (prev != MPI_PROC_NULL) {
+        /* Wait for notification */
+        MPI_Status status;
+
+        MPI_Accumulate(&rank, 1, MPI_INT, prev, MCS_MTX_ELEM_DISP, 1, MPI_INT, MPI_REPLACE, hdl->window);
+        MPI_Win_flush(prev, hdl->window);
+
+        debug_print("%2d: LOCK   - waiting for notification from %d\n", rank, prev);
+        MPI_Recv(NULL, 0, MPI_BYTE, prev, MCS_MUTEX_TAG, hdl->comm, &status);
+    }
+
+    debug_print("%2d: LOCK   - lock acquired\n", rank);
+
+    return MPI_SUCCESS;
+}
+
+
+/** Attempt to acquire a mutex.
+  *
+  * @param[in] hdl   Handle to the mutex
+  * @param[out] success Indicates whether the mutex was acquired
+  * @return          MPI status
+  */
+int MCS_Mutex_trylock(MCS_Mutex hdl, int *success)
+{
+    int rank, nproc;
+    int tail, nil = MPI_PROC_NULL;
+
+    MPI_Comm_rank(hdl->comm, &rank);
+    MPI_Comm_size(hdl->comm, &nproc);
+
+    /* This store is safe, since it cannot happen concurrently with a remote
+     * write */
+    hdl->base[MCS_MTX_ELEM_DISP] = MPI_PROC_NULL;
+    MPI_Win_sync(hdl->window);
+
+    /* Check if the lock is available and claim it if it is. */
+    MPI_Compare_and_swap(&rank, &nil, &tail, MPI_INT, hdl->tail_rank,
+                         MCS_MTX_TAIL_DISP, hdl->window);
+    MPI_Win_flush(hdl->tail_rank, hdl->window);
+
+    /* If the old tail was MPI_PROC_NULL, we have claimed the mutex */
+    *success = (tail == nil);
+
+    debug_print("%2d: TRYLOCK - %s\n", rank, (*success) ? "Success" : "Non-success");
+
+    return MPI_SUCCESS;
+}
+
+
+/** Unlock a mutex.
+  *
+  * @param[in] hdl   Handle to the mutex
+  * @return          MPI status
+  */
+int MCS_Mutex_unlock(MCS_Mutex hdl)
+{
+    int rank, nproc, next;
+
+    MPI_Comm_rank(hdl->comm, &rank);
+    MPI_Comm_size(hdl->comm, &nproc);
+
+    MPI_Win_sync(hdl->window);
+
+    /* Read my next pointer.  FOP is used since another process may write to
+     * this location concurrent with this read. */
+    MPI_Fetch_and_op(NULL, &next, MPI_INT, rank, MCS_MTX_ELEM_DISP, MPI_NO_OP,
+                     hdl->window);
+    MPI_Win_flush(rank, hdl->window);
+
+    if ( next == MPI_PROC_NULL) {
+        int tail;
+        int nil = MPI_PROC_NULL;
+
+        /* Check if we are the at the tail of the lock queue.  If so, we're
+         * done.  If not, we need to send notification. */
+        MPI_Compare_and_swap(&nil, &rank, &tail, MPI_INT, hdl->tail_rank,
+                             MCS_MTX_TAIL_DISP, hdl->window);
+        MPI_Win_flush(hdl->tail_rank, hdl->window);
+
+        if (tail != rank) {
+            debug_print("%2d: UNLOCK - waiting for next pointer (tail = %d)\n", rank, tail);
+            assert(tail >= 0 && tail < nproc);
+
+            for (;;) {
+                int flag;
+
+                MPI_Fetch_and_op(NULL, &next, MPI_INT, rank, MCS_MTX_ELEM_DISP,
+                                 MPI_NO_OP, hdl->window);
+
+                MPI_Win_flush(rank, hdl->window);
+                if (next != MPI_PROC_NULL) break;
+
+                MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag,
+                           MPI_STATUS_IGNORE);
+            }
+        }
+    }
+
+    /* Notify the next waiting process */
+    if (next != MPI_PROC_NULL) {
+        debug_print("%2d: UNLOCK - notifying %d\n", rank, next);
+        MPI_Send(NULL, 0, MPI_BYTE, next, MCS_MUTEX_TAG, hdl->comm);
+    }
+
+    debug_print("%2d: UNLOCK - lock released\n", rank);
+
+    return MPI_SUCCESS;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/mcs-mutex.h b/teshsuite/smpi/mpich3-test/rma/mcs-mutex.h
new file mode 100644
index 0000000000..c8d8843ccb
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/mcs-mutex.h
@@ -0,0 +1,38 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2013 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#if !defined MCSMUTEX_H_INCLUDED
+#define MCSMUTEX_H_INCLUDED
+
+#include <mpi.h>
+
+#define MCS_MUTEX_TAG 100
+
+#ifdef ENABLE_DEBUG
+#define debug_print(...) do { printf(__VA_ARGS__); } while (0)
+#else
+#define debug_print(...)
+#endif
+
+struct mcs_mutex_s {
+    int tail_rank;
+    MPI_Comm comm;
+    MPI_Win window;
+    int *base;
+};
+
+typedef struct mcs_mutex_s * MCS_Mutex;
+
+#define MCS_MTX_ELEM_DISP 0
+#define MCS_MTX_TAIL_DISP 1
+
+int MCS_Mutex_create(int tail_rank, MPI_Comm comm, MCS_Mutex * hdl_out);
+int MCS_Mutex_free(MCS_Mutex * hdl_ptr);
+int MCS_Mutex_lock(MCS_Mutex hdl);
+int MCS_Mutex_trylock(MCS_Mutex hdl, int *success);
+int MCS_Mutex_unlock(MCS_Mutex hdl);
+
+#endif /* MCSMUTEX_H_INCLUDED */
diff --git a/teshsuite/smpi/mpich3-test/rma/mixedsync.c b/teshsuite/smpi/mpich3-test/rma/mixedsync.c
new file mode 100644
index 0000000000..c558516c9d
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/mixedsync.c
@@ -0,0 +1,245 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpitest.h"
+#include <string.h>
+
+/*
+static char MTEST_Descrip[] = "Mix synchronization types";
+*/
+
+void delay( double time );
+void delay( double time )
+{
+    double t1;
+    t1 = MPI_Wtime();
+    while (MPI_Wtime() - t1 < time) ;
+}
+
+int main( int argc, char *argv[] )
+{
+    int      errs = 0;
+    int      crank, csize, source, dest, loop;
+    int      *buf0, *buf1, *buf2, *inbuf2, count0, count1, count2, count, i;
+    MPI_Comm comm;
+    MPI_Win  win;
+    int      *winbuf;
+
+    MTest_Init( &argc, &argv );
+
+    comm = MPI_COMM_WORLD;
+
+    count0 = 1000;
+    count1 = 1;
+    count2 = 100;
+
+    count = count0 + count1 + count2 + 2;
+    
+    /* Allocate and initialize the local buffers */
+    buf0   = (int *)malloc( count0 * sizeof(int) );
+    buf1   = (int *)malloc( count1 * sizeof(int) );
+    buf2   = (int *)malloc( count2 * sizeof(int) );
+    inbuf2 = (int *)malloc( count2 * sizeof(int) );
+    if (!buf0 || !buf1 || !buf2 || !inbuf2) {
+	fprintf( stderr, "Unable to allocated buf0-2\n" );
+	MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+    for (i=0; i<count0; i++) buf0[i] = i;
+    for (i=0; i<count1; i++) buf1[i] = i + count0;
+    for (i=0; i<count2; i++) buf2[i] = i + count0 + count1;
+
+    /* Allocate the window buffer and create the memory window. */
+    MPI_Alloc_mem( count*sizeof(int), MPI_INFO_NULL, &winbuf );
+    if (!winbuf) {
+	fprintf( stderr, "Unable to allocate %d words\n", count );
+	MPI_Abort( MPI_COMM_WORLD, 0 );
+    }
+    MPI_Win_create( winbuf, count*sizeof(int), sizeof(int), MPI_INFO_NULL, 
+		    comm, &win );
+
+    MPI_Comm_size( comm, &csize );
+    MPI_Comm_rank( comm, &crank );
+    dest   = 0;
+    source = 1;
+
+    for (loop=0; loop<2; loop++) {
+	/* Perform several communication operations, mixing synchronization
+	   types.  Use multiple communication to avoid the single-operation
+	   optimization that may be present. */
+	MTestPrintfMsg( 3, "Beginning loop %d of mixed sync put operations\n", 
+			loop );	
+	MPI_Barrier( comm );
+	if (crank == source) {
+	    MTestPrintfMsg( 3, "About to perform exclusive lock\n" );
+	    MPI_Win_lock( MPI_LOCK_EXCLUSIVE, dest, 0, win );
+	    MPI_Put( buf0, count0, MPI_INT, dest, 0, count0, MPI_INT, win );
+	    MPI_Put( buf1, count1, MPI_INT, dest, count0, count1, MPI_INT, 
+		     win );
+	    MPI_Put( buf2, count2, MPI_INT, dest, count0+count1, count2, 
+		     MPI_INT, win );
+	    MPI_Win_unlock( dest, win );
+	    MTestPrintfMsg( 3, "Released exclusive lock\n" );
+	}
+	else if (crank == dest) {
+	    /* Just delay a bit */
+	    delay( 0.0001 );
+	}
+
+	/* The synchronization mode can only be changed when the process 
+	   memory and public copy are guaranteed to have the same values 
+	   (See 11.7, Semantics and Correctness). This barrier ensures that 
+	   the lock/unlock completes before the fence call.  */
+	MPI_Barrier( comm );
+
+	MTestPrintfMsg( 3, "About to start fence\n" );
+	MPI_Win_fence( 0, win );
+	if (crank == source) {
+	    MPI_Put( buf0, count0, MPI_INT, dest, 1, count0, MPI_INT, win );
+	    MPI_Put( buf1, count1, MPI_INT, dest, 1+count0, count1, MPI_INT, 
+		     win );
+	    MPI_Put( buf2, count2, MPI_INT, dest, 1+count0+count1, count2, 
+		     MPI_INT, win );
+	}
+	MPI_Win_fence( 0, win );
+	MTestPrintfMsg( 3, "Finished with fence sync\n" );
+
+	/* Check results */
+	if (crank == dest) {
+	    for (i=0; i<count0+count1+count2; i++) {
+		if (winbuf[1+i] != i) {
+		    errs++;
+		    if (errs < 10) {
+			fprintf( stderr, "winbuf[%d] = %d, expected %d\n",
+				 1+i, winbuf[1+i], i ); fflush(stderr);
+		    }
+		}
+	    }
+	}
+	
+	/* End of test loop */
+    }
+
+    /* Use mixed put and accumulate */
+    for (loop=0; loop<2; loop++) {
+	/* Perform several communication operations, mixing synchronization
+	   types.  Use multiple communication to avoid the single-operation
+	   optimization that may be present. */
+	MTestPrintfMsg( 3, "Begining loop %d of mixed sync put/acc operations\n", 
+			loop );	
+	memset( winbuf, 0, count*sizeof(int) );
+	MPI_Barrier( comm );
+	if (crank == source) {
+	    MPI_Win_lock( MPI_LOCK_EXCLUSIVE, dest, 0, win );
+	    MPI_Accumulate( buf0, count0, MPI_INT, dest, 0, count0, MPI_INT, 
+			    MPI_SUM, win );
+	    MPI_Accumulate( buf1, count1, MPI_INT, dest, count0, count1, 
+			    MPI_INT, MPI_SUM, win );
+	    MPI_Put( buf2, count2, MPI_INT, dest, count0+count1, count2, 
+		     MPI_INT, win );
+	    MPI_Win_unlock( dest, win );
+	}
+	else if (crank == dest) {
+	    /* Just delay a bit */
+	    delay( 0.0001 );
+	}
+	/* See above - the fence should not start until the unlock completes */
+	MPI_Barrier( comm );
+	MPI_Win_fence( 0, win );
+	if (crank == source) {
+	    MPI_Accumulate( buf0, count0, MPI_INT, dest, 1, count0, MPI_INT, 
+			    MPI_REPLACE, win );
+	    MPI_Accumulate( buf1, count1, MPI_INT, dest, 1+count0, count1, 
+			    MPI_INT, MPI_REPLACE, win );
+	    MPI_Put( buf2, count2, MPI_INT, dest, 1+count0+count1, count2, 
+		     MPI_INT, win );
+	}
+	MPI_Win_fence( 0, win );
+
+	/* Check results */
+	if (crank == dest) {
+	    for (i=0; i<count0+count1+count2; i++) {
+		if (winbuf[1+i] != i) {
+		    errs++;
+		    if (errs < 10) {
+			fprintf( stderr, "winbuf[%d] = %d, expected %d\n",
+				 1+i, winbuf[1+i], i ); fflush(stderr);
+		    }
+		}
+	    }
+	}
+	
+	/* End of test loop */
+    }
+
+    /* Use mixed accumulate and get */
+    for (loop=0; loop<2; loop++) {
+	/* Perform several communication operations, mixing synchronization
+	   types.  Use multiple communication to avoid the single-operation
+	   optimization that may be present. */
+	MTestPrintfMsg( 3, "Begining loop %d of mixed sync put/get/acc operations\n", 
+			loop );	
+	MPI_Barrier( comm );
+	if (crank == source) {
+	    MPI_Win_lock( MPI_LOCK_EXCLUSIVE, dest, 0, win );
+	    MPI_Accumulate( buf0, count0, MPI_INT, dest, 0, count0, MPI_INT, 
+			    MPI_REPLACE, win );
+	    MPI_Put( buf1, count1, MPI_INT, dest, count0, count1, MPI_INT, 
+		     win );
+	    MPI_Get( inbuf2, count2, MPI_INT, dest, count0+count1, count2, 
+		     MPI_INT, win );
+	    MPI_Win_unlock( dest, win );
+	}
+	else if (crank == dest) {
+	    /* Just delay a bit */
+	    delay( 0.0001 );
+	}
+	/* See above - the fence should not start until the unlock completes */
+	MPI_Barrier( comm );
+	MPI_Win_fence( 0, win );
+	if (crank == source) {
+	    MPI_Accumulate( buf0, count0, MPI_INT, dest, 1, count0, MPI_INT, 
+			    MPI_REPLACE, win );
+	    MPI_Put( buf1, count1, MPI_INT, dest, 1+count0, count1, MPI_INT, 
+		     win );
+	    MPI_Get( inbuf2, count2, MPI_INT, dest, 1+count0+count1, count2, 
+		     MPI_INT, win );
+	}
+	MPI_Win_fence( 0, win );
+
+	/* Check results */
+	if (crank == dest) {
+	    /* Do the put/accumulate parts */
+	    for (i=0; i<count0+count1; i++) {
+		if (winbuf[1+i] != i) {
+		    errs++;
+		    if (errs < 10) {
+			fprintf( stderr, "winbuf[%d] = %d, expected %d\n",
+				 1+i, winbuf[1+i], i ); fflush(stderr);
+		    }
+		}
+	    }
+	}
+	
+	/* End of test loop */
+    }
+
+    MTestPrintfMsg( 3, "Freeing the window\n" );
+    MPI_Barrier( comm );
+    MPI_Win_free( &win );
+    MPI_Free_mem( winbuf );
+    free( buf0 );
+    free( buf1 );
+    free( buf2 );
+    free( inbuf2 );
+
+    MTest_Finalize( errs );
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/mutex_bench.c b/teshsuite/smpi/mpich3-test/rma/mutex_bench.c
new file mode 100644
index 0000000000..2db24e41af
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/mutex_bench.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2013. See COPYRIGHT in top-level directory.
+ */
+
+/** MPI Mutex test -- James Dinan <dinan@mcs.anl.gov>
+  *
+  * All processes create a mutex then lock+unlock it N times.
+  */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include <mpi.h>
+#include "mpitest.h"
+#include "mcs-mutex.h"
+
+#define NUM_ITER    1000
+#define NUM_MUTEXES 1
+
+const int verbose = 0;
+double delay_ctr = 0.0;
+
+int main(int argc, char ** argv) {
+  int rank, nproc, i;
+  double t_mpix_mtx, t_mcs_mtx;
+  MPI_Comm mtx_comm;
+  MCS_Mutex mcs_mtx;
+
+  MPI_Init(&argc, &argv);
+
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+#ifdef USE_WIN_SHARED
+  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank,
+                      MPI_INFO_NULL, &mtx_comm);
+#else
+  mtx_comm = MPI_COMM_WORLD;
+#endif
+
+  MCS_Mutex_create(0, mtx_comm, &mcs_mtx);
+
+  MPI_Barrier(MPI_COMM_WORLD);
+  t_mcs_mtx = MPI_Wtime();
+
+  for (i = 0; i < NUM_ITER; i++) {
+    /* Combining trylock and lock here is helpful for testing because it makes
+     * CAS and Fetch-and-op contend for the tail pointer. */
+    if (rank % 2) {
+      int success = 0;
+      while (!success) {
+        MCS_Mutex_trylock(mcs_mtx, &success);
+      }
+    }
+    else {
+        MCS_Mutex_lock(mcs_mtx);
+    }
+    MCS_Mutex_unlock(mcs_mtx);
+  }
+
+  MPI_Barrier(MPI_COMM_WORLD);
+  t_mcs_mtx = MPI_Wtime() - t_mcs_mtx;
+
+  MCS_Mutex_free(&mcs_mtx);
+
+  if (rank == 0) {
+      if (verbose) {
+          printf("Nproc %d, MCS Mtx = %f us\n", nproc, t_mcs_mtx/NUM_ITER*1.0e6);
+      }
+  }
+
+  if (mtx_comm != MPI_COMM_WORLD)
+      MPI_Comm_free(&mtx_comm);
+
+  MTest_Finalize(0);
+  MPI_Finalize();
+
+  return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/nullpscw.c b/teshsuite/smpi/mpich3-test/rma/nullpscw.c
new file mode 100644
index 0000000000..c5b134257f
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/nullpscw.c
@@ -0,0 +1,34 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2006 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include <stdio.h>
+#include <mpi.h>
+#include "mpitest.h"
+
+int main(int argc, char* argv[])
+{
+  MPI_Win win;
+  MPI_Group group;
+  int errs = 0;
+
+  MTest_Init(&argc,&argv); 
+
+  MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win);
+  MPI_Win_get_group(win, &group);
+  
+  MPI_Win_post(group, 0, win);
+  MPI_Win_start(group, 0, win);
+  
+  MPI_Win_complete(win);
+  
+  MPI_Win_wait(win);
+
+  MPI_Group_free( &group );
+  MPI_Win_free(&win); 
+
+  MTest_Finalize(errs);
+  MPI_Finalize();
+  return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/pscw_ordering.c b/teshsuite/smpi/mpich3-test/rma/pscw_ordering.c
new file mode 100644
index 0000000000..9cb1cee0e2
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/pscw_ordering.c
@@ -0,0 +1,139 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/* This test checks an oddball case for generalized active target
+ * synchronization where the start occurs before the post.  Since start can
+ * block until the corresponding post, the group passed to start must be
+ * disjoint from the group passed to post and processes must avoid a circular
+ * wait.  Here, odd/even groups are used to accomplish this and the even group
+ * reverses its start/post calls.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include "mpitest.h"
+#include "squelch.h"
+
+int main(int argc, char **argv) {
+    int i, rank, nproc, errors = 0;
+
+    int *win_buf;
+    MPI_Win win;
+
+    int odd_nproc, even_nproc;
+    int *odd_ranks, *even_ranks;
+    MPI_Group odd_group, even_group, world_group;
+
+    MTest_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    if (nproc < 2) {
+        if (rank == 0)
+            printf("Error: this test requires two or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 100);
+    }
+
+    /* Set up odd/even groups and buffers */
+
+    odd_nproc = nproc / 2;
+    even_nproc  = nproc / 2 + ( (nproc % 2 == 0) ? 0 : 1 );
+
+    odd_ranks = malloc(sizeof(int) * odd_nproc);
+    even_ranks = malloc(sizeof(int) * even_nproc);
+
+    for (i = 0; i < even_nproc; i++)
+        even_ranks[i] = i*2;
+
+    for (i = 0; i < odd_nproc; i++)
+        odd_ranks[i] = i*2+1;
+
+    MPI_Comm_group(MPI_COMM_WORLD, &world_group);
+    MPI_Group_incl(world_group, odd_nproc, odd_ranks, &odd_group);
+    MPI_Group_incl(world_group, even_nproc, even_ranks, &even_group);
+
+    /* Create the window */
+
+    MPI_Alloc_mem(nproc*sizeof(int), MPI_INFO_NULL, &win_buf);
+
+    for (i = 0; i < nproc; i++)
+        win_buf[i] = -1;
+
+    MPI_Win_create(win_buf, nproc*sizeof(int), sizeof(int), MPI_INFO_NULL,
+                   MPI_COMM_WORLD, &win);
+
+    /* Perform PSCW communication: Odd/even matchup */
+
+    if (rank % 2 == 0) {
+        MPI_Win_start(odd_group, 0, win);  /* Even-numbered procs target odd procs */
+        MPI_Win_post(odd_group, 0, win);   /* Even procs are targeted by odd procs */
+
+        /* Write to my slot at each target */
+        for (i = 0; i < odd_nproc; i++)
+            MPI_Put(&rank, 1, MPI_INT, odd_ranks[i], rank, 1, MPI_INT, win);
+    }
+    else {
+        MPI_Win_post(even_group, 0, win);  /* Odd procs are targeted by even procs */
+        MPI_Win_start(even_group, 0, win); /* Odd-numbered procs target even procs */
+
+        /* Write to my slot at each target */
+        for (i = 0; i < even_nproc; i++)
+            MPI_Put(&rank, 1, MPI_INT, even_ranks[i], rank, 1, MPI_INT, win);
+    }
+
+
+    MPI_Win_complete(win);
+    MPI_Win_wait(win);
+
+    /* Perform PSCW communication: Odd/odd and even/even matchup */
+
+    if (rank % 2 == 0) {
+        MPI_Win_post(even_group, 0, win);  /* Even procs are targeted by even procs */
+        MPI_Win_start(even_group, 0, win); /* Even-numbered procs target even procs */
+
+        /* Write to my slot at each target */
+        for (i = 0; i < even_nproc; i++)
+            MPI_Put(&rank, 1, MPI_INT, even_ranks[i], rank, 1, MPI_INT, win);
+    }
+    else {
+        MPI_Win_post(odd_group, 0, win);   /* Odd procs are targeted by odd procs */
+        MPI_Win_start(odd_group, 0, win);  /* Odd-numbered procs target odd procs */
+
+        /* Write to my slot at each target */
+        for (i = 0; i < odd_nproc; i++)
+            MPI_Put(&rank, 1, MPI_INT, odd_ranks[i], rank, 1, MPI_INT, win);
+    }
+
+
+    MPI_Win_complete(win);
+    MPI_Win_wait(win);
+
+    for (i = 0; i < nproc; i++) {
+        if (win_buf[i] != i) {
+            errors++;
+
+            SQUELCH( printf("%d: Error -- win_buf[%d] = %d, expected %d\n",
+                            rank, i, win_buf[i], i);
+                   );
+        }
+    }
+
+    MPI_Win_free(&win);
+    MPI_Free_mem(win_buf);
+
+    MPI_Group_free(&world_group);
+    MPI_Group_free(&odd_group);
+    MPI_Group_free(&even_group);
+
+    free(odd_ranks);
+    free(even_ranks);
+
+    MTest_Finalize( errors );
+    MPI_Finalize();
+    return MTestReturnValue( errors );
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/put_base.c b/teshsuite/smpi/mpich3-test/rma/put_base.c
new file mode 100644
index 0000000000..ba95a1cfc9
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/put_base.c
@@ -0,0 +1,148 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/* One-Sided MPI 2-D Strided Put Test
+ *
+ * Author: James Dinan <dinan@mcs.anl.gov> 
+ * Date  : March, 2011
+ *
+ * This code performs N strided put operations into a 2d patch of a shared
+ * array.  The array has dimensions [X, Y] and the subarray has dimensions
+ * [SUB_X, SUB_Y] and begins at index [0, 0].  The input and output buffers are
+ * specified using an MPI datatype.
+ *
+ * This test generates a datatype that is relative to an arbitrary base address
+ * in memory and tests the RMA implementation's ability to perform the correct
+ * transfer.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+#include "mpitest.h"
+#include "squelch.h"
+
+#define XDIM 1024
+#define YDIM 1024
+#define SUB_XDIM 1024
+#define SUB_YDIM 1024
+#define ITERATIONS 10
+
+int main(int argc, char **argv) {
+    int i, j, rank, nranks, peer, bufsize, errors;
+    double  *win_buf, *src_buf, *dst_buf;
+    MPI_Win buf_win;
+
+    MTest_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+
+    bufsize = XDIM * YDIM * sizeof(double);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &win_buf);
+    /* Alloc_mem is not required for the origin buffers for RMA operations - 
+       just for the Win_create memory */
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &src_buf);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &dst_buf);
+
+    for (i = 0; i < XDIM*YDIM; i++) {
+        *(win_buf  + i) = 1.0 + rank;
+        *(src_buf + i) = 1.0 + rank;
+    }
+
+    MPI_Win_create(win_buf, bufsize, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &buf_win);
+
+    peer = (rank+1) % nranks;
+
+    /* Perform ITERATIONS strided put operations */
+
+    for (i = 0; i < ITERATIONS; i++) {
+      MPI_Aint idx_loc[SUB_YDIM];
+      int idx_rem[SUB_YDIM];
+      int blk_len[SUB_YDIM];
+      MPI_Datatype src_type, dst_type;
+
+      void *base_ptr = dst_buf;
+      MPI_Aint base_int;
+
+      MPI_Get_address(base_ptr, &base_int);
+
+      for (j = 0; j < SUB_YDIM; j++) {
+        MPI_Get_address(&src_buf[j*XDIM], &idx_loc[j]);
+        idx_loc[j] = idx_loc[j] - base_int;
+        idx_rem[j] = j*XDIM*sizeof(double);
+        blk_len[j] = SUB_XDIM*sizeof(double);
+      }
+
+      MPI_Type_create_hindexed(SUB_YDIM, blk_len, idx_loc, MPI_BYTE, &src_type);
+      MPI_Type_create_indexed_block(SUB_YDIM, SUB_XDIM*sizeof(double), idx_rem, MPI_BYTE, &dst_type);
+
+      MPI_Type_commit(&src_type);
+      MPI_Type_commit(&dst_type);
+
+      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
+      MPI_Put(base_ptr, 1, src_type, peer, 0, 1, dst_type, buf_win);
+      MPI_Win_unlock(peer, buf_win);
+
+      MPI_Type_free(&src_type);
+      MPI_Type_free(&dst_type);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Verify that the results are correct */
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, buf_win);
+    errors = 0;
+    for (i = 0; i < SUB_XDIM; i++) {
+      for (j = 0; j < SUB_YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = (1.0 + ((rank+nranks-1)%nranks));
+        if (actual - expected > 1e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    for (i = SUB_XDIM; i < XDIM; i++) {
+      for (j = 0; j < SUB_YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = 1.0 + rank;
+        if (actual - expected > 1e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    for (i = 0; i < XDIM; i++) {
+      for (j = SUB_YDIM; j < YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = 1.0 + rank;
+        if (actual - expected > 1e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    MPI_Win_unlock(rank, buf_win);
+
+    MPI_Win_free(&buf_win);
+    MPI_Free_mem(win_buf);
+    MPI_Free_mem(src_buf);
+    MPI_Free_mem(dst_buf);
+
+    MTest_Finalize( errors );
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/put_bottom.c b/teshsuite/smpi/mpich3-test/rma/put_bottom.c
new file mode 100644
index 0000000000..6634ea08b7
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/put_bottom.c
@@ -0,0 +1,138 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/* One-Sided MPI 2-D Strided Put Test
+ *
+ * Author: James Dinan <dinan@mcs.anl.gov> 
+ * Date  : March, 2011
+ *
+ * This code performs N strided put operations into a 2d patch of a shared
+ * array.  The array has dimensions [X, Y] and the subarray has dimensions
+ * [SUB_X, SUB_Y] and begins at index [0, 0].  The input and output buffers are
+ * specified using an MPI datatype.
+ *
+ * This test generates a datatype that is relative to MPI_BOTTOM and tests the
+ * RMA implementation's ability to perform the correct transfer.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include "mpitest.h"
+#include "squelch.h"
+
+#define XDIM 1024
+#define YDIM 1024
+#define SUB_XDIM 1024
+#define SUB_YDIM 1024
+#define ITERATIONS 10
+
+int main(int argc, char **argv) {
+    int i, j, rank, nranks, peer, bufsize, errors;
+    double *win_buf, *src_buf, *dst_buf;
+    MPI_Win buf_win;
+
+    MTest_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+
+    bufsize = XDIM * YDIM * sizeof(double);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &win_buf);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &src_buf);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &dst_buf);
+
+    for (i = 0; i < XDIM*YDIM; i++) {
+        *(win_buf  + i) = 1.0 + rank;
+        *(src_buf + i) = 1.0 + rank;
+    }
+
+    MPI_Win_create(win_buf, bufsize, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &buf_win);
+
+    peer = (rank+1) % nranks;
+
+    /* Perform ITERATIONS strided put operations */
+
+    for (i = 0; i < ITERATIONS; i++) {
+      MPI_Aint idx_loc[SUB_YDIM];
+      int idx_rem[SUB_YDIM];
+      int blk_len[SUB_YDIM];
+      MPI_Datatype src_type, dst_type;
+
+      for (j = 0; j < SUB_YDIM; j++) {
+        MPI_Get_address(&src_buf[j*XDIM], &idx_loc[j]);
+        idx_rem[j] = j*XDIM*sizeof(double);
+        blk_len[j] = SUB_XDIM*sizeof(double);
+      }
+
+      MPI_Type_create_hindexed(SUB_YDIM, blk_len, idx_loc, MPI_BYTE, &src_type);
+      MPI_Type_create_indexed_block(SUB_YDIM, SUB_XDIM*sizeof(double), idx_rem, MPI_BYTE, &dst_type);
+
+      MPI_Type_commit(&src_type);
+      MPI_Type_commit(&dst_type);
+
+      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
+      MPI_Put(MPI_BOTTOM, 1, src_type, peer, 0, 1, dst_type, buf_win);
+      MPI_Win_unlock(peer, buf_win);
+
+      MPI_Type_free(&src_type);
+      MPI_Type_free(&dst_type);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Verify that the results are correct */
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, buf_win);
+    errors = 0;
+    for (i = 0; i < SUB_XDIM; i++) {
+      for (j = 0; j < SUB_YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = (1.0 + ((rank+nranks-1)%nranks));
+        if (actual - expected > 1e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    for (i = SUB_XDIM; i < XDIM; i++) {
+      for (j = 0; j < SUB_YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = 1.0 + rank;
+        if (actual - expected > 1e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    for (i = 0; i < XDIM; i++) {
+      for (j = SUB_YDIM; j < YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = 1.0 + rank;
+        if (actual - expected > 1e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    MPI_Win_unlock(rank, buf_win);
+
+    MPI_Win_free(&buf_win);
+    MPI_Free_mem(win_buf);
+    MPI_Free_mem(src_buf);
+    MPI_Free_mem(dst_buf);
+
+    MTest_Finalize( errors );
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/putfence1.c b/teshsuite/smpi/mpich3-test/rma/putfence1.c
new file mode 100644
index 0000000000..1020063b6a
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/putfence1.c
@@ -0,0 +1,109 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include "mpitest.h"
+
+/*
+static char MTEST_Descrip[] = "Put with Fence";
+*/
+
+int main( int argc, char *argv[] )
+{
+    int errs = 0, err;
+    int rank, size, source, dest;
+    int minsize = 2, count; 
+    MPI_Comm      comm;
+    MPI_Win       win;
+    MPI_Aint      extent;
+    MTestDatatype sendtype, recvtype;
+
+    MTest_Init( &argc, &argv );
+
+    /* The following illustrates the use of the routines to 
+       run through a selection of communicators and datatypes.
+       Use subsets of these for tests that do not involve combinations 
+       of communicators, datatypes, and counts of datatypes */
+    while (MTestGetIntracommGeneral( &comm, minsize, 1 )) {
+	if (comm == MPI_COMM_NULL) continue;
+	/* Determine the sender and receiver */
+	MPI_Comm_rank( comm, &rank );
+	MPI_Comm_size( comm, &size );
+	source = 0;
+	dest   = size - 1;
+	
+	for (count = 1; count < 65000; count = count * 2) {
+	    while (MTestGetDatatypes( &sendtype, &recvtype, count )) {
+
+		MTestPrintfMsg( 1, 
+		       "Putting count = %d of sendtype %s receive type %s\n", 
+				count, MTestGetDatatypeName( &sendtype ),
+				MTestGetDatatypeName( &recvtype ) );
+
+		/* Make sure that everyone has a recv buffer */
+		recvtype.InitBuf( &recvtype );
+
+		MPI_Type_extent( recvtype.datatype, &extent );
+		MPI_Win_create( recvtype.buf, recvtype.count * extent, 
+				extent, MPI_INFO_NULL, comm, &win );
+		MPI_Win_fence( 0, win );
+		if (rank == source) {
+		    /* To improve reporting of problems about operations, we
+		       change the error handler to errors return */
+		    MPI_Win_set_errhandler( win, MPI_ERRORS_RETURN );
+
+		    sendtype.InitBuf( &sendtype );
+		    
+		    err = MPI_Put( sendtype.buf, sendtype.count, 
+				   sendtype.datatype, dest, 0, 
+				   recvtype.count, recvtype.datatype, win );
+		    if (err) {
+			errs++;
+			if (errs < 10) {
+			    MTestPrintError( err );
+			}
+		    }
+		    err = MPI_Win_fence( 0, win );
+		    if (err) {
+			errs++;
+			if (errs < 10) {
+			    MTestPrintError( err );
+			}
+		    }
+		}
+		else if (rank == dest) {
+		    MPI_Win_fence( 0, win );
+		    /* This should have the same effect, in terms of
+		       transfering data, as a send/recv pair */
+		    err = MTestCheckRecv( 0, &recvtype );
+		    if (err) {
+			if (errs < 10) {
+			    printf( "Data in target buffer did not match for destination datatype %s (put with source datatype %s)\n", 
+				    MTestGetDatatypeName( &recvtype ),
+				    MTestGetDatatypeName( &sendtype ) );
+			    /* Redo the test, with the errors printed */
+			    recvtype.printErrors = 1;
+			    (void)MTestCheckRecv( 0, &recvtype );
+			}
+			errs += err;
+		    }
+		}
+		else {
+		    MPI_Win_fence( 0, win );
+		}
+		MPI_Win_free( &win );
+		MTestFreeDatatype( &sendtype );
+		MTestFreeDatatype( &recvtype );
+	    }
+	}
+        MTestFreeComm(&comm);
+    }
+
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/putfidx.c b/teshsuite/smpi/mpich3-test/rma/putfidx.c
new file mode 100644
index 0000000000..6a23eb2d7e
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/putfidx.c
@@ -0,0 +1,125 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include "mpitest.h"
+#ifdef HAVE_STDLIB_H
+#include <stdlib.h>
+#endif
+
+/*
+static char MTEST_Descrip[] = "Put with Fence for an indexed datatype";
+*/
+
+int CheckMPIErr( int err );
+
+int main( int argc, char *argv[] )
+{
+    int           errs = 0, err;
+    int           i, rank, size, source, dest;
+    int           blksize, totsize;
+    int           *recvBuf = 0, *srcBuf = 0;
+    MPI_Comm      comm;
+    MPI_Win       win;
+    MPI_Aint      extent;
+    MPI_Datatype  originType;
+    int           counts[2];
+    int           displs[2];
+
+    MTest_Init( &argc, &argv );
+
+    /* Select the communicator and datatypes */
+    comm = MPI_COMM_WORLD;
+
+    /* Create the datatype */
+    /* One MPI Implementation fails this test with sufficiently large 
+       values of blksize - it appears to convert this type to an 
+       incorrect contiguous move */
+    blksize = 2048;
+    counts[0] = blksize;
+    counts[1] = blksize;
+    displs[0] = 0;
+    displs[1] = blksize + 1;
+    MPI_Type_indexed( 2, counts, displs, MPI_INT, &originType );
+    MPI_Type_commit( &originType );
+
+    totsize = 2 * blksize;
+
+    /* Determine the sender and receiver */
+    MPI_Comm_rank( comm, &rank );
+    MPI_Comm_size( comm, &size );
+    source = 0;
+    dest   = size - 1;
+	
+    recvBuf = (int *) malloc( totsize * sizeof(int) );
+    srcBuf  = (int *) malloc( (totsize + 1) * sizeof(int) ) ;
+    
+    if (!recvBuf || !srcBuf) {
+	fprintf( stderr, "Could not allocate buffers\n" );
+	MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+    
+    /* Initialize the send and recv buffers */
+    for (i=0; i<totsize; i++) {
+	recvBuf[i] = -1;
+    }
+    for (i=0; i<blksize; i++) {
+	srcBuf[i] = i;
+	srcBuf[blksize+1+i] = blksize+i;
+    }
+    srcBuf[blksize] = -1;
+
+    MPI_Type_extent( MPI_INT, &extent );
+    MPI_Win_create( recvBuf, totsize * extent, extent, 
+		    MPI_INFO_NULL, comm, &win );
+    MPI_Win_fence( 0, win );
+    if (rank == source) {
+	/* To improve reporting of problems about operations, we
+	   change the error handler to errors return */
+	MPI_Win_set_errhandler( win, MPI_ERRORS_RETURN );
+
+	err = MPI_Put( srcBuf, 1, originType, dest, 0, 
+		       totsize, MPI_INT, win );
+	errs += CheckMPIErr( err );
+	err = MPI_Win_fence( 0, win );
+	errs += CheckMPIErr( err );
+    }
+    else if (rank == dest) {
+	MPI_Win_fence( 0, win );
+	for (i=0; i<totsize; i++) {
+	    if (recvBuf[i] != i) {
+		errs++;
+		if (errs < 10) {
+		    printf( "recvBuf[%d] = %d should = %d\n", 
+			    i, recvBuf[i], i );
+		}
+	    }
+	}
+    }
+    else {
+	MPI_Win_fence( 0, win );
+    }
+    
+    MPI_Type_free( &originType );
+    MPI_Win_free( &win );
+    free( recvBuf );
+    free( srcBuf );
+
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return 0;
+}
+
+int CheckMPIErr( int err )
+{
+    int rc = 0;
+    if (err != MPI_SUCCESS) { 
+	MTestPrintError( err );
+	rc = 1;
+    }
+    return rc;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/putpscw1.c b/teshsuite/smpi/mpich3-test/rma/putpscw1.c
new file mode 100644
index 0000000000..ff18f4c2a2
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/putpscw1.c
@@ -0,0 +1,109 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include "mpitest.h"
+
+/*
+static char MTEST_Descrip[] = "Put with Post/Start/Complete/Wait";
+*/
+
+int main( int argc, char *argv[] )
+{
+    int errs = 0, err;
+    int rank, size, source, dest;
+    int minsize = 2, count; 
+    MPI_Comm      comm;
+    MPI_Win       win;
+    MPI_Aint      extent;
+    MPI_Group     wingroup, neighbors;
+    MTestDatatype sendtype, recvtype;
+
+    MTest_Init( &argc, &argv );
+
+    /* The following illustrates the use of the routines to 
+       run through a selection of communicators and datatypes.
+       Use subsets of these for tests that do not involve combinations 
+       of communicators, datatypes, and counts of datatypes */
+    while (MTestGetIntracommGeneral( &comm, minsize, 1 )) {
+	if (comm == MPI_COMM_NULL) continue;
+	/* Determine the sender and receiver */
+	MPI_Comm_rank( comm, &rank );
+	MPI_Comm_size( comm, &size );
+	source = 0;
+	dest   = size - 1;
+	
+	for (count = 1; count < 65000; count = count * 2) {
+	    while (MTestGetDatatypes( &sendtype, &recvtype, count )) {
+		/* Make sure that everyone has a recv buffer */
+		recvtype.InitBuf( &recvtype );
+
+		MPI_Type_extent( recvtype.datatype, &extent );
+		MPI_Win_create( recvtype.buf, recvtype.count * extent, 
+				(int)extent, MPI_INFO_NULL, comm, &win );
+		MPI_Win_get_group( win, &wingroup );
+		if (rank == source) {
+		    /* To improve reporting of problems about operations, we
+		       change the error handler to errors return */
+		    MPI_Win_set_errhandler( win, MPI_ERRORS_RETURN );
+		    sendtype.InitBuf( &sendtype );
+		    
+		    /* Neighbor is dest only */
+		    MPI_Group_incl( wingroup, 1, &dest, &neighbors );
+		    err = MPI_Win_start( neighbors, 0, win );
+		    if (err) {
+			errs++;
+			if (errs < 10) {
+			    MTestPrintError( err );
+			}
+		    }
+		    MPI_Group_free( &neighbors );
+		    err = MPI_Put( sendtype.buf, sendtype.count, 
+				    sendtype.datatype, dest, 0, 
+				   recvtype.count, recvtype.datatype, win );
+		    if (err) {
+			errs++;
+			MTestPrintError( err );
+		    }
+		    err = MPI_Win_complete( win );
+		    if (err) {
+			errs++;
+			if (errs < 10) {
+			    MTestPrintError( err );
+			}
+		    }
+		}
+		else if (rank == dest) {
+		    MPI_Group_incl( wingroup, 1, &source, &neighbors );
+		    MPI_Win_post( neighbors, 0, win );
+		    MPI_Group_free( &neighbors );
+		    MPI_Win_wait( win );
+		    /* This should have the same effect, in terms of
+		       transfering data, as a send/recv pair */
+		    err = MTestCheckRecv( 0, &recvtype );
+		    if (err) {
+			errs += errs;
+		    }
+		}
+		else {
+		    /* Nothing; the other processes need not call any 
+		       MPI routines */
+		    ;
+		}
+		MPI_Win_free( &win );
+		MTestFreeDatatype( &sendtype );
+		MTestFreeDatatype( &recvtype );
+		MPI_Group_free( &wingroup );
+	    }
+	}
+	MTestFreeComm( &comm );
+    }
+
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/req_example.c b/teshsuite/smpi/mpich3-test/rma/req_example.c
new file mode 100644
index 0000000000..571325c8cc
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/req_example.c
@@ -0,0 +1,91 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include <mpi.h>
+#include <stdio.h>
+#include <assert.h>
+#include "mpitest.h"
+
+#define NSTEPS 100
+#define N 1000
+#define M 10
+
+/* This is example 11.21 from the MPI 3.0 spec:
+ *
+ * The following example shows how request-based operations can be used to
+ * overlap communication with computation. Each process fetches, processes,
+ * and writes the result for NSTEPS chunks of data. Instead of a single
+ * buffer, M local buffers are used to allow up to M communication operations
+ * to overlap with computation.
+ */
+
+/* Use a global variable to inhibit compiler optimizations in the compute
+ * function. */
+double junk = 0.0;
+
+void compute(int step, double *data) {
+    int i;
+
+    for (i = 0; i < N; i++)
+        junk += data[i] * (double) step;
+}
+
+int main( int argc, char *argv[] )
+{
+    int i, rank, nproc;
+    int errors = 0, all_errors = 0;
+    MPI_Win win;
+    MPI_Request put_req[M] = { MPI_REQUEST_NULL };
+    MPI_Request get_req;
+    double *baseptr;
+    double data[M][N]; /* M buffers of length N */
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    assert(M < NSTEPS);
+
+    MPI_Win_allocate(NSTEPS*N*sizeof(double), sizeof(double), MPI_INFO_NULL,
+                     MPI_COMM_WORLD, &baseptr, &win);
+
+    MPI_Win_lock_all(0, win);
+
+    for (i = 0; i < NSTEPS; i++) {
+        int target = (rank+1) % nproc;
+        int j;
+
+        /* Find a free put request */
+        if (i < M) {
+            j = i;
+        } else {
+            MPI_Waitany(M, put_req, &j, MPI_STATUS_IGNORE);
+        }
+
+        MPI_Rget(data[j], N, MPI_DOUBLE, target, i*N, N, MPI_DOUBLE, win,
+                 &get_req);
+        MPI_Wait(&get_req,MPI_STATUS_IGNORE);
+
+        compute(i, data[j]);
+        MPI_Rput(data[j], N, MPI_DOUBLE, target, i*N, N, MPI_DOUBLE, win,
+                 &put_req[j]);
+
+    }
+
+    MPI_Waitall(M, put_req, MPI_STATUSES_IGNORE);
+    MPI_Win_unlock_all(win);
+
+    MPI_Win_free(&win);
+
+    MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+
+    if (rank == 0 && all_errors == 0)
+        printf(" No Errors\n");
+
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/reqops.c b/teshsuite/smpi/mpich3-test/rma/reqops.c
new file mode 100644
index 0000000000..ef2636fb99
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/reqops.c
@@ -0,0 +1,286 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include <mpi.h>
+#include <stdio.h>
+#include <assert.h>
+#include "mpitest.h"
+
+#define ITER 100
+
+int main( int argc, char *argv[] )
+{
+    int rank, nproc, i;
+    int errors = 0, all_errors = 0;
+    int *buf;
+    MPI_Win window;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    if (nproc < 2) {
+        if (rank == 0) printf("Error: must be run with two or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    /** Create using MPI_Win_create() **/
+
+    if (rank == 0) {
+      MPI_Alloc_mem(4*sizeof(int), MPI_INFO_NULL, &buf);
+      *buf = nproc-1;
+    } else
+      buf = NULL;
+
+    MPI_Win_create(buf, 4*sizeof(int)*(rank == 0), 1, MPI_INFO_NULL, MPI_COMM_WORLD, &window);
+
+    /* PROC_NULL Communication */
+    {
+        MPI_Request pn_req[4];
+        int val[4], res;
+
+        MPI_Win_lock_all(0, window);
+
+        MPI_Rget_accumulate(&val[0], 1, MPI_INT, &res, 1, MPI_INT, MPI_PROC_NULL, 0, 1, MPI_INT, MPI_REPLACE, window, &pn_req[0]);
+        MPI_Rget(&val[1], 1, MPI_INT, MPI_PROC_NULL, 1, 1, MPI_INT, window, &pn_req[1]);
+        MPI_Rput(&val[2], 1, MPI_INT, MPI_PROC_NULL, 2, 1, MPI_INT, window, &pn_req[2]);
+        MPI_Raccumulate(&val[3], 1, MPI_INT, MPI_PROC_NULL, 3, 1, MPI_INT, MPI_REPLACE, window, &pn_req[3]);
+
+        assert(pn_req[0] != MPI_REQUEST_NULL);
+        assert(pn_req[1] != MPI_REQUEST_NULL);
+        assert(pn_req[2] != MPI_REQUEST_NULL);
+        assert(pn_req[3] != MPI_REQUEST_NULL);
+
+        MPI_Win_unlock_all(window);
+
+        MPI_Waitall(4, pn_req, MPI_STATUSES_IGNORE);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, window);
+
+    /* GET-ACC: Test third-party communication, through rank 0. */
+    for (i = 0; i < ITER; i++) {
+        MPI_Request gacc_req;
+        int val = -1, exp = -1;
+
+        /* Processes form a ring.  Process 0 starts first, then passes a token
+         * to the right.  Each process, in turn, performs third-party
+         * communication via process 0's window. */
+        if (rank > 0) {
+            MPI_Recv(NULL, 0, MPI_BYTE, rank-1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        }
+
+        MPI_Rget_accumulate(&rank, 1, MPI_INT, &val, 1, MPI_INT, 0, 0, 1, MPI_INT, MPI_REPLACE, window, &gacc_req);
+        assert(gacc_req != MPI_REQUEST_NULL);
+        MPI_Wait(&gacc_req, MPI_STATUS_IGNORE);
+
+        exp = (rank + nproc-1) % nproc;
+
+        if (val != exp) {
+            printf("%d - Got %d, expected %d\n", rank, val, exp);
+            errors++;
+        }
+
+        if (rank < nproc-1) {
+            MPI_Send(NULL, 0, MPI_BYTE, rank+1, 0, MPI_COMM_WORLD);
+        }
+
+        MPI_Barrier(MPI_COMM_WORLD);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (rank == 0) *buf = nproc-1;
+    MPI_Win_sync(window);
+
+    /* GET+PUT: Test third-party communication, through rank 0. */
+    for (i = 0; i < ITER; i++) {
+        MPI_Request req;
+        int val = -1, exp = -1;
+
+        /* Processes form a ring.  Process 0 starts first, then passes a token
+         * to the right.  Each process, in turn, performs third-party
+         * communication via process 0's window. */
+        if (rank > 0) {
+            MPI_Recv(NULL, 0, MPI_BYTE, rank-1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        }
+
+        MPI_Rget(&val, 1, MPI_INT, 0, 0, 1, MPI_INT, window, &req);
+        assert(req != MPI_REQUEST_NULL);
+        MPI_Wait(&req, MPI_STATUS_IGNORE);
+
+        MPI_Rput(&rank, 1, MPI_INT, 0, 0, 1, MPI_INT, window, &req);
+        assert(req != MPI_REQUEST_NULL);
+        MPI_Wait(&req, MPI_STATUS_IGNORE);
+
+        exp = (rank + nproc-1) % nproc;
+
+        if (val != exp) {
+            printf("%d - Got %d, expected %d\n", rank, val, exp);
+            errors++;
+        }
+
+        if (rank < nproc-1) {
+            MPI_Send(NULL, 0, MPI_BYTE, rank+1, 0, MPI_COMM_WORLD);
+        }
+
+        MPI_Barrier(MPI_COMM_WORLD);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (rank == 0) *buf = nproc-1;
+    MPI_Win_sync(window);
+
+    /* GET+ACC: Test third-party communication, through rank 0. */
+    for (i = 0; i < ITER; i++) {
+        MPI_Request req;
+        int val = -1, exp = -1;
+
+        /* Processes form a ring.  Process 0 starts first, then passes a token
+         * to the right.  Each process, in turn, performs third-party
+         * communication via process 0's window. */
+        if (rank > 0) {
+            MPI_Recv(NULL, 0, MPI_BYTE, rank-1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        }
+
+        MPI_Rget(&val, 1, MPI_INT, 0, 0, 1, MPI_INT, window, &req);
+        assert(req != MPI_REQUEST_NULL);
+        MPI_Wait(&req, MPI_STATUS_IGNORE);
+
+        MPI_Raccumulate(&rank, 1, MPI_INT, 0, 0, 1, MPI_INT, MPI_REPLACE, window, &req);
+        assert(req != MPI_REQUEST_NULL);
+        MPI_Wait(&req, MPI_STATUS_IGNORE);
+
+        exp = (rank + nproc-1) % nproc;
+
+        if (val != exp) {
+            printf("%d - Got %d, expected %d\n", rank, val, exp);
+            errors++;
+        }
+
+        if (rank < nproc-1) {
+            MPI_Send(NULL, 0, MPI_BYTE, rank+1, 0, MPI_COMM_WORLD);
+        }
+
+        MPI_Barrier(MPI_COMM_WORLD);
+    }
+    MPI_Win_unlock(0, window);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Wait inside of an epoch */
+    {
+        MPI_Request pn_req[4];
+        int val[4], res;
+        const int target = 0;
+
+        MPI_Win_lock_all(0, window);
+
+        MPI_Rget_accumulate(&val[0], 1, MPI_INT, &res, 1, MPI_INT, target, 0, 1, MPI_INT, MPI_REPLACE, window, &pn_req[0]);
+        MPI_Rget(&val[1], 1, MPI_INT, target, 1, 1, MPI_INT, window, &pn_req[1]);
+        MPI_Rput(&val[2], 1, MPI_INT, target, 2, 1, MPI_INT, window, &pn_req[2]);
+        MPI_Raccumulate(&val[3], 1, MPI_INT, target, 3, 1, MPI_INT, MPI_REPLACE, window, &pn_req[3]);
+
+        assert(pn_req[0] != MPI_REQUEST_NULL);
+        assert(pn_req[1] != MPI_REQUEST_NULL);
+        assert(pn_req[2] != MPI_REQUEST_NULL);
+        assert(pn_req[3] != MPI_REQUEST_NULL);
+
+        MPI_Waitall(4, pn_req, MPI_STATUSES_IGNORE);
+
+        MPI_Win_unlock_all(window);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Wait outside of an epoch */
+    {
+        MPI_Request pn_req[4];
+        int val[4], res;
+        const int target = 0;
+
+        MPI_Win_lock_all(0, window);
+
+        MPI_Rget_accumulate(&val[0], 1, MPI_INT, &res, 1, MPI_INT, target, 0, 1, MPI_INT, MPI_REPLACE, window, &pn_req[0]);
+        MPI_Rget(&val[1], 1, MPI_INT, target, 1, 1, MPI_INT, window, &pn_req[1]);
+        MPI_Rput(&val[2], 1, MPI_INT, target, 2, 1, MPI_INT, window, &pn_req[2]);
+        MPI_Raccumulate(&val[3], 1, MPI_INT, target, 3, 1, MPI_INT, MPI_REPLACE, window, &pn_req[3]);
+
+        assert(pn_req[0] != MPI_REQUEST_NULL);
+        assert(pn_req[1] != MPI_REQUEST_NULL);
+        assert(pn_req[2] != MPI_REQUEST_NULL);
+        assert(pn_req[3] != MPI_REQUEST_NULL);
+
+        MPI_Win_unlock_all(window);
+
+        MPI_Waitall(4, pn_req, MPI_STATUSES_IGNORE);
+    }
+
+    /* Wait in a different epoch */
+    {
+        MPI_Request pn_req[4];
+        int val[4], res;
+        const int target = 0;
+
+        MPI_Win_lock_all(0, window);
+
+        MPI_Rget_accumulate(&val[0], 1, MPI_INT, &res, 1, MPI_INT, target, 0, 1, MPI_INT, MPI_REPLACE, window, &pn_req[0]);
+        MPI_Rget(&val[1], 1, MPI_INT, target, 1, 1, MPI_INT, window, &pn_req[1]);
+        MPI_Rput(&val[2], 1, MPI_INT, target, 2, 1, MPI_INT, window, &pn_req[2]);
+        MPI_Raccumulate(&val[3], 1, MPI_INT, target, 3, 1, MPI_INT, MPI_REPLACE, window, &pn_req[3]);
+
+        assert(pn_req[0] != MPI_REQUEST_NULL);
+        assert(pn_req[1] != MPI_REQUEST_NULL);
+        assert(pn_req[2] != MPI_REQUEST_NULL);
+        assert(pn_req[3] != MPI_REQUEST_NULL);
+
+        MPI_Win_unlock_all(window);
+
+        MPI_Win_lock_all(0, window);
+        MPI_Waitall(4, pn_req, MPI_STATUSES_IGNORE);
+        MPI_Win_unlock_all(window);
+    }
+
+    /* Wait in a fence epoch */
+    {
+        MPI_Request pn_req[4];
+        int val[4], res;
+        const int target = 0;
+
+        MPI_Win_lock_all(0, window);
+
+        MPI_Rget_accumulate(&val[0], 1, MPI_INT, &res, 1, MPI_INT, target, 0, 1, MPI_INT, MPI_REPLACE, window, &pn_req[0]);
+        MPI_Rget(&val[1], 1, MPI_INT, target, 1, 1, MPI_INT, window, &pn_req[1]);
+        MPI_Rput(&val[2], 1, MPI_INT, target, 2, 1, MPI_INT, window, &pn_req[2]);
+        MPI_Raccumulate(&val[3], 1, MPI_INT, target, 3, 1, MPI_INT, MPI_REPLACE, window, &pn_req[3]);
+
+        assert(pn_req[0] != MPI_REQUEST_NULL);
+        assert(pn_req[1] != MPI_REQUEST_NULL);
+        assert(pn_req[2] != MPI_REQUEST_NULL);
+        assert(pn_req[3] != MPI_REQUEST_NULL);
+
+        MPI_Win_unlock_all(window);
+
+        MPI_Win_fence(0, window);
+        MPI_Waitall(4, pn_req, MPI_STATUSES_IGNORE);
+        MPI_Win_fence(0, window);
+    }
+
+    MPI_Win_free(&window);
+    if (buf) MPI_Free_mem(buf);
+
+    MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+
+    if (rank == 0 && all_errors == 0)
+        printf(" No Errors\n");
+
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/rmanull.c b/teshsuite/smpi/mpich3-test/rma/rmanull.c
new file mode 100644
index 0000000000..cb228f3747
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/rmanull.c
@@ -0,0 +1,231 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2010 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpitest.h"
+
+/* Test the given operation within a Fence epoch */
+#define TEST_FENCE_OP(op_name_, fcn_call_)                              \
+    do {                                                                \
+        err = fcn_call_                                                 \
+        if (err) {                                                      \
+            errs++;                                                     \
+            if (errs < 10) {                                            \
+                MTestPrintErrorMsg( "PROC_NULL to " op_name_, err );    \
+            }                                                           \
+        }                                                               \
+        err = MPI_Win_fence( 0, win );                                  \
+        if (err) {                                                      \
+            errs++;                                                     \
+            if (errs < 10) {                                            \
+                MTestPrintErrorMsg( "Fence after " op_name_, err );     \
+            }                                                           \
+        }                                                               \
+    } while (0)
+
+
+/* Test the given operation within a passive target epoch */
+#define TEST_PT_OP(op_name_, fcn_call_)                                 \
+    do {                                                                \
+        err = MPI_Win_lock(MPI_LOCK_EXCLUSIVE, MPI_PROC_NULL, 0, win);  \
+        if (err) {                                                      \
+            errs++;                                                     \
+            if (errs < 10) {                                            \
+                MTestPrintErrorMsg( "Lock before" op_name_, err );      \
+            }                                                           \
+        }                                                               \
+        err = fcn_call_                                                 \
+        if (err) {                                                      \
+            errs++;                                                     \
+            if (errs < 10) {                                            \
+                MTestPrintErrorMsg( "PROC_NULL to " op_name_, err );    \
+            }                                                           \
+        }                                                               \
+        err = MPI_Win_unlock( MPI_PROC_NULL, win );                     \
+        if (err) {                                                      \
+            errs++;                                                     \
+            if (errs < 10) {                                            \
+                MTestPrintErrorMsg( "Unlock after " op_name_, err );    \
+            }                                                           \
+        }                                                               \
+    } while (0)
+
+
+/* Test the given request-based operation within a passive target epoch */
+#define TEST_REQ_OP(op_name_, req_, fcn_call_)                          \
+    do {                                                                \
+        err = MPI_Win_lock(MPI_LOCK_EXCLUSIVE, MPI_PROC_NULL, 0, win);  \
+        if (err) {                                                      \
+            errs++;                                                     \
+            if (errs < 10) {                                            \
+                MTestPrintErrorMsg( "Lock before" op_name_, err );      \
+            }                                                           \
+        }                                                               \
+        err = fcn_call_                                                 \
+        if (err) {                                                      \
+            errs++;                                                     \
+            if (errs < 10) {                                            \
+                MTestPrintErrorMsg( "PROC_NULL to " op_name_, err );    \
+            }                                                           \
+        }                                                               \
+        err = MPI_Win_unlock( MPI_PROC_NULL, win );                     \
+        if (err) {                                                      \
+            errs++;                                                     \
+            if (errs < 10) {                                            \
+                MTestPrintErrorMsg( "Unlock after " op_name_, err );    \
+            }                                                           \
+        }                                                               \
+        err = MPI_Wait( &req_, MPI_STATUS_IGNORE );                     \
+        if (err) {                                                      \
+            errs++;                                                     \
+            if (errs < 10) {                                            \
+                MTestPrintErrorMsg( "Wait after " op_name_, err );      \
+            }                                                           \
+        }                                                               \
+    } while (0)
+
+/*
+static char MTEST_Descrip[] = "Test the MPI_PROC_NULL is a valid target";
+*/
+
+int main( int argc, char *argv[] )
+{
+    int           errs = 0, err;
+    int           rank, size;
+    int           *buf, bufsize;
+    int           *result;
+    int           *rmabuf, rsize, rcount;
+    MPI_Comm      comm;
+    MPI_Win       win;
+    MPI_Request   req;
+
+    MTest_Init( &argc, &argv );
+
+    bufsize = 256 * sizeof(int);
+    buf     = (int *)malloc( bufsize );
+    if (!buf) {
+	fprintf( stderr, "Unable to allocated %d bytes\n", bufsize );
+	MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+    result  = (int *)malloc( bufsize );
+    if (!result) {
+        fprintf( stderr, "Unable to allocated %d bytes\n", bufsize );
+        MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+    rcount   = 16;
+    rsize    = rcount * sizeof(int);
+    rmabuf   = (int *)malloc( rsize );
+    if (!rmabuf) {
+	fprintf( stderr, "Unable to allocated %d bytes\n", rsize );
+	MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+	
+    /* The following illustrates the use of the routines to 
+       run through a selection of communicators and datatypes.
+       Use subsets of these for tests that do not involve combinations 
+       of communicators, datatypes, and counts of datatypes */
+    while (MTestGetIntracommGeneral( &comm, 1, 1 )) {
+	if (comm == MPI_COMM_NULL) continue;
+	/* Determine the sender and receiver */
+	MPI_Comm_rank( comm, &rank );
+	MPI_Comm_size( comm, &size );
+	
+	MPI_Win_create( buf, bufsize, sizeof(int), MPI_INFO_NULL, comm, &win );
+	/* To improve reporting of problems about operations, we
+	   change the error handler to errors return */
+	MPI_Win_set_errhandler( win, MPI_ERRORS_RETURN );
+
+        /** TEST OPERATIONS USING ACTIVE TARGET (FENCE) SYNCHRONIZATION **/
+        MPI_Win_fence( 0, win );
+
+        TEST_FENCE_OP("Put",
+                      MPI_Put( rmabuf, rcount, MPI_INT, MPI_PROC_NULL, 0,
+                               rcount, MPI_INT, win );
+                     );
+
+        TEST_FENCE_OP("Get",
+                      MPI_Get( rmabuf, rcount, MPI_INT, MPI_PROC_NULL, 0,
+                               rcount, MPI_INT, win );
+                     );
+        TEST_FENCE_OP("Accumulate",
+                      MPI_Accumulate( rmabuf, rcount, MPI_INT, MPI_PROC_NULL,
+                                      0, rcount, MPI_INT, MPI_SUM, win );
+                     );
+        TEST_FENCE_OP("Get accumulate",
+                      MPI_Get_accumulate( rmabuf, rcount, MPI_INT, result,
+                                          rcount, MPI_INT, MPI_PROC_NULL, 0,
+                                          rcount, MPI_INT, MPI_SUM, win );
+                     );
+        TEST_FENCE_OP("Fetch and op",
+                      MPI_Fetch_and_op( rmabuf, result, MPI_INT, MPI_PROC_NULL,
+                                        0, MPI_SUM, win );
+                     );
+        TEST_FENCE_OP("Compare and swap",
+                      MPI_Compare_and_swap( rmabuf, &rank, result, MPI_INT,
+                                            MPI_PROC_NULL, 0, win );
+                     );
+
+        /** TEST OPERATIONS USING PASSIVE TARGET SYNCHRONIZATION **/
+
+        TEST_PT_OP("Put",
+                   MPI_Put( rmabuf, rcount, MPI_INT, MPI_PROC_NULL, 0, rcount,
+                            MPI_INT, win );
+                   );
+        TEST_PT_OP("Get",
+                   MPI_Get( rmabuf, rcount, MPI_INT, MPI_PROC_NULL, 0, rcount,
+                            MPI_INT, win );
+                   );
+        TEST_PT_OP("Accumulate",
+                   MPI_Accumulate( rmabuf, rcount, MPI_INT, MPI_PROC_NULL, 0,
+                                   rcount, MPI_INT, MPI_SUM, win );
+                   );
+        TEST_PT_OP("Get accumulate",
+                   MPI_Get_accumulate( rmabuf, rcount, MPI_INT, result, rcount,
+                                       MPI_INT, MPI_PROC_NULL, 0, rcount,
+                                       MPI_INT, MPI_SUM, win );
+                   );
+        TEST_PT_OP("Fetch and op",
+                   MPI_Fetch_and_op( rmabuf, result, MPI_INT, MPI_PROC_NULL, 0,
+                                     MPI_SUM, win );
+                   );
+        TEST_PT_OP("Compare and swap",
+                   MPI_Compare_and_swap( rmabuf, &rank, result, MPI_INT,
+                                         MPI_PROC_NULL, 0, win );
+                   );
+
+        /** TEST REQUEST-BASED OPERATIONS (PASSIVE TARGET ONLY) **/
+
+        TEST_REQ_OP("Rput", req,
+                    MPI_Rput( rmabuf, rcount, MPI_INT, MPI_PROC_NULL, 0, rcount,
+                              MPI_INT, win, &req );
+                   );
+        TEST_REQ_OP("Rget", req,
+                    MPI_Rget( rmabuf, rcount, MPI_INT, MPI_PROC_NULL, 0, rcount,
+                              MPI_INT, win, &req );
+                   );
+        TEST_REQ_OP("Raccumulate", req,
+                    MPI_Raccumulate( rmabuf, rcount, MPI_INT, MPI_PROC_NULL, 0,
+                                     rcount, MPI_INT, MPI_SUM, win, &req );
+                   );
+        TEST_REQ_OP("Rget_accumulate", req,
+                    MPI_Rget_accumulate( rmabuf, rcount, MPI_INT, result,
+                                         rcount, MPI_INT, MPI_PROC_NULL, 0,
+                                         rcount, MPI_INT, MPI_SUM, win, &req );
+                   );
+
+	MPI_Win_free( &win );
+        MTestFreeComm(&comm);
+    }
+
+    free( result );
+    free( buf );
+    free( rmabuf );
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/rmazero.c b/teshsuite/smpi/mpich3-test/rma/rmazero.c
new file mode 100644
index 0000000000..0ea28d7cb0
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/rmazero.c
@@ -0,0 +1,220 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2013 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpitest.h"
+
+#define TARGET 0
+
+/* Test the given operation within a Fence epoch */
+#define TEST_FENCE_OP(op_name_, fcn_call_)                              \
+    do {                                                                \
+        err = fcn_call_                                                 \
+        if (err) {                                                      \
+            errs++;                                                     \
+            if (errs < 10) {                                            \
+                MTestPrintErrorMsg( "Zero-byte op " op_name_, err );    \
+            }                                                           \
+        }                                                               \
+        err = MPI_Win_fence( 0, win );                                  \
+        if (err) {                                                      \
+            errs++;                                                     \
+            if (errs < 10) {                                            \
+                MTestPrintErrorMsg( "Fence after " op_name_, err );     \
+            }                                                           \
+        }                                                               \
+    } while (0)
+
+
+/* Test the given operation within a passive target epoch */
+#define TEST_PT_OP(op_name_, fcn_call_)                                 \
+    do {                                                                \
+        err = MPI_Win_lock(MPI_LOCK_EXCLUSIVE, TARGET, 0, win);         \
+        if (err) {                                                      \
+            errs++;                                                     \
+            if (errs < 10) {                                            \
+                MTestPrintErrorMsg( "Lock before" op_name_, err );      \
+            }                                                           \
+        }                                                               \
+        err = fcn_call_                                                 \
+        if (err) {                                                      \
+            errs++;                                                     \
+            if (errs < 10) {                                            \
+                MTestPrintErrorMsg( "Zero-byte op " op_name_, err );    \
+            }                                                           \
+        }                                                               \
+        err = MPI_Win_unlock( TARGET, win );                            \
+        if (err) {                                                      \
+            errs++;                                                     \
+            if (errs < 10) {                                            \
+                MTestPrintErrorMsg( "Unlock after " op_name_, err );    \
+            }                                                           \
+        }                                                               \
+    } while (0)
+
+
+/* Test the given request-based operation within a passive target epoch */
+#define TEST_REQ_OP(op_name_, req_, fcn_call_)                          \
+    do {                                                                \
+        err = MPI_Win_lock(MPI_LOCK_EXCLUSIVE, TARGET, 0, win);         \
+        if (err) {                                                      \
+            errs++;                                                     \
+            if (errs < 10) {                                            \
+                MTestPrintErrorMsg( "Lock before" op_name_, err );      \
+            }                                                           \
+        }                                                               \
+        err = fcn_call_                                                 \
+        if (err) {                                                      \
+            errs++;                                                     \
+            if (errs < 10) {                                            \
+                MTestPrintErrorMsg( "Zero-byte op " op_name_, err );    \
+            }                                                           \
+        }                                                               \
+        err = MPI_Win_unlock( TARGET, win );                            \
+        if (err) {                                                      \
+            errs++;                                                     \
+            if (errs < 10) {                                            \
+                MTestPrintErrorMsg( "Unlock after " op_name_, err );    \
+            }                                                           \
+        }                                                               \
+        err = MPI_Wait( &req_, MPI_STATUS_IGNORE );                     \
+        if (err) {                                                      \
+            errs++;                                                     \
+            if (errs < 10) {                                            \
+                MTestPrintErrorMsg( "Wait after " op_name_, err );      \
+            }                                                           \
+        }                                                               \
+    } while (0)
+
+/*
+static char MTEST_Descrip[] = "Test handling of zero-byte transfers";
+*/
+
+int main( int argc, char *argv[] )
+{
+    int           errs = 0, err;
+    int           rank, size;
+    int           *buf, bufsize;
+    int           *result;
+    int           *rmabuf, rsize, rcount;
+    MPI_Comm      comm;
+    MPI_Win       win;
+    MPI_Request   req;
+
+    MTest_Init( &argc, &argv );
+
+    bufsize = 256 * sizeof(int);
+    buf     = (int *)malloc( bufsize );
+    if (!buf) {
+        fprintf( stderr, "Unable to allocated %d bytes\n", bufsize );
+        MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+    result  = (int *)malloc( bufsize );
+    if (!result) {
+        fprintf( stderr, "Unable to allocated %d bytes\n", bufsize );
+        MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+    rcount   = 16;
+    rsize    = rcount * sizeof(int);
+    rmabuf   = (int *)malloc( rsize );
+    if (!rmabuf) {
+        fprintf( stderr, "Unable to allocated %d bytes\n", rsize );
+        MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+
+    /* The following loop is used to run through a series of communicators
+     * that are subsets of MPI_COMM_WORLD, of size 1 or greater. */
+    while (MTestGetIntracommGeneral( &comm, 1, 1 )) {
+        int count = 0;
+
+        if (comm == MPI_COMM_NULL) continue;
+        /* Determine the sender and receiver */
+        MPI_Comm_rank( comm, &rank );
+        MPI_Comm_size( comm, &size );
+
+        MPI_Win_create( buf, bufsize, sizeof(int), MPI_INFO_NULL, comm, &win );
+        /* To improve reporting of problems about operations, we
+           change the error handler to errors return */
+        MPI_Win_set_errhandler( win, MPI_ERRORS_RETURN );
+
+        /** TEST OPERATIONS USING ACTIVE TARGET (FENCE) SYNCHRONIZATION **/
+        MPI_Win_fence( 0, win );
+
+        TEST_FENCE_OP("Put",
+                      MPI_Put( rmabuf, count, MPI_INT, TARGET, 0,
+                               count, MPI_INT, win );
+                     );
+
+        TEST_FENCE_OP("Get",
+                      MPI_Get( rmabuf, count, MPI_INT, TARGET, 0,
+                               count, MPI_INT, win );
+                     );
+        TEST_FENCE_OP("Accumulate",
+                      MPI_Accumulate( rmabuf, count, MPI_INT, TARGET,
+                                      0, count, MPI_INT, MPI_SUM, win );
+                     );
+        TEST_FENCE_OP("Get accumulate",
+                      MPI_Get_accumulate( rmabuf, count, MPI_INT, result,
+                                          count, MPI_INT, TARGET, 0,
+                                          count, MPI_INT, MPI_SUM, win );
+                     );
+        /* Note: It's not possible to generate a zero-byte FOP or CAS */
+
+        /** TEST OPERATIONS USING PASSIVE TARGET SYNCHRONIZATION **/
+
+        TEST_PT_OP("Put",
+                   MPI_Put( rmabuf, count, MPI_INT, TARGET, 0, count,
+                            MPI_INT, win );
+                   );
+        TEST_PT_OP("Get",
+                   MPI_Get( rmabuf, count, MPI_INT, TARGET, 0, count,
+                            MPI_INT, win );
+                   );
+        TEST_PT_OP("Accumulate",
+                   MPI_Accumulate( rmabuf, count, MPI_INT, TARGET, 0,
+                                   count, MPI_INT, MPI_SUM, win );
+                   );
+        TEST_PT_OP("Get accumulate",
+                   MPI_Get_accumulate( rmabuf, count, MPI_INT, result, count,
+                                       MPI_INT, TARGET, 0, count,
+                                       MPI_INT, MPI_SUM, win );
+                   );
+
+        /* Note: It's not possible to generate a zero-byte FOP or CAS */
+
+        /** TEST REQUEST-BASED OPERATIONS (PASSIVE TARGET ONLY) **/
+
+        TEST_REQ_OP("Rput", req,
+                    MPI_Rput( rmabuf, count, MPI_INT, TARGET, 0, count,
+                              MPI_INT, win, &req );
+                   );
+        TEST_REQ_OP("Rget", req,
+                    MPI_Rget( rmabuf, count, MPI_INT, TARGET, 0, count,
+                              MPI_INT, win, &req );
+                   );
+        TEST_REQ_OP("Raccumulate", req,
+                    MPI_Raccumulate( rmabuf, count, MPI_INT, TARGET, 0,
+                                     count, MPI_INT, MPI_SUM, win, &req );
+                   );
+        TEST_REQ_OP("Rget_accumulate", req,
+                    MPI_Rget_accumulate( rmabuf, count, MPI_INT, result,
+                                         count, MPI_INT, TARGET, 0,
+                                         count, MPI_INT, MPI_SUM, win, &req );
+                   );
+
+        MPI_Win_free( &win );
+        MTestFreeComm(&comm);
+    }
+
+    free( result );
+    free( buf );
+    free( rmabuf );
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/selfrma.c b/teshsuite/smpi/mpich3-test/rma/selfrma.c
new file mode 100644
index 0000000000..ca8ae4ba84
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/selfrma.c
@@ -0,0 +1,113 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mpitest.h"
+
+/*
+static char MTEST_Descrip[] = "RMA to self";
+*/
+
+int main( int argc, char *argv[] )
+{
+    int errs = 0;
+    int rank, size, i, j;
+    MPI_Comm      comm;
+    MPI_Win       win;
+    int           *winbuf, count;
+    int           *sbuf, scount, vcount;
+    MPI_Datatype  vectype;
+
+    MTest_Init( &argc, &argv );
+
+    comm = MPI_COMM_WORLD;
+
+    MPI_Comm_rank( comm, &rank );
+    MPI_Comm_size( comm, &size );
+
+    /* Allocate and initialize sbuf */
+    scount = 1000;
+    count  = 1000;
+    sbuf   = (int *)malloc( scount * sizeof(int) );
+    if (!sbuf) {
+	fprintf( stderr, "Could not allocate send buffer f size %d\n", 
+		 scount );
+	MPI_Abort( MPI_COMM_WORLD, 0 );
+    }
+    for (i=0; i<scount; i++) sbuf[i] = i;
+
+    MPI_Alloc_mem( count*sizeof(int), MPI_INFO_NULL, &winbuf );
+
+    /* This is a simple vector type */
+    vcount = count / 4;
+    MPI_Type_vector( vcount, 1, 2, MPI_INT, &vectype );
+    MPI_Type_commit( &vectype );
+    MPI_Win_create( winbuf, count * sizeof(int), sizeof(int), MPI_INFO_NULL, 
+		    comm, &win );
+
+    /* Check with different combination of types, including non-contig on 
+       both sides */
+    
+    /* Clear winbuf */
+    memset( winbuf, 0, count*sizeof(int) );
+    MPI_Win_lock( MPI_LOCK_EXCLUSIVE, rank, 0, win );
+    MPI_Put( sbuf, 1, vectype, rank, 0, 1, vectype, win );
+    MPI_Win_unlock( rank, win );
+    /* Check results */
+    j = 0;
+    for (i=0; i<vcount; i++) {
+	if (winbuf[j] != sbuf[j]) {
+	    errs ++;
+	    fprintf( stderr, "VecPut: winbuf[%d] = %d, should = %d\n", 
+		     winbuf[j], j, sbuf[j] );
+	}
+	j += 2;
+    }
+
+    memset( winbuf, 0, count*sizeof(int) );
+    MPI_Win_lock( MPI_LOCK_SHARED, rank, 0, win );
+    MPI_Accumulate( sbuf, 1, vectype, rank, 0, 1, vectype, MPI_SUM, win );
+    MPI_Win_unlock( rank, win );
+    /* Check results */
+    j = 0;
+    for (i=0; i<vcount; i++) {
+	if (winbuf[j] != sbuf[j]) {
+	    errs ++;
+	    fprintf( stderr, "VecAcc: winbuf[%d] = %d, should = %d\n", 
+		     winbuf[j], j, sbuf[j] );
+	}
+	j += 2;
+    }
+
+    /* Now, use get to fetch back the results that we just wrote */
+    memset( sbuf, 0, count*sizeof(int) );
+    MPI_Win_lock( MPI_LOCK_SHARED, rank, 0, win );
+    MPI_Get( sbuf, 1, vectype, rank, 0, 1, vectype, win );
+    MPI_Win_unlock( rank, win );
+    /* Check results */
+    j = 0;
+    for (i=0; i<vcount; i++) {
+	if (winbuf[j] != sbuf[j]) {
+	    errs ++;
+	    fprintf( stderr, "VecAcc: winbuf[%d] = %d, should = %d\n", 
+		     winbuf[j], j, sbuf[j] );
+	}
+	j += 2;
+    }
+
+    MPI_Win_free( &win );
+    MPI_Free_mem( winbuf );
+    free( sbuf );
+    MPI_Type_free( &vectype );
+
+    MTest_Finalize( errs );
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/squelch.h b/teshsuite/smpi/mpich3-test/rma/squelch.h
new file mode 100644
index 0000000000..2e469d3853
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/squelch.h
@@ -0,0 +1,16 @@
+#ifndef SQUELCH_H_INCLUDED
+#define SQUELCH_H_INCLUDED
+
+static const int SQ_LIMIT   = 10;
+static       int SQ_COUNT   = 0;
+static       int SQ_VERBOSE = 0;
+
+#define SQUELCH(X)                              \
+  do {                                          \
+    if (SQ_COUNT < SQ_LIMIT || SQ_VERBOSE) {    \
+      SQ_COUNT++;                               \
+      X                                         \
+    }                                           \
+  } while (0)
+
+#endif
diff --git a/teshsuite/smpi/mpich3-test/rma/strided_acc_indexed.c b/teshsuite/smpi/mpich3-test/rma/strided_acc_indexed.c
new file mode 100644
index 0000000000..ac54f52e47
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/strided_acc_indexed.c
@@ -0,0 +1,143 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/* One-Sided MPI 2-D Strided Accumulate Test
+ *
+ * Author: James Dinan <dinan@mcs.anl.gov> 
+ * Date  : December, 2010
+ *
+ * This code performs N accumulates into a 2d patch of a shared array.  The
+ * array has dimensions [X, Y] and the subarray has dimensions [SUB_X, SUB_Y]
+ * and begins at index [0, 0].  The input and output buffers are specified
+ * using an MPI indexed type.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <mpi.h>
+#include "mpitest.h"
+#include "squelch.h"
+
+#define XDIM 16
+#define YDIM 16
+#define SUB_XDIM 8
+#define SUB_YDIM 8
+#define ITERATIONS 1
+
+int main(int argc, char **argv) {
+    int itr, i, j, rank, nranks, peer, bufsize, errors;
+    double *win_buf, *src_buf;
+    MPI_Win buf_win;
+
+    MTest_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+
+    bufsize = XDIM * YDIM * sizeof(double);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &win_buf);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &src_buf);
+
+    for (i = 0; i < XDIM*YDIM; i++) {
+        *(win_buf + i) = -1.0;
+        *(src_buf + i) = 1.0 + rank;
+    }
+
+    MPI_Win_create(win_buf, bufsize, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &buf_win);
+
+    peer = (rank+1) % nranks;
+
+    /* Perform ITERATIONS strided accumulate operations */
+
+    for (itr = 0; itr < ITERATIONS; itr++) {
+      MPI_Aint idx_loc[SUB_YDIM];
+      int idx_rem[SUB_YDIM];
+      int blk_len[SUB_YDIM];
+      MPI_Datatype src_type, dst_type;
+
+      for (i = 0; i < SUB_YDIM; i++) {
+        MPI_Get_address(&src_buf[i*XDIM], &idx_loc[i]);
+        idx_rem[i] = i*XDIM;
+        blk_len[i] = SUB_XDIM;
+      }
+
+#ifdef ABSOLUTE
+      MPI_Type_hindexed(SUB_YDIM, blk_len, idx_loc, MPI_DOUBLE, &src_type);
+#else
+      MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &src_type);
+#endif
+      MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &dst_type);
+
+      MPI_Type_commit(&src_type);
+      MPI_Type_commit(&dst_type);
+
+      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
+
+#ifdef ABSOLUTE
+      MPI_Accumulate(MPI_BOTTOM, 1, src_type, peer, 0, 1, dst_type, MPI_SUM, buf_win);
+#else
+      MPI_Accumulate(src_buf, 1, src_type, peer, 0, 1, dst_type, MPI_SUM, buf_win);
+#endif
+
+      MPI_Win_unlock(peer, buf_win);
+
+      MPI_Type_free(&src_type);
+      MPI_Type_free(&dst_type);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Verify that the results are correct */
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, buf_win);
+    errors = 0;
+    for (i = 0; i < SUB_XDIM; i++) {
+      for (j = 0; j < SUB_YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = -1.0 + (1.0 + ((rank+nranks-1)%nranks)) * (ITERATIONS);
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    for (i = SUB_XDIM; i < XDIM; i++) {
+      for (j = 0; j < SUB_YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = -1.0;
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    for (i = 0; i < XDIM; i++) {
+      for (j = SUB_YDIM; j < YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = -1.0;
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    MPI_Win_unlock(rank, buf_win);
+
+    MPI_Win_free(&buf_win);
+    MPI_Free_mem(win_buf);
+    MPI_Free_mem(src_buf);
+
+    MTest_Finalize( errors );
+    MPI_Finalize();
+    return MTestReturnValue( errors );
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/strided_acc_onelock.c b/teshsuite/smpi/mpich3-test/rma/strided_acc_onelock.c
new file mode 100644
index 0000000000..55ecde262c
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/strided_acc_onelock.c
@@ -0,0 +1,85 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/* One-Sided MPI 2-D Strided Accumulate Test
+ *
+ * Author: James Dinan <dinan@mcs.anl.gov> 
+ * Date  : December, 2010
+ *
+ * This code performs one-sided accumulate into a 2d patch of a shared array.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <mpi.h>
+#include "mpitest.h"
+#include "squelch.h"
+
+#define XDIM 1024 
+#define YDIM 1024
+#define ITERATIONS 10
+
+int main(int argc, char **argv) {
+    int i, j, rank, nranks, peer, bufsize, errors;
+    double *buffer, *src_buf;
+    MPI_Win buf_win;
+
+    MTest_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+
+    bufsize = XDIM * YDIM * sizeof(double);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &buffer);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &src_buf);
+
+    for (i = 0; i < XDIM*YDIM; i++) {
+        *(buffer  + i) = 1.0 + rank;
+        *(src_buf + i) = 1.0 + rank;
+    }
+
+    MPI_Win_create(buffer, bufsize, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &buf_win);
+
+    peer = (rank+1) % nranks;
+
+    for (i = 0; i < ITERATIONS; i++) {
+
+      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
+
+      for (j = 0; j < YDIM; j++) {
+        MPI_Accumulate(src_buf + j*XDIM, XDIM, MPI_DOUBLE, peer,
+                       j*XDIM*sizeof(double), XDIM, MPI_DOUBLE, MPI_SUM, buf_win);
+      }
+
+      MPI_Win_unlock(peer, buf_win);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, buf_win);
+    for (i = errors = 0; i < XDIM; i++) {
+      for (j = 0; j < YDIM; j++) {
+        const double actual   = *(buffer + i + j*XDIM);
+        const double expected = (1.0 + rank) + (1.0 + ((rank+nranks-1)%nranks)) * (ITERATIONS);
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    MPI_Win_unlock(rank, buf_win);
+
+    MPI_Win_free(&buf_win);
+    MPI_Free_mem(buffer);
+    MPI_Free_mem(src_buf);
+
+    MTest_Finalize( errors );
+    MPI_Finalize();
+    return MTestReturnValue( errors );
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/strided_acc_subarray.c b/teshsuite/smpi/mpich3-test/rma/strided_acc_subarray.c
new file mode 100644
index 0000000000..c8f850c6a7
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/strided_acc_subarray.c
@@ -0,0 +1,136 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/* One-Sided MPI 2-D Strided Accumulate Test
+ *
+ * Author: James Dinan <dinan@mcs.anl.gov> 
+ * Date  : December, 2010
+ *
+ * This code performs N accumulates into a 2d patch of a shared array.  The
+ * array has dimensions [X, Y] and the subarray has dimensions [SUB_X, SUB_Y]
+ * and begins at index [0, 0].  The input and output buffers are specified
+ * using an MPI subarray type.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <mpi.h>
+#include "mpitest.h"
+#include "squelch.h"
+
+#define XDIM 1024 
+#define YDIM 1024
+#define SUB_XDIM 512
+#define SUB_YDIM 512
+#define ITERATIONS 10
+
+int main(int argc, char **argv) {
+    int i, j, rank, nranks, peer, bufsize, errors;
+    double *win_buf, *src_buf;
+    MPI_Win buf_win;
+
+    MTest_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+
+    bufsize = XDIM * YDIM * sizeof(double);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &win_buf);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &src_buf);
+
+    for (i = 0; i < XDIM*YDIM; i++) {
+        *(win_buf + i) = -1.0;
+        *(src_buf + i) = 1.0 + rank;
+    }
+
+    MPI_Win_create(win_buf, bufsize, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &buf_win);
+
+    peer = (rank+1) % nranks;
+
+    /* Perform ITERATIONS strided accumulate operations */
+
+    for (i = 0; i < ITERATIONS; i++) {
+      int ndims               = 2;
+      int src_arr_sizes[2]    = { XDIM, YDIM };
+      int src_arr_subsizes[2] = { SUB_XDIM, SUB_YDIM };
+      int src_arr_starts[2]   = {    0,    0 };
+      int dst_arr_sizes[2]    = { XDIM, YDIM };
+      int dst_arr_subsizes[2] = { SUB_XDIM, SUB_YDIM };
+      int dst_arr_starts[2]   = {    0,    0 };
+      MPI_Datatype src_type, dst_type;
+
+      MPI_Type_create_subarray(ndims, src_arr_sizes, src_arr_subsizes, src_arr_starts,
+          MPI_ORDER_C, MPI_DOUBLE, &src_type);
+
+      MPI_Type_create_subarray(ndims, dst_arr_sizes, dst_arr_subsizes, dst_arr_starts,
+          MPI_ORDER_C, MPI_DOUBLE, &dst_type);
+
+      MPI_Type_commit(&src_type);
+      MPI_Type_commit(&dst_type);
+
+      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
+
+      MPI_Accumulate(src_buf, 1, src_type, peer, 0, 1, dst_type, MPI_SUM, buf_win);
+
+      MPI_Win_unlock(peer, buf_win);
+
+      MPI_Type_free(&src_type);
+      MPI_Type_free(&dst_type);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Verify that the results are correct */
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, buf_win);
+    errors = 0;
+    for (i = 0; i < SUB_XDIM; i++) {
+      for (j = 0; j < SUB_YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = -1.0 + (1.0 + ((rank+nranks-1)%nranks)) * (ITERATIONS);
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    for (i = SUB_XDIM; i < XDIM; i++) {
+      for (j = 0; j < SUB_YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = -1.0;
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    for (i = 0; i < XDIM; i++) {
+      for (j = SUB_YDIM; j < YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = -1.0;
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    MPI_Win_unlock(rank, buf_win);
+
+    MPI_Win_free(&buf_win);
+    MPI_Free_mem(win_buf);
+    MPI_Free_mem(src_buf);
+
+    MTest_Finalize( errors );
+    MPI_Finalize();
+    return MTestReturnValue( errors );
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/strided_get_indexed.c b/teshsuite/smpi/mpich3-test/rma/strided_get_indexed.c
new file mode 100644
index 0000000000..3a98d29879
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/strided_get_indexed.c
@@ -0,0 +1,133 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/* One-Sided MPI 2-D Strided Get Test
+ *
+ * Author: James Dinan <dinan@mcs.anl.gov> 
+ * Date  : December, 2010
+ *
+ * This code performs N strided get operations from a 2d patch of a shared
+ * array.  The array has dimensions [X, Y] and the subarray has dimensions
+ * [SUB_X, SUB_Y] and begins at index [0, 0].  The input and output buffers are
+ * specified using an MPI indexed type.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <mpi.h>
+#include "mpitest.h"
+#include "squelch.h"
+
+#define XDIM 8
+#define YDIM 1024
+#define SUB_XDIM 8
+#define SUB_YDIM 256
+
+int main(int argc, char **argv) {
+    int i, j, rank, nranks, peer, bufsize, errors;
+    double *win_buf, *loc_buf;
+    MPI_Win buf_win;
+
+    int idx_rem[SUB_YDIM];
+    int blk_len[SUB_YDIM];
+    MPI_Datatype loc_type, rem_type;
+
+    MTest_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+
+    bufsize = XDIM * YDIM * sizeof(double);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &win_buf);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &loc_buf);
+
+    for (i = 0; i < XDIM*YDIM; i++) {
+        *(win_buf + i) =  1.0 + rank;
+        *(loc_buf + i) = -1.0;
+    }
+
+    MPI_Win_create(win_buf, bufsize, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &buf_win);
+
+    peer = (rank+1) % nranks;
+
+    /* Build the datatype */
+
+    for (i = 0; i < SUB_YDIM; i++) {
+      idx_rem[i] = i*XDIM;
+      blk_len[i] = SUB_XDIM;
+    }
+
+    MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &loc_type);
+    MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &rem_type);
+
+    MPI_Type_commit(&loc_type);
+    MPI_Type_commit(&rem_type);
+
+    /* Perform get operation */
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
+
+    MPI_Get(loc_buf, 1, loc_type, peer, 0, 1, rem_type, buf_win);
+
+    /* Use the datatype only on the remote side (must have SUB_XDIM == XDIM) */
+    /* MPI_Get(loc_buf, SUB_XDIM*SUB_YDIM, MPI_DOUBLE, peer, 0, 1, rem_type, buf_win); */
+
+    MPI_Win_unlock(peer, buf_win);
+
+    MPI_Type_free(&loc_type);
+    MPI_Type_free(&rem_type);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Verify that the results are correct */
+
+    errors = 0;
+    for (i = 0; i < SUB_XDIM; i++) {
+      for (j = 0; j < SUB_YDIM; j++) {
+        const double actual   = *(loc_buf + i + j*XDIM);
+        const double expected = (1.0 + peer);
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    for (i = SUB_XDIM; i < XDIM; i++) {
+      for (j = 0; j < SUB_YDIM; j++) {
+        const double actual   = *(loc_buf + i + j*XDIM);
+        const double expected = -1.0;
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    for (i = 0; i < XDIM; i++) {
+      for (j = SUB_YDIM; j < YDIM; j++) {
+        const double actual   = *(loc_buf + i + j*XDIM);
+        const double expected = -1.0;
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+
+    MPI_Win_free(&buf_win);
+    MPI_Free_mem(win_buf);
+    MPI_Free_mem(loc_buf);
+
+    MTest_Finalize( errors );
+    MPI_Finalize();
+    return MTestReturnValue( errors );
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/strided_getacc_indexed.c b/teshsuite/smpi/mpich3-test/rma/strided_getacc_indexed.c
new file mode 100644
index 0000000000..e3293a1eff
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/strided_getacc_indexed.c
@@ -0,0 +1,141 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/* One-Sided MPI 2-D Strided Accumulate Test
+ *
+ * Author: James Dinan <dinan@mcs.anl.gov> 
+ * Date  : December, 2010
+ *
+ * This code performs N strided put operations followed by get operations into
+ * a 2d patch of a shared array.  The array has dimensions [X, Y] and the
+ * subarray has dimensions [SUB_X, SUB_Y] and begins at index [0, 0].  The
+ * input and output buffers are specified using an MPI indexed type.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <mpi.h>
+#include "mpitest.h"
+#include "squelch.h"
+
+#define XDIM 8
+#define YDIM 1024
+#define SUB_XDIM 1
+#define SUB_YDIM 2
+#define ITERATIONS 10
+
+int main(int argc, char **argv) {
+    int i, j, rank, nranks, peer, bufsize, errors;
+    double *win_buf, *src_buf, *dst_buf;
+    MPI_Win buf_win;
+
+    MTest_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+
+    bufsize = XDIM * YDIM * sizeof(double);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &win_buf);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &src_buf);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &dst_buf);
+
+    for (i = 0; i < XDIM*YDIM; i++) {
+        *(win_buf + i) = -1.0;
+        *(src_buf + i) =  1.0 + rank;
+    }
+
+    MPI_Win_create(win_buf, bufsize, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &buf_win);
+
+    peer = (rank+1) % nranks;
+
+    /* Perform ITERATIONS strided accumulate operations */
+
+    for (i = 0; i < ITERATIONS; i++) {
+      int idx_rem[SUB_YDIM];
+      int blk_len[SUB_YDIM];
+      MPI_Datatype src_type, dst_type;
+
+      for (j = 0; j < SUB_YDIM; j++) {
+        idx_rem[j] = j*XDIM;
+        blk_len[j] = SUB_XDIM;
+      }
+
+      MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &src_type);
+      MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &dst_type);
+
+      MPI_Type_commit(&src_type);
+      MPI_Type_commit(&dst_type);
+
+      /* PUT */
+      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
+      MPI_Get_accumulate(src_buf, 1, src_type, dst_buf, 1, src_type, peer, 0,
+                          1, dst_type, MPI_REPLACE, buf_win);
+      MPI_Win_unlock(peer, buf_win);
+
+      /* GET */
+      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
+      MPI_Get_accumulate(src_buf, 1, src_type, dst_buf, 1, src_type, peer, 0,
+                          1, dst_type, MPI_NO_OP, buf_win);
+      MPI_Win_unlock(peer, buf_win);
+
+      MPI_Type_free(&src_type);
+      MPI_Type_free(&dst_type);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Verify that the results are correct */
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, buf_win);
+    errors = 0;
+    for (i = 0; i < SUB_XDIM; i++) {
+      for (j = 0; j < SUB_YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = (1.0 + ((rank+nranks-1)%nranks));
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    for (i = SUB_XDIM; i < XDIM; i++) {
+      for (j = 0; j < SUB_YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = -1.0;
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    for (i = 0; i < XDIM; i++) {
+      for (j = SUB_YDIM; j < YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = -1.0;
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    MPI_Win_unlock(rank, buf_win);
+
+    MPI_Win_free(&buf_win);
+    MPI_Free_mem(win_buf);
+    MPI_Free_mem(src_buf);
+    MPI_Free_mem(dst_buf);
+
+    MTest_Finalize( errors );
+    MPI_Finalize();
+    return MTestReturnValue( errors );
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/strided_getacc_indexed_shared.c b/teshsuite/smpi/mpich3-test/rma/strided_getacc_indexed_shared.c
new file mode 100644
index 0000000000..6ff4f76fdc
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/strided_getacc_indexed_shared.c
@@ -0,0 +1,151 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *  (C) 2012 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/* One-Sided MPI 2-D Strided Accumulate Test
+ *
+ * Author: James Dinan <dinan@mcs.anl.gov>
+ * Date  : November, 2012
+ *
+ * This code performs N strided put operations followed by get operations into
+ * a 2d patch of a shared array.  The array has dimensions [X, Y] and the
+ * subarray has dimensions [SUB_X, SUB_Y] and begins at index [0, 0].  The
+ * input and output buffers are specified using an MPI indexed type.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <mpi.h>
+#include "mpitest.h"
+#include "squelch.h"
+
+#define XDIM 8
+#define YDIM 1024
+#define SUB_XDIM 1
+#define SUB_YDIM 2
+#define ITERATIONS 10
+
+int main(int argc, char **argv) {
+    int rank, nranks, rank_world, nranks_world;
+    int i, j, peer, bufsize, errors;
+    double *win_buf, *src_buf, *dst_buf;
+    MPI_Win buf_win;
+    MPI_Comm shr_comm;
+
+    MTest_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank_world);
+    MPI_Comm_size(MPI_COMM_WORLD, &nranks_world);
+
+    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &shr_comm);
+
+    MPI_Comm_rank(shr_comm, &rank);
+    MPI_Comm_size(shr_comm, &nranks);
+
+    bufsize = XDIM * YDIM * sizeof(double);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &src_buf);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &dst_buf);
+
+    MPI_Win_allocate_shared(bufsize, 1, MPI_INFO_NULL, shr_comm, &win_buf, &buf_win);
+
+    MPI_Win_fence(0, buf_win);
+
+    for (i = 0; i < XDIM*YDIM; i++) {
+        *(win_buf + i) = -1.0;
+        *(src_buf + i) =  1.0 + rank;
+    }
+
+    MPI_Win_fence(0, buf_win);
+
+    peer = (rank+1) % nranks;
+
+    /* Perform ITERATIONS strided accumulate operations */
+
+    for (i = 0; i < ITERATIONS; i++) {
+      int idx_rem[SUB_YDIM];
+      int blk_len[SUB_YDIM];
+      MPI_Datatype src_type, dst_type;
+
+      for (j = 0; j < SUB_YDIM; j++) {
+        idx_rem[j] = j*XDIM;
+        blk_len[j] = SUB_XDIM;
+      }
+
+      MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &src_type);
+      MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &dst_type);
+
+      MPI_Type_commit(&src_type);
+      MPI_Type_commit(&dst_type);
+
+      /* PUT */
+      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
+      MPI_Get_accumulate(src_buf, 1, src_type, dst_buf, 1, src_type, peer, 0,
+                          1, dst_type, MPI_REPLACE, buf_win);
+      MPI_Win_unlock(peer, buf_win);
+
+      /* GET */
+      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
+      MPI_Get_accumulate(src_buf, 1, src_type, dst_buf, 1, src_type, peer, 0,
+                          1, dst_type, MPI_NO_OP, buf_win);
+      MPI_Win_unlock(peer, buf_win);
+
+      MPI_Type_free(&src_type);
+      MPI_Type_free(&dst_type);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Verify that the results are correct */
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, buf_win);
+    errors = 0;
+    for (i = 0; i < SUB_XDIM; i++) {
+      for (j = 0; j < SUB_YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = (1.0 + ((rank+nranks-1)%nranks));
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    for (i = SUB_XDIM; i < XDIM; i++) {
+      for (j = 0; j < SUB_YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = -1.0;
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    for (i = 0; i < XDIM; i++) {
+      for (j = SUB_YDIM; j < YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = -1.0;
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    MPI_Win_unlock(rank, buf_win);
+
+    MPI_Win_free(&buf_win);
+    MPI_Free_mem(src_buf);
+    MPI_Free_mem(dst_buf);
+    MPI_Comm_free(&shr_comm);
+
+    MTest_Finalize( errors );
+    MPI_Finalize();
+    return MTestReturnValue( errors );
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/strided_putget_indexed.c b/teshsuite/smpi/mpich3-test/rma/strided_putget_indexed.c
new file mode 100644
index 0000000000..09f17ae1f6
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/strided_putget_indexed.c
@@ -0,0 +1,137 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/* One-Sided MPI 2-D Strided Accumulate Test
+ *
+ * Author: James Dinan <dinan@mcs.anl.gov> 
+ * Date  : December, 2010
+ *
+ * This code performs N strided put operations followed by get operations into
+ * a 2d patch of a shared array.  The array has dimensions [X, Y] and the
+ * subarray has dimensions [SUB_X, SUB_Y] and begins at index [0, 0].  The
+ * input and output buffers are specified using an MPI indexed type.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <mpi.h>
+#include "mpitest.h"
+#include "squelch.h"
+
+#define XDIM 8
+#define YDIM 1024
+#define SUB_XDIM 8
+#define SUB_YDIM 255
+#define ITERATIONS 10
+
+int main(int argc, char **argv) {
+    int i, j, rank, nranks, peer, bufsize, errors;
+    double *win_buf, *src_buf, *dst_buf;
+    MPI_Win buf_win;
+
+    MTest_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+
+    bufsize = XDIM * YDIM * sizeof(double);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &win_buf);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &src_buf);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &dst_buf);
+
+    for (i = 0; i < XDIM*YDIM; i++) {
+        *(win_buf + i) = -1.0;
+        *(src_buf + i) =  1.0 + rank;
+    }
+
+    MPI_Win_create(win_buf, bufsize, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &buf_win);
+
+    peer = (rank+1) % nranks;
+
+    /* Perform ITERATIONS strided accumulate operations */
+
+    for (i = 0; i < ITERATIONS; i++) {
+      int idx_rem[SUB_YDIM];
+      int blk_len[SUB_YDIM];
+      MPI_Datatype src_type, dst_type;
+
+      for (j = 0; j < SUB_YDIM; j++) {
+        idx_rem[j] = j*XDIM;
+        blk_len[j] = SUB_XDIM;
+      }
+
+      MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &src_type);
+      MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &dst_type);
+
+      MPI_Type_commit(&src_type);
+      MPI_Type_commit(&dst_type);
+
+      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
+      MPI_Put(src_buf, 1, src_type, peer, 0, 1, dst_type, buf_win);
+      MPI_Win_unlock(peer, buf_win);
+
+      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
+      MPI_Get(dst_buf, 1, src_type, peer, 0, 1, dst_type, buf_win);
+      MPI_Win_unlock(peer, buf_win);
+
+      MPI_Type_free(&src_type);
+      MPI_Type_free(&dst_type);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Verify that the results are correct */
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, buf_win);
+    errors = 0;
+    for (i = 0; i < SUB_XDIM; i++) {
+      for (j = 0; j < SUB_YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = (1.0 + ((rank+nranks-1)%nranks));
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    for (i = SUB_XDIM; i < XDIM; i++) {
+      for (j = 0; j < SUB_YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = -1.0;
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    for (i = 0; i < XDIM; i++) {
+      for (j = SUB_YDIM; j < YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = -1.0;
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    MPI_Win_unlock(rank, buf_win);
+
+    MPI_Win_free(&buf_win);
+    MPI_Free_mem(win_buf);
+    MPI_Free_mem(src_buf);
+    MPI_Free_mem(dst_buf);
+
+    MTest_Finalize( errors );
+    MPI_Finalize();
+    return MTestReturnValue( errors );
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/strided_putget_indexed_shared.c b/teshsuite/smpi/mpich3-test/rma/strided_putget_indexed_shared.c
new file mode 100644
index 0000000000..727190b529
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/strided_putget_indexed_shared.c
@@ -0,0 +1,147 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2012 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+/* One-Sided MPI 2-D Strided Accumulate Test
+ *
+ * Author: James Dinan <dinan@mcs.anl.gov>
+ * Date  : November, 2012
+ *
+ * This code performs N strided put operations followed by get operations into
+ * a 2d patch of a shared array.  The array has dimensions [X, Y] and the
+ * subarray has dimensions [SUB_X, SUB_Y] and begins at index [0, 0].  The
+ * input and output buffers are specified using an MPI indexed type.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <mpi.h>
+#include "mpitest.h"
+#include "squelch.h"
+
+#define XDIM 8
+#define YDIM 1024
+#define SUB_XDIM 8
+#define SUB_YDIM 255
+#define ITERATIONS 10
+
+int main(int argc, char **argv) {
+    int rank, nranks, rank_world, nranks_world;
+    int i, j, peer, bufsize, errors;
+    double *win_buf, *src_buf, *dst_buf;
+    MPI_Win buf_win;
+    MPI_Comm shr_comm;
+
+    MTest_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank_world);
+    MPI_Comm_size(MPI_COMM_WORLD, &nranks_world);
+
+    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &shr_comm);
+
+    MPI_Comm_rank(shr_comm, &rank);
+    MPI_Comm_size(shr_comm, &nranks);
+
+    bufsize = XDIM * YDIM * sizeof(double);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &src_buf);
+    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &dst_buf);
+
+    MPI_Win_allocate_shared(bufsize, 1, MPI_INFO_NULL, shr_comm, &win_buf, &buf_win);
+
+    MPI_Win_fence(0, buf_win);
+
+    for (i = 0; i < XDIM*YDIM; i++) {
+        *(win_buf + i) = -1.0;
+        *(src_buf + i) =  1.0 + rank;
+    }
+
+    MPI_Win_fence(0, buf_win);
+
+    peer = (rank+1) % nranks;
+
+    /* Perform ITERATIONS strided accumulate operations */
+
+    for (i = 0; i < ITERATIONS; i++) {
+      int idx_rem[SUB_YDIM];
+      int blk_len[SUB_YDIM];
+      MPI_Datatype src_type, dst_type;
+
+      for (j = 0; j < SUB_YDIM; j++) {
+        idx_rem[j] = j*XDIM;
+        blk_len[j] = SUB_XDIM;
+      }
+
+      MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &src_type);
+      MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &dst_type);
+
+      MPI_Type_commit(&src_type);
+      MPI_Type_commit(&dst_type);
+
+      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
+      MPI_Put(src_buf, 1, src_type, peer, 0, 1, dst_type, buf_win);
+      MPI_Win_unlock(peer, buf_win);
+
+      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
+      MPI_Get(dst_buf, 1, src_type, peer, 0, 1, dst_type, buf_win);
+      MPI_Win_unlock(peer, buf_win);
+
+      MPI_Type_free(&src_type);
+      MPI_Type_free(&dst_type);
+    }
+
+    MPI_Barrier(shr_comm);
+
+    /* Verify that the results are correct */
+
+    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, buf_win);
+    errors = 0;
+    for (i = 0; i < SUB_XDIM; i++) {
+      for (j = 0; j < SUB_YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = (1.0 + ((rank+nranks-1)%nranks));
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    for (i = SUB_XDIM; i < XDIM; i++) {
+      for (j = 0; j < SUB_YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = -1.0;
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    for (i = 0; i < XDIM; i++) {
+      for (j = SUB_YDIM; j < YDIM; j++) {
+        const double actual   = *(win_buf + i + j*XDIM);
+        const double expected = -1.0;
+        if (fabs(actual - expected) > 1.0e-10) {
+          SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
+              rank, j, i, expected, actual); );
+          errors++;
+          fflush(stdout);
+        }
+      }
+    }
+    MPI_Win_unlock(rank, buf_win);
+
+    MPI_Win_free(&buf_win);
+    MPI_Free_mem(src_buf);
+    MPI_Free_mem(dst_buf);
+    MPI_Comm_free(&shr_comm);
+
+    MTest_Finalize( errors );
+    MPI_Finalize();
+    return MTestReturnValue( errors );
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/test1.c b/teshsuite/smpi/mpich3-test/rma/test1.c
new file mode 100644
index 0000000000..b11995b8a0
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/test1.c
@@ -0,0 +1,81 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "mpitest.h"
+#include "squelch.h"
+
+/* tests a series of puts, gets, and accumulate on 2 processes using fence */
+
+#define SIZE 100
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, nprocs, A[SIZE], B[SIZE], i;
+    MPI_Comm CommDeuce;
+    MPI_Win win;
+    int errs = 0;
+ 
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (nprocs < 2) {
+        printf("Run this program with 2 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);
+
+    if (rank < 2) {
+        if (rank == 0) {
+            for (i=0; i<SIZE; i++)
+                A[i] = B[i] = i;
+        }
+        else {
+            for (i=0; i<SIZE; i++) {
+                A[i] = (-3)*i;
+                B[i] = (-4)*i;
+            }
+        }
+ 
+        MPI_Win_create(B, SIZE*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win); 
+ 
+        MPI_Win_fence(0, win); 
+ 
+        if (rank == 0) {
+            for (i=0; i<SIZE-1; i++)
+                MPI_Put(A+i, 1, MPI_INT, 1, i, 1, MPI_INT, win);
+        }        
+        else {
+            for (i=0; i<SIZE-1; i++)
+                MPI_Get(A+i, 1, MPI_INT, 0, i, 1, MPI_INT, win);
+ 
+            MPI_Accumulate(A+i, 1, MPI_INT, 0, i, 1, MPI_INT, MPI_SUM, win);
+        }
+        MPI_Win_fence(0, win); 
+ 
+        if (rank == 1) {
+            for (i=0; i<SIZE-1; i++) {
+                if (A[i] != B[i]) {
+                    SQUELCH( printf("Put/Get Error: A[i]=%d, B[i]=%d\n", A[i], B[i]); );
+                    errs++;
+ 	            }
+            }
+        }
+        else {
+            if (B[SIZE-1] != SIZE - 1 - 3*(SIZE-1)) {
+                SQUELCH( printf("Accumulate Error: B[SIZE-1] is %d, should be %d\n", B[SIZE-1], SIZE - 1 - 3*(SIZE-1)); );
+                errs++;
+            }
+ 	   }
+       MPI_Win_free(&win); 
+    }
+    MPI_Comm_free(&CommDeuce);
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/rma/test1_am.c b/teshsuite/smpi/mpich3-test/rma/test1_am.c
new file mode 100644
index 0000000000..9ceedfd762
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/test1_am.c
@@ -0,0 +1,100 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "mpitest.h"
+#include "squelch.h"
+
+/* tests a series of puts, gets, and accumulate on 2 processes using fence */
+
+/* same as test1.c but uses alloc_mem */
+
+#define SIZE 100
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, nprocs, i;
+    MPI_Comm CommDeuce;
+    int *A, *B;
+
+    MPI_Win win;
+    int errs = 0;
+ 
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (nprocs < 2) {
+        printf("Run this program with 2 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);
+
+    if (rank < 2) {
+        i = MPI_Alloc_mem(SIZE * sizeof(int), MPI_INFO_NULL, &A);
+        if (i) {
+            printf("Can't allocate memory in test program\n");
+            MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        i = MPI_Alloc_mem(SIZE * sizeof(int), MPI_INFO_NULL, &B);
+        if (i) {
+            printf("Can't allocate memory in test program\n");
+            MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        
+        if (rank == 0) {
+            for (i=0; i<SIZE; i++)
+                A[i] = B[i] = i;
+        }
+        else {
+            for (i=0; i<SIZE; i++) {
+                A[i] = (-3)*i;
+                B[i] = (-4)*i;
+            }
+        }
+
+        MPI_Win_create(B, SIZE*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
+
+        MPI_Win_fence(0, win); 
+ 
+        if (rank == 0) {
+            for (i=0; i<SIZE-1; i++)
+                MPI_Put(A+i, 1, MPI_INT, 1, i, 1, MPI_INT, win);
+        }        
+        else {
+            for (i=0; i<SIZE-1; i++)
+                MPI_Get(A+i, 1, MPI_INT, 0, i, 1, MPI_INT, win);
+ 
+            MPI_Accumulate(A+i, 1, MPI_INT, 0, i, 1, MPI_INT, MPI_SUM, win);
+        }
+        MPI_Win_fence(0, win); 
+ 
+        if (rank == 1) {
+            for (i=0; i<SIZE-1; i++) {
+                if (A[i] != B[i]) {
+                    SQUELCH( printf("Put/Get Error: A[i]=%d, B[i]=%d\n", A[i], B[i]); );
+                    errs++;
+ 	       }
+            }
+        }
+        else {
+            if (B[SIZE-1] != SIZE - 1 - 3*(SIZE-1)) {
+                SQUELCH( printf("Accumulate Error: B[SIZE-1] is %d, should be %d\n", B[SIZE-1], SIZE - 1 - 3*(SIZE-1)); );
+                errs++;
+            }
+ 	    }
+        MPI_Win_free(&win); 
+
+        MPI_Free_mem(A);
+        MPI_Free_mem(B);
+    }
+    MPI_Comm_free(&CommDeuce);
+
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/rma/test1_dt.c b/teshsuite/smpi/mpich3-test/rma/test1_dt.c
new file mode 100644
index 0000000000..072c184b25
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/test1_dt.c
@@ -0,0 +1,89 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "mpitest.h"
+#include "squelch.h"
+
+/* tests a series of puts, gets, and accumulate on 2 processes using fence */
+/* Same as test1.c but uses derived datatypes to receive data */
+
+#define SIZE 100
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, nprocs, A[SIZE], B[SIZE], i;
+    MPI_Comm CommDeuce;
+    MPI_Win win;
+    MPI_Datatype contig_2ints;
+    int errs = 0;
+
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (nprocs < 2) {
+        printf("Run this program with 2 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);
+
+    if (rank < 2) {
+
+        if (rank == 0) {
+            for (i=0; i<SIZE; i++)
+                A[i] = B[i] = i;
+        }
+        else {
+            for (i=0; i<SIZE; i++) {
+                A[i] = (-3)*i;
+                B[i] = (-4)*i;
+            }
+        }
+
+        MPI_Type_contiguous(2, MPI_INT, &contig_2ints);
+        MPI_Type_commit(&contig_2ints);
+
+        MPI_Win_create(B, SIZE*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
+
+        MPI_Win_fence(0, win);
+
+        if (rank == 0) {
+            for (i=0; i<SIZE-2; i+=2)
+                MPI_Put(A+i, 2, MPI_INT, 1, i, 1, contig_2ints, win);
+        }
+        else {
+            for (i=0; i<SIZE-2; i+=2)
+                MPI_Get(A+i, 2, MPI_INT, 0, i, 1, contig_2ints, win);
+
+            MPI_Accumulate(A+SIZE-2, 2, MPI_INT, 0, SIZE-2, 1, contig_2ints, MPI_SUM, win);
+        }
+        MPI_Win_fence(0, win);
+
+        if (rank == 1) {
+            for (i=0; i<SIZE-2; i++) {
+                if (A[i] != B[i]) {
+                    SQUELCH( printf("Put/Get Error: A[i]=%d, B[i]=%d\n", A[i], B[i]); );
+                    errs++;
+                }
+            }
+        }
+        else {
+            if (B[SIZE-1] != SIZE - 1 - 3*(SIZE-1)) {
+                SQUELCH( printf("Accumulate Error: B[SIZE-1] is %d, should be %d\n", B[SIZE-1], SIZE - 1 - 3*(SIZE-1)); );
+                errs++;
+            }
+        }
+
+        MPI_Win_free(&win);
+        MPI_Type_free(&contig_2ints);
+    }
+    MPI_Comm_free(&CommDeuce);
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/rma/test2.c b/teshsuite/smpi/mpich3-test/rma/test2.c
new file mode 100644
index 0000000000..f4399eaeed
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/test2.c
@@ -0,0 +1,82 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "mpitest.h"
+#include "squelch.h"
+
+/* tests put and get with post/start/complete/wait on 2 processes */
+
+#define SIZE1 100
+#define SIZE2 200
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, destrank, nprocs, A[SIZE2], B[SIZE2], i;
+    MPI_Comm CommDeuce;
+    MPI_Group comm_group, group;
+    MPI_Win win;
+    int errs = 0;
+
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (nprocs < 2) {
+        printf("Run this program with 2 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);
+
+    if (rank < 2)
+    {
+        MPI_Comm_group(CommDeuce, &comm_group);
+
+        if (rank == 0) {
+            for (i=0; i<SIZE2; i++) A[i] = B[i] = i;
+            MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, CommDeuce, &win);
+            destrank = 1;
+            MPI_Group_incl(comm_group, 1, &destrank, &group);
+            MPI_Win_start(group, 0, win);
+            for (i=0; i<SIZE1; i++)
+                MPI_Put(A+i, 1, MPI_INT, 1, i, 1, MPI_INT, win);
+            for (i=0; i<SIZE1; i++)
+                MPI_Get(B+i, 1, MPI_INT, 1, SIZE1+i, 1, MPI_INT, win);
+
+            MPI_Win_complete(win);
+
+            for (i=0; i<SIZE1; i++)
+                if (B[i] != (-4)*(i+SIZE1)) {
+                    SQUELCH( printf("Get Error: B[i] is %d, should be %d\n", B[i], (-4)*(i+SIZE1)); );
+                    errs++;
+                }
+        }
+        else if (rank == 1) {
+            for (i=0; i<SIZE2; i++) B[i] = (-4)*i;
+            MPI_Win_create(B, SIZE2*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
+            destrank = 0;
+            MPI_Group_incl(comm_group, 1, &destrank, &group);
+            MPI_Win_post(group, 0, win);
+            MPI_Win_wait(win);
+
+            for (i=0; i<SIZE1; i++) {
+                if (B[i] != i) {
+                    SQUELCH( printf("Put Error: B[i] is %d, should be %d\n", B[i], i); );
+                    errs++;
+                }
+            }
+        }
+
+        MPI_Group_free(&group);
+        MPI_Group_free(&comm_group);
+        MPI_Win_free(&win);
+    }
+    MPI_Comm_free(&CommDeuce);
+    MTest_Finalize(errs);
+    MPI_Finalize();
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/rma/test2_am.c b/teshsuite/smpi/mpich3-test/rma/test2_am.c
new file mode 100644
index 0000000000..53780d5a44
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/test2_am.c
@@ -0,0 +1,99 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "mpitest.h"
+#include "squelch.h"
+
+/* tests put and get with post/start/complete/wait on 2 processes */
+
+/* same as test1.c but uses alloc_mem */
+
+#define SIZE1 100
+#define SIZE2 200
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, destrank, nprocs, *A, *B, i;
+    MPI_Comm CommDeuce;
+    MPI_Group comm_group, group;
+    MPI_Win win;
+    int errs = 0;
+
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (nprocs < 2) {
+        printf("Run this program with 2 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);
+
+    if (rank < 2)
+    {
+
+        i = MPI_Alloc_mem(SIZE2 * sizeof(int), MPI_INFO_NULL, &A);
+        if (i) {
+            printf("Can't allocate memory in test program\n");
+            MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        i = MPI_Alloc_mem(SIZE2 * sizeof(int), MPI_INFO_NULL, &B);
+        if (i) {
+            printf("Can't allocate memory in test program\n");
+            MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+
+        MPI_Comm_group(CommDeuce, &comm_group);
+
+        if (rank == 0) {
+            for (i=0; i<SIZE2; i++) A[i] = B[i] = i;
+            MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, CommDeuce, &win);
+            destrank = 1;
+            MPI_Group_incl(comm_group, 1, &destrank, &group);
+            MPI_Win_start(group, 0, win);
+            for (i=0; i<SIZE1; i++)
+                MPI_Put(A+i, 1, MPI_INT, 1, i, 1, MPI_INT, win);
+            for (i=0; i<SIZE1; i++)
+                MPI_Get(B+i, 1, MPI_INT, 1, SIZE1+i, 1, MPI_INT, win);
+
+            MPI_Win_complete(win);
+
+            for (i=0; i<SIZE1; i++)
+                if (B[i] != (-4)*(i+SIZE1)) {
+                    SQUELCH( printf("Get Error: B[i] is %d, should be %d\n", B[i], (-4)*(i+SIZE1)); );
+                    errs++;
+                }
+        }
+        else if (rank == 1) {
+            for (i=0; i<SIZE2; i++) B[i] = (-4)*i;
+            MPI_Win_create(B, SIZE2*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
+            destrank = 0;
+            MPI_Group_incl(comm_group, 1, &destrank, &group);
+            MPI_Win_post(group, 0, win);
+            MPI_Win_wait(win);
+
+            for (i=0; i<SIZE1; i++) {
+                if (B[i] != i) {
+                    SQUELCH( printf("Put Error: B[i] is %d, should be %d\n", B[i], i); );
+                    errs++;
+                }
+            }
+        }
+
+        MPI_Group_free(&group);
+        MPI_Group_free(&comm_group);
+        MPI_Win_free(&win);
+        MPI_Free_mem(A);
+        MPI_Free_mem(B);
+    }
+
+    MPI_Comm_free(&CommDeuce);
+    MTest_Finalize(errs);
+    MPI_Finalize();
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/rma/test3.c b/teshsuite/smpi/mpich3-test/rma/test3.c
new file mode 100644
index 0000000000..06dd53b8a9
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/test3.c
@@ -0,0 +1,100 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "stdlib.h"
+#include "mpitest.h"
+#include "squelch.h"
+
+/* Tests the example in Fig 6.8, pg 142, MPI-2 standard. Process 1 has
+   a blocking MPI_Recv between the Post and Wait. Therefore, this
+   example will not run if the one-sided operations are simply
+   implemented on top of MPI_Isends and Irecvs. They either need to be
+   implemented inside the progress engine or using threads with Isends
+   and Irecvs. In MPICH-2, they are implemented in the progress engine. */
+
+#define SIZE 1048576
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, destrank, nprocs, *A, *B, i;
+    MPI_Comm CommDeuce;
+    MPI_Group comm_group, group;
+    MPI_Win win;
+    int errs = 0;
+
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (nprocs < 2) {
+        printf("Run this program with 2 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);
+
+    if (rank < 2)
+    {
+        A = (int *) malloc(SIZE * sizeof(int));
+        if (!A) {
+            printf("Can't allocate memory in test program\n");
+            MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        B = (int *) malloc(SIZE * sizeof(int));
+        if (!B) {
+            printf("Can't allocate memory in test program\n");
+            MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+
+        MPI_Comm_group(CommDeuce, &comm_group);
+
+        if (rank == 0) {
+            for (i=0; i<SIZE; i++) {
+                A[i] = i;
+                B[i] = SIZE + i;
+            }
+            MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, CommDeuce, &win);
+            destrank = 1;
+            MPI_Group_incl(comm_group, 1, &destrank, &group);
+            MPI_Win_start(group, 0, win);
+            MPI_Put(A, SIZE, MPI_INT, 1, 0, SIZE, MPI_INT, win);
+            MPI_Win_complete(win);
+            MPI_Send(B, SIZE, MPI_INT, 1, 100, MPI_COMM_WORLD);
+        }
+
+        else if (rank == 1) {
+            for (i=0; i<SIZE; i++) A[i] = B[i] = (-4)*i;
+            MPI_Win_create(B, SIZE*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
+            destrank = 0;
+            MPI_Group_incl(comm_group, 1, &destrank, &group);
+            MPI_Win_post(group, 0, win);
+            MPI_Recv(A, SIZE, MPI_INT, 0, 100, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+            MPI_Win_wait(win);
+
+            for (i=0; i<SIZE; i++) {
+                if (B[i] != i) {
+                    SQUELCH( printf("Rank 1: Put Error: B[i] is %d, should be %d\n", B[i], i); );
+                    errs++;
+                }
+                if (A[i] != SIZE + i) {
+                    SQUELCH( printf("Rank 1: Send/Recv Error: A[i] is %d, should be %d\n", A[i], SIZE+i); );
+                    errs++;
+                }
+            }
+        }
+
+        MPI_Group_free(&group);
+        MPI_Group_free(&comm_group);
+        MPI_Win_free(&win);
+        free(A);
+        free(B);
+    }
+    MPI_Comm_free(&CommDeuce);
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/rma/test3_am.c b/teshsuite/smpi/mpich3-test/rma/test3_am.c
new file mode 100644
index 0000000000..dc10c3196e
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/test3_am.c
@@ -0,0 +1,100 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "stdlib.h"
+#include "mpitest.h"
+#include "squelch.h"
+
+/* Tests the example in Fig 6.8, pg 142, MPI-2 standard. Process 1 has
+   a blocking MPI_Recv between the Post and Wait. Therefore, this
+   example will not run if the one-sided operations are simply
+   implemented on top of MPI_Isends and Irecvs. They either need to be
+   implemented inside the progress engine or using threads with Isends
+   and Irecvs. In MPICH-2, they are implemented in the progress engine. */
+
+/* same as test3.c but uses alloc_mem */
+
+#define SIZE 1048576
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, destrank, nprocs, *A, *B, i;
+    MPI_Comm CommDeuce;
+    MPI_Group comm_group, group;
+    MPI_Win win;
+    int errs = 0;
+
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (nprocs < 2) {
+        printf("Run this program with 2 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);
+
+    if (rank < 2) {
+        i = MPI_Alloc_mem(SIZE * sizeof(int), MPI_INFO_NULL, &A);
+        if (i) {
+            printf("Can't allocate memory in test program\n");
+            MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        i = MPI_Alloc_mem(SIZE * sizeof(int), MPI_INFO_NULL, &B);
+        if (i) {
+            printf("Can't allocate memory in test program\n");
+            MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+
+        MPI_Comm_group(CommDeuce, &comm_group);
+
+        if (rank == 0) {
+            for (i=0; i<SIZE; i++) {
+                A[i] = i;
+                B[i] = SIZE + i;
+            }
+            MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, CommDeuce, &win);
+            destrank = 1;
+            MPI_Group_incl(comm_group, 1, &destrank, &group);
+            MPI_Win_start(group, 0, win);
+            MPI_Put(A, SIZE, MPI_INT, 1, 0, SIZE, MPI_INT, win);
+            MPI_Win_complete(win);
+            MPI_Send(B, SIZE, MPI_INT, 1, 100, MPI_COMM_WORLD);
+        }
+        else {  /* rank=1 */
+            for (i=0; i<SIZE; i++) A[i] = B[i] = (-4)*i;
+            MPI_Win_create(B, SIZE*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
+            destrank = 0;
+            MPI_Group_incl(comm_group, 1, &destrank, &group);
+            MPI_Win_post(group, 0, win);
+            MPI_Recv(A, SIZE, MPI_INT, 0, 100, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+            MPI_Win_wait(win);
+
+            for (i=0; i<SIZE; i++) {
+                if (B[i] != i) {
+                    SQUELCH( printf("Rank 1: Put Error: B[i] is %d, should be %d\n", B[i], i); );
+                    errs++;
+                }
+                if (A[i] != SIZE + i) {
+                    SQUELCH( printf("Rank 1: Send/Recv Error: A[i] is %d, should be %d\n", A[i], SIZE+i); );
+                    errs++;
+                }
+            }
+        }
+
+        MPI_Group_free(&group);
+        MPI_Group_free(&comm_group);
+        MPI_Win_free(&win);
+        MPI_Free_mem(A);
+        MPI_Free_mem(B);
+    }
+    MPI_Comm_free(&CommDeuce);
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/rma/test4.c b/teshsuite/smpi/mpich3-test/rma/test4.c
new file mode 100644
index 0000000000..11ee9dd608
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/test4.c
@@ -0,0 +1,81 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "stdlib.h"
+#include "mpitest.h"
+#include "squelch.h"
+
+/* tests passive target RMA on 2 processes. tests the lock-single_op-unlock 
+   optimization. */
+
+#define SIZE1 100
+#define SIZE2 200
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, nprocs, A[SIZE2], B[SIZE2], i, j;
+    MPI_Comm CommDeuce;
+    MPI_Win win;
+    int errs = 0;
+
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (nprocs < 2) {
+        printf("Run this program with 2 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);
+
+    if (rank < 2) {
+        if (rank == 0) {
+            for (i=0; i<SIZE2; i++) A[i] = B[i] = i;
+            MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, CommDeuce, &win);
+
+            for (j = 0; j < 2; j++) {
+                for (i=0; i<SIZE1; i++) {
+                    MPI_Win_lock(MPI_LOCK_SHARED, 1, j == 0 ? 0 : MPI_MODE_NOCHECK, win);
+                    MPI_Put(A+i, 1, MPI_INT, 1, i, 1, MPI_INT, win);
+                    MPI_Win_unlock(1, win);
+                }
+
+                for (i=0; i<SIZE1; i++) {
+                    MPI_Win_lock(MPI_LOCK_SHARED, 1, j == 0 ? 0 : MPI_MODE_NOCHECK, win);
+                    MPI_Get(B+i, 1, MPI_INT, 1, SIZE1+i, 1, MPI_INT, win);
+                    MPI_Win_unlock(1, win);
+                }
+            }
+
+            MPI_Win_free(&win);
+
+            for (i=0; i<SIZE1; i++)
+                if (B[i] != (-4)*(i+SIZE1)) {
+                    SQUELCH( printf("Get Error: B[%d] is %d, should be %d\n", i, B[i], (-4)*(i+SIZE1)); );
+                    errs++;
+                }
+        }
+        else {  /* rank=1 */
+            for (i=0; i<SIZE2; i++) B[i] = (-4)*i;
+            MPI_Win_create(B, SIZE2*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
+
+            MPI_Win_free(&win);
+
+            for (i=0; i<SIZE1; i++) {
+                if (B[i] != i) {
+                    SQUELCH( printf("Put Error: B[%d] is %d, should be %d\n", i, B[i], i); );
+                    errs++;
+                }
+            }
+        }
+    }
+    MPI_Comm_free(&CommDeuce);
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/rma/test4_am.c b/teshsuite/smpi/mpich3-test/rma/test4_am.c
new file mode 100644
index 0000000000..83cb3efc01
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/test4_am.c
@@ -0,0 +1,95 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "stdlib.h"
+#include "mpitest.h"
+#include "squelch.h"
+
+/* tests passive target RMA on 2 processes. tests the lock-single_op-unlock 
+   optimization. */
+
+/* same as test4.c but uses alloc_mem */
+
+#define SIZE1 100
+#define SIZE2 200
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, nprocs, *A, *B, i; 
+    MPI_Comm CommDeuce;
+    MPI_Win win;
+    int errs = 0;
+
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (nprocs < 2) {
+        printf("Run this program with 2 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);
+
+    if (rank < 2) {
+        i = MPI_Alloc_mem(SIZE2 * sizeof(int), MPI_INFO_NULL, &A);
+        if (i) {
+            printf("Can't allocate memory in test program\n");
+            MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        i = MPI_Alloc_mem(SIZE2 * sizeof(int), MPI_INFO_NULL, &B);
+        if (i) {
+            printf("Can't allocate memory in test program\n");
+            MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+
+        if (rank == 0) {
+            for (i=0; i<SIZE2; i++) A[i] = B[i] = i;
+            MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, CommDeuce, &win);
+
+            for (i=0; i<SIZE1; i++) {
+                MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win);
+                MPI_Put(A+i, 1, MPI_INT, 1, i, 1, MPI_INT, win);
+                MPI_Win_unlock(1, win);
+            }
+
+            for (i=0; i<SIZE1; i++) {
+                MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win);
+                MPI_Get(B+i, 1, MPI_INT, 1, SIZE1+i, 1, MPI_INT, win);
+                MPI_Win_unlock(1, win);
+            }
+
+            MPI_Win_free(&win);
+
+            for (i=0; i<SIZE1; i++)
+                if (B[i] != (-4)*(i+SIZE1)) {
+                    SQUELCH( printf("Get Error: B[%d] is %d, should be %d\n", i, B[i], (-4)*(i+SIZE1)); );
+                    errs++;
+                }
+        }
+        else {  /* rank=1 */
+            for (i=0; i<SIZE2; i++) B[i] = (-4)*i;
+            MPI_Win_create(B, SIZE2*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
+
+            MPI_Win_free(&win);
+
+            for (i=0; i<SIZE1; i++) {
+                if (B[i] != i) {
+                    SQUELCH( printf("Put Error: B[%d] is %d, should be %d\n", i, B[i], i); );
+                    errs++;
+                }
+            }
+        }
+
+        MPI_Free_mem(A);
+        MPI_Free_mem(B);
+    }
+    MPI_Comm_free(&CommDeuce);
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/rma/test5.c b/teshsuite/smpi/mpich3-test/rma/test5.c
new file mode 100644
index 0000000000..4cc02cf5e7
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/test5.c
@@ -0,0 +1,74 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "mpitest.h"
+#include "squelch.h"
+
+/* tests a series of Gets. Run on 2 processes. */
+
+#define SIZE 2000
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, nprocs, i, A[SIZE], B[SIZE];
+    MPI_Comm CommDeuce;
+    MPI_Win win;
+    int errs = 0;
+
+    MTest_Init(&argc,&argv); 
+
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank); 
+
+    if (nprocs < 2) {
+        printf("Run this program with 2 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);
+
+    if (rank < 2) {
+        if (rank == 0) {
+            for (i=0; i<SIZE; i++)
+                B[i] = 500 + i;
+            MPI_Win_create(B, SIZE*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
+            MPI_Win_fence(0, win);
+            for (i=0; i<SIZE; i++) {
+                A[i] = i+100;
+                MPI_Get(&A[i], 1, MPI_INT, 1, i, 1, MPI_INT, win);
+            }
+            MPI_Win_fence(0, win);
+            for (i=0; i<SIZE; i++)
+                if (A[i] != 1000 + i) {
+                    SQUELCH( printf("Rank 0: A[%d] is %d, should be %d\n", i, A[i], 1000+i); );
+                    errs++;
+                }
+        }
+        if (rank == 1) {
+            for (i=0; i<SIZE; i++)
+                A[i] = 1000 + i;
+            MPI_Win_create(A, SIZE*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
+            MPI_Win_fence(0, win);
+            for (i=0; i<SIZE; i++) {
+                B[i] = i+200;
+                MPI_Get(&B[i], 1, MPI_INT, 0, i, 1, MPI_INT, win);
+            }
+            MPI_Win_fence(0, win);
+            for (i=0; i<SIZE; i++)
+                if (B[i] != 500 + i) {
+                    SQUELCH( printf("Rank 1: B[%d] is %d, should be %d\n", i, B[i], 500+i); );
+                    errs++;
+                }
+        }
+
+        MPI_Win_free(&win);
+    }
+    MPI_Comm_free(&CommDeuce);
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/rma/test5_am.c b/teshsuite/smpi/mpich3-test/rma/test5_am.c
new file mode 100644
index 0000000000..f0482d357b
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/test5_am.c
@@ -0,0 +1,92 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "mpitest.h"
+#include "squelch.h"
+
+/* tests a series of Gets. Run on 2 processes. */
+
+/* same as test5.c but uses alloc_mem */
+
+#define SIZE 2000
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, nprocs, i, *A, *B;
+    MPI_Comm CommDeuce;
+    MPI_Win win;
+    int errs = 0;
+
+    MTest_Init(&argc,&argv); 
+
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank); 
+
+    if (nprocs < 2) {
+        printf("Run this program with 2 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);
+
+    if (rank < 2) {
+        i = MPI_Alloc_mem(SIZE * sizeof(int), MPI_INFO_NULL, &A);
+        if (i) {
+            printf("Can't allocate memory in test program\n");
+            MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        i = MPI_Alloc_mem(SIZE * sizeof(int), MPI_INFO_NULL, &B);
+        if (i) {
+            printf("Can't allocate memory in test program\n");
+            MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+
+        if (rank == 0) {
+            for (i=0; i<SIZE; i++)
+                B[i] = 500 + i;
+            MPI_Win_create(B, SIZE*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
+            MPI_Win_fence(0, win);
+            for (i=0; i<SIZE; i++) {
+                A[i] = i+100;
+                MPI_Get(&A[i], 1, MPI_INT, 1, i, 1, MPI_INT, win);
+            }
+            MPI_Win_fence(0, win);
+            for (i=0; i<SIZE; i++)
+                if (A[i] != 1000 + i) {
+                    SQUELCH( printf("Rank 0: A[%d] is %d, should be %d\n", i, A[i], 1000+i); );
+                    errs++;
+                }
+        }
+        if (rank == 1) {
+            for (i=0; i<SIZE; i++)
+                A[i] = 1000 + i;
+            MPI_Win_create(A, SIZE*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
+            MPI_Win_fence(0, win);
+            for (i=0; i<SIZE; i++) {
+                B[i] = i+200;
+                MPI_Get(&B[i], 1, MPI_INT, 0, i, 1, MPI_INT, win);
+            }
+            MPI_Win_fence(0, win);
+            for (i=0; i<SIZE; i++)
+                if (B[i] != 500 + i) {
+                    SQUELCH( printf("Rank 1: B[%d] is %d, should be %d\n", i, B[i], 500+i); );
+                    errs++;
+                }
+        }
+
+        MPI_Win_free(&win);
+
+        MPI_Free_mem(A);
+        MPI_Free_mem(B);
+
+    }
+
+    MPI_Comm_free(&CommDeuce);
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/rma/testlist b/teshsuite/smpi/mpich3-test/rma/testlist
new file mode 100644
index 0000000000..58b96b9275
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/testlist
@@ -0,0 +1,125 @@
+#Needs Win set name, win_get_name needed
+#winname 2
+allocmem 2
+putfence1 4
+putfidx 4
+getfence1 4
+#Accumulate + datatypes=Bug
+#accfence1 4
+#Needs lock, unlock
+#adlb_mimic1 3
+accfence2 4
+#Needs post
+#putpscw1 4
+#accpscw1 4
+#Needs get_group
+#getgroup 4
+transpose1 2
+transpose2 2
+#Needs post/start
+#transpose3 2
+#Needs lock/unlock
+#transpose5 2
+#accum on complex datatypes
+#transpose6 1
+transpose7 2
+test1 2
+#Needs post/start
+#test2 2
+#test3 2
+#Needs lock, unlock
+#test4 2
+test5 2
+#Needs lock, unlock
+#lockcontention 3
+#lockcontention2 4
+#lockcontention2 8
+#lockcontention3 8
+#lockopts 2
+#transpose4 2
+#fetchandadd 7
+#fetchandadd_tree 7
+#Needs start, complete
+#wintest 2
+#Needs lock, unlock
+#contig_displ 1
+test1_am 2
+#test2_am 2
+#test3_am 2
+#test4_am 2
+test5_am 2
+#fetchandadd_am 7
+#fetchandadd_tree_am 7
+accfence2_am 4
+#Accumulate + datatypes=Bug
+#test1_dt 2 timeLimit=30
+#Needs post/start
+#nullpscw 7
+#Needs win_attr
+#attrorderwin 1
+#Needs MPI_Win_call_errhandler
+#wincall 2
+#Needs win_attr
+#baseattrwin 1
+#Needs MPI_Win_create_keyval
+#fkeyvalwin 1
+#Needs lock, unlock
+#selfrma 1
+#mixedsync 4
+epochtest 4
+#Needs lock, unlock
+#locknull 2
+#Needs MPI_Rput, rget, racumulate,  MPI_Fetch_and_op, MPI_Compare_and_swap
+#rmanull 2
+#rmazero 2
+#Needs lock, unlock
+#strided_acc_indexed 2
+#strided_acc_onelock 2
+#strided_acc_subarray 2
+#strided_get_indexed 2
+#strided_putget_indexed 4
+#strided_putget_indexed_shared 4 mpiversion=3.0
+#strided_getacc_indexed 4 mpiversion=3.0
+#strided_getacc_indexed_shared 4 mpiversion=3.0
+window_creation 2
+#Needs lock, unlock
+#contention_put 4
+#contention_putget 4
+#put_base 2
+#put_bottom 2
+#win_flavors 4 mpiversion=3.0
+#manyrma2 2 timeLimit=500
+#win_shared 4 mpiversion=3.0
+#win_shared_noncontig 4 mpiversion=3.0
+#win_shared_noncontig_put 4 mpiversion=3.0
+#win_dynamic_acc 4 mpiversion=3.0
+#get_acc_local 1 mpiversion=3.0
+#linked_list 4 mpiversion=3.0
+#linked_list_fop 4 mpiversion=3.0
+#compare_and_swap 4 mpiversion=3.0
+#fetch_and_op_char 4 mpiversion=3.0
+#fetch_and_op_short 4 mpiversion=3.0
+#fetch_and_op_int 4 mpiversion=3.0
+#fetch_and_op_long 4 mpiversion=3.0
+#fetch_and_op_double 4 mpiversion=3.0
+#fetch_and_op_long_double 4 mpiversion=3.0
+#get_accumulate_double 4 mpiversion=3.0
+#get_accumulate_double_derived 4 mpiversion=3.0
+#get_accumulate_int 4 mpiversion=3.0
+#get_accumulate_int_derived 4 mpiversion=3.0
+#get_accumulate_long 4 mpiversion=3.0
+#get_accumulate_long_derived 4 mpiversion=3.0
+#get_accumulate_short 4 mpiversion=3.0
+#get_accumulate_short_derived 4 mpiversion=3.0
+#flush 4 mpiversion=3.0
+#reqops 4 mpiversion=3.0
+#req_example 4 mpiversion=3.0
+#win_info 4 mpiversion=3.0
+#linked_list_lockall 4 mpiversion=3.0
+#pscw_ordering 4 mpiversion=3.0
+#linked_list_bench_lock_all 4 mpiversion=3.0
+#linked_list_bench_lock_excl 4 mpiversion=3.0
+#linked_list_bench_lock_shr 4 mpiversion=3.0
+#linked_list_bench_lock_shr_nocheck 4 mpiversion=3.0
+#mutex_bench 4 mpiversion=3.0
+#mutex_bench_shared 4 mpiversion=3.0
diff --git a/teshsuite/smpi/mpich3-test/rma/transpose1.c b/teshsuite/smpi/mpich3-test/rma/transpose1.c
new file mode 100644
index 0000000000..908ecc0820
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/transpose1.c
@@ -0,0 +1,109 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include <stdlib.h>
+#include "mpitest.h"
+
+/* transposes a matrix using put, fence, and derived datatypes. Uses
+   vector and hvector (Example 3.32 from MPI 1.1 Standard). Run on
+   2 processes */
+
+#define NROWS 1000
+#define NCOLS 1000
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, nprocs, **A, *A_data, i, j;
+    MPI_Comm CommDeuce;
+    MPI_Win win;
+    MPI_Datatype column, xpose;
+    int errs = 0;
+
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (nprocs < 2) {
+        printf("Run this program with 2 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);
+
+    if (rank < 2) {
+        A_data = (int *) malloc(NROWS * NCOLS * sizeof(int));
+        A = (int **) malloc(NROWS * sizeof(int *));
+
+        A[0] = A_data;
+        for (i=1; i<NROWS; i++)
+            A[i] = A[i-1] + NCOLS;
+
+        if (rank == 0)
+        {
+            for (i=0; i<NROWS; i++)
+                for (j=0; j<NCOLS; j++)
+                    A[i][j] = i*NCOLS + j;
+
+            /* create datatype for one column */
+            MPI_Type_vector(NROWS, 1, NCOLS, MPI_INT, &column);
+            /* create datatype for matrix in column-major order */
+            MPI_Type_hvector(NCOLS, 1, sizeof(int), column, &xpose);
+            MPI_Type_commit(&xpose);
+
+            MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, CommDeuce, &win);
+
+            MPI_Win_fence(0, win);
+
+            MPI_Put(&A[0][0], NROWS*NCOLS, MPI_INT, 1, 0, 1, xpose, win);
+
+            MPI_Type_free(&column);
+            MPI_Type_free(&xpose);
+
+            MPI_Win_fence(0, win);
+        }
+        else if (rank == 1)
+        {
+            for (i=0; i<NROWS; i++)
+                for (j=0; j<NCOLS; j++)
+                    A[i][j] = -1;
+            MPI_Win_create(&A[0][0], NROWS*NCOLS*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
+            MPI_Win_fence(0, win);
+
+            MPI_Win_fence(0, win);
+
+            for (j=0; j<NCOLS; j++)
+            {
+                for (i=0; i<NROWS; i++)
+                {
+                    if (A[j][i] != i*NCOLS + j)
+                    {
+                        if (errs < 50)
+                        {
+                            printf("Error: A[%d][%d]=%d should be %d\n", j, i,
+                                   A[j][i], i*NCOLS + j);
+                        }
+                        errs++;
+                    }
+                }
+            }
+            if (errs >= 50)
+            {
+                printf("Total number of errors: %d\n", errs);
+            }
+        }
+
+        MPI_Win_free(&win);
+
+        free(A_data);
+        free(A);
+
+    }
+    MPI_Comm_free(&CommDeuce);
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/rma/transpose2.c b/teshsuite/smpi/mpich3-test/rma/transpose2.c
new file mode 100644
index 0000000000..bfb30c4f80
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/transpose2.c
@@ -0,0 +1,107 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "mpitest.h"
+
+/* transposes a matrix using put, fence, and derived
+   datatypes. Uses vector and struct (Example 3.33 from MPI 1.1
+   Standard). We could use vector and type_create_resized instead. Run
+   on 2 processes */ 
+
+#define NROWS 100
+#define NCOLS 100
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, nprocs, A[NROWS][NCOLS], i, j, blocklen[2];
+    MPI_Comm CommDeuce;
+    MPI_Aint disp[2];
+    MPI_Win win;
+    MPI_Datatype column, column1, type[2];
+    int errs=0;
+
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (nprocs < 2) {
+        printf("Run this program with 2 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);
+
+    if (rank < 2) {
+        if (rank == 0)
+        {
+            for (i=0; i<NROWS; i++)
+                for (j=0; j<NCOLS; j++)
+                    A[i][j] = i*NCOLS + j;
+
+            /* create datatype for one column */
+            MPI_Type_vector(NROWS, 1, NCOLS, MPI_INT, &column);
+
+            /* create datatype for one column, with the extent of one
+           integer. we could use type_create_resized instead. */
+            disp[0] = 0;
+            disp[1] = sizeof(int);
+            type[0]  = column;
+            type[1]  = MPI_UB;
+            blocklen[0]  = 1;
+            blocklen[1]  = 1;
+            MPI_Type_struct(2, blocklen, disp, type, &column1);
+            MPI_Type_commit(&column1);
+
+            MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, CommDeuce, &win);
+
+            MPI_Win_fence(0, win);
+
+            MPI_Put(A, NROWS*NCOLS, MPI_INT, 1, 0, NCOLS, column1, win);
+
+            MPI_Type_free(&column);
+            MPI_Type_free(&column1);
+
+            MPI_Win_fence(0, win);
+        }
+        else
+        { /* rank=1 */
+            for (i=0; i<NROWS; i++)
+                for (j=0; j<NCOLS; j++)
+                    A[i][j] = -1;
+            MPI_Win_create(A, NROWS*NCOLS*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
+            MPI_Win_fence(0, win);
+
+            MPI_Win_fence(0, win);
+
+            for (j=0; j<NCOLS; j++)
+            {
+                for (i=0; i<NROWS; i++)
+                {
+                    if (A[j][i] != i*NCOLS + j)
+                    {
+                        if (errs < 50)
+                        {
+                            printf("Error: A[%d][%d]=%d should be %d\n", j, i,
+                                   A[j][i], i*NCOLS + j);
+                        }
+                        errs++;
+                    }
+                }
+            }
+            if (errs >= 50)
+            {
+                printf("Total number of errors: %d\n", errs);
+            }
+        }
+        MPI_Win_free(&win);
+    }
+
+    MPI_Comm_free(&CommDeuce);
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/rma/transpose3.c b/teshsuite/smpi/mpich3-test/rma/transpose3.c
new file mode 100644
index 0000000000..86ef3d5ff2
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/transpose3.c
@@ -0,0 +1,107 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "mpitest.h"
+#include "squelch.h"
+
+/* transposes a matrix using post/start/complete/wait and derived
+   datatypes. Uses  vector and hvector (Example 3.32 from MPI 1.1
+   Standard). Run on 2 processes */
+
+#define NROWS 100
+#define NCOLS 100
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, nprocs, A[NROWS][NCOLS], i, j, destrank;
+    MPI_Comm CommDeuce;
+    MPI_Win win;
+    MPI_Datatype column, xpose;
+    MPI_Group comm_group, group;
+    int errs=0;
+
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (nprocs < 2) {
+        printf("Run this program with 2 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);
+
+    if (rank < 2)
+    {
+        MPI_Comm_group(CommDeuce, &comm_group);
+
+        if (rank == 0)
+        {
+            for (i=0; i<NROWS; i++)
+                for (j=0; j<NCOLS; j++)
+                    A[i][j] = i*NCOLS + j;
+
+            /* create datatype for one column */
+            MPI_Type_vector(NROWS, 1, NCOLS, MPI_INT, &column);
+            /* create datatype for matrix in column-major order */
+            MPI_Type_hvector(NCOLS, 1, sizeof(int), column, &xpose);
+            MPI_Type_commit(&xpose);
+
+            MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, CommDeuce, &win);
+
+            destrank = 1;
+            MPI_Group_incl(comm_group, 1, &destrank, &group);
+            MPI_Win_start(group, 0, win);
+
+            MPI_Put(A, NROWS*NCOLS, MPI_INT, 1, 0, 1, xpose, win);
+
+            MPI_Type_free(&column);
+            MPI_Type_free(&xpose);
+
+            MPI_Win_complete(win);
+        }
+        else
+        { /* rank=1 */
+            for (i=0; i<NROWS; i++)
+                for (j=0; j<NCOLS; j++)
+                    A[i][j] = -1;
+            MPI_Win_create(A, NROWS*NCOLS*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
+            destrank = 0;
+            MPI_Group_incl(comm_group, 1, &destrank, &group);
+            MPI_Win_post(group, 0, win);
+            MPI_Win_wait(win);
+
+            for (j=0; j<NCOLS; j++)
+            {
+                for (i=0; i<NROWS; i++)
+                {
+                    if (A[j][i] != i*NCOLS + j)
+                    {
+                        if (errs < 50)
+                        {
+                            SQUELCH( printf("Error: A[%d][%d]=%d should be %d\n", j, i,
+                                            A[j][i], i*NCOLS + j); );
+                        }
+                        errs++;
+                    }
+                }
+            }
+            if (errs >= 50)
+            {
+                printf("Total number of errors: %d\n", errs);
+            }
+        }
+
+        MPI_Group_free(&group);
+        MPI_Group_free(&comm_group);
+        MPI_Win_free(&win);
+    }
+    MPI_Comm_free(&CommDeuce);
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/rma/transpose4.c b/teshsuite/smpi/mpich3-test/rma/transpose4.c
new file mode 100644
index 0000000000..6e81c43de7
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/transpose4.c
@@ -0,0 +1,84 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "mpitest.h"
+
+/* transposes a matrix using passive target RMA and derived
+   datatypes. Uses  vector and hvector (Example 3.32 from MPI 1.1
+   Standard). Run on 2 processes. */
+
+#define NROWS 100
+#define NCOLS 100
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, nprocs, A[NROWS][NCOLS], i, j;
+    MPI_Comm CommDeuce;
+    MPI_Win win;
+    MPI_Datatype column, xpose;
+    int errs = 0;
+
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (nprocs < 2) {
+        printf("Run this program with 2 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);
+
+    if (rank < 2)
+    {
+        if (rank == 0) {
+            for (i=0; i<NROWS; i++)
+                for (j=0; j<NCOLS; j++)
+                    A[i][j] = i*NCOLS + j;
+
+            /* create datatype for one column */
+            MPI_Type_vector(NROWS, 1, NCOLS, MPI_INT, &column);
+            /* create datatype for matrix in column-major order */
+            MPI_Type_hvector(NCOLS, 1, sizeof(int), column, &xpose);
+            MPI_Type_commit(&xpose);
+
+            MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, CommDeuce, &win);
+
+            MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win);
+
+            MPI_Put(A, NROWS*NCOLS, MPI_INT, 1, 0, 1, xpose, win);
+
+            MPI_Type_free(&column);
+            MPI_Type_free(&xpose);
+
+            MPI_Win_unlock(1, win);
+            MPI_Win_free(&win);
+        }
+        else
+        { /* rank=1 */
+            for (i=0; i<NROWS; i++) 
+                for (j=0; j<NCOLS; j++)
+                    A[i][j] = -1;
+            MPI_Win_create(A, NROWS*NCOLS*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
+
+            MPI_Win_free(&win);
+
+            for (j=0; j<NCOLS; j++)
+                for (i=0; i<NROWS; i++)
+                    if (A[j][i] != i*NCOLS + j) {
+                        printf("Error: A[%d][%d]=%d should be %d\n", j, i,
+                               A[j][i], i*NCOLS + j);
+                        errs++;
+                    }
+        }
+    }
+
+    MPI_Comm_free(&CommDeuce);
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/rma/transpose5.c b/teshsuite/smpi/mpich3-test/rma/transpose5.c
new file mode 100644
index 0000000000..2ae63bdc0f
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/transpose5.c
@@ -0,0 +1,111 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include <stdlib.h>
+#include "mpitest.h"
+
+/* This does a transpose-cum-accumulate operation. Uses  vector and
+   hvector datatypes (Example 3.32 from MPI 1.1 Standard). Run on 2
+   processes */ 
+
+#define NROWS 1000
+#define NCOLS 1000
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, nprocs, **A, *A_data, i, j;
+    MPI_Comm CommDeuce;
+    MPI_Win win;
+    MPI_Datatype column, xpose;
+    int errs = 0;
+
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (nprocs < 2) {
+        printf("Run this program with 2 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);
+
+    if (rank < 2)
+    {
+        A_data = (int *) malloc(NROWS * NCOLS * sizeof(int));
+        A = (int **) malloc(NROWS * sizeof(int *));
+
+        A[0] = A_data;
+        for (i=1; i<NROWS; i++)
+            A[i] = A[i-1] + NCOLS;
+
+        if (rank == 0)
+        {
+            for (i=0; i<NROWS; i++)
+                for (j=0; j<NCOLS; j++)
+                    A[i][j] = i*NCOLS + j;
+
+            /* create datatype for one column */
+            MPI_Type_vector(NROWS, 1, NCOLS, MPI_INT, &column);
+            /* create datatype for matrix in column-major order */
+            MPI_Type_hvector(NCOLS, 1, sizeof(int), column, &xpose);
+            MPI_Type_commit(&xpose);
+
+            MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, CommDeuce, &win);
+
+            MPI_Win_fence(0, win);
+
+            MPI_Accumulate(&A[0][0], NROWS*NCOLS, MPI_INT, 1, 0, 1, xpose, MPI_SUM, win);
+
+            MPI_Type_free(&column);
+            MPI_Type_free(&xpose);
+
+            MPI_Win_fence(0, win);
+        }
+        else
+        { /* rank=1 */
+            for (i=0; i<NROWS; i++)
+                for (j=0; j<NCOLS; j++)
+                    A[i][j] = i*NCOLS + j;
+            MPI_Win_create(&A[0][0], NROWS*NCOLS*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
+            MPI_Win_fence(0, win);
+
+            MPI_Win_fence(0, win);
+
+            for (j=0; j<NCOLS; j++)
+            {
+                for (i=0; i<NROWS; i++)
+                {
+                    if (A[j][i] != i*NCOLS + j + j*NCOLS + i)
+                    {
+                        if (errs < 50)
+                        {
+                            printf("Error: A[%d][%d]=%d should be %d\n", j, i,
+                                   A[j][i], i*NCOLS + j + j*NCOLS + i);
+                        }
+                        errs++;
+                    }
+                }
+            }
+            if (errs >= 50)
+            {
+                printf("Total number of errors: %d\n", errs);
+            }
+        }
+
+        MPI_Win_free(&win);
+
+        free(A_data);
+        free(A);
+
+    }
+
+    MPI_Comm_free(&CommDeuce);
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/rma/transpose6.c b/teshsuite/smpi/mpich3-test/rma/transpose6.c
new file mode 100644
index 0000000000..09471dae9c
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/transpose6.c
@@ -0,0 +1,76 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "mpitest.h"
+
+/* This does a local transpose-cum-accumulate operation. Uses 
+   vector and hvector datatypes (Example 3.32 from MPI 1.1
+   Standard). Run on 1 process. */
+
+#define NROWS 100
+#define NCOLS 100
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, nprocs, A[NROWS][NCOLS], B[NROWS][NCOLS], i, j;
+    MPI_Win win;
+    MPI_Datatype column, xpose;
+    int errs = 0;
+ 
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+
+    if (rank==0)
+    {
+        for (i=0; i<NROWS; i++)
+            for (j=0; j<NCOLS; j++)
+                A[i][j] = B[i][j] = i*NCOLS + j;
+        
+        /* create datatype for one column */
+        MPI_Type_vector(NROWS, 1, NCOLS, MPI_INT, &column);
+        /* create datatype for matrix in column-major order */
+        MPI_Type_hvector(NCOLS, 1, sizeof(int), column, &xpose);
+        MPI_Type_commit(&xpose);
+        
+        MPI_Win_create(B, NROWS*NCOLS*sizeof(int), sizeof(int), MPI_INFO_NULL, MPI_COMM_SELF, &win); 
+        
+        MPI_Win_fence(0, win); 
+        
+        MPI_Accumulate(A, NROWS*NCOLS, MPI_INT, 0, 0, 1, xpose, MPI_SUM, win);
+        
+        MPI_Type_free(&column);
+        MPI_Type_free(&xpose);
+        
+        MPI_Win_fence(0, win); 
+        
+        for (j=0; j<NCOLS; j++)
+        {
+            for (i=0; i<NROWS; i++)
+ 	   {
+                if (B[j][i] != i*NCOLS + j + j*NCOLS + i)
+ 	       {
+ 	   	if (errs < 20)
+ 	   	{
+ 	   	    printf("Error: B[%d][%d]=%d should be %d\n", j, i,
+ 	   		B[j][i], i*NCOLS + j + j*NCOLS + i);
+ 	   	}
+                    errs++;
+                }
+ 	   }
+        }
+        if (errs >= 20)
+        {
+ 	   printf("Total number of errors: %d\n", errs);
+        }
+ 
+        MPI_Win_free(&win); 
+    }
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/rma/transpose7.c b/teshsuite/smpi/mpich3-test/rma/transpose7.c
new file mode 100644
index 0000000000..d78b2dd0f8
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/transpose7.c
@@ -0,0 +1,105 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpitest.h"
+
+/* This does a transpose with a get operation, fence, and derived
+   datatypes. Uses vector and hvector (Example 3.32 from MPI 1.1
+   Standard). Run on 2 processes */
+
+#define NROWS 1000
+#define NCOLS 1000
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, nprocs, **A, *A_data, i, j;
+    MPI_Comm CommDeuce;
+    MPI_Win win;
+    MPI_Datatype column, xpose;
+    int errs = 0;
+
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (nprocs < 2) {
+        printf("Run this program with 2 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);
+
+    if (rank < 2)
+    {
+        A_data = (int *) malloc(NROWS * NCOLS * sizeof(int));
+        A = (int **) malloc(NROWS * sizeof(int *));
+
+        A[0] = A_data;
+        for (i=1; i<NROWS; i++)
+            A[i] = A[i-1] + NCOLS;
+
+        if (rank == 0)
+        {
+            for (i=0; i<NROWS; i++)
+                for (j=0; j<NCOLS; j++)
+                    A[i][j] = -1;
+
+            /* create datatype for one column */
+            MPI_Type_vector(NROWS, 1, NCOLS, MPI_INT, &column);
+            /* create datatype for matrix in column-major order */
+            MPI_Type_hvector(NCOLS, 1, sizeof(int), column, &xpose);
+            MPI_Type_commit(&xpose);
+
+            MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, CommDeuce, &win);
+
+            MPI_Win_fence(0, win);
+
+            MPI_Get(&A[0][0], NROWS*NCOLS, MPI_INT, 1, 0, 1, xpose, win);
+
+            MPI_Type_free(&column);
+            MPI_Type_free(&xpose);
+
+            MPI_Win_fence(0, win);
+
+            for (j=0; j<NCOLS; j++)
+            {
+                for (i=0; i<NROWS; i++)
+                {
+                    if (A[j][i] != i*NCOLS + j)
+                    {
+                        if (errs < 50)
+                        {
+                            printf("Error: A[%d][%d]=%d should be %d\n", j, i,
+                                   A[j][i], i*NCOLS + j);
+                        }
+                        errs++;
+                    }
+                }
+            }
+            if (errs >= 50)
+            {
+                printf("Total number of errors: %d\n", errs);
+            }
+        }
+        else
+        {
+            for (i=0; i<NROWS; i++)
+                for (j=0; j<NCOLS; j++)
+                    A[i][j] = i*NCOLS + j;
+
+            MPI_Win_create(&A[0][0], NROWS*NCOLS*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
+            MPI_Win_fence(0, win);
+            MPI_Win_fence(0, win);
+        }
+        MPI_Win_free(&win);
+    }
+    MPI_Comm_free(&CommDeuce);
+    MTest_Finalize(errs);
+    MPI_Finalize(); 
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/rma/win_dynamic_acc.c b/teshsuite/smpi/mpich3-test/rma/win_dynamic_acc.c
new file mode 100644
index 0000000000..ebb0a35fc1
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/win_dynamic_acc.c
@@ -0,0 +1,65 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2012 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <mpi.h>
+#include "mpitest.h"
+
+#define ITER 100
+
+const int verbose = 0;
+
+int main(int argc, char **argv) {
+    int       i, j, rank, nproc;
+    int       errors = 0, all_errors = 0;
+    int       val = 0, one = 1;
+    MPI_Aint *val_ptrs;
+    MPI_Win   dyn_win;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    val_ptrs = malloc(nproc * sizeof(MPI_Aint));
+    MPI_Get_address(&val, &val_ptrs[rank]);
+
+    MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, val_ptrs, 1, MPI_AINT,
+                  MPI_COMM_WORLD);
+
+    MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &dyn_win);
+    MPI_Win_attach(dyn_win, &one, sizeof(int));
+
+    for (i = 0; i < ITER; i++) {
+            MPI_Win_fence(MPI_MODE_NOPRECEDE, dyn_win);
+            MPI_Accumulate(&one, 1, MPI_INT, i%nproc, val_ptrs[i%nproc], 1, MPI_INT, MPI_SUM, dyn_win);
+            MPI_Win_fence(MPI_MODE_NOSUCCEED, dyn_win);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Read and verify my data */
+    if ( val != ITER ) {
+        errors++;
+        printf("%d -- Got %d, expected %d\n", rank, val, ITER);
+    }
+
+    MPI_Win_detach(dyn_win, &one);
+    MPI_Win_free(&dyn_win);
+
+    MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+
+    if (rank == 0 && all_errors == 0)
+        printf(" No Errors\n");
+
+    free(val_ptrs);
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/win_flavors.c b/teshsuite/smpi/mpich3-test/rma/win_flavors.c
new file mode 100644
index 0000000000..c5179c4f28
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/win_flavors.c
@@ -0,0 +1,122 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include <mpi.h>
+#include <stdio.h>
+#include "mpitest.h"
+
+#define ELEM_SIZE 8
+
+int main( int argc, char *argv[] )
+{
+    int     rank;
+    int     errors = 0, all_errors = 0;
+    int    *flavor, *model, flag;
+    void   *buf;
+    MPI_Win window;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    /** Create using MPI_Win_create() **/
+
+    if (rank > 0)
+      MPI_Alloc_mem(rank*ELEM_SIZE, MPI_INFO_NULL, &buf);
+    else
+      buf = NULL;
+
+    MPI_Win_create(buf, rank*ELEM_SIZE, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &window);
+    MPI_Win_get_attr(window, MPI_WIN_CREATE_FLAVOR, &flavor, &flag);
+
+    if (!flag) {
+      printf("%d: MPI_Win_create - Error, no flavor\n", rank);
+      errors++;
+    } else if (*flavor != MPI_WIN_FLAVOR_CREATE) {
+      printf("%d: MPI_Win_create - Error, bad flavor (%d)\n", rank, *flavor);
+      errors++;
+    }
+
+    MPI_Win_get_attr(window, MPI_WIN_MODEL, &model, &flag);
+
+    if (!flag) {
+      printf("%d: MPI_Win_create - Error, no model\n", rank);
+      errors++;
+    } else if ( ! (*model == MPI_WIN_SEPARATE || *model == MPI_WIN_UNIFIED) ) {
+      printf("%d: MPI_Win_create - Error, bad model (%d)\n", rank, *model);
+      errors++;
+    }
+
+    MPI_Win_free(&window);
+
+    if (buf)
+      MPI_Free_mem(buf);
+
+    /** Create using MPI_Win_allocate() **/
+
+    MPI_Win_allocate(rank*ELEM_SIZE, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &buf, &window);
+
+    if (rank > 0 && buf == NULL) {
+      printf("%d: MPI_Win_allocate - Error, bad base pointer\n", rank);
+      errors++;
+    }
+
+    MPI_Win_get_attr(window, MPI_WIN_CREATE_FLAVOR, &flavor, &flag);
+
+    if (!flag) {
+      printf("%d: MPI_Win_allocate - Error, no flavor\n", rank);
+      errors++;
+    } else if (*flavor != MPI_WIN_FLAVOR_ALLOCATE) {
+      printf("%d: MPI_Win_allocate - Error, bad flavor (%d)\n", rank, *flavor);
+      errors++;
+    }
+
+    MPI_Win_get_attr(window, MPI_WIN_MODEL, &model, &flag);
+
+    if (!flag) {
+      printf("%d: MPI_Win_allocate - Error, no model\n", rank);
+      errors++;
+    } else if (*model != MPI_WIN_SEPARATE && *model != MPI_WIN_UNIFIED) {
+      printf("%d: MPI_Win_allocate - Error, bad model (%d)\n", rank, *model);
+      errors++;
+    }
+
+    MPI_Win_free(&window);
+
+    /** Create using MPI_Win_create_dynamic() **/
+
+    MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &window);
+
+    MPI_Win_get_attr(window, MPI_WIN_CREATE_FLAVOR, &flavor, &flag);
+
+    if (!flag) {
+      printf("%d: MPI_Win_create_dynamic - Error, no flavor\n", rank);
+      errors++;
+    } else if (*flavor != MPI_WIN_FLAVOR_DYNAMIC) {
+      printf("%d: MPI_Win_create_dynamic - Error, bad flavor (%d)\n", rank, *flavor);
+      errors++;
+    }
+
+    MPI_Win_get_attr(window, MPI_WIN_MODEL, &model, &flag);
+
+    if (!flag) {
+      printf("%d: MPI_Win_create_dynamic - Error, no model\n", rank);
+      errors++;
+    } else if (*model != MPI_WIN_SEPARATE && *model != MPI_WIN_UNIFIED) {
+      printf("%d: MPI_Win_create_dynamic - Error, bad model (%d)\n", rank, *model);
+      errors++;
+    }
+
+    MPI_Win_free(&window);
+
+    MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+
+    if (rank == 0 && all_errors == 0)
+        printf(" No Errors\n");
+
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/win_info.c b/teshsuite/smpi/mpich3-test/rma/win_info.c
new file mode 100644
index 0000000000..44286a9182
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/win_info.c
@@ -0,0 +1,72 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2012 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <mpi.h>
+#include "mpitest.h"
+
+#define VERBOSE 0
+
+int main(int argc, char **argv) {
+    int      i, j, rank, nproc;
+    MPI_Info info_in, info_out;
+    int      errors = 0, all_errors = 0;
+    MPI_Win  win;
+    void    *base;
+    char     invalid_key[] = "invalid_test_key";
+    char     buf[MPI_MAX_INFO_VAL];
+    int      flag;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    MPI_Info_create(&info_in);
+    MPI_Info_set(info_in, invalid_key, "true");
+
+    MPI_Win_allocate(sizeof(int), sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD, &base, &win);
+
+    MPI_Win_set_info(win, info_in);
+    MPI_Win_get_info(win, &info_out);
+
+    MPI_Info_get(info_out, invalid_key, MPI_MAX_INFO_VAL, buf, &flag);
+#ifndef USE_STRICT_MPI
+    /* Check if our invalid key was ignored.  Note, this check's MPICH's
+     * behavior, but this behavior may not be required for a standard
+     * conforming MPI implementation. */
+    if (flag) {
+        printf("%d: %s was not ignored\n", rank, invalid_key);
+        errors++;
+    }
+#endif
+
+    MPI_Info_get(info_out, "no_locks", MPI_MAX_INFO_VAL, buf, &flag);
+    if (flag && VERBOSE) printf("%d: no_locks = %s\n", rank, buf);
+
+    MPI_Info_get(info_out, "accumulate_ordering", MPI_MAX_INFO_VAL, buf, &flag);
+    if (flag && VERBOSE) printf("%d: accumulate_ordering = %s\n", rank, buf);
+
+    MPI_Info_get(info_out, "accumulate_ops", MPI_MAX_INFO_VAL, buf, &flag);
+    if (flag && VERBOSE) printf("%d: accumulate_ops = %s\n", rank, buf);
+
+    MPI_Info_get(info_out, "same_size", MPI_MAX_INFO_VAL, buf, &flag);
+    if (flag && VERBOSE) printf("%d: same_size = %s\n", rank, buf);
+
+    MPI_Info_free(&info_in);
+    MPI_Info_free(&info_out);
+    MPI_Win_free(&win);
+
+    MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+
+    if (rank == 0 && all_errors == 0)
+        printf(" No Errors\n");
+
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/win_shared.c b/teshsuite/smpi/mpich3-test/rma/win_shared.c
new file mode 100644
index 0000000000..b4e1f6ced7
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/win_shared.c
@@ -0,0 +1,88 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2012 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <mpi.h>
+#include "mpitest.h"
+
+#define ELEM_PER_PROC 10000
+
+const int verbose = 0;
+
+int main(int argc, char **argv) {
+    int      i, j, rank, nproc;
+    int      shm_rank, shm_nproc;
+    MPI_Aint size;
+    int      errors = 0, all_errors = 0;
+    int     *base, *my_base;
+    int      disp_unit;
+    MPI_Win  shm_win;
+    MPI_Comm shm_comm;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &shm_comm);
+
+    MPI_Comm_rank(shm_comm, &shm_rank);
+    MPI_Comm_size(shm_comm, &shm_nproc);
+
+    /* Allocate ELEM_PER_PROC integers for each process */
+    MPI_Win_allocate_shared(sizeof(int)*ELEM_PER_PROC, sizeof(int), MPI_INFO_NULL, 
+                             shm_comm, &my_base, &shm_win);
+
+    /* Locate absolute base */
+    MPI_Win_shared_query(shm_win, MPI_PROC_NULL, &size, &disp_unit, &base); 
+
+    if (verbose) printf("%d -- size = %d baseptr = %p my_baseptr = %p\n", shm_rank, 
+                        (int) size, (void*) base, (void*) my_base);
+
+    assert(size == ELEM_PER_PROC * sizeof(int));
+    if (shm_rank == 0)
+        assert(base == my_base);
+    else
+        assert(base != my_base);
+
+    MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win);
+
+    /* Write to all my data */
+    for (i = 0; i < ELEM_PER_PROC; i++) {
+        my_base[i] = i;
+    }
+
+    MPI_Win_sync(shm_win);
+    MPI_Barrier(shm_comm);
+    MPI_Win_sync(shm_win);
+
+    /* Read and verify everyone's data */
+    for (i = 0; i < shm_nproc; i++) {
+        for (j = 0; j < ELEM_PER_PROC; j++) {
+            if ( base[i*ELEM_PER_PROC + j] != j ) {
+                errors++;
+                printf("%d -- Got %d at rank %d index %d, expected %d\n", shm_rank,
+                       base[i*ELEM_PER_PROC + j], i, j, j);
+            }
+        }
+    }
+
+    MPI_Win_unlock_all(shm_win);
+    MPI_Win_free(&shm_win);
+    MPI_Comm_free(&shm_comm);
+
+    MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+
+    if (rank == 0 && all_errors == 0)
+        printf(" No Errors\n");
+
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/win_shared_noncontig.c b/teshsuite/smpi/mpich3-test/rma/win_shared_noncontig.c
new file mode 100644
index 0000000000..a6ab73ba1f
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/win_shared_noncontig.c
@@ -0,0 +1,87 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2012 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <mpi.h>
+#include "mpitest.h"
+
+#define ELEM_PER_PROC 10000
+
+const int verbose = 0;
+
+int main(int argc, char **argv) {
+    int      i, j, rank, nproc;
+    int      shm_rank, shm_nproc;
+    MPI_Info alloc_shared_info;
+    int      errors = 0, all_errors = 0;
+    int      disp_unit;
+    int     *my_base;
+    MPI_Win  shm_win;
+    MPI_Comm shm_comm;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    MPI_Info_create(&alloc_shared_info);
+    MPI_Info_set(alloc_shared_info, "alloc_shared_noncontig", "true");
+
+    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &shm_comm);
+
+    MPI_Comm_rank(shm_comm, &shm_rank);
+    MPI_Comm_size(shm_comm, &shm_nproc);
+
+    /* Allocate ELEM_PER_PROC integers for each process */
+    MPI_Win_allocate_shared(sizeof(int)*ELEM_PER_PROC, sizeof(int), alloc_shared_info, 
+                             shm_comm, &my_base, &shm_win);
+
+    MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win);
+
+    /* Write to all my data */
+    for (i = 0; i < ELEM_PER_PROC; i++) {
+        my_base[i] = i;
+    }
+
+    MPI_Win_sync(shm_win);
+    MPI_Barrier(shm_comm);
+    MPI_Win_sync(shm_win);
+
+    /* Read and verify everyone's data */
+    for (i = 0; i < shm_nproc; i++) {
+        int      *base;
+        MPI_Aint  size;
+
+        MPI_Win_shared_query(shm_win, i, &size, &disp_unit, &base);
+        assert(size == ELEM_PER_PROC * sizeof(int));
+
+        for (j = 0; j < ELEM_PER_PROC; j++) {
+            if ( base[j] != j ) {
+                errors++;
+                printf("%d -- Got %d at rank %d index %d, expected %d\n", shm_rank, 
+                       base[j], i, j, j);
+            }
+        }
+    }
+
+    MPI_Win_unlock_all(shm_win);
+    MPI_Win_free(&shm_win);
+    MPI_Comm_free(&shm_comm);
+
+    MPI_Info_free(&alloc_shared_info);
+
+    MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+
+    if (rank == 0 && all_errors == 0)
+        printf(" No Errors\n");
+
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/win_shared_noncontig_put.c b/teshsuite/smpi/mpich3-test/rma/win_shared_noncontig_put.c
new file mode 100644
index 0000000000..60409a8850
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/win_shared_noncontig_put.c
@@ -0,0 +1,94 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2012 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <mpi.h>
+#include "mpitest.h"
+
+#define ELEM_PER_PROC 10000
+
+const int verbose = 0;
+
+int main(int argc, char **argv) {
+    int      i, j, rank, nproc;
+    int      shm_rank, shm_nproc;
+    MPI_Info alloc_shared_info;
+    int      errors = 0, all_errors = 0;
+    int      disp_unit;
+    int     *my_base, my_size;
+    MPI_Win  shm_win;
+    MPI_Comm shm_comm;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+    MPI_Info_create(&alloc_shared_info);
+    MPI_Info_set(alloc_shared_info, "alloc_shared_noncontig", "true");
+
+    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &shm_comm);
+
+    MPI_Comm_rank(shm_comm, &shm_rank);
+    MPI_Comm_size(shm_comm, &shm_nproc);
+
+    /* Allocate ELEM_PER_PROC integers on each even rank process */
+    my_size = (shm_rank % 2 == 0) ? sizeof(int)*ELEM_PER_PROC : 0;
+    MPI_Win_allocate_shared(my_size, sizeof(int), alloc_shared_info,
+                             shm_comm, &my_base, &shm_win);
+
+    for (i = 0; i < ELEM_PER_PROC; i++) {
+            MPI_Win_fence(MPI_MODE_NOPRECEDE, shm_win);
+            if (shm_rank % 2 == 0) {
+                MPI_Put(&i, 1, MPI_INT, 
+                        (shm_rank + 2 > shm_nproc) ? 0 : (shm_rank+2) % shm_nproc,
+                        i, 1, MPI_INT, shm_win);
+            }
+            MPI_Win_fence(MPI_MODE_NOSUCCEED, shm_win);
+    }
+
+    MPI_Barrier(shm_comm);
+
+    /* Read and verify everyone's data */
+    for (i = 0; i < shm_nproc; i++) {
+        int      *base;
+        MPI_Aint  size;
+
+        MPI_Win_shared_query(shm_win, i, &size, &disp_unit, &base);
+
+        if (i % 2 == 0) {
+            assert(size == ELEM_PER_PROC * sizeof(int));
+
+            for (j = 0; j < ELEM_PER_PROC; j++) {
+                if ( base[j] != j ) {
+                    errors++;
+                    printf("%d -- Got %d at rank %d index %d, expected %d\n", shm_rank,
+                           base[j], i, j, j);
+                }
+            }
+        } else {
+            assert(size == 0);
+            assert(base == NULL);
+        }
+    }
+
+    MPI_Win_free(&shm_win);
+    MPI_Comm_free(&shm_comm);
+
+    MPI_Info_free(&alloc_shared_info);
+
+    MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+
+    if (rank == 0 && all_errors == 0)
+        printf(" No Errors\n");
+
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/wincall.c b/teshsuite/smpi/mpich3-test/rma/wincall.c
new file mode 100644
index 0000000000..c29b7966fd
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/wincall.c
@@ -0,0 +1,65 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2003 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include "mpitest.h"
+
+/*
+static char MTEST_Descrip[] = "Test win_call_errhandler";
+*/
+
+static int calls = 0;
+static int errs = 0;
+static MPI_Win mywin;
+void eh( MPI_Win *win, int *err, ... );
+void eh( MPI_Win *win, int *err, ... )
+{
+    if (*err != MPI_ERR_OTHER) {
+	errs++;
+	printf( "Unexpected error code\n" );
+    }
+    if (*win != mywin) {
+	errs++;
+	printf( "Unexpected window\n" );
+    }
+    calls++;
+    return;
+}
+int main( int argc, char *argv[] )
+{
+    int buf[2];
+    MPI_Win        win;
+    MPI_Errhandler newerr;
+    int            i;
+
+    MTest_Init( &argc, &argv );
+
+    /* Run this test multiple times to expose storage leaks (we found a leak
+       of error handlers with this test) */
+    for (i=0;i<1000; i++)  {
+	calls = 0;
+	
+	MPI_Win_create( buf, 2*sizeof(int), sizeof(int), 
+			MPI_INFO_NULL, MPI_COMM_WORLD, &win );
+	mywin = win;
+	
+	MPI_Win_create_errhandler( eh, &newerr );
+	
+	MPI_Win_set_errhandler( win, newerr );
+	MPI_Win_call_errhandler( win, MPI_ERR_OTHER );
+	MPI_Errhandler_free( &newerr );
+	if (calls != 1) {
+	    errs++;
+	    printf( "Error handler not called\n" );
+	}
+	MPI_Win_free( &win );
+    }
+
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/window_creation.c b/teshsuite/smpi/mpich3-test/rma/window_creation.c
new file mode 100644
index 0000000000..a805350eb1
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/window_creation.c
@@ -0,0 +1,53 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <mpi.h>
+
+#define DATA_NELTS  1000
+#define NUM_WIN     1000
+#define DATA_SZ     (DATA_NELTS*sizeof(int))
+
+static int verbose = 0;
+
+int main(int argc, char ** argv) {
+  int      rank, nproc, i;
+  void    *base_ptrs[NUM_WIN];
+  MPI_Win  windows[NUM_WIN];
+
+  MPI_Init(&argc, &argv);
+
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+  if (rank == 0) if (verbose) printf("Starting MPI window creation test with %d processes\n", nproc);
+
+  /* Perform a pile of window creations */
+  for (i = 0; i < NUM_WIN; i++) {
+    if (rank == 0) if (verbose) printf(" + Creating window %d\n", i);
+
+    MPI_Alloc_mem(DATA_SZ, MPI_INFO_NULL, &base_ptrs[i]);
+    MPI_Win_create(base_ptrs[i], DATA_SZ, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &windows[i]);
+  }
+
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  /* Free all the windows */
+  for (i = 0; i < NUM_WIN; i++) {
+    if (rank == 0) if (verbose) printf(" + Freeing window %d\n", i);
+
+    MPI_Win_free(&windows[i]);
+    MPI_Free_mem(base_ptrs[i]);
+  }
+
+  if (rank == 0) printf(" No Errors\n");
+
+  MPI_Finalize();
+
+  return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/winname.c b/teshsuite/smpi/mpich3-test/rma/winname.c
new file mode 100644
index 0000000000..290f26e260
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/winname.c
@@ -0,0 +1,47 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpitestconf.h"
+#include "mpitest.h"
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+
+int main( int argc, char *argv[] )
+{
+    int errs = 0;
+    MPI_Win win;
+    int cnt, namelen;
+    char name[MPI_MAX_OBJECT_NAME], nameout[MPI_MAX_OBJECT_NAME];
+
+    MTest_Init( &argc, &argv );
+
+    cnt = 0;
+    while (MTestGetWin( &win, 1 )) {
+	if (win == MPI_WIN_NULL) continue;
+    
+	sprintf( name, "win-%d", cnt );
+	cnt++;
+	MPI_Win_set_name( win, name );
+	nameout[0] = 0;
+	MPI_Win_get_name( win, nameout, &namelen );
+	if (strcmp( name, nameout )) {
+	    errs++;
+	    printf( "Unexpected name, was %s but should be %s\n",
+		    nameout, name );
+	}
+
+	MTestFreeWin( &win );
+    }
+
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return 0;
+}
diff --git a/teshsuite/smpi/mpich3-test/rma/wintest.c b/teshsuite/smpi/mpich3-test/rma/wintest.c
new file mode 100644
index 0000000000..a8a784c3ad
--- /dev/null
+++ b/teshsuite/smpi/mpich3-test/rma/wintest.c
@@ -0,0 +1,83 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h" 
+#include "stdio.h"
+#include "mpitest.h"
+
+/* tests put and get with post/start/complete/test on 2 processes */
+/* Same as test2.c, but uses win_test instead of win_wait */
+
+#define SIZE1 10
+#define SIZE2 20
+
+int main(int argc, char *argv[]) 
+{ 
+    int rank, destrank, nprocs, A[SIZE2], B[SIZE2], i;
+    MPI_Comm CommDeuce;
+    MPI_Group comm_group, group;
+    MPI_Win win;
+    int errs = 0, flag;
+
+    MTest_Init(&argc,&argv); 
+    MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank); 
+
+    if (nprocs < 2) {
+        printf("Run this program with 2 or more processes\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce);
+
+    if (rank < 2) {
+        MPI_Comm_group(CommDeuce, &comm_group);
+
+        if (rank == 0) {
+            for (i=0; i<SIZE2; i++) A[i] = B[i] = i;
+            MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, CommDeuce, &win);
+            destrank = 1;
+            MPI_Group_incl(comm_group, 1, &destrank, &group);
+            MPI_Win_start(group, 0, win);
+            for (i=0; i<SIZE1; i++)
+                MPI_Put(A+i, 1, MPI_INT, 1, i, 1, MPI_INT, win); 
+            for (i=0; i<SIZE1; i++)
+                MPI_Get(B+i, 1, MPI_INT, 1, SIZE1+i, 1, MPI_INT, win);
+
+            MPI_Win_complete(win);
+
+            for (i=0; i<SIZE1; i++) 
+                if (B[i] != (-4)*(i+SIZE1)) {
+                    printf("Get Error: B[i] is %d, should be %d\n", B[i], (-4)*(i+SIZE1));
+                    errs++;
+                }
+        }
+        else {  /* rank=1 */
+            for (i=0; i<SIZE2; i++) B[i] = (-4)*i;
+            MPI_Win_create(B, SIZE2*sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win);
+            destrank = 0;
+            MPI_Group_incl(comm_group, 1, &destrank, &group);
+            MPI_Win_post(group, 0, win);
+            flag = 0;
+            while (!flag)
+                MPI_Win_test(win, &flag);
+
+            for (i=0; i<SIZE1; i++) {
+                if (B[i] != i) {
+                    printf("Put Error: B[i] is %d, should be %d\n", B[i], i);
+                    errs++;
+                }
+            }
+        }
+
+        MPI_Group_free(&group);
+        MPI_Group_free(&comm_group);
+        MPI_Win_free(&win); 
+    }
+    MPI_Comm_free(&CommDeuce);
+    MTest_Finalize(errs);
+    MPI_Finalize();
+    return 0; 
+} 
diff --git a/teshsuite/smpi/mpich3-test/testlist b/teshsuite/smpi/mpich3-test/testlist
index 900c9194ab..aef3b2c9c2 100644
--- a/teshsuite/smpi/mpich3-test/testlist
+++ b/teshsuite/smpi/mpich3-test/testlist
@@ -6,6 +6,7 @@ coll
 comm
 datatype
 #errhan
+rma
 group
 #info
 init
-- 
2.20.1