From: Frederic Suter <frederic.suter@cc.in2p3.fr>
Date: Thu, 10 Mar 2016 14:12:40 +0000 (+0100)
Subject: Merge branch 'master' of git+ssh://scm.gforge.inria.fr//gitroot/simgrid/simgrid
X-Git-Tag: v3_13~453
X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/bc48db087894fd960073b3120cebf90e6b2f8c02?hp=d73e15ba74e08212c750b27197e964efd759974c

Merge branch 'master' of git+ssh://scm.gforge.inria.fr//gitroot/simgrid/simgrid
---

diff --git a/.gitignore b/.gitignore
index 6a6cfb79c7..ce66e3d2c5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -247,6 +247,9 @@ examples/smpi/mc/bugged2
 examples/smpi/mc/mutual_exclusion
 examples/smpi/mc/non_deterministic
 examples/smpi/mc/send_deterministic
+examples/smpi/NAS/dt
+examples/smpi/NAS/ep
+examples/smpi/NAS/is
 examples/smpi/mvmul
 examples/smpi/replay_multiple/replay_multiple
 examples/smpi/replay/one_trace
diff --git a/examples/smpi/NAS/CMakeLists.txt b/examples/smpi/NAS/CMakeLists.txt
new file mode 100644
index 0000000000..b4de3b0f4c
--- /dev/null
+++ b/examples/smpi/NAS/CMakeLists.txt
@@ -0,0 +1,25 @@
+if(enable_smpi)  # the three NAS example executables below are only declared when enable_smpi is set
+  if(WIN32)  # Windows: force-include smpi_main.h into every C source through the compiler flags
+    set(CMAKE_C_FLAGS "-include ${CMAKE_HOME_DIRECTORY}/include/smpi/smpi_main.h")  # NOTE(review): overwrites the value of CMAKE_C_FLAGS in this scope instead of appending to it -- confirm this is intended
+  else()  # other platforms: compile with the smpicc script found under the build directory
+    set(CMAKE_C_COMPILER "${CMAKE_BINARY_DIR}/smpi_script/bin/smpicc")
+  endif()
+
+  include_directories(BEFORE "${CMAKE_HOME_DIRECTORY}/include/smpi")  # BEFORE: prepend the SMPI header directory to the include path
+  add_executable       (is is.c nas_common.c)  # every benchmark is built together with the shared nas_common.c
+  target_link_libraries(is simgrid m)  # links against simgrid and the math library (m)
+  add_executable       (ep ep.c nas_common.c)
+  target_link_libraries(ep simgrid m)
+  add_executable       (dt dt.c nas_common.c DGraph.c)  # dt additionally compiles DGraph.c
+  target_link_libraries(dt simgrid m)
+endif()  # the variable updates below sit outside the enable_smpi guard, so they run unconditionally
+
+set(examples_src  ${examples_src}  ${CMAKE_CURRENT_SOURCE_DIR}/nas_common.h
+                                   ${CMAKE_CURRENT_SOURCE_DIR}/nas_common.c
+                                   ${CMAKE_CURRENT_SOURCE_DIR}/is.c
+                                   ${CMAKE_CURRENT_SOURCE_DIR}/dt.c
+                                   ${CMAKE_CURRENT_SOURCE_DIR}/ep.c
+                                   ${CMAKE_CURRENT_SOURCE_DIR}/DGraph.c
+                                   ${CMAKE_CURRENT_SOURCE_DIR}/DGraph.h
+                                                                              PARENT_SCOPE)  # appends this directory's sources to examples_src and exports the list to the parent scope; presumably consumed by the top-level build -- verify there
+set(txt_files     ${txt_files}    ${CMAKE_CURRENT_SOURCE_DIR}/README.install  PARENT_SCOPE)  # same pattern for txt_files; NOTE(review): README.install is not added by the visible part of this patch -- confirm the file exists
\ No newline at end of file
diff --git a/examples/smpi/NAS/DT/DGraph.c b/examples/smpi/NAS/DGraph.c
similarity index 82%
rename from examples/smpi/NAS/DT/DGraph.c
rename to examples/smpi/NAS/DGraph.c
index f573786db8..6dd0ed353c 100644
--- a/examples/smpi/NAS/DT/DGraph.c
+++ b/examples/smpi/NAS/DGraph.c
@@ -31,8 +31,7 @@ DGNode *newNode(char *nm){
   return nd;
 }
 void nodeShow(DGNode* nd){
-  fprintf( stderr,"%3d.%s: (%d,%d)\n",
-             nd->id,nd->name,nd->inDegree,nd->outDegree);
+  fprintf( stderr,"%3d.%s: (%d,%d)\n", nd->id,nd->name,nd->inDegree,nd->outDegree);
 /*
   if(nd->verified==1) fprintf(stderr,"%ld.%s\t: usable.",nd->id,nd->name);
   else if(nd->verified==0)  fprintf(stderr,"%ld.%s\t: unusable.",nd->id,nd->name);
@@ -51,6 +50,7 @@ DGraph* newDGraph(char* nm){
   dg->name=strdup(nm);
   return dg;
 }
+
 int AttachNode(DGraph* dg, DGNode* nd) {
   int i=0,j,len=0;
   DGNode **nds =NULL, *tmpnd=NULL;
@@ -58,13 +58,13 @@ int AttachNode(DGraph* dg, DGNode* nd) {
 
   if (dg->numNodes == dg->maxNodes-1 ) {
     dg->maxNodes += BLOCK_SIZE;
-          nds =(DGNode **) calloc(dg->maxNodes,sizeof(DGNode*));
+    nds =(DGNode **) calloc(dg->maxNodes,sizeof(DGNode*));
     memcpy(nds,dg->node,(dg->maxNodes-BLOCK_SIZE)*sizeof(DGNode*));
     free(dg->node);
     dg->node=nds;
   }
 
-        len = strlen( nd->name);
+  len = strlen( nd->name);
   for (i = 0; i < dg->numNodes; i++) {
     tmpnd =dg->node[ i];
     ar=NULL;
@@ -72,7 +72,7 @@ int AttachNode(DGraph* dg, DGNode* nd) {
     if ( strncmp( nd->name, tmpnd->name, len) ) continue;
     if ( nd->inDegree > 0 ) {
       tmpnd->maxInDegree += nd->maxInDegree;
-            ar =(DGArc **) calloc(tmpnd->maxInDegree,sizeof(DGArc*));
+      ar =(DGArc **) calloc(tmpnd->maxInDegree,sizeof(DGArc*));
       memcpy(ar,tmpnd->inArc,(tmpnd->inDegree)*sizeof(DGArc*));
       free(tmpnd->inArc);
       tmpnd->inArc=ar;
@@ -84,60 +84,58 @@ int AttachNode(DGraph* dg, DGNode* nd) {
     }   
     if ( nd->outDegree > 0 ) {
       tmpnd->maxOutDegree += nd->maxOutDegree;
-            ar =(DGArc **) calloc(tmpnd->maxOutDegree,sizeof(DGArc*));
+      ar =(DGArc **) calloc(tmpnd->maxOutDegree,sizeof(DGArc*));
       memcpy(ar,tmpnd->outArc,(tmpnd->outDegree)*sizeof(DGArc*));
       free(tmpnd->outArc);
       tmpnd->outArc=ar;
       for (j = 0; j < nd->outDegree; j++ ) {
         nd->outArc[ j]->tail = tmpnd;
-      }      
+      }
       memcpy( &(tmpnd->outArc[tmpnd->outDegree]),nd->outArc,nd->outDegree*sizeof( DGArc *));
       tmpnd->outDegree += nd->outDegree;
-    } 
+    }
     free(nd); 
     return i;
   }
   nd->id = dg->numNodes;
   dg->node[dg->numNodes] = nd;
   dg->numNodes++;
-return nd->id;
+  return nd->id;
 }
+
 int AttachArc(DGraph *dg,DGArc* nar){
-int  arcId = -1;
-int i=0,newNumber=0;
-DGNode  *head = nar->head,
-  *tail = nar->tail; 
-DGArc **ars=NULL,*probe=NULL;
-/*fprintf(stderr,"AttachArc %ld\n",dg->numArcs); */
+  int  arcId = -1;
+  int i=0,newNumber=0;
+  DGNode  *head = nar->head,
+          *tail = nar->tail;
+  DGArc **ars=NULL,*probe=NULL;
+  /*fprintf(stderr,"AttachArc %ld\n",dg->numArcs); */
   if ( !tail || !head ) return arcId;
   if ( dg->numArcs == dg->maxArcs-1 ) {
     dg->maxArcs += BLOCK_SIZE;
-          ars =(DGArc **) calloc(dg->maxArcs,sizeof(DGArc*));
+    ars =(DGArc **) calloc(dg->maxArcs,sizeof(DGArc*));
     memcpy(ars,dg->arc,(dg->maxArcs-BLOCK_SIZE)*sizeof(DGArc*));
     free(dg->arc);
     dg->arc=ars;
   }
   for(i = 0; i < tail->outDegree; i++ ) { /* parallel arc */
     probe = tail->outArc[ i];
-    if(probe->head == head
-       &&
-       probe->length == nar->length
-            ){
-            free(nar);
-      return probe->id;   
+    if(probe->head == head && probe->length == nar->length){
+      free(nar);
+      return probe->id;
     }
   }
-  
+
   nar->id = dg->numArcs;
   arcId=dg->numArcs;
   dg->arc[dg->numArcs] = nar;
   dg->numArcs++;
-  
+
   head->inArc[ head->inDegree] = nar;
   head->inDegree++;
   if ( head->inDegree >= head->maxInDegree ) {
     newNumber = head->maxInDegree + SMALL_BLOCK_SIZE;
-          ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
+    ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
     memcpy(ars,head->inArc,(head->inDegree)*sizeof(DGArc*));
     free(head->inArc);
     head->inArc=ars;
@@ -147,15 +145,16 @@ DGArc **ars=NULL,*probe=NULL;
   tail->outDegree++;
   if(tail->outDegree >= tail->maxOutDegree ) {
     newNumber = tail->maxOutDegree + SMALL_BLOCK_SIZE;
-          ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
+    ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
     memcpy(ars,tail->outArc,(tail->outDegree)*sizeof(DGArc*));
     free(tail->outArc);
     tail->outArc=ars;
     tail->maxOutDegree = newNumber;
   }
 /*fprintf(stderr,"AttachArc: head->in=%d tail->out=%ld\n",head->inDegree,tail->outDegree);*/
-return arcId;
+  return arcId;
 }
+
 void graphShow(DGraph *dg,int DetailsLevel){
   int i=0,j=0;
   fprintf(stderr,"%d.%s: (%d,%d)\n",dg->id,dg->name,dg->numNodes,dg->numArcs);
@@ -164,8 +163,8 @@ void graphShow(DGraph *dg,int DetailsLevel){
     DGNode *focusNode = dg->node[ i];
     if(DetailsLevel >= 2) {
       for (j = 0; j < focusNode->inDegree; j++ ) {
-  fprintf(stderr,"\t ");
-  nodeShow(focusNode->inArc[ j]->tail);
+        fprintf(stderr,"\t ");
+        nodeShow(focusNode->inArc[ j]->tail);
       }
     }
     nodeShow(focusNode);
@@ -173,12 +172,9 @@ void graphShow(DGraph *dg,int DetailsLevel){
     for (j = 0; j < focusNode->outDegree; j++ ) {
       fprintf(stderr, "\t ");
       nodeShow(focusNode->outArc[ j]->head);
-    }  
+    }
     fprintf(stderr, "---\n");
   }
   fprintf(stderr,"----------------------------------------\n");
   if ( DetailsLevel < 3) return;
 }
-
-
-
diff --git a/examples/smpi/NAS/DT/DGraph.h b/examples/smpi/NAS/DGraph.h
similarity index 100%
rename from examples/smpi/NAS/DT/DGraph.h
rename to examples/smpi/NAS/DGraph.h
diff --git a/examples/smpi/NAS/DT/Makefile b/examples/smpi/NAS/DT/Makefile
deleted file mode 100644
index cfcfea51f8..0000000000
--- a/examples/smpi/NAS/DT/Makefile
+++ /dev/null
@@ -1,19 +0,0 @@
-BENCHMARK=dt
-include ../config/make.def
-include ../sys/make.common
-
-OBJS = DGraph.o ${COMMON}/c_print_results.o ${COMMON}/c_timers.o ${COMMON}/randdp.o
-
-${PROGRAM}: config dt.o dt-folding.o ${OBJS}
-	${CLINK} ${CLINKFLAGS} -o $(BINDIR)/dt.${CLASS} dt.o ${OBJS} ${CMPI_LIB}
-	${CLINK} ${CLINKFLAGS} -o ${BINDIR}/dt-folding.${CLASS} dt-folding.o ${OBJS} ${CMPI_LIB}
-
-.c.o:
-	${CCOMPILE} $<
-
-dt.o:             dt.c npbparams.h
-dt-folding.o:     dt-folding.c npbparams.h
-DGraph.o:         DGraph.c DGraph.h
-
-clean:
-	- rm -f *.o *~ npbparams.h
diff --git a/examples/smpi/NAS/DT/README b/examples/smpi/NAS/DT/README
deleted file mode 100644
index 873e3ae6f2..0000000000
--- a/examples/smpi/NAS/DT/README
+++ /dev/null
@@ -1,22 +0,0 @@
-Data Traffic benchmark DT is new in the NPB suite 
-(released as part of NPB3.x-MPI package).
-----------------------------------------------------
-
-DT is written in C and same executable can run on any number of processors,
-provided this number is not less than the number of nodes in the communication
-graph.  DT benchmark takes one argument: BH, WH, or SH. This argument 
-specifies the communication graph Black Hole, White Hole, or SHuffle 
-respectively. The current release contains verification numbers for 
-CLASSES S, W, A, and B only.  Classes C and D are defined, but verification 
-numbers are not provided in this release.
-
-The following table summarizes the number of nodes in the communication
-graph based on CLASS and graph TYPE.
-
-CLASS  N_Source N_Nodes(BH,WH) N_Nodes(SH)
- S      4        5              12
- W      8        11             32
- A      16       21             80
- B      32       43             192
- C      64       85             448
- D      128      171            1024
diff --git a/examples/smpi/NAS/DT/dt-folding.c b/examples/smpi/NAS/DT/dt-folding.c
deleted file mode 100644
index b088fbd5c8..0000000000
--- a/examples/smpi/NAS/DT/dt-folding.c
+++ /dev/null
@@ -1,759 +0,0 @@
-/*************************************************************************
- *                                                                       * 
- *        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3       *
- *                                                                       * 
- *                                  D T           * 
- *                                                                       * 
- ************************************************************************* 
- *                                                                       * 
- *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
- *                                                                       * 
- *   Permission to use, copy, distribute and modify this software        * 
- *   for any purpose with or without fee is hereby granted.  We          * 
- *   request, however, that all derived work reference the NAS           * 
- *   Parallel Benchmarks 3.3. This software is provided "as is"          *
- *   without express or implied warranty.                                * 
- *                                                                       * 
- *   Information on NPB 3.3, including the technical report, the         *
- *   original specifications, source code, results and information       * 
- *   on how to submit new results, is available at:                      * 
- *                                                                       * 
- *          http:  www.nas.nasa.gov/Software/NPB                         * 
- *                                                                       * 
- *   Send comments or suggestions to  npb@nas.nasa.gov                   * 
- *   Send bug reports to              npb-bugs@nas.nasa.gov              * 
- *                                                                       * 
- *         NAS Parallel Benchmarks Group                                 * 
- *         NASA Ames Research Center                                     * 
- *         Mail Stop: T27A-1                                             * 
- *         Moffett Field, CA   94035-1000                                * 
- *                                                                       * 
- *         E-mail:  npb@nas.nasa.gov                                     * 
- *         Fax:     (650) 604-3957                                       * 
- *                                                                       * 
- ************************************************************************* 
- *                                                                       * 
- *   Author: M. Frumkin               *             * 
- *                                                                       * 
- *************************************************************************/
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "mpi.h"
-#include "npbparams.h"
-
-#ifndef CLASS
-#define CLASS 'S'
-#define NUM_PROCS            1                 
-#endif
-
-//int      passed_verification;
-extern double randlc( double *X, double *A );
-extern
-void c_print_results( char   *name,
-                      char   class,
-                      int    n1, 
-                      int    n2,
-                      int    n3,
-                      int    niter,
-                      int    nprocs_compiled,
-                      int    nprocs_total,
-                      double t,
-                      double mops,
-          char   *optype,
-                      int    passed_verification,
-                      char   *npbversion,
-                      char   *compiletime,
-                      char   *mpicc,
-                      char   *clink,
-                      char   *cmpi_lib,
-                      char   *cmpi_inc,
-                      char   *cflags,
-                      char   *clinkflags );
-          
-void    timer_clear( int n );
-void    timer_start( int n );
-void    timer_stop( int n );
-double  timer_read( int n );
-int timer_on=0,timers_tot=64;
-
-int verify(char *bmname,double rnm2){
-    double verify_value=0.0;
-    double epsilon=1.0E-8;
-    char cls=CLASS;
-    int verified=-1;
-    if (cls != 'U') {
-       if(cls=='S') {
-         if(strstr(bmname,"BH")){
-           verify_value=30892725.0;
-         }else if(strstr(bmname,"WH")){
-           verify_value=67349758.0;
-         }else if(strstr(bmname,"SH")){
-           verify_value=58875767.0;
-         }else{
-           fprintf(stderr,"No such benchmark as %s.\n",bmname);
-         }
-         verified = 0;
-       }else if(cls=='W') {
-         if(strstr(bmname,"BH")){
-       verify_value = 4102461.0;
-         }else if(strstr(bmname,"WH")){
-       verify_value = 204280762.0;
-         }else if(strstr(bmname,"SH")){
-       verify_value = 186944764.0;
-         }else{
-           fprintf(stderr,"No such benchmark as %s.\n",bmname);
-         }
-         verified = 0;
-       }else if(cls=='A') {
-         if(strstr(bmname,"BH")){
-       verify_value = 17809491.0;
-         }else if(strstr(bmname,"WH")){
-       verify_value = 1289925229.0;
-         }else if(strstr(bmname,"SH")){
-       verify_value = 610856482.0;
-         }else{
-           fprintf(stderr,"No such benchmark as %s.\n",bmname);
-         }
-     verified = 0;
-       }else if(cls=='B') {
-         if(strstr(bmname,"BH")){
-       verify_value = 4317114.0;
-         }else if(strstr(bmname,"WH")){
-       verify_value = 7877279917.0;
-         }else if(strstr(bmname,"SH")){
-       verify_value = 1836863082.0;
-         }else{
-           fprintf(stderr,"No such benchmark as %s.\n",bmname);
-       verified = 0;
-         }
-       }else if(cls=='C') {
-         if(strstr(bmname,"BH")){
-       verify_value = 0.0;
-         }else if(strstr(bmname,"WH")){
-       verify_value = 0.0;
-         }else if(strstr(bmname,"SH")){
-       verify_value = 0.0;
-         }else{
-           fprintf(stderr,"No such benchmark as %s.\n",bmname);
-       verified = -1;
-         }
-       }else if(cls=='D') {
-         if(strstr(bmname,"BH")){
-       verify_value = 0.0;
-         }else if(strstr(bmname,"WH")){
-       verify_value = 0.0;
-         }else if(strstr(bmname,"SH")){
-       verify_value = 0.0;
-         }else{
-           fprintf(stderr,"No such benchmark as %s.\n",bmname);
-         }
-         verified = -1;
-       }else{
-         fprintf(stderr,"No such class as %c.\n",cls);
-       }
-       fprintf(stderr," %s L2 Norm = %f\n",bmname,rnm2);
-       if(verified==-1){
-     fprintf(stderr," No verification was performed.\n");
-       }else if( rnm2 - verify_value < epsilon &&
-                 rnm2 - verify_value > -epsilon) {  /* abs here does not work on ALTIX */
-      verified = 1;
-      fprintf(stderr," Deviation = %f\n",(rnm2 - verify_value));
-       }else{
-     verified = 0;
-     fprintf(stderr," The correct verification value = %f\n",verify_value);
-     fprintf(stderr," Got value = %f\n",rnm2);
-       }
-    }else{
-       verified = -1;
-    }
-    return  verified;  
-  }
-
-int ipowMod(int a,long long int n,int md){ 
-  int seed=1,q=a,r=1;
-  if(n<0){
-    fprintf(stderr,"ipowMod: exponent must be nonnegative exp=%lld\n",n);
-    n=-n; /* temp fix */
-/*    return 1; */
-  }
-  if(md<=0){
-    fprintf(stderr,"ipowMod: module must be positive mod=%d",md);
-    return 1;
-  }
-  if(n==0) return 1;
-  while(n>1){
-    int n2 = n/2;
-    if (n2*2==n){
-       seed = (q*q)%md;
-       q=seed;
-       n = n2;
-    }else{
-       seed = (r*q)%md;
-       r=seed;
-       n = n-1;
-    }
-  }
-  seed = (r*q)%md;
-  return seed;
-}
-
-#include "DGraph.h"
-DGraph *buildSH(char cls){
-/*
-  Nodes of the graph must be topologically sorted
-  to avoid MPI deadlock.
-*/
-  DGraph *dg;
-  int numSources=NUM_SOURCES; /* must be power of 2 */
-  int numOfLayers=0,tmpS=numSources>>1;
-  int firstLayerNode=0;
-  DGArc *ar=NULL;
-  DGNode *nd=NULL;
-  int mask=0x0,ndid=0,ndoff=0;
-  int i=0,j=0;
-  char nm[BLOCK_SIZE];
-  
-  sprintf(nm,"DT_SH.%c",cls);
-  dg=newDGraph(nm);
-
-  while(tmpS>1){
-    numOfLayers++;
-    tmpS>>=1;
-  }
-  for(i=0;i<numSources;i++){
-    sprintf(nm,"Source.%d",i);
-    nd=newNode(nm);
-    AttachNode(dg,nd);
-  }
-  for(j=0;j<numOfLayers;j++){
-    mask=0x00000001<<j;
-    for(i=0;i<numSources;i++){
-      sprintf(nm,"Comparator.%d",(i+j*firstLayerNode));
-      nd=newNode(nm);
-      AttachNode(dg,nd);
-      ndoff=i&(~mask);
-      ndid=firstLayerNode+ndoff;
-      ar=newArc(dg->node[ndid],nd);     
-      AttachArc(dg,ar);
-      ndoff+=mask;
-      ndid=firstLayerNode+ndoff;
-      ar=newArc(dg->node[ndid],nd);     
-      AttachArc(dg,ar);
-    }
-    firstLayerNode+=numSources;
-  }
-  mask=0x00000001<<numOfLayers;
-  for(i=0;i<numSources;i++){
-    sprintf(nm,"Sink.%d",i);
-    nd=newNode(nm);
-    AttachNode(dg,nd);
-    ndoff=i&(~mask);
-    ndid=firstLayerNode+ndoff;
-    ar=newArc(dg->node[ndid],nd);     
-    AttachArc(dg,ar);
-    ndoff+=mask;
-    ndid=firstLayerNode+ndoff;
-    ar=newArc(dg->node[ndid],nd);     
-    AttachArc(dg,ar);
-  }
-return dg;
-}
-DGraph *buildWH(char cls){
-/*
-  Nodes of the graph must be topologically sorted
-  to avoid MPI deadlock.
-*/
-  int i=0,j=0;
-  int numSources=NUM_SOURCES,maxInDeg=4;
-  int numLayerNodes=numSources,firstLayerNode=0;
-  int totComparators=0;
-  int numPrevLayerNodes=numLayerNodes;
-  int id=0,sid=0;
-  DGraph *dg;
-  DGNode *nd=NULL,*source=NULL,*tmp=NULL,*snd=NULL;
-  DGArc *ar=NULL;
-  char nm[BLOCK_SIZE];
-
-  sprintf(nm,"DT_WH.%c",cls);
-  dg=newDGraph(nm);
-
-  for(i=0;i<numSources;i++){
-    sprintf(nm,"Sink.%d",i);
-    nd=newNode(nm);
-    AttachNode(dg,nd);
-  }
-  totComparators=0;
-  numPrevLayerNodes=numLayerNodes;
-  while(numLayerNodes>maxInDeg){
-    numLayerNodes=numLayerNodes/maxInDeg;
-    if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
-    for(i=0;i<numLayerNodes;i++){
-      sprintf(nm,"Comparator.%d",totComparators);
-      totComparators++;
-      nd=newNode(nm);
-      id=AttachNode(dg,nd);
-      for(j=0;j<maxInDeg;j++){
-        sid=i*maxInDeg+j;
-  if(sid>=numPrevLayerNodes) break;
-        snd=dg->node[firstLayerNode+sid];
-        ar=newArc(dg->node[id],snd);
-        AttachArc(dg,ar);
-      }
-    }
-    firstLayerNode+=numPrevLayerNodes;
-    numPrevLayerNodes=numLayerNodes;
-  }
-  source=newNode("Source");
-  AttachNode(dg,source);   
-  for(i=0;i<numPrevLayerNodes;i++){
-    nd=dg->node[firstLayerNode+i];
-    ar=newArc(source,nd);
-    AttachArc(dg,ar);
-  }
-
-  for(i=0;i<dg->numNodes/2;i++){  /* Topological sorting */
-    tmp=dg->node[i];
-    dg->node[i]=dg->node[dg->numNodes-1-i];
-    dg->node[i]->id=i;
-    dg->node[dg->numNodes-1-i]=tmp;
-    dg->node[dg->numNodes-1-i]->id=dg->numNodes-1-i;
-  }
-return dg;
-}
-DGraph *buildBH(char cls){
-/*
-  Nodes of the graph must be topologically sorted
-  to avoid MPI deadlock.
-*/
-  int i=0,j=0;
-  int numSources=NUM_SOURCES,maxInDeg=4;
-  int numLayerNodes=numSources,firstLayerNode=0;
-  DGraph *dg;
-  DGNode *nd=NULL, *snd=NULL, *sink=NULL;
-  DGArc *ar=NULL;
-  int totComparators=0;
-  int numPrevLayerNodes=numLayerNodes;
-  int id=0, sid=0;
-  char nm[BLOCK_SIZE];
-
-  sprintf(nm,"DT_BH.%c",cls);
-  dg=newDGraph(nm);
-
-  for(i=0;i<numSources;i++){
-    sprintf(nm,"Source.%d",i);
-    nd=newNode(nm);
-    AttachNode(dg,nd);
-  }
-  while(numLayerNodes>maxInDeg){
-    numLayerNodes=numLayerNodes/maxInDeg;
-    if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
-    for(i=0;i<numLayerNodes;i++){
-      sprintf(nm,"Comparator.%d",totComparators);
-      totComparators++;
-      nd=newNode(nm);
-      id=AttachNode(dg,nd);
-      for(j=0;j<maxInDeg;j++){
-        sid=i*maxInDeg+j;
-  if(sid>=numPrevLayerNodes) break;
-        snd=dg->node[firstLayerNode+sid];
-        ar=newArc(snd,dg->node[id]);
-        AttachArc(dg,ar);
-      }
-    }
-    firstLayerNode+=numPrevLayerNodes;
-    numPrevLayerNodes=numLayerNodes;
-  }
-  sink=newNode("Sink");
-  AttachNode(dg,sink);   
-  for(i=0;i<numPrevLayerNodes;i++){
-    nd=dg->node[firstLayerNode+i];
-    ar=newArc(nd,sink);
-    AttachArc(dg,ar);
-  }
-return dg;
-}
-
-typedef struct{
-  int len;
-  double* val;
-} Arr;
-Arr *newArr(int len){
-  Arr *arr=(Arr *)SMPI_SHARED_MALLOC(sizeof(Arr));
-  arr->len=len;
-  arr->val=(double *)SMPI_SHARED_MALLOC(len*sizeof(double));
-  return arr;
-}
-void arrShow(Arr* a){
-  if(!a) fprintf(stderr,"-- NULL array\n");
-  else{
-    fprintf(stderr,"-- length=%d\n",a->len);
-  }
-}
-double CheckVal(Arr *feat){
-  double csum=0.0;
-  int i=0;
-  for(i=0;i<feat->len;i++){
-    csum+=feat->val[i]*feat->val[i]/feat->len; /* The truncation does not work since 
-                                                  result will be 0 for large len  */
-  }
-   return csum;
-}
-int GetFNumDPar(int* mean, int* stdev){
-  *mean=NUM_SAMPLES;
-  *stdev=STD_DEVIATION;
-  return 0;
-}
-int GetFeatureNum(char *mbname,int id){
-  double tran=314159265.0;
-  double A=2*id+1;
-  double denom=randlc(&tran,&A);
-  char cval='S';
-  int mean=NUM_SAMPLES,stdev=128;
-  int rtfs=0,len=0;
-  GetFNumDPar(&mean,&stdev);
-  rtfs=ipowMod((int)(1/denom)*(int)cval,(long long int) (2*id+1),2*stdev);
-  if(rtfs<0) rtfs=-rtfs;
-  len=mean-stdev+rtfs;
-  return len;
-}
-Arr* RandomFeatures(char *bmname,int fdim,int id){
-  int len=GetFeatureNum(bmname,id)*fdim;
-  Arr* feat=newArr(len);
-  int nxg=2,nyg=2,nzg=2,nfg=5;
-  int nx=421,ny=419,nz=1427,nf=3527;
-  long long int expon=(len*(id+1))%3141592;
-  int seedx=ipowMod(nxg,expon,nx),
-      seedy=ipowMod(nyg,expon,ny),
-      seedz=ipowMod(nzg,expon,nz),
-      seedf=ipowMod(nfg,expon,nf);
-  int i=0;
-  if(timer_on){
-    timer_clear(id+1);
-    timer_start(id+1);
-  }
-  for(i=0;i<len;i+=fdim){
-    seedx=(seedx*nxg)%nx;
-    seedy=(seedy*nyg)%ny;
-    seedz=(seedz*nzg)%nz;
-    seedf=(seedf*nfg)%nf;
-    feat->val[i]=seedx;
-    feat->val[i+1]=seedy;
-    feat->val[i+2]=seedz;
-    feat->val[i+3]=seedf;
-  }
-  if(timer_on){
-    timer_stop(id+1);
-    fprintf(stderr,"** RandomFeatures time in node %d = %f\n",id,timer_read(id+1));
-  }
-  return feat;   
-}
-void Resample(Arr *a,int blen){
-    long long int i=0,j=0,jlo=0,jhi=0;
-    double avval=0.0;
-    double *nval=(double *)SMPI_SHARED_MALLOC(blen*sizeof(double));
-    Arr *tmp=newArr(10);
-    for(i=0;i<blen;i++) nval[i]=0.0;
-    for(i=1;i<a->len-1;i++){
-      jlo=(int)(0.5*(2*i-1)*(blen/a->len)); 
-      jhi=(int)(0.5*(2*i+1)*(blen/a->len));
-
-      avval=a->val[i]/(jhi-jlo+1);    
-      for(j=jlo;j<=jhi;j++){
-        nval[j]+=avval;
-      }
-    }
-    nval[0]=a->val[0];
-    nval[blen-1]=a->val[a->len-1];
-    SMPI_SHARED_FREE(a->val);
-    a->val=nval;
-    a->len=blen;
-}
-#define fielddim 4
-Arr* WindowFilter(Arr *a, Arr* b,int w){
-  int i=0,j=0,k=0;
-  double rms0=0.0,rms1=0.0,rmsm1=0.0;
-  double weight=((double) (w+1))/(w+2);
- 
-  w+=1;
-  if(timer_on){
-    timer_clear(w);
-    timer_start(w);
-  }
-  if(a->len<b->len) Resample(a,b->len);
-  if(a->len>b->len) Resample(b,a->len);
-  for(i=fielddim;i<a->len-fielddim;i+=fielddim){
-    rms0=(a->val[i]-b->val[i])*(a->val[i]-b->val[i])
-  +(a->val[i+1]-b->val[i+1])*(a->val[i+1]-b->val[i+1])
-  +(a->val[i+2]-b->val[i+2])*(a->val[i+2]-b->val[i+2])
-  +(a->val[i+3]-b->val[i+3])*(a->val[i+3]-b->val[i+3]);
-    j=i+fielddim;
-    rms1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
-      +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
-      +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
-      +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
-    j=i-fielddim;
-    rmsm1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
-   +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
-   +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
-   +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
-    k=0;
-    if(rms1<rms0){
-      k=1;
-      rms0=rms1;
-    }
-    if(rmsm1<rms0) k=-1;
-    if(k==0){
-      j=i+fielddim;
-      a->val[i]=weight*b->val[i];
-      a->val[i+1]=weight*b->val[i+1];
-      a->val[i+2]=weight*b->val[i+2];
-      a->val[i+3]=weight*b->val[i+3];  
-    }else if(k==1){
-      j=i+fielddim;
-      a->val[i]=weight*b->val[j];
-      a->val[i+1]=weight*b->val[j+1];
-      a->val[i+2]=weight*b->val[j+2];
-      a->val[i+3]=weight*b->val[j+3];  
-    }else { /*if(k==-1)*/
-      j=i-fielddim;
-      a->val[i]=weight*b->val[j];
-      a->val[i+1]=weight*b->val[j+1];
-      a->val[i+2]=weight*b->val[j+2];
-      a->val[i+3]=weight*b->val[j+3];  
-    }     
-  }
-  if(timer_on){
-    timer_stop(w);
-    fprintf(stderr,"** WindowFilter time in node %d = %f\n",(w-1),timer_read(w));
-  }
-  return a;
-}
-
-int SendResults(DGraph *dg,DGNode *nd,Arr *feat){
-  int i=0,tag=0;
-  DGArc *ar=NULL;
-  DGNode *head=NULL;
-  if(!feat) return 0;
-  for(i=0;i<nd->outDegree;i++){
-    ar=nd->outArc[i];
-    if(ar->tail!=nd) continue;
-    head=ar->head;
-    tag=ar->id;
-    if(head->address!=nd->address){
-      MPI_Send(&feat->len,1,MPI_INT,head->address,tag,MPI_COMM_WORLD);
-      MPI_Send(feat->val,feat->len,MPI_DOUBLE,head->address,tag,MPI_COMM_WORLD);
-    }
-  }
-  return 1;
-}
-Arr* CombineStreams(DGraph *dg,DGNode *nd){
-  Arr *resfeat=newArr(NUM_SAMPLES*fielddim);
-  int i=0,len=0,tag=0;
-  DGArc *ar=NULL;
-  DGNode *tail=NULL;
-  MPI_Status status;
-  Arr *feat=NULL,*featp=NULL;
-
-  if(nd->inDegree==0) return NULL;
-  for(i=0;i<nd->inDegree;i++){
-    ar=nd->inArc[i];
-    if(ar->head!=nd) continue;
-    tail=ar->tail;
-    if(tail->address!=nd->address){
-      len=0;
-      tag=ar->id;
-      MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
-      feat=newArr(len);
-      MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
-      resfeat=WindowFilter(resfeat,feat,nd->id);
-      SMPI_SHARED_FREE(feat);
-    }else{
-      featp=(Arr *)tail->feat;
-      feat=newArr(featp->len);
-      memcpy(feat->val,featp->val,featp->len*sizeof(double));
-      resfeat=WindowFilter(resfeat,feat,nd->id);  
-      SMPI_SHARED_FREE(feat);
-    }
-  }
-  for(i=0;i<resfeat->len;i++) resfeat->val[i]=((int)resfeat->val[i])/nd->inDegree;
-  nd->feat=resfeat;
-  return nd->feat;
-}
-double Reduce(Arr *a,int w){
-  double retv=0.0;
-  if(timer_on){
-    timer_clear(w);
-    timer_start(w);
-  }
-  retv=(int)(w*CheckVal(a));/* The casting needed for node  
-                               and array dependent verifcation */
-  if(timer_on){
-    timer_stop(w);
-    fprintf(stderr,"** Reduce time in node %d = %f\n",(w-1),timer_read(w));
-  }
-  return retv;
-}
-
-double ReduceStreams(DGraph *dg,DGNode *nd){
-  double csum=0.0;
-  int i=0,len=0,tag=0;
-  DGArc *ar=NULL;
-  DGNode *tail=NULL;
-  Arr *feat=NULL;
-  double retv=0.0;
-
-  for(i=0;i<nd->inDegree;i++){
-    ar=nd->inArc[i];
-    if(ar->head!=nd) continue;
-    tail=ar->tail;
-    if(tail->address!=nd->address){
-      MPI_Status status;
-      len=0;
-      tag=ar->id;
-      MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
-      feat=newArr(len);
-      MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
-      csum+=Reduce(feat,(nd->id+1));  
-      SMPI_SHARED_FREE(feat);
-    }else{
-      csum+=Reduce(tail->feat,(nd->id+1));  
-    }
-  }
-  if(nd->inDegree>0)csum=(((long long int)csum)/nd->inDegree);
-  retv=(nd->id+1)*csum;
-  return retv;
-}
-
-int ProcessNodes(DGraph *dg,int me){
-  double chksum=0.0;
-  Arr *feat=NULL;
-  int i=0,verified=0,tag;
-  DGNode *nd=NULL;
-  double rchksum=0.0;
-  MPI_Status status;
-
-  for(i=0;i<dg->numNodes;i++){
-    nd=dg->node[i];
-    if(nd->address!=me) continue;
-    if(strstr(nd->name,"Source")){
-      nd->feat=RandomFeatures(dg->name,fielddim,nd->id); 
-      SendResults(dg,nd,nd->feat);
-    }else if(strstr(nd->name,"Sink")){
-      chksum=ReduceStreams(dg,nd);
-      tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */
-      MPI_Send(&chksum,1,MPI_DOUBLE,0,tag,MPI_COMM_WORLD);
-    }else{
-      feat=CombineStreams(dg,nd);
-      SendResults(dg,nd,feat);
-    }
-  }
-  if(me==0){ /* Report node */
-    rchksum=0.0;
-    chksum=0.0;
-    for(i=0;i<dg->numNodes;i++){
-      nd=dg->node[i];
-      if(!strstr(nd->name,"Sink")) continue;
-       tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */
-      MPI_Recv(&rchksum,1,MPI_DOUBLE,nd->address,tag,MPI_COMM_WORLD,&status);
-      chksum+=rchksum;
-    }
-    verified=verify(dg->name,chksum);
-  }
-return verified;
-}
-
-int main(int argc,char **argv ){
-  int my_rank,comm_size;
-  int i;
-  DGraph *dg=NULL;
-  int verified=0, featnum=0;
-  double bytes_sent=2.0,tot_time=0.0;
-
-    MPI_Init( &argc, &argv );
-    MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
-    MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
-
-     if(argc!=2||
-                (  strncmp(argv[1],"BH",2)!=0
-                 &&strncmp(argv[1],"WH",2)!=0
-                 &&strncmp(argv[1],"SH",2)!=0
-                )
-      ){
-      if(my_rank==0){
-        fprintf(stderr,"** Usage: mpirun -np N ../bin/dt.S GraphName\n");
-        fprintf(stderr,"** Where \n   - N is integer number of MPI processes\n");
-        fprintf(stderr,"   - S is the class S, W, or A \n");
-        fprintf(stderr,"   - GraphName is the communication graph name BH, WH, or SH.\n");
-        fprintf(stderr,"   - the number of MPI processes N should not be be less than \n");
-        fprintf(stderr,"     the number of nodes in the graph\n");
-      }
-      MPI_Finalize();
-      exit(0);
-    } 
-   if(strncmp(argv[1],"BH",2)==0){
-      dg=buildBH(CLASS);
-    }else if(strncmp(argv[1],"WH",2)==0){
-      dg=buildWH(CLASS);
-    }else if(strncmp(argv[1],"SH",2)==0){
-      dg=buildSH(CLASS);
-    }
-
-    if(timer_on&&dg->numNodes+1>timers_tot){
-      timer_on=0;
-      if(my_rank==0)
-        fprintf(stderr,"Not enough timers. Node timeing is off. \n");
-    }
-    if(dg->numNodes>comm_size){
-      if(my_rank==0){
-        fprintf(stderr,"**  The number of MPI processes should not be less than \n");
-        fprintf(stderr,"**  the number of nodes in the graph\n");
-        fprintf(stderr,"**  Number of MPI processes = %d\n",comm_size);
-        fprintf(stderr,"**  Number nodes in the graph = %d\n",dg->numNodes);
-      }
-      MPI_Finalize();
-      exit(0);
-    }
-    for(i=0;i<dg->numNodes;i++){ 
-      dg->node[i]->address=i;
-    }
-    if( my_rank == 0 ){
-      printf( "\n\n NAS Parallel Benchmarks 3.3 -- DT Benchmark\n\n" );
-      graphShow(dg,0);
-      timer_clear(0);
-      timer_start(0);
-    }
-    verified=ProcessNodes(dg,my_rank);
-    
-    featnum=NUM_SAMPLES*fielddim;
-    bytes_sent=featnum*dg->numArcs;
-    bytes_sent/=1048576;
-    if(my_rank==0){
-      timer_stop(0);
-      tot_time=timer_read(0);
-      c_print_results( dg->name,
-                 CLASS,
-                 featnum,
-                 0,
-                 0,
-                 dg->numNodes,
-                 0,
-                 comm_size,
-                 tot_time,
-                 bytes_sent/tot_time,
-                 "bytes transmitted", 
-                 verified,
-                 NPBVERSION,
-                 COMPILETIME,
-                 MPICC,
-                 CLINK,
-                 CMPI_LIB,
-                 CMPI_INC,
-                 CFLAGS,
-                 CLINKFLAGS );
-    }          
-    MPI_Finalize();
-  return 1;
-}
diff --git a/examples/smpi/NAS/EP/Makefile b/examples/smpi/NAS/EP/Makefile
deleted file mode 100644
index 3d38c277cd..0000000000
--- a/examples/smpi/NAS/EP/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-BENCHMARK=ep
-include ../config/make.def
-include ../sys/make.common
-
-${PROGRAM}: config ep.o ep-sampling.o ../common/randdp.o
-	${CLINK} ${CLINKFLAGS} -o ${BINDIR}/ep.${CLASS}.${NPROCS} ep.o ../common/randdp.o ${CMPI_LIB} -lm
-	${CLINK} ${CLINKFLAGS} -o ${BINDIR}/ep-sampling.${CLASS}.${NPROCS} ep-sampling.o ../common/randdp.o ${CMPI_LIB} -lm
-
-ep.o:	ep.c npbparams.h
-	${CCOMPILE} ep.c
-ep-sampling.o:	ep-sampling.c npbparams.h
-	${CCOMPILE} ep-sampling.c
-
-clean:
-	- rm -f *.o *~ npbparams.h
diff --git a/examples/smpi/NAS/EP/ep-sampling.c b/examples/smpi/NAS/EP/ep-sampling.c
deleted file mode 100644
index c5956b6d2e..0000000000
--- a/examples/smpi/NAS/EP/ep-sampling.c
+++ /dev/null
@@ -1,438 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-
-#include "mpi.h"
-#include "npbparams.h"
-
-#ifndef CLASS
-#define CLASS 'S'
-#define NUM_PROCS            1                 
-#endif
-#define true 1
-#define false 0
-
-
-//---NOTE : all the timers function have been modified to
-//          avoid global timers (privatize these). 
-      // ----------------------- timers ---------------------
-      void timer_clear(double *onetimer) {
-            //elapsed[n] = 0.0;
-            *onetimer = 0.0;
-      }
-
-      void timer_start(double *onetimer) {
-            *onetimer = MPI_Wtime();
-      }
-
-      void timer_stop(int n,double *elapsed,double *start) {
-            double t, now;
-
-            now = MPI_Wtime();
-            t = now - start[n];
-            elapsed[n] += t;
-      }
-
-      double timer_read(int n, double *elapsed) {  /* ok, useless, but jsut to keep function call */
-            return(elapsed[n]);
-      }
-      /********************************************************************
-       *****************            V R A N L C          ******************
-       *****************                                 *****************/           
-      double vranlc(int n, double x, double a, double *y)
-      {
-        int i;
-        long  i246m1=0x00003FFFFFFFFFFF;
-    long  LLx, Lx, La;
-        double d2m46;
-
-// This doesn't work, because the compiler does the calculation in 32
-// bits and overflows. No standard way (without f90 stuff) to specify
-// that the rhs should be done in 64 bit arithmetic.
-//     parameter(i246m1=2**46-1)
-
-      d2m46=pow(0.5,46);
-
-// c Note that the v6 compiler on an R8000 does something stupid with
-// c the above. Using the following instead (or various other things)
-// c makes the calculation run almost 10 times as fast.
-//
-// c     save d2m46
-// c      data d2m46/0.0d0/
-// c      if (d2m46 .eq. 0.0d0) then
-// c         d2m46 = 0.5d0**46
-// c      endif
-
-      Lx = (long)x;
-      La = (long)a;
-      //fprintf(stdout,("================== Vranlc ================");
-      //fprintf(stdout,("Before Loop: Lx = " + Lx + ", La = " + La);
-  LLx = Lx;
-  for (i=0; i< n; i++) {
-      Lx   = Lx*La & i246m1 ;
-      LLx = Lx;
-      y[i] = d2m46 * (double)LLx;
-      /*
-         if(i == 0) {
-         fprintf(stdout,("After loop 0:");
-         fprintf(stdout,("Lx = " + Lx + ", La = " + La);
-         fprintf(stdout,("d2m46 = " + d2m46);
-         fprintf(stdout,("LLX(Lx) = " + LLX.doubleValue());
-         fprintf(stdout,("Y[0]" + y[0]);
-         }
-       */
-  }
-
-      x = (double)LLx;
-      /*
-      fprintf(stdout,("Change: Lx = " + Lx);
-      fprintf(stdout,("=============End   Vranlc ================");
-      */
-      return x;
-    }
-
-
-
-//-------------- the core (unique function) -----------
-      void doTest(int argc, char **argv) {
-      double dum[3] = {1.,1.,1.};
-      double x1, x2, sx, sy, tm, an, tt, gc;
-      double Mops;
-      double epsilon=1.0E-8, a = 1220703125., s=271828183.;
-      double t1, t2, t3, t4; 
-      double sx_verify_value, sy_verify_value, sx_err, sy_err;
-
-#include "npbparams.h"
-      int    mk=16, 
-         // --> set by make : in npbparams.h
-         //m=28, // for CLASS=A
-         //m=30, // for CLASS=B
-         //npm=2, // NPROCS
-         mm = m-mk, 
-         nn = (int)(pow(2,mm)), 
-         nk = (int)(pow(2,mk)), 
-         nq=10, 
-         np, 
-         node, 
-         no_nodes, 
-         i, 
-         ik, 
-         kk, 
-         l, 
-         k, nit, no_large_nodes,
-         np_add, k_offset, j;
-      int    me, nprocs, root=0, dp_type;
-      int verified, 
-          timers_enabled=true;
-      char  size[500]; // mind the size of the string to represent a big number
-
-      //Use in randlc..
-      int KS = 0;
-      double R23, R46, T23, T46;
-
-      double *qq = (double *) malloc (10000*sizeof(double));
-      double *start = (double *) malloc (64*sizeof(double));
-      double *elapsed = (double *) malloc (64*sizeof(double));
-
-      double *x = (double *) malloc (2*nk*sizeof(double));
-      double *q = (double *) malloc (nq*sizeof(double));
-
-      MPI_Init( &argc, &argv );
-      MPI_Comm_size( MPI_COMM_WORLD, &no_nodes);
-      MPI_Comm_rank( MPI_COMM_WORLD, &node);
-
-#ifdef USE_MPE
-    MPE_Init_log();
-#endif
-      root = 0;
-      if (node == root ) {
-
-          /*   Because the size of the problem is too large to store in a 32-bit
-           *   integer for some classes, we put it into a string (for printing).
-           *   Have to strip off the decimal point put in there by the floating
-           *   point print statement (internal file)
-           */
-          fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark");
-          sprintf(size,"%d",(int) pow(2,m+1));
-          //size = size.replace('.', ' ');
-          fprintf(stdout," Number of random numbers generated: %s\n",size);
-          fprintf(stdout," Number of active processes: %d\n",no_nodes);
-
-      }
-      verified = false;
-
-      /* c   Compute the number of "batches" of random number pairs generated 
-         c   per processor. Adjust if the number of processors does not evenly 
-         c   divide the total number
-*/
-
-       np = nn / no_nodes;
-       no_large_nodes = nn % no_nodes;
-       if (node < no_large_nodes) np_add = 1;
-       else np_add = 0;
-       np = np + np_add;
-
-       if (np == 0) {
-             fprintf(stdout,"Too many nodes: %d  %d",no_nodes,nn);
-             MPI_Abort(MPI_COMM_WORLD,1);
-             exit(0); 
-       } 
-
-/* c   Call the random number generator functions and initialize
-   c   the x-array to reduce the effects of paging on the timings.
-   c   Also, call all mathematical functions that are used. Make
-   c   sure these initializations cannot be eliminated as dead code.
-*/
-
-   //call vranlc(0, dum[1], dum[2], dum[3]);
-   // Array indexes start at 1 in Fortran, 0 in Java
-   vranlc(0, dum[0], dum[1], &(dum[2])); 
-
-   dum[0] = randlc(&(dum[1]),&(dum[2]));
-   /////////////////////////////////
-   for (i=0;i<2*nk;i++) {
-       x[i] = -1e99;
-   }
-   Mops = log(sqrt(abs(1))); 
-
-   /*
-      c---------------------------------------------------------------------
-      c    Synchronize before placing time stamp
-      c---------------------------------------------------------------------
-    */
-        MPI_Barrier( MPI_COMM_WORLD );
-
-        timer_clear(&(elapsed[1]));
-        timer_clear(&(elapsed[2]));
-        timer_clear(&(elapsed[3]));
-        timer_start(&(start[1]));
-        
-        t1 = a;
-  //fprintf(stdout,("(ep.f:160) t1 = " + t1);
-        t1 = vranlc(0, t1, a, x);
-  //fprintf(stdout,("(ep.f:161) t1 = " + t1);
-  
-        
-/* c   Compute AN = A ^ (2 * NK) (mod 2^46). */
-        
-        t1 = a;
-  //fprintf(stdout,("(ep.f:165) t1 = " + t1);
-        for (i=1; i <= mk+1; i++) {
-               t2 = randlc(&t1, &t1);
-         //fprintf(stdout,("(ep.f:168)[loop i=" + i +"] t1 = " + t1);
-        } 
-        an = t1;
-  //fprintf(stdout,("(ep.f:172) s = " + s);
-        tt = s;
-        gc = 0.;
-        sx = 0.;
-        sy = 0.;
-        for (i=0; i < nq ; i++) {
-               q[i] = 0.;
-        }
-
-/*
-    Each instance of this loop may be performed independently. We compute
-    the k offsets separately to take into account the fact that some nodes
-    have more numbers to generate than others
-*/
-
-      if (np_add == 1)
-         k_offset = node * np -1;
-      else
-         k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1;
-     
-      int stop = false;
-      for(k = 1; k <= np; k++) SMPI_SAMPLE_LOCAL(0.25 * np, 0.03) {
-         stop = false;
-         kk = k_offset + k ;
-         t1 = s;
-         //fprintf(stdout,("(ep.f:193) t1 = " + t1);
-         t2 = an;
-
-//       Find starting seed t1 for this kk.
-
-         for (i=1;i<=100 && !stop;i++) {
-            ik = kk / 2;
-      //fprintf(stdout,("(ep.f:199) ik = " +ik+", kk = " + kk);
-            if (2 * ik != kk)  {
-                t3 = randlc(&t1, &t2);
-                //fprintf(stdout,("(ep.f:200) t1= " +t1 );
-            }
-            if (ik==0)
-                stop = true;
-            else {
-               t3 = randlc(&t2, &t2);
-               kk = ik;
-           }
-         }
-//       Compute uniform pseudorandom numbers.
-
-         //if (timers_enabled)  timer_start(3);
-   timer_start(&(start[3]));
-         //call vranlc(2 * nk, t1, a, x)  --> t1 and y are modified
-
-  //fprintf(stdout,">>>>>>>>>>>Before vranlc(l.210)<<<<<<<<<<<<<");
-  //fprintf(stdout,"2*nk = " + (2*nk));
-  //fprintf(stdout,"t1 = " + t1);
-  //fprintf(stdout,"a  = " + a);
-  //fprintf(stdout,"x[0] = " + x[0]);
-  //fprintf(stdout,">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
-        
-  t1 = vranlc(2 * nk, t1, a, x);
-
-  //fprintf(stdout,(">>>>>>>>>>>After  Enter vranlc (l.210)<<<<<<");
-  //fprintf(stdout,("2*nk = " + (2*nk));
-  //fprintf(stdout,("t1 = " + t1);
-  //fprintf(stdout,("a  = " + a);
-  //fprintf(stdout,("x[0] = " + x[0]);
-  //fprintf(stdout,(">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
-        
-         //if (timers_enabled)  timer_stop(3);
-   timer_stop(3,elapsed,start);
-
-/*       Compute Gaussian deviates by acceptance-rejection method and 
- *       tally counts in concentric square annuli.  This loop is not 
- *       vectorizable. 
- */
-         //if (timers_enabled) timer_start(2);
-    timer_start(&(start[2]));
-         for(i=1; i<=nk;i++) {
-            x1 = 2. * x[2*i-2] -1.0;
-            x2 = 2. * x[2*i-1] - 1.0;
-            t1 = x1*x1 + x2*x2;
-            if (t1 <= 1.) {
-               t2   = sqrt(-2. * log(t1) / t1);
-               t3   = (x1 * t2);
-               t4   = (x2 * t2);
-               l    = (int)(abs(t3) > abs(t4) ? abs(t3) : abs(t4));
-               q[l] = q[l] + 1.;
-               sx   = sx + t3;
-               sy   = sy + t4;
-             }
-    /*
-       if(i == 1) {
-                fprintf(stdout,"x1 = " + x1);
-                fprintf(stdout,"x2 = " + x2);
-                fprintf(stdout,"t1 = " + t1);
-                fprintf(stdout,"t2 = " + t2);
-                fprintf(stdout,"t3 = " + t3);
-                fprintf(stdout,"t4 = " + t4);
-                fprintf(stdout,"l = " + l);
-                fprintf(stdout,"q[l] = " + q[l]);
-                fprintf(stdout,"sx = " + sx);
-                fprintf(stdout,"sy = " + sy);
-       }
-    */
-           }
-         //if (timers_enabled)  timer_stop(2);
-    timer_stop(2,elapsed,start);
-      }
-
-      //int MPI_Allreduce(void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)   
-  MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-  sx = x[0]; //FIXME :  x[0] or x[1] => x[0] because fortran starts with 1
-      MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-      sy = x[0];
-      MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-
-      for(i = 0; i < nq; i++) {
-    q[i] = x[i];
-  }
-  for(i = 0; i < nq; i++) {
-    gc += q[i];
-  }
-
-  timer_stop(1,elapsed,start);
-      tm = timer_read(1,elapsed);
-  MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-  tm = x[0];
-
-  if(node == root) {
-    nit = 0;
-    verified = true;
-
-    if(m == 24) {
-             sx_verify_value = -3.247834652034740E3;
-                  sy_verify_value = -6.958407078382297E3;
-    } else if(m == 25) {
-                sx_verify_value = -2.863319731645753E3;
-      sy_verify_value = -6.320053679109499E3;
-    } else if(m == 28) {
-            sx_verify_value = -4.295875165629892E3;
-      sy_verify_value = -1.580732573678431E4;
-    } else if(m == 30) {
-            sx_verify_value =  4.033815542441498E4;
-                  sy_verify_value = -2.660669192809235E4;
-    } else if(m == 32) {
-                  sx_verify_value =  4.764367927995374E4;
-                     sy_verify_value = -8.084072988043731E4;
-    } else if(m == 36) {
-            sx_verify_value =  1.982481200946593E5;
-            sy_verify_value = -1.020596636361769E5;
-    } else {
-      verified = false;
-    }
-
-    /*
-    fprintf(stdout,("sx        = " + sx);
-    fprintf(stdout,("sx_verify = " + sx_verify_value);
-    fprintf(stdout,("sy        = " + sy);
-    fprintf(stdout,("sy_verify = " + sy_verify_value);
-    */
-    if(verified) {
-      sx_err = abs((sx - sx_verify_value)/sx_verify_value);
-      sy_err = abs((sy - sy_verify_value)/sy_verify_value);
-      /*
-      fprintf(stdout,("sx_err = " + sx_err);
-      fprintf(stdout,("sy_err = " + sx_err);
-      fprintf(stdout,("epsilon= " + epsilon);
-      */
-      verified = ((sx_err < epsilon) && (sy_err < epsilon));
-    }
-
-    Mops = (pow(2.0, m+1))/tm/1000;
-
-    fprintf(stdout,"EP Benchmark Results:\n");
-    fprintf(stdout,"CPU Time=%d\n",(int) tm);
-    fprintf(stdout,"N = 2^%d\n",m);
-    fprintf(stdout,"No. Gaussain Pairs =%d\n",(int) gc);
-    fprintf(stdout,"Sum = %f %ld\n",sx,(long) sy);
-    fprintf(stdout,"Count:");
-    for(i = 0; i < nq; i++) {
-      fprintf(stdout,"%d\t %ld\n",i,(long) q[i]);
-    }
-
-    /*
-    print_results("EP", _class, m+1, 0, 0, nit, npm, no_nodes, tm, Mops,
-        "Random numbers generated", verified, npbversion,
-        compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) */
-    fprintf(stdout,"\nEP Benchmark Completed\n");
-            fprintf(stdout,"Class           = %s\n", _class);
-    fprintf(stdout,"Size            = %s\n", size);
-    fprintf(stdout,"Iteration       = %d\n", nit);
-    fprintf(stdout,"Time in seconds = %f\n",(tm/1000));
-    fprintf(stdout,"Total processes = %d\n",no_nodes);
-    fprintf(stdout,"Mops/s total    = %f\n",Mops);
-    fprintf(stdout,"Mops/s/process  = %f\n", Mops/no_nodes);
-    fprintf(stdout,"Operation type  = Random number generated\n");
-    if(verified) {
-      fprintf(stdout,"Verification    = SUCCESSFUL\n");
-    } else {
-      fprintf(stdout,"Verification    = UNSUCCESSFUL\n");
-    }
-             fprintf(stdout,"Total time:     %f\n",(timer_read(1,elapsed)/1000));
-             fprintf(stdout,"Gaussian pairs: %f\n",(timer_read(2,elapsed)/1000));
-             fprintf(stdout,"Random numbers: %f\n",(timer_read(3,elapsed)/1000));
-         }
-#ifdef USE_MPE
-    MPE_Finish_log(argv[0]);
-#endif
- 
-       MPI_Finalize();
-      }
-
-    int main(int argc, char **argv) {
-       doTest(argc,argv);
-    }
diff --git a/examples/smpi/NAS/EP/ep.c b/examples/smpi/NAS/EP/ep.c
deleted file mode 100644
index 569c2f21df..0000000000
--- a/examples/smpi/NAS/EP/ep.c
+++ /dev/null
@@ -1,445 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-
-#include "mpi.h"
-#include "npbparams.h"
-
-#include "simgrid/instr.h" //TRACE_
-
-#ifndef CLASS
-#define CLASS 'S'
-#define NUM_PROCS            1                 
-#endif
-#define true 1
-#define false 0
-
-//---NOTE : all the timers function have been modified to
-//          avoid global timers (privatize these). 
-      // ----------------------- timers ---------------------
-      void timer_clear(double *onetimer) {
-            //elapsed[n] = 0.0;
-            *onetimer = 0.0;
-      }
-
-      void timer_start(double *onetimer) {
-            *onetimer = MPI_Wtime();
-      }
-
-      void timer_stop(int n,double *elapsed,double *start) {
-            double t, now;
-
-            now = MPI_Wtime();
-            t = now - start[n];
-            elapsed[n] += t;
-      }
-
-      double timer_read(int n, double *elapsed) {  /* ok, useless, but jsut to keep function call */
-            return(elapsed[n]);
-      }
-      /********************************************************************
-       *****************            V R A N L C          ******************
-       *****************                                 *****************/           
-      double vranlc(int n, double x, double a, double *y)
-      {
-        int i;
-        long  i246m1=0x00003FFFFFFFFFFF;
-    long  LLx, Lx, La;
-        double d2m46;
-
-// This doesn't work, because the compiler does the calculation in 32
-// bits and overflows. No standard way (without f90 stuff) to specify
-// that the rhs should be done in 64 bit arithmetic.
-//     parameter(i246m1=2**46-1)
-
-      d2m46=pow(0.5,46);
-
-// c Note that the v6 compiler on an R8000 does something stupid with
-// c the above. Using the following instead (or various other things)
-// c makes the calculation run almost 10 times as fast.
-//
-// c     save d2m46
-// c      data d2m46/0.0d0/
-// c      if (d2m46 .eq. 0.0d0) then
-// c         d2m46 = 0.5d0**46
-// c      endif
-
-      Lx = (long)x;
-      La = (long)a;
-      //fprintf(stdout,("================== Vranlc ================");
-      //fprintf(stdout,("Before Loop: Lx = " + Lx + ", La = " + La);
-  LLx = Lx;
-  for (i=0; i< n; i++) {
-      Lx   = Lx*La & i246m1 ;
-      LLx = Lx;
-      y[i] = d2m46 * (double)LLx;
-      /*
-         if(i == 0) {
-         fprintf(stdout,("After loop 0:");
-         fprintf(stdout,("Lx = " + Lx + ", La = " + La);
-         fprintf(stdout,("d2m46 = " + d2m46);
-         fprintf(stdout,("LLX(Lx) = " + LLX.doubleValue());
-         fprintf(stdout,("Y[0]" + y[0]);
-         }
-       */
-  }
-
-      x = (double)LLx;
-      /*
-      fprintf(stdout,("Change: Lx = " + Lx);
-      fprintf(stdout,("=============End   Vranlc ================");
-      */
-      return x;
-    }
-
-
-
-//-------------- the core (unique function) -----------
-      void doTest(int argc, char **argv) {
-      double dum[3] = {1.,1.,1.};
-      double x1, x2, sx, sy, tm, an, tt, gc;
-      double Mops;
-      double epsilon=1.0E-8, a = 1220703125., s=271828183.;
-      double t1, t2, t3, t4; 
-      double sx_verify_value, sy_verify_value, sx_err, sy_err;
-
-#include "npbparams.h"
-      int    mk=16, 
-         // --> set by make : in npbparams.h
-         //m=28, // for CLASS=A
-         //m=30, // for CLASS=B
-         //npm=2, // NPROCS
-         mm = m-mk, 
-         nn = (int)(pow(2,mm)), 
-         nk = (int)(pow(2,mk)), 
-         nq=10, 
-         np, 
-         node, 
-         no_nodes, 
-         i, 
-         ik, 
-         kk, 
-         l, 
-         k, nit, no_large_nodes,
-         np_add, k_offset, j;
-      int    me, nprocs, root=0, dp_type;
-      int verified, 
-          timers_enabled=true;
-      char  size[500]; // mind the size of the string to represent a big number
-
-      //Use in randlc..
-      int KS = 0;
-      double R23, R46, T23, T46;
-
-      double *qq = (double *) malloc (10000*sizeof(double));
-      double *start = (double *) malloc (64*sizeof(double));
-      double *elapsed = (double *) malloc (64*sizeof(double));
-
-      double *x = (double *) malloc (2*nk*sizeof(double));
-      double *q = (double *) malloc (nq*sizeof(double));
-
-      TRACE_smpi_set_category ("start");
-
-      MPI_Init( &argc, &argv );
-      MPI_Comm_size( MPI_COMM_WORLD, &no_nodes);
-      MPI_Comm_rank( MPI_COMM_WORLD, &node);
-
-#ifdef USE_MPE
-    MPE_Init_log();
-#endif
-      root = 0;
-      if (node == root ) {
-
-          /*   Because the size of the problem is too large to store in a 32-bit
-           *   integer for some classes, we put it into a string (for printing).
-           *   Have to strip off the decimal point put in there by the floating
-           *   point print statement (internal file)
-           */
-          fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark");
-          sprintf(size,"%d",(int)pow(2,m+1));
-          //size = size.replace('.', ' ');
-          fprintf(stdout," Number of random numbers generated: %s\n",size);
-          fprintf(stdout," Number of active processes: %d\n",no_nodes);
-
-      }
-      verified = false;
-
-      /* c   Compute the number of "batches" of random number pairs generated 
-         c   per processor. Adjust if the number of processors does not evenly 
-         c   divide the total number
-*/
-
-       np = nn / no_nodes;
-       no_large_nodes = nn % no_nodes;
-       if (node < no_large_nodes) np_add = 1;
-       else np_add = 0;
-       np = np + np_add;
-
-       if (np == 0) {
-             fprintf(stdout,"Too many nodes: %d  %d",no_nodes,nn);
-             MPI_Abort(MPI_COMM_WORLD,1);
-             exit(0); 
-       } 
-
-/* c   Call the random number generator functions and initialize
-   c   the x-array to reduce the effects of paging on the timings.
-   c   Also, call all mathematical functions that are used. Make
-   c   sure these initializations cannot be eliminated as dead code.
-*/
-
-   //call vranlc(0, dum[1], dum[2], dum[3]);
-   // Array indexes start at 1 in Fortran, 0 in Java
-   vranlc(0, dum[0], dum[1], &(dum[2])); 
-
-   dum[0] = randlc(&(dum[1]),&(dum[2]));
-   /////////////////////////////////
-   for (i=0;i<2*nk;i++) {
-       x[i] = -1e99;
-   }
-   Mops = log(sqrt(abs(1))); 
-
-   /*
-      c---------------------------------------------------------------------
-      c    Synchronize before placing time stamp
-      c---------------------------------------------------------------------
-    */
-        MPI_Barrier( MPI_COMM_WORLD );
-
-        TRACE_smpi_set_category ("ep");
-
-        timer_clear(&(elapsed[1]));
-        timer_clear(&(elapsed[2]));
-        timer_clear(&(elapsed[3]));
-        timer_start(&(start[1]));
-        
-        t1 = a;
-  //fprintf(stdout,("(ep.f:160) t1 = " + t1);
-        t1 = vranlc(0, t1, a, x);
-  //fprintf(stdout,("(ep.f:161) t1 = " + t1);
-  
-        
-/* c   Compute AN = A ^ (2 * NK) (mod 2^46). */
-        
-        t1 = a;
-  //fprintf(stdout,("(ep.f:165) t1 = " + t1);
-        for (i=1; i <= mk+1; i++) {
-               t2 = randlc(&t1, &t1);
-         //fprintf(stdout,("(ep.f:168)[loop i=" + i +"] t1 = " + t1);
-        } 
-        an = t1;
-  //fprintf(stdout,("(ep.f:172) s = " + s);
-        tt = s;
-        gc = 0.;
-        sx = 0.;
-        sy = 0.;
-        for (i=0; i < nq ; i++) {
-               q[i] = 0.;
-        }
-
-/*
-    Each instance of this loop may be performed independently. We compute
-    the k offsets separately to take into account the fact that some nodes
-    have more numbers to generate than others
-*/
-
-      if (np_add == 1)
-         k_offset = node * np -1;
-      else
-         k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1;
-     
-      int stop = false;
-      for(k = 1; k <= np; k++) {
-         stop = false;
-         kk = k_offset + k ;
-         t1 = s;
-         //fprintf(stdout,("(ep.f:193) t1 = " + t1);
-         t2 = an;
-
-//       Find starting seed t1 for this kk.
-
-         for (i=1;i<=100 && !stop;i++) {
-            ik = kk / 2;
-      //fprintf(stdout,("(ep.f:199) ik = " +ik+", kk = " + kk);
-            if (2 * ik != kk)  {
-                t3 = randlc(&t1, &t2);
-                //fprintf(stdout,("(ep.f:200) t1= " +t1 );
-            }
-            if (ik==0)
-                stop = true;
-            else {
-               t3 = randlc(&t2, &t2);
-               kk = ik;
-           }
-         }
-//       Compute uniform pseudorandom numbers.
-
-         //if (timers_enabled)  timer_start(3);
-   timer_start(&(start[3]));
-         //call vranlc(2 * nk, t1, a, x)  --> t1 and y are modified
-
-  //fprintf(stdout,">>>>>>>>>>>Before vranlc(l.210)<<<<<<<<<<<<<");
-  //fprintf(stdout,"2*nk = " + (2*nk));
-  //fprintf(stdout,"t1 = " + t1);
-  //fprintf(stdout,"a  = " + a);
-  //fprintf(stdout,"x[0] = " + x[0]);
-  //fprintf(stdout,">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
-        
-  t1 = vranlc(2 * nk, t1, a, x);
-
-  //fprintf(stdout,(">>>>>>>>>>>After  Enter vranlc (l.210)<<<<<<");
-  //fprintf(stdout,("2*nk = " + (2*nk));
-  //fprintf(stdout,("t1 = " + t1);
-  //fprintf(stdout,("a  = " + a);
-  //fprintf(stdout,("x[0] = " + x[0]);
-  //fprintf(stdout,(">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
-        
-         //if (timers_enabled)  timer_stop(3);
-   timer_stop(3,elapsed,start);
-
-/*       Compute Gaussian deviates by acceptance-rejection method and 
- *       tally counts in concentric square annuli.  This loop is not 
- *       vectorizable. 
- */
-         //if (timers_enabled) timer_start(2);
-    timer_start(&(start[2]));
-         for(i=1; i<=nk;i++) {
-            x1 = 2. * x[2*i-2] -1.0;
-            x2 = 2. * x[2*i-1] - 1.0;
-            t1 = x1*x1 + x2*x2;
-            if (t1 <= 1.) {
-               t2   = sqrt(-2. * log(t1) / t1);
-               t3   = (x1 * t2);
-               t4   = (x2 * t2);
-               l    = (int)(abs(t3) > abs(t4) ? abs(t3) : abs(t4));
-               q[l] = q[l] + 1.;
-               sx   = sx + t3;
-               sy   = sy + t4;
-             }
-    /*
-       if(i == 1) {
-                fprintf(stdout,"x1 = " + x1);
-                fprintf(stdout,"x2 = " + x2);
-                fprintf(stdout,"t1 = " + t1);
-                fprintf(stdout,"t2 = " + t2);
-                fprintf(stdout,"t3 = " + t3);
-                fprintf(stdout,"t4 = " + t4);
-                fprintf(stdout,"l = " + l);
-                fprintf(stdout,"q[l] = " + q[l]);
-                fprintf(stdout,"sx = " + sx);
-                fprintf(stdout,"sy = " + sy);
-       }
-    */
-           }
-         //if (timers_enabled)  timer_stop(2);
-    timer_stop(2,elapsed,start);
-      }
-
-    TRACE_smpi_set_category ("finalize");
-
-      //int MPI_Allreduce(void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)   
-  MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-  sx = x[0]; //FIXME :  x[0] or x[1] => x[0] because fortran starts with 1
-      MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-      sy = x[0];
-      MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-
-      for(i = 0; i < nq; i++) {
-    q[i] = x[i];
-  }
-  for(i = 0; i < nq; i++) {
-    gc += q[i];
-  }
-
-  timer_stop(1,elapsed,start);
-      tm = timer_read(1,elapsed);
-  MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-  tm = x[0];
-
-  if(node == root) {
-    nit = 0;
-    verified = true;
-
-    if(m == 24) {
-             sx_verify_value = -3.247834652034740E3;
-                  sy_verify_value = -6.958407078382297E3;
-    } else if(m == 25) {
-                sx_verify_value = -2.863319731645753E3;
-      sy_verify_value = -6.320053679109499E3;
-    } else if(m == 28) {
-            sx_verify_value = -4.295875165629892E3;
-      sy_verify_value = -1.580732573678431E4;
-    } else if(m == 30) {
-            sx_verify_value =  4.033815542441498E4;
-                  sy_verify_value = -2.660669192809235E4;
-    } else if(m == 32) {
-                  sx_verify_value =  4.764367927995374E4;
-                     sy_verify_value = -8.084072988043731E4;
-    } else if(m == 36) {
-            sx_verify_value =  1.982481200946593E5;
-            sy_verify_value = -1.020596636361769E5;
-    } else {
-      verified = false;
-    }
-
-    /*
-    fprintf(stdout,("sx        = " + sx);
-    fprintf(stdout,("sx_verify = " + sx_verify_value);
-    fprintf(stdout,("sy        = " + sy);
-    fprintf(stdout,("sy_verify = " + sy_verify_value);
-    */
-    if(verified) {
-      sx_err = abs((sx - sx_verify_value)/sx_verify_value);
-      sy_err = abs((sy - sy_verify_value)/sy_verify_value);
-      /*
-      fprintf(stdout,("sx_err = " + sx_err);
-      fprintf(stdout,("sy_err = " + sx_err);
-      fprintf(stdout,("epsilon= " + epsilon);
-      */
-      verified = ((sx_err < epsilon) && (sy_err < epsilon));
-    }
-
-    Mops = (pow(2.0, m+1))/tm/1000;
-
-    fprintf(stdout,"EP Benchmark Results:\n");
-    fprintf(stdout,"CPU Time=%d\n",(int) tm);
-    fprintf(stdout,"N = 2^%d\n",m);
-    fprintf(stdout,"No. Gaussain Pairs =%d\n",(int) gc);
-    fprintf(stdout,"Sum = %f %ld\n",sx,(long) sy);
-    fprintf(stdout,"Count:");
-    for(i = 0; i < nq; i++) {
-      fprintf(stdout,"%d\t %ld\n",i,(long) q[i]);
-    }
-
-    /*
-    print_results("EP", _class, m+1, 0, 0, nit, npm, no_nodes, tm, Mops,
-        "Random numbers generated", verified, npbversion,
-        compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) */
-    fprintf(stdout,"\nEP Benchmark Completed\n");
-            fprintf(stdout,"Class           = %s\n", _class);
-    fprintf(stdout,"Size            = %s\n", size);
-    fprintf(stdout,"Iteration       = %d\n", nit);
-    fprintf(stdout,"Time in seconds = %f\n",(tm/1000));
-    fprintf(stdout,"Total processes = %d\n",no_nodes);
-    fprintf(stdout,"Mops/s total    = %f\n",Mops);
-    fprintf(stdout,"Mops/s/process  = %f\n", Mops/no_nodes);
-    fprintf(stdout,"Operation type  = Random number generated\n");
-    if(verified) {
-      fprintf(stdout,"Verification    = SUCCESSFUL\n");
-    } else {
-      fprintf(stdout,"Verification    = UNSUCCESSFUL\n");
-    }
-             fprintf(stdout,"Total time:     %f\n",(timer_read(1,elapsed)/1000));
-             fprintf(stdout,"Gaussian pairs: %f\n",(timer_read(2,elapsed)/1000));
-             fprintf(stdout,"Random numbers: %f\n",(timer_read(3,elapsed)/1000));
-         }
-#ifdef USE_MPE
-    MPE_Finish_log(argv[0]);
-#endif
- 
-       MPI_Finalize();
-      }
-
-    int main(int argc, char **argv) {
-       doTest(argc,argv);
-    }
diff --git a/examples/smpi/NAS/IS/Makefile b/examples/smpi/NAS/IS/Makefile
deleted file mode 100644
index bfdc3edbb8..0000000000
--- a/examples/smpi/NAS/IS/Makefile
+++ /dev/null
@@ -1,17 +0,0 @@
-BENCHMARK=is
-
-include ../config/make.def
-include ../sys/make.common
-
-OBJS = is.o ${COMMON}/c_print_results.o
-
-${PROGRAM}: config ${OBJS} 
-	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${CMPI_LIB}
-
-.c.o: 
-	${CCOMPILE} $<
-
-is.o: is.c npbparams.h
-
-clean:
-	- rm -f *.o *~ is npbparams.h
diff --git a/examples/smpi/NAS/IS/is.c b/examples/smpi/NAS/IS/is.c
deleted file mode 100644
index 57b370bb39..0000000000
--- a/examples/smpi/NAS/IS/is.c
+++ /dev/null
@@ -1,1154 +0,0 @@
-/*************************************************************************
- *                                                                       * 
- *        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3       *
- *                                                                       * 
- *                                  I S                                  * 
- *                                                                       * 
- ************************************************************************* 
- *                                                                       * 
- *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
- *   It is described in NAS Technical Report 95-020.                     * 
- *                                                                       * 
- *   Permission to use, copy, distribute and modify this software        * 
- *   for any purpose with or without fee is hereby granted.  We          * 
- *   request, however, that all derived work reference the NAS           * 
- *   Parallel Benchmarks 3.3. This software is provided "as is"          *
- *   without express or implied warranty.                                * 
- *                                                                       * 
- *   Information on NPB 3.3, including the technical report, the         *
- *   original specifications, source code, results and information       * 
- *   on how to submit new results, is available at:                      * 
- *                                                                       * 
- *          http://www.nas.nasa.gov/Software/NPB                         * 
- *                                                                       * 
- *   Send comments or suggestions to  npb@nas.nasa.gov                   * 
- *   Send bug reports to              npb-bugs@nas.nasa.gov              * 
- *                                                                       * 
- *         NAS Parallel Benchmarks Group                                 * 
- *         NASA Ames Research Center                                     * 
- *         Mail Stop: T27A-1                                             * 
- *         Moffett Field, CA   94035-1000                                * 
- *                                                                       * 
- *         E-mail:  npb@nas.nasa.gov                                     * 
- *         Fax:     (650) 604-3957                                       * 
- *                                                                       * 
- ************************************************************************* 
- *                                                                       * 
- *   Author: M. Yarrow                                                   * 
- *           H. Jin                                                      * 
- *                                                                       * 
- *************************************************************************/
-
-#include "mpi.h"
-#include "npbparams.h"
-#include <stdlib.h>
-#include <stdio.h>
-
-#include "simgrid/instr.h" //TRACE_
-
-/******************/
-/* default values */
-/******************/
-#ifndef CLASS
-#define CLASS 'S'
-#define NUM_PROCS            1                 
-#endif
-#define MIN_PROCS            1
-
-
-/*************/
-/*  CLASS S  */
-/*************/
-#if CLASS == 'S'
-#define  TOTAL_KEYS_LOG_2    16
-#define  MAX_KEY_LOG_2       11
-#define  NUM_BUCKETS_LOG_2   9
-#endif
-
-
-/*************/
-/*  CLASS W  */
-/*************/
-#if CLASS == 'W'
-#define  TOTAL_KEYS_LOG_2    20
-#define  MAX_KEY_LOG_2       16
-#define  NUM_BUCKETS_LOG_2   10
-#endif
-
-/*************/
-/*  CLASS A  */
-/*************/
-#if CLASS == 'A'
-#define  TOTAL_KEYS_LOG_2    23
-#define  MAX_KEY_LOG_2       19
-#define  NUM_BUCKETS_LOG_2   10
-#endif
-
-
-/*************/
-/*  CLASS B  */
-/*************/
-#if CLASS == 'B'
-#define  TOTAL_KEYS_LOG_2    25
-#define  MAX_KEY_LOG_2       21
-#define  NUM_BUCKETS_LOG_2   10
-#endif
-
-
-/*************/
-/*  CLASS C  */
-/*************/
-#if CLASS == 'C'
-#define  TOTAL_KEYS_LOG_2    27
-#define  MAX_KEY_LOG_2       23
-#define  NUM_BUCKETS_LOG_2   10
-#endif
-
-
-/*************/
-/*  CLASS D  */
-/*************/
-#if CLASS == 'D'
-#define  TOTAL_KEYS_LOG_2    29
-#define  MAX_KEY_LOG_2       27
-#define  NUM_BUCKETS_LOG_2   10
-#undef   MIN_PROCS
-#define  MIN_PROCS           4
-#endif
-
-
-#define  TOTAL_KEYS          (1 << TOTAL_KEYS_LOG_2)
-#define  MAX_KEY             (1 << MAX_KEY_LOG_2)
-#define  NUM_BUCKETS         (1 << NUM_BUCKETS_LOG_2)
-#define  NUM_KEYS            (TOTAL_KEYS/NUM_PROCS*MIN_PROCS)
-
-/*****************************************************************/
-/* On larger number of processors, since the keys are (roughly)  */ 
-/* gaussian distributed, the first and last processor sort keys  */ 
-/* in a large interval, requiring array sizes to be larger. Note */
-/* that for large NUM_PROCS, NUM_KEYS is, however, a small number*/
-/* The required array size also depends on the bucket size used. */
-/* The following values are validated for the 1024-bucket setup. */
-/*****************************************************************/
-#if   NUM_PROCS < 256
-#define  SIZE_OF_BUFFERS     3*NUM_KEYS/2
-#elif NUM_PROCS < 512
-#define  SIZE_OF_BUFFERS     5*NUM_KEYS/2
-#elif NUM_PROCS < 1024
-#define  SIZE_OF_BUFFERS     4*NUM_KEYS
-#else
-#define  SIZE_OF_BUFFERS     13*NUM_KEYS/2
-#endif
-
-/*****************************************************************/
-/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */
-/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE     */
-/* MAX_PROCS AT YOUR PERIL                                       */
-/*****************************************************************/
-#if CLASS == 'S'
-#define  MAX_PROCS           128
-#else
-#define  MAX_PROCS           1024
-#endif
-
-#define  MAX_ITERATIONS      10
-#define  TEST_ARRAY_SIZE     5
-
-
-/***********************************/
-/* Enable separate communication,  */
-/* computation timing and printout */
-/***********************************/
-/* #define  TIMING_ENABLED         */
-
-
-/*************************************/
-/* Typedef: if necessary, change the */
-/* size of int here by changing the  */
-/* int type to, say, long            */
-/*************************************/
-typedef  int  INT_TYPE;
-typedef  long INT_TYPE2;
-#define MP_KEY_TYPE MPI_INT
-
-
-typedef struct {
-
-/********************/
-/* MPI properties:  */
-/********************/
-int      my_rank,
-         comm_size;
-
-
-/********************/
-/* Some global info */
-/********************/
-INT_TYPE *key_buff_ptr_global,         /* used by full_verify to get */
-         total_local_keys,             /* copies of rank info        */
-         total_lesser_keys;
-
-
-int      passed_verification;
-                                 
-
-
-/************************************/
-/* These are the three main arrays. */
-/* See SIZE_OF_BUFFERS def above    */
-/************************************/
-INT_TYPE key_array[SIZE_OF_BUFFERS],    
-         key_buff1[SIZE_OF_BUFFERS],    
-         key_buff2[SIZE_OF_BUFFERS],
-         bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE],     /* Top 5 elements for */
-         bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals */
-         bucket_ptrs[NUM_BUCKETS],
-         process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE],   
-         process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE];   
-int      send_count[MAX_PROCS], recv_count[MAX_PROCS],
-         send_displ[MAX_PROCS], recv_displ[MAX_PROCS];
-
-
-/**********************/
-/* Partial verif info */
-/**********************/
-INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
-         test_rank_array[TEST_ARRAY_SIZE];
-
-/**********/
-/* Timers */
-/**********/
-double start[64], elapsed[64];
-
-} global_data;
-
-
-const INT_TYPE2
-         S_test_index_array[TEST_ARRAY_SIZE] = 
-                             {48427,17148,23627,62548,4431},
-         S_test_rank_array[TEST_ARRAY_SIZE] = 
-                             {0,18,346,64917,65463},
-
-         W_test_index_array[TEST_ARRAY_SIZE] = 
-                             {357773,934767,875723,898999,404505},
-         W_test_rank_array[TEST_ARRAY_SIZE] = 
-                             {1249,11698,1039987,1043896,1048018},
-
-         A_test_index_array[TEST_ARRAY_SIZE] = 
-                             {2112377,662041,5336171,3642833,4250760},
-         A_test_rank_array[TEST_ARRAY_SIZE] = 
-                             {104,17523,123928,8288932,8388264},
-
-         B_test_index_array[TEST_ARRAY_SIZE] = 
-                             {41869,812306,5102857,18232239,26860214},
-         B_test_rank_array[TEST_ARRAY_SIZE] = 
-                             {33422937,10244,59149,33135281,99}, 
-
-         C_test_index_array[TEST_ARRAY_SIZE] = 
-                             {44172927,72999161,74326391,129606274,21736814},
-         C_test_rank_array[TEST_ARRAY_SIZE] = 
-                             {61147,882988,266290,133997595,133525895},
-
-         D_test_index_array[TEST_ARRAY_SIZE] = 
-                             {1317351170,995930646,1157283250,1503301535,1453734525},
-         D_test_rank_array[TEST_ARRAY_SIZE] = 
-                             {1,36538729,1978098519,2145192618,2147425337};
-
-
-
-/***********************/
-/* function prototypes */
-/***********************/
-double  randlc( double *X, double *A );
-
-void full_verify( global_data* gd );
-
-void c_print_results( char   *name,
-                      char   class,
-                      int    n1, 
-                      int    n2,
-                      int    n3,
-                      int    niter,
-                      int    nprocs_compiled,
-                      int    nprocs_total,
-                      double t,
-                      double mops,
-          char   *optype,
-                      int    passed_verification,
-                      char   *npbversion,
-                      char   *compiletime,
-                      char   *mpicc,
-                      char   *clink,
-                      char   *cmpi_lib,
-                      char   *cmpi_inc,
-                      char   *cflags,
-                      char   *clinkflags );
-
-void    timer_clear(global_data* gd, int n );
-void    timer_start(global_data* gd, int n );
-void    timer_stop(global_data* gd, int n );
-double  timer_read(global_data* gd, int n );
-
-void    timer_clear(global_data* gd, int n ) {
-   gd->elapsed[n] = 0.0;
-}
-
-void    timer_start(global_data* gd, int n ) {
-   gd->start[n] = MPI_Wtime();
-}
-
-void    timer_stop(global_data* gd, int n ) {
-   gd->elapsed[n] += MPI_Wtime() - gd->start[n];
-}
-
-double  timer_read(global_data* gd, int n ) {
-   return gd->elapsed[n];
-}
-
-
-/*
- *    FUNCTION RANDLC (X, A)
- *
- *  This routine returns a uniform pseudorandom double precision number in the
- *  range (0, 1) by using the linear congruential generator
- *
- *  x_{k+1} = a x_k  (mod 2^46)
- *
- *  where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
- *  before repeating.  The argument A is the same as 'a' in the above formula,
- *  and X is the same as x_0.  A and X must be odd double precision integers
- *  in the range (1, 2^46).  The returned value RANDLC is normalized to be
- *  between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
- *  the new seed x_1, so that subsequent calls to RANDLC using the same
- *  arguments will generate a continuous sequence.
- *
- *  This routine should produce the same results on any computer with at least
- *  48 mantissa bits in double precision floating point data.  On Cray systems,
- *  double precision should be disabled.
- *
- *  David H. Bailey     October 26, 1990
- *
- *     IMPLICIT DOUBLE PRECISION (A-H, O-Z)
- *     SAVE KS, R23, R46, T23, T46
- *     DATA KS/0/
- *
- *  If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
- *  T23 = 2 ^ 23, and T46 = 2 ^ 46.  These are computed in loops, rather than
- *  by merely using the ** operator, in order to insure that the results are
- *  exact on all systems.  This code assumes that 0.5D0 is represented exactly.
- */
-
-
-/*****************************************************************/
-/*************           R  A  N  D  L  C             ************/
-/*************                                        ************/
-/*************    portable random number generator    ************/
-/*****************************************************************/
-
-double  randlc( double *X, double *A )
-{
-      static int        KS=0;
-      static double  R23, R46, T23, T46;
-      double    T1, T2, T3, T4;
-      double    A1;
-      double    A2;
-      double    X1;
-      double    X2;
-      double    Z;
-      int         i, j;
-
-      if (KS == 0) 
-      {
-        R23 = 1.0;
-        R46 = 1.0;
-        T23 = 1.0;
-        T46 = 1.0;
-    
-        for (i=1; i<=23; i++)
-        {
-          R23 = 0.50 * R23;
-          T23 = 2.0 * T23;
-        }
-        for (i=1; i<=46; i++)
-        {
-          R46 = 0.50 * R46;
-          T46 = 2.0 * T46;
-        }
-        KS = 1;
-      }
-
-/*  Break A into two parts such that A = 2^23 * A1 + A2 and set X = N.  */
-
-      T1 = R23 * *A;
-      j  = T1;
-      A1 = j;
-      A2 = *A - T23 * A1;
-
-/*  Break X into two parts such that X = 2^23 * X1 + X2, compute
-    Z = A1 * X2 + A2 * X1  (mod 2^23), and then
-    X = 2^23 * Z + A2 * X2  (mod 2^46).                            */
-
-      T1 = R23 * *X;
-      j  = T1;
-      X1 = j;
-      X2 = *X - T23 * X1;
-      T1 = A1 * X2 + A2 * X1;
-      
-      j  = R23 * T1;
-      T2 = j;
-      Z = T1 - T23 * T2;
-      T3 = T23 * Z + A2 * X2;
-      j  = R46 * T3;
-      T4 = j;
-      *X = T3 - T46 * T4;
-      return(R46 * *X);
-} 
-
-
-
-/*****************************************************************/
-/************   F  I  N  D  _  M  Y  _  S  E  E  D    ************/
-/************                                         ************/
-/************ returns parallel random number seq seed ************/
-/*****************************************************************/
-
-/*
- * Create a random number sequence of total length nn residing
- * on np number of processors.  Each processor will therefore have a 
- * subsequence of length nn/np.  This routine returns that random 
- * number which is the first random number for the subsequence belonging
- * to processor rank kn, and which is used as seed for proc kn ran # gen.
- */
-
-double   find_my_seed( int  kn,       /* my processor rank, 0<=kn<=num procs */
-                       int  np,       /* np = num procs                      */
-                       long nn,       /* total num of ran numbers, all procs */
-                       double s,      /* Ran num seed, for ex.: 314159265.00 */
-                       double a )     /* Ran num gen mult, try 1220703125.00 */
-{
-
-  long   i;
-
-  double t1,t2,t3,an;
-  long   mq,nq,kk,ik;
-
-
-
-      nq = nn / np;
-
-      for( mq=0; nq>1; mq++,nq/=2 )
-          ;
-
-      t1 = a;
-
-      for( i=1; i<=mq; i++ )
-        t2 = randlc( &t1, &t1 );
-
-      an = t1;
-
-      kk = kn;
-      t1 = s;
-      t2 = an;
-
-      for( i=1; i<=100; i++ )
-      {
-        ik = kk / 2;
-        if( 2 * ik !=  kk ) 
-            t3 = randlc( &t1, &t2 );
-        if( ik == 0 ) 
-            break;
-        t3 = randlc( &t2, &t2 );
-        kk = ik;
-      }
-
-      return( t1 );
-
-}
-
-
-
-
-/*****************************************************************/
-/*************      C  R  E  A  T  E  _  S  E  Q      ************/
-/*****************************************************************/
-
-void  create_seq( global_data* gd, double seed, double a )
-{
-  double x;
-  int    i, k;
-
-        k = MAX_KEY/4;
-
-  for (i=0; i<NUM_KEYS; i++)
-  {
-      x = randlc(&seed, &a);
-      x += randlc(&seed, &a);
-          x += randlc(&seed, &a);
-      x += randlc(&seed, &a);  
-
-            gd->key_array[i] = k*x;
-  }
-}
-
-
-
-
-/*****************************************************************/
-/*************    F  U  L  L  _  V  E  R  I  F  Y     ************/
-/*****************************************************************/
-
-
-void full_verify( global_data* gd )
-{
-    MPI_Status  status;
-    MPI_Request request;
-    
-    INT_TYPE    i, j;
-    INT_TYPE    k, last_local_key;
-
-    
-/*  Now, finally, sort the keys:  */
-    for( i=0; i<gd->total_local_keys; i++ )
-        gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]-
-                                 gd->total_lesser_keys] = gd->key_buff2[i];
-    last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);
-
-/*  Send largest key value to next processor  */
-    if( gd->my_rank > 0 )
-        MPI_Irecv( &k,
-                   1,
-                   MP_KEY_TYPE,
-                   gd->my_rank-1,
-                   1000,
-                   MPI_COMM_WORLD,
-                   &request );                   
-    if( gd->my_rank < gd->comm_size-1 )
-        MPI_Send( &gd->key_array[last_local_key],
-                  1,
-                  MP_KEY_TYPE,
-                  gd->my_rank+1,
-                  1000,
-                  MPI_COMM_WORLD );
-    if( gd->my_rank > 0 )
-        MPI_Wait( &request, &status );
-
-/*  Confirm that neighbor's greatest key value 
-    is not greater than my least key value       */              
-    j = 0;
-    if( gd->my_rank > 0 && gd->total_local_keys > 0 )
-        if( k > gd->key_array[0] )
-            j++;
-
-
-/*  Confirm keys correctly sorted: count incorrectly sorted keys, if any */
-    for( i=1; i<gd->total_local_keys; i++ )
-        if( gd->key_array[i-1] > gd->key_array[i] )
-            j++;
-
-
-    if( j != 0 )
-    {
-        printf( "Processor %d:  Full_verify: number of keys out of sort: %d\n",
-                gd->my_rank, j );
-    }
-    else
-        gd->passed_verification++;
-           
-
-}
-
-
-
-
-/*****************************************************************/
-/*************             R  A  N  K             ****************/
-/*****************************************************************/
-
-
-void rank( global_data* gd, int iteration )
-{
-
-    INT_TYPE    i, k;
-
-    INT_TYPE    shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
-    INT_TYPE    key;
-    INT_TYPE2   bucket_sum_accumulator, j, m;
-    INT_TYPE    local_bucket_sum_accumulator;
-    INT_TYPE    min_key_val, max_key_val;
-    INT_TYPE    *key_buff_ptr;
-
-
-
-
-/*  Iteration alteration of keys */  
-    if(gd->my_rank == 0 )                    
-    {
-      gd->key_array[iteration] = iteration;
-      gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
-    }
-
-
-/*  Initialize */
-    for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )  
-    {
-        gd->bucket_size[i] = 0;
-        gd->bucket_size_totals[i] = 0;
-        gd->process_bucket_distrib_ptr1[i] = 0;
-        gd->process_bucket_distrib_ptr2[i] = 0;
-    }
-
-
-/*  Determine where the partial verify test keys are, load into  */
-/*  top of array bucket_size                                     */
-    for( i=0; i<TEST_ARRAY_SIZE; i++ )
-        if( (gd->test_index_array[i]/NUM_KEYS) == gd->my_rank )
-            gd->bucket_size[NUM_BUCKETS+i] = 
-                          gd->key_array[gd->test_index_array[i] % NUM_KEYS];
-
-
-/*  Determine the number of keys in each bucket */
-    for( i=0; i<NUM_KEYS; i++ )
-        gd->bucket_size[gd->key_array[i] >> shift]++;
-
-
-/*  Accumulative bucket sizes are the bucket pointers */
-    gd->bucket_ptrs[0] = 0;
-    for( i=1; i< NUM_BUCKETS; i++ )  
-        gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];
-
-
-/*  Sort into appropriate bucket */
-    for( i=0; i<NUM_KEYS; i++ )  
-    {
-        key = gd->key_array[i];
-        gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
-    }
-
-#ifdef  TIMING_ENABLED
-    timer_stop(gd, 2 );
-    timer_start(gd, 3 );
-#endif
-
-/*  Get the bucket size totals for the entire problem. These 
-    will be used to determine the redistribution of keys      */
-    MPI_Allreduce( gd->bucket_size, 
-                   gd->bucket_size_totals, 
-                   NUM_BUCKETS+TEST_ARRAY_SIZE, 
-                   MP_KEY_TYPE,
-                   MPI_SUM,
-                   MPI_COMM_WORLD );
-
-#ifdef  TIMING_ENABLED
-    timer_stop(gd, 3 );
-    timer_start(gd, 2 );
-#endif
-
-/*  Determine Redistibution of keys: accumulate the bucket size totals 
-    till this number surpasses NUM_KEYS (which the average number of keys
-    per processor).  Then all keys in these buckets go to processor 0.
-    Continue accumulating again until supassing 2*NUM_KEYS. All keys
-    in these buckets go to processor 1, etc.  This algorithm guarantees
-    that all processors have work ranking; no processors are left idle.
-    The optimum number of buckets, however, does not result in as high
-    a degree of load balancing (as even a distribution of keys as is
-    possible) as is obtained from increasing the number of buckets, but
-    more buckets results in more computation per processor so that the
-    optimum number of buckets turns out to be 1024 for machines tested.
-    Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket
-    number of first and last bucket which each processor will have after   
-    the redistribution is done.                                          */
-
-    bucket_sum_accumulator = 0;
-    local_bucket_sum_accumulator = 0;
-    gd->send_displ[0] = 0;
-    gd->process_bucket_distrib_ptr1[0] = 0;
-    for( i=0, j=0; i<NUM_BUCKETS; i++ )  
-    {
-        bucket_sum_accumulator       += gd->bucket_size_totals[i];
-        local_bucket_sum_accumulator += gd->bucket_size[i];
-        if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )  
-        {
-            gd->send_count[j] = local_bucket_sum_accumulator;
-            if( j != 0 )
-            {
-                gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
-                gd->process_bucket_distrib_ptr1[j] = 
-                                        gd->process_bucket_distrib_ptr2[j-1]+1;
-            }
-            gd->process_bucket_distrib_ptr2[j++] = i;
-            local_bucket_sum_accumulator = 0;
-        }
-    }
-
-/*  When NUM_PROCS approaching NUM_BUCKETS, it is highly possible
-    that the last few processors don't get any buckets.  So, we
-    need to set counts properly in this case to avoid any fallouts.    */
-    while( j < gd->comm_size )
-    {
-        gd->send_count[j] = 0;
-        gd->process_bucket_distrib_ptr1[j] = 1;
-        j++;
-    }
-
-#ifdef  TIMING_ENABLED
-    timer_stop(gd, 2 );
-    timer_start(gd, 3 ); 
-#endif
-
-/*  This is the redistribution section:  first find out how many keys
-    each processor will send to every other processor:                 */
-    MPI_Alltoall( gd->send_count,
-                  1,
-                  MPI_INT,
-                  gd->recv_count,
-                  1,
-                  MPI_INT,
-                  MPI_COMM_WORLD );
-
-/*  Determine the receive array displacements for the buckets */    
-    gd->recv_displ[0] = 0;
-    for( i=1; i<gd->comm_size; i++ )
-        gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];
-
-
-/*  Now send the keys to respective processors  */    
-    MPI_Alltoallv( gd->key_buff1,
-                   gd->send_count,
-                   gd->send_displ,
-                   MP_KEY_TYPE,
-                   gd->key_buff2,
-                   gd->recv_count,
-                   gd->recv_displ,
-                   MP_KEY_TYPE,
-                   MPI_COMM_WORLD );
-
-#ifdef  TIMING_ENABLED
-    timer_stop(gd, 3 ); 
-    timer_start(gd, 2 );
-#endif
-
-/*  The starting and ending bucket numbers on each processor are
-    multiplied by the interval size of the buckets to obtain the 
-    smallest possible min and greatest possible max value of any 
-    key on each processor                                          */
-    min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
-    max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;
-
-/*  Clear the work array */
-    for( i=0; i<max_key_val-min_key_val+1; i++ )
-        gd->key_buff1[i] = 0;
-
-/*  Determine the total number of keys on all other 
-    processors holding keys of lesser value         */
-    m = 0;
-    for( k=0; k<gd->my_rank; k++ )
-        for( i= gd->process_bucket_distrib_ptr1[k];
-             i<=gd->process_bucket_distrib_ptr2[k];
-             i++ )  
-            m += gd->bucket_size_totals[i]; /*  m has total # of lesser keys */
-
-/*  Determine total number of keys on this processor */
-    j = 0;                                 
-    for( i= gd->process_bucket_distrib_ptr1[gd->my_rank];
-         i<=gd->process_bucket_distrib_ptr2[gd->my_rank];
-         i++ )  
-        j += gd->bucket_size_totals[i];     /* j has total # of local keys   */
-
-
-/*  Ranking of all keys occurs in this section:                 */
-/*  shift it backwards so no subtractions are necessary in loop */
-    key_buff_ptr = gd->key_buff1 - min_key_val;
-
-/*  In this section, the keys themselves are used as their 
-    own indexes to determine how many of each there are: their
-    individual population                                       */
-    for( i=0; i<j; i++ )
-        key_buff_ptr[gd->key_buff2[i]]++;  /* Now they have individual key   */
-                                       /* population                     */
-
-/*  To obtain ranks of each key, successively add the individual key
-    population, not forgetting the total of lesser keys, m.
-    NOTE: Since the total of lesser keys would be subtracted later 
-    in verification, it is no longer added to the first key population 
-    here, but still needed during the partial verify test.  This is to 
-    ensure that 32-bit key_buff can still be used for class D.           */
-/*    key_buff_ptr[min_key_val] += m;    */
-    for( i=min_key_val; i<max_key_val; i++ )   
-        key_buff_ptr[i+1] += key_buff_ptr[i];  
-
-
-/* This is the partial verify test section */
-/* Observe that test_rank_array vals are   */
-/* shifted differently for different cases */
-    for( i=0; i<TEST_ARRAY_SIZE; i++ )
-    {                                             
-        k = gd->bucket_size_totals[i+NUM_BUCKETS];    /* Keys were hidden here */
-        if( min_key_val <= k  &&  k <= max_key_val )
-        {
-            /* Add the total of lesser keys, m, here */
-            INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
-            int failed = 0;
-
-            switch( CLASS )
-            {
-                case 'S':
-                    if( i <= 2 )
-                    {
-                        if( key_rank != gd->test_rank_array[i]+iteration )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-                    }
-                    else
-                    {
-                        if( key_rank != gd->test_rank_array[i]-iteration )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-                    }
-                    break;
-                case 'W':
-                    if( i < 2 )
-                    {
-                        if( key_rank != gd->test_rank_array[i]+(iteration-2) )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-                    }
-                    else
-                    {
-                        if( key_rank != gd->test_rank_array[i]-iteration )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-                    }
-                    break;
-                case 'A':
-                    if( i <= 2 )
-              {
-                        if( key_rank != gd->test_rank_array[i]+(iteration-1) )
-                            failed = 1;
-                        else
-                          gd->passed_verification++;
-              }
-                    else
-                    {
-                        if( key_rank !=  gd->test_rank_array[i]-(iteration-1) )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-                    }
-                    break;
-                case 'B':
-                    if( i == 1 || i == 2 || i == 4 )
-              {
-                        if( key_rank != gd->test_rank_array[i]+iteration )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-              }
-                    else
-                    {
-                        if( key_rank != gd->test_rank_array[i]-iteration )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-                    }
-                    break;
-                case 'C':
-                    if( i <= 2 )
-              {
-                        if( key_rank != gd->test_rank_array[i]+iteration )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-              }
-                    else
-                    {
-                        if( key_rank != gd->test_rank_array[i]-iteration )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-                    }
-                    break;
-                case 'D':
-                    if( i < 2 )
-              {
-                        if( key_rank != gd->test_rank_array[i]+iteration )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-              }
-                    else
-                    {
-                        if( key_rank != gd->test_rank_array[i]-iteration )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-                    }
-                    break;
-            }
-            if( failed == 1 )
-                printf( "Failed partial verification: "
-                        "iteration %d, processor %d, test key %d\n", 
-                         iteration, gd->my_rank, (int)i );
-        }
-    }
-
-
-
-
-/*  Make copies of rank info for use by full_verify: these variables
-    in rank are local; making them global slows down the code, probably
-    since they cannot be made register by compiler                        */
-
-    if( iteration == MAX_ITERATIONS ) 
-    {
-        gd->key_buff_ptr_global = key_buff_ptr;
-        gd->total_local_keys    = j;
-        gd->total_lesser_keys   = 0;  /* no longer set to 'm', see note above */
-    }
-
-}      
-
-
-/*****************************************************************/
-/*************             M  A  I  N             ****************/
-/*****************************************************************/
-
-int main( int argc, char **argv )
-{
-
-    int             i, iteration, itemp;
-
-    double          timecounter, maxtime;
-
-    global_data* gd = malloc(sizeof(global_data));
-/*  Initialize MPI */
-    MPI_Init( &argc, &argv );
-    MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
-    MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );
-
-/*  Initialize the verification arrays if a valid class */
-    for( i=0; i<TEST_ARRAY_SIZE; i++ )
-        switch( CLASS )
-        {
-            case 'S':
-                gd->test_index_array[i] = S_test_index_array[i];
-                gd->test_rank_array[i]  = S_test_rank_array[i];
-                break;
-            case 'A':
-                gd->test_index_array[i] = A_test_index_array[i];
-                gd->test_rank_array[i]  = A_test_rank_array[i];
-                break;
-            case 'W':
-                gd->test_index_array[i] = W_test_index_array[i];
-                gd->test_rank_array[i]  = W_test_rank_array[i];
-                break;
-            case 'B':
-                gd->test_index_array[i] = B_test_index_array[i];
-                gd->test_rank_array[i]  = B_test_rank_array[i];
-                break;
-            case 'C':
-                gd->test_index_array[i] = C_test_index_array[i];
-                gd->test_rank_array[i]  = C_test_rank_array[i];
-                break;
-            case 'D':
-                gd->test_index_array[i] = D_test_index_array[i];
-                gd->test_rank_array[i]  = D_test_rank_array[i];
-                break;
-        };
-
-        
-
-/*  Printout initial NPB info */
-    if( gd->my_rank == 0 )
-    {
-        printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
-        printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS );
-        printf( " Iterations:   %d\n", MAX_ITERATIONS );
-        printf( " Number of processes:     %d\n",gd->comm_size );
-    }
-
-/*  Check that actual and compiled number of processors agree */
-    if( gd->comm_size != NUM_PROCS )
-    {
-        if( gd->my_rank == 0 )
-            printf( "\n ERROR: compiled for %d processes\n"
-                    " Number of active processes: %d\n"
-                    " Exiting program!\n\n", NUM_PROCS, gd->comm_size );
-        MPI_Finalize();
-        exit( 1 );
-    }
-
-/*  Check to see whether total number of processes is within bounds.
-    This could in principle be checked in setparams.c, but it is more
-    convenient to do it here                                               */
-    if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS)
-    {
-       if( gd->my_rank == 0 )
-           printf( "\n ERROR: number of processes %d not within range %d-%d"
-                   "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS);
-       MPI_Finalize();
-       exit( 1 );
-    }
-
-
-/*  Generate random number sequence and subsequent keys on all procs */
-    create_seq(gd,  find_my_seed( gd->my_rank, 
-                              gd->comm_size, 
-                              4*(long)TOTAL_KEYS*MIN_PROCS,
-                              314159265.00,      /* Random number gen seed */
-                              1220703125.00 ),   /* Random number gen mult */
-                1220703125.00 );                 /* Random number gen mult */
-
-/*  Do one interation for free (i.e., untimed) to guarantee initialization of  
-    all data and code pages and respective tables */
-    rank(gd, 1 );  
-
-/*  Start verification counter */
-    gd->passed_verification = 0;
-
-    if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n   iteration\n" );
-
-/*  Initialize timer  */             
-    timer_clear(gd, 0 );
-
-/*  Initialize separate communication, computation timing */
-#ifdef  TIMING_ENABLED 
-    for( i=1; i<=3; i++ ) timer_clear(gd, i );
-#endif
-
-/*  Start timer  */             
-    timer_start(gd, 0 );
-
-#ifdef  TIMING_ENABLED
-    timer_start(gd, 1 );
-    timer_start(gd, 2 );
-#endif
-
-    char smpi_category[100];
-    snprintf (smpi_category, 100, "%d", gd->my_rank);
-    TRACE_smpi_set_category (smpi_category);
-
-/*  This is the main iteration */
-    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
-    {
-        if( gd->my_rank == 0 && CLASS != 'S' ) printf( "        %d\n", iteration );
-        rank(gd,  iteration );
-    }
-    TRACE_smpi_set_category (NULL);
-
-#ifdef  TIMING_ENABLED
-    timer_stop(gd, 2 );
-    timer_stop(gd, 1 );
-#endif
-
-/*  Stop timer, obtain time for processors */
-    timer_stop(gd, 0 );
-
-    timecounter = timer_read(gd, 0 );
-
-/*  End of timing, obtain maximum time of all processors */
-    MPI_Reduce( &timecounter,
-                &maxtime,
-                1,
-                MPI_DOUBLE,
-                MPI_MAX,
-                0,
-                MPI_COMM_WORLD );
-
-#ifdef  TIMING_ENABLED
-    {
-        double    tmin, tsum, tmax;
-    
-        if( my_rank == 0 )
-        {
-            printf( "\ntimer 1/2/3 = total/computation/communication time\n");
-            printf( "              min                avg                max\n" );
-        }
-        for( i=1; i<=3; i++ )
-        {
-            timecounter = timer_read(gd, i );
-            MPI_Reduce( &timecounter,
-                        &tmin,
-                        1,
-                        MPI_DOUBLE,
-                        MPI_MIN,
-                        0,
-                        MPI_COMM_WORLD );
-            MPI_Reduce( &timecounter,
-                        &tsum,
-                        1,
-                        MPI_DOUBLE,
-                        MPI_SUM,
-                        0,
-                        MPI_COMM_WORLD );
-            MPI_Reduce( &timecounter,
-                        &tmax,
-                        1,
-                        MPI_DOUBLE,
-                        MPI_MAX,
-                        0,
-                        MPI_COMM_WORLD );
-            if( my_rank == 0 )
-                printf( "timer %d:    %f           %f            %f\n",
-                        i, tmin, tsum/((double) comm_size), tmax );
-        }
-        if( my_rank == 0 )
-            printf( "\n" );
-    }
-#endif
-
-/*  This tests that keys are in sequence: sorting of last ranked key seq
-    occurs here, but is an untimed operation                             */
-    full_verify(gd);
-
-
-/*  Obtain verification counter sum */
-    itemp =gd->passed_verification;
-    MPI_Reduce( &itemp,
-                &gd->passed_verification,
-                1,
-                MPI_INT,
-                MPI_SUM,
-                0,
-                MPI_COMM_WORLD );
-
-
-
-/*  The final printout  */
-    if( gd->my_rank == 0 )
-    {
-        if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
-            gd->passed_verification = 0;
-        c_print_results( "IS",
-                         CLASS,
-                         (int)(TOTAL_KEYS),
-                         MIN_PROCS,
-                         0,
-                         MAX_ITERATIONS,
-                         NUM_PROCS,
-                         gd->comm_size,
-                         maxtime,
-                         ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS)
-                                                      /maxtime/1000000.,
-                         "keys ranked", 
-                         gd->passed_verification,
-                         NPBVERSION,
-                         COMPILETIME,
-                         MPICC,
-                         CLINK,
-                         CMPI_LIB,
-                         CMPI_INC,
-                         CFLAGS,
-                         CLINKFLAGS );
-    }
-                    
-    MPI_Finalize();
-    free(gd);
-
-    return 0;
-         /**************************/
-}        /*  E N D  P R O G R A M  */
-         /**************************/
diff --git a/examples/smpi/NAS/Makefile b/examples/smpi/NAS/Makefile
deleted file mode 100644
index a15725403d..0000000000
--- a/examples/smpi/NAS/Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-SHELL=/bin/sh
-CLASS=S
-NPROCS=1
-default:
-	@ sys/print_instructions
-
-IS: is
-is: 
-	cd IS; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
-
-EP: ep
-ep:
-	cd EP; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
-
-DT: dt
-dt: 
-	cd DT; $(MAKE) CLASS=$(CLASS)
-
-clean:
-	- rm -f *~  */*~ */*.o */npbparams.h 
-	- rm -f sys/setparams sys/setparams.h
-
-veryclean: clean
-	- rm -f bin/*
\ No newline at end of file
diff --git a/examples/smpi/NAS/common/c_print_results.c b/examples/smpi/NAS/common/c_print_results.c
deleted file mode 100644
index 942c37075a..0000000000
--- a/examples/smpi/NAS/common/c_print_results.c
+++ /dev/null
@@ -1,53 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-
-void c_print_results(char *name, char class, int n1, int n2, int n3, int niter, int nprocs_compiled, int nprocs_total,
-                     double t, double mops, char *optype, int passed_verification, char *npbversion, char *compiletime,
-                     char *mpicc, char *clink, char *cmpi_lib, char *cmpi_inc, char *cflags, char *clinkflags)
-{
-    printf( "\n\n %s Benchmark Completed\n", name ); 
-    printf( " Class           =                        %c\n", class );
-
-    if( n3 == 0 ) {
-        long nn = n1;
-        if ( n2 != 0 ) nn *= n2;
-        printf( " Size            =             %12ld\n", nn );   /* as in IS */
-    }
-    else
-        printf( " Size            =              %3dx %3dx %3d\n", n1,n2,n3 );
-
-    printf( " Iterations      =             %12d\n", niter );
-    printf( " Time in seconds =             %12.2f\n", t );
-    printf( " Total processes =             %12d\n", nprocs_total );
-
-    if ( nprocs_compiled != 0 )
-        printf( " Compiled procs  =             %12d\n", nprocs_compiled );
-
-    printf( " Mop/s total     =             %12.2f\n", mops );
-    printf( " Mop/s/process   =             %12.2f\n", mops/((float) nprocs_total) );
-    printf( " Operation type  = %24s\n", optype);
-
-    if( passed_verification )
-        printf( " Verification    =               SUCCESSFUL\n" );
-    else
-        printf( " Verification    =             UNSUCCESSFUL\n" );
-
-    printf( " Version         =             %12s\n", npbversion );
-    printf( " Compile date    =             %12s\n", compiletime );
-    printf( "\n Compile options:\n" );
-    printf( "    MPICC        = %s\n", mpicc );
-    printf( "    CLINK        = %s\n", clink );
-    printf( "    CMPI_LIB     = %s\n", cmpi_lib );
-    printf( "    CMPI_INC     = %s\n", cmpi_inc );
-    printf( "    CFLAGS       = %s\n", cflags );
-    printf( "    CLINKFLAGS   = %s\n", clinkflags );
-    printf( "\n\n" );
-    printf( " Please send the results of this run to:\n\n" );
-    printf( " NPB Development Team\n" );
-    printf( " Internet: npb@nas.nasa.gov\n \n" );
-    printf( " If email is not available, send this to:\n\n" );
-    printf( " MS T27A-1\n" );
-    printf( " NASA Ames Research Center\n" );
-    printf( " Moffett Field, CA  94035-1000\n\n" );
-    printf( " Fax: 650-604-3957\n\n" );
-}
diff --git a/examples/smpi/NAS/common/c_timers.c b/examples/smpi/NAS/common/c_timers.c
deleted file mode 100644
index a3af153d29..0000000000
--- a/examples/smpi/NAS/common/c_timers.c
+++ /dev/null
@@ -1,27 +0,0 @@
-#include "mpi.h"
-
-double start[64], elapsed[64];
-
-void timer_clear( int n )
-{
-    elapsed[n] = 0.0;
-}
-
-void timer_start( int n )
-{
-    start[n] = MPI_Wtime();
-}
-
-void timer_stop( int n )
-{
-    double t, now;
-    now = MPI_Wtime();
-    t = now - start[n];
-    elapsed[n] += t;
-}
-
-double timer_read( int n )
-{
-    return( elapsed[n] );
-}
-
diff --git a/examples/smpi/NAS/common/randdp.c b/examples/smpi/NAS/common/randdp.c
deleted file mode 100644
index 554d6b68de..0000000000
--- a/examples/smpi/NAS/common/randdp.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- *    FUNCTION RANDLC (X, A)
- *
- *  This routine returns a uniform pseudorandom double precision number in the
- *  range (0, 1) by using the linear congruential generator
- *
- *  x_{k+1} = a x_k  (mod 2^46)
- *
- *  where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
- *  before repeating.  The argument A is the same as 'a' in the above formula,
- *  and X is the same as x_0.  A and X must be odd double precision integers
- *  in the range (1, 2^46).  The returned value RANDLC is normalized to be
- *  between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
- *  the new seed x_1, so that subsequent calls to RANDLC using the same
- *  arguments will generate a continuous sequence.
- *
- *  This routine should produce the same results on any computer with at least
- *  48 mantissa bits in double precision floating point data.  On Cray systems,
- *  double precision should be disabled.
- *
- *  David H. Bailey     October 26, 1990
- *
- *     IMPLICIT DOUBLE PRECISION (A-H, O-Z)
- *     SAVE KS, R23, R46, T23, T46
- *     DATA KS/0/
- *
- *  If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
- *  T23 = 2 ^ 23, and T46 = 2 ^ 46.  These are computed in loops, rather than
- *  by merely using the ** operator, in order to insure that the results are
- *  exact on all systems.  This code assumes that 0.5D0 is represented exactly.
- */
-double  randlc(double *X, double*A)
-{
-  static int        KS=0;
-  static double  R23, R46, T23, T46;
-  double    T1, T2, T3, T4;
-  double    A1, A2;
-  double    X1, X2;
-  double    Z;
-  int       i, j;
-
-  if (KS == 0) {
-    R23 = 1.0;
-    R46 = 1.0;
-    T23 = 1.0;
-    T46 = 1.0;
-
-    for (i=1; i<=23; i++) {
-      R23 = 0.50 * R23;
-      T23 = 2.0 * T23;
-    }
-    for (i=1; i<=46; i++) {
-      R46 = 0.50 * R46;
-      T46 = 2.0 * T46;
-    }
-    KS = 1;
-  }
-
-/*  Break A into two parts such that A = 2^23 * A1 + A2 and set X = N.  */
-  T1 = R23 * *A;
-  j  = T1;
-  A1 = j;
-  A2 = *A - T23 * A1;
-
-/*  Break X into two parts such that X = 2^23 * X1 + X2, compute
-    Z = A1 * X2 + A2 * X1  (mod 2^23), and then X = 2^23 * Z + A2 * X2  (mod 2^46). */
-  T1 = R23 * *X;
-  j  = T1;
-  X1 = j;
-  X2 = *X - T23 * X1;
-  T1 = A1 * X2 + A2 * X1;
-
-  j  = R23 * T1;
-  T2 = j;
-  Z = T1 - T23 * T2;
-  T3 = T23 * Z + A2 * X2;
-  j  = R46 * T3;
-  T4 = j;
-  *X = T3 - T46 * T4;
-  return(R46 * *X);
-}
diff --git a/examples/smpi/NAS/config/make.def b/examples/smpi/NAS/config/make.def
deleted file mode 100644
index ad8f454a91..0000000000
--- a/examples/smpi/NAS/config/make.def
+++ /dev/null
@@ -1,73 +0,0 @@
-#---------------------------------------------------------------------------
-#
-#                SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS. 
-#
-#---------------------------------------------------------------------------
-
-#---------------------------------------------------------------------------
-# Items in this file will need to be changed for each platform.
-#---------------------------------------------------------------------------
-
-#---------------------------------------------------------------------------
-# Parallel C:
-#
-# MPICC      - C compiler 
-# CFLAGS     - C compilation arguments
-# CMPI_INC   - any -I arguments required for compiling MPI/C 
-# CLINK      - C linker
-# CLINKFLAGS - C linker flags
-# CMPI_LIB   - any -L and -l arguments required for linking MPI/C 
-#
-# compilations are done with $(MPICC) $(CMPI_INC) $(CFLAGS) or
-#                            $(MPICC) $(CFLAGS)
-# linking is done with       $(CLINK) $(CMPI_LIB) $(CLINKFLAGS)
-#---------------------------------------------------------------------------
-
-#---------------------------------------------------------------------------
-# This is the C compiler used for MPI programs
-#---------------------------------------------------------------------------
-MPICC = smpicc
-# This links MPI C programs; usually the same as ${MPICC}
-CLINK = $(MPICC)
-
-#---------------------------------------------------------------------------
-# These macros are passed to the linker to help link with MPI correctly
-#---------------------------------------------------------------------------
-CMPI_LIB  =
-
-#---------------------------------------------------------------------------
-# These macros are passed to the compiler to help find 'mpi.h'
-#---------------------------------------------------------------------------
-CMPI_INC =
-
-#---------------------------------------------------------------------------
-# Global *compile time* flags for C programs
-#---------------------------------------------------------------------------
-CFLAGS = -O2
-
-#---------------------------------------------------------------------------
-# Global *link time* flags. Flags for increasing maximum executable 
-# size usually go here. 
-#---------------------------------------------------------------------------
-CLINKFLAGS = -O2
-
-#---------------------------------------------------------------------------
-# Utilities C:
-#
-# This is the C compiler used to compile C utilities.  Flags required by 
-# this compiler go here also; typically there are few flags required; hence 
-# there are no separate macros provided for such flags.
-#---------------------------------------------------------------------------
-CC	= gcc -g
-
-#---------------------------------------------------------------------------
-# Destination of executables, relative to subdirs of the main directory. . 
-#---------------------------------------------------------------------------
-BINDIR	= ../bin
-
-#---------------------------------------------------------------------------
-# The variable RAND controls which random number generator 
-# is used. It is described in detail in README.install. 
-# Use "randi8" unless there is a reason to use another one. 
-#---------------------------------------------------------------------------
-RAND   = randi8
diff --git a/examples/smpi/NAS/DT/dt.c b/examples/smpi/NAS/dt.c
similarity index 82%
rename from examples/smpi/NAS/DT/dt.c
rename to examples/smpi/NAS/dt.c
index 1a4931704a..ca3a18a942 100644
--- a/examples/smpi/NAS/DT/dt.c
+++ b/examples/smpi/NAS/dt.c
@@ -41,53 +41,26 @@
 #include <stdio.h>
 #include <string.h>
 
-#include "mpi.h"
-#include "npbparams.h"
-
+#include "smpi/mpi.h"
+#include "nas_common.h"
 #include "simgrid/instr.h" //TRACE_
 
-#ifndef CLASS
-#define CLASS 'S'
-#define NUM_PROCS            1                 
-#endif
-
-//int      passed_verification;
-extern double randlc( double *X, double *A );
-extern
-void c_print_results( char   *name,
-                      char   class,
-                      int    n1, 
-                      int    n2,
-                      int    n3,
-                      int    niter,
-                      int    nprocs_compiled,
-                      int    nprocs_total,
-                      double t,
-                      double mops,
-          char   *optype,
-                      int    passed_verification,
-                      char   *npbversion,
-                      char   *compiletime,
-                      char   *mpicc,
-                      char   *clink,
-                      char   *cmpi_lib,
-                      char   *cmpi_inc,
-                      char   *cflags,
-                      char   *clinkflags );
-          
-void    timer_clear( int n );
-void    timer_start( int n );
-void    timer_stop( int n );
-double  timer_read( int n );
+
 int timer_on=0,timers_tot=64;
+double start[64], elapsed[64];
 
-int verify(char *bmname,double rnm2){
+char class;
+int nprocs;
+int num_samples;
+int deviation;
+int num_sources;
+
+static int verify(char *bmname,double rnm2){
     double verify_value=0.0;
     double epsilon=1.0E-8;
-    char cls=CLASS;
     int verified=-1;
-    if (cls != 'U') {
-       if(cls=='S') {
+    if (class != 'U') {
+       if(class=='S') {
          if(strstr(bmname,"BH")){
            verify_value=30892725.0;
          }else if(strstr(bmname,"WH")){
@@ -98,18 +71,18 @@ int verify(char *bmname,double rnm2){
            fprintf(stderr,"No such benchmark as %s.\n",bmname);
          }
          verified = 0;
-       }else if(cls=='W') {
+       }else if(class=='W') {
          if(strstr(bmname,"BH")){
        verify_value = 4102461.0;
          }else if(strstr(bmname,"WH")){
-       verify_value = 204280762.0;
+        	 verify_value = 204280762.0;
          }else if(strstr(bmname,"SH")){
        verify_value = 186944764.0;
          }else{
            fprintf(stderr,"No such benchmark as %s.\n",bmname);
          }
          verified = 0;
-       }else if(cls=='A') {
+       }else if(class=='A') {
          if(strstr(bmname,"BH")){
        verify_value = 17809491.0;
          }else if(strstr(bmname,"WH")){
@@ -120,7 +93,7 @@ int verify(char *bmname,double rnm2){
            fprintf(stderr,"No such benchmark as %s.\n",bmname);
          }
      verified = 0;
-       }else if(cls=='B') {
+       }else if(class=='B') {
          if(strstr(bmname,"BH")){
        verify_value = 4317114.0;
          }else if(strstr(bmname,"WH")){
@@ -131,7 +104,7 @@ int verify(char *bmname,double rnm2){
            fprintf(stderr,"No such benchmark as %s.\n",bmname);
        verified = 0;
          }
-       }else if(cls=='C') {
+       }else if(class=='C') {
          if(strstr(bmname,"BH")){
        verify_value = 0.0;
          }else if(strstr(bmname,"WH")){
@@ -142,7 +115,7 @@ int verify(char *bmname,double rnm2){
            fprintf(stderr,"No such benchmark as %s.\n",bmname);
        verified = -1;
          }
-       }else if(cls=='D') {
+       }else if(class=='D') {
          if(strstr(bmname,"BH")){
        verify_value = 0.0;
          }else if(strstr(bmname,"WH")){
@@ -154,7 +127,7 @@ int verify(char *bmname,double rnm2){
          }
          verified = -1;
        }else{
-         fprintf(stderr,"No such class as %c.\n",cls);
+         fprintf(stderr,"No such class as %c.\n",class);
        }
        fprintf(stderr," %s L2 Norm = %f\n",bmname,rnm2);
        if(verified==-1){
@@ -174,7 +147,7 @@ int verify(char *bmname,double rnm2){
     return  verified;  
   }
 
-int ipowMod(int a,long long int n,int md){ 
+static int ipowMod(int a,long long int n,int md){
   int seed=1,q=a,r=1;
   if(n<0){
     fprintf(stderr,"ipowMod: exponent must be nonnegative exp=%lld\n",n);
@@ -203,13 +176,13 @@ int ipowMod(int a,long long int n,int md){
 }
 
 #include "DGraph.h"
-DGraph *buildSH(char cls){
+static DGraph *buildSH(const char cls){
 /*
   Nodes of the graph must be topologically sorted
   to avoid MPI deadlock.
 */
   DGraph *dg;
-  int numSources=NUM_SOURCES; /* must be power of 2 */
+  int numSources=num_sources; /* must be power of 2 */
   int numOfLayers=0,tmpS=numSources>>1;
   int firstLayerNode=0;
   DGArc *ar=NULL;
@@ -263,13 +236,10 @@ DGraph *buildSH(char cls){
   }
 return dg;
 }
-DGraph *buildWH(char cls){
-/*
-  Nodes of the graph must be topologically sorted
-  to avoid MPI deadlock.
-*/
+static DGraph *buildWH(const char cls){
+/*  Nodes of the graph must be topologically sorted to avoid MPI deadlock. */
   int i=0,j=0;
-  int numSources=NUM_SOURCES,maxInDeg=4;
+  int numSources=num_sources,maxInDeg=4;
   int numLayerNodes=numSources,firstLayerNode=0;
   int totComparators=0;
   int numPrevLayerNodes=numLayerNodes;
@@ -308,7 +278,7 @@ DGraph *buildWH(char cls){
     firstLayerNode+=numPrevLayerNodes;
     numPrevLayerNodes=numLayerNodes;
   }
-  source=newNode("Source");
+  source=newNode((char*)"Source");
   AttachNode(dg,source);   
   for(i=0;i<numPrevLayerNodes;i++){
     nd=dg->node[firstLayerNode+i];
@@ -325,13 +295,10 @@ DGraph *buildWH(char cls){
   }
 return dg;
 }
-DGraph *buildBH(char cls){
-/*
-  Nodes of the graph must be topologically sorted
-  to avoid MPI deadlock.
-*/
+static DGraph *buildBH(const char cls){
+/* Nodes of the graph must be topologically sorted to avoid MPI deadlock.*/
   int i=0,j=0;
-  int numSources=NUM_SOURCES,maxInDeg=4;
+  int numSources=num_sources,maxInDeg=4;
   int numLayerNodes=numSources,firstLayerNode=0;
   DGraph *dg;
   DGNode *nd=NULL, *snd=NULL, *sink=NULL;
@@ -368,7 +335,7 @@ DGraph *buildBH(char cls){
     firstLayerNode+=numPrevLayerNodes;
     numPrevLayerNodes=numLayerNodes;
   }
-  sink=newNode("Sink");
+  sink=newNode((char*)"Sink");
   AttachNode(dg,sink);   
   for(i=0;i<numPrevLayerNodes;i++){
     nd=dg->node[firstLayerNode+i];
@@ -382,38 +349,42 @@ typedef struct{
   int len;
   double* val;
 } Arr;
-Arr *newArr(int len){
-  Arr *arr=(Arr *)malloc(sizeof(Arr));
+
+static Arr *newArr(int len){
+  Arr *arr=(Arr *)malloc(sizeof(Arr)); //Arr *arr=(Arr *)SMPI_SHARED_MALLOC(sizeof(Arr));
   arr->len=len;
-  arr->val=(double *)malloc(len*sizeof(double));
+  arr->val=(double *)malloc(len*sizeof(double)); //arr->val=(double *)SMPI_SHARED_MALLOC(len*sizeof(double));
   return arr;
 }
-void arrShow(Arr* a){
+
+static void arrShow(Arr* a){
   if(!a) fprintf(stderr,"-- NULL array\n");
   else{
     fprintf(stderr,"-- length=%d\n",a->len);
   }
 }
-double CheckVal(Arr *feat){
+
+static double CheckVal(Arr *feat){
   double csum=0.0;
   int i=0;
   for(i=0;i<feat->len;i++){
-    csum+=feat->val[i]*feat->val[i]/feat->len; /* The truncation does not work since 
-                                                  result will be 0 for large len  */
+    csum+=feat->val[i]*feat->val[i]/feat->len; /* The truncation does not work since result will be 0 for large len  */
   }
-   return csum;
+  return csum;
 }
-int GetFNumDPar(int* mean, int* stdev){
-  *mean=NUM_SAMPLES;
-  *stdev=STD_DEVIATION;
+
+static int GetFNumDPar(int* mean, int* stdev){
+  *mean=num_samples;
+  *stdev=deviation;
   return 0;
 }
-int GetFeatureNum(char *mbname,int id){
+
+static int GetFeatureNum(char *mbname,int id){
   double tran=314159265.0;
   double A=2*id+1;
   double denom=randlc(&tran,&A);
   char cval='S';
-  int mean=NUM_SAMPLES,stdev=128;
+  int mean=num_samples,stdev=128;
   int rtfs=0,len=0;
   GetFNumDPar(&mean,&stdev);
   rtfs=ipowMod((int)(1/denom)*(int)cval,(long long int) (2*id+1),2*stdev);
@@ -421,7 +392,8 @@ int GetFeatureNum(char *mbname,int id){
   len=mean-stdev+rtfs;
   return len;
 }
-Arr* RandomFeatures(char *bmname,int fdim,int id){
+
+static Arr* RandomFeatures(char *bmname,int fdim,int id){
   int len=GetFeatureNum(bmname,id)*fdim;
   Arr* feat=newArr(len);
   int nxg=2,nyg=2,nzg=2,nfg=5;
@@ -450,31 +422,33 @@ Arr* RandomFeatures(char *bmname,int fdim,int id){
     timer_stop(id+1);
     fprintf(stderr,"** RandomFeatures time in node %d = %f\n",id,timer_read(id+1));
   }
-  return feat;   
+  return feat;
 }
-void Resample(Arr *a,int blen){
+
+static void Resample(Arr *a,int blen){
     long long int i=0,j=0,jlo=0,jhi=0;
     double avval=0.0;
     double *nval=(double *)malloc(blen*sizeof(double));
-    Arr *tmp=newArr(10);
+    //double *nval=(double *)SMPI_SHARED_MALLOC(blen*sizeof(double));
     for(i=0;i<blen;i++) nval[i]=0.0;
     for(i=1;i<a->len-1;i++){
       jlo=(int)(0.5*(2*i-1)*(blen/a->len)); 
       jhi=(int)(0.5*(2*i+1)*(blen/a->len));
 
-      avval=a->val[i]/(jhi-jlo+1);    
+      avval=a->val[i]/(jhi-jlo+1);
       for(j=jlo;j<=jhi;j++){
         nval[j]+=avval;
       }
     }
     nval[0]=a->val[0];
     nval[blen-1]=a->val[a->len-1];
-    free(a->val);
+    free(a->val); //SMPI_SHARED_FREE(a->val);
     a->val=nval;
     a->len=blen;
 }
+
 #define fielddim 4
-Arr* WindowFilter(Arr *a, Arr* b,int w){
+static Arr* WindowFilter(Arr *a, Arr* b,int w){
   int i=0,j=0,k=0;
   double rms0=0.0,rms1=0.0,rmsm1=0.0;
   double weight=((double) (w+1))/(w+2);
@@ -534,7 +508,7 @@ Arr* WindowFilter(Arr *a, Arr* b,int w){
   return a;
 }
 
-int SendResults(DGraph *dg,DGNode *nd,Arr *feat){
+static int SendResults(DGraph *dg,DGNode *nd,Arr *feat){
   int i=0,tag=0;
   DGArc *ar=NULL;
   DGNode *head=NULL;
@@ -553,8 +527,8 @@ int SendResults(DGraph *dg,DGNode *nd,Arr *feat){
   TRACE_smpi_set_category (NULL);
   return 1;
 }
-Arr* CombineStreams(DGraph *dg,DGNode *nd){
-  Arr *resfeat=newArr(NUM_SAMPLES*fielddim);
+static Arr* CombineStreams(DGraph *dg,DGNode *nd){
+  Arr *resfeat=newArr(num_samples*fielddim);
   int i=0,len=0,tag=0;
   DGArc *ar=NULL;
   DGNode *tail=NULL;
@@ -573,27 +547,27 @@ Arr* CombineStreams(DGraph *dg,DGNode *nd){
       feat=newArr(len);
       MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
       resfeat=WindowFilter(resfeat,feat,nd->id);
-      free(feat);
+      free(feat);//SMPI_SHARED_FREE(feat);
     }else{
       featp=(Arr *)tail->feat;
       feat=newArr(featp->len);
       memcpy(feat->val,featp->val,featp->len*sizeof(double));
       resfeat=WindowFilter(resfeat,feat,nd->id);  
-      free(feat);
+      free(feat);//SMPI_SHARED_FREE(feat);
     }
   }
   for(i=0;i<resfeat->len;i++) resfeat->val[i]=((int)resfeat->val[i])/nd->inDegree;
   nd->feat=resfeat;
   return nd->feat;
 }
-double Reduce(Arr *a,int w){
+
+static double Reduce(Arr *a,int w){
   double retv=0.0;
   if(timer_on){
     timer_clear(w);
     timer_start(w);
   }
-  retv=(int)(w*CheckVal(a));/* The casting needed for node  
-                               and array dependent verifcation */
+  retv=(int)(w*CheckVal(a));/* The casting needed for node and array dependent verification */
   if(timer_on){
     timer_stop(w);
     fprintf(stderr,"** Reduce time in node %d = %f\n",(w-1),timer_read(w));
@@ -601,7 +575,7 @@ double Reduce(Arr *a,int w){
   return retv;
 }
 
-double ReduceStreams(DGraph *dg,DGNode *nd){
+static double ReduceStreams(DGraph *dg,DGNode *nd){
   double csum=0.0;
   int i=0,len=0,tag=0;
   DGArc *ar=NULL;
@@ -623,7 +597,7 @@ double ReduceStreams(DGraph *dg,DGNode *nd){
       feat=newArr(len);
       MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
       csum+=Reduce(feat,(nd->id+1));  
-      free(feat);
+      free(feat);//SMPI_SHARED_FREE(feat);
     }else{
       csum+=Reduce(tail->feat,(nd->id+1));  
     }
@@ -633,7 +607,7 @@ double ReduceStreams(DGraph *dg,DGNode *nd){
   return retv;
 }
 
-int ProcessNodes(DGraph *dg,int me){
+static int ProcessNodes(DGraph *dg,int me){
   double chksum=0.0;
   Arr *feat=NULL;
   int i=0,verified=0,tag;
@@ -683,17 +657,27 @@ int main(int argc,char **argv ){
   int verified=0, featnum=0;
   double bytes_sent=2.0,tot_time=0.0;
 
-    MPI_Init( &argc, &argv );
-    MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
-    MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
-    TRACE_smpi_set_category ("begin");
-
-     if(argc!=2||
-                (  strncmp(argv[1],"BH",2)!=0
-                 &&strncmp(argv[1],"WH",2)!=0
-                 &&strncmp(argv[1],"SH",2)!=0
-                )
-      ){
+  MPI_Init( &argc, &argv );
+  MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
+  MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
+
+  TRACE_smpi_set_category ("begin");
+  get_info(argc, argv, &nprocs, &class);
+  check_info(DT, nprocs, class);
+
+  if      (class == 'S') { num_samples=1728; deviation=128; num_sources=4; }
+  else if (class == 'W') { num_samples=1728*8; deviation=128*2; num_sources=4*2; }
+  else if (class == 'A') { num_samples=1728*64; deviation=128*4; num_sources=4*4; }
+  else if (class == 'B') { num_samples=1728*512; deviation=128*8; num_sources=4*8; }
+  else if (class == 'C') { num_samples=1728*4096; deviation=128*16; num_sources=4*16; }
+  else if (class == 'D') { num_samples=1728*4096*8; deviation=128*32; num_sources=4*32; }
+  else {
+    printf("setparams: Internal error: invalid class type %c\n", class);
+    exit(1);
+  }
+
+
+     if(argc!=2|| (  strncmp(argv[1],"BH",2)!=0 && strncmp(argv[1],"WH",2)!=0 &&strncmp(argv[1],"SH",2)!=0)){
       if(my_rank==0){
         fprintf(stderr,"** Usage: mpirun -np N ../bin/dt.S GraphName\n");
         fprintf(stderr,"** Where \n   - N is integer number of MPI processes\n");
@@ -706,11 +690,11 @@ int main(int argc,char **argv ){
       exit(0);
     } 
    if(strncmp(argv[1],"BH",2)==0){
-      dg=buildBH(CLASS);
+      dg=buildBH(class);
     }else if(strncmp(argv[1],"WH",2)==0){
-      dg=buildWH(CLASS);
+      dg=buildWH(class);
     }else if(strncmp(argv[1],"SH",2)==0){
-      dg=buildSH(CLASS);
+      dg=buildSH(class);
     }
 
     if(timer_on&&dg->numNodes+1>timers_tot){
@@ -740,32 +724,14 @@ int main(int argc,char **argv ){
     verified=ProcessNodes(dg,my_rank);
     TRACE_smpi_set_category ("end");
 
-    featnum=NUM_SAMPLES*fielddim;
+    featnum=num_samples*fielddim;
     bytes_sent=featnum*dg->numArcs;
     bytes_sent/=1048576;
     if(my_rank==0){
       timer_stop(0);
       tot_time=timer_read(0);
-      c_print_results( dg->name,
-                 CLASS,
-                 featnum,
-                 0,
-                 0,
-                 dg->numNodes,
-                 0,
-                 comm_size,
-                 tot_time,
-                 bytes_sent/tot_time,
-                 "bytes transmitted", 
-                 verified,
-                 NPBVERSION,
-                 COMPILETIME,
-                 MPICC,
-                 CLINK,
-                 CMPI_LIB,
-                 CMPI_INC,
-                 CFLAGS,
-                 CLINKFLAGS );
+      c_print_results( dg->name, class, featnum, 0, 0, dg->numNodes, 0, comm_size, tot_time, bytes_sent/tot_time,
+                 "bytes transmitted", verified);
     }          
     MPI_Finalize();
   return 1;
diff --git a/examples/smpi/NAS/ep.c b/examples/smpi/NAS/ep.c
new file mode 100644
index 0000000000..c56cf2043f
--- /dev/null
+++ b/examples/smpi/NAS/ep.c
@@ -0,0 +1,318 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "smpi/mpi.h"
+#include "nas_common.h"
+#include "simgrid/instr.h" //TRACE_
+
+char class;   /* problem class letter, filled in by get_info() in main() */
+int nprocs;   /* process count filled in by get_info(); passed to check_info() and c_print_results() */
+
+#define true 1   /* integer booleans used for the 'verified' and 'stop' flags in main() */
+#define false 0
+
+int main(int argc, char **argv) { /* EP benchmark driver; returns 0 after MPI_Finalize() */
+  double dum[3] = {1.,1.,1.};
+  double x1, x2, sx, sy, tm, an, tt, gc;
+  double Mops;
+  double epsilon=1.0E-8, a = 1220703125., s=271828183.;
+  double t1, t2, t3, t4;
+  double sx_verify_value, sy_verify_value, sx_err, sy_err;
+
+  int    m, mk=16,
+         mm, nn,
+         nk = (int)(pow(2,mk)),
+         nq=10,
+         np,
+         node,
+         no_nodes,
+         i,
+         ik,
+         kk,
+         l,
+         k, nit, no_large_nodes,
+         np_add, k_offset;
+  int    root=0;
+  int verified;
+  char  size[500]; // mind the size of the string to represent a big number
+
+  double *start = (double *) malloc (64*sizeof(double));
+  double *elapsed = (double *) malloc (64*sizeof(double));
+
+  double *x = (double *) malloc (2*nk*sizeof(double));
+  double *q = (double *) malloc (nq*sizeof(double));
+
+  MPI_Init( &argc, &argv );
+  MPI_Comm_size( MPI_COMM_WORLD, &no_nodes);
+  MPI_Comm_rank( MPI_COMM_WORLD, &node);
+
+  TRACE_smpi_set_category ("start");
+
+  get_info(argc, argv, &nprocs, &class);
+  check_info(EP, nprocs, class);
+
+  if      (class == 'S') { m = 24; }
+  else if (class == 'W') { m = 25; }
+  else if (class == 'A') { m = 28; }
+  else if (class == 'B') { m = 30; }
+  else if (class == 'C') { m = 32; }
+  else if (class == 'D') { m = 36; }
+  else if (class == 'E') { m = 40; }
+  else {
+    printf("EP: Internal error: invalid class type %c\n", class);
+    exit(1);
+  }
+  mm = m -mk;
+  nn = (int)(pow(2,mm));  /* total number of batches; each batch draws 2*nk random numbers */
+
+  root = 0;
+  if (node == root ) {
+    /* The problem size does not fit in a 32-bit integer for the larger classes, so it is formatted into a string.
+     * pow() yields an exact power of two as a double; "%.0f" prints it without a decimal point and without
+     * casting to an integer type whose width differs between platforms.
+     */
+    fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark");
+    snprintf(size, sizeof size, "%.0f", pow(2,m+1));
+    // (replaces sprintf with "%zu" applied to an unsigned long argument: conversion/argument type mismatch)
+    fprintf(stdout," Number of random numbers generated: %s\n",size);
+    fprintf(stdout," Number of active processes: %d\n",no_nodes);
+  }
+  verified = false;
+
+  /* Compute the number of "batches" of random number pairs generated per processor. Adjust if the number of processors
+   * does not evenly divide the total number
+   */
+
+  np = nn / no_nodes;
+  no_large_nodes = nn % no_nodes;
+  if (node < no_large_nodes) np_add = 1;
+  else np_add = 0;
+  np = np + np_add;
+
+  if (np == 0) {
+    fprintf(stdout,"Too many nodes: %d  %d",no_nodes,nn);
+    MPI_Abort(MPI_COMM_WORLD,1);
+    exit(0);
+  }
+
+  /* Call the random number generator functions and initialize the x-array to reduce the effects of paging the timings.
+   Also, call all mathematical functions that are used. Make sure initializations cannot be eliminated as dead code. */
+
+  //call vranlc(0, dum[1], dum[2], dum[3]);
+  // Array indexes start at 1 in Fortran, 0 in Java
+  vranlc(0, dum[0], dum[1], &(dum[2]));
+
+  dum[0] = randlc(&(dum[1]),&(dum[2]));
+  for (i=0;i<2*nk;i++) {
+    x[i] = -1e99;
+  }
+  Mops = log(sqrt(abs(1)));
+
+   /* Synchronize before placing time stamp */
+  MPI_Barrier( MPI_COMM_WORLD );
+
+  TRACE_smpi_set_category ("ep");
+
+  time_clear(&(elapsed[1]));
+  time_clear(&(elapsed[2]));
+  time_clear(&(elapsed[3]));
+  time_start(&(start[1]));
+
+  t1 = a;
+  //fprintf(stdout,("(ep.f:160) t1 = " + t1);
+  t1 = vranlc(0, t1, a, x);
+  //fprintf(stdout,("(ep.f:161) t1 = " + t1);
+
+  /* Compute AN = A ^ (2 * NK) (mod 2^46). */
+  t1 = a;
+  //fprintf(stdout,("(ep.f:165) t1 = " + t1);
+  for (i=1; i <= mk+1; i++) {
+    t2 = randlc(&t1, &t1);
+    //fprintf(stdout,("(ep.f:168)[loop i=" + i +"] t1 = " + t1);
+  }
+  an = t1;
+  //fprintf(stdout,("(ep.f:172) s = " + s);
+  tt = s;
+  gc = tt = 0.;
+  sx = 0.;
+  sy = 0.;
+  for (i=0; i < nq ; i++) {
+    q[i] = 0.;
+  }
+
+/* Each instance of this loop may be performed independently. We compute the k offsets separately to take into account
+ * the fact that some nodes have more numbers to generate than others */
+
+  if (np_add == 1)
+    k_offset = node * np -1;
+  else
+    k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1;
+
+  int stop = false;
+  for(k = 1; k <= np; k++) {// SMPI_SAMPLE_LOCAL(0.25 * np, 0.03) {
+    stop = false;
+    kk = k_offset + k ;
+    t1 = s;
+    //fprintf(stdout,("(ep.f:193) t1 = " + t1);
+    t2 = an;
+
+    //       Find starting seed t1 for this kk.
+    for (i=1;i<=100 && !stop;i++) {
+      ik = kk / 2;
+      //fprintf(stdout,("(ep.f:199) ik = " +ik+", kk = " + kk);
+      if (2 * ik != kk)  {
+        t3 = randlc(&t1, &t2);
+        //fprintf(stdout,("(ep.f:200) t1= " +t1 );
+      }
+      if (ik==0)
+        stop = true;
+      else {
+        t3 = randlc(&t2, &t2);
+        kk = ik;
+      }
+    }
+    //       Compute uniform pseudorandom numbers.
+
+    //if (timers_enabled)  timer_start(3);
+    time_start(&(start[3]));
+    //call vranlc(2 * nk, t1, a, x)  --> t1 and y are modified
+
+    //fprintf(stdout,">>>>>>>>>>>Before vranlc(l.210)<<<<<<<<<<<<<");
+    //fprintf(stdout,"2*nk = " + (2*nk));
+    //fprintf(stdout,"t1 = " + t1);
+    //fprintf(stdout,"a  = " + a);
+    //fprintf(stdout,"x[0] = " + x[0]);
+    //fprintf(stdout,">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
+    t1 = vranlc(2 * nk, t1, a, x);
+
+    //fprintf(stdout,(">>>>>>>>>>>After  Enter vranlc (l.210)<<<<<<");
+    //fprintf(stdout,("2*nk = " + (2*nk));
+    //fprintf(stdout,("t1 = " + t1);
+    //fprintf(stdout,("a  = " + a);
+    //fprintf(stdout,("x[0] = " + x[0]);
+    //fprintf(stdout,(">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
+
+    //if (timers_enabled)  timer_stop(3);
+    time_stop(3,elapsed,start);
+
+    /* Compute Gaussian deviates by acceptance-rejection method and tally counts in concentric square annuli.
+     * This loop is not vectorizable. */
+    //if (timers_enabled) timer_start(2);
+    time_start(&(start[2]));
+    for(i=1; i<=nk;i++) {
+      x1 = 2. * x[2*i-2] -1.0;
+      x2 = 2. * x[2*i-1] - 1.0;
+      t1 = x1*x1 + x2*x2;
+      if (t1 <= 1.) {
+        t2   = sqrt(-2. * log(t1) / t1);
+        t3   = (x1 * t2);
+        t4   = (x2 * t2);
+        l    = (int)(fabs(t3) > fabs(t4) ? fabs(t3) : fabs(t4)); /* annulus index = floor(max(|t3|,|t4|)) */
+        q[l] = q[l] + 1.;
+        sx   = sx + t3;
+        sy   = sy + t4;
+      }
+      /*
+       if(i == 1) {
+                fprintf(stdout,"x1 = " + x1);
+                fprintf(stdout,"x2 = " + x2);
+                fprintf(stdout,"t1 = " + t1);
+                fprintf(stdout,"t2 = " + t2);
+                fprintf(stdout,"t3 = " + t3);
+                fprintf(stdout,"t4 = " + t4);
+                fprintf(stdout,"l = " + l);
+                fprintf(stdout,"q[l] = " + q[l]);
+                fprintf(stdout,"sx = " + sx);
+                fprintf(stdout,"sy = " + sy);
+       }
+       */
+    }
+    //if (timers_enabled)  timer_stop(2);
+    time_stop(2,elapsed,start);
+  }
+
+  TRACE_smpi_set_category ("finalize");
+
+  MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  sx = x[0]; //FIXME :  x[0] or x[1] => x[0] because fortran starts with 1
+  MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  sy = x[0];
+  MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+  for(i = 0; i < nq; i++) {
+    q[i] = x[i];
+  }
+  for(i = 0; i < nq; i++) {
+    gc += q[i];
+  }
+
+  time_stop(1,elapsed,start);
+  tm = time_read(1,elapsed);
+  MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  tm = x[0];
+
+  if(node == root) {
+    nit = 0;
+    verified = true;
+
+    if(m == 24) {
+      sx_verify_value = -3.247834652034740E3;
+      sy_verify_value = -6.958407078382297E3;
+    } else if(m == 25) {
+      sx_verify_value = -2.863319731645753E3;
+      sy_verify_value = -6.320053679109499E3;
+    } else if(m == 28) {
+      sx_verify_value = -4.295875165629892E3;
+      sy_verify_value = -1.580732573678431E4;
+    } else if(m == 30) {
+      sx_verify_value =  4.033815542441498E4;
+      sy_verify_value = -2.660669192809235E4;
+    } else if(m == 32) {
+      sx_verify_value =  4.764367927995374E4;
+      sy_verify_value = -8.084072988043731E4;
+    } else if(m == 36) {
+      sx_verify_value =  1.982481200946593E5;
+      sy_verify_value = -1.020596636361769E5;
+    } else {
+      verified = false;
+    }
+
+    /*
+    fprintf(stdout,("sx        = " + sx);
+    fprintf(stdout,("sx_verify = " + sx_verify_value);
+    fprintf(stdout,("sy        = " + sy);
+    fprintf(stdout,("sy_verify = " + sy_verify_value);
+    */
+    if(verified) {
+      sx_err = fabs((sx - sx_verify_value)/sx_verify_value);
+      sy_err = fabs((sy - sy_verify_value)/sy_verify_value);
+      /*
+       * fabs() keeps the relative errors as doubles. The integer abs() used
+       * previously truncated every error below 1.0 to 0, so the epsilon
+       * comparison below succeeded for almost any result.
+       */
+      verified = ((sx_err < epsilon) && (sy_err < epsilon));
+    }
+
+    Mops = (pow(2.0, m+1))/tm/1000;
+
+    fprintf(stdout,"EP Benchmark Results:\n");
+    fprintf(stdout,"CPU Time=%d\n",(int) tm);
+    fprintf(stdout,"N = 2^%d\n",m);
+    fprintf(stdout,"No. Gaussain Pairs =%d\n",(int) gc);
+    fprintf(stdout,"Sum = %f %ld\n",sx,(long) sy);
+    fprintf(stdout,"Count:");
+    for(i = 0; i < nq; i++) {
+      fprintf(stdout,"%d\t %ld\n",i,(long) q[i]);
+    }
+    c_print_results("EP", class, m+1, 0, 0, nit, nprocs, no_nodes, tm, Mops, "Random number generated",verified);
+
+    fprintf(stdout,"Total time:     %f\n",(time_read(1,elapsed)/1000));
+    fprintf(stdout,"Gaussian pairs: %f\n",(time_read(2,elapsed)/1000));
+    fprintf(stdout,"Random numbers: %f\n",(time_read(3,elapsed)/1000));
+  }
+
+  MPI_Finalize();
+  return 0;
+}
diff --git a/examples/smpi/NAS/is.c b/examples/smpi/NAS/is.c
new file mode 100644
index 0000000000..68cbb77487
--- /dev/null
+++ b/examples/smpi/NAS/is.c
@@ -0,0 +1,637 @@
+/*************************************************************************
+ *                                                                       * 
+ *        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3       *
+ *                                                                       * 
+ *                                  I S                                  * 
+ *                                                                       * 
+ ************************************************************************* 
+ *                                                                       * 
+ *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
+ *   It is described in NAS Technical Report 95-020.                     * 
+ *                                                                       * 
+ *   Permission to use, copy, distribute and modify this software        * 
+ *   for any purpose with or without fee is hereby granted.  We          * 
+ *   request, however, that all derived work reference the NAS           * 
+ *   Parallel Benchmarks 3.3. This software is provided "as is"          *
+ *   without express or implied warranty.                                * 
+ *                                                                       * 
+ *   Information on NPB 3.3, including the technical report, the         *
+ *   original specifications, source code, results and information       * 
+ *   on how to submit new results, is available at:                      * 
+ *                                                                       * 
+ *          http://www.nas.nasa.gov/Software/NPB                         * 
+ *                                                                       * 
+ *   Send comments or suggestions to  npb@nas.nasa.gov                   * 
+ *   Send bug reports to              npb-bugs@nas.nasa.gov              * 
+ *                                                                       * 
+ *         NAS Parallel Benchmarks Group                                 * 
+ *         NASA Ames Research Center                                     * 
+ *         Mail Stop: T27A-1                                             * 
+ *         Moffett Field, CA   94035-1000                                * 
+ *                                                                       * 
+ *         E-mail:  npb@nas.nasa.gov                                     * 
+ *         Fax:     (650) 604-3957                                       * 
+ *                                                                       * 
+ ************************************************************************* 
+ *                                                                       * 
+ *   Author: M. Yarrow                                                   * 
+ *           H. Jin                                                      * 
+ *                                                                       * 
+ *************************************************************************/
+
+#include "smpi/mpi.h"
+#include "nas_common.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "simgrid/instr.h" //TRACE_
+
+char class;             /* problem class letter, filled in by get_info() in main() */
+int nprocs;             /* process count filled in by get_info() */
+int total_keys_log2;    /* log2 of total_keys; set per class in main() */
+int max_key_log_2;      /* log2 of max_key; set per class in main() */
+int num_bucket_log_2;   /* log2 of num_buckets; set per class in main() */
+int min_procs=1;        /* main() raises this to 4 for class D; it also scales num_keys */
+/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024.
+ * INCREASE max_procs AT YOUR PERIL
+ */
+int max_procs=1024;     /* main() lowers this to 128 for class S */
+
+int total_keys;         /* 1 << total_keys_log2 */
+int max_key;            /* 1 << max_key_log_2 */
+int num_buckets;        /* 1 << num_bucket_log_2 */
+int num_keys;           /* total_keys/nprocs*min_procs: number of keys generated per process */
+long size_of_buffers;   /* element count of key_array, key_buff1 and key_buff2; chosen from nprocs in main() */
+
+#define  MAX_ITERATIONS      10   /* rank() saves its results for full_verify() on this iteration */
+#define  TEST_ARRAY_SIZE     5    /* number of partial-verification test keys per class */
+
+/* Typedef: if necessary, change the size of int here by changing the  int type to, say, long */
+typedef  int  INT_TYPE;
+typedef  long INT_TYPE2;
+#define MP_KEY_TYPE MPI_INT   /* MPI datatype used to transfer INT_TYPE buffers; must stay in sync with INT_TYPE */
+
+typedef struct {
+/* MPI properties:  */
+int      my_rank, comm_size;
+/* Some global info */
+INT_TYPE *key_buff_ptr_global,         /* used by full_verify to get */
+         total_local_keys,             /* copies of rank info        */
+         total_lesser_keys;
+
+int      passed_verification;          /* incremented by rank() and full_verify() for every check that passes */
+/* These are the three main arrays; main() allocates size_of_buffers elements for each */
+INT_TYPE *key_array, *key_buff1, *key_buff2,
+         *bucket_size,     /* Top 5 elements for */
+         *bucket_size_totals, /* part. ver. vals */
+         *bucket_ptrs, *process_bucket_distrib_ptr1, *process_bucket_distrib_ptr2;
+int      send_count[1024], recv_count[1024], send_displ[1024], recv_displ[1024]; /* presumably 1024 == default max_procs; TODO confirm */
+
+/* Partial verif info */
+INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
+         test_rank_array[TEST_ARRAY_SIZE];
+} global_data;
+
+const INT_TYPE2   /* per-class partial-verification tables; main() copies one pair into gd->test_*_array */
+         S_test_index_array[TEST_ARRAY_SIZE] = {48427,17148,23627,62548,4431},
+         S_test_rank_array[TEST_ARRAY_SIZE] =  {0,18,346,64917,65463},
+         W_test_index_array[TEST_ARRAY_SIZE] = {357773,934767,875723,898999,404505},
+         W_test_rank_array[TEST_ARRAY_SIZE] =  {1249,11698,1039987,1043896,1048018},
+
+         A_test_index_array[TEST_ARRAY_SIZE] = {2112377,662041,5336171,3642833,4250760},
+         A_test_rank_array[TEST_ARRAY_SIZE] =  {104,17523,123928,8288932,8388264},
+
+         B_test_index_array[TEST_ARRAY_SIZE] = {41869,812306,5102857,18232239,26860214},
+         B_test_rank_array[TEST_ARRAY_SIZE] =  {33422937,10244,59149,33135281,99},
+
+         C_test_index_array[TEST_ARRAY_SIZE] = {44172927,72999161,74326391,129606274,21736814},
+         C_test_rank_array[TEST_ARRAY_SIZE] =  {61147,882988,266290,133997595,133525895},
+
+         D_test_index_array[TEST_ARRAY_SIZE] = {1317351170,995930646,1157283250,1503301535,1453734525},
+         D_test_rank_array[TEST_ARRAY_SIZE] =  {1,36538729,1978098519,2145192618,2147425337};
+
+void full_verify( global_data* gd );
+
+/************ returns parallel random number seq seed ************/
+/*
+ * Create a random number sequence of total length nn residing on np number of processors.  Each processor will
+ * therefore have a subsequence of length nn/np.  This routine returns that random number which is the first random
+ * number for the subsequence belonging to processor rank kn, and which is used as seed for proc kn ran # gen.
+ */
+static double  find_my_seed( int  kn,       /* my processor rank, 0<=kn<=num procs */
+                             int  np,       /* np = num procs                      */
+                             long nn,       /* total num of ran numbers, all procs */
+                             double s,      /* Ran num seed, for ex.: 314159265.00 */
+                             double a )     /* Ran num gen mult, try 1220703125.00 */
+{
+  long   i;
+  double t1,t2,t3=0.0,an;   /* t3 starts at 0: for kn == 0 the loop below breaks before ever assigning it */
+  long   mq,nq,kk,ik;
+
+  nq = nn / np;
+
+  for( mq=0; nq>1; mq++,nq/=2);   /* mq = number of halvings needed to bring nn/np down to 1 */
+
+  t1 = a;
+
+  for( i=1; i<=mq; i++ )
+    t2 = randlc( &t1, &t1 );
+
+  an = t1;
+
+  kk = kn;
+  t1 = s;
+  t2 = an;
+
+  for( i=1; i<=100; i++ ){        /* walks the binary digits of kn, lowest first */
+    ik = kk / 2;
+    if( 2 * ik !=  kk )
+      t3 = randlc( &t1, &t2 );
+    if( ik == 0 )
+      break;
+    t3 = randlc( &t2, &t2 );
+    kk = ik;
+  }
+  an=t3;//added to silence paranoid compilers
+
+  return t1;
+}
+
+static void create_seq( global_data* gd, double seed, double a )   /* fills gd->key_array[0..num_keys-1] */
+{
+  double x;
+  int    i, k;
+
+  k = max_key/4;
+
+  for (i=0; i<num_keys; i++){
+    x = randlc(&seed, &a);
+    x += randlc(&seed, &a);
+    x += randlc(&seed, &a);
+    x += randlc(&seed, &a);
+    /* key = (max_key/4) * (sum of four randlc() draws), converted to INT_TYPE on assignment */
+    gd->key_array[i] = k*x;
+  }
+}
+
+void full_verify( global_data* gd )   /* final check; bumps gd->passed_verification when no key is out of order */
+{
+  MPI_Status  status;
+  MPI_Request request;
+
+  INT_TYPE    i, j;
+  INT_TYPE    k, last_local_key;   /* k is only written (and read) on ranks > 0 */
+
+/*  Now, finally, sort the keys:  */
+  for( i=0; i<gd->total_local_keys; i++ )
+    gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]- gd->total_lesser_keys] = gd->key_buff2[i];
+  last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);
+
+/*  Send largest key value to next processor  */
+  if( gd->my_rank > 0 )
+    MPI_Irecv( &k, 1, MP_KEY_TYPE, gd->my_rank-1, 1000, MPI_COMM_WORLD, &request );
+  if( gd->my_rank < gd->comm_size-1 )
+    MPI_Send( &gd->key_array[last_local_key], 1, MP_KEY_TYPE, gd->my_rank+1, 1000, MPI_COMM_WORLD );
+  if( gd->my_rank > 0 )
+    MPI_Wait( &request, &status );
+
+/*  Confirm that neighbor's greatest key value is not greater than my least key value */
+  j = 0;
+  if( gd->my_rank > 0 && gd->total_local_keys > 0 )
+    if( k > gd->key_array[0] )
+      j++;
+
+/*  Confirm keys correctly sorted: count incorrectly sorted keys, if any */
+  for( i=1; i<gd->total_local_keys; i++ )
+    if( gd->key_array[i-1] > gd->key_array[i] )
+      j++;
+
+  if( j != 0 ) {
+    printf( "Processor %d:  Full_verify: number of keys out of sort: %d\n", gd->my_rank, j );
+  } else
+    gd->passed_verification++;
+}
+
+static void rank( global_data* gd, int iteration )   /* one ranking pass; 'iteration' also perturbs two keys on rank 0 */
+{
+  INT_TYPE    i, k;
+  INT_TYPE    shift = max_key_log_2 - num_bucket_log_2;   /* key >> shift is the key's bucket number */
+  INT_TYPE    key;
+  INT_TYPE2   bucket_sum_accumulator, j, m;
+  INT_TYPE    local_bucket_sum_accumulator;
+  INT_TYPE    min_key_val, max_key_val;
+  INT_TYPE    *key_buff_ptr;
+
+/*  Iteration alteration of keys */
+  if(gd->my_rank == 0){
+    gd->key_array[iteration] = iteration;
+    gd->key_array[iteration+MAX_ITERATIONS] = max_key - iteration;
+  }
+
+/*  Initialize */
+  for( i=0; i<num_buckets+TEST_ARRAY_SIZE; i++ ){
+    gd->bucket_size[i] = 0;
+    gd->bucket_size_totals[i] = 0;
+    gd->process_bucket_distrib_ptr1[i] = 0;
+    gd->process_bucket_distrib_ptr2[i] = 0;
+  }
+
+/*  Determine where the partial verify test keys are, load into top of array bucket_size */
+  for( i=0; i<TEST_ARRAY_SIZE; i++ )
+    if( (gd->test_index_array[i]/num_keys) == gd->my_rank )
+      gd->bucket_size[num_buckets+i] = gd->key_array[gd->test_index_array[i] % num_keys];
+
+/*  Determine the number of keys in each bucket */
+  for( i=0; i<num_keys; i++ )
+    gd->bucket_size[gd->key_array[i] >> shift]++;
+
+/*  Accumulative bucket sizes are the bucket pointers */
+  gd->bucket_ptrs[0] = 0;
+  for( i=1; i< num_buckets; i++ )
+    gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];
+
+/*  Sort into appropriate bucket */
+  for( i=0; i<num_keys; i++ ) {
+    key = gd->key_array[i];
+    gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
+  }
+
+/*  Get the bucket size totals for the entire problem. These will be used to determine the redistribution of keys */
+  MPI_Allreduce(gd->bucket_size, gd->bucket_size_totals, num_buckets+TEST_ARRAY_SIZE, MP_KEY_TYPE, MPI_SUM,
+                MPI_COMM_WORLD);
+
+/* Determine redistribution of keys: accumulate the bucket size totals till this number surpasses num_keys (which is
+ * the average number of keys per processor).  Then all keys in these buckets go to processor 0.
+   Continue accumulating again until surpassing 2*num_keys. All keys in these buckets go to processor 1, etc.  This
+   algorithm guarantees that all processors have work ranking; no processors are left idle.
+   The optimum number of buckets, however, does not result in as high a degree of load balancing (as even a distribution
+   of keys as is possible) as is obtained from increasing the number of buckets, but more buckets results in more
+   computation per processor so that the optimum number of buckets turns out to be 1024 for machines tested.
+   Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket number of first and last bucket which each
+   processor will have after the redistribution is done.
+*/
+
+  bucket_sum_accumulator = 0;
+  local_bucket_sum_accumulator = 0;
+  gd->send_displ[0] = 0;
+  gd->process_bucket_distrib_ptr1[0] = 0;
+  for( i=0, j=0; i<num_buckets; i++ ) {
+    bucket_sum_accumulator       += gd->bucket_size_totals[i];
+    local_bucket_sum_accumulator += gd->bucket_size[i];
+    if( bucket_sum_accumulator >= (j+1)*num_keys ) {
+      gd->send_count[j] = local_bucket_sum_accumulator;
+      if( j != 0 ){
+         gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
+         gd->process_bucket_distrib_ptr1[j] = gd->process_bucket_distrib_ptr2[j-1]+1;
+      }
+      gd->process_bucket_distrib_ptr2[j++] = i;
+      local_bucket_sum_accumulator = 0;
+    }
+  }
+
+/*  When nprocs approaching num_buckets, it is highly possible that the last few processors don't get any buckets.
+ *  So, we need to set counts properly in this case to avoid any fallouts.    */
+  while( j < gd->comm_size ) {
+    gd->send_count[j] = 0;
+    gd->process_bucket_distrib_ptr1[j] = 1;
+    j++;
+  }
+
+/*  This is the redistribution section:  first find out how many keys
+    each processor will send to every other processor:                 */
+  MPI_Alltoall( gd->send_count, 1, MPI_INT, gd->recv_count, 1, MPI_INT, MPI_COMM_WORLD );
+
+/*  Determine the receive array displacements for the buckets */
+  gd->recv_displ[0] = 0;
+  for( i=1; i<gd->comm_size; i++ )
+    gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];
+
+  /*  Now send the keys to respective processors  */
+  MPI_Alltoallv(gd->key_buff1, gd->send_count, gd->send_displ, MP_KEY_TYPE, gd->key_buff2, gd->recv_count,
+                gd->recv_displ, MP_KEY_TYPE, MPI_COMM_WORLD );
+
+/* The starting and ending bucket numbers on each processor are multiplied by the interval size of the buckets to
+ * obtain the smallest possible min and greatest possible max value of any key on each processor
+ */
+  min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
+  max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;
+
+/*  Clear the work array */
+  for( i=0; i<max_key_val-min_key_val+1; i++ )
+    gd->key_buff1[i] = 0;
+
+/*  Determine the total number of keys on all other processors holding keys of lesser value         */
+  m = 0;
+  for( k=0; k<gd->my_rank; k++ )
+    for( i= gd->process_bucket_distrib_ptr1[k]; i<=gd->process_bucket_distrib_ptr2[k]; i++ )
+      m += gd->bucket_size_totals[i]; /*  m has total # of lesser keys */
+
+/*  Determine total number of keys on this processor */
+  j = 0;
+  for( i= gd->process_bucket_distrib_ptr1[gd->my_rank]; i<=gd->process_bucket_distrib_ptr2[gd->my_rank]; i++ )
+    j += gd->bucket_size_totals[i];     /* j has total # of local keys   */
+
+/*  Ranking of all keys occurs in this section:                 */
+/*  shift it backwards so no subtractions are necessary in loop */
+  key_buff_ptr = gd->key_buff1 - min_key_val;
+
+/*  In this section, the keys themselves are used as their own indexes to determine how many of each there are: their
+    individual population                                       */
+  for( i=0; i<j; i++ )
+    key_buff_ptr[gd->key_buff2[i]]++;  /* Now they have individual key  population                     */
+
+/*  To obtain ranks of each key, successively add the individual key population, not forgetting the total of lesser
+ *  keys, m.
+    NOTE: Since the total of lesser keys would be subtracted later in verification, it is no longer added to the first
+    key population here, but still needed during the partial verify test.  This is to ensure that 32-bit key_buff can
+    still be used for class D.           */
+/*    key_buff_ptr[min_key_val] += m;    */
+  for( i=min_key_val; i<max_key_val; i++ )
+    key_buff_ptr[i+1] += key_buff_ptr[i];
+
+/* This is the partial verify test section */
+/* Observe that test_rank_array vals are shifted differently for different cases */
+  for( i=0; i<TEST_ARRAY_SIZE; i++ ){
+    k = gd->bucket_size_totals[i+num_buckets];    /* Keys were hidden here */
+    if( min_key_val <= k  &&  k <= max_key_val ){
+      /* Add the total of lesser keys, m, here */
+      INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
+      int failed = 0;
+
+      switch( class ){
+        case 'S':
+          if( i <= 2 ) {
+            if( key_rank != gd->test_rank_array[i]+iteration )
+              failed = 1;
+            else
+              gd->passed_verification++;
+          } else {
+            if( key_rank != gd->test_rank_array[i]-iteration )
+              failed = 1;
+            else
+              gd->passed_verification++;
+          }
+          break;
+        case 'W':
+          if( i < 2 ){
+            if( key_rank != gd->test_rank_array[i]+(iteration-2) )
+              failed = 1;
+            else
+              gd->passed_verification++;
+          } else {
+              if( key_rank != gd->test_rank_array[i]-iteration )
+                failed = 1;
+              else
+                gd->passed_verification++;
+          }
+          break;
+        case 'A':
+          if( i <= 2 ){
+            if( key_rank != gd->test_rank_array[i]+(iteration-1) )
+              failed = 1;
+            else
+              gd->passed_verification++;
+          } else {
+              if( key_rank !=  gd->test_rank_array[i]-(iteration-1) )
+                failed = 1;
+              else
+                gd->passed_verification++;
+          }
+          break;
+        case 'B':
+          if( i == 1 || i == 2 || i == 4 ) {
+            if( key_rank != gd->test_rank_array[i]+iteration )
+              failed = 1;
+            else
+              gd->passed_verification++;
+          } else {
+              if( key_rank != gd->test_rank_array[i]-iteration )
+                failed = 1;
+              else
+                gd->passed_verification++;
+          }
+          break;
+        case 'C':
+          if( i <= 2 ){
+            if( key_rank != gd->test_rank_array[i]+iteration )
+              failed = 1;
+            else
+              gd->passed_verification++;
+          } else {
+              if( key_rank != gd->test_rank_array[i]-iteration )
+                failed = 1;
+              else
+                gd->passed_verification++;
+          }
+          break;
+        case 'D':
+          if( i < 2 ) {
+            if( key_rank != gd->test_rank_array[i]+iteration )
+              failed = 1;
+            else
+              gd->passed_verification++;
+           } else {
+              if( key_rank != gd->test_rank_array[i]-iteration )
+                failed = 1;
+              else
+                gd->passed_verification++;
+           }
+         break;
+      }
+      if( failed == 1 )
+        printf( "Failed partial verification: iteration %d, processor %d, test key %d\n",
+               iteration, gd->my_rank, (int)i );
+    }
+  }
+
+/*  Make copies of rank info for use by full_verify: these variables in rank are local; making them global slows down
+ *  the code, probably since they cannot be made register by compiler                        */
+
+  if( iteration == MAX_ITERATIONS ) {
+    gd->key_buff_ptr_global = key_buff_ptr;
+    gd->total_local_keys    = j;
+    gd->total_lesser_keys   = 0;  /* no longer set to 'm', see note above */
+  }
+}
+
+int main( int argc, char **argv )
+{
+  int             i, iteration, itemp;
+  double          timecounter, maxtime;
+
+  global_data* gd = malloc(sizeof(global_data));
+  if( gd == NULL ) {   /* gd is dereferenced right below: do not go on without it */
+    printf( "\n ERROR: cannot allocate the benchmark state\n Exiting program!\n\n" );
+    exit( 1 );
+  }
+/*  Initialize MPI */
+  MPI_Init( &argc, &argv );
+  MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
+  MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );
+
+  get_info(argc, argv, &nprocs, &class);
+  check_info(IS, nprocs, class);
+  for( i=0; i<TEST_ARRAY_SIZE; i++ ) {  /* per-class size parameters and verification arrays */
+    switch( class ) {
+      case 'S':
+        total_keys_log2 = 16;
+        max_key_log_2 = 11;
+        num_bucket_log_2 = 9;
+        max_procs = 128;
+        gd->test_index_array[i] = S_test_index_array[i];
+        gd->test_rank_array[i]  = S_test_rank_array[i];
+        break;
+      case 'A':
+        total_keys_log2 = 23;
+        max_key_log_2 = 19;
+        num_bucket_log_2 = 10;
+        gd->test_index_array[i] = A_test_index_array[i];
+        gd->test_rank_array[i]  = A_test_rank_array[i];
+        break;
+      case 'W':
+        total_keys_log2 = 20;
+        max_key_log_2 = 16;
+        num_bucket_log_2 = 10;
+        gd->test_index_array[i] = W_test_index_array[i];
+        gd->test_rank_array[i]  = W_test_rank_array[i];
+        break;
+      case 'B':
+        total_keys_log2 = 25;
+        max_key_log_2 = 21;
+        num_bucket_log_2 = 10;
+        gd->test_index_array[i] = B_test_index_array[i];
+        gd->test_rank_array[i]  = B_test_rank_array[i];
+        break;
+      case 'C':
+        total_keys_log2 = 27;
+        max_key_log_2 = 23;
+        num_bucket_log_2 = 10;
+        gd->test_index_array[i] = C_test_index_array[i];
+        gd->test_rank_array[i]  = C_test_rank_array[i];
+        break;
+      case 'D':
+        total_keys_log2 = 29;
+        max_key_log_2 = 27;
+        num_bucket_log_2 = 10;
+        min_procs = 4;
+        gd->test_index_array[i] = D_test_index_array[i];
+        gd->test_rank_array[i]  = D_test_rank_array[i];
+        break;
+    }
+  }
+
+  total_keys  = (1 << total_keys_log2);
+  max_key     = (1 << max_key_log_2);
+  num_buckets = (1 << num_bucket_log_2);
+  num_keys    = (total_keys/nprocs*min_procs);
+
+  /* On larger number of processors, since the keys are (roughly) gaussian distributed, the first and last processor
+   * sort keys in a large interval, requiring array sizes to be larger. Note that for large NUM_PROCS, num_keys is,
+   * however, a small number. The required array size also depends on the bucket size used. The following values are
+   * validated for the 1024-bucket setup. */
+  if (nprocs < 256)
+    size_of_buffers = 3*num_keys/2;
+  else if (nprocs < 512)
+    size_of_buffers = 5*num_keys/2;
+  else if (nprocs < 1024)
+    size_of_buffers = 4*num_keys;  /* 4*num_keys/2 was smaller than the 256..511 case; NPB IS uses 4*NUM_KEYS */
+  else
+    size_of_buffers = 13*num_keys/2;
+
+  gd->key_array = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
+  gd->key_buff1 = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
+  gd->key_buff2 = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
+  gd->bucket_size = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));     /* Top 5 elements for */
+  gd->bucket_size_totals = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE)); /* part. ver. vals */
+  gd->bucket_ptrs = (INT_TYPE*)malloc(num_buckets*sizeof(INT_TYPE));
+  gd->process_bucket_distrib_ptr1 = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));
+  gd->process_bucket_distrib_ptr2 = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));
+  if( !gd->key_array || !gd->key_buff1 || !gd->key_buff2 || !gd->bucket_size || !gd->bucket_size_totals ||
+      !gd->bucket_ptrs || !gd->process_bucket_distrib_ptr1 || !gd->process_bucket_distrib_ptr2 ) {
+    printf( "\n ERROR: cannot allocate the key and bucket buffers\n Exiting program!\n\n" );
+    MPI_Abort( MPI_COMM_WORLD, 1 );  /* abort the whole job: a lone rank leaving would strand the others */
+  }
+
+/*  Printout initial NPB info */
+  if( gd->my_rank == 0 ){
+     printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
+     printf( " Size:  %ld  (class %c)\n", (long)total_keys*min_procs, class);
+     printf( " Iterations:   %d\n", MAX_ITERATIONS );
+     printf( " Number of processes:     %d\n",gd->comm_size );
+  }
+
+/*  Check that actual and compiled number of processors agree */
+  if( gd->comm_size != nprocs) {
+    if( gd->my_rank == 0 )
+       printf( "\n ERROR: compiled for %d processes\n"
+               " Number of active processes: %d\n"
+               " Exiting program!\n\n", nprocs, gd->comm_size );
+    MPI_Finalize();
+    exit( 1 );
+  }
+
+/*  Check to see whether total number of processes is within bounds.
+    This could in principle be checked in setparams.c, but it is more convenient to do it here */
+  if( gd->comm_size < min_procs || gd->comm_size > max_procs){
+    if( gd->my_rank == 0 )
+      printf( "\n ERROR: number of processes %d not within range %d-%d"
+              "\n Exiting program!\n\n", gd->comm_size, min_procs, max_procs);
+    MPI_Finalize();
+    exit( 1 );
+  }
+
+/*  Generate random number sequence and subsequent keys on all procs */
+  create_seq(gd,  find_my_seed( gd->my_rank, gd->comm_size, 4*(long)total_keys*min_procs,
+             314159265.00,      /* Random number gen seed */
+             1220703125.00 ),   /* Random number gen mult */
+             1220703125.00 );   /* Random number gen mult */
+
+/*  Do one iteration for free (i.e., untimed) to guarantee initialization of
+    all data and code pages and respective tables */
+  rank(gd, 1 );
+
+  gd->passed_verification = 0;   /* Start verification counter */
+
+  if( gd->my_rank == 0 && class != 'S' ) printf( "\n   iteration\n" );
+/*  Initialize and start timer */
+  timer_clear(0);
+  timer_start(0);
+
+  char smpi_category[100];
+  snprintf (smpi_category, 100, "%d", gd->my_rank);
+  TRACE_smpi_set_category (smpi_category);
+
+/*  This is the main iteration */
+  for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) {
+    if( gd->my_rank == 0 && class != 'S' ) printf( "        %d\n", iteration );
+    rank(gd,  iteration );
+  }
+  TRACE_smpi_set_category (NULL);
+
+/*  Stop timer, obtain time for processors */
+  timer_stop(0);
+  timecounter = timer_read(0);
+
+/*  End of timing, obtain maximum time of all processors */
+  MPI_Reduce( &timecounter, &maxtime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD );
+
+/*  This tests that keys are in sequence: sorting of last ranked key seq occurs here, but is an untimed operation */
+  full_verify(gd);
+
+/*  Obtain verification counter sum */
+  itemp =gd->passed_verification;
+  MPI_Reduce( &itemp, &gd->passed_verification, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD );
+
+/*  The final printout  */
+  if( gd->my_rank == 0 ) {
+    if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
+      gd->passed_verification = 0;
+    c_print_results("IS", class, (int)(total_keys), min_procs, 0, MAX_ITERATIONS, nprocs, gd->comm_size, maxtime,
+                    ((double) (MAX_ITERATIONS)*total_keys*min_procs)/maxtime/1000000., "keys ranked",
+                    gd->passed_verification);
+  }
+
+  MPI_Finalize();
+  free(gd);
+  return 0;
+}
diff --git a/examples/smpi/NAS/nas_common.c b/examples/smpi/NAS/nas_common.c
new file mode 100644
index 0000000000..51ac28f479
--- /dev/null
+++ b/examples/smpi/NAS/nas_common.c
@@ -0,0 +1,268 @@
+/* Copyright (c) 2016. The SimGrid Team.
+ * All rights reserved.                                                     */
+
+/* This program is free software; you can redistribute it and/or modify it
+ * under the terms of the license (GNU LGPL) which comes with this package. */
+#include "nas_common.h"
+
+static double start[64], elapsed[64];
+
+/* Integer log base two: returns k when i == 2^k for some 0 <= k < 20, and -1 for every other i (including i <= 0). */
+int ilog2(int i)
+{
+  int exponent = 0;
+  int power    = 1;   /* always 2^exponent */
+  while (exponent < 20) {
+    if (power == i)
+      return exponent;
+    power *= 2;
+    exponent++;
+  }
+  return -1;
+}
+
+/*  get_info(): read the process count from argv[1] and the class letter (first character of argv[2]) */
+void get_info(int argc, char *argv[], int *nprocsp, char *classp)
+{
+  if (argc >= 3) {
+    *nprocsp = atoi(argv[1]);
+    *classp  = argv[2][0];
+    return;
+  }
+  printf("Usage: %s (%d) nprocs class\n", argv[0], argc);   /* too few arguments: report and give up */
+  exit(1);
+}
+
+/*  check_info(): validate the process count and class for benchmark `type`; print a message and exit(1) if invalid */
+void check_info(int type, int nprocs, char class)
+{
+  /* check number of processors */
+  if (nprocs <= 0) {
+    printf("setparams: Number of processors must be greater than zero\n");
+    exit(1);
+  }
+
+  if (type == IS) {
+    /* IS only: ilog2() reports a negative value for counts it does not accept as powers of two */
+    if (ilog2(nprocs) < 0) {
+      printf("setparams: Number of processors must be a power of two (1,2,4,...) for this benchmark\n");
+      exit(1);
+    }
+  } else if (type != EP && type != DT) {
+    /* never should have gotten this far with a bad name */
+    printf("setparams: (Internal Error) Benchmark type %d unknown to this program\n", type);
+    exit(1);
+  }
+
+  /* check class */
+  switch (class) {
+  case 'S': case 'W':
+  case 'A': case 'B': case 'C': case 'D': case 'E':
+    break;
+  default:
+    printf("setparams: Unknown benchmark class %c\n", class);
+    printf("setparams: Allowed classes are \"S\", \"W\", and \"A\" through \"E\"\n");
+    exit(1);
+  }
+
+  /* class E is rejected for IS and DT */
+  if ((type == IS || type == DT) && class == 'E') {
+    printf("setparams: Benchmark class %c not defined for IS or DT\n", class);
+    exit(1);
+  }
+
+  /* IS class D needs at least 4 processes */
+  if (type == IS && class == 'D' && nprocs < 4) {
+    printf("setparams: IS class D size cannot be run on less than 4 processors\n");
+    exit(1);
+  }
+}
+
+void time_clear(double *onetimer) {  /* reset a caller-owned timer value to zero */
+  onetimer[0] = 0.0;
+}
+
+void time_start(double *onetimer) {  /* store the current MPI_Wtime() reading in the caller's timer */
+  onetimer[0] = MPI_Wtime();
+}
+
+void time_stop(int n,double *elapsed,double *start) {  /* overwrites slot n; parameters shadow the file-level tables */
+  *(elapsed + n) = MPI_Wtime() - *(start + n);
+}
+
+double time_read(int n, double *elapsed) {  /* trivial accessor, kept only to preserve the function-call interface */
+  return elapsed[n];
+}
+
+/* Reset the accumulated time held in slot n of the file-level elapsed[] table. */
+void timer_clear(int n) {
+  elapsed[n] = 0.0;
+}
+
+/* Record the current MPI_Wtime() reading as the start instant of timer slot n. */
+void timer_start(int n) {
+  start[n] = MPI_Wtime();
+}
+
+/* Add the time elapsed since the recorded start of slot n to its running total (accumulates across calls). */
+void timer_stop(int n) {
+  elapsed[n] = elapsed[n] + (MPI_Wtime() - start[n]);
+}
+
+/* Return the time accumulated so far in timer slot n. */
+double timer_read(int n) {
+  return elapsed[n];
+}
+
+/*
+ * vranlc(): fill y[0..n-1] with pseudorandom doubles produced by the linear congruential generator
+ *
+ *   x_{k+1} = a * x_k  (mod 2^46),      y[k] = 2^-46 * x_{k+1}
+ *
+ *   n  number of values to generate
+ *   x  seed x_0, converted to an integer by truncation
+ *   a  multiplier, converted to an integer by truncation
+ *   y  output array with room for at least n doubles
+ *
+ * Returns the last state x_n as a double, or the truncated seed when n <= 0. x is passed by value, so the caller's
+ * seed is not updated: pass the returned value as x to continue the sequence.
+ *
+ * Every state is below 2^46 and is therefore exactly representable as a double.
+ */
+double vranlc(int n, double x, double a, double *y)
+{
+  /* The recurrence is computed with unsigned long long:
+   *  - for 46-bit operands the product x*a needs up to 92 bits, which overflows a signed 64-bit integer (undefined
+   *    behaviour); unsigned arithmetic wraps modulo 2^N instead and, as 2^46 divides 2^N, the low 46 bits stay exact;
+   *  - long is only 32 bits wide on some ABIs (e.g. 64-bit Windows), too narrow for the 46-bit mask. */
+  const unsigned long long mask46 = (1ULL << 46) - 1;
+  const double d2m46 = pow(0.5, 46);    /* 2^-46 */
+
+  /* Convert through long long so that the double -> integer truncation stays a signed conversion. */
+  const long long seed = (long long)x;
+  const unsigned long long mult = (unsigned long long)(long long)a;
+  unsigned long long state = (unsigned long long)seed;
+  double last = (double)seed;           /* returned unchanged when the loop does not run */
+  int i;
+
+  /* x_{k+1} = a * x_k mod 2^46, reported to the caller scaled by 2^-46 */
+  for (i = 0; i < n; i++) {
+    state = (state * mult) & mask46;
+    last  = (double)state;
+    y[i]  = d2m46 * last;
+  }
+
+  return last;
+}
+
+/*
+ *    FUNCTION RANDLC (X, A)
+ *
+ *  This routine returns a uniform pseudorandom double precision number in the
+ *  range (0, 1) by using the linear congruential generator
+ *
+ *  x_{k+1} = a x_k  (mod 2^46)
+ *
+ *  where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
+ *  before repeating.  The argument A is the same as 'a' in the above formula,
+ *  and X is the same as x_0.  A and X must be odd double precision integers
+ *  in the range (1, 2^46).  The returned value RANDLC is normalized to be
+ *  between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
+ *  the new seed x_1, so that subsequent calls to RANDLC using the same
+ *  arguments will generate a continuous sequence.
+ *
+ *  This routine should produce the same results on any computer with at least
+ *  48 mantissa bits in double precision floating point data.  On Cray systems,
+ *  double precision should be disabled.
+ *
+ *  David H. Bailey     October 26, 1990
+ *
+ *     IMPLICIT DOUBLE PRECISION (A-H, O-Z)
+ *     SAVE KS, R23, R46, T23, T46
+ *     DATA KS/0/
+ *
+ *  If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
+ *  T23 = 2 ^ 23, and T46 = 2 ^ 46.  These are computed in loops, rather than
+ *  by merely using the ** operator, in order to ensure that the results are
+ *  exact on all systems.  This code assumes that 0.5D0 is represented exactly.
+ */
+double randlc(double *X, double*A)
+{
+  /* Scaling constants, computed once on the first call and kept in static storage. */
+  static int    ready = 0;
+  static double r23, r46, t23, t46;   /* 2^-23, 2^-46, 2^23, 2^46 */
+  double a_hi, a_lo;                  /* *A == 2^23 * a_hi + a_lo */
+  double x_hi, x_lo;                  /* *X == 2^23 * x_hi + x_lo */
+  double cross;                       /* a_hi * x_lo + a_lo * x_hi */
+  double z;                           /* cross reduced modulo 2^23 */
+  double sum;                         /* 2^23 * z + a_lo * x_lo */
+  int    whole;                       /* integer part, taken by double -> int conversion */
+  int    k;
+
+  if (!ready) {
+    /* Build the constants by repeated halving/doubling rather than with pow(). */
+    r23 = r46 = t23 = t46 = 1.0;
+    for (k = 0; k < 23; k++) {
+      r23 *= 0.50;
+      t23 *= 2.0;
+    }
+    for (k = 0; k < 46; k++) {
+      r46 *= 0.50;
+      t46 *= 2.0;
+    }
+    ready = 1;
+  }
+
+  /* Split the multiplier: *A = 2^23 * a_hi + a_lo. */
+  whole = r23 * *A;
+  a_hi  = whole;
+  a_lo  = *A - t23 * a_hi;
+
+  /* Split the current seed the same way. */
+  whole = r23 * *X;
+  x_hi  = whole;
+  x_lo  = *X - t23 * x_hi;
+
+  /* z = (a_hi * x_lo + a_lo * x_hi) mod 2^23 */
+  cross = a_hi * x_lo + a_lo * x_hi;
+  whole = r23 * cross;
+  z     = cross - t23 * whole;
+
+  /* New seed: (2^23 * z + a_lo * x_lo) mod 2^46, stored back through X. */
+  sum   = t23 * z + a_lo * x_lo;
+  whole = r46 * sum;
+  *X    = sum - t46 * whole;
+
+  /* The return value is the new seed scaled by 2^-46. */
+  return r46 * *X;
+}
+
+/* c_print_results(): print the end-of-run report of a benchmark on stdout. The size is printed as a single number
+ * (n1, multiplied by n2 unless n2 == 0) when n3 == 0 and as "n1 x n2 x n3" otherwise; the "Compiled procs" line is
+ * omitted when nprocs_compiled == 0; any non-zero passed_verification counts as a success. */
+void c_print_results(const char *name, char class, int n1, int n2, int n3, int niter, int nprocs_compiled,
+                     int nprocs_total, double t, double mops, const char *optype, int passed_verification)
+{
+  printf( "\n\n %s Benchmark Completed\n", name );
+  printf( " Class           =                        %c\n", class );
+
+  if( n3 != 0 ) {
+    printf( " Size            =              %3dx %3dx %3d\n", n1,n2,n3 );
+  } else {
+    const long total = ( n2 != 0 ) ? (long)n1 * n2 : (long)n1;
+    printf( " Size            =             %12ld\n", total );   /* as in IS */
+  }
+
+  printf( " Iterations      =             %12d\n", niter );
+  printf( " Time in seconds =             %12.2f\n", t );
+  printf( " Total processes =             %12d\n", nprocs_total );
+  if( nprocs_compiled != 0 )
+    printf( " Compiled procs  =             %12d\n", nprocs_compiled );
+  printf( " Mop/s total     =             %12.2f\n", mops );
+  printf( " Mop/s/process   =             %12.2f\n", mops/((float) nprocs_total) );
+  printf( " Operation type  = %24s\n", optype);
+  if( !passed_verification )
+    printf( " Verification    =             UNSUCCESSFUL\n" );
+  else
+    printf( " Verification    =               SUCCESSFUL\n" );
+}
diff --git a/examples/smpi/NAS/nas_common.h b/examples/smpi/NAS/nas_common.h
new file mode 100644
index 0000000000..1990c0d1cc
--- /dev/null
+++ b/examples/smpi/NAS/nas_common.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2016. The SimGrid Team.
+ * All rights reserved.                                                     */
+
+/* This program is free software; you can redistribute it and/or modify it
+ * under the terms of the license (GNU LGPL) which comes with this package. */
+#ifndef NAS_COMMON_H
+#define NAS_COMMON_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "mpi.h"
+
+enum benchmark_types {IS, DT, EP};   /* benchmark selector: the `type` argument of check_info() */
+
+int ilog2(int i);                    /* k if i == 2^k with 0 <= k < 20, -1 otherwise */
+void timer_clear(int n);             /* numbered timers, kept in 64-entry tables; n is not range-checked */
+void timer_start(int n);             /* remember MPI_Wtime() as the start of timer n */
+void timer_stop(int n);              /* add the time elapsed since timer_start(n) to timer n */
+double timer_read(int n);            /* accumulated time of timer n */
+
+void time_clear(double *onetimer);                     /* *onetimer = 0.0 */
+void time_start(double *onetimer);                     /* *onetimer = MPI_Wtime() */
+void time_stop(int n,double *elapsed,double *start);   /* elapsed[n] = MPI_Wtime() - start[n] (overwrites) */
+double time_read(int n, double *elapsed);              /* returns elapsed[n] */
+
+double vranlc(int n, double x, double a, double *y);   /* fills y[0..n-1]; returns the last state (x is by value) */
+double randlc(double *X, double*A);                    /* advances the seed *X in place; returns 2^-46 * new *X */
+/* Print the end-of-run report on stdout; n3 == 0 selects the single-number size format (n1, times n2 unless 0). */
+void c_print_results(const char *name, char class, int n1, int n2, int n3, int niter, int nprocs_compiled,
+                     int nprocs_total, double t, double mops, const char *optype, int passed_verification);
+
+void get_info(int argc, char *argv[], int *nprocsp, char *classp);   /* argv[1] -> *nprocsp, argv[2][0] -> *classp */
+void check_info(int type, int nprocs, char class);                   /* prints a message and exit(1)s when invalid */
+
+#endif
diff --git a/examples/smpi/NAS/sys/Makefile b/examples/smpi/NAS/sys/Makefile
deleted file mode 100644
index 35b4166abf..0000000000
--- a/examples/smpi/NAS/sys/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-include ../config/make.def
-all: setparams 
-
-# setparams creates an npbparam.h file for each benchmark configuration. 
-# npbparams.h also contains info about how a benchmark was compiled and linked
-
-setparams: setparams.c ../config/make.def
-	$(CC) ${CONVERTFLAG} -o setparams setparams.c
-
-clean: 
-	-rm -f setparams setparams.h npbparams.h *~ *.o
-
diff --git a/examples/smpi/NAS/sys/README b/examples/smpi/NAS/sys/README
deleted file mode 100644
index 0a62dd15af..0000000000
--- a/examples/smpi/NAS/sys/README
+++ /dev/null
@@ -1,38 +0,0 @@
-This directory contains utilities and files used by the 
-build process. You should not need to change anything
-in this directory. 
-
-Original Files
---------------
-setparams.c:
-        Source for the setparams program. This program is used internally
-        in the build process to create the file "npbparams.h" for each 
-        benchmark. npbparams.h contains Fortran or C parameters to build a 
-        benchmark for a specific class and number of nodes. The setparams 
-        program is never run directly by a user. Its invocation syntax is 
-        "setparams benchmark-name nprocs class". 
-        It examines the file "npbparams.h" in the current directory. If 
-        the specified parameters are the same as those in the npbparams.h 
-        file, nothing it changed. If the file does not exist or corresponds 
-        to a different class/number of nodes, it is (re)built. 
-	One of the more complicated things in npbparams.h is that it 
-        contains, in a Fortran string, the compiler flags used to build a 
-        benchmark, so that a benchmark can print out how it was compiled. 
-
-make.common
-        A makefile segment that is included in each individual benchmark
-        program makefile. It sets up some standard macros (COMPILE, etc) 
-        and makes sure everything is configured correctly (npbparams.h)
-
-Makefile
-        Builds  setparams
-
-README
-        This file. 
-
-Created files
--------------
-
-setparams
-	See descriptions above
-
diff --git a/examples/smpi/NAS/sys/make.common b/examples/smpi/NAS/sys/make.common
deleted file mode 100644
index 228036707a..0000000000
--- a/examples/smpi/NAS/sys/make.common
+++ /dev/null
@@ -1,45 +0,0 @@
-PROGRAM  = $(BINDIR)/$(BENCHMARK).$(CLASS).$(NPROCS)
-CCOMPILE = $(MPICC)  -c $(CMPI_INC) $(CFLAGS)
-
-# Class "U" is used internally by the setparams program to mean
-# "unknown". This means that if you don't specify CLASS=
-# on the command line, you'll get an error. It would be nice
-# to be able to avoid this, but we'd have to get information
-# from the setparams back to the make program, which isn't easy. 
-CLASS=U
-NPROCS=1
-
-default:: ${PROGRAM}
-
-# This makes sure the configuration utility setparams 
-# is up to date. 
-# Note that this must be run every time, which is why the
-# target does not exist and is not created. 
-# If you create a file called "config" you will break things. 
-config:
-	@cd ../sys; ${MAKE} all
-	../sys/setparams ${BENCHMARK} ${NPROCS} ${CLASS} ${SUBTYPE}
-
-COMMON=../common
-$${COMMON}/c_randdp.o: ${COMMON}/randdp.c
-	cd ${COMMON}; ${CCOMPILE} -o c_randdp.o randdp.c
-
-${COMMON}/c_print_results.o: ${COMMON}/c_print_results.c
-	cd ${COMMON}; ${CCOMPILE} c_print_results.c
-
-${COMMON}/c_timers.o: ${COMMON}/c_timers.c
-	cd ${COMMON}; ${CCOMPILE} c_timers.c
-
-# Normally setparams updates npbparams.h only if the settings (CLASS/NPROCS)
-# have changed. However, we also want to update if the compile options
-# may have changed (set in ../config/make.def). 
-npbparams.h: ../config/make.def
-	@ echo make.def modified. Rebuilding npbparams.h just in case
-	rm -f npbparams.h
-	../sys/setparams ${BENCHMARK} ${NPROCS} ${CLASS} ${SUBTYPE}
-
-# So that "make benchmark-name" works
-${BENCHMARK}:  default
-${BENCHMARKU}: default
-
-
diff --git a/examples/smpi/NAS/sys/print_instructions b/examples/smpi/NAS/sys/print_instructions
deleted file mode 100755
index 6a09f2a123..0000000000
--- a/examples/smpi/NAS/sys/print_instructions
+++ /dev/null
@@ -1,10 +0,0 @@
-echo ''
-echo '   To make a NAS benchmark type '
-echo ''
-echo '         make <benchmark-name> NPROCS=<number> CLASS=<class>'
-echo ''
-echo '   where <benchmark-name>  is "ep", "dt", or "is"
-echo '         <number>          is the number of processors'
-echo '         <class>           is "S", "W", "A", "B", "C", or "D"'
-echo ''
-
diff --git a/examples/smpi/NAS/sys/setparams.c b/examples/smpi/NAS/sys/setparams.c
deleted file mode 100644
index a01dab58c7..0000000000
--- a/examples/smpi/NAS/sys/setparams.c
+++ /dev/null
@@ -1,597 +0,0 @@
-/* 
- * This utility configures a NPB to be built for a specific number
- * of nodes and a specific class. It creates a file "npbparams.h" 
- * in the source directory. This file keeps state information about 
- * which size of benchmark is currently being built (so that nothing
- * if unnecessarily rebuilt) and defines (through PARAMETER statements)
- * the number of nodes and class for which a benchmark is being built. 
-
- * The utility takes 3 arguments: 
- *    setparams benchmark-name nprocs class
- *      benchmark-name is "ep", "dt", or "is"
- *      nprocs is the number of processors to run on
- *      class is the size of the benchmark
- * These parameters are checked for the current benchmark. If they
- * are invalid, this program prints a message and aborts. 
- * If the parameters are ok, the current npbsize.h (actually just
- * the first line) is read in. If the new parameters are the same as 
- * the old, nothing is done, but an exit code is returned to force the
- * user to specify (otherwise the make procedure succeeds but builds a
- * binary of the wrong name).  Otherwise the file is rewritten. 
- * Errors write a message (to stdout) and abort. 
- * 
- * This program makes use of two extra benchmark "classes"
- * class "X" means an invalid specification. It is returned if
- * there is an error parsing the config file. 
- * class "U" is an external specification meaning "unknown class"
- * 
- * Unfortunately everything has to be case sensitive. This is
- * because we can always convert lower to upper or v.v. but
- * can't feed this information back to the makefile, so typing
- * make CLASS=a and make CLASS=A will produce different binaries.
- *
- * 
- */
-
-#include <sys/types.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <ctype.h>
-#include <string.h>
-#include <time.h>
-
-/*
- * This is the master version number for this set of NPB benchmarks. It is in an obscure place so people
- * won't accidentally change it. 
- */
-
-#define VERSION "3.3"
-
-/* controls verbose output from setparams */
-/* #define VERBOSE */
-
-#define FILENAME "npbparams.h"
-#define DESC_LINE "c NPROCS = %d CLASS = %c\n"
-#define DEF_CLASS_LINE     "#define CLASS '%c'\n"
-#define DEF_NUM_PROCS_LINE "#define NUM_PROCS %d\n"
-#define FINDENT  "        "
-#define CONTINUE "     > "
-
-void get_info(int argc, char *argv[], int *typep, int *nprocsp, char *classp, int* subtypep);
-void check_info(int type, int nprocs, char class);
-void read_info(int type, int *nprocsp, char *classp, int *subtypep);
-void write_info(int type, int nprocs, char class, int subtype);
-void write_ep_info_C(FILE *fp, int nprocs, char class);  /* after C translation */
-void write_is_info(FILE *fp, int nprocs, char class);
-void write_dt_info(FILE *fp, int nprocs, char class);
-void write_compiler_info(int type, FILE *fp);
-void check_line(char *line, char *label, char *val);
-int  check_include_line(char *line, char *filename);
-void put_string(FILE *fp, char *name, char *val);
-void put_def_string(FILE *fp, char *name, char *val);
-void put_def_variable(FILE *fp, char *name, char *val);
-int isqrt(int i);
-int ilog2(int i);
-int ipow2(int i);
-
-enum benchmark_types {IS, DT, EP};
-
-int main(int argc, char *argv[])
-{
-  int nprocs, nprocs_old, type;
-  char class, class_old;
-  int subtype = -1, old_subtype = -1;
-
-  /* Get command line arguments. Make sure they're ok. */
-  get_info(argc, argv, &type, &nprocs, &class, &subtype);
-  if (class != 'U') {
-#ifdef VERBOSE
-    printf("setparams: For benchmark %s: number of processors = %d class = %c\n", 
-     argv[1], nprocs, class); 
-#endif
-    check_info(type, nprocs, class);
-  }
-
-  /* Get old information. */
-  read_info(type, &nprocs_old, &class_old, &old_subtype);
-  if (class != 'U') {
-    if (class_old != 'X') {
-#ifdef VERBOSE
-      printf("setparams:     old settings: number of processors = %d class = %c\n", 
-       nprocs_old, class_old); 
-#endif
-    }
-  } else {
-    printf("setparams:\n\
-  *********************************************************************\n\
-  * You must specify NPROCS and CLASS to build this benchmark         *\n\
-  * For example, to build a class A benchmark for 4 processors, type  *\n\
-  *       make {benchmark-name} NPROCS=4 CLASS=A                      *\n\
-  *********************************************************************\n\n"); 
-
-    if (class_old != 'X') {
-#ifdef VERBOSE
-      printf("setparams: Previous settings were CLASS=%c NPROCS=%d\n", 
-       class_old, nprocs_old); 
-#endif
-    }
-    exit(1); /* exit on class==U */
-  }
-
-  /* Write out new information if it's different. */
-  if (nprocs != nprocs_old || class != class_old || subtype != old_subtype) {
-#ifdef VERBOSE
-    printf("setparams: Writing %s\n", FILENAME); 
-#endif
-    write_info(type, nprocs, class, subtype);
-  } else {
-#ifdef VERBOSE
-    printf("setparams: Settings unchanged. %s unmodified\n", FILENAME); 
-#endif
-  }
-
-  return 0;
-}
-
-/*  get_info(): Get parameters from command line */
-void get_info(int argc, char *argv[], int *typep, int *nprocsp, char *classp, int *subtypep)
-{
-  if (argc < 4) {
-    printf("Usage: %s (%d) benchmark-name nprocs class\n", argv[0], argc);
-    exit(1);
-  }
-
-  *nprocsp = atoi(argv[2]);
-  *classp = *argv[3];
-
-  if      (!strcmp(argv[1], "is") || !strcmp(argv[1], "IS")) *typep = IS;
-  else if (!strcmp(argv[1], "dt") || !strcmp(argv[1], "DT")) *typep = DT;
-  else if (!strcmp(argv[1], "ep") || !strcmp(argv[1], "EP")) *typep = EP;
-  else {
-    printf("setparams: Error: unknown benchmark type %s\n", argv[1]);
-    exit(1);
-  }
-}
-
-/*
- *  check_info(): Make sure command line data is ok for this benchmark 
- */
-
-void check_info(int type, int nprocs, char class) 
-{
-  int rootprocs, logprocs; 
-
-  /* check number of processors */
-  if (nprocs <= 0) {
-    printf("setparams: Number of processors must be greater than zero\n");
-    exit(1);
-  }
-  switch(type) {
-  case IS:
-    logprocs = ilog2(nprocs);
-    if (logprocs < 0) {
-      printf("setparams: Number of processors must be a power of two (1,2,4,...) for this benchmark\n");
-      exit(1);
-    }
-    break;
-
-  case EP:
-  case DT:
-    break;
-
-  default:
-    /* never should have gotten this far with a bad name */
-    printf("setparams: (Internal Error) Benchmark type %d unknown to this program\n", type); 
-    exit(1);
-  }
-
-  /* check class */
-  if (class != 'S' && 
-      class != 'W' && 
-      class != 'A' && 
-      class != 'B' && 
-      class != 'C' && 
-      class != 'D' && 
-      class != 'E') {
-    printf("setparams: Unknown benchmark class %c\n", class); 
-    printf("setparams: Allowed classes are \"S\", \"W\", and \"A\" through \"E\"\n");
-    exit(1);
-  }
-
-  if (class == 'E' && (type == IS || type == DT)) {
-    printf("setparams: Benchmark class %c not defined for IS or DT\n", class);
-    exit(1);
-  }
-
-  if (class == 'D' && type == IS && nprocs < 4) {
-    printf("setparams: IS class D size cannot be run on less than 4 processors\n");
-    exit(1);
-  }
-}
-
-/* 
- * read_info(): Read previous information from file. 
- *              Not an error if file doesn't exist, because this may be the first time we're running.
- *              Assumes the first two lines of the file is in a special format that we understand (since we wrote it).
- */
-
-void read_info(int type, int *nprocsp, char *classp, int *subtypep)
-{
-  int nread = 0;
-  FILE *fp;
-  fp = fopen(FILENAME, "r");
-  if (fp == NULL) {
-#ifdef VERBOSE
-    printf("setparams: INFO: configuration file %s does not exist (yet)\n", FILENAME); 
-#endif
-    goto abort;
-  }
-
-  /* first two lines of file contains info */
-  nread = fscanf(fp, DEF_CLASS_LINE, classp);
-  nread += fscanf(fp, DEF_NUM_PROCS_LINE, nprocsp);
-  if (nread != 2) {
-    printf("setparams: Error line %d parsing config file %s. Ignoring previous settings\n", __LINE__,FILENAME);
-    goto abort;
-  }
-
-  fclose(fp);
-  return;
-
- abort:
-  *nprocsp = -1;
-  *classp = 'X';
-  *subtypep = -1;
-  return;
-}
-
-/* 
- * write_info(): Write new information to config file. 
- *               First line is in a special format so we can read
- *               it in again. Then comes a warning. The rest is all
- *               specific to a particular benchmark. 
- */
-
-void write_info(int type, int nprocs, char class, int subtype) 
-{
-  FILE *fp;
-  char *BT_TYPES[] = {"NONE", "FULL", "SIMPLE", "EPIO", "FORTRAN"};
-
-  fp = fopen(FILENAME, "w");
-  if (fp == NULL) {
-    printf("setparams: Can't open file %s for writing\n", FILENAME);
-    exit(1);
-  }
-
-  fprintf(fp, DEF_CLASS_LINE, class);
-  fprintf(fp, DEF_NUM_PROCS_LINE, nprocs);
-  fprintf(fp, "\
-/*\n\
-   This file is generated automatically by the setparams utility.\n\
-   It sets the number of processors and the class of the NPB\n\
-   in this directory. Do not modify it by hand.   */\n\
-   \n");
-
-  /* Now do benchmark-specific stuff */
-  switch(type) {
-  case IS:
-    write_is_info(fp, nprocs, class);  
-    break;
-  case DT:
-    write_dt_info(fp, nprocs, class);  
-    break;
-  case EP:
-    write_ep_info_C(fp, nprocs, class);
-    break;
-  default:
-    printf("setparams: (Internal error): Unknown benchmark type %d\n", type);
-    exit(1);
-  }
-  write_compiler_info(type, fp);
-  fclose(fp);
-  return;
-}
-
-/* write_dt_info(): Write DT specific info to config file */
-
-void write_dt_info(FILE *fp, int nprocs, char class) 
-{
-  int num_samples,deviation,num_sources;
-  if      (class == 'S') { num_samples=1728; deviation=128; num_sources=4; }
-  else if (class == 'W') { num_samples=1728*8; deviation=128*2; num_sources=4*2; }
-  else if (class == 'A') { num_samples=1728*64; deviation=128*4; num_sources=4*4; }
-  else if (class == 'B') { num_samples=1728*512; deviation=128*8; num_sources=4*8; }
-  else if (class == 'C') { num_samples=1728*4096; deviation=128*16; num_sources=4*16; }
-  else if (class == 'D') { num_samples=1728*4096*8; deviation=128*32; num_sources=4*32; }
-  else {
-    printf("setparams: Internal error: invalid class type %c\n", class);
-    exit(1);
-  }
-  fprintf(fp, "#define NUM_SAMPLES %d\n", num_samples);
-  fprintf(fp, "#define STD_DEVIATION %d\n", deviation);
-  fprintf(fp, "#define NUM_SOURCES %d\n", num_sources);
-}
-
-/* write_is_info(): Write IS specific info to config file */
-void write_is_info(FILE *fp, int nprocs, char class)
-{
-  if( class != 'S' && class != 'W' && class != 'A' && class != 'B' && class != 'C' && class != 'D' )
-  {
-    printf("setparams: Internal error: invalid class type %c\n", class);
-    exit(1);
-  }
-}
-
-/* write_ep_info_C(): Write EP specific info to config file */
-void write_ep_info_C(FILE *fp, int nprocs, char class)
-{
-  /* easiest way (given the way the benchmark is written) is to specify log of number of grid points in each
-   * direction m1, m2, m3. nt is the number of iterations
-   */
-  int m;
-  if      (class == 'S') { m = 24; }
-  else if (class == 'W') { m = 25; }
-  else if (class == 'A') { m = 28; }
-  else if (class == 'B') { m = 30; }
-  else if (class == 'C') { m = 32; }
-  else if (class == 'D') { m = 36; }
-  else if (class == 'E') { m = 40; }
-  else {
-    printf("setparams: Internal error: invalid class type %c\n", class);
-    exit(1);
-  }
-
-  /* number of processors given by "npm" */
-  fprintf(fp, "%schar *_class=\"%c\";\n",FINDENT,class);
-  fprintf(fp, "%sint m=%d;\n", FINDENT,m);
-  fprintf(fp, "%sint npm=%d;\n", FINDENT,nprocs);
-}
-
-/* 
- * This is a gross hack to allow the benchmarks to  print out how they were compiled. Various other ways
- * of doing this have been tried and they all fail on some machine - due to a broken "make" program, or
- * F77 limitations, of whatever. Hopefully this will always work because it uses very portable C. Unfortunately
- * it relies on parsing the make.def file - YUK. 
- * If your machine doesn't have <string.h> or <ctype.h>, happy hacking!
- */
-
-#define VERBOSE
-#define LL 400
-#include <stdio.h>
-#define DEFFILE "../config/make.def"
-#define DEFAULT_MESSAGE "(none)"
-FILE *deffile;
-void write_compiler_info(int type, FILE *fp)
-{
-  char line[LL];
-  char compiletime[LL], randfile[LL];
-  char mpicc[LL], cflags[LL], clink[LL], clinkflags[LL],
-       cmpi_lib[LL], cmpi_inc[LL];
-  struct tm *tmp;
-  time_t t;
-  deffile = fopen(DEFFILE, "r");
-  if (deffile == NULL) {
-    printf("\n\
-setparams: File %s doesn't exist. To build the NAS benchmarks\n\
-           you need to create is according to the instructions\n\
-           in the README in the main directory and comments in \n\
-           the file config/make.def.template\n", DEFFILE);
-    exit(1);
-  }
-  strcpy(randfile, DEFAULT_MESSAGE);
-  strcpy(mpicc, DEFAULT_MESSAGE);
-  strcpy(cflags, DEFAULT_MESSAGE);
-  strcpy(clink, DEFAULT_MESSAGE);
-  strcpy(clinkflags, DEFAULT_MESSAGE);
-  strcpy(cmpi_lib, DEFAULT_MESSAGE);
-  strcpy(cmpi_inc, DEFAULT_MESSAGE);
-
-  while (fgets(line, LL, deffile) != NULL) {
-    if (*line == '#') continue;
-    /* yes, this is inefficient. but it's simple! */
-    check_line(line, "RAND", randfile);
-    check_line(line, "MPICC", mpicc);
-    check_line(line, "CFLAGS", cflags);
-    check_line(line, "CLINK", clink);
-    check_line(line, "CLINKFLAGS", clinkflags);
-    check_line(line, "CMPI_LIB", cmpi_lib);
-    check_line(line, "CMPI_INC", cmpi_inc);
-  }
-
-  (void) time(&t);
-  tmp = localtime(&t);
-  (void) strftime(compiletime, (size_t)LL, "%d %b %Y", tmp);
-
-  put_def_string(fp, "COMPILETIME", compiletime);
-  put_def_string(fp, "NPBVERSION", VERSION);
-  put_def_string(fp, "MPICC", mpicc);
-  put_def_string(fp, "CFLAGS", cflags);
-  put_def_string(fp, "CLINK", clink);
-  put_def_string(fp, "CLINKFLAGS", clinkflags);
-  put_def_string(fp, "CMPI_LIB", cmpi_lib);
-  put_def_string(fp, "CMPI_INC", cmpi_inc);
-}
-
-void check_line(char *line, char *label, char *val)
-{
-  char *original_line;
-  int n;
-  original_line = line;
-  /* compare beginning of line and label */
-  while (*label != '\0' && *line == *label) {
-    line++; label++; 
-  }
-  /* if *label is not EOS, we must have had a mismatch */
-  if (*label != '\0') return;
-  /* if *line is not a space, actual label is longer than test label */
-  if (!isspace(*line) && *line != '=') return ; 
-  /* skip over white space */
-  while (isspace(*line)) line++;
-  /* next char should be '=' */
-  if (*line != '=') return;
-  /* skip over white space */
-  while (isspace(*++line));
-  /* if EOS, nothing was specified */
-  if (*line == '\0') return;
-  /* finally we've come to the value */
-  strcpy(val, line);
-  /* chop off the newline at the end */
-  n = strlen(val)-1;
-  if (n >= 0 && val[n] == '\n')
-    val[n--] = '\0';
-  if (n >= 0 && val[n] == '\r')
-    val[n--] = '\0';
-  /* treat continuation */
-  while (val[n] == '\\' && fgets(original_line, LL, deffile)) {
-     line = original_line;
-     while (isspace(*line)) line++;
-     if (isspace(*original_line)) val[n++] = ' ';
-     while (*line && *line != '\n' && *line != '\r' && n < LL-1)
-       val[n++] = *line++;
-     val[n] = '\0';
-     n--;
-  }
-/*  if (val[strlen(val) - 1] == '\\') {
-    printf("\n\
-setparams: Error in file make.def. Because of the way in which\n\
-           command line arguments are incorporated into the\n\
-           executable benchmark, you can't have any continued\n\
-           lines in the file make.def, that is, lines ending\n\
-           with the character \"\\\". Although it may be ugly, \n\
-           you should be able to reformat without continuation\n\
-           lines. The offending line is\n\
-  %s\n", original_line);
-    exit(1);
-  } */
-}
-
-int check_include_line(char *line, char *filename)
-{
-  char *include_string = "include";
-  /* compare beginning of line and "include" */
-  while (*include_string != '\0' && *line == *include_string) {
-    line++; include_string++; 
-  }
-  /* if *include_string is not EOS, we must have had a mismatch */
-  if (*include_string != '\0') return(0);
-  /* if *line is not a space, first word is not "include" */
-  if (!isspace(*line)) return(0); 
-  /* skip over white space */
-  while (isspace(*++line));
-  /* if EOS, nothing was specified */
-  if (*line == '\0') return(0);
-  /* next keyword should be name of include file in *filename */
-  while (*filename != '\0' && *line == *filename) {
-    line++; filename++; 
-  }  
-  if (*filename != '\0' || 
-      (*line != ' ' && *line != '\0' && *line !='\n')) return(0);
-  else return(1);
-}
-
-#define MAXL 46
-void put_string(FILE *fp, char *name, char *val)
-{
-  int len;
-  len = strlen(val);
-  if (len > MAXL) {
-    val[MAXL] = '\0';
-    val[MAXL-1] = '.';
-    val[MAXL-2] = '.';
-    val[MAXL-3] = '.';
-    len = MAXL;
-  }
-  fprintf(fp, "%scharacter*%d %s\n", FINDENT, len, name);
-  fprintf(fp, "%sparameter (%s=\'%s\')\n", FINDENT, name, val);
-}
-
-/* NOTE: is the ... stuff necessary in C? */
-void put_def_string(FILE *fp, char *name, char *val)
-{
-  int len;
-  len = strlen(val);
-  if (len > MAXL) {
-    val[MAXL] = '\0';
-    val[MAXL-1] = '.';
-    val[MAXL-2] = '.';
-    val[MAXL-3] = '.';
-    len = MAXL;
-  }
-  fprintf(fp, "#define %s \"%s\"\n", name, val);
-}
-
-void put_def_variable(FILE *fp, char *name, char *val)
-{
-  int len;
-  len = strlen(val);
-  if (len > MAXL) {
-    val[MAXL] = '\0';
-    val[MAXL-1] = '.';
-    val[MAXL-2] = '.';
-    val[MAXL-3] = '.';
-    len = MAXL;
-  }
-  fprintf(fp, "#define %s %s\n", name, val);
-}
-
-#if 0
-/* this version allows arbitrarily long lines but some compilers don't like that and they're rarely useful */
-
-#define LINELEN 65
-void put_string(FILE *fp, char *name, char *val)
-{
-  int len, nlines, pos, i;
-  char line[100];
-  len = strlen(val);
-  nlines = len/LINELEN;
-  if (nlines*LINELEN < len) nlines++;
-  fprintf(fp, "%scharacter*%d %s\n", FINDENT, nlines*LINELEN, name);
-  fprintf(fp, "%sparameter (%s = \n", FINDENT, name);
-  for (i = 0; i < nlines; i++) {
-    pos = i*LINELEN;
-    if (i == 0) fprintf(fp, "%s\'", CONTINUE);
-    else        fprintf(fp, "%s", CONTINUE);
-    /* number should be same as LINELEN */
-    fprintf(fp, "%.65s", val+pos);
-    if (i == nlines-1) fprintf(fp, "\')\n");
-    else             fprintf(fp, "\n");
-  }
-}
-#endif
-
-
-/* integer square root. Return error if argument isn't a perfect square or is less than or equal to zero */
-int isqrt(int i)
-{
-  int root, square;
-  if (i <= 0) return(-1);
-  square = 0;
-  for (root = 1; square <= i; root++) {
-    square = root*root;
-    if (square == i) return(root);
-  }
-  return(-1);
-}
-
-/* integer log base two. Return error is argument isn't a power of two or is less than or equal to zero */
-int ilog2(int i)
-{
-  int log2;
-  int exp2 = 1;
-  if (i <= 0) return(-1);
-
-  for (log2 = 0; log2 < 20; log2++) {
-    if (exp2 == i) return(log2);
-    exp2 *= 2;
-  }
-  return(-1);
-}
-
-int ipow2(int i)
-{
-  int pow2 = 1;
-  if (i < 0) return(-1);
-  if (i == 0) return(1);
-  while(i--) pow2 *= 2;
-  return(pow2);
-}
diff --git a/tools/cmake/DefinePackages.cmake b/tools/cmake/DefinePackages.cmake
index e7033d9fcb..ef11ae1ad3 100644
--- a/tools/cmake/DefinePackages.cmake
+++ b/tools/cmake/DefinePackages.cmake
@@ -984,6 +984,7 @@ set(CMAKEFILES_TXT
   examples/simdag/scheduling/CMakeLists.txt
   
   examples/smpi/CMakeLists.txt
+  examples/smpi/NAS/CMakeLists.txt
   examples/smpi/smpi_msg_masterslave/CMakeLists.txt
   examples/smpi/replay_multiple/CMakeLists.txt
   examples/smpi/energy/CMakeLists.txt
diff --git a/tools/internal/check_dist_archive.exclude b/tools/internal/check_dist_archive.exclude
index 8cecb9f927..7070e3bc58 100644
--- a/tools/internal/check_dist_archive.exclude
+++ b/tools/internal/check_dist_archive.exclude
@@ -23,8 +23,6 @@
 
 + contrib/.*
 
-+ examples/smpi/NAS/.*
-
 + src/simix/README_attempt_without_stack
 + src/simix/simix_network\.tla