From: Frederic Suter <frederic.suter@cc.in2p3.fr>
Date: Mon, 1 Feb 2016 10:45:00 +0000 (+0100)
Subject: some cleaning in the NAS examples
X-Git-Tag: v3_13~1009
X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/202e327eb4f29590a1847fd31db81471021c2a36

some cleaning in the NAS examples
---

diff --git a/examples/smpi/NAS/DT-trace/DGraph.c b/examples/smpi/NAS/DT-trace/DGraph.c
deleted file mode 100644
index f573786db8..0000000000
--- a/examples/smpi/NAS/DT-trace/DGraph.c
+++ /dev/null
@@ -1,184 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "DGraph.h"
-
-DGArc *newArc(DGNode *tl,DGNode *hd){
-  DGArc *ar=(DGArc *)malloc(sizeof(DGArc));
-  ar->tail=tl;
-  ar->head=hd;
-  return ar;
-}
-void arcShow(DGArc *ar){
-  DGNode *tl=(DGNode *)ar->tail,
-         *hd=(DGNode *)ar->head;
-  fprintf(stderr,"%d. |%s ->%s\n",ar->id,tl->name,hd->name);
-}
-
-DGNode *newNode(char *nm){
-  DGNode *nd=(DGNode *)malloc(sizeof(DGNode));
-  nd->attribute=0;
-  nd->color=0;
-  nd->inDegree=0;
-  nd->outDegree=0;
-  nd->maxInDegree=SMALL_BLOCK_SIZE;
-  nd->maxOutDegree=SMALL_BLOCK_SIZE;
-  nd->inArc=(DGArc **)malloc(nd->maxInDegree*sizeof(DGArc*));
-  nd->outArc=(DGArc **)malloc(nd->maxOutDegree*sizeof(DGArc*));
-  nd->name=strdup(nm);
-  nd->feat=NULL;
-  return nd;
-}
-void nodeShow(DGNode* nd){
-  fprintf( stderr,"%3d.%s: (%d,%d)\n",
-             nd->id,nd->name,nd->inDegree,nd->outDegree);
-/*
-  if(nd->verified==1) fprintf(stderr,"%ld.%s\t: usable.",nd->id,nd->name);
-  else if(nd->verified==0)  fprintf(stderr,"%ld.%s\t: unusable.",nd->id,nd->name);
-  else  fprintf(stderr,"%ld.%s\t: notverified.",nd->id,nd->name);   
-*/
-}
-
-DGraph* newDGraph(char* nm){
-  DGraph *dg=(DGraph *)malloc(sizeof(DGraph));
-  dg->numNodes=0;
-  dg->numArcs=0;
-  dg->maxNodes=BLOCK_SIZE;
-  dg->maxArcs=BLOCK_SIZE;
-  dg->node=(DGNode **)malloc(dg->maxNodes*sizeof(DGNode*));
-  dg->arc=(DGArc **)malloc(dg->maxArcs*sizeof(DGArc*));
-  dg->name=strdup(nm);
-  return dg;
-}
-int AttachNode(DGraph* dg, DGNode* nd) {
-  int i=0,j,len=0;
-  DGNode **nds =NULL, *tmpnd=NULL;
-  DGArc **ar=NULL;
-
-  if (dg->numNodes == dg->maxNodes-1 ) {
-    dg->maxNodes += BLOCK_SIZE;
-          nds =(DGNode **) calloc(dg->maxNodes,sizeof(DGNode*));
-    memcpy(nds,dg->node,(dg->maxNodes-BLOCK_SIZE)*sizeof(DGNode*));
-    free(dg->node);
-    dg->node=nds;
-  }
-
-        len = strlen( nd->name);
-  for (i = 0; i < dg->numNodes; i++) {
-    tmpnd =dg->node[ i];
-    ar=NULL;
-    if ( strlen( tmpnd->name) != len ) continue;
-    if ( strncmp( nd->name, tmpnd->name, len) ) continue;
-    if ( nd->inDegree > 0 ) {
-      tmpnd->maxInDegree += nd->maxInDegree;
-            ar =(DGArc **) calloc(tmpnd->maxInDegree,sizeof(DGArc*));
-      memcpy(ar,tmpnd->inArc,(tmpnd->inDegree)*sizeof(DGArc*));
-      free(tmpnd->inArc);
-      tmpnd->inArc=ar;
-      for (j = 0; j < nd->inDegree; j++ ) {
-        nd->inArc[ j]->head = tmpnd;
-      }
-      memcpy( &(tmpnd->inArc[ tmpnd->inDegree]), nd->inArc, nd->inDegree*sizeof( DGArc *));
-      tmpnd->inDegree += nd->inDegree;
-    }   
-    if ( nd->outDegree > 0 ) {
-      tmpnd->maxOutDegree += nd->maxOutDegree;
-            ar =(DGArc **) calloc(tmpnd->maxOutDegree,sizeof(DGArc*));
-      memcpy(ar,tmpnd->outArc,(tmpnd->outDegree)*sizeof(DGArc*));
-      free(tmpnd->outArc);
-      tmpnd->outArc=ar;
-      for (j = 0; j < nd->outDegree; j++ ) {
-        nd->outArc[ j]->tail = tmpnd;
-      }      
-      memcpy( &(tmpnd->outArc[tmpnd->outDegree]),nd->outArc,nd->outDegree*sizeof( DGArc *));
-      tmpnd->outDegree += nd->outDegree;
-    } 
-    free(nd); 
-    return i;
-  }
-  nd->id = dg->numNodes;
-  dg->node[dg->numNodes] = nd;
-  dg->numNodes++;
-return nd->id;
-}
-int AttachArc(DGraph *dg,DGArc* nar){
-int  arcId = -1;
-int i=0,newNumber=0;
-DGNode  *head = nar->head,
-  *tail = nar->tail; 
-DGArc **ars=NULL,*probe=NULL;
-/*fprintf(stderr,"AttachArc %ld\n",dg->numArcs); */
-  if ( !tail || !head ) return arcId;
-  if ( dg->numArcs == dg->maxArcs-1 ) {
-    dg->maxArcs += BLOCK_SIZE;
-          ars =(DGArc **) calloc(dg->maxArcs,sizeof(DGArc*));
-    memcpy(ars,dg->arc,(dg->maxArcs-BLOCK_SIZE)*sizeof(DGArc*));
-    free(dg->arc);
-    dg->arc=ars;
-  }
-  for(i = 0; i < tail->outDegree; i++ ) { /* parallel arc */
-    probe = tail->outArc[ i];
-    if(probe->head == head
-       &&
-       probe->length == nar->length
-            ){
-            free(nar);
-      return probe->id;   
-    }
-  }
-  
-  nar->id = dg->numArcs;
-  arcId=dg->numArcs;
-  dg->arc[dg->numArcs] = nar;
-  dg->numArcs++;
-  
-  head->inArc[ head->inDegree] = nar;
-  head->inDegree++;
-  if ( head->inDegree >= head->maxInDegree ) {
-    newNumber = head->maxInDegree + SMALL_BLOCK_SIZE;
-          ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
-    memcpy(ars,head->inArc,(head->inDegree)*sizeof(DGArc*));
-    free(head->inArc);
-    head->inArc=ars;
-    head->maxInDegree = newNumber;
-  }
-  tail->outArc[ tail->outDegree] = nar;
-  tail->outDegree++;
-  if(tail->outDegree >= tail->maxOutDegree ) {
-    newNumber = tail->maxOutDegree + SMALL_BLOCK_SIZE;
-          ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
-    memcpy(ars,tail->outArc,(tail->outDegree)*sizeof(DGArc*));
-    free(tail->outArc);
-    tail->outArc=ars;
-    tail->maxOutDegree = newNumber;
-  }
-/*fprintf(stderr,"AttachArc: head->in=%d tail->out=%ld\n",head->inDegree,tail->outDegree);*/
-return arcId;
-}
-void graphShow(DGraph *dg,int DetailsLevel){
-  int i=0,j=0;
-  fprintf(stderr,"%d.%s: (%d,%d)\n",dg->id,dg->name,dg->numNodes,dg->numArcs);
-  if ( DetailsLevel < 1) return;
-  for (i = 0; i < dg->numNodes; i++ ) {
-    DGNode *focusNode = dg->node[ i];
-    if(DetailsLevel >= 2) {
-      for (j = 0; j < focusNode->inDegree; j++ ) {
-  fprintf(stderr,"\t ");
-  nodeShow(focusNode->inArc[ j]->tail);
-      }
-    }
-    nodeShow(focusNode);
-    if ( DetailsLevel < 2) continue;
-    for (j = 0; j < focusNode->outDegree; j++ ) {
-      fprintf(stderr, "\t ");
-      nodeShow(focusNode->outArc[ j]->head);
-    }  
-    fprintf(stderr, "---\n");
-  }
-  fprintf(stderr,"----------------------------------------\n");
-  if ( DetailsLevel < 3) return;
-}
-
-
-
diff --git a/examples/smpi/NAS/DT-trace/DGraph.h b/examples/smpi/NAS/DT-trace/DGraph.h
deleted file mode 100644
index f38f898b24..0000000000
--- a/examples/smpi/NAS/DT-trace/DGraph.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef _DGRAPH
-#define _DGRAPH
-
-#define BLOCK_SIZE  128
-#define SMALL_BLOCK_SIZE 32
-
-typedef struct{
-  int id;
-  void *tail,*head;
-  int length,width,attribute,maxWidth;
-}DGArc;
-
-typedef struct{
-  int maxInDegree,maxOutDegree;
-  int inDegree,outDegree;
-  int id;
-  char *name;
-  DGArc **inArc,**outArc;
-  int depth,height,width;
-  int color,attribute,address,verified;
-  void *feat;
-}DGNode;
-
-typedef struct{
-  int maxNodes,maxArcs;
-  int id;
-  char *name;
-  int numNodes,numArcs;
-  DGNode **node;
-  DGArc **arc;
-} DGraph;
-
-DGArc *newArc(DGNode *tl,DGNode *hd);
-void arcShow(DGArc *ar);
-DGNode *newNode(char *nm);
-void nodeShow(DGNode* nd);
-
-DGraph* newDGraph(char *nm);
-int AttachNode(DGraph *dg,DGNode *nd);
-int AttachArc(DGraph *dg,DGArc* nar);
-void graphShow(DGraph *dg,int DetailsLevel);
-
-#endif
diff --git a/examples/smpi/NAS/DT-trace/Makefile b/examples/smpi/NAS/DT-trace/Makefile
deleted file mode 100644
index c727c75051..0000000000
--- a/examples/smpi/NAS/DT-trace/Makefile
+++ /dev/null
@@ -1,26 +0,0 @@
-SHELL=/bin/sh
-BENCHMARK=dt
-BENCHMARKU=DT
-
-include ../config/make.def
-
-include ../sys/make.common
-#Override PROGRAM
-DTPROGRAM  = $(BINDIR)/$(BENCHMARK)-trace.$(CLASS)
-
-OBJS = dt.o DGraph.o \
-	${COMMON}/c_print_results.o ${COMMON}/c_timers.o ${COMMON}/c_randdp.o
-
-
-${PROGRAM}: config ${OBJS}
-	${CLINK} ${CLINKFLAGS} -o ${DTPROGRAM} ${OBJS} ${CMPI_LIB}
-
-.c.o:
-	${CCOMPILE} $<
-
-dt.o:             dt.c  npbparams.h
-DGraph.o:	DGraph.c DGraph.h
-
-clean:
-	- rm -f *.o *~ mputil*
-	- rm -f dt npbparams.h core
diff --git a/examples/smpi/NAS/DT-trace/README b/examples/smpi/NAS/DT-trace/README
deleted file mode 100644
index 873e3ae6f2..0000000000
--- a/examples/smpi/NAS/DT-trace/README
+++ /dev/null
@@ -1,22 +0,0 @@
-Data Traffic benchmark DT is new in the NPB suite 
-(released as part of NPB3.x-MPI package).
-----------------------------------------------------
-
-DT is written in C and same executable can run on any number of processors,
-provided this number is not less than the number of nodes in the communication
-graph.  DT benchmark takes one argument: BH, WH, or SH. This argument 
-specifies the communication graph Black Hole, White Hole, or SHuffle 
-respectively. The current release contains verification numbers for 
-CLASSES S, W, A, and B only.  Classes C and D are defined, but verification 
-numbers are not provided in this release.
-
-The following table summarizes the number of nodes in the communication
-graph based on CLASS and graph TYPE.
-
-CLASS  N_Source N_Nodes(BH,WH) N_Nodes(SH)
- S      4        5              12
- W      8        11             32
- A      16       21             80
- B      32       43             192
- C      64       85             448
- D      128      171            1024
diff --git a/examples/smpi/NAS/DT-trace/dt.c b/examples/smpi/NAS/DT-trace/dt.c
deleted file mode 100644
index 1af5ccead4..0000000000
--- a/examples/smpi/NAS/DT-trace/dt.c
+++ /dev/null
@@ -1,777 +0,0 @@
-/*************************************************************************
- *                                                                       * 
- *        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3       *
- *                                                                       * 
- *                                  D T           * 
- *                                                                       * 
- ************************************************************************* 
- *                                                                       * 
- *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
- *                                                                       * 
- *   Permission to use, copy, distribute and modify this software        * 
- *   for any purpose with or without fee is hereby granted.  We          * 
- *   request, however, that all derived work reference the NAS           * 
- *   Parallel Benchmarks 3.3. This software is provided "as is"          *
- *   without express or implied warranty.                                * 
- *                                                                       * 
- *   Information on NPB 3.3, including the technical report, the         *
- *   original specifications, source code, results and information       * 
- *   on how to submit new results, is available at:                      * 
- *                                                                       * 
- *          http:  www.nas.nasa.gov/Software/NPB                         * 
- *                                                                       * 
- *   Send comments or suggestions to  npb@nas.nasa.gov                   * 
- *   Send bug reports to              npb-bugs@nas.nasa.gov              * 
- *                                                                       * 
- *         NAS Parallel Benchmarks Group                                 * 
- *         NASA Ames Research Center                                     * 
- *         Mail Stop: T27A-1                                             * 
- *         Moffett Field, CA   94035-1000                                * 
- *                                                                       * 
- *         E-mail:  npb@nas.nasa.gov                                     * 
- *         Fax:     (650) 604-3957                                       * 
- *                                                                       * 
- ************************************************************************* 
- *                                                                       * 
- *   Author: M. Frumkin               *             * 
- *                                                                       * 
- *************************************************************************/
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "mpi.h"
-#include "npbparams.h"
-
-#include "simgrid/instr.h" //TRACE_
-
-#ifndef CLASS
-#define CLASS 'S'
-#define NUM_PROCS            1                 
-#endif
-
-//int      passed_verification;
-extern double randlc( double *X, double *A );
-extern
-void c_print_results( char   *name,
-                      char   class,
-                      int    n1, 
-                      int    n2,
-                      int    n3,
-                      int    niter,
-                      int    nprocs_compiled,
-                      int    nprocs_total,
-                      double t,
-                      double mops,
-          char   *optype,
-                      int    passed_verification,
-                      char   *npbversion,
-                      char   *compiletime,
-                      char   *mpicc,
-                      char   *clink,
-                      char   *cmpi_lib,
-                      char   *cmpi_inc,
-                      char   *cflags,
-                      char   *clinkflags );
-          
-void    timer_clear( int n );
-void    timer_start( int n );
-void    timer_stop( int n );
-double  timer_read( int n );
-int timer_on=0,timers_tot=64;
-
-int verify(char *bmname,double rnm2){
-    double verify_value=0.0;
-    double epsilon=1.0E-8;
-    char cls=CLASS;
-    int verified=-1;
-    if (cls != 'U') {
-       if(cls=='S') {
-         if(strstr(bmname,"BH")){
-           verify_value=30892725.0;
-         }else if(strstr(bmname,"WH")){
-           verify_value=67349758.0;
-         }else if(strstr(bmname,"SH")){
-           verify_value=58875767.0;
-         }else{
-           fprintf(stderr,"No such benchmark as %s.\n",bmname);
-         }
-         verified = 0;
-       }else if(cls=='W') {
-         if(strstr(bmname,"BH")){
-       verify_value = 4102461.0;
-         }else if(strstr(bmname,"WH")){
-       verify_value = 204280762.0;
-         }else if(strstr(bmname,"SH")){
-       verify_value = 186944764.0;
-         }else{
-           fprintf(stderr,"No such benchmark as %s.\n",bmname);
-         }
-         verified = 0;
-       }else if(cls=='A') {
-         if(strstr(bmname,"BH")){
-       verify_value = 17809491.0;
-         }else if(strstr(bmname,"WH")){
-       verify_value = 1289925229.0;
-         }else if(strstr(bmname,"SH")){
-       verify_value = 610856482.0;
-         }else{
-           fprintf(stderr,"No such benchmark as %s.\n",bmname);
-         }
-     verified = 0;
-       }else if(cls=='B') {
-         if(strstr(bmname,"BH")){
-       verify_value = 4317114.0;
-         }else if(strstr(bmname,"WH")){
-       verify_value = 7877279917.0;
-         }else if(strstr(bmname,"SH")){
-       verify_value = 1836863082.0;
-         }else{
-           fprintf(stderr,"No such benchmark as %s.\n",bmname);
-       verified = 0;
-         }
-       }else if(cls=='C') {
-         if(strstr(bmname,"BH")){
-       verify_value = 0.0;
-         }else if(strstr(bmname,"WH")){
-       verify_value = 0.0;
-         }else if(strstr(bmname,"SH")){
-       verify_value = 0.0;
-         }else{
-           fprintf(stderr,"No such benchmark as %s.\n",bmname);
-       verified = -1;
-         }
-       }else if(cls=='D') {
-         if(strstr(bmname,"BH")){
-       verify_value = 0.0;
-         }else if(strstr(bmname,"WH")){
-       verify_value = 0.0;
-         }else if(strstr(bmname,"SH")){
-       verify_value = 0.0;
-         }else{
-           fprintf(stderr,"No such benchmark as %s.\n",bmname);
-         }
-         verified = -1;
-       }else{
-         fprintf(stderr,"No such class as %c.\n",cls);
-       }
-       fprintf(stderr," %s L2 Norm = %f\n",bmname,rnm2);
-       if(verified==-1){
-     fprintf(stderr," No verification was performed.\n");
-       }else if( rnm2 - verify_value < epsilon &&
-                 rnm2 - verify_value > -epsilon) {  /* abs here does not work on ALTIX */
-      verified = 1;
-      fprintf(stderr," Deviation = %f\n",(rnm2 - verify_value));
-       }else{
-     verified = 0;
-     fprintf(stderr," The correct verification value = %f\n",verify_value);
-     fprintf(stderr," Got value = %f\n",rnm2);
-       }
-    }else{
-       verified = -1;
-    }
-    return  verified;  
-  }
-
-int ipowMod(int a,long long int n,int md){ 
-  int seed=1,q=a,r=1;
-  if(n<0){
-    fprintf(stderr,"ipowMod: exponent must be nonnegative exp=%lld\n",n);
-    n=-n; /* temp fix */
-/*    return 1; */
-  }
-  if(md<=0){
-    fprintf(stderr,"ipowMod: module must be positive mod=%d",md);
-    return 1;
-  }
-  if(n==0) return 1;
-  while(n>1){
-    int n2 = n/2;
-    if (n2*2==n){
-       seed = (q*q)%md;
-       q=seed;
-       n = n2;
-    }else{
-       seed = (r*q)%md;
-       r=seed;
-       n = n-1;
-    }
-  }
-  seed = (r*q)%md;
-  return seed;
-}
-
-#include "DGraph.h"
-DGraph *buildSH(char cls){
-/*
-  Nodes of the graph must be topologically sorted
-  to avoid MPI deadlock.
-*/
-  DGraph *dg;
-  int numSources=NUM_SOURCES; /* must be power of 2 */
-  int numOfLayers=0,tmpS=numSources>>1;
-  int firstLayerNode=0;
-  DGArc *ar=NULL;
-  DGNode *nd=NULL;
-  int mask=0x0,ndid=0,ndoff=0;
-  int i=0,j=0;
-  char nm[BLOCK_SIZE];
-  
-  sprintf(nm,"DT_SH.%c",cls);
-  dg=newDGraph(nm);
-
-  while(tmpS>1){
-    numOfLayers++;
-    tmpS>>=1;
-  }
-  for(i=0;i<numSources;i++){
-    sprintf(nm,"Source.%d",i);
-    nd=newNode(nm);
-    AttachNode(dg,nd);
-  }
-  for(j=0;j<numOfLayers;j++){
-    mask=0x00000001<<j;
-    for(i=0;i<numSources;i++){
-      sprintf(nm,"Comparator.%d",(i+j*firstLayerNode));
-      nd=newNode(nm);
-      AttachNode(dg,nd);
-      ndoff=i&(~mask);
-      ndid=firstLayerNode+ndoff;
-      ar=newArc(dg->node[ndid],nd);     
-      AttachArc(dg,ar);
-      ndoff+=mask;
-      ndid=firstLayerNode+ndoff;
-      ar=newArc(dg->node[ndid],nd);     
-      AttachArc(dg,ar);
-    }
-    firstLayerNode+=numSources;
-  }
-  mask=0x00000001<<numOfLayers;
-  for(i=0;i<numSources;i++){
-    sprintf(nm,"Sink.%d",i);
-    nd=newNode(nm);
-    AttachNode(dg,nd);
-    ndoff=i&(~mask);
-    ndid=firstLayerNode+ndoff;
-    ar=newArc(dg->node[ndid],nd);     
-    AttachArc(dg,ar);
-    ndoff+=mask;
-    ndid=firstLayerNode+ndoff;
-    ar=newArc(dg->node[ndid],nd);     
-    AttachArc(dg,ar);
-  }
-return dg;
-}
-DGraph *buildWH(char cls){
-/*
-  Nodes of the graph must be topologically sorted
-  to avoid MPI deadlock.
-*/
-  int i=0,j=0;
-  int numSources=NUM_SOURCES,maxInDeg=4;
-  int numLayerNodes=numSources,firstLayerNode=0;
-  int totComparators=0;
-  int numPrevLayerNodes=numLayerNodes;
-  int id=0,sid=0;
-  DGraph *dg;
-  DGNode *nd=NULL,*source=NULL,*tmp=NULL,*snd=NULL;
-  DGArc *ar=NULL;
-  char nm[BLOCK_SIZE];
-
-  sprintf(nm,"DT_WH.%c",cls);
-  dg=newDGraph(nm);
-
-  for(i=0;i<numSources;i++){
-    sprintf(nm,"Sink.%d",i);
-    nd=newNode(nm);
-    AttachNode(dg,nd);
-  }
-  totComparators=0;
-  numPrevLayerNodes=numLayerNodes;
-  while(numLayerNodes>maxInDeg){
-    numLayerNodes=numLayerNodes/maxInDeg;
-    if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
-    for(i=0;i<numLayerNodes;i++){
-      sprintf(nm,"Comparator.%d",totComparators);
-      totComparators++;
-      nd=newNode(nm);
-      id=AttachNode(dg,nd);
-      for(j=0;j<maxInDeg;j++){
-        sid=i*maxInDeg+j;
-  if(sid>=numPrevLayerNodes) break;
-        snd=dg->node[firstLayerNode+sid];
-        ar=newArc(dg->node[id],snd);
-        AttachArc(dg,ar);
-      }
-    }
-    firstLayerNode+=numPrevLayerNodes;
-    numPrevLayerNodes=numLayerNodes;
-  }
-  source=newNode("Source");
-  AttachNode(dg,source);   
-  for(i=0;i<numPrevLayerNodes;i++){
-    nd=dg->node[firstLayerNode+i];
-    ar=newArc(source,nd);
-    AttachArc(dg,ar);
-  }
-
-  for(i=0;i<dg->numNodes/2;i++){  /* Topological sorting */
-    tmp=dg->node[i];
-    dg->node[i]=dg->node[dg->numNodes-1-i];
-    dg->node[i]->id=i;
-    dg->node[dg->numNodes-1-i]=tmp;
-    dg->node[dg->numNodes-1-i]->id=dg->numNodes-1-i;
-  }
-return dg;
-}
-DGraph *buildBH(char cls){
-/*
-  Nodes of the graph must be topologically sorted
-  to avoid MPI deadlock.
-*/
-  int i=0,j=0;
-  int numSources=NUM_SOURCES,maxInDeg=4;
-  int numLayerNodes=numSources,firstLayerNode=0;
-  DGraph *dg;
-  DGNode *nd=NULL, *snd=NULL, *sink=NULL;
-  DGArc *ar=NULL;
-  int totComparators=0;
-  int numPrevLayerNodes=numLayerNodes;
-  int id=0, sid=0;
-  char nm[BLOCK_SIZE];
-
-  sprintf(nm,"DT_BH.%c",cls);
-  dg=newDGraph(nm);
-
-  for(i=0;i<numSources;i++){
-    sprintf(nm,"Source.%d",i);
-    nd=newNode(nm);
-    AttachNode(dg,nd);
-  }
-  while(numLayerNodes>maxInDeg){
-    numLayerNodes=numLayerNodes/maxInDeg;
-    if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
-    for(i=0;i<numLayerNodes;i++){
-      sprintf(nm,"Comparator.%d",totComparators);
-      totComparators++;
-      nd=newNode(nm);
-      id=AttachNode(dg,nd);
-      for(j=0;j<maxInDeg;j++){
-        sid=i*maxInDeg+j;
-  if(sid>=numPrevLayerNodes) break;
-        snd=dg->node[firstLayerNode+sid];
-        ar=newArc(snd,dg->node[id]);
-        AttachArc(dg,ar);
-      }
-    }
-    firstLayerNode+=numPrevLayerNodes;
-    numPrevLayerNodes=numLayerNodes;
-  }
-  sink=newNode("Sink");
-  AttachNode(dg,sink);   
-  for(i=0;i<numPrevLayerNodes;i++){
-    nd=dg->node[firstLayerNode+i];
-    ar=newArc(nd,sink);
-    AttachArc(dg,ar);
-  }
-return dg;
-}
-
-typedef struct{
-  int len;
-  double* val;
-} Arr;
-Arr *newArr(int len){
-  Arr *arr=(Arr *)SMPI_SHARED_MALLOC(sizeof(Arr));
-  arr->len=len;
-  arr->val=(double *)SMPI_SHARED_MALLOC(len*sizeof(double));
-  return arr;
-}
-void arrShow(Arr* a){
-  if(!a) fprintf(stderr,"-- NULL array\n");
-  else{
-    fprintf(stderr,"-- length=%d\n",a->len);
-  }
-}
-double CheckVal(Arr *feat){
-  double csum=0.0;
-  int i=0;
-  for(i=0;i<feat->len;i++){
-    csum+=feat->val[i]*feat->val[i]/feat->len; /* The truncation does not work since 
-                                                  result will be 0 for large len  */
-  }
-   return csum;
-}
-int GetFNumDPar(int* mean, int* stdev){
-  *mean=NUM_SAMPLES;
-  *stdev=STD_DEVIATION;
-  return 0;
-}
-int GetFeatureNum(char *mbname,int id){
-  double tran=314159265.0;
-  double A=2*id+1;
-  double denom=randlc(&tran,&A);
-  char cval='S';
-  int mean=NUM_SAMPLES,stdev=128;
-  int rtfs=0,len=0;
-  GetFNumDPar(&mean,&stdev);
-  rtfs=ipowMod((int)(1/denom)*(int)cval,(long long int) (2*id+1),2*stdev);
-  if(rtfs<0) rtfs=-rtfs;
-  len=mean-stdev+rtfs;
-  return len;
-}
-Arr* RandomFeatures(char *bmname,int fdim,int id){
-  int len=GetFeatureNum(bmname,id)*fdim;
-  Arr* feat=newArr(len);
-  int nxg=2,nyg=2,nzg=2,nfg=5;
-  int nx=421,ny=419,nz=1427,nf=3527;
-  long long int expon=(len*(id+1))%3141592;
-  int seedx=ipowMod(nxg,expon,nx),
-      seedy=ipowMod(nyg,expon,ny),
-      seedz=ipowMod(nzg,expon,nz),
-      seedf=ipowMod(nfg,expon,nf);
-  int i=0;
-  if(timer_on){
-    timer_clear(id+1);
-    timer_start(id+1);
-  }
-  for(i=0;i<len;i+=fdim){
-    seedx=(seedx*nxg)%nx;
-    seedy=(seedy*nyg)%ny;
-    seedz=(seedz*nzg)%nz;
-    seedf=(seedf*nfg)%nf;
-    feat->val[i]=seedx;
-    feat->val[i+1]=seedy;
-    feat->val[i+2]=seedz;
-    feat->val[i+3]=seedf;
-  }
-  if(timer_on){
-    timer_stop(id+1);
-    fprintf(stderr,"** RandomFeatures time in node %d = %f\n",id,timer_read(id+1));
-  }
-  return feat;   
-}
-void Resample(Arr *a,int blen){
-    long long int i=0,j=0,jlo=0,jhi=0;
-    double avval=0.0;
-    double *nval=(double *)SMPI_SHARED_MALLOC(blen*sizeof(double));
-    Arr *tmp=newArr(10);
-    for(i=0;i<blen;i++) nval[i]=0.0;
-    for(i=1;i<a->len-1;i++){
-      jlo=(int)(0.5*(2*i-1)*(blen/a->len)); 
-      jhi=(int)(0.5*(2*i+1)*(blen/a->len));
-
-      avval=a->val[i]/(jhi-jlo+1);    
-      for(j=jlo;j<=jhi;j++){
-        nval[j]+=avval;
-      }
-    }
-    nval[0]=a->val[0];
-    nval[blen-1]=a->val[a->len-1];
-    SMPI_SHARED_FREE(a->val);
-    a->val=nval;
-    a->len=blen;
-}
-#define fielddim 4
-Arr* WindowFilter(Arr *a, Arr* b,int w){
-  int i=0,j=0,k=0;
-  double rms0=0.0,rms1=0.0,rmsm1=0.0;
-  double weight=((double) (w+1))/(w+2);
- 
-  w+=1;
-  if(timer_on){
-    timer_clear(w);
-    timer_start(w);
-  }
-  if(a->len<b->len) Resample(a,b->len);
-  if(a->len>b->len) Resample(b,a->len);
-  for(i=fielddim;i<a->len-fielddim;i+=fielddim){
-    rms0=(a->val[i]-b->val[i])*(a->val[i]-b->val[i])
-  +(a->val[i+1]-b->val[i+1])*(a->val[i+1]-b->val[i+1])
-  +(a->val[i+2]-b->val[i+2])*(a->val[i+2]-b->val[i+2])
-  +(a->val[i+3]-b->val[i+3])*(a->val[i+3]-b->val[i+3]);
-    j=i+fielddim;
-    rms1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
-      +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
-      +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
-      +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
-    j=i-fielddim;
-    rmsm1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
-   +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
-   +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
-   +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
-    k=0;
-    if(rms1<rms0){
-      k=1;
-      rms0=rms1;
-    }
-    if(rmsm1<rms0) k=-1;
-    if(k==0){
-      j=i+fielddim;
-      a->val[i]=weight*b->val[i];
-      a->val[i+1]=weight*b->val[i+1];
-      a->val[i+2]=weight*b->val[i+2];
-      a->val[i+3]=weight*b->val[i+3];  
-    }else if(k==1){
-      j=i+fielddim;
-      a->val[i]=weight*b->val[j];
-      a->val[i+1]=weight*b->val[j+1];
-      a->val[i+2]=weight*b->val[j+2];
-      a->val[i+3]=weight*b->val[j+3];  
-    }else { /*if(k==-1)*/
-      j=i-fielddim;
-      a->val[i]=weight*b->val[j];
-      a->val[i+1]=weight*b->val[j+1];
-      a->val[i+2]=weight*b->val[j+2];
-      a->val[i+3]=weight*b->val[j+3];  
-    }     
-  }
-  if(timer_on){
-    timer_stop(w);
-    fprintf(stderr,"** WindowFilter time in node %d = %f\n",(w-1),timer_read(w));
-  }
-  return a;
-}
-
-int SendResults(DGraph *dg,DGNode *nd,Arr *feat){
-  int i=0,tag=0;
-  DGArc *ar=NULL;
-  DGNode *head=NULL;
-  if(!feat) return 0;
-  TRACE_smpi_set_category ("SendResults");
-  for(i=0;i<nd->outDegree;i++){
-    ar=nd->outArc[i];
-    if(ar->tail!=nd) continue;
-    head=ar->head;
-    tag=ar->id;
-    if(head->address!=nd->address){
-      MPI_Send(&feat->len,1,MPI_INT,head->address,tag,MPI_COMM_WORLD);
-      MPI_Send(feat->val,feat->len,MPI_DOUBLE,head->address,tag,MPI_COMM_WORLD);
-    }
-  }
-  TRACE_smpi_set_category (NULL);
-  return 1;
-}
-Arr* CombineStreams(DGraph *dg,DGNode *nd){
-  Arr *resfeat=newArr(NUM_SAMPLES*fielddim);
-  int i=0,len=0,tag=0;
-  DGArc *ar=NULL;
-  DGNode *tail=NULL;
-  MPI_Status status;
-  Arr *feat=NULL,*featp=NULL;
-
-  if(nd->inDegree==0) return NULL;
-  for(i=0;i<nd->inDegree;i++){
-    ar=nd->inArc[i];
-    if(ar->head!=nd) continue;
-    tail=ar->tail;
-    if(tail->address!=nd->address){
-      len=0;
-      tag=ar->id;
-      MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
-      feat=newArr(len);
-      MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
-      resfeat=WindowFilter(resfeat,feat,nd->id);
-      SMPI_SHARED_FREE(feat);
-    }else{
-      featp=(Arr *)tail->feat;
-      feat=newArr(featp->len);
-      memcpy(feat->val,featp->val,featp->len*sizeof(double));
-      resfeat=WindowFilter(resfeat,feat,nd->id);  
-      SMPI_SHARED_FREE(feat);
-    }
-  }
-  for(i=0;i<resfeat->len;i++) resfeat->val[i]=((int)resfeat->val[i])/nd->inDegree;
-  nd->feat=resfeat;
-  return nd->feat;
-}
-double Reduce(Arr *a,int w){
-  double retv=0.0;
-  if(timer_on){
-    timer_clear(w);
-    timer_start(w);
-  }
-  retv=(int)(w*CheckVal(a));/* The casting needed for node  
-                               and array dependent verifcation */
-  if(timer_on){
-    timer_stop(w);
-    fprintf(stderr,"** Reduce time in node %d = %f\n",(w-1),timer_read(w));
-  }
-  return retv;
-}
-
-double ReduceStreams(DGraph *dg,DGNode *nd){
-  double csum=0.0;
-  int i=0,len=0,tag=0;
-  DGArc *ar=NULL;
-  DGNode *tail=NULL;
-  Arr *feat=NULL;
-  double retv=0.0;
-
-  TRACE_smpi_set_category ("ReduceStreams");
-
-  for(i=0;i<nd->inDegree;i++){
-    ar=nd->inArc[i];
-    if(ar->head!=nd) continue;
-    tail=ar->tail;
-    if(tail->address!=nd->address){
-      MPI_Status status;
-      len=0;
-      tag=ar->id;
-      MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
-      feat=newArr(len);
-      MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
-      csum+=Reduce(feat,(nd->id+1));  
-      SMPI_SHARED_FREE(feat);
-    }else{
-      csum+=Reduce(tail->feat,(nd->id+1));  
-    }
-  }
-  if(nd->inDegree>0)csum=(((long long int)csum)/nd->inDegree);
-  retv=(nd->id+1)*csum;
-  return retv;
-}
-
-int ProcessNodes(DGraph *dg,int me){
-  double chksum=0.0;
-  Arr *feat=NULL;
-  int i=0,verified=0,tag;
-  DGNode *nd=NULL;
-  double rchksum=0.0;
-  MPI_Status status;
-
-  TRACE_smpi_set_category ("ProcessNodes");
-
-
-  for(i=0;i<dg->numNodes;i++){
-    nd=dg->node[i];
-    if(nd->address!=me) continue;
-    if(strstr(nd->name,"Source")){
-      nd->feat=RandomFeatures(dg->name,fielddim,nd->id); 
-      SendResults(dg,nd,nd->feat);
-    }else if(strstr(nd->name,"Sink")){
-      chksum=ReduceStreams(dg,nd);
-      tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */
-      MPI_Send(&chksum,1,MPI_DOUBLE,0,tag,MPI_COMM_WORLD);
-    }else{
-      feat=CombineStreams(dg,nd);
-      SendResults(dg,nd,feat);
-    }
-  }
-
-  TRACE_smpi_set_category ("ProcessNodes");
-
-
-  if(me==0){ /* Report node */
-    rchksum=0.0;
-    chksum=0.0;
-    for(i=0;i<dg->numNodes;i++){
-      nd=dg->node[i];
-      if(!strstr(nd->name,"Sink")) continue;
-       tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */
-      MPI_Recv(&rchksum,1,MPI_DOUBLE,nd->address,tag,MPI_COMM_WORLD,&status);
-      chksum+=rchksum;
-    }
-    verified=verify(dg->name,chksum);
-  }
-return verified;
-}
-
-int main(int argc,char **argv ){
-  int my_rank,comm_size;
-  int i;
-  DGraph *dg=NULL;
-  int verified=0, featnum=0;
-  double bytes_sent=2.0,tot_time=0.0;
-
-
-
-    MPI_Init( &argc, &argv );
-    MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
-    MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
-    TRACE_smpi_set_category ("begin");
-
-     if(argc!=2||
-                (  strncmp(argv[1],"BH",2)!=0
-                 &&strncmp(argv[1],"WH",2)!=0
-                 &&strncmp(argv[1],"SH",2)!=0
-                )
-      ){
-      if(my_rank==0){
-        fprintf(stderr,"** Usage: mpirun -np N ../bin/dt.S GraphName\n");
-        fprintf(stderr,"** Where \n   - N is integer number of MPI processes\n");
-        fprintf(stderr,"   - S is the class S, W, or A \n");
-        fprintf(stderr,"   - GraphName is the communication graph name BH, WH, or SH.\n");
-        fprintf(stderr,"   - the number of MPI processes N should not be be less than \n");
-        fprintf(stderr,"     the number of nodes in the graph\n");
-      }
-      MPI_Finalize();
-      exit(0);
-    } 
-   if(strncmp(argv[1],"BH",2)==0){
-      dg=buildBH(CLASS);
-    }else if(strncmp(argv[1],"WH",2)==0){
-      dg=buildWH(CLASS);
-    }else if(strncmp(argv[1],"SH",2)==0){
-      dg=buildSH(CLASS);
-    }
-
-    if(timer_on&&dg->numNodes+1>timers_tot){
-      timer_on=0;
-      if(my_rank==0)
-        fprintf(stderr,"Not enough timers. Node timeing is off. \n");
-    }
-    if(dg->numNodes>comm_size){
-      if(my_rank==0){
-        fprintf(stderr,"**  The number of MPI processes should not be less than \n");
-        fprintf(stderr,"**  the number of nodes in the graph\n");
-        fprintf(stderr,"**  Number of MPI processes = %d\n",comm_size);
-        fprintf(stderr,"**  Number nodes in the graph = %d\n",dg->numNodes);
-      }
-      MPI_Finalize();
-      exit(0);
-    }
-    for(i=0;i<dg->numNodes;i++){ 
-      dg->node[i]->address=i;
-    }
-    if( my_rank == 0 ){
-      printf( "\n\n NAS Parallel Benchmarks 3.3 -- DT Benchmark\n\n" );
-      graphShow(dg,0);
-      timer_clear(0);
-      timer_start(0);
-    }
-
-    verified=ProcessNodes(dg,my_rank);
-    TRACE_smpi_set_category ("end");
- 
-    featnum=NUM_SAMPLES*fielddim;
-    bytes_sent=featnum*dg->numArcs;
-    bytes_sent/=1048576;
-    if(my_rank==0){
-      timer_stop(0);
-      tot_time=timer_read(0);
-      c_print_results( dg->name,
-                 CLASS,
-                 featnum,
-                 0,
-                 0,
-                 dg->numNodes,
-                 0,
-                 comm_size,
-                 tot_time,
-                 bytes_sent/tot_time,
-                 "bytes transmitted", 
-                 verified,
-                 NPBVERSION,
-                 COMPILETIME,
-                 MPICC,
-                 CLINK,
-                 CMPI_LIB,
-                 CMPI_INC,
-                 CFLAGS,
-                 CLINKFLAGS );
-    }          
-    MPI_Finalize();
-  return 1;
-}
diff --git a/examples/smpi/NAS/DT/dt.c b/examples/smpi/NAS/DT/dt.c
index d210b16424..1a4931704a 100644
--- a/examples/smpi/NAS/DT/dt.c
+++ b/examples/smpi/NAS/DT/dt.c
@@ -44,6 +44,8 @@
 #include "mpi.h"
 #include "npbparams.h"
 
+#include "simgrid/instr.h" //TRACE_
+
 #ifndef CLASS
 #define CLASS 'S'
 #define NUM_PROCS            1                 
@@ -537,6 +539,7 @@ int SendResults(DGraph *dg,DGNode *nd,Arr *feat){
   DGArc *ar=NULL;
   DGNode *head=NULL;
   if(!feat) return 0;
+  TRACE_smpi_set_category ("SendResults");
   for(i=0;i<nd->outDegree;i++){
     ar=nd->outArc[i];
     if(ar->tail!=nd) continue;
@@ -547,6 +550,7 @@ int SendResults(DGraph *dg,DGNode *nd,Arr *feat){
       MPI_Send(feat->val,feat->len,MPI_DOUBLE,head->address,tag,MPI_COMM_WORLD);
     }
   }
+  TRACE_smpi_set_category (NULL);
   return 1;
 }
 Arr* CombineStreams(DGraph *dg,DGNode *nd){
@@ -605,6 +609,8 @@ double ReduceStreams(DGraph *dg,DGNode *nd){
   Arr *feat=NULL;
   double retv=0.0;
 
+  TRACE_smpi_set_category ("ReduceStreams");
+
   for(i=0;i<nd->inDegree;i++){
     ar=nd->inArc[i];
     if(ar->head!=nd) continue;
@@ -635,6 +641,8 @@ int ProcessNodes(DGraph *dg,int me){
   double rchksum=0.0;
   MPI_Status status;
 
+  TRACE_smpi_set_category ("ProcessNodes");
+
   for(i=0;i<dg->numNodes;i++){
     nd=dg->node[i];
     if(nd->address!=me) continue;
@@ -650,6 +658,9 @@ int ProcessNodes(DGraph *dg,int me){
       SendResults(dg,nd,feat);
     }
   }
+
+  TRACE_smpi_set_category ("ProcessNodes");
+
   if(me==0){ /* Report node */
     rchksum=0.0;
     chksum=0.0;
@@ -675,6 +686,7 @@ int main(int argc,char **argv ){
     MPI_Init( &argc, &argv );
     MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
     MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
+    TRACE_smpi_set_category ("begin");
 
      if(argc!=2||
                 (  strncmp(argv[1],"BH",2)!=0
@@ -726,7 +738,8 @@ int main(int argc,char **argv ){
       timer_start(0);
     }
     verified=ProcessNodes(dg,my_rank);
-    
+    TRACE_smpi_set_category ("end");
+
     featnum=NUM_SAMPLES*fielddim;
     bytes_sent=featnum*dg->numArcs;
     bytes_sent/=1048576;
diff --git a/examples/smpi/NAS/EP-trace/Makefile b/examples/smpi/NAS/EP-trace/Makefile
deleted file mode 100644
index 6194dcfe97..0000000000
--- a/examples/smpi/NAS/EP-trace/Makefile
+++ /dev/null
@@ -1,28 +0,0 @@
-SHELL=/bin/sh
-BENCHMARK=ep
-BENCHMARKU=EP
-
-include ../config/make.def
-
-#OBJS = ep-trace.o ${COMMON}/print_results.o ${COMMON}/${RAND}.o ${COMMON}/timers.o
-OBJS = ep-trace.o randlc.o
-
-include ../sys/make.common
-
-${PROGRAM}: config ${OBJS}
-#	${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
-	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${CMPI_LIB}
-
-
-#ep-trace.o:		ep-trace.f  mpinpb.h npbparams.h
-#	${FCOMPILE} ep-trace.f
-
-ep-trace.o:	ep-trace.c randlc.c mpinpb.h npbparams.h
-	${CCOMPILE} ep-trace.c
-
-clean:
-	- rm -f *.o *~ 
-	- rm -f npbparams.h core
-
-
-
diff --git a/examples/smpi/NAS/EP-trace/README b/examples/smpi/NAS/EP-trace/README
deleted file mode 100644
index 6eb36571af..0000000000
--- a/examples/smpi/NAS/EP-trace/README
+++ /dev/null
@@ -1,6 +0,0 @@
-This code implements the random-number generator described in the
-NAS Parallel Benchmark document RNR Technical Report RNR-94-007.
-The code is "embarrassingly" parallel in that no communication is
-required for the generation of the random numbers itself. There is
-no special requirement on the number of processors used for running
-the benchmark.
diff --git a/examples/smpi/NAS/EP-trace/ep-trace.c b/examples/smpi/NAS/EP-trace/ep-trace.c
deleted file mode 100644
index 4f606a0e39..0000000000
--- a/examples/smpi/NAS/EP-trace/ep-trace.c
+++ /dev/null
@@ -1,449 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-
-#include "mpi.h"
-#include "npbparams.h"
-
-#include "simgrid/instr.h" //TRACE_
-
-#include "randlc.h"
-
-#ifndef CLASS
-#define CLASS 'S'
-#define NUM_PROCS            1                 
-#endif
-#define true 1
-#define false 0
-
-
-//---NOTE : all the timers function have been modified to
-//          avoid global timers (privatize these). 
-      // ----------------------- timers ---------------------
-      void timer_clear(double *onetimer) {
-            //elapsed[n] = 0.0;
-            *onetimer = 0.0;
-      }
-
-      void timer_start(double *onetimer) {
-            *onetimer = MPI_Wtime();
-      }
-
-      void timer_stop(int n,double *elapsed,double *start) {
-            double t, now;
-
-            now = MPI_Wtime();
-            t = now - start[n];
-            elapsed[n] += t;
-      }
-
-      double timer_read(int n, double *elapsed) {  /* ok, useless, but jsut to keep function call */
-            return(elapsed[n]);
-      }
-      /********************************************************************
-       *****************            V R A N L C          ******************
-       *****************                                 *****************/           
-      double vranlc(int n, double x, double a, double *y)
-      {
-        int i;
-        long  i246m1=0x00003FFFFFFFFFFF;
-    long  LLx, Lx, La;
-        double d2m46;
-
-// This doesn't work, because the compiler does the calculation in 32
-// bits and overflows. No standard way (without f90 stuff) to specify
-// that the rhs should be done in 64 bit arithmetic.
-//     parameter(i246m1=2**46-1)
-
-      d2m46=pow(0.5,46);
-
-// c Note that the v6 compiler on an R8000 does something stupid with
-// c the above. Using the following instead (or various other things)
-// c makes the calculation run almost 10 times as fast.
-//
-// c     save d2m46
-// c      data d2m46/0.0d0/
-// c      if (d2m46 .eq. 0.0d0) then
-// c         d2m46 = 0.5d0**46
-// c      endif
-
-      Lx = (long)x;
-      La = (long)a;
-      //fprintf(stdout,("================== Vranlc ================");
-      //fprintf(stdout,("Before Loop: Lx = " + Lx + ", La = " + La);
-  LLx = Lx;
-  for (i=0; i< n; i++) {
-      Lx   = Lx*La & i246m1 ;
-      LLx = Lx;
-      y[i] = d2m46 * (double)LLx;
-      /*
-         if(i == 0) {
-         fprintf(stdout,("After loop 0:");
-         fprintf(stdout,("Lx = " + Lx + ", La = " + La);
-         fprintf(stdout,("d2m46 = " + d2m46);
-         fprintf(stdout,("LLX(Lx) = " + LLX.doubleValue());
-         fprintf(stdout,("Y[0]" + y[0]);
-         }
-       */
-  }
-
-      x = (double)LLx;
-      /*
-      fprintf(stdout,("Change: Lx = " + Lx);
-      fprintf(stdout,("=============End   Vranlc ================");
-      */
-      return x;
-    }
-
-
-
-//-------------- the core (unique function) -----------
-      void doTest(int argc, char **argv) {
-      double dum[3] = {1.,1.,1.};
-      double x1, x2, sx, sy, tm, an, tt, gc;
-      double Mops;
-      double epsilon=1.0E-8, a = 1220703125., s=271828183.;
-      double t1, t2, t3, t4; 
-      double sx_verify_value, sy_verify_value, sx_err, sy_err;
-
-#include "npbparams.h"
-      int    mk=16, 
-         // --> set by make : in npbparams.h
-         //m=28, // for CLASS=A
-         //m=30, // for CLASS=B
-         //npm=2, // NPROCS
-         mm = m-mk, 
-         nn = (int)(pow(2,mm)), 
-         nk = (int)(pow(2,mk)), 
-         nq=10, 
-         np, 
-         node, 
-         no_nodes, 
-         i, 
-         ik, 
-         kk, 
-         l, 
-         k, nit, no_large_nodes,
-         np_add, k_offset, j;
-      int    me, nprocs, root=0, dp_type;
-      int verified, 
-          timers_enabled=true;
-      char  size[500]; // mind the size of the string to represent a big number
-
-      //Use in randlc..
-      int KS = 0;
-      double R23, R46, T23, T46;
-
-      double *qq = (double *) malloc (10000*sizeof(double));
-      double *start = (double *) malloc (64*sizeof(double));
-      double *elapsed = (double *) malloc (64*sizeof(double));
-
-      double *x = (double *) malloc (2*nk*sizeof(double));
-      double *q = (double *) malloc (nq*sizeof(double));
-
-      TRACE_smpi_set_category ("start");
-
-      MPI_Init( &argc, &argv );
-      MPI_Comm_size( MPI_COMM_WORLD, &no_nodes);
-      MPI_Comm_rank( MPI_COMM_WORLD, &node);
-
-#ifdef USE_MPE
-    MPE_Init_log();
-#endif
-      root = 0;
-      if (node == root ) {
-
-          /*   Because the size of the problem is too large to store in a 32-bit
-           *   integer for some classes, we put it into a string (for printing).
-           *   Have to strip off the decimal point put in there by the floating
-           *   point print statement (internal file)
-           */
-          fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark");
-          sprintf(size,"%d",pow(2,m+1));
-          //size = size.replace('.', ' ');
-          fprintf(stdout," Number of random numbers generated: %s\n",size);
-          fprintf(stdout," Number of active processes: %d\n",no_nodes);
-
-      }
-      verified = false;
-
-      /* c   Compute the number of "batches" of random number pairs generated 
-         c   per processor. Adjust if the number of processors does not evenly 
-         c   divide the total number
-*/
-
-       np = nn / no_nodes;
-       no_large_nodes = nn % no_nodes;
-       if (node < no_large_nodes) np_add = 1;
-       else np_add = 0;
-       np = np + np_add;
-
-       if (np == 0) {
-             fprintf(stdout,"Too many nodes: %d  %d",no_nodes,nn);
-             MPI_Abort(MPI_COMM_WORLD,1);
-             exit(0); 
-       } 
-
-/* c   Call the random number generator functions and initialize
-   c   the x-array to reduce the effects of paging on the timings.
-   c   Also, call all mathematical functions that are used. Make
-   c   sure these initializations cannot be eliminated as dead code.
-*/
-
-   //call vranlc(0, dum[1], dum[2], dum[3]);
-   // Array indexes start at 1 in Fortran, 0 in Java
-   vranlc(0, dum[0], dum[1], &(dum[2])); 
-
-   dum[0] = randlc(&(dum[1]),&(dum[2]));
-   /////////////////////////////////
-   for (i=0;i<2*nk;i++) {
-       x[i] = -1e99;
-   }
-   Mops = log(sqrt(abs(1))); 
-
-   /*
-      c---------------------------------------------------------------------
-      c    Synchronize before placing time stamp
-      c---------------------------------------------------------------------
-    */
-        MPI_Barrier( MPI_COMM_WORLD );
-
-
-      TRACE_smpi_set_category ("ep");
-
-        timer_clear(&(elapsed[1]));
-        timer_clear(&(elapsed[2]));
-        timer_clear(&(elapsed[3]));
-        timer_start(&(start[1]));
-        
-        t1 = a;
-  //fprintf(stdout,("(ep.f:160) t1 = " + t1);
-        t1 = vranlc(0, t1, a, x);
-  //fprintf(stdout,("(ep.f:161) t1 = " + t1);
-  
-        
-/* c   Compute AN = A ^ (2 * NK) (mod 2^46). */
-        
-        t1 = a;
-  //fprintf(stdout,("(ep.f:165) t1 = " + t1);
-        for (i=1; i <= mk+1; i++) {
-               t2 = randlc(&t1, &t1);
-         //fprintf(stdout,("(ep.f:168)[loop i=" + i +"] t1 = " + t1);
-        } 
-        an = t1;
-  //fprintf(stdout,("(ep.f:172) s = " + s);
-        tt = s;
-        gc = 0.;
-        sx = 0.;
-        sy = 0.;
-        for (i=0; i < nq ; i++) {
-               q[i] = 0.;
-        }
-
-/*
-    Each instance of this loop may be performed independently. We compute
-    the k offsets separately to take into account the fact that some nodes
-    have more numbers to generate than others
-*/
-
-      if (np_add == 1)
-         k_offset = node * np -1;
-      else
-         k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1;
-     
-      int stop = false;
-      for(k = 1; k <= np; k++) {
-         stop = false;
-         kk = k_offset + k ;
-         t1 = s;
-         //fprintf(stdout,("(ep.f:193) t1 = " + t1);
-         t2 = an;
-
-//       Find starting seed t1 for this kk.
-
-         for (i=1;i<=100 && !stop;i++) {
-            ik = kk / 2;
-      //fprintf(stdout,("(ep.f:199) ik = " +ik+", kk = " + kk);
-            if (2 * ik != kk)  {
-                t3 = randlc(&t1, &t2);
-                //fprintf(stdout,("(ep.f:200) t1= " +t1 );
-            }
-            if (ik==0)
-                stop = true;
-            else {
-               t3 = randlc(&t2, &t2);
-               kk = ik;
-           }
-         }
-//       Compute uniform pseudorandom numbers.
-
-         //if (timers_enabled)  timer_start(3);
-   timer_start(&(start[3]));
-         //call vranlc(2 * nk, t1, a, x)  --> t1 and y are modified
-
-  //fprintf(stdout,">>>>>>>>>>>Before vranlc(l.210)<<<<<<<<<<<<<");
-  //fprintf(stdout,"2*nk = " + (2*nk));
-  //fprintf(stdout,"t1 = " + t1);
-  //fprintf(stdout,"a  = " + a);
-  //fprintf(stdout,"x[0] = " + x[0]);
-  //fprintf(stdout,">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
-        
-  t1 = vranlc(2 * nk, t1, a, x);
-
-  //fprintf(stdout,(">>>>>>>>>>>After  Enter vranlc (l.210)<<<<<<");
-  //fprintf(stdout,("2*nk = " + (2*nk));
-  //fprintf(stdout,("t1 = " + t1);
-  //fprintf(stdout,("a  = " + a);
-  //fprintf(stdout,("x[0] = " + x[0]);
-  //fprintf(stdout,(">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
-        
-         //if (timers_enabled)  timer_stop(3);
-   timer_stop(3,elapsed,start);
-
-/*       Compute Gaussian deviates by acceptance-rejection method and 
- *       tally counts in concentric square annuli.  This loop is not 
- *       vectorizable. 
- */
-         //if (timers_enabled) timer_start(2);
-    timer_start(&(start[2]));
-         for(i=1; i<=nk;i++) {
-            x1 = 2. * x[2*i-2] -1.0;
-            x2 = 2. * x[2*i-1] - 1.0;
-            t1 = x1*x1 + x2*x2;
-            if (t1 <= 1.) {
-               t2   = sqrt(-2. * log(t1) / t1);
-               t3   = (x1 * t2);
-               t4   = (x2 * t2);
-               l    = (int)(abs(t3) > abs(t4) ? abs(t3) : abs(t4));
-               q[l] = q[l] + 1.;
-               sx   = sx + t3;
-               sy   = sy + t4;
-             }
-    /*
-       if(i == 1) {
-                fprintf(stdout,"x1 = " + x1);
-                fprintf(stdout,"x2 = " + x2);
-                fprintf(stdout,"t1 = " + t1);
-                fprintf(stdout,"t2 = " + t2);
-                fprintf(stdout,"t3 = " + t3);
-                fprintf(stdout,"t4 = " + t4);
-                fprintf(stdout,"l = " + l);
-                fprintf(stdout,"q[l] = " + q[l]);
-                fprintf(stdout,"sx = " + sx);
-                fprintf(stdout,"sy = " + sy);
-       }
-    */
-           }
-         //if (timers_enabled)  timer_stop(2);
-    timer_stop(2,elapsed,start);
-      }
-
-      TRACE_smpi_set_category ("finalize");
-
-      //int MPI_Allreduce(void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)   
-  MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-  sx = x[0]; //FIXME :  x[0] or x[1] => x[0] because fortran starts with 1
-      MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-      sy = x[0];
-      MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-
-      for(i = 0; i < nq; i++) {
-    q[i] = x[i];
-  }
-  for(i = 0; i < nq; i++) {
-    gc += q[i];
-  }
-
-  timer_stop(1,elapsed,start);
-      tm = timer_read(1,elapsed);
-  MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-  tm = x[0];
-
-  if(node == root) {
-    nit = 0;
-    verified = true;
-
-    if(m == 24) {
-             sx_verify_value = -3.247834652034740E3;
-                  sy_verify_value = -6.958407078382297E3;
-    } else if(m == 25) {
-                sx_verify_value = -2.863319731645753E3;
-      sy_verify_value = -6.320053679109499E3;
-    } else if(m == 28) {
-            sx_verify_value = -4.295875165629892E3;
-      sy_verify_value = -1.580732573678431E4;
-    } else if(m == 30) {
-            sx_verify_value =  4.033815542441498E4;
-                  sy_verify_value = -2.660669192809235E4;
-    } else if(m == 32) {
-                  sx_verify_value =  4.764367927995374E4;
-                     sy_verify_value = -8.084072988043731E4;
-    } else if(m == 36) {
-            sx_verify_value =  1.982481200946593E5;
-            sy_verify_value = -1.020596636361769E5;
-    } else {
-      verified = false;
-    }
-
-    /*
-    fprintf(stdout,("sx        = " + sx);
-    fprintf(stdout,("sx_verify = " + sx_verify_value);
-    fprintf(stdout,("sy        = " + sy);
-    fprintf(stdout,("sy_verify = " + sy_verify_value);
-    */
-    if(verified) {
-      sx_err = abs((sx - sx_verify_value)/sx_verify_value);
-      sy_err = abs((sy - sy_verify_value)/sy_verify_value);
-      /*
-      fprintf(stdout,("sx_err = " + sx_err);
-      fprintf(stdout,("sy_err = " + sx_err);
-      fprintf(stdout,("epsilon= " + epsilon);
-      */
-      verified = ((sx_err < epsilon) && (sy_err < epsilon));
-    }
-
-    Mops = (pow(2.0, m+1))/tm/1000;
-
-    fprintf(stdout,"EP Benchmark Results:\n");
-    fprintf(stdout,"CPU Time=%d\n",tm);
-    fprintf(stdout,"N = 2^%d\n",m);
-    fprintf(stdout,"No. Gaussain Pairs =%d\n",gc);
-    fprintf(stdout,"Sum = %f %ld\n",sx,sy);
-    fprintf(stdout,"Count:");
-    for(i = 0; i < nq; i++) {
-      fprintf(stdout,"%d\t %ld\n",i,q[i]);
-    }
-
-    /*
-    print_results("EP", _class, m+1, 0, 0, nit, npm, no_nodes, tm, Mops,
-        "Random numbers generated", verified, npbversion,
-        compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) */
-    fprintf(stdout,"\nEP Benchmark Completed\n");
-            fprintf(stdout,"Class           = %s\n", _class);
-    fprintf(stdout,"Size            = %s\n", size);
-    fprintf(stdout,"Iteration       = %d\n", nit);
-    fprintf(stdout,"Time in seconds = %f\n",(tm/1000));
-    fprintf(stdout,"Total processes = %d\n",no_nodes);
-    fprintf(stdout,"Mops/s total    = %f\n",Mops);
-    fprintf(stdout,"Mops/s/process  = %f\n", Mops/no_nodes);
-    fprintf(stdout,"Operation type  = Random number generated\n");
-    if(verified) {
-      fprintf(stdout,"Verification    = SUCCESSFUL\n");
-    } else {
-      fprintf(stdout,"Verification    = UNSUCCESSFUL\n");
-    }
-             fprintf(stdout,"Total time:     %f\n",(timer_read(1,elapsed)/1000));
-             fprintf(stdout,"Gaussian pairs: %f\n",(timer_read(2,elapsed)/1000));
-             fprintf(stdout,"Random numbers: %f\n",(timer_read(3,elapsed)/1000));
-         }
-#ifdef USE_MPE
-    MPE_Finish_log(argv[0]);
-#endif
- 
-       MPI_Finalize();
-      }
-
-    int main(int argc, char **argv) {
-       doTest(argc,argv);
-    }
diff --git a/examples/smpi/NAS/EP-trace/mpinpb.h b/examples/smpi/NAS/EP-trace/mpinpb.h
deleted file mode 100644
index 1f13637ae4..0000000000
--- a/examples/smpi/NAS/EP-trace/mpinpb.h
+++ /dev/null
@@ -1,9 +0,0 @@
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-      include 'mpif.h'
-
-      integer           me, nprocs, root, dp_type
-      common /mpistuff/ me, nprocs, root, dp_type
-
diff --git a/examples/smpi/NAS/EP-trace/randlc.c b/examples/smpi/NAS/EP-trace/randlc.c
deleted file mode 100644
index 4de6c93b88..0000000000
--- a/examples/smpi/NAS/EP-trace/randlc.c
+++ /dev/null
@@ -1,107 +0,0 @@
-
-/*
- *    FUNCTION RANDLC (X, A)
- *
- *  This routine returns a uniform pseudorandom double precision number in the
- *  range (0, 1) by using the linear congruential generator
- *
- *  x_{k+1} = a x_k  (mod 2^46)
- *
- *  where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
- *  before repeating.  The argument A is the same as 'a' in the above formula,
- *  and X is the same as x_0.  A and X must be odd double precision integers
- *  in the range (1, 2^46).  The returned value RANDLC is normalized to be
- *  between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
- *  the new seed x_1, so that subsequent calls to RANDLC using the same
- *  arguments will generate a continuous sequence.
- *
- *  This routine should produce the same results on any computer with at least
- *  48 mantissa bits in double precision floating point data.  On Cray systems,
- *  double precision should be disabled.
- *
- *  David H. Bailey     October 26, 1990
- *
- *     IMPLICIT DOUBLE PRECISION (A-H, O-Z)
- *     SAVE KS, R23, R46, T23, T46
- *     DATA KS/0/
- *
- *  If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
- *  T23 = 2 ^ 23, and T46 = 2 ^ 46.  These are computed in loops, rather than
- *  by merely using the ** operator, in order to insure that the results are
- *  exact on all systems.  This code assumes that 0.5D0 is represented exactly.
- */
-
-
-/*****************************************************************/
-/*************           R  A  N  D  L  C             ************/
-/*************                                        ************/
-/*************    portable random number generator    ************/
-/*****************************************************************/
-
-double  randlc( double *X, double *A )
-{
-      static int        KS=0;
-      static double  R23, R46, T23, T46;
-      double    T1, T2, T3, T4;
-      double    A1;
-      double    A2;
-      double    X1;
-      double    X2;
-      double    Z;
-      int         i, j;
-
-      if (KS == 0) 
-      {
-        R23 = 1.0;
-        R46 = 1.0;
-        T23 = 1.0;
-        T46 = 1.0;
-    
-        for (i=1; i<=23; i++)
-        {
-          R23 = 0.50 * R23;
-          T23 = 2.0 * T23;
-        }
-        for (i=1; i<=46; i++)
-        {
-          R46 = 0.50 * R46;
-          T46 = 2.0 * T46;
-        }
-        KS = 1;
-      }
-
-/*  Break A into two parts such that A = 2^23 * A1 + A2 and set X = N.  */
-
-      T1 = R23 * *A;
-      j  = T1;
-      A1 = j;
-      A2 = *A - T23 * A1;
-
-/*  Break X into two parts such that X = 2^23 * X1 + X2, compute
-    Z = A1 * X2 + A2 * X1  (mod 2^23), and then
-    X = 2^23 * Z + A2 * X2  (mod 2^46).                            */
-
-      T1 = R23 * *X;
-      j  = T1;
-      X1 = j;
-      X2 = *X - T23 * X1;
-      T1 = A1 * X2 + A2 * X1;
-      
-      j  = R23 * T1;
-      T2 = j;
-      Z = T1 - T23 * T2;
-      T3 = T23 * Z + A2 * X2;
-      j  = R46 * T3;
-      T4 = j;
-      *X = T3 - T46 * T4;
-      return(R46 * *X);
-} 
-
-
-
-/*****************************************************************/
-/************   F  I  N  D  _  M  Y  _  S  E  E  D    ************/
-/************                                         ************/
-/************ returns parallel random number seq seed ************/
-/*****************************************************************/
-
diff --git a/examples/smpi/NAS/EP-trace/randlc.h b/examples/smpi/NAS/EP-trace/randlc.h
deleted file mode 100644
index aff84d341a..0000000000
--- a/examples/smpi/NAS/EP-trace/randlc.h
+++ /dev/null
@@ -1,3 +0,0 @@
-
-double      randlc( double *X, double *A );
-
diff --git a/examples/smpi/NAS/EP/ep.c b/examples/smpi/NAS/EP/ep.c
index 961422af26..5cf9d5fb7f 100644
--- a/examples/smpi/NAS/EP/ep.c
+++ b/examples/smpi/NAS/EP/ep.c
@@ -6,6 +6,8 @@
 #include "mpi.h"
 #include "npbparams.h"
 
+#include "simgrid/instr.h" //TRACE_
+
 #include "randlc.h"
 
 #ifndef CLASS
@@ -140,6 +142,8 @@
       double *x = (double *) malloc (2*nk*sizeof(double));
       double *q = (double *) malloc (nq*sizeof(double));
 
+      TRACE_smpi_set_category ("start");
+
       MPI_Init( &argc, &argv );
       MPI_Comm_size( MPI_COMM_WORLD, &no_nodes);
       MPI_Comm_rank( MPI_COMM_WORLD, &node);
@@ -205,6 +209,8 @@
     */
         MPI_Barrier( MPI_COMM_WORLD );
 
+        TRACE_smpi_set_category ("ep");
+
         timer_clear(&(elapsed[1]));
         timer_clear(&(elapsed[2]));
         timer_clear(&(elapsed[3]));
@@ -332,6 +338,8 @@
     timer_stop(2,elapsed,start);
       }
 
+    TRACE_smpi_set_category ("finalize");
+
       //int MPI_Allreduce(void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)   
   MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
   sx = x[0]; //FIXME :  x[0] or x[1] => x[0] because fortran starts with 1
diff --git a/examples/smpi/NAS/IS-trace/Makefile b/examples/smpi/NAS/IS-trace/Makefile
deleted file mode 100644
index 4911d0ddd5..0000000000
--- a/examples/smpi/NAS/IS-trace/Makefile
+++ /dev/null
@@ -1,22 +0,0 @@
-SHELL=/bin/sh
-BENCHMARK=is
-BENCHMARKU=IS
-
-include ../config/make.def
-
-include ../sys/make.common
-
-OBJS = is-trace.o ${COMMON}/c_print_results.o
-
-
-${PROGRAM}: config ${OBJS}
-	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${CMPI_LIB}
-
-.c.o:
-	${CCOMPILE} $<
-
-is-trace.o: is-trace.c npbparams.h
-
-clean:
-	- rm -f *.o *~ mputil*
-	- rm -f is-trace npbparams.h core
diff --git a/examples/smpi/NAS/IS-trace/is-trace.c b/examples/smpi/NAS/IS-trace/is-trace.c
deleted file mode 100644
index 57b370bb39..0000000000
--- a/examples/smpi/NAS/IS-trace/is-trace.c
+++ /dev/null
@@ -1,1154 +0,0 @@
-/*************************************************************************
- *                                                                       * 
- *        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3       *
- *                                                                       * 
- *                                  I S                                  * 
- *                                                                       * 
- ************************************************************************* 
- *                                                                       * 
- *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
- *   It is described in NAS Technical Report 95-020.                     * 
- *                                                                       * 
- *   Permission to use, copy, distribute and modify this software        * 
- *   for any purpose with or without fee is hereby granted.  We          * 
- *   request, however, that all derived work reference the NAS           * 
- *   Parallel Benchmarks 3.3. This software is provided "as is"          *
- *   without express or implied warranty.                                * 
- *                                                                       * 
- *   Information on NPB 3.3, including the technical report, the         *
- *   original specifications, source code, results and information       * 
- *   on how to submit new results, is available at:                      * 
- *                                                                       * 
- *          http://www.nas.nasa.gov/Software/NPB                         * 
- *                                                                       * 
- *   Send comments or suggestions to  npb@nas.nasa.gov                   * 
- *   Send bug reports to              npb-bugs@nas.nasa.gov              * 
- *                                                                       * 
- *         NAS Parallel Benchmarks Group                                 * 
- *         NASA Ames Research Center                                     * 
- *         Mail Stop: T27A-1                                             * 
- *         Moffett Field, CA   94035-1000                                * 
- *                                                                       * 
- *         E-mail:  npb@nas.nasa.gov                                     * 
- *         Fax:     (650) 604-3957                                       * 
- *                                                                       * 
- ************************************************************************* 
- *                                                                       * 
- *   Author: M. Yarrow                                                   * 
- *           H. Jin                                                      * 
- *                                                                       * 
- *************************************************************************/
-
-#include "mpi.h"
-#include "npbparams.h"
-#include <stdlib.h>
-#include <stdio.h>
-
-#include "simgrid/instr.h" //TRACE_
-
-/******************/
-/* default values */
-/******************/
-#ifndef CLASS
-#define CLASS 'S'
-#define NUM_PROCS            1                 
-#endif
-#define MIN_PROCS            1
-
-
-/*************/
-/*  CLASS S  */
-/*************/
-#if CLASS == 'S'
-#define  TOTAL_KEYS_LOG_2    16
-#define  MAX_KEY_LOG_2       11
-#define  NUM_BUCKETS_LOG_2   9
-#endif
-
-
-/*************/
-/*  CLASS W  */
-/*************/
-#if CLASS == 'W'
-#define  TOTAL_KEYS_LOG_2    20
-#define  MAX_KEY_LOG_2       16
-#define  NUM_BUCKETS_LOG_2   10
-#endif
-
-/*************/
-/*  CLASS A  */
-/*************/
-#if CLASS == 'A'
-#define  TOTAL_KEYS_LOG_2    23
-#define  MAX_KEY_LOG_2       19
-#define  NUM_BUCKETS_LOG_2   10
-#endif
-
-
-/*************/
-/*  CLASS B  */
-/*************/
-#if CLASS == 'B'
-#define  TOTAL_KEYS_LOG_2    25
-#define  MAX_KEY_LOG_2       21
-#define  NUM_BUCKETS_LOG_2   10
-#endif
-
-
-/*************/
-/*  CLASS C  */
-/*************/
-#if CLASS == 'C'
-#define  TOTAL_KEYS_LOG_2    27
-#define  MAX_KEY_LOG_2       23
-#define  NUM_BUCKETS_LOG_2   10
-#endif
-
-
-/*************/
-/*  CLASS D  */
-/*************/
-#if CLASS == 'D'
-#define  TOTAL_KEYS_LOG_2    29
-#define  MAX_KEY_LOG_2       27
-#define  NUM_BUCKETS_LOG_2   10
-#undef   MIN_PROCS
-#define  MIN_PROCS           4
-#endif
-
-
-#define  TOTAL_KEYS          (1 << TOTAL_KEYS_LOG_2)
-#define  MAX_KEY             (1 << MAX_KEY_LOG_2)
-#define  NUM_BUCKETS         (1 << NUM_BUCKETS_LOG_2)
-#define  NUM_KEYS            (TOTAL_KEYS/NUM_PROCS*MIN_PROCS)
-
-/*****************************************************************/
-/* On larger number of processors, since the keys are (roughly)  */ 
-/* gaussian distributed, the first and last processor sort keys  */ 
-/* in a large interval, requiring array sizes to be larger. Note */
-/* that for large NUM_PROCS, NUM_KEYS is, however, a small number*/
-/* The required array size also depends on the bucket size used. */
-/* The following values are validated for the 1024-bucket setup. */
-/*****************************************************************/
-#if   NUM_PROCS < 256
-#define  SIZE_OF_BUFFERS     3*NUM_KEYS/2
-#elif NUM_PROCS < 512
-#define  SIZE_OF_BUFFERS     5*NUM_KEYS/2
-#elif NUM_PROCS < 1024
-#define  SIZE_OF_BUFFERS     4*NUM_KEYS
-#else
-#define  SIZE_OF_BUFFERS     13*NUM_KEYS/2
-#endif
-
-/*****************************************************************/
-/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */
-/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE     */
-/* MAX_PROCS AT YOUR PERIL                                       */
-/*****************************************************************/
-#if CLASS == 'S'
-#define  MAX_PROCS           128
-#else
-#define  MAX_PROCS           1024
-#endif
-
-#define  MAX_ITERATIONS      10
-#define  TEST_ARRAY_SIZE     5
-
-
-/***********************************/
-/* Enable separate communication,  */
-/* computation timing and printout */
-/***********************************/
-/* #define  TIMING_ENABLED         */
-
-
-/*************************************/
-/* Typedef: if necessary, change the */
-/* size of int here by changing the  */
-/* int type to, say, long            */
-/*************************************/
-typedef  int  INT_TYPE;
-typedef  long INT_TYPE2;
-#define MP_KEY_TYPE MPI_INT
-
-
-typedef struct {
-
-/********************/
-/* MPI properties:  */
-/********************/
-int      my_rank,
-         comm_size;
-
-
-/********************/
-/* Some global info */
-/********************/
-INT_TYPE *key_buff_ptr_global,         /* used by full_verify to get */
-         total_local_keys,             /* copies of rank info        */
-         total_lesser_keys;
-
-
-int      passed_verification;
-                                 
-
-
-/************************************/
-/* These are the three main arrays. */
-/* See SIZE_OF_BUFFERS def above    */
-/************************************/
-INT_TYPE key_array[SIZE_OF_BUFFERS],    
-         key_buff1[SIZE_OF_BUFFERS],    
-         key_buff2[SIZE_OF_BUFFERS],
-         bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE],     /* Top 5 elements for */
-         bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals */
-         bucket_ptrs[NUM_BUCKETS],
-         process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE],   
-         process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE];   
-int      send_count[MAX_PROCS], recv_count[MAX_PROCS],
-         send_displ[MAX_PROCS], recv_displ[MAX_PROCS];
-
-
-/**********************/
-/* Partial verif info */
-/**********************/
-INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
-         test_rank_array[TEST_ARRAY_SIZE];
-
-/**********/
-/* Timers */
-/**********/
-double start[64], elapsed[64];
-
-} global_data;
-
-
-const INT_TYPE2
-         S_test_index_array[TEST_ARRAY_SIZE] = 
-                             {48427,17148,23627,62548,4431},
-         S_test_rank_array[TEST_ARRAY_SIZE] = 
-                             {0,18,346,64917,65463},
-
-         W_test_index_array[TEST_ARRAY_SIZE] = 
-                             {357773,934767,875723,898999,404505},
-         W_test_rank_array[TEST_ARRAY_SIZE] = 
-                             {1249,11698,1039987,1043896,1048018},
-
-         A_test_index_array[TEST_ARRAY_SIZE] = 
-                             {2112377,662041,5336171,3642833,4250760},
-         A_test_rank_array[TEST_ARRAY_SIZE] = 
-                             {104,17523,123928,8288932,8388264},
-
-         B_test_index_array[TEST_ARRAY_SIZE] = 
-                             {41869,812306,5102857,18232239,26860214},
-         B_test_rank_array[TEST_ARRAY_SIZE] = 
-                             {33422937,10244,59149,33135281,99}, 
-
-         C_test_index_array[TEST_ARRAY_SIZE] = 
-                             {44172927,72999161,74326391,129606274,21736814},
-         C_test_rank_array[TEST_ARRAY_SIZE] = 
-                             {61147,882988,266290,133997595,133525895},
-
-         D_test_index_array[TEST_ARRAY_SIZE] = 
-                             {1317351170,995930646,1157283250,1503301535,1453734525},
-         D_test_rank_array[TEST_ARRAY_SIZE] = 
-                             {1,36538729,1978098519,2145192618,2147425337};
-
-
-
-/***********************/
-/* function prototypes */
-/***********************/
-double  randlc( double *X, double *A );
-
-void full_verify( global_data* gd );
-
-void c_print_results( char   *name,
-                      char   class,
-                      int    n1, 
-                      int    n2,
-                      int    n3,
-                      int    niter,
-                      int    nprocs_compiled,
-                      int    nprocs_total,
-                      double t,
-                      double mops,
-          char   *optype,
-                      int    passed_verification,
-                      char   *npbversion,
-                      char   *compiletime,
-                      char   *mpicc,
-                      char   *clink,
-                      char   *cmpi_lib,
-                      char   *cmpi_inc,
-                      char   *cflags,
-                      char   *clinkflags );
-
-void    timer_clear(global_data* gd, int n );
-void    timer_start(global_data* gd, int n );
-void    timer_stop(global_data* gd, int n );
-double  timer_read(global_data* gd, int n );
-
-void    timer_clear(global_data* gd, int n ) {
-   gd->elapsed[n] = 0.0;
-}
-
-void    timer_start(global_data* gd, int n ) {
-   gd->start[n] = MPI_Wtime();
-}
-
-void    timer_stop(global_data* gd, int n ) {
-   gd->elapsed[n] += MPI_Wtime() - gd->start[n];
-}
-
-double  timer_read(global_data* gd, int n ) {
-   return gd->elapsed[n];
-}
-
-
-/*
- *    FUNCTION RANDLC (X, A)
- *
- *  This routine returns a uniform pseudorandom double precision number in the
- *  range (0, 1) by using the linear congruential generator
- *
- *  x_{k+1} = a x_k  (mod 2^46)
- *
- *  where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
- *  before repeating.  The argument A is the same as 'a' in the above formula,
- *  and X is the same as x_0.  A and X must be odd double precision integers
- *  in the range (1, 2^46).  The returned value RANDLC is normalized to be
- *  between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
- *  the new seed x_1, so that subsequent calls to RANDLC using the same
- *  arguments will generate a continuous sequence.
- *
- *  This routine should produce the same results on any computer with at least
- *  48 mantissa bits in double precision floating point data.  On Cray systems,
- *  double precision should be disabled.
- *
- *  David H. Bailey     October 26, 1990
- *
- *     IMPLICIT DOUBLE PRECISION (A-H, O-Z)
- *     SAVE KS, R23, R46, T23, T46
- *     DATA KS/0/
- *
- *  If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
- *  T23 = 2 ^ 23, and T46 = 2 ^ 46.  These are computed in loops, rather than
- *  by merely using the ** operator, in order to insure that the results are
- *  exact on all systems.  This code assumes that 0.5D0 is represented exactly.
- */
-
-
-/*****************************************************************/
-/*************           R  A  N  D  L  C             ************/
-/*************                                        ************/
-/*************    portable random number generator    ************/
-/*****************************************************************/
-
-double  randlc( double *X, double *A )
-{
-      static int        KS=0;
-      static double  R23, R46, T23, T46;
-      double    T1, T2, T3, T4;
-      double    A1;
-      double    A2;
-      double    X1;
-      double    X2;
-      double    Z;
-      int         i, j;
-
-      if (KS == 0) 
-      {
-        R23 = 1.0;
-        R46 = 1.0;
-        T23 = 1.0;
-        T46 = 1.0;
-    
-        for (i=1; i<=23; i++)
-        {
-          R23 = 0.50 * R23;
-          T23 = 2.0 * T23;
-        }
-        for (i=1; i<=46; i++)
-        {
-          R46 = 0.50 * R46;
-          T46 = 2.0 * T46;
-        }
-        KS = 1;
-      }
-
-/*  Break A into two parts such that A = 2^23 * A1 + A2 and set X = N.  */
-
-      T1 = R23 * *A;
-      j  = T1;
-      A1 = j;
-      A2 = *A - T23 * A1;
-
-/*  Break X into two parts such that X = 2^23 * X1 + X2, compute
-    Z = A1 * X2 + A2 * X1  (mod 2^23), and then
-    X = 2^23 * Z + A2 * X2  (mod 2^46).                            */
-
-      T1 = R23 * *X;
-      j  = T1;
-      X1 = j;
-      X2 = *X - T23 * X1;
-      T1 = A1 * X2 + A2 * X1;
-      
-      j  = R23 * T1;
-      T2 = j;
-      Z = T1 - T23 * T2;
-      T3 = T23 * Z + A2 * X2;
-      j  = R46 * T3;
-      T4 = j;
-      *X = T3 - T46 * T4;
-      return(R46 * *X);
-} 
-
-
-
-/*****************************************************************/
-/************   F  I  N  D  _  M  Y  _  S  E  E  D    ************/
-/************                                         ************/
-/************ returns parallel random number seq seed ************/
-/*****************************************************************/
-
-/*
- * Create a random number sequence of total length nn residing
- * on np number of processors.  Each processor will therefore have a 
- * subsequence of length nn/np.  This routine returns that random 
- * number which is the first random number for the subsequence belonging
- * to processor rank kn, and which is used as seed for proc kn ran # gen.
- */
-
-double   find_my_seed( int  kn,       /* my processor rank, 0<=kn<=num procs */
-                       int  np,       /* np = num procs                      */
-                       long nn,       /* total num of ran numbers, all procs */
-                       double s,      /* Ran num seed, for ex.: 314159265.00 */
-                       double a )     /* Ran num gen mult, try 1220703125.00 */
-{
-
-  long   i;
-
-  double t1,t2,t3,an;
-  long   mq,nq,kk,ik;
-
-
-
-      nq = nn / np;
-
-      for( mq=0; nq>1; mq++,nq/=2 )
-          ;
-
-      t1 = a;
-
-      for( i=1; i<=mq; i++ )
-        t2 = randlc( &t1, &t1 );
-
-      an = t1;
-
-      kk = kn;
-      t1 = s;
-      t2 = an;
-
-      for( i=1; i<=100; i++ )
-      {
-        ik = kk / 2;
-        if( 2 * ik !=  kk ) 
-            t3 = randlc( &t1, &t2 );
-        if( ik == 0 ) 
-            break;
-        t3 = randlc( &t2, &t2 );
-        kk = ik;
-      }
-
-      return( t1 );
-
-}
-
-
-
-
-/*****************************************************************/
-/*************      C  R  E  A  T  E  _  S  E  Q      ************/
-/*****************************************************************/
-
-void  create_seq( global_data* gd, double seed, double a )
-{
-  double x;
-  int    i, k;
-
-        k = MAX_KEY/4;
-
-  for (i=0; i<NUM_KEYS; i++)
-  {
-      x = randlc(&seed, &a);
-      x += randlc(&seed, &a);
-          x += randlc(&seed, &a);
-      x += randlc(&seed, &a);  
-
-            gd->key_array[i] = k*x;
-  }
-}
-
-
-
-
-/*****************************************************************/
-/*************    F  U  L  L  _  V  E  R  I  F  Y     ************/
-/*****************************************************************/
-
-
-void full_verify( global_data* gd )
-{
-    MPI_Status  status;
-    MPI_Request request;
-    
-    INT_TYPE    i, j;
-    INT_TYPE    k, last_local_key;
-
-    
-/*  Now, finally, sort the keys:  */
-    for( i=0; i<gd->total_local_keys; i++ )
-        gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]-
-                                 gd->total_lesser_keys] = gd->key_buff2[i];
-    last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);
-
-/*  Send largest key value to next processor  */
-    if( gd->my_rank > 0 )
-        MPI_Irecv( &k,
-                   1,
-                   MP_KEY_TYPE,
-                   gd->my_rank-1,
-                   1000,
-                   MPI_COMM_WORLD,
-                   &request );                   
-    if( gd->my_rank < gd->comm_size-1 )
-        MPI_Send( &gd->key_array[last_local_key],
-                  1,
-                  MP_KEY_TYPE,
-                  gd->my_rank+1,
-                  1000,
-                  MPI_COMM_WORLD );
-    if( gd->my_rank > 0 )
-        MPI_Wait( &request, &status );
-
-/*  Confirm that neighbor's greatest key value 
-    is not greater than my least key value       */              
-    j = 0;
-    if( gd->my_rank > 0 && gd->total_local_keys > 0 )
-        if( k > gd->key_array[0] )
-            j++;
-
-
-/*  Confirm keys correctly sorted: count incorrectly sorted keys, if any */
-    for( i=1; i<gd->total_local_keys; i++ )
-        if( gd->key_array[i-1] > gd->key_array[i] )
-            j++;
-
-
-    if( j != 0 )
-    {
-        printf( "Processor %d:  Full_verify: number of keys out of sort: %d\n",
-                gd->my_rank, j );
-    }
-    else
-        gd->passed_verification++;
-           
-
-}
-
-
-
-
-/*****************************************************************/
-/*************             R  A  N  K             ****************/
-/*****************************************************************/
-
-
-void rank( global_data* gd, int iteration )
-{
-
-    INT_TYPE    i, k;
-
-    INT_TYPE    shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
-    INT_TYPE    key;
-    INT_TYPE2   bucket_sum_accumulator, j, m;
-    INT_TYPE    local_bucket_sum_accumulator;
-    INT_TYPE    min_key_val, max_key_val;
-    INT_TYPE    *key_buff_ptr;
-
-
-
-
-/*  Iteration alteration of keys */  
-    if(gd->my_rank == 0 )                    
-    {
-      gd->key_array[iteration] = iteration;
-      gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
-    }
-
-
-/*  Initialize */
-    for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )  
-    {
-        gd->bucket_size[i] = 0;
-        gd->bucket_size_totals[i] = 0;
-        gd->process_bucket_distrib_ptr1[i] = 0;
-        gd->process_bucket_distrib_ptr2[i] = 0;
-    }
-
-
-/*  Determine where the partial verify test keys are, load into  */
-/*  top of array bucket_size                                     */
-    for( i=0; i<TEST_ARRAY_SIZE; i++ )
-        if( (gd->test_index_array[i]/NUM_KEYS) == gd->my_rank )
-            gd->bucket_size[NUM_BUCKETS+i] = 
-                          gd->key_array[gd->test_index_array[i] % NUM_KEYS];
-
-
-/*  Determine the number of keys in each bucket */
-    for( i=0; i<NUM_KEYS; i++ )
-        gd->bucket_size[gd->key_array[i] >> shift]++;
-
-
-/*  Accumulative bucket sizes are the bucket pointers */
-    gd->bucket_ptrs[0] = 0;
-    for( i=1; i< NUM_BUCKETS; i++ )  
-        gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];
-
-
-/*  Sort into appropriate bucket */
-    for( i=0; i<NUM_KEYS; i++ )  
-    {
-        key = gd->key_array[i];
-        gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
-    }
-
-#ifdef  TIMING_ENABLED
-    timer_stop(gd, 2 );
-    timer_start(gd, 3 );
-#endif
-
-/*  Get the bucket size totals for the entire problem. These 
-    will be used to determine the redistribution of keys      */
-    MPI_Allreduce( gd->bucket_size, 
-                   gd->bucket_size_totals, 
-                   NUM_BUCKETS+TEST_ARRAY_SIZE, 
-                   MP_KEY_TYPE,
-                   MPI_SUM,
-                   MPI_COMM_WORLD );
-
-#ifdef  TIMING_ENABLED
-    timer_stop(gd, 3 );
-    timer_start(gd, 2 );
-#endif
-
-/*  Determine Redistibution of keys: accumulate the bucket size totals 
-    till this number surpasses NUM_KEYS (which the average number of keys
-    per processor).  Then all keys in these buckets go to processor 0.
-    Continue accumulating again until supassing 2*NUM_KEYS. All keys
-    in these buckets go to processor 1, etc.  This algorithm guarantees
-    that all processors have work ranking; no processors are left idle.
-    The optimum number of buckets, however, does not result in as high
-    a degree of load balancing (as even a distribution of keys as is
-    possible) as is obtained from increasing the number of buckets, but
-    more buckets results in more computation per processor so that the
-    optimum number of buckets turns out to be 1024 for machines tested.
-    Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket
-    number of first and last bucket which each processor will have after   
-    the redistribution is done.                                          */
-
-    bucket_sum_accumulator = 0;
-    local_bucket_sum_accumulator = 0;
-    gd->send_displ[0] = 0;
-    gd->process_bucket_distrib_ptr1[0] = 0;
-    for( i=0, j=0; i<NUM_BUCKETS; i++ )  
-    {
-        bucket_sum_accumulator       += gd->bucket_size_totals[i];
-        local_bucket_sum_accumulator += gd->bucket_size[i];
-        if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )  
-        {
-            gd->send_count[j] = local_bucket_sum_accumulator;
-            if( j != 0 )
-            {
-                gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
-                gd->process_bucket_distrib_ptr1[j] = 
-                                        gd->process_bucket_distrib_ptr2[j-1]+1;
-            }
-            gd->process_bucket_distrib_ptr2[j++] = i;
-            local_bucket_sum_accumulator = 0;
-        }
-    }
-
-/*  When NUM_PROCS approaching NUM_BUCKETS, it is highly possible
-    that the last few processors don't get any buckets.  So, we
-    need to set counts properly in this case to avoid any fallouts.    */
-    while( j < gd->comm_size )
-    {
-        gd->send_count[j] = 0;
-        gd->process_bucket_distrib_ptr1[j] = 1;
-        j++;
-    }
-
-#ifdef  TIMING_ENABLED
-    timer_stop(gd, 2 );
-    timer_start(gd, 3 ); 
-#endif
-
-/*  This is the redistribution section:  first find out how many keys
-    each processor will send to every other processor:                 */
-    MPI_Alltoall( gd->send_count,
-                  1,
-                  MPI_INT,
-                  gd->recv_count,
-                  1,
-                  MPI_INT,
-                  MPI_COMM_WORLD );
-
-/*  Determine the receive array displacements for the buckets */    
-    gd->recv_displ[0] = 0;
-    for( i=1; i<gd->comm_size; i++ )
-        gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];
-
-
-/*  Now send the keys to respective processors  */    
-    MPI_Alltoallv( gd->key_buff1,
-                   gd->send_count,
-                   gd->send_displ,
-                   MP_KEY_TYPE,
-                   gd->key_buff2,
-                   gd->recv_count,
-                   gd->recv_displ,
-                   MP_KEY_TYPE,
-                   MPI_COMM_WORLD );
-
-#ifdef  TIMING_ENABLED
-    timer_stop(gd, 3 ); 
-    timer_start(gd, 2 );
-#endif
-
-/*  The starting and ending bucket numbers on each processor are
-    multiplied by the interval size of the buckets to obtain the 
-    smallest possible min and greatest possible max value of any 
-    key on each processor                                          */
-    min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
-    max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;
-
-/*  Clear the work array */
-    for( i=0; i<max_key_val-min_key_val+1; i++ )
-        gd->key_buff1[i] = 0;
-
-/*  Determine the total number of keys on all other 
-    processors holding keys of lesser value         */
-    m = 0;
-    for( k=0; k<gd->my_rank; k++ )
-        for( i= gd->process_bucket_distrib_ptr1[k];
-             i<=gd->process_bucket_distrib_ptr2[k];
-             i++ )  
-            m += gd->bucket_size_totals[i]; /*  m has total # of lesser keys */
-
-/*  Determine total number of keys on this processor */
-    j = 0;                                 
-    for( i= gd->process_bucket_distrib_ptr1[gd->my_rank];
-         i<=gd->process_bucket_distrib_ptr2[gd->my_rank];
-         i++ )  
-        j += gd->bucket_size_totals[i];     /* j has total # of local keys   */
-
-
-/*  Ranking of all keys occurs in this section:                 */
-/*  shift it backwards so no subtractions are necessary in loop */
-    key_buff_ptr = gd->key_buff1 - min_key_val;
-
-/*  In this section, the keys themselves are used as their 
-    own indexes to determine how many of each there are: their
-    individual population                                       */
-    for( i=0; i<j; i++ )
-        key_buff_ptr[gd->key_buff2[i]]++;  /* Now they have individual key   */
-                                       /* population                     */
-
-/*  To obtain ranks of each key, successively add the individual key
-    population, not forgetting the total of lesser keys, m.
-    NOTE: Since the total of lesser keys would be subtracted later 
-    in verification, it is no longer added to the first key population 
-    here, but still needed during the partial verify test.  This is to 
-    ensure that 32-bit key_buff can still be used for class D.           */
-/*    key_buff_ptr[min_key_val] += m;    */
-    for( i=min_key_val; i<max_key_val; i++ )   
-        key_buff_ptr[i+1] += key_buff_ptr[i];  
-
-
-/* This is the partial verify test section */
-/* Observe that test_rank_array vals are   */
-/* shifted differently for different cases */
-    for( i=0; i<TEST_ARRAY_SIZE; i++ )
-    {                                             
-        k = gd->bucket_size_totals[i+NUM_BUCKETS];    /* Keys were hidden here */
-        if( min_key_val <= k  &&  k <= max_key_val )
-        {
-            /* Add the total of lesser keys, m, here */
-            INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
-            int failed = 0;
-
-            switch( CLASS )
-            {
-                case 'S':
-                    if( i <= 2 )
-                    {
-                        if( key_rank != gd->test_rank_array[i]+iteration )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-                    }
-                    else
-                    {
-                        if( key_rank != gd->test_rank_array[i]-iteration )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-                    }
-                    break;
-                case 'W':
-                    if( i < 2 )
-                    {
-                        if( key_rank != gd->test_rank_array[i]+(iteration-2) )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-                    }
-                    else
-                    {
-                        if( key_rank != gd->test_rank_array[i]-iteration )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-                    }
-                    break;
-                case 'A':
-                    if( i <= 2 )
-              {
-                        if( key_rank != gd->test_rank_array[i]+(iteration-1) )
-                            failed = 1;
-                        else
-                          gd->passed_verification++;
-              }
-                    else
-                    {
-                        if( key_rank !=  gd->test_rank_array[i]-(iteration-1) )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-                    }
-                    break;
-                case 'B':
-                    if( i == 1 || i == 2 || i == 4 )
-              {
-                        if( key_rank != gd->test_rank_array[i]+iteration )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-              }
-                    else
-                    {
-                        if( key_rank != gd->test_rank_array[i]-iteration )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-                    }
-                    break;
-                case 'C':
-                    if( i <= 2 )
-              {
-                        if( key_rank != gd->test_rank_array[i]+iteration )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-              }
-                    else
-                    {
-                        if( key_rank != gd->test_rank_array[i]-iteration )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-                    }
-                    break;
-                case 'D':
-                    if( i < 2 )
-              {
-                        if( key_rank != gd->test_rank_array[i]+iteration )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-              }
-                    else
-                    {
-                        if( key_rank != gd->test_rank_array[i]-iteration )
-                            failed = 1;
-                        else
-                            gd->passed_verification++;
-                    }
-                    break;
-            }
-            if( failed == 1 )
-                printf( "Failed partial verification: "
-                        "iteration %d, processor %d, test key %d\n", 
-                         iteration, gd->my_rank, (int)i );
-        }
-    }
-
-
-
-
-/*  Make copies of rank info for use by full_verify: these variables
-    in rank are local; making them global slows down the code, probably
-    since they cannot be made register by compiler                        */
-
-    if( iteration == MAX_ITERATIONS ) 
-    {
-        gd->key_buff_ptr_global = key_buff_ptr;
-        gd->total_local_keys    = j;
-        gd->total_lesser_keys   = 0;  /* no longer set to 'm', see note above */
-    }
-
-}      
-
-
-/*****************************************************************/
-/*************             M  A  I  N             ****************/
-/*****************************************************************/
-
-int main( int argc, char **argv )
-{
-
-    int             i, iteration, itemp;
-
-    double          timecounter, maxtime;
-
-    global_data* gd = malloc(sizeof(global_data));
-/*  Initialize MPI */
-    MPI_Init( &argc, &argv );
-    MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
-    MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );
-
-/*  Initialize the verification arrays if a valid class */
-    for( i=0; i<TEST_ARRAY_SIZE; i++ )
-        switch( CLASS )
-        {
-            case 'S':
-                gd->test_index_array[i] = S_test_index_array[i];
-                gd->test_rank_array[i]  = S_test_rank_array[i];
-                break;
-            case 'A':
-                gd->test_index_array[i] = A_test_index_array[i];
-                gd->test_rank_array[i]  = A_test_rank_array[i];
-                break;
-            case 'W':
-                gd->test_index_array[i] = W_test_index_array[i];
-                gd->test_rank_array[i]  = W_test_rank_array[i];
-                break;
-            case 'B':
-                gd->test_index_array[i] = B_test_index_array[i];
-                gd->test_rank_array[i]  = B_test_rank_array[i];
-                break;
-            case 'C':
-                gd->test_index_array[i] = C_test_index_array[i];
-                gd->test_rank_array[i]  = C_test_rank_array[i];
-                break;
-            case 'D':
-                gd->test_index_array[i] = D_test_index_array[i];
-                gd->test_rank_array[i]  = D_test_rank_array[i];
-                break;
-        };
-
-        
-
-/*  Printout initial NPB info */
-    if( gd->my_rank == 0 )
-    {
-        printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
-        printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS );
-        printf( " Iterations:   %d\n", MAX_ITERATIONS );
-        printf( " Number of processes:     %d\n",gd->comm_size );
-    }
-
-/*  Check that actual and compiled number of processors agree */
-    if( gd->comm_size != NUM_PROCS )
-    {
-        if( gd->my_rank == 0 )
-            printf( "\n ERROR: compiled for %d processes\n"
-                    " Number of active processes: %d\n"
-                    " Exiting program!\n\n", NUM_PROCS, gd->comm_size );
-        MPI_Finalize();
-        exit( 1 );
-    }
-
-/*  Check to see whether total number of processes is within bounds.
-    This could in principle be checked in setparams.c, but it is more
-    convenient to do it here                                               */
-    if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS)
-    {
-       if( gd->my_rank == 0 )
-           printf( "\n ERROR: number of processes %d not within range %d-%d"
-                   "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS);
-       MPI_Finalize();
-       exit( 1 );
-    }
-
-
-/*  Generate random number sequence and subsequent keys on all procs */
-    create_seq(gd,  find_my_seed( gd->my_rank, 
-                              gd->comm_size, 
-                              4*(long)TOTAL_KEYS*MIN_PROCS,
-                              314159265.00,      /* Random number gen seed */
-                              1220703125.00 ),   /* Random number gen mult */
-                1220703125.00 );                 /* Random number gen mult */
-
-/*  Do one interation for free (i.e., untimed) to guarantee initialization of  
-    all data and code pages and respective tables */
-    rank(gd, 1 );  
-
-/*  Start verification counter */
-    gd->passed_verification = 0;
-
-    if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n   iteration\n" );
-
-/*  Initialize timer  */             
-    timer_clear(gd, 0 );
-
-/*  Initialize separate communication, computation timing */
-#ifdef  TIMING_ENABLED 
-    for( i=1; i<=3; i++ ) timer_clear(gd, i );
-#endif
-
-/*  Start timer  */             
-    timer_start(gd, 0 );
-
-#ifdef  TIMING_ENABLED
-    timer_start(gd, 1 );
-    timer_start(gd, 2 );
-#endif
-
-    char smpi_category[100];
-    snprintf (smpi_category, 100, "%d", gd->my_rank);
-    TRACE_smpi_set_category (smpi_category);
-
-/*  This is the main iteration */
-    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
-    {
-        if( gd->my_rank == 0 && CLASS != 'S' ) printf( "        %d\n", iteration );
-        rank(gd,  iteration );
-    }
-    TRACE_smpi_set_category (NULL);
-
-#ifdef  TIMING_ENABLED
-    timer_stop(gd, 2 );
-    timer_stop(gd, 1 );
-#endif
-
-/*  Stop timer, obtain time for processors */
-    timer_stop(gd, 0 );
-
-    timecounter = timer_read(gd, 0 );
-
-/*  End of timing, obtain maximum time of all processors */
-    MPI_Reduce( &timecounter,
-                &maxtime,
-                1,
-                MPI_DOUBLE,
-                MPI_MAX,
-                0,
-                MPI_COMM_WORLD );
-
-#ifdef  TIMING_ENABLED
-    {
-        double    tmin, tsum, tmax;
-    
-        if( my_rank == 0 )
-        {
-            printf( "\ntimer 1/2/3 = total/computation/communication time\n");
-            printf( "              min                avg                max\n" );
-        }
-        for( i=1; i<=3; i++ )
-        {
-            timecounter = timer_read(gd, i );
-            MPI_Reduce( &timecounter,
-                        &tmin,
-                        1,
-                        MPI_DOUBLE,
-                        MPI_MIN,
-                        0,
-                        MPI_COMM_WORLD );
-            MPI_Reduce( &timecounter,
-                        &tsum,
-                        1,
-                        MPI_DOUBLE,
-                        MPI_SUM,
-                        0,
-                        MPI_COMM_WORLD );
-            MPI_Reduce( &timecounter,
-                        &tmax,
-                        1,
-                        MPI_DOUBLE,
-                        MPI_MAX,
-                        0,
-                        MPI_COMM_WORLD );
-            if( my_rank == 0 )
-                printf( "timer %d:    %f           %f            %f\n",
-                        i, tmin, tsum/((double) comm_size), tmax );
-        }
-        if( my_rank == 0 )
-            printf( "\n" );
-    }
-#endif
-
-/*  This tests that keys are in sequence: sorting of last ranked key seq
-    occurs here, but is an untimed operation                             */
-    full_verify(gd);
-
-
-/*  Obtain verification counter sum */
-    itemp =gd->passed_verification;
-    MPI_Reduce( &itemp,
-                &gd->passed_verification,
-                1,
-                MPI_INT,
-                MPI_SUM,
-                0,
-                MPI_COMM_WORLD );
-
-
-
-/*  The final printout  */
-    if( gd->my_rank == 0 )
-    {
-        if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
-            gd->passed_verification = 0;
-        c_print_results( "IS",
-                         CLASS,
-                         (int)(TOTAL_KEYS),
-                         MIN_PROCS,
-                         0,
-                         MAX_ITERATIONS,
-                         NUM_PROCS,
-                         gd->comm_size,
-                         maxtime,
-                         ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS)
-                                                      /maxtime/1000000.,
-                         "keys ranked", 
-                         gd->passed_verification,
-                         NPBVERSION,
-                         COMPILETIME,
-                         MPICC,
-                         CLINK,
-                         CMPI_LIB,
-                         CMPI_INC,
-                         CFLAGS,
-                         CLINKFLAGS );
-    }
-                    
-    MPI_Finalize();
-    free(gd);
-
-    return 0;
-         /**************************/
-}        /*  E N D  P R O G R A M  */
-         /**************************/
diff --git a/examples/smpi/NAS/IS/is.c b/examples/smpi/NAS/IS/is.c
index d617726937..57b370bb39 100644
--- a/examples/smpi/NAS/IS/is.c
+++ b/examples/smpi/NAS/IS/is.c
@@ -44,6 +44,8 @@
 #include <stdlib.h>
 #include <stdio.h>
 
+#include "simgrid/instr.h" //TRACE_
+
 /******************/
 /* default values */
 /******************/
@@ -1025,13 +1027,17 @@ int main( int argc, char **argv )
     timer_start(gd, 2 );
 #endif
 
+    char smpi_category[100];
+    snprintf (smpi_category, 100, "%d", gd->my_rank);
+    TRACE_smpi_set_category (smpi_category);
+
 /*  This is the main iteration */
     for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
     {
         if( gd->my_rank == 0 && CLASS != 'S' ) printf( "        %d\n", iteration );
         rank(gd,  iteration );
     }
-
+    TRACE_smpi_set_category (NULL);
 
 #ifdef  TIMING_ENABLED
     timer_stop(gd, 2 );
diff --git a/examples/smpi/NAS/config/NAS.samples/README b/examples/smpi/NAS/config/NAS.samples/README
deleted file mode 100644
index ae535e95cf..0000000000
--- a/examples/smpi/NAS/config/NAS.samples/README
+++ /dev/null
@@ -1,7 +0,0 @@
-This directory contains examples of make.def files that were used 
-by the NPB team in testing the benchmarks on different platforms. 
-They can be used as starting points for make.def files for your 
-own platform, but you may need to taylor them for best performance 
-on your installation. A clean template can be found in directory 
-`config'.
-Some examples of suite.def files are also provided.
\ No newline at end of file
diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.dec_alpha b/examples/smpi/NAS/config/NAS.samples/make.def.dec_alpha
deleted file mode 100644
index 44f045354d..0000000000
--- a/examples/smpi/NAS/config/NAS.samples/make.def.dec_alpha
+++ /dev/null
@@ -1,18 +0,0 @@
-#This is for a DEC Alpha 8400. The code will execute on a 
-#single processor
-#Warning: parallel make does not work properly in general
-MPIF77  = f77
-FLINK   = f77
-#Optimization -O5 breaks SP; works fine for all other codes
-FFLAGS  = -O4
-
-MPICC   = cc
-CLINK   = cc
-CFLAGS  = -O5 
-
-include ../config/make.dummy
-
-CC      = cc -g
-BINDIR  = ../bin
-
-RAND   = randi8
diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.irix6.2 b/examples/smpi/NAS/config/NAS.samples/make.def.irix6.2
deleted file mode 100644
index f76404755a..0000000000
--- a/examples/smpi/NAS/config/NAS.samples/make.def.irix6.2
+++ /dev/null
@@ -1,16 +0,0 @@
-#This is for a generic single-processor SGI workstation
-MPIF77 = f77
-FLINK	= f77
-FFLAGS	= -O3
-
-MPICC = cc
-CLINK	= cc
-CFLAGS	= -O3 
-
-include ../config/make.dummy
-
-CC	= cc -g
-BINDIR	= ../bin
-
-RAND   = randi8
-
diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.origin b/examples/smpi/NAS/config/NAS.samples/make.def.origin
deleted file mode 100644
index 11c63c975c..0000000000
--- a/examples/smpi/NAS/config/NAS.samples/make.def.origin
+++ /dev/null
@@ -1,20 +0,0 @@
-# This is for a an SGI Origin 2000 or 3000 with vendor MPI. The Fortran
-# record length is specified, so it can be used for the I/O benchmark.
-# as well
-MPIF77   = f77 
-FMPI_LIB = -lmpi
-FLINK    = f77 -64
-FFLAGS   = -O3 -64
-
-MPICC    = cc
-CMPI_LIB = -lmpi
-CLINK    = cc
-CFLAGS   = -O3 
-
-CC       = cc -g
-BINDIR   = ../bin
-
-RAND   = randi8
-
-CONVERTFLAG = -DFORTRAN_REC_SIZE=4
-
diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.sgi_powerchallenge b/examples/smpi/NAS/config/NAS.samples/make.def.sgi_powerchallenge
deleted file mode 100644
index 379726d057..0000000000
--- a/examples/smpi/NAS/config/NAS.samples/make.def.sgi_powerchallenge
+++ /dev/null
@@ -1,16 +0,0 @@
-# This is for the SGI PowerChallenge Array at NASA Ames. mrf77 and 
-# mrcc are local scripts that invoke the proper MPI library.
-MPIF77 = mrf77
-FLINK  = mrf77
-FFLAGS = -O3 -OPT:fold_arith_limit=1204
-
-MPICC  = mrcc
-CLINK  = mrcc
-CFLAGS = -O3 -OPT:fold_arith_limit=1204
-
-CC     = cc -g
-BINDIR = ../bin
-
-RAND   = randi8
-
-
diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.sp2_babbage b/examples/smpi/NAS/config/NAS.samples/make.def.sp2_babbage
deleted file mode 100644
index 7896d56f7d..0000000000
--- a/examples/smpi/NAS/config/NAS.samples/make.def.sp2_babbage
+++ /dev/null
@@ -1,17 +0,0 @@
-#This is for the IBM SP2 at Ames; mrf77 and mrcc are local scripts
-MPIF77     = mrf77
-FLINK      = mrf77
-FFLAGS     = -O3 
-FLINKFLAGS = -bmaxdata:0x60000000
-
-MPICC      = mrcc
-CLINK      = mrcc
-CFLAGS     = -O3 
-CLINKFLAGS = -bmaxdata:0x60000000
-
-CC         = cc -g
-
-BINDIR     = ../bin
-
-RAND       = randi8
-
diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.sun_ultra_sparc b/examples/smpi/NAS/config/NAS.samples/make.def.sun_ultra_sparc
deleted file mode 100644
index 420dfde821..0000000000
--- a/examples/smpi/NAS/config/NAS.samples/make.def.sun_ultra_sparc
+++ /dev/null
@@ -1,30 +0,0 @@
-#    This is for a Sun SparcCenter or UltraEnterprise machine 
-MPIF77     = f77
-FLINK      = f77
-FMPI_LIB   = -L<your mpich installation tree>/lib/solaris/ch_lfshmem -lmpi
-FMPI_INC   = -I<your mpich installation tree>/include
-#    sparc10,20 SparcCenter{1,2}000 (uname -m returns sun4m)
-#    and f77 -V returns 4.0 or greater
-# FFLAGS   = -fast -xtarget=super -xO4 -depend
-#    Ultra1,2, UltraEnterprise servers (uname -m returns sun4u)
-FFLAGS     = -fast -xtarget=ultra -xarch=v8plus -xO4 -depend
-FLINKFLAGS = -lmopt -lcopt -lsunmath
-
-MPICC      = cc
-CLINK      = cc
-CMPI_LIB   = -L<your mpich installation tree>/lib/solaris/ch_lfshmem -lmpi
-CMPI_INC   = -I<your mpich installation tree>/include
-#    sparc10,20 SparcCenter{1,2}000 (uname -m returns sun4m)
-#    and cc -V returns 4.0 or greater
-#CFLAGS	   =  -fast -xtarget=super -xO4 -xdepend
-#    Ultra1,2, UltraEnterprise servers (uname -m returns sun4u)
-CFLAGS     =  -fast -xtarget=ultra -xarch=v8plus -xO4 -xdepend
-CLINKFLAGS = -fast
-
-CC         = cc -g
-
-BINDIR     = ../bin
-
-#    Cannot use randi8 or randi8-safe on a 32-but machine. Use double precision
-RAND       = randdp
-
diff --git a/examples/smpi/NAS/config/NAS.samples/make.def.t3d_cosmos b/examples/smpi/NAS/config/NAS.samples/make.def.t3d_cosmos
deleted file mode 100644
index d3b3bbf528..0000000000
--- a/examples/smpi/NAS/config/NAS.samples/make.def.t3d_cosmos
+++ /dev/null
@@ -1,25 +0,0 @@
-#This is for the Cray T3D at the Jet Propulsion Laboratory
-MPIF77     = cf77
-FLINK      = cf77
-FMPI_LIB   = -L/usr/local/mpp/lib -lmpi
-FMPI_INC   = -I/usr/local/mpp/lib/include/mpp
-FFLAGS     = -dp -Wf-onoieeedivide -C cray-t3d 
-#The following flags provide more effective optimization, but may
-#cause the random number generator randi8(_safe) to break in EP
-#FFLAGS    = -dp -Wf-oaggress -Wf-onoieeedivide -C cray-t3d 
-FLINKFLAGS = -Wl-Drdahead=on -C cray-t3d
-
-MPICC      = cc
-CLINK	   = cc
-CMPI_LIB   = -L/usr/local/mpp/lib -lmpi
-CMPI_INC   = -I/usr/local/mpp/lib/include/mpp
-CFLAGS	   = -O3 -Tcray-t3d
-CLINKFLAGS = -Tcray-t3d
-
-CC	   = cc -g -Tcray-ymp
-BINDIR	   = ../bin
-
-CONVERTFLAG= -DCONVERTDOUBLE
-
-RAND       = randi8
-
diff --git a/examples/smpi/NAS/config/NAS.samples/make.def_sun_mpich b/examples/smpi/NAS/config/NAS.samples/make.def_sun_mpich
deleted file mode 100644
index 99b0b69041..0000000000
--- a/examples/smpi/NAS/config/NAS.samples/make.def_sun_mpich
+++ /dev/null
@@ -1,165 +0,0 @@
-#---------------------------------------------------------------------------
-#
-#                SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS. 
-#
-#---------------------------------------------------------------------------
-
-#---------------------------------------------------------------------------
-# Items in this file will need to be changed for each platform.
-# (Note these definitions are inconsistent with NPB2.1.)
-#---------------------------------------------------------------------------
-
-#---------------------------------------------------------------------------
-# Parallel Fortran:
-#
-# For CG, EP, FT, MG, LU, SP and BT, which are in Fortran, the following must 
-# be defined:
-#
-# MPIF77     - Fortran compiler
-# FFLAGS     - Fortran compilation arguments
-# FMPI_INC   - any -I arguments required for compiling MPI/Fortran 
-# FLINK      - Fortran linker
-# FLINKFLAGS - Fortran linker arguments
-# FMPI_LIB   - any -L and -l arguments required for linking MPI/Fortran 
-# 
-# compilations are done with $(MPIF77) $(FMPI_INC) $(FFLAGS) or
-#                            $(MPIF77) $(FFLAGS)
-# linking is done with       $(FLINK) $(FMPI_LIB) $(FLINKFLAGS)
-#---------------------------------------------------------------------------
-
-#---------------------------------------------------------------------------
-# This is the fortran compiler used for MPI programs
-#---------------------------------------------------------------------------
-MPIF77 = mpif77
-# This links MPI fortran programs; usually the same as ${MPIF77}
-FLINK	= $(MPIF77)
-
-#---------------------------------------------------------------------------
-# These macros are passed to the linker to help link with MPI correctly
-#---------------------------------------------------------------------------
-FMPI_LIB  =
-
-#---------------------------------------------------------------------------
-# These macros are passed to the compiler to help find 'mpif.h'
-#---------------------------------------------------------------------------
-FMPI_INC =
-
-#---------------------------------------------------------------------------
-# Global *compile time* flags for Fortran programs
-#---------------------------------------------------------------------------
-FFLAGS	= -fast
-# FFLAGS = -g
-
-#---------------------------------------------------------------------------
-# Global *link time* flags. Flags for increasing maximum executable 
-# size usually go here. 
-#---------------------------------------------------------------------------
-FLINKFLAGS = -fast
-
-
-#---------------------------------------------------------------------------
-# Parallel C:
-#
-# For IS, which is in C, the following must be defined:
-#
-# MPICC      - C compiler 
-# CFLAGS     - C compilation arguments
-# CMPI_INC   - any -I arguments required for compiling MPI/C 
-# CLINK      - C linker
-# CLINKFLAGS - C linker flags
-# CMPI_LIB   - any -L and -l arguments required for linking MPI/C 
-#
-# compilations are done with $(MPICC) $(CMPI_INC) $(CFLAGS) or
-#                            $(MPICC) $(CFLAGS)
-# linking is done with       $(CLINK) $(CMPI_LIB) $(CLINKFLAGS)
-#---------------------------------------------------------------------------
-
-#---------------------------------------------------------------------------
-# This is the C compiler used for MPI programs
-#---------------------------------------------------------------------------
-MPICC = mpicc
-# This links MPI C programs; usually the same as ${MPICC}
-CLINK	= $(MPICC)
-
-#---------------------------------------------------------------------------
-# These macros are passed to the linker to help link with MPI correctly
-#---------------------------------------------------------------------------
-CMPI_LIB  =
-
-#---------------------------------------------------------------------------
-# These macros are passed to the compiler to help find 'mpi.h'
-#---------------------------------------------------------------------------
-CMPI_INC =
-
-#---------------------------------------------------------------------------
-# Global *compile time* flags for C programs
-#---------------------------------------------------------------------------
-CFLAGS	= -fast
-# CFLAGS = -g
-
-#---------------------------------------------------------------------------
-# Global *link time* flags. Flags for increasing maximum executable 
-# size usually go here. 
-#---------------------------------------------------------------------------
-CLINKFLAGS = -fast
-
-
-#---------------------------------------------------------------------------
-# MPI dummy library:
-#
-# Uncomment if you want to use the MPI dummy library supplied by NAS instead 
-# of the true message-passing library. The include file redefines several of
-# the above macros. It also invokes make in subdirectory MPI_dummy. Make 
-# sure that no spaces or tabs precede include.
-#---------------------------------------------------------------------------
-# include ../config/make.dummy
-
-
-#---------------------------------------------------------------------------
-# Utilities C:
-#
-# This is the C compiler used to compile C utilities.  Flags required by 
-# this compiler go here also; typically there are few flags required; hence 
-# there are no separate macros provided for such flags.
-#---------------------------------------------------------------------------
-CC	= cc -g
-
-
-#---------------------------------------------------------------------------
-# Destination of executables, relative to subdirs of the main directory. . 
-#---------------------------------------------------------------------------
-BINDIR	= ../bin
-
-
-#---------------------------------------------------------------------------
-# Some machines (e.g. Crays) have 128-bit DOUBLE PRECISION numbers, which
-# is twice the precision required for the NPB suite. A compiler flag 
-# (e.g. -dp) can usually be used to change DOUBLE PRECISION variables to
-# 64 bits, but the MPI library may continue to send 128 bits. Short of
-# recompiling MPI, the solution is to use MPI_REAL to send these 64-bit
-# numbers, and MPI_COMPLEX to send their complex counterparts. Uncomment
-# the following line to enable this substitution. 
-# 
-# NOTE: IF THE I/O BENCHMARK IS BEING BUILT, WE USE CONVERTFLAG TO
-#       SPECIFIY THE FORTRAN RECORD LENGTH UNIT. IT IS A SYSTEM-SPECIFIC
-#       VALUE (USUALLY 1 OR 4). UNCOMMENT THE SECOND LINE AND SUBSTITUTE
-#       THE CORRECT VALUE FOR "length".
-#       IF BOTH 128-BIT DOUBLE PRECISION NUMBERS AND I/O ARE TO BE ENABLED,
-#       UNCOMMENT THE THIRD LINE AND SUBSTITUTE THE CORRECT VALUE FOR
-#       "length"
-#---------------------------------------------------------------------------
-# CONVERTFLAG	= -DCONVERTDOUBLE
-CONVERTFLAG	= -DFORTRAN_REC_SIZE=1
-# CONVERTFLAG	= -DCONVERTDOUBLE -DFORTRAN_REC_SIZE=length
-
-
-#---------------------------------------------------------------------------
-# The variable RAND controls which random number generator 
-# is used. It is described in detail in Doc/README.install. 
-# Use "randi8" unless there is a reason to use another one. 
-# Other allowed values are "randi8_safe", "randdp" and "randdpvec"
-#---------------------------------------------------------------------------
-RAND   = randi8
-# The following is highly reliable but may be slow:
-# RAND   = randdp
-
diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.bt b/examples/smpi/NAS/config/NAS.samples/suite.def.bt
deleted file mode 100644
index f330636aa0..0000000000
--- a/examples/smpi/NAS/config/NAS.samples/suite.def.bt
+++ /dev/null
@@ -1,37 +0,0 @@
-bt	S	1
-bt	S	4
-bt	S	9
-bt	S	16
-bt	A	1
-bt	A	4
-bt	A	9
-bt	A	16
-bt	A	25
-bt	A	36
-bt	A	49
-bt	A	64
-bt	A	81
-bt	A	100
-bt	A	121
-bt	B	1
-bt	B	4
-bt	B	9
-bt	B	16
-bt	B	25
-bt	B	36
-bt	B	49
-bt	B	64
-bt	B	81
-bt	B	100
-bt	B	121
-bt	C	1
-bt	C	4
-bt	C	9
-bt	C	16
-bt	C	25
-bt	C	36
-bt	C	49
-bt	C	64
-bt	C	81
-bt	C	100
-bt	C	121
diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.cg b/examples/smpi/NAS/config/NAS.samples/suite.def.cg
deleted file mode 100644
index 393bc508f6..0000000000
--- a/examples/smpi/NAS/config/NAS.samples/suite.def.cg
+++ /dev/null
@@ -1,29 +0,0 @@
-cg	S	1
-cg	S	2
-cg	S	4
-cg	S	8
-cg	S	16
-cg	A	1
-cg	A	2
-cg	A	4
-cg	A	8
-cg	A	16
-cg	A	32
-cg	A	64
-cg	A	128
-cg	B	1
-cg	B	2
-cg	B	4
-cg	B	8
-cg	B	16
-cg	B	32
-cg	B	64
-cg	B	128
-cg	C	1
-cg	C	2
-cg	C	4
-cg	C	8
-cg	C	16
-cg	C	32
-cg	C	64
-cg	C	128
diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.ft b/examples/smpi/NAS/config/NAS.samples/suite.def.ft
deleted file mode 100644
index 6f05189094..0000000000
--- a/examples/smpi/NAS/config/NAS.samples/suite.def.ft
+++ /dev/null
@@ -1,29 +0,0 @@
-ft	S	1
-ft	S	2
-ft	S	4
-ft	S	8
-ft	S	16
-ft	A	1
-ft	A	2
-ft	A	4
-ft	A	8
-ft	A	16
-ft	A	32
-ft	A	64
-ft	A	128
-ft	B	1
-ft	B	2
-ft	B	4
-ft	B	8
-ft	B	16
-ft	B	32
-ft	B	64
-ft	B	128
-ft	C	1
-ft	C	2
-ft	C	4
-ft	C	8
-ft	C	16
-ft	C	32
-ft	C	64
-ft	C	128
diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.lu b/examples/smpi/NAS/config/NAS.samples/suite.def.lu
deleted file mode 100644
index 442e0b6842..0000000000
--- a/examples/smpi/NAS/config/NAS.samples/suite.def.lu
+++ /dev/null
@@ -1,29 +0,0 @@
-lu	S	1
-lu	S	2
-lu	S	4
-lu	S	8
-lu	S	16
-lu	A	1
-lu	A	2
-lu	A	4
-lu	A	8
-lu	A	16
-lu	A	32
-lu	A	64
-lu	A	128
-lu	B	1
-lu	B	2
-lu	B	4
-lu	B	8
-lu	B	16
-lu	B	32
-lu	B	64
-lu	B	128
-lu	C	1
-lu	C	2
-lu	C	4
-lu	C	8
-lu	C	16
-lu	C	32
-lu	C	64
-lu	C	128
diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.mg b/examples/smpi/NAS/config/NAS.samples/suite.def.mg
deleted file mode 100644
index b5c01d4557..0000000000
--- a/examples/smpi/NAS/config/NAS.samples/suite.def.mg
+++ /dev/null
@@ -1,29 +0,0 @@
-mg	S	1
-mg	S	2
-mg	S	4
-mg	S	8
-mg	S	16
-mg	A	1
-mg	A	2
-mg	A	4
-mg	A	8
-mg	A	16
-mg	A	32
-mg	A	64
-mg	A	128
-mg	B	1
-mg	B	2
-mg	B	4
-mg	B	8
-mg	B	16
-mg	B	32
-mg	B	64
-mg	B	128
-mg	C	1
-mg	C	2
-mg	C	4
-mg	C	8
-mg	C	16
-mg	C	32
-mg	C	64
-mg	C	128
diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.small b/examples/smpi/NAS/config/NAS.samples/suite.def.small
deleted file mode 100644
index 5a094042bd..0000000000
--- a/examples/smpi/NAS/config/NAS.samples/suite.def.small
+++ /dev/null
@@ -1,8 +0,0 @@
-bt	S	1
-cg	S	1
-ep	S	1
-ft	S	1
-is	S	1
-lu	S	1
-mg	S	1
-sp	S	1
diff --git a/examples/smpi/NAS/config/NAS.samples/suite.def.sp b/examples/smpi/NAS/config/NAS.samples/suite.def.sp
deleted file mode 100644
index f8113a256e..0000000000
--- a/examples/smpi/NAS/config/NAS.samples/suite.def.sp
+++ /dev/null
@@ -1,37 +0,0 @@
-sp	S	1
-sp	S	4
-sp	S	9
-sp	S	16
-sp	A	1
-sp	A	4
-sp	A	9
-sp	A	16
-sp	A	25
-sp	A	36
-sp	A	49
-sp	A	64
-sp	A	81
-sp	A	100
-sp	A	121
-sp	B	1
-sp	B	4
-sp	B	9
-sp	B	16
-sp	B	25
-sp	B	36
-sp	B	49
-sp	B	64
-sp	B	81
-sp	B	100
-sp	B	121
-sp	C	1
-sp	C	4
-sp	C	9
-sp	C	16
-sp	C	25
-sp	C	36
-sp	C	49
-sp	C	64
-sp	C	81
-sp	C	100
-sp	C	121
diff --git a/examples/smpi/NAS/config/suite.def.template b/examples/smpi/NAS/config/suite.def.template
index aea8b2358e..5e02f30a22 100644
--- a/examples/smpi/NAS/config/suite.def.template
+++ b/examples/smpi/NAS/config/suite.def.template
@@ -3,8 +3,7 @@
 # Typing "make suite" in the main directory will build all the benchmarks
 # specified in this file. 
 # Each line of this file contains a benchmark name, class, and number
-# of nodes. The name is one of "cg", "is", "ep", mg", "ft", "sp", "bt", 
-# "lu", and "dt". 
+# of nodes. The name is one of "is", "ep", and "dt"
 # The class is one of "S", "W", "A", "B", "C", "D", and "E"
 # (except that no classes C, D and E for DT, and no class E for IS).
 # The number of nodes must be a legal number for a particular
@@ -13,12 +12,6 @@
 # Comments start with "#" as the first character on a line. 
 # No blank lines. 
 # The following example builds 1 processor sample sizes of all benchmarks. 
-ft	S	1
-mg	S	1
-sp	S	1
-lu	S	1
-bt	S	1
 is	S	1
 ep	S	1
-cg	S	1
 dt	S	1