From e6e323a7749044351eada87127f0658c9a8c937d Mon Sep 17 00:00:00 2001
From: schnorr <schnorr@48e7efb5-ca39-0410-a469-dd3cf9ba447f>
Date: Fri, 17 Sep 2010 09:26:52 +0000
Subject: [PATCH] creating instrumented versions of DT, EP and IS benchmarks

details:
- new directories were created to avoid modifying the original .c files
- the binaries created have the same names as the binaries generated by
  the original benchmarks
- the instrumentation basically consists of:
  - #include "instr/instr.h"
  - using TRACE_smpi_set_category to declare and set a category, so that
    resource utilization can be monitored per category
  - executing with the -trace parameter
- no instrumentation is needed if only a traditional Gantt chart is wanted;
  in that case, the original versions of the benchmarks only have to be
  executed with the -trace parameter (a minimal usage sketch is appended
  after the patch)

git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/simgrid/simgrid/trunk@8196 48e7efb5-ca39-0410-a469-dd3cf9ba447f
---
 examples/smpi/NAS/DT-trace/DGraph.c   |  184 ++++
 examples/smpi/NAS/DT-trace/DGraph.h   |   43 +
 examples/smpi/NAS/DT-trace/Makefile   |   26 +
 examples/smpi/NAS/DT-trace/README     |   22 +
 examples/smpi/NAS/DT-trace/dt-trace.c |  767 ++++++++++++++++
 examples/smpi/NAS/EP-trace/Makefile   |   28 +
 examples/smpi/NAS/EP-trace/README     |    6 +
 examples/smpi/NAS/EP-trace/ep-trace.c |  449 ++++++++++
 examples/smpi/NAS/EP-trace/mpinpb.h   |    9 +
 examples/smpi/NAS/EP-trace/randlc.c   |  107 +++
 examples/smpi/NAS/EP-trace/randlc.h   |    3 +
 examples/smpi/NAS/IS-trace/Makefile   |   22 +
 examples/smpi/NAS/IS-trace/is-trace.c | 1158 +++++++++++++++++++++++++
 examples/smpi/NAS/Makefile            |   12 +
 14 files changed, 2836 insertions(+)
 create mode 100644 examples/smpi/NAS/DT-trace/DGraph.c
 create mode 100644 examples/smpi/NAS/DT-trace/DGraph.h
 create mode 100644 examples/smpi/NAS/DT-trace/Makefile
 create mode 100644 examples/smpi/NAS/DT-trace/README
 create mode 100644 examples/smpi/NAS/DT-trace/dt-trace.c
 create mode 100644 examples/smpi/NAS/EP-trace/Makefile
 create mode 100644 examples/smpi/NAS/EP-trace/README
 create mode 100644 examples/smpi/NAS/EP-trace/ep-trace.c
 create mode 100644 examples/smpi/NAS/EP-trace/mpinpb.h
 create mode 100644 examples/smpi/NAS/EP-trace/randlc.c
 create mode 100644 examples/smpi/NAS/EP-trace/randlc.h
 create mode 100644 examples/smpi/NAS/IS-trace/Makefile
 create mode 100644 examples/smpi/NAS/IS-trace/is-trace.c

diff --git a/examples/smpi/NAS/DT-trace/DGraph.c b/examples/smpi/NAS/DT-trace/DGraph.c
new file mode 100644
index 0000000000..5d5839df19
--- /dev/null
+++ b/examples/smpi/NAS/DT-trace/DGraph.c
@@ -0,0 +1,184 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "DGraph.h"
+
+DGArc *newArc(DGNode *tl,DGNode *hd){
+  DGArc *ar=(DGArc *)malloc(sizeof(DGArc));
+  ar->tail=tl;
+  ar->head=hd;
+  return ar;
+}
+void arcShow(DGArc *ar){
+  DGNode *tl=(DGNode *)ar->tail,
+         *hd=(DGNode *)ar->head;
+  fprintf(stderr,"%d.
|%s ->%s\n",ar->id,tl->name,hd->name); +} + +DGNode *newNode(char *nm){ + DGNode *nd=(DGNode *)malloc(sizeof(DGNode)); + nd->attribute=0; + nd->color=0; + nd->inDegree=0; + nd->outDegree=0; + nd->maxInDegree=SMALL_BLOCK_SIZE; + nd->maxOutDegree=SMALL_BLOCK_SIZE; + nd->inArc=(DGArc **)malloc(nd->maxInDegree*sizeof(DGArc*)); + nd->outArc=(DGArc **)malloc(nd->maxOutDegree*sizeof(DGArc*)); + nd->name=strdup(nm); + nd->feat=NULL; + return nd; +} +void nodeShow(DGNode* nd){ + fprintf( stderr,"%3d.%s: (%d,%d)\n", + nd->id,nd->name,nd->inDegree,nd->outDegree); +/* + if(nd->verified==1) fprintf(stderr,"%ld.%s\t: usable.",nd->id,nd->name); + else if(nd->verified==0) fprintf(stderr,"%ld.%s\t: unusable.",nd->id,nd->name); + else fprintf(stderr,"%ld.%s\t: notverified.",nd->id,nd->name); +*/ +} + +DGraph* newDGraph(char* nm){ + DGraph *dg=(DGraph *)malloc(sizeof(DGraph)); + dg->numNodes=0; + dg->numArcs=0; + dg->maxNodes=BLOCK_SIZE; + dg->maxArcs=BLOCK_SIZE; + dg->node=(DGNode **)malloc(dg->maxNodes*sizeof(DGNode*)); + dg->arc=(DGArc **)malloc(dg->maxArcs*sizeof(DGArc*)); + dg->name=strdup(nm); + return dg; +} +int AttachNode(DGraph* dg, DGNode* nd) { + int i=0,j,len=0; + DGNode **nds =NULL, *tmpnd=NULL; + DGArc **ar=NULL; + + if (dg->numNodes == dg->maxNodes-1 ) { + dg->maxNodes += BLOCK_SIZE; + nds =(DGNode **) calloc(dg->maxNodes,sizeof(DGNode*)); + memcpy(nds,dg->node,(dg->maxNodes-BLOCK_SIZE)*sizeof(DGNode*)); + free(dg->node); + dg->node=nds; + } + + len = strlen( nd->name); + for (i = 0; i < dg->numNodes; i++) { + tmpnd =dg->node[ i]; + ar=NULL; + if ( strlen( tmpnd->name) != len ) continue; + if ( strncmp( nd->name, tmpnd->name, len) ) continue; + if ( nd->inDegree > 0 ) { + tmpnd->maxInDegree += nd->maxInDegree; + ar =(DGArc **) calloc(tmpnd->maxInDegree,sizeof(DGArc*)); + memcpy(ar,tmpnd->inArc,(tmpnd->inDegree)*sizeof(DGArc*)); + free(tmpnd->inArc); + tmpnd->inArc=ar; + for (j = 0; j < nd->inDegree; j++ ) { + nd->inArc[ j]->head = tmpnd; + } + memcpy( &(tmpnd->inArc[ tmpnd->inDegree]), nd->inArc, nd->inDegree*sizeof( DGArc *)); + tmpnd->inDegree += nd->inDegree; + } + if ( nd->outDegree > 0 ) { + tmpnd->maxOutDegree += nd->maxOutDegree; + ar =(DGArc **) calloc(tmpnd->maxOutDegree,sizeof(DGArc*)); + memcpy(ar,tmpnd->outArc,(tmpnd->outDegree)*sizeof(DGArc*)); + free(tmpnd->outArc); + tmpnd->outArc=ar; + for (j = 0; j < nd->outDegree; j++ ) { + nd->outArc[ j]->tail = tmpnd; + } + memcpy( &(tmpnd->outArc[tmpnd->outDegree]),nd->outArc,nd->outDegree*sizeof( DGArc *)); + tmpnd->outDegree += nd->outDegree; + } + free(nd); + return i; + } + nd->id = dg->numNodes; + dg->node[dg->numNodes] = nd; + dg->numNodes++; +return nd->id; +} +int AttachArc(DGraph *dg,DGArc* nar){ +int arcId = -1; +int i=0,newNumber=0; +DGNode *head = nar->head, + *tail = nar->tail; +DGArc **ars=NULL,*probe=NULL; +/*fprintf(stderr,"AttachArc %ld\n",dg->numArcs); */ + if ( !tail || !head ) return arcId; + if ( dg->numArcs == dg->maxArcs-1 ) { + dg->maxArcs += BLOCK_SIZE; + ars =(DGArc **) calloc(dg->maxArcs,sizeof(DGArc*)); + memcpy(ars,dg->arc,(dg->maxArcs-BLOCK_SIZE)*sizeof(DGArc*)); + free(dg->arc); + dg->arc=ars; + } + for(i = 0; i < tail->outDegree; i++ ) { /* parallel arc */ + probe = tail->outArc[ i]; + if(probe->head == head + && + probe->length == nar->length + ){ + free(nar); + return probe->id; + } + } + + nar->id = dg->numArcs; + arcId=dg->numArcs; + dg->arc[dg->numArcs] = nar; + dg->numArcs++; + + head->inArc[ head->inDegree] = nar; + head->inDegree++; + if ( head->inDegree >= head->maxInDegree ) { + newNumber = 
head->maxInDegree + SMALL_BLOCK_SIZE; + ars =(DGArc **) calloc(newNumber,sizeof(DGArc*)); + memcpy(ars,head->inArc,(head->inDegree)*sizeof(DGArc*)); + free(head->inArc); + head->inArc=ars; + head->maxInDegree = newNumber; + } + tail->outArc[ tail->outDegree] = nar; + tail->outDegree++; + if(tail->outDegree >= tail->maxOutDegree ) { + newNumber = tail->maxOutDegree + SMALL_BLOCK_SIZE; + ars =(DGArc **) calloc(newNumber,sizeof(DGArc*)); + memcpy(ars,tail->outArc,(tail->outDegree)*sizeof(DGArc*)); + free(tail->outArc); + tail->outArc=ars; + tail->maxOutDegree = newNumber; + } +/*fprintf(stderr,"AttachArc: head->in=%d tail->out=%ld\n",head->inDegree,tail->outDegree);*/ +return arcId; +} +void graphShow(DGraph *dg,int DetailsLevel){ + int i=0,j=0; + fprintf(stderr,"%d.%s: (%d,%d)\n",dg->id,dg->name,dg->numNodes,dg->numArcs); + if ( DetailsLevel < 1) return; + for (i = 0; i < dg->numNodes; i++ ) { + DGNode *focusNode = dg->node[ i]; + if(DetailsLevel >= 2) { + for (j = 0; j < focusNode->inDegree; j++ ) { + fprintf(stderr,"\t "); + nodeShow(focusNode->inArc[ j]->tail); + } + } + nodeShow(focusNode); + if ( DetailsLevel < 2) continue; + for (j = 0; j < focusNode->outDegree; j++ ) { + fprintf(stderr, "\t "); + nodeShow(focusNode->outArc[ j]->head); + } + fprintf(stderr, "---\n"); + } + fprintf(stderr,"----------------------------------------\n"); + if ( DetailsLevel < 3) return; +} + + + diff --git a/examples/smpi/NAS/DT-trace/DGraph.h b/examples/smpi/NAS/DT-trace/DGraph.h new file mode 100644 index 0000000000..f38f898b24 --- /dev/null +++ b/examples/smpi/NAS/DT-trace/DGraph.h @@ -0,0 +1,43 @@ +#ifndef _DGRAPH +#define _DGRAPH + +#define BLOCK_SIZE 128 +#define SMALL_BLOCK_SIZE 32 + +typedef struct{ + int id; + void *tail,*head; + int length,width,attribute,maxWidth; +}DGArc; + +typedef struct{ + int maxInDegree,maxOutDegree; + int inDegree,outDegree; + int id; + char *name; + DGArc **inArc,**outArc; + int depth,height,width; + int color,attribute,address,verified; + void *feat; +}DGNode; + +typedef struct{ + int maxNodes,maxArcs; + int id; + char *name; + int numNodes,numArcs; + DGNode **node; + DGArc **arc; +} DGraph; + +DGArc *newArc(DGNode *tl,DGNode *hd); +void arcShow(DGArc *ar); +DGNode *newNode(char *nm); +void nodeShow(DGNode* nd); + +DGraph* newDGraph(char *nm); +int AttachNode(DGraph *dg,DGNode *nd); +int AttachArc(DGraph *dg,DGArc* nar); +void graphShow(DGraph *dg,int DetailsLevel); + +#endif diff --git a/examples/smpi/NAS/DT-trace/Makefile b/examples/smpi/NAS/DT-trace/Makefile new file mode 100644 index 0000000000..8953b82516 --- /dev/null +++ b/examples/smpi/NAS/DT-trace/Makefile @@ -0,0 +1,26 @@ +SHELL=/bin/sh +BENCHMARK=dt +BENCHMARKU=DT + +include ../config/make.def + +include ../sys/make.common +#Override PROGRAM +DTPROGRAM = $(BINDIR)/$(BENCHMARK).$(CLASS) + +OBJS = dt-trace.o DGraph.o \ + ${COMMON}/c_print_results.o ${COMMON}/c_timers.o ${COMMON}/c_randdp.o + + +${PROGRAM}: config ${OBJS} + ${CLINK} ${CLINKFLAGS} -o ${DTPROGRAM} ${OBJS} ${CMPI_LIB} + +.c.o: + ${CCOMPILE} $< + +dt-trace.o: dt-trace.c npbparams.h +DGraph.o: DGraph.c DGraph.h + +clean: + - rm -f *.o *~ mputil* + - rm -f dt npbparams.h core diff --git a/examples/smpi/NAS/DT-trace/README b/examples/smpi/NAS/DT-trace/README new file mode 100644 index 0000000000..873e3ae6f2 --- /dev/null +++ b/examples/smpi/NAS/DT-trace/README @@ -0,0 +1,22 @@ +Data Traffic benchmark DT is new in the NPB suite +(released as part of NPB3.x-MPI package). 
+----------------------------------------------------
+
+DT is written in C and the same executable can run on any number of
+processors, provided this number is not less than the number of nodes in
+the communication graph. The DT benchmark takes one argument: BH, WH, or
+SH. This argument specifies the communication graph Black Hole, White
+Hole, or SHuffle, respectively. The current release contains verification
+numbers for CLASSES S, W, A, and B only. Classes C and D are defined, but
+verification numbers are not provided in this release.
+
+The following table summarizes the number of nodes in the communication
+graph based on CLASS and graph TYPE.
+
+CLASS   N_Source   N_Nodes(BH,WH)   N_Nodes(SH)
+  S         4             5              12
+  W         8            11              32
+  A        16            21              80
+  B        32            43             192
+  C        64            85             448
+  D       128           171            1024

diff --git a/examples/smpi/NAS/DT-trace/dt-trace.c b/examples/smpi/NAS/DT-trace/dt-trace.c
new file mode 100644
index 0000000000..89ab4de079
--- /dev/null
+++ b/examples/smpi/NAS/DT-trace/dt-trace.c
@@ -0,0 +1,767 @@
+/*************************************************************************
+ *                                                                       *
+ *       N  A  S     P A R A L L E L     B E N C H M A R K S  3.3        *
+ *                                                                       *
+ *                                 D T                                   *
+ *                                                                       *
+ *************************************************************************
+ *                                                                       *
+ *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
+ *                                                                       *
+ *   Permission to use, copy, distribute and modify this software        *
+ *   for any purpose with or without fee is hereby granted.  We          *
+ *   request, however, that all derived work reference the NAS           *
+ *   Parallel Benchmarks 3.3. This software is provided "as is"          *
+ *   without express or implied warranty.                                *
+ *                                                                       *
+ *   Information on NPB 3.3, including the technical report, the         *
+ *   original specifications, source code, results and information       *
+ *   on how to submit new results, is available at:                      *
+ *                                                                       *
+ *          http://www.nas.nasa.gov/Software/NPB                         *
+ *                                                                       *
+ *   Send comments or suggestions to  npb@nas.nasa.gov                   *
+ *   Send bug reports to              npb-bugs@nas.nasa.gov              *
+ *                                                                       *
+ *         NAS Parallel Benchmarks Group                                 *
+ *         NASA Ames Research Center                                     *
+ *         Mail Stop: T27A-1                                             *
+ *         Moffett Field, CA   94035-1000                                *
+ *                                                                       *
+ *         E-mail:  npb@nas.nasa.gov                                     *
+ *         Fax:     (650) 604-3957                                       *
+ *                                                                       *
+ *************************************************************************
+ *                                                                       *
+ *   Author: M. Frumkin                                                  *
+ *                                                                       *
+ *************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "mpi.h"
+#include "npbparams.h"
+
+#include "instr/instr.h" //TRACE_
+
+#ifndef CLASS
+#define CLASS 'S'
+#define NUM_PROCS 1
+#endif
+
+//int passed_verification;
+extern double randlc( double *X, double *A );
+extern
+void c_print_results( char   *name,
+                      char   class,
+                      int    n1,
+                      int    n2,
+                      int    n3,
+                      int    niter,
+                      int    nprocs_compiled,
+                      int    nprocs_total,
+                      double t,
+                      double mops,
+                      char   *optype,
+                      int    passed_verification,
+                      char   *npbversion,
+                      char   *compiletime,
+                      char   *mpicc,
+                      char   *clink,
+                      char   *cmpi_lib,
+                      char   *cmpi_inc,
+                      char   *cflags,
+                      char   *clinkflags );
+
+void   timer_clear( int n );
+void   timer_start( int n );
+void   timer_stop( int n );
+double timer_read( int n );
+int timer_on=0,timers_tot=64;
+
+int verify(char *bmname,double rnm2){
+  double verify_value=0.0;
+  double epsilon=1.0E-8;
+  char cls=CLASS;
+  int verified=-1;
+  if (cls != 'U') {
+    if(cls=='S') {
+      if(strstr(bmname,"BH")){
+        verify_value=30892725.0;
+      }else if(strstr(bmname,"WH")){
+        verify_value=67349758.0;
+      }else if(strstr(bmname,"SH")){
+        verify_value=58875767.0;
+      }else{
+        fprintf(stderr,"No such benchmark as %s.\n",bmname);
+      }
+      verified = 0;
+    }else if(cls=='W') {
+      if(strstr(bmname,"BH")){
+        verify_value = 4102461.0;
+      }else if(strstr(bmname,"WH")){
+        verify_value = 204280762.0;
+      }else if(strstr(bmname,"SH")){
+        verify_value = 186944764.0;
+      }else{
+        fprintf(stderr,"No such benchmark as %s.\n",bmname);
+      }
+      verified = 0;
+    }else if(cls=='A') {
+      if(strstr(bmname,"BH")){
+        verify_value = 17809491.0;
+      }else if(strstr(bmname,"WH")){
+        verify_value = 1289925229.0;
+      }else if(strstr(bmname,"SH")){
+        verify_value = 610856482.0;
+      }else{
+        fprintf(stderr,"No such benchmark as %s.\n",bmname);
+      }
+      verified = 0;
+    }else if(cls=='B') {
+      if(strstr(bmname,"BH")){
+        verify_value = 4317114.0;
+      }else if(strstr(bmname,"WH")){
+        verify_value = 7877279917.0;
+      }else if(strstr(bmname,"SH")){
+        verify_value = 1836863082.0;
+      }else{
+        fprintf(stderr,"No such benchmark as %s.\n",bmname);
+        verified = 0;
+      }
+    }else if(cls=='C') {
+      if(strstr(bmname,"BH")){
+        verify_value = 0.0;
+      }else if(strstr(bmname,"WH")){
+        verify_value = 0.0;
+      }else if(strstr(bmname,"SH")){
+        verify_value = 0.0;
+      }else{
+        fprintf(stderr,"No such benchmark as %s.\n",bmname);
+        verified = -1;
+      }
+    }else if(cls=='D') {
+      if(strstr(bmname,"BH")){
+        verify_value = 0.0;
+      }else if(strstr(bmname,"WH")){
+        verify_value = 0.0;
+      }else if(strstr(bmname,"SH")){
+        verify_value = 0.0;
+      }else{
+        fprintf(stderr,"No such benchmark as %s.\n",bmname);
+      }
+      verified = -1;
+    }else{
+      fprintf(stderr,"No such class as %c.\n",cls);
+    }
+    fprintf(stderr," %s L2 Norm = %f\n",bmname,rnm2);
+    if(verified==-1){
+      fprintf(stderr," No verification was performed.\n");
+    }else if( rnm2 - verify_value < epsilon &&
+              rnm2 - verify_value > -epsilon) { /* abs here does not work on ALTIX */
+      verified = 1;
+      fprintf(stderr," Deviation = %f\n",(rnm2 - verify_value));
+    }else{
+      verified = 0;
+      fprintf(stderr," The correct verification value = %f\n",verify_value);
+      fprintf(stderr," Got value = %f\n",rnm2);
+    }
+  }else{
+    verified = -1;
+  }
+  return verified;
+}
+
+int ipowMod(int a,long long int n,int md){
+  int seed=1,q=a,r=1;
+  if(n<0){
+    fprintf(stderr,"ipowMod: exponent must be nonnegative exp=%lld\n",n);
+    n=-n; /* temp fix */
+/*  return 1; */
+  }
+  if(md<=0){
+    fprintf(stderr,"ipowMod: module must be positive mod=%d",md);
+    return 1;
+  }
+  if(n==0) return 1;
+  while(n>1){
+    int n2 = n/2;
+    if (n2*2==n){
+      seed = (q*q)%md;
+      q=seed;
+      n = n2;
+    }else{
+      seed = (r*q)%md;
+      r=seed;
+      n = n-1;
+    }
+  }
+  seed = (r*q)%md;
+  return seed;
+}
+
+#include "DGraph.h"
+DGraph *buildSH(char cls){
+/*
+  Nodes of the graph must be topologically sorted
+  to avoid MPI deadlock.
+*/
+  DGraph *dg;
+  int numSources=NUM_SOURCES; /* must be power of 2 */
+  int numOfLayers=0,tmpS=numSources>>1;
+  int firstLayerNode=0;
+  DGArc *ar=NULL;
+  DGNode *nd=NULL;
+  int mask=0x0,ndid=0,ndoff=0;
+  int i=0,j=0;
+  char nm[BLOCK_SIZE];
+
+  sprintf(nm,"DT_SH.%c",cls);
+  dg=newDGraph(nm);
+
+  while(tmpS>1){
+    numOfLayers++;
+    tmpS>>=1;
+  }
+  for(i=0;i<numSources;i++){
+    sprintf(nm,"Source.%d",i);
+    nd=newNode(nm);
+    AttachNode(dg,nd);
+  }
+  for(j=0;j<numOfLayers;j++){
+    mask=0x00000001<<j;
+    for(i=0;i<numSources;i++){
+      sprintf(nm,"Comparator.%d",(j*numSources+i));
+      nd=newNode(nm);
+      AttachNode(dg,nd);
+      ndoff=i&(~mask);
+      ndid=firstLayerNode+ndoff;
+      ar=newArc(dg->node[ndid],nd);
+      AttachArc(dg,ar);
+      ndoff+=mask;
+      ndid=firstLayerNode+ndoff;
+      ar=newArc(dg->node[ndid],nd);
+      AttachArc(dg,ar);
+    }
+    firstLayerNode+=numSources;
+  }
+  mask=0x00000001<<numOfLayers;
+  for(i=0;i<numSources;i++){
+    sprintf(nm,"Sink.%d",i);
+    nd=newNode(nm);
+    AttachNode(dg,nd);
+    ndoff=i&(~mask);
+    ndid=firstLayerNode+ndoff;
+    ar=newArc(dg->node[ndid],nd);
+    AttachArc(dg,ar);
+    ndoff+=mask;
+    ndid=firstLayerNode+ndoff;
+    ar=newArc(dg->node[ndid],nd);
+    AttachArc(dg,ar);
+  }
+return dg;
+}
+DGraph *buildWH(char cls){
+/*
+  Nodes of the graph must be topologically sorted
+  to avoid MPI deadlock.
+*/
+  int i=0,j=0;
+  int numSources=NUM_SOURCES,maxInDeg=4;
+  int numLayerNodes=numSources,firstLayerNode=0;
+  int totComparators=0;
+  int numPrevLayerNodes=numLayerNodes;
+  int id=0,sid=0;
+  DGraph *dg;
+  DGNode *nd=NULL,*source=NULL,*tmp=NULL,*snd=NULL;
+  DGArc *ar=NULL;
+  char nm[BLOCK_SIZE];
+
+  sprintf(nm,"DT_WH.%c",cls);
+  dg=newDGraph(nm);
+
+  for(i=0;i<numSources;i++){
+    sprintf(nm,"Sink.%d",i);
+    nd=newNode(nm);
+    AttachNode(dg,nd);
+  }
+  while(numLayerNodes>maxInDeg){
+    numLayerNodes=numLayerNodes/maxInDeg;
+    if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
+    for(i=0;i<numLayerNodes;i++){
+      sprintf(nm,"Comparator.%d",totComparators);
+      totComparators++;
+      nd=newNode(nm);
+      id=AttachNode(dg,nd);
+      for(j=0;j<maxInDeg;j++){
+        sid=i*maxInDeg+j;
+        if(sid>=numPrevLayerNodes) break;
+        snd=dg->node[firstLayerNode+sid];
+        ar=newArc(dg->node[id],snd);
+        AttachArc(dg,ar);
+      }
+    }
+    firstLayerNode+=numPrevLayerNodes;
+    numPrevLayerNodes=numLayerNodes;
+  }
+  source=newNode("Source");
+  AttachNode(dg,source);
+  for(i=0;i<numPrevLayerNodes;i++){
+    nd=dg->node[firstLayerNode+i];
+    ar=newArc(source,nd);
+    AttachArc(dg,ar);
+  }
+
+  for(i=0;i<dg->numNodes/2;i++){ /* Topological sorting */
+    tmp=dg->node[i];
+    dg->node[i]=dg->node[dg->numNodes-1-i];
+    dg->node[i]->id=i;
+    dg->node[dg->numNodes-1-i]=tmp;
+    dg->node[dg->numNodes-1-i]->id=dg->numNodes-1-i;
+  }
+return dg;
+}
+DGraph *buildBH(char cls){
+/*
+  Nodes of the graph must be topologically sorted
+  to avoid MPI deadlock.
+*/
+  int i=0,j=0;
+  int numSources=NUM_SOURCES,maxInDeg=4;
+  int numLayerNodes=numSources,firstLayerNode=0;
+  DGraph *dg;
+  DGNode *nd=NULL, *snd=NULL, *sink=NULL;
+  DGArc *ar=NULL;
+  int totComparators=0;
+  int numPrevLayerNodes=numLayerNodes;
+  int id=0, sid=0;
+  char nm[BLOCK_SIZE];
+
+  sprintf(nm,"DT_BH.%c",cls);
+  dg=newDGraph(nm);
+
+  for(i=0;i<numSources;i++){
+    sprintf(nm,"Source.%d",i);
+    nd=newNode(nm);
+    AttachNode(dg,nd);
+  }
+  while(numLayerNodes>maxInDeg){
+    numLayerNodes=numLayerNodes/maxInDeg;
+    if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
+    for(i=0;i<numLayerNodes;i++){
+      sprintf(nm,"Comparator.%d",totComparators);
+      totComparators++;
+      nd=newNode(nm);
+      id=AttachNode(dg,nd);
+      for(j=0;j<maxInDeg;j++){
+        sid=i*maxInDeg+j;
+        if(sid>=numPrevLayerNodes) break;
+        snd=dg->node[firstLayerNode+sid];
+        ar=newArc(snd,dg->node[id]);
+        AttachArc(dg,ar);
+      }
+    }
+    firstLayerNode+=numPrevLayerNodes;
+    numPrevLayerNodes=numLayerNodes;
+  }
+  sink=newNode("Sink");
+  AttachNode(dg,sink);
+  for(i=0;i<numPrevLayerNodes;i++){
+    nd=dg->node[firstLayerNode+i];
+    ar=newArc(nd,sink);
+    AttachArc(dg,ar);
+  }
+return dg;
+}
+
+typedef struct{
+  int len;
+  double* val;
+} Arr;
+Arr *newArr(int len){
+  Arr *arr=(Arr *)malloc(sizeof(Arr));
+  arr->len=len;
+  arr->val=(double *)malloc(len*sizeof(double));
+  return arr;
+}
+void arrShow(Arr* a){
+  if(!a) fprintf(stderr,"-- NULL array\n");
+  else{
+    fprintf(stderr,"-- length=%d\n",a->len);
+  }
+}
+double CheckVal(Arr *feat){
+  double csum=0.0;
+  int i=0;
+  for(i=0;i<feat->len;i++){
+    csum+=feat->val[i]*feat->val[i]/feat->len; /* The truncation does not work since
+                                                  result will be 0 for large len */
+  }
+  return csum;
+}
+int GetFNumDPar(int* mean, int* stdev){
+  *mean=NUM_SAMPLES;
+  *stdev=STD_DEVIATION;
+  return 0;
+}
+int GetFeatureNum(char *mbname,int id){
+  double tran=314159265.0;
+  double A=2*id+1;
+  double denom=randlc(&tran,&A);
+  char cval='S';
+  int mean=NUM_SAMPLES,stdev=128;
+  int rtfs=0,len=0;
+  GetFNumDPar(&mean,&stdev);
+  rtfs=ipowMod((int)(1/denom)*(int)cval,(long long int) (2*id+1),2*stdev);
+  if(rtfs<0) rtfs=-rtfs;
+  len=mean-stdev+rtfs;
+  return len;
+}
+Arr* RandomFeatures(char *bmname,int fdim,int id){
+  int len=GetFeatureNum(bmname,id)*fdim;
+  Arr* feat=newArr(len);
+  int nxg=2,nyg=2,nzg=2,nfg=5;
+  int nx=421,ny=419,nz=1427,nf=3527;
+  long long int expon=(len*(id+1))%3141592;
+  int seedx=ipowMod(nxg,expon,nx),
+      seedy=ipowMod(nyg,expon,ny),
+      seedz=ipowMod(nzg,expon,nz),
+      seedf=ipowMod(nfg,expon,nf);
+  int i=0;
+  if(timer_on){
+    timer_clear(id+1);
+    timer_start(id+1);
+  }
+  for(i=0;i<len;i+=fdim){
+    seedx=(seedx*nxg)%nx;
+    seedy=(seedy*nyg)%ny;
+    seedz=(seedz*nzg)%nz;
+    seedf=(seedf*nfg)%nf;
+    feat->val[i]=seedx;
+    feat->val[i+1]=seedy;
+    feat->val[i+2]=seedz;
+    feat->val[i+3]=seedf;
+  }
+  if(timer_on){
+    timer_stop(id+1);
+    fprintf(stderr,"** RandomFeatures time in node %d = %f\n",id,timer_read(id+1));
+  }
+  return feat;
+}
+void Resample(Arr *a,int blen){
+  long long int i=0,j=0,jlo=0,jhi=0;
+  double avval=0.0;
+  double *nval=(double *)malloc(blen*sizeof(double));
+  Arr *tmp=newArr(10);
+  for(i=0;i<blen;i++) nval[i]=0.0;
+  for(i=1;i<a->len-1;i++){
+    jlo=(int)(0.5*(2*i-1)*(blen/a->len));
+    jhi=(int)(0.5*(2*i+1)*(blen/a->len));
+
+    avval=a->val[i]/(jhi-jlo+1);
+    for(j=jlo;j<=jhi;j++){
+      nval[j]+=avval;
+    }
+  }
+  nval[0]=a->val[0];
+  nval[blen-1]=a->val[a->len-1];
+  free(a->val);
+  a->val=nval;
+  a->len=blen;
+}
+#define fielddim 4
+Arr* WindowFilter(Arr *a, Arr* b,int w){
+  int i=0,j=0,k=0;
+  double rms0=0.0,rms1=0.0,rmsm1=0.0;
+  double weight=((double) (w+1))/(w+2);
+
+  w+=1;
+  if(timer_on){
+    timer_clear(w);
+    timer_start(w);
+  }
+  if(a->len<b->len) Resample(a,b->len);
+  if(a->len>b->len) Resample(b,a->len);
+  for(i=fielddim;i<a->len-fielddim;i+=fielddim){
+    rms0=(a->val[i]-b->val[i])*(a->val[i]-b->val[i])
+        +(a->val[i+1]-b->val[i+1])*(a->val[i+1]-b->val[i+1])
+        +(a->val[i+2]-b->val[i+2])*(a->val[i+2]-b->val[i+2])
+        +(a->val[i+3]-b->val[i+3])*(a->val[i+3]-b->val[i+3]);
+    j=i+fielddim;
+    rms1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
+        +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
+        +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
+        +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
+    j=i-fielddim;
+    rmsm1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
+         +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
+         +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
+         +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
+    k=0;
+    if(rms1<rms0){
+      k=1;
+      rms0=rms1;
+    }
+    if(rmsm1<rms0) k=-1;
+    if(k==0){
+      a->val[i]=weight*b->val[i];
+      a->val[i+1]=weight*b->val[i+1];
+      a->val[i+2]=weight*b->val[i+2];
+      a->val[i+3]=weight*b->val[i+3];
+    }else if(k==1){
+      j=i+fielddim;
+      a->val[i]=weight*b->val[j];
+      a->val[i+1]=weight*b->val[j+1];
+      a->val[i+2]=weight*b->val[j+2];
+      a->val[i+3]=weight*b->val[j+3];
+    }else { /*if(k==-1)*/
+      j=i-fielddim;
+      a->val[i]=weight*b->val[j];
+      a->val[i+1]=weight*b->val[j+1];
+      a->val[i+2]=weight*b->val[j+2];
+      a->val[i+3]=weight*b->val[j+3];
+    }
+  }
+  if(timer_on){
+    timer_stop(w);
+    fprintf(stderr,"** WindowFilter time in node %d = %f\n",(w-1),timer_read(w));
+  }
+  return a;
+}
+
+int SendResults(DGraph *dg,DGNode *nd,Arr *feat){
+  int i=0,tag=0;
+  DGArc *ar=NULL;
+  DGNode *head=NULL;
+  if(!feat) return 0;
+  for(i=0;i<nd->outDegree;i++){
+    ar=nd->outArc[i];
+    if(ar->tail!=nd) continue;
+    head=ar->head;
+    tag=ar->id;
+    if(head->address!=nd->address){
+      MPI_Send(&feat->len,1,MPI_INT,head->address,tag,MPI_COMM_WORLD);
+      MPI_Send(feat->val,feat->len,MPI_DOUBLE,head->address,tag,MPI_COMM_WORLD);
+    }
+  }
+  return 1;
+}
+Arr* CombineStreams(DGraph *dg,DGNode *nd){
+  Arr *resfeat=newArr(NUM_SAMPLES*fielddim);
+  int i=0,len=0,tag=0;
+  DGArc *ar=NULL;
+  DGNode *tail=NULL;
+  MPI_Status status;
+  Arr *feat=NULL,*featp=NULL;
+
+  if(nd->inDegree==0) return NULL;
+  for(i=0;i<nd->inDegree;i++){
+    ar=nd->inArc[i];
+    if(ar->head!=nd) continue;
+    tail=ar->tail;
+    if(tail->address!=nd->address){
+      len=0;
+      tag=ar->id;
+      MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
+      feat=newArr(len);
+      MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
+      resfeat=WindowFilter(resfeat,feat,nd->id);
+      free(feat);
+    }else{
+      featp=(Arr *)tail->feat;
+      feat=newArr(featp->len);
+      memcpy(feat->val,featp->val,featp->len*sizeof(double));
+      resfeat=WindowFilter(resfeat,feat,nd->id);
+      free(feat);
+    }
+  }
+  for(i=0;i<resfeat->len;i++) resfeat->val[i]=((int)resfeat->val[i])/nd->inDegree;
+  nd->feat=resfeat;
+  return nd->feat;
+}
+double Reduce(Arr *a,int w){
+  double retv=0.0;
+  if(timer_on){
+    timer_clear(w);
+    timer_start(w);
+  }
+  retv=(int)(w*CheckVal(a));/* The casting is needed for node
+                               and array dependent verification */
+  if(timer_on){
+    timer_stop(w);
+    fprintf(stderr,"** Reduce time in node %d = %f\n",(w-1),timer_read(w));
+  }
+  return retv;
+}
+
+double ReduceStreams(DGraph *dg,DGNode *nd){
+  double csum=0.0;
+  int i=0,len=0,tag=0;
+  DGArc *ar=NULL;
+  DGNode *tail=NULL;
+  Arr *feat=NULL;
+  double retv=0.0;
+
+  for(i=0;i<nd->inDegree;i++){
+    ar=nd->inArc[i];
+    if(ar->head!=nd) continue;
+    tail=ar->tail;
+    if(tail->address!=nd->address){
+      MPI_Status status;
+      len=0;
+      tag=ar->id;
+      MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
+      feat=newArr(len);
+      MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
+      csum+=Reduce(feat,(nd->id+1));
+      free(feat);
+    }else{
+      csum+=Reduce(tail->feat,(nd->id+1));
+    }
+  }
+  if(nd->inDegree>0)csum=(((long long int)csum)/nd->inDegree);
+  retv=(nd->id+1)*csum;
+  return retv;
+}
+
+int ProcessNodes(DGraph *dg,int me){
+  double chksum=0.0;
+  Arr *feat=NULL;
+  int i=0,verified=0,tag;
+  DGNode *nd=NULL;
+  double rchksum=0.0;
+  MPI_Status status;
+
+  for(i=0;i<dg->numNodes;i++){
+    nd=dg->node[i];
+    if(nd->address!=me) continue;
+    if(strstr(nd->name,"Source")){
+      nd->feat=RandomFeatures(dg->name,fielddim,nd->id);
+      SendResults(dg,nd,nd->feat);
+    }else if(strstr(nd->name,"Sink")){
+      chksum=ReduceStreams(dg,nd);
+      tag=dg->numArcs+nd->id; /* make these unique to avoid clash with arc tags */
+      MPI_Send(&chksum,1,MPI_DOUBLE,0,tag,MPI_COMM_WORLD);
+    }else{
+      feat=CombineStreams(dg,nd);
+      SendResults(dg,nd,feat);
+    }
+  }
+  if(me==0){ /* Report node */
+    rchksum=0.0;
+    chksum=0.0;
+    for(i=0;i<dg->numNodes;i++){
+      nd=dg->node[i];
+      if(!strstr(nd->name,"Sink")) continue;
+      tag=dg->numArcs+nd->id; /* make these unique to avoid clash with arc tags */
+      MPI_Recv(&rchksum,1,MPI_DOUBLE,nd->address,tag,MPI_COMM_WORLD,&status);
+      chksum+=rchksum;
+    }
+    verified=verify(dg->name,chksum);
+  }
+return verified;
+}
+
+int main(int argc,char **argv ){
+  int my_rank,comm_size;
+  int i;
+  DGraph *dg=NULL;
+  int verified=0, featnum=0;
+  double bytes_sent=2.0,tot_time=0.0;
+
+  TRACE_smpi_set_category ("start");
+
+  MPI_Init( &argc, &argv );
+  MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
+  MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
+
+  if(argc!=2||
+     (  strncmp(argv[1],"BH",2)!=0
+      &&strncmp(argv[1],"WH",2)!=0
+      &&strncmp(argv[1],"SH",2)!=0
+     )
+    ){
+    if(my_rank==0){
+      fprintf(stderr,"** Usage: mpirun -np N ../bin/dt.S GraphName\n");
+      fprintf(stderr,"** Where \n   - N is the integer number of MPI processes\n");
+      fprintf(stderr,"   - S is the class S, W, or A\n");
+      fprintf(stderr,"   - GraphName is the communication graph name BH, WH, or SH.\n");
+      fprintf(stderr,"   - the number of MPI processes N should not be less than\n");
+      fprintf(stderr,"     the number of nodes in the graph\n");
+    }
+    MPI_Finalize();
+    exit(0);
+  }
+  if(strncmp(argv[1],"BH",2)==0){
+    dg=buildBH(CLASS);
+  }else if(strncmp(argv[1],"WH",2)==0){
+    dg=buildWH(CLASS);
+  }else if(strncmp(argv[1],"SH",2)==0){
+    dg=buildSH(CLASS);
+  }
+
+  if(timer_on&&dg->numNodes+1>timers_tot){
+    timer_on=0;
+    if(my_rank==0)
+      fprintf(stderr,"Not enough timers. Node timing is off.
\n"); + } + if(dg->numNodes>comm_size){ + if(my_rank==0){ + fprintf(stderr,"** The number of MPI processes should not be less than \n"); + fprintf(stderr,"** the number of nodes in the graph\n"); + fprintf(stderr,"** Number of MPI processes = %d\n",comm_size); + fprintf(stderr,"** Number nodes in the graph = %d\n",dg->numNodes); + } + MPI_Finalize(); + exit(0); + } + for(i=0;inumNodes;i++){ + dg->node[i]->address=i; + } + if( my_rank == 0 ){ + printf( "\n\n NAS Parallel Benchmarks 3.3 -- DT Benchmark\n\n" ); + graphShow(dg,0); + timer_clear(0); + timer_start(0); + } + + TRACE_smpi_set_category ("dt"); + verified=ProcessNodes(dg,my_rank); + TRACE_smpi_set_category ("finalize"); + + featnum=NUM_SAMPLES*fielddim; + bytes_sent=featnum*dg->numArcs; + bytes_sent/=1048576; + if(my_rank==0){ + timer_stop(0); + tot_time=timer_read(0); + c_print_results( dg->name, + CLASS, + featnum, + 0, + 0, + dg->numNodes, + 0, + comm_size, + tot_time, + bytes_sent/tot_time, + "bytes transmitted", + verified, + NPBVERSION, + COMPILETIME, + MPICC, + CLINK, + CMPI_LIB, + CMPI_INC, + CFLAGS, + CLINKFLAGS ); + } + MPI_Finalize(); + return 1; +} diff --git a/examples/smpi/NAS/EP-trace/Makefile b/examples/smpi/NAS/EP-trace/Makefile new file mode 100644 index 0000000000..6194dcfe97 --- /dev/null +++ b/examples/smpi/NAS/EP-trace/Makefile @@ -0,0 +1,28 @@ +SHELL=/bin/sh +BENCHMARK=ep +BENCHMARKU=EP + +include ../config/make.def + +#OBJS = ep-trace.o ${COMMON}/print_results.o ${COMMON}/${RAND}.o ${COMMON}/timers.o +OBJS = ep-trace.o randlc.o + +include ../sys/make.common + +${PROGRAM}: config ${OBJS} +# ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB} + ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${CMPI_LIB} + + +#ep-trace.o: ep-trace.f mpinpb.h npbparams.h +# ${FCOMPILE} ep-trace.f + +ep-trace.o: ep-trace.c randlc.c mpinpb.h npbparams.h + ${CCOMPILE} ep-trace.c + +clean: + - rm -f *.o *~ + - rm -f npbparams.h core + + + diff --git a/examples/smpi/NAS/EP-trace/README b/examples/smpi/NAS/EP-trace/README new file mode 100644 index 0000000000..6eb36571af --- /dev/null +++ b/examples/smpi/NAS/EP-trace/README @@ -0,0 +1,6 @@ +This code implements the random-number generator described in the +NAS Parallel Benchmark document RNR Technical Report RNR-94-007. +The code is "embarrassingly" parallel in that no communication is +required for the generation of the random numbers itself. There is +no special requirement on the number of processors used for running +the benchmark. diff --git a/examples/smpi/NAS/EP-trace/ep-trace.c b/examples/smpi/NAS/EP-trace/ep-trace.c new file mode 100644 index 0000000000..a0cec1b10a --- /dev/null +++ b/examples/smpi/NAS/EP-trace/ep-trace.c @@ -0,0 +1,449 @@ +#include +#include +#include +#include + +#include "mpi.h" +#include "npbparams.h" + +#include "instr/instr.h" //TRACE_ + +#include "randlc.h" + +#ifndef CLASS +#define CLASS 'S' +#define NUM_PROCS 1 +#endif +#define true 1 +#define false 0 + + +//---NOTE : all the timers function have been modified to +// avoid global timers (privatize these). 
+ // ----------------------- timers --------------------- + void timer_clear(double *onetimer) { + //elapsed[n] = 0.0; + *onetimer = 0.0; + } + + void timer_start(double *onetimer) { + *onetimer = MPI_Wtime(); + } + + void timer_stop(int n,double *elapsed,double *start) { + double t, now; + + now = MPI_Wtime(); + t = now - start[n]; + elapsed[n] += t; + } + + double timer_read(int n, double *elapsed) { /* ok, useless, but jsut to keep function call */ + return(elapsed[n]); + } + /******************************************************************** + ***************** V R A N L C ****************** + ***************** *****************/ + double vranlc(int n, double x, double a, double *y) + { + int i; + long i246m1=0x00003FFFFFFFFFFF; + long LLx, Lx, La; + double d2m46; + +// This doesn't work, because the compiler does the calculation in 32 +// bits and overflows. No standard way (without f90 stuff) to specify +// that the rhs should be done in 64 bit arithmetic. +// parameter(i246m1=2**46-1) + + d2m46=pow(0.5,46); + +// c Note that the v6 compiler on an R8000 does something stupid with +// c the above. Using the following instead (or various other things) +// c makes the calculation run almost 10 times as fast. +// +// c save d2m46 +// c data d2m46/0.0d0/ +// c if (d2m46 .eq. 0.0d0) then +// c d2m46 = 0.5d0**46 +// c endif + + Lx = (long)x; + La = (long)a; + //fprintf(stdout,("================== Vranlc ================"); + //fprintf(stdout,("Before Loop: Lx = " + Lx + ", La = " + La); + LLx = Lx; + for (i=0; i< n; i++) { + Lx = Lx*La & i246m1 ; + LLx = Lx; + y[i] = d2m46 * (double)LLx; + /* + if(i == 0) { + fprintf(stdout,("After loop 0:"); + fprintf(stdout,("Lx = " + Lx + ", La = " + La); + fprintf(stdout,("d2m46 = " + d2m46); + fprintf(stdout,("LLX(Lx) = " + LLX.doubleValue()); + fprintf(stdout,("Y[0]" + y[0]); + } + */ + } + + x = (double)LLx; + /* + fprintf(stdout,("Change: Lx = " + Lx); + fprintf(stdout,("=============End Vranlc ================"); + */ + return x; + } + + + +//-------------- the core (unique function) ----------- + void doTest(int argc, char **argv) { + double dum[3] = {1.,1.,1.}; + double x1, x2, sx, sy, tm, an, tt, gc; + double Mops; + double epsilon=1.0E-8, a = 1220703125., s=271828183.; + double t1, t2, t3, t4; + double sx_verify_value, sy_verify_value, sx_err, sy_err; + +#include "npbparams.h" + int mk=16, + // --> set by make : in npbparams.h + //m=28, // for CLASS=A + //m=30, // for CLASS=B + //npm=2, // NPROCS + mm = m-mk, + nn = (int)(pow(2,mm)), + nk = (int)(pow(2,mk)), + nq=10, + np, + node, + no_nodes, + i, + ik, + kk, + l, + k, nit, no_large_nodes, + np_add, k_offset, j; + int me, nprocs, root=0, dp_type; + int verified, + timers_enabled=true; + char size[500]; // mind the size of the string to represent a big number + + //Use in randlc.. + int KS = 0; + double R23, R46, T23, T46; + + double *qq = (double *) malloc (10000*sizeof(double)); + double *start = (double *) malloc (64*sizeof(double)); + double *elapsed = (double *) malloc (64*sizeof(double)); + + double *x = (double *) malloc (2*nk*sizeof(double)); + double *q = (double *) malloc (nq*sizeof(double)); + + TRACE_smpi_set_category ("start"); + + MPI_Init( &argc, &argv ); + MPI_Comm_size( MPI_COMM_WORLD, &no_nodes); + MPI_Comm_rank( MPI_COMM_WORLD, &node); + +#ifdef USE_MPE + MPE_Init_log(); +#endif + root = 0; + if (node == root ) { + + /* Because the size of the problem is too large to store in a 32-bit + * integer for some classes, we put it into a string (for printing). 
+ * Have to strip off the decimal point put in there by the floating + * point print statement (internal file) + */ + fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark"); + sprintf(size,"%d",pow(2,m+1)); + //size = size.replace('.', ' '); + fprintf(stdout," Number of random numbers generated: %s\n",size); + fprintf(stdout," Number of active processes: %d\n",no_nodes); + + } + verified = false; + + /* c Compute the number of "batches" of random number pairs generated + c per processor. Adjust if the number of processors does not evenly + c divide the total number +*/ + + np = nn / no_nodes; + no_large_nodes = nn % no_nodes; + if (node < no_large_nodes) np_add = 1; + else np_add = 0; + np = np + np_add; + + if (np == 0) { + fprintf(stdout,"Too many nodes: %d %d",no_nodes,nn); + MPI_Abort(MPI_COMM_WORLD,1); + exit(0); + } + +/* c Call the random number generator functions and initialize + c the x-array to reduce the effects of paging on the timings. + c Also, call all mathematical functions that are used. Make + c sure these initializations cannot be eliminated as dead code. +*/ + + //call vranlc(0, dum[1], dum[2], dum[3]); + // Array indexes start at 1 in Fortran, 0 in Java + vranlc(0, dum[0], dum[1], &(dum[2])); + + dum[0] = randlc(&(dum[1]),&(dum[2])); + ///////////////////////////////// + for (i=0;i<2*nk;i++) { + x[i] = -1e99; + } + Mops = log(sqrt(abs(1))); + + /* + c--------------------------------------------------------------------- + c Synchronize before placing time stamp + c--------------------------------------------------------------------- + */ + MPI_Barrier( MPI_COMM_WORLD ); + + + TRACE_smpi_set_category ("ep"); + + timer_clear(&(elapsed[1])); + timer_clear(&(elapsed[2])); + timer_clear(&(elapsed[3])); + timer_start(&(start[1])); + + t1 = a; + //fprintf(stdout,("(ep.f:160) t1 = " + t1); + t1 = vranlc(0, t1, a, x); + //fprintf(stdout,("(ep.f:161) t1 = " + t1); + + +/* c Compute AN = A ^ (2 * NK) (mod 2^46). */ + + t1 = a; + //fprintf(stdout,("(ep.f:165) t1 = " + t1); + for (i=1; i <= mk+1; i++) { + t2 = randlc(&t1, &t1); + //fprintf(stdout,("(ep.f:168)[loop i=" + i +"] t1 = " + t1); + } + an = t1; + //fprintf(stdout,("(ep.f:172) s = " + s); + tt = s; + gc = 0.; + sx = 0.; + sy = 0.; + for (i=0; i < nq ; i++) { + q[i] = 0.; + } + +/* + Each instance of this loop may be performed independently. We compute + the k offsets separately to take into account the fact that some nodes + have more numbers to generate than others +*/ + + if (np_add == 1) + k_offset = node * np -1; + else + k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1; + + int stop = false; + for(k = 1; k <= np; k++) { + stop = false; + kk = k_offset + k ; + t1 = s; + //fprintf(stdout,("(ep.f:193) t1 = " + t1); + t2 = an; + +// Find starting seed t1 for this kk. + + for (i=1;i<=100 && !stop;i++) { + ik = kk / 2; + //fprintf(stdout,("(ep.f:199) ik = " +ik+", kk = " + kk); + if (2 * ik != kk) { + t3 = randlc(&t1, &t2); + //fprintf(stdout,("(ep.f:200) t1= " +t1 ); + } + if (ik==0) + stop = true; + else { + t3 = randlc(&t2, &t2); + kk = ik; + } + } +// Compute uniform pseudorandom numbers. 
+ + //if (timers_enabled) timer_start(3); + timer_start(&(start[3])); + //call vranlc(2 * nk, t1, a, x) --> t1 and y are modified + + //fprintf(stdout,">>>>>>>>>>>Before vranlc(l.210)<<<<<<<<<<<<<"); + //fprintf(stdout,"2*nk = " + (2*nk)); + //fprintf(stdout,"t1 = " + t1); + //fprintf(stdout,"a = " + a); + //fprintf(stdout,"x[0] = " + x[0]); + //fprintf(stdout,">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<"); + + t1 = vranlc(2 * nk, t1, a, x); + + //fprintf(stdout,(">>>>>>>>>>>After Enter vranlc (l.210)<<<<<<"); + //fprintf(stdout,("2*nk = " + (2*nk)); + //fprintf(stdout,("t1 = " + t1); + //fprintf(stdout,("a = " + a); + //fprintf(stdout,("x[0] = " + x[0]); + //fprintf(stdout,(">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<"); + + //if (timers_enabled) timer_stop(3); + timer_stop(3,elapsed,start); + +/* Compute Gaussian deviates by acceptance-rejection method and + * tally counts in concentric square annuli. This loop is not + * vectorizable. + */ + //if (timers_enabled) timer_start(2); + timer_start(&(start[2])); + for(i=1; i<=nk;i++) { + x1 = 2. * x[2*i-2] -1.0; + x2 = 2. * x[2*i-1] - 1.0; + t1 = x1*x1 + x2*x2; + if (t1 <= 1.) { + t2 = sqrt(-2. * log(t1) / t1); + t3 = (x1 * t2); + t4 = (x2 * t2); + l = (int)(abs(t3) > abs(t4) ? abs(t3) : abs(t4)); + q[l] = q[l] + 1.; + sx = sx + t3; + sy = sy + t4; + } + /* + if(i == 1) { + fprintf(stdout,"x1 = " + x1); + fprintf(stdout,"x2 = " + x2); + fprintf(stdout,"t1 = " + t1); + fprintf(stdout,"t2 = " + t2); + fprintf(stdout,"t3 = " + t3); + fprintf(stdout,"t4 = " + t4); + fprintf(stdout,"l = " + l); + fprintf(stdout,"q[l] = " + q[l]); + fprintf(stdout,"sx = " + sx); + fprintf(stdout,"sy = " + sy); + } + */ + } + //if (timers_enabled) timer_stop(2); + timer_stop(2,elapsed,start); + } + + TRACE_smpi_set_category ("finalize"); + + //int MPI_Allreduce(void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm) + MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + sx = x[0]; //FIXME : x[0] or x[1] => x[0] because fortran starts with 1 + MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + sy = x[0]; + MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + + for(i = 0; i < nq; i++) { + q[i] = x[i]; + } + for(i = 0; i < nq; i++) { + gc += q[i]; + } + + timer_stop(1,elapsed,start); + tm = timer_read(1,elapsed); + MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + tm = x[0]; + + if(node == root) { + nit = 0; + verified = true; + + if(m == 24) { + sx_verify_value = -3.247834652034740E3; + sy_verify_value = -6.958407078382297E3; + } else if(m == 25) { + sx_verify_value = -2.863319731645753E3; + sy_verify_value = -6.320053679109499E3; + } else if(m == 28) { + sx_verify_value = -4.295875165629892E3; + sy_verify_value = -1.580732573678431E4; + } else if(m == 30) { + sx_verify_value = 4.033815542441498E4; + sy_verify_value = -2.660669192809235E4; + } else if(m == 32) { + sx_verify_value = 4.764367927995374E4; + sy_verify_value = -8.084072988043731E4; + } else if(m == 36) { + sx_verify_value = 1.982481200946593E5; + sy_verify_value = -1.020596636361769E5; + } else { + verified = false; + } + + /* + fprintf(stdout,("sx = " + sx); + fprintf(stdout,("sx_verify = " + sx_verify_value); + fprintf(stdout,("sy = " + sy); + fprintf(stdout,("sy_verify = " + sy_verify_value); + */ + if(verified) { + sx_err = abs((sx - sx_verify_value)/sx_verify_value); + sy_err = abs((sy - sy_verify_value)/sy_verify_value); + /* + fprintf(stdout,("sx_err = " + sx_err); + fprintf(stdout,("sy_err = " + sx_err); + 
fprintf(stdout,("epsilon= " + epsilon); + */ + verified = ((sx_err < epsilon) && (sy_err < epsilon)); + } + + Mops = (pow(2.0, m+1))/tm/1000; + + fprintf(stdout,"EP Benchmark Results:\n"); + fprintf(stdout,"CPU Time=%d\n",tm); + fprintf(stdout,"N = 2^%d\n",m); + fprintf(stdout,"No. Gaussain Pairs =%d\n",gc); + fprintf(stdout,"Sum = %lf %ld\n",sx,sy); + fprintf(stdout,"Count:"); + for(i = 0; i < nq; i++) { + fprintf(stdout,"%d\t %ld\n",i,q[i]); + } + + /* + print_results("EP", _class, m+1, 0, 0, nit, npm, no_nodes, tm, Mops, + "Random numbers generated", verified, npbversion, + compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) */ + fprintf(stdout,"\nEP Benchmark Completed\n"); + fprintf(stdout,"Class = %s\n", _class); + fprintf(stdout,"Size = %s\n", size); + fprintf(stdout,"Iteration = %d\n", nit); + fprintf(stdout,"Time in seconds = %lf\n",(tm/1000)); + fprintf(stdout,"Total processes = %d\n",no_nodes); + fprintf(stdout,"Mops/s total = %lf\n",Mops); + fprintf(stdout,"Mops/s/process = %lf\n", Mops/no_nodes); + fprintf(stdout,"Operation type = Random number generated\n"); + if(verified) { + fprintf(stdout,"Verification = SUCCESSFUL\n"); + } else { + fprintf(stdout,"Verification = UNSUCCESSFUL\n"); + } + fprintf(stdout,"Total time: %lf\n",(timer_read(1,elapsed)/1000)); + fprintf(stdout,"Gaussian pairs: %lf\n",(timer_read(2,elapsed)/1000)); + fprintf(stdout,"Random numbers: %lf\n",(timer_read(3,elapsed)/1000)); + } +#ifdef USE_MPE + MPE_Finish_log(argv[0]); +#endif + + MPI_Finalize(); + } + + int main(int argc, char **argv) { + doTest(argc,argv); + } diff --git a/examples/smpi/NAS/EP-trace/mpinpb.h b/examples/smpi/NAS/EP-trace/mpinpb.h new file mode 100644 index 0000000000..1f13637ae4 --- /dev/null +++ b/examples/smpi/NAS/EP-trace/mpinpb.h @@ -0,0 +1,9 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'mpif.h' + + integer me, nprocs, root, dp_type + common /mpistuff/ me, nprocs, root, dp_type + diff --git a/examples/smpi/NAS/EP-trace/randlc.c b/examples/smpi/NAS/EP-trace/randlc.c new file mode 100644 index 0000000000..624b800df8 --- /dev/null +++ b/examples/smpi/NAS/EP-trace/randlc.c @@ -0,0 +1,107 @@ + +/* + * FUNCTION RANDLC (X, A) + * + * This routine returns a uniform pseudorandom double precision number in the + * range (0, 1) by using the linear congruential generator + * + * x_{k+1} = a x_k (mod 2^46) + * + * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers + * before repeating. The argument A is the same as 'a' in the above formula, + * and X is the same as x_0. A and X must be odd double precision integers + * in the range (1, 2^46). The returned value RANDLC is normalized to be + * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain + * the new seed x_1, so that subsequent calls to RANDLC using the same + * arguments will generate a continuous sequence. + * + * This routine should produce the same results on any computer with at least + * 48 mantissa bits in double precision floating point data. On Cray systems, + * double precision should be disabled. + * + * David H. Bailey October 26, 1990 + * + * IMPLICIT DOUBLE PRECISION (A-H, O-Z) + * SAVE KS, R23, R46, T23, T46 + * DATA KS/0/ + * + * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46, + * T23 = 2 ^ 23, and T46 = 2 ^ 46. 
These are computed in loops, rather than + * by merely using the ** operator, in order to insure that the results are + * exact on all systems. This code assumes that 0.5D0 is represented exactly. + */ + + +/*****************************************************************/ +/************* R A N D L C ************/ +/************* ************/ +/************* portable random number generator ************/ +/*****************************************************************/ + +double randlc( double *X, double *A ) +{ + static int KS=0; + static double R23, R46, T23, T46; + double T1, T2, T3, T4; + double A1; + double A2; + double X1; + double X2; + double Z; + int i, j; + + if (KS == 0) + { + R23 = 1.0; + R46 = 1.0; + T23 = 1.0; + T46 = 1.0; + + for (i=1; i<=23; i++) + { + R23 = 0.50 * R23; + T23 = 2.0 * T23; + } + for (i=1; i<=46; i++) + { + R46 = 0.50 * R46; + T46 = 2.0 * T46; + } + KS = 1; + } + +/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */ + + T1 = R23 * *A; + j = T1; + A1 = j; + A2 = *A - T23 * A1; + +/* Break X into two parts such that X = 2^23 * X1 + X2, compute + Z = A1 * X2 + A2 * X1 (mod 2^23), and then + X = 2^23 * Z + A2 * X2 (mod 2^46). */ + + T1 = R23 * *X; + j = T1; + X1 = j; + X2 = *X - T23 * X1; + T1 = A1 * X2 + A2 * X1; + + j = R23 * T1; + T2 = j; + Z = T1 - T23 * T2; + T3 = T23 * Z + A2 * X2; + j = R46 * T3; + T4 = j; + *X = T3 - T46 * T4; + return(R46 * *X); +} + + + +/*****************************************************************/ +/************ F I N D _ M Y _ S E E D ************/ +/************ ************/ +/************ returns parallel random number seq seed ************/ +/*****************************************************************/ + diff --git a/examples/smpi/NAS/EP-trace/randlc.h b/examples/smpi/NAS/EP-trace/randlc.h new file mode 100644 index 0000000000..aff84d341a --- /dev/null +++ b/examples/smpi/NAS/EP-trace/randlc.h @@ -0,0 +1,3 @@ + +double randlc( double *X, double *A ); + diff --git a/examples/smpi/NAS/IS-trace/Makefile b/examples/smpi/NAS/IS-trace/Makefile new file mode 100644 index 0000000000..4911d0ddd5 --- /dev/null +++ b/examples/smpi/NAS/IS-trace/Makefile @@ -0,0 +1,22 @@ +SHELL=/bin/sh +BENCHMARK=is +BENCHMARKU=IS + +include ../config/make.def + +include ../sys/make.common + +OBJS = is-trace.o ${COMMON}/c_print_results.o + + +${PROGRAM}: config ${OBJS} + ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${CMPI_LIB} + +.c.o: + ${CCOMPILE} $< + +is-trace.o: is-trace.c npbparams.h + +clean: + - rm -f *.o *~ mputil* + - rm -f is-trace npbparams.h core diff --git a/examples/smpi/NAS/IS-trace/is-trace.c b/examples/smpi/NAS/IS-trace/is-trace.c new file mode 100644 index 0000000000..f85c7c15a0 --- /dev/null +++ b/examples/smpi/NAS/IS-trace/is-trace.c @@ -0,0 +1,1158 @@ +/************************************************************************* + * * + * N A S P A R A L L E L B E N C H M A R K S 3.3 * + * * + * I S * + * * + ************************************************************************* + * * + * This benchmark is part of the NAS Parallel Benchmark 3.3 suite. * + * It is described in NAS Technical Report 95-020. * + * * + * Permission to use, copy, distribute and modify this software * + * for any purpose with or without fee is hereby granted. We * + * request, however, that all derived work reference the NAS * + * Parallel Benchmarks 3.3. This software is provided "as is" * + * without express or implied warranty. 
* + * * + * Information on NPB 3.3, including the technical report, the * + * original specifications, source code, results and information * + * on how to submit new results, is available at: * + * * + * http://www.nas.nasa.gov/Software/NPB * + * * + * Send comments or suggestions to npb@nas.nasa.gov * + * Send bug reports to npb-bugs@nas.nasa.gov * + * * + * NAS Parallel Benchmarks Group * + * NASA Ames Research Center * + * Mail Stop: T27A-1 * + * Moffett Field, CA 94035-1000 * + * * + * E-mail: npb@nas.nasa.gov * + * Fax: (650) 604-3957 * + * * + ************************************************************************* + * * + * Author: M. Yarrow * + * H. Jin * + * * + *************************************************************************/ + +#include "mpi.h" +#include "npbparams.h" +#include +#include + +#include "instr/instr.h" //TRACE_ + +/******************/ +/* default values */ +/******************/ +#ifndef CLASS +#define CLASS 'S' +#define NUM_PROCS 1 +#endif +#define MIN_PROCS 1 + + +/*************/ +/* CLASS S */ +/*************/ +#if CLASS == 'S' +#define TOTAL_KEYS_LOG_2 16 +#define MAX_KEY_LOG_2 11 +#define NUM_BUCKETS_LOG_2 9 +#endif + + +/*************/ +/* CLASS W */ +/*************/ +#if CLASS == 'W' +#define TOTAL_KEYS_LOG_2 20 +#define MAX_KEY_LOG_2 16 +#define NUM_BUCKETS_LOG_2 10 +#endif + +/*************/ +/* CLASS A */ +/*************/ +#if CLASS == 'A' +#define TOTAL_KEYS_LOG_2 23 +#define MAX_KEY_LOG_2 19 +#define NUM_BUCKETS_LOG_2 10 +#endif + + +/*************/ +/* CLASS B */ +/*************/ +#if CLASS == 'B' +#define TOTAL_KEYS_LOG_2 25 +#define MAX_KEY_LOG_2 21 +#define NUM_BUCKETS_LOG_2 10 +#endif + + +/*************/ +/* CLASS C */ +/*************/ +#if CLASS == 'C' +#define TOTAL_KEYS_LOG_2 27 +#define MAX_KEY_LOG_2 23 +#define NUM_BUCKETS_LOG_2 10 +#endif + + +/*************/ +/* CLASS D */ +/*************/ +#if CLASS == 'D' +#define TOTAL_KEYS_LOG_2 29 +#define MAX_KEY_LOG_2 27 +#define NUM_BUCKETS_LOG_2 10 +#undef MIN_PROCS +#define MIN_PROCS 4 +#endif + + +#define TOTAL_KEYS (1 << TOTAL_KEYS_LOG_2) +#define MAX_KEY (1 << MAX_KEY_LOG_2) +#define NUM_BUCKETS (1 << NUM_BUCKETS_LOG_2) +#define NUM_KEYS (TOTAL_KEYS/NUM_PROCS*MIN_PROCS) + +/*****************************************************************/ +/* On larger number of processors, since the keys are (roughly) */ +/* gaussian distributed, the first and last processor sort keys */ +/* in a large interval, requiring array sizes to be larger. Note */ +/* that for large NUM_PROCS, NUM_KEYS is, however, a small number*/ +/* The required array size also depends on the bucket size used. */ +/* The following values are validated for the 1024-bucket setup. */ +/*****************************************************************/ +#if NUM_PROCS < 256 +#define SIZE_OF_BUFFERS 3*NUM_KEYS/2 +#elif NUM_PROCS < 512 +#define SIZE_OF_BUFFERS 5*NUM_KEYS/2 +#elif NUM_PROCS < 1024 +#define SIZE_OF_BUFFERS 4*NUM_KEYS +#else +#define SIZE_OF_BUFFERS 13*NUM_KEYS/2 +#endif + +/*****************************************************************/ +/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */ +/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. 
INCREASE */ +/* MAX_PROCS AT YOUR PERIL */ +/*****************************************************************/ +#if CLASS == 'S' +#define MAX_PROCS 128 +#else +#define MAX_PROCS 1024 +#endif + +#define MAX_ITERATIONS 10 +#define TEST_ARRAY_SIZE 5 + + +/***********************************/ +/* Enable separate communication, */ +/* computation timing and printout */ +/***********************************/ +/* #define TIMING_ENABLED */ + + +/*************************************/ +/* Typedef: if necessary, change the */ +/* size of int here by changing the */ +/* int type to, say, long */ +/*************************************/ +typedef int INT_TYPE; +typedef long INT_TYPE2; +#define MP_KEY_TYPE MPI_INT + + +typedef struct { + +/********************/ +/* MPI properties: */ +/********************/ +int my_rank, + comm_size; + + +/********************/ +/* Some global info */ +/********************/ +INT_TYPE *key_buff_ptr_global, /* used by full_verify to get */ + total_local_keys, /* copies of rank info */ + total_lesser_keys; + + +int passed_verification; + + + +/************************************/ +/* These are the three main arrays. */ +/* See SIZE_OF_BUFFERS def above */ +/************************************/ +INT_TYPE key_array[SIZE_OF_BUFFERS], + key_buff1[SIZE_OF_BUFFERS], + key_buff2[SIZE_OF_BUFFERS], + bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE], /* Top 5 elements for */ + bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals */ + bucket_ptrs[NUM_BUCKETS], + process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE], + process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE]; +int send_count[MAX_PROCS], recv_count[MAX_PROCS], + send_displ[MAX_PROCS], recv_displ[MAX_PROCS]; + + +/**********************/ +/* Partial verif info */ +/**********************/ +INT_TYPE2 test_index_array[TEST_ARRAY_SIZE], + test_rank_array[TEST_ARRAY_SIZE]; + +/**********/ +/* Timers */ +/**********/ +double start[64], elapsed[64]; + +} global_data; + + +const INT_TYPE2 + S_test_index_array[TEST_ARRAY_SIZE] = + {48427,17148,23627,62548,4431}, + S_test_rank_array[TEST_ARRAY_SIZE] = + {0,18,346,64917,65463}, + + W_test_index_array[TEST_ARRAY_SIZE] = + {357773,934767,875723,898999,404505}, + W_test_rank_array[TEST_ARRAY_SIZE] = + {1249,11698,1039987,1043896,1048018}, + + A_test_index_array[TEST_ARRAY_SIZE] = + {2112377,662041,5336171,3642833,4250760}, + A_test_rank_array[TEST_ARRAY_SIZE] = + {104,17523,123928,8288932,8388264}, + + B_test_index_array[TEST_ARRAY_SIZE] = + {41869,812306,5102857,18232239,26860214}, + B_test_rank_array[TEST_ARRAY_SIZE] = + {33422937,10244,59149,33135281,99}, + + C_test_index_array[TEST_ARRAY_SIZE] = + {44172927,72999161,74326391,129606274,21736814}, + C_test_rank_array[TEST_ARRAY_SIZE] = + {61147,882988,266290,133997595,133525895}, + + D_test_index_array[TEST_ARRAY_SIZE] = + {1317351170,995930646,1157283250,1503301535,1453734525}, + D_test_rank_array[TEST_ARRAY_SIZE] = + {1,36538729,1978098519,2145192618,2147425337}; + + + +/***********************/ +/* function prototypes */ +/***********************/ +double randlc( double *X, double *A ); + +void full_verify( global_data* gd ); + +void c_print_results( char *name, + char class, + int n1, + int n2, + int n3, + int niter, + int nprocs_compiled, + int nprocs_total, + double t, + double mops, + char *optype, + int passed_verification, + char *npbversion, + char *compiletime, + char *mpicc, + char *clink, + char *cmpi_lib, + char *cmpi_inc, + char *cflags, + char *clinkflags ); + +void timer_clear(global_data* gd, int n 
); +void timer_start(global_data* gd, int n ); +void timer_stop(global_data* gd, int n ); +double timer_read(global_data* gd, int n ); + +void timer_clear(global_data* gd, int n ) { + gd->elapsed[n] = 0.0; +} + +void timer_start(global_data* gd, int n ) { + gd->start[n] = MPI_Wtime(); +} + +void timer_stop(global_data* gd, int n ) { + gd->elapsed[n] += MPI_Wtime() - gd->start[n]; +} + +double timer_read(global_data* gd, int n ) { + return gd->elapsed[n]; +} + + +/* + * FUNCTION RANDLC (X, A) + * + * This routine returns a uniform pseudorandom double precision number in the + * range (0, 1) by using the linear congruential generator + * + * x_{k+1} = a x_k (mod 2^46) + * + * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers + * before repeating. The argument A is the same as 'a' in the above formula, + * and X is the same as x_0. A and X must be odd double precision integers + * in the range (1, 2^46). The returned value RANDLC is normalized to be + * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain + * the new seed x_1, so that subsequent calls to RANDLC using the same + * arguments will generate a continuous sequence. + * + * This routine should produce the same results on any computer with at least + * 48 mantissa bits in double precision floating point data. On Cray systems, + * double precision should be disabled. + * + * David H. Bailey October 26, 1990 + * + * IMPLICIT DOUBLE PRECISION (A-H, O-Z) + * SAVE KS, R23, R46, T23, T46 + * DATA KS/0/ + * + * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46, + * T23 = 2 ^ 23, and T46 = 2 ^ 46. These are computed in loops, rather than + * by merely using the ** operator, in order to insure that the results are + * exact on all systems. This code assumes that 0.5D0 is represented exactly. + */ + + +/*****************************************************************/ +/************* R A N D L C ************/ +/************* ************/ +/************* portable random number generator ************/ +/*****************************************************************/ + +double randlc( double *X, double *A ) +{ + static int KS=0; + static double R23, R46, T23, T46; + double T1, T2, T3, T4; + double A1; + double A2; + double X1; + double X2; + double Z; + int i, j; + + if (KS == 0) + { + R23 = 1.0; + R46 = 1.0; + T23 = 1.0; + T46 = 1.0; + + for (i=1; i<=23; i++) + { + R23 = 0.50 * R23; + T23 = 2.0 * T23; + } + for (i=1; i<=46; i++) + { + R46 = 0.50 * R46; + T46 = 2.0 * T46; + } + KS = 1; + } + +/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */ + + T1 = R23 * *A; + j = T1; + A1 = j; + A2 = *A - T23 * A1; + +/* Break X into two parts such that X = 2^23 * X1 + X2, compute + Z = A1 * X2 + A2 * X1 (mod 2^23), and then + X = 2^23 * Z + A2 * X2 (mod 2^46). */ + + T1 = R23 * *X; + j = T1; + X1 = j; + X2 = *X - T23 * X1; + T1 = A1 * X2 + A2 * X1; + + j = R23 * T1; + T2 = j; + Z = T1 - T23 * T2; + T3 = T23 * Z + A2 * X2; + j = R46 * T3; + T4 = j; + *X = T3 - T46 * T4; + return(R46 * *X); +} + + + +/*****************************************************************/ +/************ F I N D _ M Y _ S E E D ************/ +/************ ************/ +/************ returns parallel random number seq seed ************/ +/*****************************************************************/ + +/* + * Create a random number sequence of total length nn residing + * on np number of processors. Each processor will therefore have a + * subsequence of length nn/np. 
This routine returns that random + * number which is the first random number for the subsequence belonging + * to processor rank kn, and which is used as seed for proc kn ran # gen. + */ + +double find_my_seed( int kn, /* my processor rank, 0<=kn<=num procs */ + int np, /* np = num procs */ + long nn, /* total num of ran numbers, all procs */ + double s, /* Ran num seed, for ex.: 314159265.00 */ + double a ) /* Ran num gen mult, try 1220703125.00 */ +{ + + long i; + + double t1,t2,t3,an; + long mq,nq,kk,ik; + + + + nq = nn / np; + + for( mq=0; nq>1; mq++,nq/=2 ) + ; + + t1 = a; + + for( i=1; i<=mq; i++ ) + t2 = randlc( &t1, &t1 ); + + an = t1; + + kk = kn; + t1 = s; + t2 = an; + + for( i=1; i<=100; i++ ) + { + ik = kk / 2; + if( 2 * ik != kk ) + t3 = randlc( &t1, &t2 ); + if( ik == 0 ) + break; + t3 = randlc( &t2, &t2 ); + kk = ik; + } + + return( t1 ); + +} + + + + +/*****************************************************************/ +/************* C R E A T E _ S E Q ************/ +/*****************************************************************/ + +void create_seq( global_data* gd, double seed, double a ) +{ + double x; + int i, k; + + k = MAX_KEY/4; + + for (i=0; ikey_array[i] = k*x; + } +} + + + + +/*****************************************************************/ +/************* F U L L _ V E R I F Y ************/ +/*****************************************************************/ + + +void full_verify( global_data* gd ) +{ + MPI_Status status; + MPI_Request request; + + INT_TYPE i, j; + INT_TYPE k, last_local_key; + + +/* Now, finally, sort the keys: */ + for( i=0; itotal_local_keys; i++ ) + gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]- + gd->total_lesser_keys] = gd->key_buff2[i]; + last_local_key = (gd->total_local_keys<1)? 
0 : (gd->total_local_keys-1); + +/* Send largest key value to next processor */ + if( gd->my_rank > 0 ) + MPI_Irecv( &k, + 1, + MP_KEY_TYPE, + gd->my_rank-1, + 1000, + MPI_COMM_WORLD, + &request ); + if( gd->my_rank < gd->comm_size-1 ) + MPI_Send( &gd->key_array[last_local_key], + 1, + MP_KEY_TYPE, + gd->my_rank+1, + 1000, + MPI_COMM_WORLD ); + if( gd->my_rank > 0 ) + MPI_Wait( &request, &status ); + +/* Confirm that neighbor's greatest key value + is not greater than my least key value */ + j = 0; + if( gd->my_rank > 0 && gd->total_local_keys > 0 ) + if( k > gd->key_array[0] ) + j++; + + +/* Confirm keys correctly sorted: count incorrectly sorted keys, if any */ + for( i=1; itotal_local_keys; i++ ) + if( gd->key_array[i-1] > gd->key_array[i] ) + j++; + + + if( j != 0 ) + { + printf( "Processor %d: Full_verify: number of keys out of sort: %d\n", + gd->my_rank, j ); + } + else + gd->passed_verification++; + + +} + + + + +/*****************************************************************/ +/************* R A N K ****************/ +/*****************************************************************/ + + +void rank( global_data* gd, int iteration ) +{ + + INT_TYPE i, k; + + INT_TYPE shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2; + INT_TYPE key; + INT_TYPE2 bucket_sum_accumulator, j, m; + INT_TYPE local_bucket_sum_accumulator; + INT_TYPE min_key_val, max_key_val; + INT_TYPE *key_buff_ptr; + + + + +/* Iteration alteration of keys */ + if(gd->my_rank == 0 ) + { + gd->key_array[iteration] = iteration; + gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration; + } + + +/* Initialize */ + for( i=0; ibucket_size[i] = 0; + gd->bucket_size_totals[i] = 0; + gd->process_bucket_distrib_ptr1[i] = 0; + gd->process_bucket_distrib_ptr2[i] = 0; + } + + +/* Determine where the partial verify test keys are, load into */ +/* top of array bucket_size */ + for( i=0; itest_index_array[i]/NUM_KEYS) == gd->my_rank ) + gd->bucket_size[NUM_BUCKETS+i] = + gd->key_array[gd->test_index_array[i] % NUM_KEYS]; + + +/* Determine the number of keys in each bucket */ + for( i=0; ibucket_size[gd->key_array[i] >> shift]++; + + +/* Accumulative bucket sizes are the bucket pointers */ + gd->bucket_ptrs[0] = 0; + for( i=1; i< NUM_BUCKETS; i++ ) + gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1]; + + +/* Sort into appropriate bucket */ + for( i=0; ikey_array[i]; + gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key; + } + +#ifdef TIMING_ENABLED + timer_stop(gd, 2 ); + timer_start(gd, 3 ); +#endif + +/* Get the bucket size totals for the entire problem. These + will be used to determine the redistribution of keys */ + MPI_Allreduce( gd->bucket_size, + gd->bucket_size_totals, + NUM_BUCKETS+TEST_ARRAY_SIZE, + MP_KEY_TYPE, + MPI_SUM, + MPI_COMM_WORLD ); + +#ifdef TIMING_ENABLED + timer_stop(gd, 3 ); + timer_start(gd, 2 ); +#endif + +/* Determine Redistibution of keys: accumulate the bucket size totals + till this number surpasses NUM_KEYS (which the average number of keys + per processor). Then all keys in these buckets go to processor 0. + Continue accumulating again until supassing 2*NUM_KEYS. All keys + in these buckets go to processor 1, etc. This algorithm guarantees + that all processors have work ranking; no processors are left idle. 
+ The optimum number of buckets, however, does not result in as high + a degree of load balancing (as even a distribution of keys as is + possible) as is obtained from increasing the number of buckets, but + more buckets results in more computation per processor so that the + optimum number of buckets turns out to be 1024 for machines tested. + Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket + number of first and last bucket which each processor will have after + the redistribution is done. */ + + bucket_sum_accumulator = 0; + local_bucket_sum_accumulator = 0; + gd->send_displ[0] = 0; + gd->process_bucket_distrib_ptr1[0] = 0; + for( i=0, j=0; ibucket_size_totals[i]; + local_bucket_sum_accumulator += gd->bucket_size[i]; + if( bucket_sum_accumulator >= (j+1)*NUM_KEYS ) + { + gd->send_count[j] = local_bucket_sum_accumulator; + if( j != 0 ) + { + gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1]; + gd->process_bucket_distrib_ptr1[j] = + gd->process_bucket_distrib_ptr2[j-1]+1; + } + gd->process_bucket_distrib_ptr2[j++] = i; + local_bucket_sum_accumulator = 0; + } + } + +/* When NUM_PROCS approaching NUM_BUCKETS, it is highly possible + that the last few processors don't get any buckets. So, we + need to set counts properly in this case to avoid any fallouts. */ + while( j < gd->comm_size ) + { + gd->send_count[j] = 0; + gd->process_bucket_distrib_ptr1[j] = 1; + j++; + } + +#ifdef TIMING_ENABLED + timer_stop(gd, 2 ); + timer_start(gd, 3 ); +#endif + +/* This is the redistribution section: first find out how many keys + each processor will send to every other processor: */ + MPI_Alltoall( gd->send_count, + 1, + MPI_INT, + gd->recv_count, + 1, + MPI_INT, + MPI_COMM_WORLD ); + +/* Determine the receive array displacements for the buckets */ + gd->recv_displ[0] = 0; + for( i=1; icomm_size; i++ ) + gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1]; + + +/* Now send the keys to respective processors */ + MPI_Alltoallv( gd->key_buff1, + gd->send_count, + gd->send_displ, + MP_KEY_TYPE, + gd->key_buff2, + gd->recv_count, + gd->recv_displ, + MP_KEY_TYPE, + MPI_COMM_WORLD ); + +#ifdef TIMING_ENABLED + timer_stop(gd, 3 ); + timer_start(gd, 2 ); +#endif + +/* The starting and ending bucket numbers on each processor are + multiplied by the interval size of the buckets to obtain the + smallest possible min and greatest possible max value of any + key on each processor */ + min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift; + max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1; + +/* Clear the work array */ + for( i=0; ikey_buff1[i] = 0; + +/* Determine the total number of keys on all other + processors holding keys of lesser value */ + m = 0; + for( k=0; kmy_rank; k++ ) + for( i= gd->process_bucket_distrib_ptr1[k]; + i<=gd->process_bucket_distrib_ptr2[k]; + i++ ) + m += gd->bucket_size_totals[i]; /* m has total # of lesser keys */ + +/* Determine total number of keys on this processor */ + j = 0; + for( i= gd->process_bucket_distrib_ptr1[gd->my_rank]; + i<=gd->process_bucket_distrib_ptr2[gd->my_rank]; + i++ ) + j += gd->bucket_size_totals[i]; /* j has total # of local keys */ + + +/* Ranking of all keys occurs in this section: */ +/* shift it backwards so no subtractions are necessary in loop */ + key_buff_ptr = gd->key_buff1 - min_key_val; + +/* In this section, the keys themselves are used as their + own indexes to determine how many of each there are: their + individual population */ + for( i=0; ikey_buff2[i]]++; /* Now 
they have individual key */ + /* population */ + +/* To obtain ranks of each key, successively add the individual key + population, not forgetting the total of lesser keys, m. + NOTE: Since the total of lesser keys would be subtracted later + in verification, it is no longer added to the first key population + here, but still needed during the partial verify test. This is to + ensure that 32-bit key_buff can still be used for class D. */ +/* key_buff_ptr[min_key_val] += m; */ + for( i=min_key_val; ibucket_size_totals[i+NUM_BUCKETS]; /* Keys were hidden here */ + if( min_key_val <= k && k <= max_key_val ) + { + /* Add the total of lesser keys, m, here */ + INT_TYPE2 key_rank = key_buff_ptr[k-1] + m; + int failed = 0; + + switch( CLASS ) + { + case 'S': + if( i <= 2 ) + { + if( key_rank != gd->test_rank_array[i]+iteration ) + failed = 1; + else + gd->passed_verification++; + } + else + { + if( key_rank != gd->test_rank_array[i]-iteration ) + failed = 1; + else + gd->passed_verification++; + } + break; + case 'W': + if( i < 2 ) + { + if( key_rank != gd->test_rank_array[i]+(iteration-2) ) + failed = 1; + else + gd->passed_verification++; + } + else + { + if( key_rank != gd->test_rank_array[i]-iteration ) + failed = 1; + else + gd->passed_verification++; + } + break; + case 'A': + if( i <= 2 ) + { + if( key_rank != gd->test_rank_array[i]+(iteration-1) ) + failed = 1; + else + gd->passed_verification++; + } + else + { + if( key_rank != gd->test_rank_array[i]-(iteration-1) ) + failed = 1; + else + gd->passed_verification++; + } + break; + case 'B': + if( i == 1 || i == 2 || i == 4 ) + { + if( key_rank != gd->test_rank_array[i]+iteration ) + failed = 1; + else + gd->passed_verification++; + } + else + { + if( key_rank != gd->test_rank_array[i]-iteration ) + failed = 1; + else + gd->passed_verification++; + } + break; + case 'C': + if( i <= 2 ) + { + if( key_rank != gd->test_rank_array[i]+iteration ) + failed = 1; + else + gd->passed_verification++; + } + else + { + if( key_rank != gd->test_rank_array[i]-iteration ) + failed = 1; + else + gd->passed_verification++; + } + break; + case 'D': + if( i < 2 ) + { + if( key_rank != gd->test_rank_array[i]+iteration ) + failed = 1; + else + gd->passed_verification++; + } + else + { + if( key_rank != gd->test_rank_array[i]-iteration ) + failed = 1; + else + gd->passed_verification++; + } + break; + } + if( failed == 1 ) + printf( "Failed partial verification: " + "iteration %d, processor %d, test key %d\n", + iteration, gd->my_rank, (int)i ); + } + } + + + + +/* Make copies of rank info for use by full_verify: these variables + in rank are local; making them global slows down the code, probably + since they cannot be made register by compiler */ + + if( iteration == MAX_ITERATIONS ) + { + gd->key_buff_ptr_global = key_buff_ptr; + gd->total_local_keys = j; + gd->total_lesser_keys = 0; /* no longer set to 'm', see note above */ + } + +} + + +/*****************************************************************/ +/************* M A I N ****************/ +/*****************************************************************/ + +int main( int argc, char **argv ) +{ + + int i, iteration, itemp; + + double timecounter, maxtime; + + TRACE_smpi_set_category ("start"); + + global_data* gd = malloc(sizeof(global_data)); +/* Initialize MPI */ + MPI_Init( &argc, &argv ); + MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank ); + MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size ); + +/* Initialize the verification arrays if a valid class */ + for( i=0; itest_index_array[i] = 
S_test_index_array[i]; + gd->test_rank_array[i] = S_test_rank_array[i]; + break; + case 'A': + gd->test_index_array[i] = A_test_index_array[i]; + gd->test_rank_array[i] = A_test_rank_array[i]; + break; + case 'W': + gd->test_index_array[i] = W_test_index_array[i]; + gd->test_rank_array[i] = W_test_rank_array[i]; + break; + case 'B': + gd->test_index_array[i] = B_test_index_array[i]; + gd->test_rank_array[i] = B_test_rank_array[i]; + break; + case 'C': + gd->test_index_array[i] = C_test_index_array[i]; + gd->test_rank_array[i] = C_test_rank_array[i]; + break; + case 'D': + gd->test_index_array[i] = D_test_index_array[i]; + gd->test_rank_array[i] = D_test_rank_array[i]; + break; + }; + + + +/* Printout initial NPB info */ + if( gd->my_rank == 0 ) + { + printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" ); + printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS ); + printf( " Iterations: %d\n", MAX_ITERATIONS ); + printf( " Number of processes: %d\n",gd->comm_size ); + } + +/* Check that actual and compiled number of processors agree */ + if( gd->comm_size != NUM_PROCS ) + { + if( gd->my_rank == 0 ) + printf( "\n ERROR: compiled for %d processes\n" + " Number of active processes: %d\n" + " Exiting program!\n\n", NUM_PROCS, gd->comm_size ); + MPI_Finalize(); + exit( 1 ); + } + +/* Check to see whether total number of processes is within bounds. + This could in principle be checked in setparams.c, but it is more + convenient to do it here */ + if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS) + { + if( gd->my_rank == 0 ) + printf( "\n ERROR: number of processes %d not within range %d-%d" + "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS); + MPI_Finalize(); + exit( 1 ); + } + + +/* Generate random number sequence and subsequent keys on all procs */ + create_seq(gd, find_my_seed( gd->my_rank, + gd->comm_size, + 4*(long)TOTAL_KEYS*MIN_PROCS, + 314159265.00, /* Random number gen seed */ + 1220703125.00 ), /* Random number gen mult */ + 1220703125.00 ); /* Random number gen mult */ + +/* Do one interation for free (i.e., untimed) to guarantee initialization of + all data and code pages and respective tables */ + rank(gd, 1 ); + +/* Start verification counter */ + gd->passed_verification = 0; + + if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n iteration\n" ); + +/* Initialize timer */ + timer_clear(gd, 0 ); + +/* Initialize separate communication, computation timing */ +#ifdef TIMING_ENABLED + for( i=1; i<=3; i++ ) timer_clear(gd, i ); +#endif + +/* Start timer */ + timer_start(gd, 0 ); + +#ifdef TIMING_ENABLED + timer_start(gd, 1 ); + timer_start(gd, 2 ); +#endif + +/* This is the main iteration */ + for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) + { + char it_str[100]; + snprintf (it_str, 100, "i%d", iteration); + TRACE_smpi_set_category (it_str); + + if( gd->my_rank == 0 && CLASS != 'S' ) printf( " %d\n", iteration ); + rank(gd, iteration ); + } + + TRACE_smpi_set_category ("finalize"); + + +#ifdef TIMING_ENABLED + timer_stop(gd, 2 ); + timer_stop(gd, 1 ); +#endif + +/* Stop timer, obtain time for processors */ + timer_stop(gd, 0 ); + + timecounter = timer_read(gd, 0 ); + +/* End of timing, obtain maximum time of all processors */ + MPI_Reduce( &timecounter, + &maxtime, + 1, + MPI_DOUBLE, + MPI_MAX, + 0, + MPI_COMM_WORLD ); + +#ifdef TIMING_ENABLED + { + double tmin, tsum, tmax; + + if( my_rank == 0 ) + { + printf( "\ntimer 1/2/3 = total/computation/communication time\n"); + printf( " min avg max\n" ); + } + for( i=1; i<=3; i++ ) + 
{ + timecounter = timer_read(gd, i ); + MPI_Reduce( &timecounter, + &tmin, + 1, + MPI_DOUBLE, + MPI_MIN, + 0, + MPI_COMM_WORLD ); + MPI_Reduce( &timecounter, + &tsum, + 1, + MPI_DOUBLE, + MPI_SUM, + 0, + MPI_COMM_WORLD ); + MPI_Reduce( &timecounter, + &tmax, + 1, + MPI_DOUBLE, + MPI_MAX, + 0, + MPI_COMM_WORLD ); + if( my_rank == 0 ) + printf( "timer %d: %f %f %f\n", + i, tmin, tsum/((double) comm_size), tmax ); + } + if( my_rank == 0 ) + printf( "\n" ); + } +#endif + +/* This tests that keys are in sequence: sorting of last ranked key seq + occurs here, but is an untimed operation */ + full_verify(gd); + + +/* Obtain verification counter sum */ + itemp =gd->passed_verification; + MPI_Reduce( &itemp, + &gd->passed_verification, + 1, + MPI_INT, + MPI_SUM, + 0, + MPI_COMM_WORLD ); + + + +/* The final printout */ + if( gd->my_rank == 0 ) + { + if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size ) + gd->passed_verification = 0; + c_print_results( "IS", + CLASS, + (int)(TOTAL_KEYS), + MIN_PROCS, + 0, + MAX_ITERATIONS, + NUM_PROCS, + gd->comm_size, + maxtime, + ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS) + /maxtime/1000000., + "keys ranked", + gd->passed_verification, + NPBVERSION, + COMPILETIME, + MPICC, + CLINK, + CMPI_LIB, + CMPI_INC, + CFLAGS, + CLINKFLAGS ); + } + + MPI_Finalize(); + free(gd); + + return 0; + /**************************/ +} /* E N D P R O G R A M */ + /**************************/ diff --git a/examples/smpi/NAS/Makefile b/examples/smpi/NAS/Makefile index 8f356aa71d..0386e2d556 100644 --- a/examples/smpi/NAS/Makefile +++ b/examples/smpi/NAS/Makefile @@ -32,6 +32,10 @@ IS: is is: header cd IS; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) +IS-trace: is-trace +is-trace: header + cd IS-trace; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) + CG: cg cg: header cd CG; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) @@ -40,10 +44,18 @@ EP: ep ep: header cd EP; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) +EP-trace: ep-trace +ep-trace: header + cd EP-trace; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) + DT: dt dt: header cd DT; $(MAKE) CLASS=$(CLASS) +DT-trace: dt-trace +dt-trace: header + cd DT-trace; $(MAKE) CLASS=$(CLASS) + # Awk script courtesy cmg@cray.com, modified by Haoqiang Jin suite: @ awk -f sys/suite.awk SMAKE=$(MAKE) $(SFILE) | $(SHELL) -- 2.20.1