From: pini <pini@48e7efb5-ca39-0410-a469-dd3cf9ba447f>
Date: Thu, 23 Sep 2010 10:01:21 +0000 (+0000)
Subject: Added a version of DT with RAM folding.
X-Git-Tag: v3_5~616
X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/827813f7390e44cb5c35a9d9c6aea722e5d71d6a?ds=sidebyside

Added a version of DT with RAM folding.

git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/simgrid/simgrid/trunk@8208 48e7efb5-ca39-0410-a469-dd3cf9ba447f
---

diff --git a/examples/smpi/NAS/DT-folding/DGraph.c b/examples/smpi/NAS/DT-folding/DGraph.c
new file mode 100644
index 0000000000..5d5839df19
--- /dev/null
+++ b/examples/smpi/NAS/DT-folding/DGraph.c
@@ -0,0 +1,184 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "DGraph.h"
+
+DGArc *newArc(DGNode *tl,DGNode *hd){
+  DGArc *ar=(DGArc *)malloc(sizeof(DGArc));
+  ar->tail=tl;
+  ar->head=hd;
+  return ar;
+}
+void arcShow(DGArc *ar){
+  DGNode *tl=(DGNode *)ar->tail,
+         *hd=(DGNode *)ar->head;
+  fprintf(stderr,"%d. |%s ->%s\n",ar->id,tl->name,hd->name);
+}
+
+DGNode *newNode(char *nm){
+  DGNode *nd=(DGNode *)malloc(sizeof(DGNode));
+  nd->attribute=0;
+  nd->color=0;
+  nd->inDegree=0;
+  nd->outDegree=0;
+  nd->maxInDegree=SMALL_BLOCK_SIZE;
+  nd->maxOutDegree=SMALL_BLOCK_SIZE;
+  nd->inArc=(DGArc **)malloc(nd->maxInDegree*sizeof(DGArc*));
+  nd->outArc=(DGArc **)malloc(nd->maxOutDegree*sizeof(DGArc*));
+  nd->name=strdup(nm);
+  nd->feat=NULL;
+  return nd;
+}
+void nodeShow(DGNode* nd){
+  fprintf( stderr,"%3d.%s: (%d,%d)\n",
+	           nd->id,nd->name,nd->inDegree,nd->outDegree);
+/*
+  if(nd->verified==1) fprintf(stderr,"%ld.%s\t: usable.",nd->id,nd->name);
+  else if(nd->verified==0)  fprintf(stderr,"%ld.%s\t: unusable.",nd->id,nd->name);
+  else  fprintf(stderr,"%ld.%s\t: notverified.",nd->id,nd->name);   
+*/
+}
+
+DGraph* newDGraph(char* nm){
+  DGraph *dg=(DGraph *)malloc(sizeof(DGraph));
+  dg->numNodes=0;
+  dg->numArcs=0;
+  dg->maxNodes=BLOCK_SIZE;
+  dg->maxArcs=BLOCK_SIZE;
+  dg->node=(DGNode **)malloc(dg->maxNodes*sizeof(DGNode*));
+  dg->arc=(DGArc **)malloc(dg->maxArcs*sizeof(DGArc*));
+  dg->name=strdup(nm);
+  return dg;
+}
+int AttachNode(DGraph* dg, DGNode* nd) {
+  int i=0,j,len=0;
+  DGNode **nds =NULL, *tmpnd=NULL;
+  DGArc **ar=NULL;
+
+	if (dg->numNodes == dg->maxNodes-1 ) {
+	  dg->maxNodes += BLOCK_SIZE;
+          nds =(DGNode **) calloc(dg->maxNodes,sizeof(DGNode*));
+	  memcpy(nds,dg->node,(dg->maxNodes-BLOCK_SIZE)*sizeof(DGNode*));
+	  free(dg->node);
+	  dg->node=nds;
+	}
+
+        len = strlen( nd->name);
+	for (i = 0; i < dg->numNodes; i++) {
+	  tmpnd =dg->node[ i];
+	  ar=NULL;
+	  if ( strlen( tmpnd->name) != len ) continue;
+	  if ( strncmp( nd->name, tmpnd->name, len) ) continue;
+	  if ( nd->inDegree > 0 ) {
+	    tmpnd->maxInDegree += nd->maxInDegree;
+            ar =(DGArc **) calloc(tmpnd->maxInDegree,sizeof(DGArc*));
+	    memcpy(ar,tmpnd->inArc,(tmpnd->inDegree)*sizeof(DGArc*));
+	    free(tmpnd->inArc);
+	    tmpnd->inArc=ar;
+	    for (j = 0; j < nd->inDegree; j++ ) {
+	      nd->inArc[ j]->head = tmpnd;
+	    }
+	    memcpy( &(tmpnd->inArc[ tmpnd->inDegree]), nd->inArc, nd->inDegree*sizeof( DGArc *));
+	    tmpnd->inDegree += nd->inDegree;
+	  } 	
+	  if ( nd->outDegree > 0 ) {
+	    tmpnd->maxOutDegree += nd->maxOutDegree;
+            ar =(DGArc **) calloc(tmpnd->maxOutDegree,sizeof(DGArc*));
+	    memcpy(ar,tmpnd->outArc,(tmpnd->outDegree)*sizeof(DGArc*));
+	    free(tmpnd->outArc);
+	    tmpnd->outArc=ar;
+	    for (j = 0; j < nd->outDegree; j++ ) {
+	      nd->outArc[ j]->tail = tmpnd;
+	    }			
+	    memcpy( &(tmpnd->outArc[tmpnd->outDegree]),nd->outArc,nd->outDegree*sizeof( DGArc *));
+	    tmpnd->outDegree += nd->outDegree;
+	  } 
+	  free(nd); 
+	  return i;
+	}
+	nd->id = dg->numNodes;
+	dg->node[dg->numNodes] = nd;
+	dg->numNodes++;
+return nd->id;
+}
+int AttachArc(DGraph *dg,DGArc* nar){
+int	arcId = -1;
+int i=0,newNumber=0;
+DGNode	*head = nar->head,
+	*tail = nar->tail; 
+DGArc **ars=NULL,*probe=NULL;
+/*fprintf(stderr,"AttachArc %ld\n",dg->numArcs); */
+	if ( !tail || !head ) return arcId;
+	if ( dg->numArcs == dg->maxArcs-1 ) {
+	  dg->maxArcs += BLOCK_SIZE;
+          ars =(DGArc **) calloc(dg->maxArcs,sizeof(DGArc*));
+	  memcpy(ars,dg->arc,(dg->maxArcs-BLOCK_SIZE)*sizeof(DGArc*));
+	  free(dg->arc);
+	  dg->arc=ars;
+	}
+	for(i = 0; i < tail->outDegree; i++ ) { /* parallel arc */
+	  probe = tail->outArc[ i];
+	  if(probe->head == head
+	     &&
+	     probe->length == nar->length
+            ){
+            free(nar);
+	    return probe->id;   
+	  }
+	}
+	
+	nar->id = dg->numArcs;
+	arcId=dg->numArcs;
+	dg->arc[dg->numArcs] = nar;
+	dg->numArcs++;
+	
+	head->inArc[ head->inDegree] = nar;
+	head->inDegree++;
+	if ( head->inDegree >= head->maxInDegree ) {
+	  newNumber = head->maxInDegree + SMALL_BLOCK_SIZE;
+          ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
+	  memcpy(ars,head->inArc,(head->inDegree)*sizeof(DGArc*));
+	  free(head->inArc);
+	  head->inArc=ars;
+	  head->maxInDegree = newNumber;
+	}
+	tail->outArc[ tail->outDegree] = nar;
+	tail->outDegree++;
+	if(tail->outDegree >= tail->maxOutDegree ) {
+	  newNumber = tail->maxOutDegree + SMALL_BLOCK_SIZE;
+          ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
+	  memcpy(ars,tail->outArc,(tail->outDegree)*sizeof(DGArc*));
+	  free(tail->outArc);
+	  tail->outArc=ars;
+	  tail->maxOutDegree = newNumber;
+	}
+/*fprintf(stderr,"AttachArc: head->in=%d tail->out=%ld\n",head->inDegree,tail->outDegree);*/
+return arcId;
+}
+void graphShow(DGraph *dg,int DetailsLevel){
+  int i=0,j=0;
+  fprintf(stderr,"%d.%s: (%d,%d)\n",dg->id,dg->name,dg->numNodes,dg->numArcs);
+  if ( DetailsLevel < 1) return;
+  for (i = 0; i < dg->numNodes; i++ ) {
+    DGNode *focusNode = dg->node[ i];
+    if(DetailsLevel >= 2) {
+      for (j = 0; j < focusNode->inDegree; j++ ) {
+	fprintf(stderr,"\t ");
+	nodeShow(focusNode->inArc[ j]->tail);
+      }
+    }
+    nodeShow(focusNode);
+    if ( DetailsLevel < 2) continue;
+    for (j = 0; j < focusNode->outDegree; j++ ) {
+      fprintf(stderr, "\t ");
+      nodeShow(focusNode->outArc[ j]->head);
+    }	
+    fprintf(stderr, "---\n");
+  }
+  fprintf(stderr,"----------------------------------------\n");
+  if ( DetailsLevel < 3) return;
+}
+
+
+
diff --git a/examples/smpi/NAS/DT-folding/DGraph.h b/examples/smpi/NAS/DT-folding/DGraph.h
new file mode 100644
index 0000000000..f38f898b24
--- /dev/null
+++ b/examples/smpi/NAS/DT-folding/DGraph.h
@@ -0,0 +1,43 @@
+#ifndef _DGRAPH
+#define _DGRAPH
+
+#define BLOCK_SIZE  128
+#define SMALL_BLOCK_SIZE 32
+
+typedef struct{
+  int id;
+  void *tail,*head;
+  int length,width,attribute,maxWidth;
+}DGArc;
+
+typedef struct{
+  int maxInDegree,maxOutDegree;
+  int inDegree,outDegree;
+  int id;
+  char *name;
+  DGArc **inArc,**outArc;
+  int depth,height,width;
+  int color,attribute,address,verified;
+  void *feat;
+}DGNode;
+
+typedef struct{
+  int maxNodes,maxArcs;
+  int id;
+  char *name;
+  int numNodes,numArcs;
+  DGNode **node;
+  DGArc **arc;
+} DGraph;
+
+DGArc *newArc(DGNode *tl,DGNode *hd);
+void arcShow(DGArc *ar);
+DGNode *newNode(char *nm);
+void nodeShow(DGNode* nd);
+
+DGraph* newDGraph(char *nm);
+int AttachNode(DGraph *dg,DGNode *nd);
+int AttachArc(DGraph *dg,DGArc* nar);
+void graphShow(DGraph *dg,int DetailsLevel);
+
+#endif
diff --git a/examples/smpi/NAS/DT-folding/Makefile b/examples/smpi/NAS/DT-folding/Makefile
new file mode 100644
index 0000000000..d1b3702916
--- /dev/null
+++ b/examples/smpi/NAS/DT-folding/Makefile
@@ -0,0 +1,26 @@
+SHELL=/bin/sh
+BENCHMARK=dt
+BENCHMARKU=DT
+
+include ../config/make.def
+
+include ../sys/make.common
+#Override PROGRAM
+DTPROGRAM  = $(BINDIR)/$(BENCHMARK)-folding.$(CLASS)
+
+OBJS = dt.o DGraph.o \
+	${COMMON}/c_print_results.o ${COMMON}/c_timers.o ${COMMON}/c_randdp.o
+
+
+${PROGRAM}: config ${OBJS}
+	${CLINK} ${CLINKFLAGS} -o ${DTPROGRAM} ${OBJS} ${CMPI_LIB}
+
+.c.o:
+	${CCOMPILE} $<
+
+dt.o:             dt.c  npbparams.h
+DGraph.o:	DGraph.c DGraph.h
+
+clean:
+	- rm -f *.o *~ mputil*
+	- rm -f dt npbparams.h core
diff --git a/examples/smpi/NAS/DT-folding/README b/examples/smpi/NAS/DT-folding/README
new file mode 100644
index 0000000000..873e3ae6f2
--- /dev/null
+++ b/examples/smpi/NAS/DT-folding/README
@@ -0,0 +1,22 @@
+Data Traffic benchmark DT is new in the NPB suite 
+(released as part of NPB3.x-MPI package).
+----------------------------------------------------
+
+DT is written in C and same executable can run on any number of processors,
+provided this number is not less than the number of nodes in the communication
+graph.  DT benchmark takes one argument: BH, WH, or SH. This argument 
+specifies the communication graph Black Hole, White Hole, or SHuffle 
+respectively. The current release contains verification numbers for 
+CLASSES S, W, A, and B only.  Classes C and D are defined, but verification 
+numbers are not provided in this release.
+
+The following table summarizes the number of nodes in the communication
+graph based on CLASS and graph TYPE.
+
+CLASS  N_Source N_Nodes(BH,WH) N_Nodes(SH)
+ S      4        5              12
+ W      8        11             32
+ A      16       21             80
+ B      32       43             192
+ C      64       85             448
+ D      128      171            1024
diff --git a/examples/smpi/NAS/DT-folding/dt.c b/examples/smpi/NAS/DT-folding/dt.c
new file mode 100644
index 0000000000..d080bde52b
--- /dev/null
+++ b/examples/smpi/NAS/DT-folding/dt.c
@@ -0,0 +1,759 @@
+/*************************************************************************
+ *                                                                       * 
+ *        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3       *
+ *                                                                       * 
+ *                                  D T					 * 
+ *                                                                       * 
+ ************************************************************************* 
+ *                                                                       * 
+ *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
+ *                                                                       * 
+ *   Permission to use, copy, distribute and modify this software        * 
+ *   for any purpose with or without fee is hereby granted.  We          * 
+ *   request, however, that all derived work reference the NAS           * 
+ *   Parallel Benchmarks 3.3. This software is provided "as is"          *
+ *   without express or implied warranty.                                * 
+ *                                                                       * 
+ *   Information on NPB 3.3, including the technical report, the         *
+ *   original specifications, source code, results and information       * 
+ *   on how to submit new results, is available at:                      * 
+ *                                                                       * 
+ *          http:  www.nas.nasa.gov/Software/NPB                         * 
+ *                                                                       * 
+ *   Send comments or suggestions to  npb@nas.nasa.gov                   * 
+ *   Send bug reports to              npb-bugs@nas.nasa.gov              * 
+ *                                                                       * 
+ *         NAS Parallel Benchmarks Group                                 * 
+ *         NASA Ames Research Center                                     * 
+ *         Mail Stop: T27A-1                                             * 
+ *         Moffett Field, CA   94035-1000                                * 
+ *                                                                       * 
+ *         E-mail:  npb@nas.nasa.gov                                     * 
+ *         Fax:     (650) 604-3957                                       * 
+ *                                                                       * 
+ ************************************************************************* 
+ *                                                                       * 
+ *   Author: M. Frumkin							 *						 * 
+ *                                                                       * 
+ *************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "mpi.h"
+#include "npbparams.h"
+
+#ifndef CLASS
+#define CLASS 'S'
+#define NUM_PROCS            1                 
+#endif
+
+//int      passed_verification;
+extern double randlc( double *X, double *A );
+extern
+void c_print_results( char   *name,
+                      char   class,
+                      int    n1, 
+                      int    n2,
+                      int    n3,
+                      int    niter,
+                      int    nprocs_compiled,
+                      int    nprocs_total,
+                      double t,
+                      double mops,
+		      char   *optype,
+                      int    passed_verification,
+                      char   *npbversion,
+                      char   *compiletime,
+                      char   *mpicc,
+                      char   *clink,
+                      char   *cmpi_lib,
+                      char   *cmpi_inc,
+                      char   *cflags,
+                      char   *clinkflags );
+		      
+void    timer_clear( int n );
+void    timer_start( int n );
+void    timer_stop( int n );
+double  timer_read( int n );
+int timer_on=0,timers_tot=64;
+
+int verify(char *bmname,double rnm2){
+    double verify_value=0.0;
+    double epsilon=1.0E-8;
+    char cls=CLASS;
+    int verified=-1;
+    if (cls != 'U') {
+       if(cls=='S') {
+         if(strstr(bmname,"BH")){
+           verify_value=30892725.0;
+         }else if(strstr(bmname,"WH")){
+           verify_value=67349758.0;
+         }else if(strstr(bmname,"SH")){
+           verify_value=58875767.0;
+         }else{
+           fprintf(stderr,"No such benchmark as %s.\n",bmname);
+         }
+         verified = 0;
+       }else if(cls=='W') {
+         if(strstr(bmname,"BH")){
+  	   verify_value = 4102461.0;
+         }else if(strstr(bmname,"WH")){
+  	   verify_value = 204280762.0;
+         }else if(strstr(bmname,"SH")){
+  	   verify_value = 186944764.0;
+         }else{
+           fprintf(stderr,"No such benchmark as %s.\n",bmname);
+         }
+         verified = 0;
+       }else if(cls=='A') {
+         if(strstr(bmname,"BH")){
+  	   verify_value = 17809491.0;
+         }else if(strstr(bmname,"WH")){
+  	   verify_value = 1289925229.0;
+         }else if(strstr(bmname,"SH")){
+  	   verify_value = 610856482.0;
+         }else{
+           fprintf(stderr,"No such benchmark as %s.\n",bmname);
+         }
+  	 verified = 0;
+       }else if(cls=='B') {
+         if(strstr(bmname,"BH")){
+  	   verify_value = 4317114.0;
+         }else if(strstr(bmname,"WH")){
+  	   verify_value = 7877279917.0;
+         }else if(strstr(bmname,"SH")){
+  	   verify_value = 1836863082.0;
+         }else{
+           fprintf(stderr,"No such benchmark as %s.\n",bmname);
+  	   verified = 0;
+         }
+       }else if(cls=='C') {
+         if(strstr(bmname,"BH")){
+  	   verify_value = 0.0;
+         }else if(strstr(bmname,"WH")){
+  	   verify_value = 0.0;
+         }else if(strstr(bmname,"SH")){
+  	   verify_value = 0.0;
+         }else{
+           fprintf(stderr,"No such benchmark as %s.\n",bmname);
+  	   verified = -1;
+         }
+       }else if(cls=='D') {
+         if(strstr(bmname,"BH")){
+  	   verify_value = 0.0;
+         }else if(strstr(bmname,"WH")){
+  	   verify_value = 0.0;
+         }else if(strstr(bmname,"SH")){
+  	   verify_value = 0.0;
+         }else{
+           fprintf(stderr,"No such benchmark as %s.\n",bmname);
+         }
+         verified = -1;
+       }else{
+         fprintf(stderr,"No such class as %c.\n",cls);
+       }
+       fprintf(stderr," %s L2 Norm = %f\n",bmname,rnm2);
+       if(verified==-1){
+  	 fprintf(stderr," No verification was performed.\n");
+       }else if( rnm2 - verify_value < epsilon &&
+                 rnm2 - verify_value > -epsilon) {  /* abs here does not work on ALTIX */
+  	  verified = 1;
+  	  fprintf(stderr," Deviation = %f\n",(rnm2 - verify_value));
+       }else{
+  	 verified = 0;
+  	 fprintf(stderr," The correct verification value = %f\n",verify_value);
+  	 fprintf(stderr," Got value = %f\n",rnm2);
+       }
+    }else{
+       verified = -1;
+    }
+    return  verified;  
+  }
+
+int ipowMod(int a,long long int n,int md){ 
+  int seed=1,q=a,r=1;
+  if(n<0){
+    fprintf(stderr,"ipowMod: exponent must be nonnegative exp=%lld\n",n);
+    n=-n; /* temp fix */
+/*    return 1; */
+  }
+  if(md<=0){
+    fprintf(stderr,"ipowMod: module must be positive mod=%d",md);
+    return 1;
+  }
+  if(n==0) return 1;
+  while(n>1){
+    int n2 = n/2;
+    if (n2*2==n){
+       seed = (q*q)%md;
+       q=seed;
+       n = n2;
+    }else{
+       seed = (r*q)%md;
+       r=seed;
+       n = n-1;
+    }
+  }
+  seed = (r*q)%md;
+  return seed;
+}
+
+#include "DGraph.h"
+DGraph *buildSH(char cls){
+/*
+  Nodes of the graph must be topologically sorted
+  to avoid MPI deadlock.
+*/
+  DGraph *dg;
+  int numSources=NUM_SOURCES; /* must be power of 2 */
+  int numOfLayers=0,tmpS=numSources>>1;
+  int firstLayerNode=0;
+  DGArc *ar=NULL;
+  DGNode *nd=NULL;
+  int mask=0x0,ndid=0,ndoff=0;
+  int i=0,j=0;
+  char nm[BLOCK_SIZE];
+  
+  sprintf(nm,"DT_SH.%c",cls);
+  dg=newDGraph(nm);
+
+  while(tmpS>1){
+    numOfLayers++;
+    tmpS>>=1;
+  }
+  for(i=0;i<numSources;i++){
+    sprintf(nm,"Source.%d",i);
+    nd=newNode(nm);
+    AttachNode(dg,nd);
+  }
+  for(j=0;j<numOfLayers;j++){
+    mask=0x00000001<<j;
+    for(i=0;i<numSources;i++){
+      sprintf(nm,"Comparator.%d",(i+j*firstLayerNode));
+      nd=newNode(nm);
+      AttachNode(dg,nd);
+      ndoff=i&(~mask);
+      ndid=firstLayerNode+ndoff;
+      ar=newArc(dg->node[ndid],nd);     
+      AttachArc(dg,ar);
+      ndoff+=mask;
+      ndid=firstLayerNode+ndoff;
+      ar=newArc(dg->node[ndid],nd);     
+      AttachArc(dg,ar);
+    }
+    firstLayerNode+=numSources;
+  }
+  mask=0x00000001<<numOfLayers;
+  for(i=0;i<numSources;i++){
+    sprintf(nm,"Sink.%d",i);
+    nd=newNode(nm);
+    AttachNode(dg,nd);
+    ndoff=i&(~mask);
+    ndid=firstLayerNode+ndoff;
+    ar=newArc(dg->node[ndid],nd);     
+    AttachArc(dg,ar);
+    ndoff+=mask;
+    ndid=firstLayerNode+ndoff;
+    ar=newArc(dg->node[ndid],nd);     
+    AttachArc(dg,ar);
+  }
+return dg;
+}
+DGraph *buildWH(char cls){
+/*
+  Nodes of the graph must be topologically sorted
+  to avoid MPI deadlock.
+*/
+  int i=0,j=0;
+  int numSources=NUM_SOURCES,maxInDeg=4;
+  int numLayerNodes=numSources,firstLayerNode=0;
+  int totComparators=0;
+  int numPrevLayerNodes=numLayerNodes;
+  int id=0,sid=0;
+  DGraph *dg;
+  DGNode *nd=NULL,*source=NULL,*tmp=NULL,*snd=NULL;
+  DGArc *ar=NULL;
+  char nm[BLOCK_SIZE];
+
+  sprintf(nm,"DT_WH.%c",cls);
+  dg=newDGraph(nm);
+
+  for(i=0;i<numSources;i++){
+    sprintf(nm,"Sink.%d",i);
+    nd=newNode(nm);
+    AttachNode(dg,nd);
+  }
+  totComparators=0;
+  numPrevLayerNodes=numLayerNodes;
+  while(numLayerNodes>maxInDeg){
+    numLayerNodes=numLayerNodes/maxInDeg;
+    if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
+    for(i=0;i<numLayerNodes;i++){
+      sprintf(nm,"Comparator.%d",totComparators);
+      totComparators++;
+      nd=newNode(nm);
+      id=AttachNode(dg,nd);
+      for(j=0;j<maxInDeg;j++){
+        sid=i*maxInDeg+j;
+	if(sid>=numPrevLayerNodes) break;
+        snd=dg->node[firstLayerNode+sid];
+        ar=newArc(dg->node[id],snd);
+        AttachArc(dg,ar);
+      }
+    }
+    firstLayerNode+=numPrevLayerNodes;
+    numPrevLayerNodes=numLayerNodes;
+  }
+  source=newNode("Source");
+  AttachNode(dg,source);   
+  for(i=0;i<numPrevLayerNodes;i++){
+    nd=dg->node[firstLayerNode+i];
+    ar=newArc(source,nd);
+    AttachArc(dg,ar);
+  }
+
+  for(i=0;i<dg->numNodes/2;i++){  /* Topological sorting */
+    tmp=dg->node[i];
+    dg->node[i]=dg->node[dg->numNodes-1-i];
+    dg->node[i]->id=i;
+    dg->node[dg->numNodes-1-i]=tmp;
+    dg->node[dg->numNodes-1-i]->id=dg->numNodes-1-i;
+  }
+return dg;
+}
+DGraph *buildBH(char cls){
+/*
+  Nodes of the graph must be topologically sorted
+  to avoid MPI deadlock.
+*/
+  int i=0,j=0;
+  int numSources=NUM_SOURCES,maxInDeg=4;
+  int numLayerNodes=numSources,firstLayerNode=0;
+  DGraph *dg;
+  DGNode *nd=NULL, *snd=NULL, *sink=NULL;
+  DGArc *ar=NULL;
+  int totComparators=0;
+  int numPrevLayerNodes=numLayerNodes;
+  int id=0, sid=0;
+  char nm[BLOCK_SIZE];
+
+  sprintf(nm,"DT_BH.%c",cls);
+  dg=newDGraph(nm);
+
+  for(i=0;i<numSources;i++){
+    sprintf(nm,"Source.%d",i);
+    nd=newNode(nm);
+    AttachNode(dg,nd);
+  }
+  while(numLayerNodes>maxInDeg){
+    numLayerNodes=numLayerNodes/maxInDeg;
+    if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
+    for(i=0;i<numLayerNodes;i++){
+      sprintf(nm,"Comparator.%d",totComparators);
+      totComparators++;
+      nd=newNode(nm);
+      id=AttachNode(dg,nd);
+      for(j=0;j<maxInDeg;j++){
+        sid=i*maxInDeg+j;
+	if(sid>=numPrevLayerNodes) break;
+        snd=dg->node[firstLayerNode+sid];
+        ar=newArc(snd,dg->node[id]);
+        AttachArc(dg,ar);
+      }
+    }
+    firstLayerNode+=numPrevLayerNodes;
+    numPrevLayerNodes=numLayerNodes;
+  }
+  sink=newNode("Sink");
+  AttachNode(dg,sink);   
+  for(i=0;i<numPrevLayerNodes;i++){
+    nd=dg->node[firstLayerNode+i];
+    ar=newArc(nd,sink);
+    AttachArc(dg,ar);
+  }
+return dg;
+}
+
+typedef struct{
+  int len;
+  double* val;
+} Arr;
+Arr *newArr(int len){
+  Arr *arr=(Arr *)SMPI_SHARED_MALLOC(sizeof(Arr));
+  arr->len=len;
+  arr->val=(double *)SMPI_SHARED_MALLOC(len*sizeof(double));
+  return arr;
+}
+void arrShow(Arr* a){
+  if(!a) fprintf(stderr,"-- NULL array\n");
+  else{
+    fprintf(stderr,"-- length=%d\n",a->len);
+  }
+}
+double CheckVal(Arr *feat){
+  double csum=0.0;
+  int i=0;
+  for(i=0;i<feat->len;i++){
+    csum+=feat->val[i]*feat->val[i]/feat->len; /* The truncation does not work since 
+                                                  result will be 0 for large len  */
+  }
+   return csum;
+}
+int GetFNumDPar(int* mean, int* stdev){
+  *mean=NUM_SAMPLES;
+  *stdev=STD_DEVIATION;
+  return 0;
+}
+int GetFeatureNum(char *mbname,int id){
+  double tran=314159265.0;
+  double A=2*id+1;
+  double denom=randlc(&tran,&A);
+  char cval='S';
+  int mean=NUM_SAMPLES,stdev=128;
+  int rtfs=0,len=0;
+  GetFNumDPar(&mean,&stdev);
+  rtfs=ipowMod((int)(1/denom)*(int)cval,(long long int) (2*id+1),2*stdev);
+  if(rtfs<0) rtfs=-rtfs;
+  len=mean-stdev+rtfs;
+  return len;
+}
+Arr* RandomFeatures(char *bmname,int fdim,int id){
+  int len=GetFeatureNum(bmname,id)*fdim;
+  Arr* feat=newArr(len);
+  int nxg=2,nyg=2,nzg=2,nfg=5;
+  int nx=421,ny=419,nz=1427,nf=3527;
+  long long int expon=(len*(id+1))%3141592;
+  int seedx=ipowMod(nxg,expon,nx),
+      seedy=ipowMod(nyg,expon,ny),
+      seedz=ipowMod(nzg,expon,nz),
+      seedf=ipowMod(nfg,expon,nf);
+  int i=0;
+  if(timer_on){
+    timer_clear(id+1);
+    timer_start(id+1);
+  }
+  for(i=0;i<len;i+=fdim){
+    seedx=(seedx*nxg)%nx;
+    seedy=(seedy*nyg)%ny;
+    seedz=(seedz*nzg)%nz;
+    seedf=(seedf*nfg)%nf;
+    feat->val[i]=seedx;
+    feat->val[i+1]=seedy;
+    feat->val[i+2]=seedz;
+    feat->val[i+3]=seedf;
+  }
+  if(timer_on){
+    timer_stop(id+1);
+    fprintf(stderr,"** RandomFeatures time in node %d = %f\n",id,timer_read(id+1));
+  }
+  return feat;   
+}
+void Resample(Arr *a,int blen){
+    long long int i=0,j=0,jlo=0,jhi=0;
+    double avval=0.0;
+    double *nval=(double *)SMPI_SHARED_MALLOC(blen*sizeof(double));
+    Arr *tmp=newArr(10);
+    for(i=0;i<blen;i++) nval[i]=0.0;
+    for(i=1;i<a->len-1;i++){
+      jlo=(int)(0.5*(2*i-1)*(blen/a->len)); 
+      jhi=(int)(0.5*(2*i+1)*(blen/a->len));
+
+      avval=a->val[i]/(jhi-jlo+1);    
+      for(j=jlo;j<=jhi;j++){
+        nval[j]+=avval;
+      }
+    }
+    nval[0]=a->val[0];
+    nval[blen-1]=a->val[a->len-1];
+    SMPI_SHARED_FREE(a->val);
+    a->val=nval;
+    a->len=blen;
+}
+#define fielddim 4
+Arr* WindowFilter(Arr *a, Arr* b,int w){
+  int i=0,j=0,k=0;
+  double rms0=0.0,rms1=0.0,rmsm1=0.0;
+  double weight=((double) (w+1))/(w+2);
+ 
+  w+=1;
+  if(timer_on){
+    timer_clear(w);
+    timer_start(w);
+  }
+  if(a->len<b->len) Resample(a,b->len);
+  if(a->len>b->len) Resample(b,a->len);
+  for(i=fielddim;i<a->len-fielddim;i+=fielddim){
+    rms0=(a->val[i]-b->val[i])*(a->val[i]-b->val[i])
+	+(a->val[i+1]-b->val[i+1])*(a->val[i+1]-b->val[i+1])
+	+(a->val[i+2]-b->val[i+2])*(a->val[i+2]-b->val[i+2])
+	+(a->val[i+3]-b->val[i+3])*(a->val[i+3]-b->val[i+3]);
+    j=i+fielddim;
+    rms1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
+    	+(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
+    	+(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
+    	+(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
+    j=i-fielddim;
+    rmsm1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
+	 +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
+	 +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
+	 +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
+    k=0;
+    if(rms1<rms0){
+      k=1;
+      rms0=rms1;
+    }
+    if(rmsm1<rms0) k=-1;
+    if(k==0){
+      j=i+fielddim;
+      a->val[i]=weight*b->val[i];
+      a->val[i+1]=weight*b->val[i+1];
+      a->val[i+2]=weight*b->val[i+2];
+      a->val[i+3]=weight*b->val[i+3];  
+    }else if(k==1){
+      j=i+fielddim;
+      a->val[i]=weight*b->val[j];
+      a->val[i+1]=weight*b->val[j+1];
+      a->val[i+2]=weight*b->val[j+2];
+      a->val[i+3]=weight*b->val[j+3];  
+    }else { /*if(k==-1)*/
+      j=i-fielddim;
+      a->val[i]=weight*b->val[j];
+      a->val[i+1]=weight*b->val[j+1];
+      a->val[i+2]=weight*b->val[j+2];
+      a->val[i+3]=weight*b->val[j+3];  
+    }	   
+  }
+  if(timer_on){
+    timer_stop(w);
+    fprintf(stderr,"** WindowFilter time in node %d = %f\n",(w-1),timer_read(w));
+  }
+  return a;
+}
+
+int SendResults(DGraph *dg,DGNode *nd,Arr *feat){
+  int i=0,tag=0;
+  DGArc *ar=NULL;
+  DGNode *head=NULL;
+  if(!feat) return 0;
+  for(i=0;i<nd->outDegree;i++){
+    ar=nd->outArc[i];
+    if(ar->tail!=nd) continue;
+    head=ar->head;
+    tag=ar->id;
+    if(head->address!=nd->address){
+      MPI_Send(&feat->len,1,MPI_INT,head->address,tag,MPI_COMM_WORLD);
+      MPI_Send(feat->val,feat->len,MPI_DOUBLE,head->address,tag,MPI_COMM_WORLD);
+    }
+  }
+  return 1;
+}
+Arr* CombineStreams(DGraph *dg,DGNode *nd){
+  Arr *resfeat=newArr(NUM_SAMPLES*fielddim);
+  int i=0,len=0,tag=0;
+  DGArc *ar=NULL;
+  DGNode *tail=NULL;
+  MPI_Status status;
+  Arr *feat=NULL,*featp=NULL;
+
+  if(nd->inDegree==0) return NULL;
+  for(i=0;i<nd->inDegree;i++){
+    ar=nd->inArc[i];
+    if(ar->head!=nd) continue;
+    tail=ar->tail;
+    if(tail->address!=nd->address){
+      len=0;
+      tag=ar->id;
+      MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
+      feat=newArr(len);
+      MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
+      resfeat=WindowFilter(resfeat,feat,nd->id);
+      SMPI_SHARED_FREE(feat);
+    }else{
+      featp=(Arr *)tail->feat;
+      feat=newArr(featp->len);
+      memcpy(feat->val,featp->val,featp->len*sizeof(double));
+      resfeat=WindowFilter(resfeat,feat,nd->id);  
+      SMPI_SHARED_FREE(feat);
+    }
+  }
+  for(i=0;i<resfeat->len;i++) resfeat->val[i]=((int)resfeat->val[i])/nd->inDegree;
+  nd->feat=resfeat;
+  return nd->feat;
+}
+double Reduce(Arr *a,int w){
+  double retv=0.0;
+  if(timer_on){
+    timer_clear(w);
+    timer_start(w);
+  }
+  retv=(int)(w*CheckVal(a));/* The casting needed for node  
+                               and array dependent verifcation */
+  if(timer_on){
+    timer_stop(w);
+    fprintf(stderr,"** Reduce time in node %d = %f\n",(w-1),timer_read(w));
+  }
+  return retv;
+}
+
+double ReduceStreams(DGraph *dg,DGNode *nd){
+  double csum=0.0;
+  int i=0,len=0,tag=0;
+  DGArc *ar=NULL;
+  DGNode *tail=NULL;
+  Arr *feat=NULL;
+  double retv=0.0;
+
+  for(i=0;i<nd->inDegree;i++){
+    ar=nd->inArc[i];
+    if(ar->head!=nd) continue;
+    tail=ar->tail;
+    if(tail->address!=nd->address){
+      MPI_Status status;
+      len=0;
+      tag=ar->id;
+      MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
+      feat=newArr(len);
+      MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
+      csum+=Reduce(feat,(nd->id+1));  
+      SMPI_SHARED_FREE(feat);
+    }else{
+      csum+=Reduce(tail->feat,(nd->id+1));  
+    }
+  }
+  if(nd->inDegree>0)csum=(((long long int)csum)/nd->inDegree);
+  retv=(nd->id+1)*csum;
+  return retv;
+}
+
+int ProcessNodes(DGraph *dg,int me){
+  double chksum=0.0;
+  Arr *feat=NULL;
+  int i=0,verified=0,tag;
+  DGNode *nd=NULL;
+  double rchksum=0.0;
+  MPI_Status status;
+
+  for(i=0;i<dg->numNodes;i++){
+    nd=dg->node[i];
+    if(nd->address!=me) continue;
+    if(strstr(nd->name,"Source")){
+      nd->feat=RandomFeatures(dg->name,fielddim,nd->id); 
+      SendResults(dg,nd,nd->feat);
+    }else if(strstr(nd->name,"Sink")){
+      chksum=ReduceStreams(dg,nd);
+      tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */
+      MPI_Send(&chksum,1,MPI_DOUBLE,0,tag,MPI_COMM_WORLD);
+    }else{
+      feat=CombineStreams(dg,nd);
+      SendResults(dg,nd,feat);
+    }
+  }
+  if(me==0){ /* Report node */
+    rchksum=0.0;
+    chksum=0.0;
+    for(i=0;i<dg->numNodes;i++){
+      nd=dg->node[i];
+      if(!strstr(nd->name,"Sink")) continue;
+       tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */
+      MPI_Recv(&rchksum,1,MPI_DOUBLE,nd->address,tag,MPI_COMM_WORLD,&status);
+      chksum+=rchksum;
+    }
+    verified=verify(dg->name,chksum);
+  }
+return verified;
+}
+
+int main(int argc,char **argv ){
+  int my_rank,comm_size;
+  int i;
+  DGraph *dg=NULL;
+  int verified=0, featnum=0;
+  double bytes_sent=2.0,tot_time=0.0;
+
+    MPI_Init( &argc, &argv );
+    MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
+    MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
+
+     if(argc!=2||
+                (  strncmp(argv[1],"BH",2)!=0
+                 &&strncmp(argv[1],"WH",2)!=0
+                 &&strncmp(argv[1],"SH",2)!=0
+                )
+      ){
+      if(my_rank==0){
+        fprintf(stderr,"** Usage: mpirun -np N ../bin/dt.S GraphName\n");
+        fprintf(stderr,"** Where \n   - N is integer number of MPI processes\n");
+        fprintf(stderr,"   - S is the class S, W, or A \n");
+        fprintf(stderr,"   - GraphName is the communication graph name BH, WH, or SH.\n");
+        fprintf(stderr,"   - the number of MPI processes N should not be be less than \n");
+        fprintf(stderr,"     the number of nodes in the graph\n");
+      }
+      MPI_Finalize();
+      exit(0);
+    } 
+   if(strncmp(argv[1],"BH",2)==0){
+      dg=buildBH(CLASS);
+    }else if(strncmp(argv[1],"WH",2)==0){
+      dg=buildWH(CLASS);
+    }else if(strncmp(argv[1],"SH",2)==0){
+      dg=buildSH(CLASS);
+    }
+
+    if(timer_on&&dg->numNodes+1>timers_tot){
+      timer_on=0;
+      if(my_rank==0)
+        fprintf(stderr,"Not enough timers. Node timeing is off. \n");
+    }
+    if(dg->numNodes>comm_size){
+      if(my_rank==0){
+        fprintf(stderr,"**  The number of MPI processes should not be less than \n");
+        fprintf(stderr,"**  the number of nodes in the graph\n");
+        fprintf(stderr,"**  Number of MPI processes = %d\n",comm_size);
+        fprintf(stderr,"**  Number nodes in the graph = %d\n",dg->numNodes);
+      }
+      MPI_Finalize();
+      exit(0);
+    }
+    for(i=0;i<dg->numNodes;i++){ 
+      dg->node[i]->address=i;
+    }
+    if( my_rank == 0 ){
+      printf( "\n\n NAS Parallel Benchmarks 3.3 -- DT Benchmark\n\n" );
+      graphShow(dg,0);
+      timer_clear(0);
+      timer_start(0);
+    }
+    verified=ProcessNodes(dg,my_rank);
+    
+    featnum=NUM_SAMPLES*fielddim;
+    bytes_sent=featnum*dg->numArcs;
+    bytes_sent/=1048576;
+    if(my_rank==0){
+      timer_stop(0);
+      tot_time=timer_read(0);
+      c_print_results( dg->name,
+        	       CLASS,
+        	       featnum,
+        	       0,
+        	       0,
+        	       dg->numNodes,
+        	       0,
+        	       comm_size,
+        	       tot_time,
+        	       bytes_sent/tot_time,
+        	       "bytes transmitted", 
+        	       verified,
+        	       NPBVERSION,
+        	       COMPILETIME,
+        	       MPICC,
+        	       CLINK,
+        	       CMPI_LIB,
+        	       CMPI_INC,
+        	       CFLAGS,
+        	       CLINKFLAGS );
+    }          
+    MPI_Finalize();
+  return 1;
+}
diff --git a/examples/smpi/NAS/Makefile b/examples/smpi/NAS/Makefile
index 0386e2d556..794191c451 100644
--- a/examples/smpi/NAS/Makefile
+++ b/examples/smpi/NAS/Makefile
@@ -56,6 +56,10 @@ DT-trace: dt-trace
 dt-trace: header
 	cd DT-trace; $(MAKE) CLASS=$(CLASS)
 
+DT-folding: dt-folding
+dt-folding: header
+	cd DT-folding; $(MAKE) CLASS=$(CLASS)
+
 # Awk script courtesy cmg@cray.com, modified by Haoqiang Jin
 suite:
 	@ awk -f sys/suite.awk SMAKE=$(MAKE) $(SFILE) | $(SHELL)