From e6e323a7749044351eada87127f0658c9a8c937d Mon Sep 17 00:00:00 2001
From: schnorr <schnorr@48e7efb5-ca39-0410-a469-dd3cf9ba447f>
Date: Fri, 17 Sep 2010 09:26:52 +0000
Subject: [PATCH] creating instrumented versions of DT, EP and IS benchmarks

details:
- new directories were created to avoid modifying the original .c files
- the binaries created have the same names as the binaries generated by
  the original benchmarks
- the instrumentation basically consists of:
  - #include "instr/instr.h"
  - using TRACE_smpi_set_category to declare and set a category, so that
    resource utilization can be monitored per category
  - executing with the -trace parameter
- no instrumentation is needed if only a traditional Gantt chart is wanted;
  in that case, the original versions of the benchmarks only have to be
  executed with the -trace parameter (a minimal usage sketch is appended
  after the patch)

git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/simgrid/simgrid/trunk@8196 48e7efb5-ca39-0410-a469-dd3cf9ba447f
---
 examples/smpi/NAS/DT-trace/DGraph.c   |  184 ++++
 examples/smpi/NAS/DT-trace/DGraph.h   |   43 +
 examples/smpi/NAS/DT-trace/Makefile   |   26 +
 examples/smpi/NAS/DT-trace/README     |   22 +
 examples/smpi/NAS/DT-trace/dt-trace.c |  767 ++++++++++++++++
 examples/smpi/NAS/EP-trace/Makefile   |   28 +
 examples/smpi/NAS/EP-trace/README     |    6 +
 examples/smpi/NAS/EP-trace/ep-trace.c |  449 ++++++++++
 examples/smpi/NAS/EP-trace/mpinpb.h   |    9 +
 examples/smpi/NAS/EP-trace/randlc.c   |  107 +++
 examples/smpi/NAS/EP-trace/randlc.h   |    3 +
 examples/smpi/NAS/IS-trace/Makefile   |   22 +
 examples/smpi/NAS/IS-trace/is-trace.c | 1158 +++++++++++++++++++++++++
 examples/smpi/NAS/Makefile            |   12 +
 14 files changed, 2836 insertions(+)
 create mode 100644 examples/smpi/NAS/DT-trace/DGraph.c
 create mode 100644 examples/smpi/NAS/DT-trace/DGraph.h
 create mode 100644 examples/smpi/NAS/DT-trace/Makefile
 create mode 100644 examples/smpi/NAS/DT-trace/README
 create mode 100644 examples/smpi/NAS/DT-trace/dt-trace.c
 create mode 100644 examples/smpi/NAS/EP-trace/Makefile
 create mode 100644 examples/smpi/NAS/EP-trace/README
 create mode 100644 examples/smpi/NAS/EP-trace/ep-trace.c
 create mode 100644 examples/smpi/NAS/EP-trace/mpinpb.h
 create mode 100644 examples/smpi/NAS/EP-trace/randlc.c
 create mode 100644 examples/smpi/NAS/EP-trace/randlc.h
 create mode 100644 examples/smpi/NAS/IS-trace/Makefile
 create mode 100644 examples/smpi/NAS/IS-trace/is-trace.c

diff --git a/examples/smpi/NAS/DT-trace/DGraph.c b/examples/smpi/NAS/DT-trace/DGraph.c
new file mode 100644
index 0000000000..5d5839df19
--- /dev/null
+++ b/examples/smpi/NAS/DT-trace/DGraph.c
@@ -0,0 +1,184 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "DGraph.h"
+
+DGArc *newArc(DGNode *tl,DGNode *hd){
+  DGArc *ar=(DGArc *)malloc(sizeof(DGArc));
+  ar->tail=tl;
+  ar->head=hd;
+  return ar;
+}
+void arcShow(DGArc *ar){
+  DGNode *tl=(DGNode *)ar->tail,
+         *hd=(DGNode *)ar->head;
+  fprintf(stderr,"%d.
|%s ->%s\n",ar->id,tl->name,hd->name); +} + +DGNode *newNode(char *nm){ + DGNode *nd=(DGNode *)malloc(sizeof(DGNode)); + nd->attribute=0; + nd->color=0; + nd->inDegree=0; + nd->outDegree=0; + nd->maxInDegree=SMALL_BLOCK_SIZE; + nd->maxOutDegree=SMALL_BLOCK_SIZE; + nd->inArc=(DGArc **)malloc(nd->maxInDegree*sizeof(DGArc*)); + nd->outArc=(DGArc **)malloc(nd->maxOutDegree*sizeof(DGArc*)); + nd->name=strdup(nm); + nd->feat=NULL; + return nd; +} +void nodeShow(DGNode* nd){ + fprintf( stderr,"%3d.%s: (%d,%d)\n", + nd->id,nd->name,nd->inDegree,nd->outDegree); +/* + if(nd->verified==1) fprintf(stderr,"%ld.%s\t: usable.",nd->id,nd->name); + else if(nd->verified==0) fprintf(stderr,"%ld.%s\t: unusable.",nd->id,nd->name); + else fprintf(stderr,"%ld.%s\t: notverified.",nd->id,nd->name); +*/ +} + +DGraph* newDGraph(char* nm){ + DGraph *dg=(DGraph *)malloc(sizeof(DGraph)); + dg->numNodes=0; + dg->numArcs=0; + dg->maxNodes=BLOCK_SIZE; + dg->maxArcs=BLOCK_SIZE; + dg->node=(DGNode **)malloc(dg->maxNodes*sizeof(DGNode*)); + dg->arc=(DGArc **)malloc(dg->maxArcs*sizeof(DGArc*)); + dg->name=strdup(nm); + return dg; +} +int AttachNode(DGraph* dg, DGNode* nd) { + int i=0,j,len=0; + DGNode **nds =NULL, *tmpnd=NULL; + DGArc **ar=NULL; + + if (dg->numNodes == dg->maxNodes-1 ) { + dg->maxNodes += BLOCK_SIZE; + nds =(DGNode **) calloc(dg->maxNodes,sizeof(DGNode*)); + memcpy(nds,dg->node,(dg->maxNodes-BLOCK_SIZE)*sizeof(DGNode*)); + free(dg->node); + dg->node=nds; + } + + len = strlen( nd->name); + for (i = 0; i < dg->numNodes; i++) { + tmpnd =dg->node[ i]; + ar=NULL; + if ( strlen( tmpnd->name) != len ) continue; + if ( strncmp( nd->name, tmpnd->name, len) ) continue; + if ( nd->inDegree > 0 ) { + tmpnd->maxInDegree += nd->maxInDegree; + ar =(DGArc **) calloc(tmpnd->maxInDegree,sizeof(DGArc*)); + memcpy(ar,tmpnd->inArc,(tmpnd->inDegree)*sizeof(DGArc*)); + free(tmpnd->inArc); + tmpnd->inArc=ar; + for (j = 0; j < nd->inDegree; j++ ) { + nd->inArc[ j]->head = tmpnd; + } + memcpy( &(tmpnd->inArc[ tmpnd->inDegree]), nd->inArc, nd->inDegree*sizeof( DGArc *)); + tmpnd->inDegree += nd->inDegree; + } + if ( nd->outDegree > 0 ) { + tmpnd->maxOutDegree += nd->maxOutDegree; + ar =(DGArc **) calloc(tmpnd->maxOutDegree,sizeof(DGArc*)); + memcpy(ar,tmpnd->outArc,(tmpnd->outDegree)*sizeof(DGArc*)); + free(tmpnd->outArc); + tmpnd->outArc=ar; + for (j = 0; j < nd->outDegree; j++ ) { + nd->outArc[ j]->tail = tmpnd; + } + memcpy( &(tmpnd->outArc[tmpnd->outDegree]),nd->outArc,nd->outDegree*sizeof( DGArc *)); + tmpnd->outDegree += nd->outDegree; + } + free(nd); + return i; + } + nd->id = dg->numNodes; + dg->node[dg->numNodes] = nd; + dg->numNodes++; +return nd->id; +} +int AttachArc(DGraph *dg,DGArc* nar){ +int arcId = -1; +int i=0,newNumber=0; +DGNode *head = nar->head, + *tail = nar->tail; +DGArc **ars=NULL,*probe=NULL; +/*fprintf(stderr,"AttachArc %ld\n",dg->numArcs); */ + if ( !tail || !head ) return arcId; + if ( dg->numArcs == dg->maxArcs-1 ) { + dg->maxArcs += BLOCK_SIZE; + ars =(DGArc **) calloc(dg->maxArcs,sizeof(DGArc*)); + memcpy(ars,dg->arc,(dg->maxArcs-BLOCK_SIZE)*sizeof(DGArc*)); + free(dg->arc); + dg->arc=ars; + } + for(i = 0; i < tail->outDegree; i++ ) { /* parallel arc */ + probe = tail->outArc[ i]; + if(probe->head == head + && + probe->length == nar->length + ){ + free(nar); + return probe->id; + } + } + + nar->id = dg->numArcs; + arcId=dg->numArcs; + dg->arc[dg->numArcs] = nar; + dg->numArcs++; + + head->inArc[ head->inDegree] = nar; + head->inDegree++; + if ( head->inDegree >= head->maxInDegree ) { + newNumber = 
head->maxInDegree + SMALL_BLOCK_SIZE; + ars =(DGArc **) calloc(newNumber,sizeof(DGArc*)); + memcpy(ars,head->inArc,(head->inDegree)*sizeof(DGArc*)); + free(head->inArc); + head->inArc=ars; + head->maxInDegree = newNumber; + } + tail->outArc[ tail->outDegree] = nar; + tail->outDegree++; + if(tail->outDegree >= tail->maxOutDegree ) { + newNumber = tail->maxOutDegree + SMALL_BLOCK_SIZE; + ars =(DGArc **) calloc(newNumber,sizeof(DGArc*)); + memcpy(ars,tail->outArc,(tail->outDegree)*sizeof(DGArc*)); + free(tail->outArc); + tail->outArc=ars; + tail->maxOutDegree = newNumber; + } +/*fprintf(stderr,"AttachArc: head->in=%d tail->out=%ld\n",head->inDegree,tail->outDegree);*/ +return arcId; +} +void graphShow(DGraph *dg,int DetailsLevel){ + int i=0,j=0; + fprintf(stderr,"%d.%s: (%d,%d)\n",dg->id,dg->name,dg->numNodes,dg->numArcs); + if ( DetailsLevel < 1) return; + for (i = 0; i < dg->numNodes; i++ ) { + DGNode *focusNode = dg->node[ i]; + if(DetailsLevel >= 2) { + for (j = 0; j < focusNode->inDegree; j++ ) { + fprintf(stderr,"\t "); + nodeShow(focusNode->inArc[ j]->tail); + } + } + nodeShow(focusNode); + if ( DetailsLevel < 2) continue; + for (j = 0; j < focusNode->outDegree; j++ ) { + fprintf(stderr, "\t "); + nodeShow(focusNode->outArc[ j]->head); + } + fprintf(stderr, "---\n"); + } + fprintf(stderr,"----------------------------------------\n"); + if ( DetailsLevel < 3) return; +} + + + diff --git a/examples/smpi/NAS/DT-trace/DGraph.h b/examples/smpi/NAS/DT-trace/DGraph.h new file mode 100644 index 0000000000..f38f898b24 --- /dev/null +++ b/examples/smpi/NAS/DT-trace/DGraph.h @@ -0,0 +1,43 @@ +#ifndef _DGRAPH +#define _DGRAPH + +#define BLOCK_SIZE 128 +#define SMALL_BLOCK_SIZE 32 + +typedef struct{ + int id; + void *tail,*head; + int length,width,attribute,maxWidth; +}DGArc; + +typedef struct{ + int maxInDegree,maxOutDegree; + int inDegree,outDegree; + int id; + char *name; + DGArc **inArc,**outArc; + int depth,height,width; + int color,attribute,address,verified; + void *feat; +}DGNode; + +typedef struct{ + int maxNodes,maxArcs; + int id; + char *name; + int numNodes,numArcs; + DGNode **node; + DGArc **arc; +} DGraph; + +DGArc *newArc(DGNode *tl,DGNode *hd); +void arcShow(DGArc *ar); +DGNode *newNode(char *nm); +void nodeShow(DGNode* nd); + +DGraph* newDGraph(char *nm); +int AttachNode(DGraph *dg,DGNode *nd); +int AttachArc(DGraph *dg,DGArc* nar); +void graphShow(DGraph *dg,int DetailsLevel); + +#endif diff --git a/examples/smpi/NAS/DT-trace/Makefile b/examples/smpi/NAS/DT-trace/Makefile new file mode 100644 index 0000000000..8953b82516 --- /dev/null +++ b/examples/smpi/NAS/DT-trace/Makefile @@ -0,0 +1,26 @@ +SHELL=/bin/sh +BENCHMARK=dt +BENCHMARKU=DT + +include ../config/make.def + +include ../sys/make.common +#Override PROGRAM +DTPROGRAM = $(BINDIR)/$(BENCHMARK).$(CLASS) + +OBJS = dt-trace.o DGraph.o \ + ${COMMON}/c_print_results.o ${COMMON}/c_timers.o ${COMMON}/c_randdp.o + + +${PROGRAM}: config ${OBJS} + ${CLINK} ${CLINKFLAGS} -o ${DTPROGRAM} ${OBJS} ${CMPI_LIB} + +.c.o: + ${CCOMPILE} $< + +dt-trace.o: dt-trace.c npbparams.h +DGraph.o: DGraph.c DGraph.h + +clean: + - rm -f *.o *~ mputil* + - rm -f dt npbparams.h core diff --git a/examples/smpi/NAS/DT-trace/README b/examples/smpi/NAS/DT-trace/README new file mode 100644 index 0000000000..873e3ae6f2 --- /dev/null +++ b/examples/smpi/NAS/DT-trace/README @@ -0,0 +1,22 @@ +Data Traffic benchmark DT is new in the NPB suite +(released as part of NPB3.x-MPI package). 
+----------------------------------------------------
+
+DT is written in C and the same executable can run on any number of
+processors, provided this number is not less than the number of nodes in
+the communication graph. The DT benchmark takes one argument: BH, WH, or
+SH. This argument specifies the communication graph Black Hole, White
+Hole, or SHuffle, respectively. The current release contains verification
+numbers for CLASSES S, W, A, and B only. Classes C and D are defined, but
+verification numbers are not provided in this release.
+
+The following table summarizes the number of nodes in the communication
+graph based on CLASS and graph TYPE.
+
+CLASS   N_Source   N_Nodes(BH,WH)   N_Nodes(SH)
+  S         4             5              12
+  W         8            11              32
+  A        16            21              80
+  B        32            43             192
+  C        64            85             448
+  D       128           171            1024

diff --git a/examples/smpi/NAS/DT-trace/dt-trace.c b/examples/smpi/NAS/DT-trace/dt-trace.c
new file mode 100644
index 0000000000..89ab4de079
--- /dev/null
+++ b/examples/smpi/NAS/DT-trace/dt-trace.c
@@ -0,0 +1,767 @@
+/*************************************************************************
+ *                                                                       *
+ *       N  A  S     P A R A L L E L     B E N C H M A R K S  3.3        *
+ *                                                                       *
+ *                                 D T                                   *
+ *                                                                       *
+ *************************************************************************
+ *                                                                       *
+ *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
+ *                                                                       *
+ *   Permission to use, copy, distribute and modify this software        *
+ *   for any purpose with or without fee is hereby granted.  We          *
+ *   request, however, that all derived work reference the NAS           *
+ *   Parallel Benchmarks 3.3. This software is provided "as is"          *
+ *   without express or implied warranty.                                *
+ *                                                                       *
+ *   Information on NPB 3.3, including the technical report, the         *
+ *   original specifications, source code, results and information       *
+ *   on how to submit new results, is available at:                      *
+ *                                                                       *
+ *          http://www.nas.nasa.gov/Software/NPB                         *
+ *                                                                       *
+ *   Send comments or suggestions to  npb@nas.nasa.gov                   *
+ *   Send bug reports to              npb-bugs@nas.nasa.gov              *
+ *                                                                       *
+ *         NAS Parallel Benchmarks Group                                 *
+ *         NASA Ames Research Center                                     *
+ *         Mail Stop: T27A-1                                             *
+ *         Moffett Field, CA   94035-1000                                *
+ *                                                                       *
+ *         E-mail:  npb@nas.nasa.gov                                     *
+ *         Fax:     (650) 604-3957                                       *
+ *                                                                       *
+ *************************************************************************
+ *                                                                       *
+ *   Author: M. Frumkin                                                  *
+ *                                                                       *
+ *************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "mpi.h"
+#include "npbparams.h"
+
+#include "instr/instr.h" //TRACE_
+
+#ifndef CLASS
+#define CLASS 'S'
+#define NUM_PROCS 1
+#endif
+
+//int passed_verification;
+extern double randlc( double *X, double *A );
+extern
+void c_print_results( char   *name,
+                      char   class,
+                      int    n1,
+                      int    n2,
+                      int    n3,
+                      int    niter,
+                      int    nprocs_compiled,
+                      int    nprocs_total,
+                      double t,
+                      double mops,
+                      char   *optype,
+                      int    passed_verification,
+                      char   *npbversion,
+                      char   *compiletime,
+                      char   *mpicc,
+                      char   *clink,
+                      char   *cmpi_lib,
+                      char   *cmpi_inc,
+                      char   *cflags,
+                      char   *clinkflags );
+
+void   timer_clear( int n );
+void   timer_start( int n );
+void   timer_stop( int n );
+double timer_read( int n );
+int timer_on=0,timers_tot=64;
+
+int verify(char *bmname,double rnm2){
+  double verify_value=0.0;
+  double epsilon=1.0E-8;
+  char cls=CLASS;
+  int verified=-1;
+  if (cls != 'U') {
+    if(cls=='S') {
+      if(strstr(bmname,"BH")){
+        verify_value=30892725.0;
+      }else if(strstr(bmname,"WH")){
+        verify_value=67349758.0;
+      }else if(strstr(bmname,"SH")){
+        verify_value=58875767.0;
+      }else{
+        fprintf(stderr,"No such benchmark as %s.\n",bmname);
+      }
+      verified = 0;
+    }else if(cls=='W') {
+      if(strstr(bmname,"BH")){
+        verify_value = 4102461.0;
+      }else if(strstr(bmname,"WH")){
+        verify_value = 204280762.0;
+      }else if(strstr(bmname,"SH")){
+        verify_value = 186944764.0;
+      }else{
+        fprintf(stderr,"No such benchmark as %s.\n",bmname);
+      }
+      verified = 0;
+    }else if(cls=='A') {
+      if(strstr(bmname,"BH")){
+        verify_value = 17809491.0;
+      }else if(strstr(bmname,"WH")){
+        verify_value = 1289925229.0;
+      }else if(strstr(bmname,"SH")){
+        verify_value = 610856482.0;
+      }else{
+        fprintf(stderr,"No such benchmark as %s.\n",bmname);
+      }
+      verified = 0;
+    }else if(cls=='B') {
+      if(strstr(bmname,"BH")){
+        verify_value = 4317114.0;
+      }else if(strstr(bmname,"WH")){
+        verify_value = 7877279917.0;
+      }else if(strstr(bmname,"SH")){
+        verify_value = 1836863082.0;
+      }else{
+        fprintf(stderr,"No such benchmark as %s.\n",bmname);
+        verified = 0;
+      }
+    }else if(cls=='C') {
+      if(strstr(bmname,"BH")){
+        verify_value = 0.0;
+      }else if(strstr(bmname,"WH")){
+        verify_value = 0.0;
+      }else if(strstr(bmname,"SH")){
+        verify_value = 0.0;
+      }else{
+        fprintf(stderr,"No such benchmark as %s.\n",bmname);
+        verified = -1;
+      }
+    }else if(cls=='D') {
+      if(strstr(bmname,"BH")){
+        verify_value = 0.0;
+      }else if(strstr(bmname,"WH")){
+        verify_value = 0.0;
+      }else if(strstr(bmname,"SH")){
+        verify_value = 0.0;
+      }else{
+        fprintf(stderr,"No such benchmark as %s.\n",bmname);
+      }
+      verified = -1;
+    }else{
+      fprintf(stderr,"No such class as %c.\n",cls);
+    }
+    fprintf(stderr," %s L2 Norm = %f\n",bmname,rnm2);
+    if(verified==-1){
+      fprintf(stderr," No verification was performed.\n");
+    }else if( rnm2 - verify_value < epsilon &&
+              rnm2 - verify_value > -epsilon) { /* abs here does not work on ALTIX */
+      verified = 1;
+      fprintf(stderr," Deviation = %f\n",(rnm2 - verify_value));
+    }else{
+      verified = 0;
+      fprintf(stderr," The correct verification value = %f\n",verify_value);
+      fprintf(stderr," Got value = %f\n",rnm2);
+    }
+  }else{
+    verified = -1;
+  }
+  return verified;
+}
+
+int ipowMod(int a,long long int n,int md){
+  int seed=1,q=a,r=1;
+  if(n<0){
+    fprintf(stderr,"ipowMod: exponent must be nonnegative exp=%lld\n",n);
+    n=-n; /* temp fix */
+/*  return 1; */
+  }
+  if(md<=0){
+    fprintf(stderr,"ipowMod: module must be positive mod=%d",md);
+    return 1;
+  }
+  if(n==0) return 1;
+  while(n>1){
+    int n2 = n/2;
+    if (n2*2==n){
+      seed = (q*q)%md;
+      q=seed;
+      n = n2;
+    }else{
+      seed = (r*q)%md;
+      r=seed;
+      n = n-1;
+    }
+  }
+  seed = (r*q)%md;
+  return seed;
+}
+
+#include "DGraph.h"
+DGraph *buildSH(char cls){
+/*
+  Nodes of the graph must be topologically sorted
+  to avoid MPI deadlock.
+*/
+  DGraph *dg;
+  int numSources=NUM_SOURCES; /* must be power of 2 */
+  int numOfLayers=0,tmpS=numSources>>1;
+  int firstLayerNode=0;
+  DGArc *ar=NULL;
+  DGNode *nd=NULL;
+  int mask=0x0,ndid=0,ndoff=0;
+  int i=0,j=0;
+  char nm[BLOCK_SIZE];
+
+  sprintf(nm,"DT_SH.%c",cls);
+  dg=newDGraph(nm);
+
+  while(tmpS>1){
+    numOfLayers++;
+    tmpS>>=1;
+  }
+  for(i=0;i<numSources;i++){
+    sprintf(nm,"Source.%d",i);
+    nd=newNode(nm);
+    AttachNode(dg,nd);
+  }
+  for(j=0;j<numOfLayers;j++){
+    mask=0x00000001<<j;
+    for(i=0;i<numSources;i++){
+      sprintf(nm,"Comparator.%d",(j*numSources+i));
+      nd=newNode(nm);
+      AttachNode(dg,nd);
+      ndoff=i&(~mask);
+      ndid=firstLayerNode+ndoff;
+      ar=newArc(dg->node[ndid],nd);
+      AttachArc(dg,ar);
+      ndoff+=mask;
+      ndid=firstLayerNode+ndoff;
+      ar=newArc(dg->node[ndid],nd);
+      AttachArc(dg,ar);
+    }
+    firstLayerNode+=numSources;
+  }
+  mask=0x00000001<<numOfLayers;
+  for(i=0;i<numSources;i++){
+    sprintf(nm,"Sink.%d",i);
+    nd=newNode(nm);
+    AttachNode(dg,nd);
+    ndoff=i&(~mask);
+    ndid=firstLayerNode+ndoff;
+    ar=newArc(dg->node[ndid],nd);
+    AttachArc(dg,ar);
+    ndoff+=mask;
+    ndid=firstLayerNode+ndoff;
+    ar=newArc(dg->node[ndid],nd);
+    AttachArc(dg,ar);
+  }
+return dg;
+}
+DGraph *buildWH(char cls){
+/*
+  Nodes of the graph must be topologically sorted
+  to avoid MPI deadlock.
+*/
+  int i=0,j=0;
+  int numSources=NUM_SOURCES,maxInDeg=4;
+  int numLayerNodes=numSources,firstLayerNode=0;
+  int totComparators=0;
+  int numPrevLayerNodes=numLayerNodes;
+  int id=0,sid=0;
+  DGraph *dg;
+  DGNode *nd=NULL,*source=NULL,*tmp=NULL,*snd=NULL;
+  DGArc *ar=NULL;
+  char nm[BLOCK_SIZE];
+
+  sprintf(nm,"DT_WH.%c",cls);
+  dg=newDGraph(nm);
+
+  for(i=0;i<numSources;i++){
+    sprintf(nm,"Sink.%d",i);
+    nd=newNode(nm);
+    AttachNode(dg,nd);
+  }
+  while(numLayerNodes>maxInDeg){
+    numLayerNodes=numLayerNodes/maxInDeg;
+    if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
+    for(i=0;i<numLayerNodes;i++){
+      sprintf(nm,"Comparator.%d",totComparators);
+      totComparators++;
+      nd=newNode(nm);
+      id=AttachNode(dg,nd);
+      for(j=0;j<maxInDeg;j++){
+        sid=i*maxInDeg+j;
+        if(sid>=numPrevLayerNodes) break;
+        snd=dg->node[firstLayerNode+sid];
+        ar=newArc(dg->node[id],snd);
+        AttachArc(dg,ar);
+      }
+    }
+    firstLayerNode+=numPrevLayerNodes;
+    numPrevLayerNodes=numLayerNodes;
+  }
+  source=newNode("Source");
+  AttachNode(dg,source);
+  for(i=0;i<numPrevLayerNodes;i++){
+    nd=dg->node[firstLayerNode+i];
+    ar=newArc(source,nd);
+    AttachArc(dg,ar);
+  }
+
+  for(i=0;i<dg->numNodes/2;i++){ /* Topological sorting */
+    tmp=dg->node[i];
+    dg->node[i]=dg->node[dg->numNodes-1-i];
+    dg->node[i]->id=i;
+    dg->node[dg->numNodes-1-i]=tmp;
+    dg->node[dg->numNodes-1-i]->id=dg->numNodes-1-i;
+  }
+return dg;
+}
+DGraph *buildBH(char cls){
+/*
+  Nodes of the graph must be topologically sorted
+  to avoid MPI deadlock.
+*/
+  int i=0,j=0;
+  int numSources=NUM_SOURCES,maxInDeg=4;
+  int numLayerNodes=numSources,firstLayerNode=0;
+  DGraph *dg;
+  DGNode *nd=NULL, *snd=NULL, *sink=NULL;
+  DGArc *ar=NULL;
+  int totComparators=0;
+  int numPrevLayerNodes=numLayerNodes;
+  int id=0, sid=0;
+  char nm[BLOCK_SIZE];
+
+  sprintf(nm,"DT_BH.%c",cls);
+  dg=newDGraph(nm);
+
+  for(i=0;i<numSources;i++){
+    sprintf(nm,"Source.%d",i);
+    nd=newNode(nm);
+    AttachNode(dg,nd);
+  }
+  while(numLayerNodes>maxInDeg){
+    numLayerNodes=numLayerNodes/maxInDeg;
+    if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
+    for(i=0;i<numLayerNodes;i++){
+      sprintf(nm,"Comparator.%d",totComparators);
+      totComparators++;
+      nd=newNode(nm);
+      id=AttachNode(dg,nd);
+      for(j=0;j<maxInDeg;j++){
+        sid=i*maxInDeg+j;
+        if(sid>=numPrevLayerNodes) break;
+        snd=dg->node[firstLayerNode+sid];
+        ar=newArc(snd,dg->node[id]);
+        AttachArc(dg,ar);
+      }
+    }
+    firstLayerNode+=numPrevLayerNodes;
+    numPrevLayerNodes=numLayerNodes;
+  }
+  sink=newNode("Sink");
+  AttachNode(dg,sink);
+  for(i=0;i<numPrevLayerNodes;i++){
+    nd=dg->node[firstLayerNode+i];
+    ar=newArc(nd,sink);
+    AttachArc(dg,ar);
+  }
+return dg;
+}
+
+typedef struct{
+  int len;
+  double* val;
+} Arr;
+Arr *newArr(int len){
+  Arr *arr=(Arr *)malloc(sizeof(Arr));
+  arr->len=len;
+  arr->val=(double *)malloc(len*sizeof(double));
+  return arr;
+}
+void arrShow(Arr* a){
+  if(!a) fprintf(stderr,"-- NULL array\n");
+  else{
+    fprintf(stderr,"-- length=%d\n",a->len);
+  }
+}
+double CheckVal(Arr *feat){
+  double csum=0.0;
+  int i=0;
+  for(i=0;i<feat->len;i++){
+    csum+=feat->val[i]*feat->val[i]/feat->len; /* The truncation does not work since
+                                                  result will be 0 for large len */
+  }
+  return csum;
+}
+int GetFNumDPar(int* mean, int* stdev){
+  *mean=NUM_SAMPLES;
+  *stdev=STD_DEVIATION;
+  return 0;
+}
+int GetFeatureNum(char *mbname,int id){
+  double tran=314159265.0;
+  double A=2*id+1;
+  double denom=randlc(&tran,&A);
+  char cval='S';
+  int mean=NUM_SAMPLES,stdev=128;
+  int rtfs=0,len=0;
+  GetFNumDPar(&mean,&stdev);
+  rtfs=ipowMod((int)(1/denom)*(int)cval,(long long int) (2*id+1),2*stdev);
+  if(rtfs<0) rtfs=-rtfs;
+  len=mean-stdev+rtfs;
+  return len;
+}
+Arr* RandomFeatures(char *bmname,int fdim,int id){
+  int len=GetFeatureNum(bmname,id)*fdim;
+  Arr* feat=newArr(len);
+  int nxg=2,nyg=2,nzg=2,nfg=5;
+  int nx=421,ny=419,nz=1427,nf=3527;
+  long long int expon=(len*(id+1))%3141592;
+  int seedx=ipowMod(nxg,expon,nx),
+      seedy=ipowMod(nyg,expon,ny),
+      seedz=ipowMod(nzg,expon,nz),
+      seedf=ipowMod(nfg,expon,nf);
+  int i=0;
+  if(timer_on){
+    timer_clear(id+1);
+    timer_start(id+1);
+  }
+  for(i=0;i<len;i+=fdim){
+    seedx=(seedx*nxg)%nx;
+    seedy=(seedy*nyg)%ny;
+    seedz=(seedz*nzg)%nz;
+    seedf=(seedf*nfg)%nf;
+    feat->val[i]=seedx;
+    feat->val[i+1]=seedy;
+    feat->val[i+2]=seedz;
+    feat->val[i+3]=seedf;
+  }
+  if(timer_on){
+    timer_stop(id+1);
+    fprintf(stderr,"** RandomFeatures time in node %d = %f\n",id,timer_read(id+1));
+  }
+  return feat;
+}
+void Resample(Arr *a,int blen){
+  long long int i=0,j=0,jlo=0,jhi=0;
+  double avval=0.0;
+  double *nval=(double *)malloc(blen*sizeof(double));
+  Arr *tmp=newArr(10);
+  for(i=0;i<blen;i++) nval[i]=0.0;
+  for(i=1;i<a->len-1;i++){
+    jlo=(int)(0.5*(2*i-1)*(blen/a->len));
+    jhi=(int)(0.5*(2*i+1)*(blen/a->len));
+
+    avval=a->val[i]/(jhi-jlo+1);
+    for(j=jlo;j<=jhi;j++){
+      nval[j]+=avval;
+    }
+  }
+  nval[0]=a->val[0];
+  nval[blen-1]=a->val[a->len-1];
+  free(a->val);
+  a->val=nval;
+  a->len=blen;
+}
+#define fielddim 4
+Arr* WindowFilter(Arr *a, Arr* b,int w){
+  int i=0,j=0,k=0;
+  double rms0=0.0,rms1=0.0,rmsm1=0.0;
+  double weight=((double) (w+1))/(w+2);
+
+  w+=1;
+  if(timer_on){
+    timer_clear(w);
+    timer_start(w);
+  }
+  if(a->len<b->len) Resample(a,b->len);
+  if(a->len>b->len) Resample(b,a->len);
+  for(i=fielddim;i<a->len-fielddim;i+=fielddim){
+    rms0=(a->val[i]-b->val[i])*(a->val[i]-b->val[i])
+        +(a->val[i+1]-b->val[i+1])*(a->val[i+1]-b->val[i+1])
+        +(a->val[i+2]-b->val[i+2])*(a->val[i+2]-b->val[i+2])
+        +(a->val[i+3]-b->val[i+3])*(a->val[i+3]-b->val[i+3]);
+    j=i+fielddim;
+    rms1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
+        +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
+        +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
+        +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
+    j=i-fielddim;
+    rmsm1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
+         +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
+         +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
+         +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
+    k=0;
+    if(rms1<rms0){
+      k=1;
+      rms0=rms1;
+    }
+    if(rmsm1<rms0) k=-1;
+    if(k==0){
+      a->val[i]=weight*b->val[i];
+      a->val[i+1]=weight*b->val[i+1];
+      a->val[i+2]=weight*b->val[i+2];
+      a->val[i+3]=weight*b->val[i+3];
+    }else if(k==1){
+      j=i+fielddim;
+      a->val[i]=weight*b->val[j];
+      a->val[i+1]=weight*b->val[j+1];
+      a->val[i+2]=weight*b->val[j+2];
+      a->val[i+3]=weight*b->val[j+3];
+    }else { /*if(k==-1)*/
+      j=i-fielddim;
+      a->val[i]=weight*b->val[j];
+      a->val[i+1]=weight*b->val[j+1];
+      a->val[i+2]=weight*b->val[j+2];
+      a->val[i+3]=weight*b->val[j+3];
+    }
+  }
+  if(timer_on){
+    timer_stop(w);
+    fprintf(stderr,"** WindowFilter time in node %d = %f\n",(w-1),timer_read(w));
+  }
+  return a;
+}
+
+int SendResults(DGraph *dg,DGNode *nd,Arr *feat){
+  int i=0,tag=0;
+  DGArc *ar=NULL;
+  DGNode *head=NULL;
+  if(!feat) return 0;
+  for(i=0;i<nd->outDegree;i++){
+    ar=nd->outArc[i];
+    if(ar->tail!=nd) continue;
+    head=ar->head;
+    tag=ar->id;
+    if(head->address!=nd->address){
+      MPI_Send(&feat->len,1,MPI_INT,head->address,tag,MPI_COMM_WORLD);
+      MPI_Send(feat->val,feat->len,MPI_DOUBLE,head->address,tag,MPI_COMM_WORLD);
+    }
+  }
+  return 1;
+}
+Arr* CombineStreams(DGraph *dg,DGNode *nd){
+  Arr *resfeat=newArr(NUM_SAMPLES*fielddim);
+  int i=0,len=0,tag=0;
+  DGArc *ar=NULL;
+  DGNode *tail=NULL;
+  MPI_Status status;
+  Arr *feat=NULL,*featp=NULL;
+
+  if(nd->inDegree==0) return NULL;
+  for(i=0;i<nd->inDegree;i++){
+    ar=nd->inArc[i];
+    if(ar->head!=nd) continue;
+    tail=ar->tail;
+    if(tail->address!=nd->address){
+      len=0;
+      tag=ar->id;
+      MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
+      feat=newArr(len);
+      MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
+      resfeat=WindowFilter(resfeat,feat,nd->id);
+      free(feat);
+    }else{
+      featp=(Arr *)tail->feat;
+      feat=newArr(featp->len);
+      memcpy(feat->val,featp->val,featp->len*sizeof(double));
+      resfeat=WindowFilter(resfeat,feat,nd->id);
+      free(feat);
+    }
+  }
+  for(i=0;i<resfeat->len;i++) resfeat->val[i]=((int)resfeat->val[i])/nd->inDegree;
+  nd->feat=resfeat;
+  return nd->feat;
+}
+double Reduce(Arr *a,int w){
+  double retv=0.0;
+  if(timer_on){
+    timer_clear(w);
+    timer_start(w);
+  }
+  retv=(int)(w*CheckVal(a));/* The casting is needed for node
+                               and array dependent verification */
+  if(timer_on){
+    timer_stop(w);
+    fprintf(stderr,"** Reduce time in node %d = %f\n",(w-1),timer_read(w));
+  }
+  return retv;
+}
+
+double ReduceStreams(DGraph *dg,DGNode *nd){
+  double csum=0.0;
+  int i=0,len=0,tag=0;
+  DGArc *ar=NULL;
+  DGNode *tail=NULL;
+  Arr *feat=NULL;
+  double retv=0.0;
+
+  for(i=0;i<nd->inDegree;i++){
+    ar=nd->inArc[i];
+    if(ar->head!=nd) continue;
+    tail=ar->tail;
+    if(tail->address!=nd->address){
+      MPI_Status status;
+      len=0;
+      tag=ar->id;
+      MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
+      feat=newArr(len);
+      MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
+      csum+=Reduce(feat,(nd->id+1));
+      free(feat);
+    }else{
+      csum+=Reduce(tail->feat,(nd->id+1));
+    }
+  }
+  if(nd->inDegree>0)csum=(((long long int)csum)/nd->inDegree);
+  retv=(nd->id+1)*csum;
+  return retv;
+}
+
+int ProcessNodes(DGraph *dg,int me){
+  double chksum=0.0;
+  Arr *feat=NULL;
+  int i=0,verified=0,tag;
+  DGNode *nd=NULL;
+  double rchksum=0.0;
+  MPI_Status status;
+
+  for(i=0;i<dg->numNodes;i++){
+    nd=dg->node[i];
+    if(nd->address!=me) continue;
+    if(strstr(nd->name,"Source")){
+      nd->feat=RandomFeatures(dg->name,fielddim,nd->id);
+      SendResults(dg,nd,nd->feat);
+    }else if(strstr(nd->name,"Sink")){
+      chksum=ReduceStreams(dg,nd);
+      tag=dg->numArcs+nd->id; /* make these unique to avoid clash with arc tags */
+      MPI_Send(&chksum,1,MPI_DOUBLE,0,tag,MPI_COMM_WORLD);
+    }else{
+      feat=CombineStreams(dg,nd);
+      SendResults(dg,nd,feat);
+    }
+  }
+  if(me==0){ /* Report node */
+    rchksum=0.0;
+    chksum=0.0;
+    for(i=0;i<dg->numNodes;i++){
+      nd=dg->node[i];
+      if(!strstr(nd->name,"Sink")) continue;
+      tag=dg->numArcs+nd->id; /* make these unique to avoid clash with arc tags */
+      MPI_Recv(&rchksum,1,MPI_DOUBLE,nd->address,tag,MPI_COMM_WORLD,&status);
+      chksum+=rchksum;
+    }
+    verified=verify(dg->name,chksum);
+  }
+return verified;
+}
+
+int main(int argc,char **argv ){
+  int my_rank,comm_size;
+  int i;
+  DGraph *dg=NULL;
+  int verified=0, featnum=0;
+  double bytes_sent=2.0,tot_time=0.0;
+
+  TRACE_smpi_set_category ("start");
+
+  MPI_Init( &argc, &argv );
+  MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
+  MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
+
+  if(argc!=2||
+     (  strncmp(argv[1],"BH",2)!=0
+      &&strncmp(argv[1],"WH",2)!=0
+      &&strncmp(argv[1],"SH",2)!=0
+     )
+    ){
+    if(my_rank==0){
+      fprintf(stderr,"** Usage: mpirun -np N ../bin/dt.S GraphName\n");
+      fprintf(stderr,"** Where \n   - N is the integer number of MPI processes\n");
+      fprintf(stderr,"   - S is the class S, W, or A\n");
+      fprintf(stderr,"   - GraphName is the communication graph name BH, WH, or SH.\n");
+      fprintf(stderr,"   - the number of MPI processes N should not be less than\n");
+      fprintf(stderr,"     the number of nodes in the graph\n");
+    }
+    MPI_Finalize();
+    exit(0);
+  }
+  if(strncmp(argv[1],"BH",2)==0){
+    dg=buildBH(CLASS);
+  }else if(strncmp(argv[1],"WH",2)==0){
+    dg=buildWH(CLASS);
+  }else if(strncmp(argv[1],"SH",2)==0){
+    dg=buildSH(CLASS);
+  }
+
+  if(timer_on&&dg->numNodes+1>timers_tot){
+    timer_on=0;
+    if(my_rank==0)
+      fprintf(stderr,"Not enough timers. Node timing is off.
\n"); + } + if(dg->numNodes>comm_size){ + if(my_rank==0){ + fprintf(stderr,"** The number of MPI processes should not be less than \n"); + fprintf(stderr,"** the number of nodes in the graph\n"); + fprintf(stderr,"** Number of MPI processes = %d\n",comm_size); + fprintf(stderr,"** Number nodes in the graph = %d\n",dg->numNodes); + } + MPI_Finalize(); + exit(0); + } + for(i=0;inumNodes;i++){ + dg->node[i]->address=i; + } + if( my_rank == 0 ){ + printf( "\n\n NAS Parallel Benchmarks 3.3 -- DT Benchmark\n\n" ); + graphShow(dg,0); + timer_clear(0); + timer_start(0); + } + + TRACE_smpi_set_category ("dt"); + verified=ProcessNodes(dg,my_rank); + TRACE_smpi_set_category ("finalize"); + + featnum=NUM_SAMPLES*fielddim; + bytes_sent=featnum*dg->numArcs; + bytes_sent/=1048576; + if(my_rank==0){ + timer_stop(0); + tot_time=timer_read(0); + c_print_results( dg->name, + CLASS, + featnum, + 0, + 0, + dg->numNodes, + 0, + comm_size, + tot_time, + bytes_sent/tot_time, + "bytes transmitted", + verified, + NPBVERSION, + COMPILETIME, + MPICC, + CLINK, + CMPI_LIB, + CMPI_INC, + CFLAGS, + CLINKFLAGS ); + } + MPI_Finalize(); + return 1; +} diff --git a/examples/smpi/NAS/EP-trace/Makefile b/examples/smpi/NAS/EP-trace/Makefile new file mode 100644 index 0000000000..6194dcfe97 --- /dev/null +++ b/examples/smpi/NAS/EP-trace/Makefile @@ -0,0 +1,28 @@ +SHELL=/bin/sh +BENCHMARK=ep +BENCHMARKU=EP + +include ../config/make.def + +#OBJS = ep-trace.o ${COMMON}/print_results.o ${COMMON}/${RAND}.o ${COMMON}/timers.o +OBJS = ep-trace.o randlc.o + +include ../sys/make.common + +${PROGRAM}: config ${OBJS} +# ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB} + ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${CMPI_LIB} + + +#ep-trace.o: ep-trace.f mpinpb.h npbparams.h +# ${FCOMPILE} ep-trace.f + +ep-trace.o: ep-trace.c randlc.c mpinpb.h npbparams.h + ${CCOMPILE} ep-trace.c + +clean: + - rm -f *.o *~ + - rm -f npbparams.h core + + + diff --git a/examples/smpi/NAS/EP-trace/README b/examples/smpi/NAS/EP-trace/README new file mode 100644 index 0000000000..6eb36571af --- /dev/null +++ b/examples/smpi/NAS/EP-trace/README @@ -0,0 +1,6 @@ +This code implements the random-number generator described in the +NAS Parallel Benchmark document RNR Technical Report RNR-94-007. +The code is "embarrassingly" parallel in that no communication is +required for the generation of the random numbers itself. There is +no special requirement on the number of processors used for running +the benchmark. diff --git a/examples/smpi/NAS/EP-trace/ep-trace.c b/examples/smpi/NAS/EP-trace/ep-trace.c new file mode 100644 index 0000000000..a0cec1b10a --- /dev/null +++ b/examples/smpi/NAS/EP-trace/ep-trace.c @@ -0,0 +1,449 @@ +#include +#include +#include +#include + +#include "mpi.h" +#include "npbparams.h" + +#include "instr/instr.h" //TRACE_ + +#include "randlc.h" + +#ifndef CLASS +#define CLASS 'S' +#define NUM_PROCS 1 +#endif +#define true 1 +#define false 0 + + +//---NOTE : all the timers function have been modified to +// avoid global timers (privatize these). 
+ // ----------------------- timers --------------------- + void timer_clear(double *onetimer) { + //elapsed[n] = 0.0; + *onetimer = 0.0; + } + + void timer_start(double *onetimer) { + *onetimer = MPI_Wtime(); + } + + void timer_stop(int n,double *elapsed,double *start) { + double t, now; + + now = MPI_Wtime(); + t = now - start[n]; + elapsed[n] += t; + } + + double timer_read(int n, double *elapsed) { /* ok, useless, but jsut to keep function call */ + return(elapsed[n]); + } + /******************************************************************** + ***************** V R A N L C ****************** + ***************** *****************/ + double vranlc(int n, double x, double a, double *y) + { + int i; + long i246m1=0x00003FFFFFFFFFFF; + long LLx, Lx, La; + double d2m46; + +// This doesn't work, because the compiler does the calculation in 32 +// bits and overflows. No standard way (without f90 stuff) to specify +// that the rhs should be done in 64 bit arithmetic. +// parameter(i246m1=2**46-1) + + d2m46=pow(0.5,46); + +// c Note that the v6 compiler on an R8000 does something stupid with +// c the above. Using the following instead (or various other things) +// c makes the calculation run almost 10 times as fast. +// +// c save d2m46 +// c data d2m46/0.0d0/ +// c if (d2m46 .eq. 0.0d0) then +// c d2m46 = 0.5d0**46 +// c endif + + Lx = (long)x; + La = (long)a; + //fprintf(stdout,("================== Vranlc ================"); + //fprintf(stdout,("Before Loop: Lx = " + Lx + ", La = " + La); + LLx = Lx; + for (i=0; i< n; i++) { + Lx = Lx*La & i246m1 ; + LLx = Lx; + y[i] = d2m46 * (double)LLx; + /* + if(i == 0) { + fprintf(stdout,("After loop 0:"); + fprintf(stdout,("Lx = " + Lx + ", La = " + La); + fprintf(stdout,("d2m46 = " + d2m46); + fprintf(stdout,("LLX(Lx) = " + LLX.doubleValue()); + fprintf(stdout,("Y[0]" + y[0]); + } + */ + } + + x = (double)LLx; + /* + fprintf(stdout,("Change: Lx = " + Lx); + fprintf(stdout,("=============End Vranlc ================"); + */ + return x; + } + + + +//-------------- the core (unique function) ----------- + void doTest(int argc, char **argv) { + double dum[3] = {1.,1.,1.}; + double x1, x2, sx, sy, tm, an, tt, gc; + double Mops; + double epsilon=1.0E-8, a = 1220703125., s=271828183.; + double t1, t2, t3, t4; + double sx_verify_value, sy_verify_value, sx_err, sy_err; + +#include "npbparams.h" + int mk=16, + // --> set by make : in npbparams.h + //m=28, // for CLASS=A + //m=30, // for CLASS=B + //npm=2, // NPROCS + mm = m-mk, + nn = (int)(pow(2,mm)), + nk = (int)(pow(2,mk)), + nq=10, + np, + node, + no_nodes, + i, + ik, + kk, + l, + k, nit, no_large_nodes, + np_add, k_offset, j; + int me, nprocs, root=0, dp_type; + int verified, + timers_enabled=true; + char size[500]; // mind the size of the string to represent a big number + + //Use in randlc.. + int KS = 0; + double R23, R46, T23, T46; + + double *qq = (double *) malloc (10000*sizeof(double)); + double *start = (double *) malloc (64*sizeof(double)); + double *elapsed = (double *) malloc (64*sizeof(double)); + + double *x = (double *) malloc (2*nk*sizeof(double)); + double *q = (double *) malloc (nq*sizeof(double)); + + TRACE_smpi_set_category ("start"); + + MPI_Init( &argc, &argv ); + MPI_Comm_size( MPI_COMM_WORLD, &no_nodes); + MPI_Comm_rank( MPI_COMM_WORLD, &node); + +#ifdef USE_MPE + MPE_Init_log(); +#endif + root = 0; + if (node == root ) { + + /* Because the size of the problem is too large to store in a 32-bit + * integer for some classes, we put it into a string (for printing). 
+ * Have to strip off the decimal point put in there by the floating + * point print statement (internal file) + */ + fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark"); + sprintf(size,"%d",pow(2,m+1)); + //size = size.replace('.', ' '); + fprintf(stdout," Number of random numbers generated: %s\n",size); + fprintf(stdout," Number of active processes: %d\n",no_nodes); + + } + verified = false; + + /* c Compute the number of "batches" of random number pairs generated + c per processor. Adjust if the number of processors does not evenly + c divide the total number +*/ + + np = nn / no_nodes; + no_large_nodes = nn % no_nodes; + if (node < no_large_nodes) np_add = 1; + else np_add = 0; + np = np + np_add; + + if (np == 0) { + fprintf(stdout,"Too many nodes: %d %d",no_nodes,nn); + MPI_Abort(MPI_COMM_WORLD,1); + exit(0); + } + +/* c Call the random number generator functions and initialize + c the x-array to reduce the effects of paging on the timings. + c Also, call all mathematical functions that are used. Make + c sure these initializations cannot be eliminated as dead code. +*/ + + //call vranlc(0, dum[1], dum[2], dum[3]); + // Array indexes start at 1 in Fortran, 0 in Java + vranlc(0, dum[0], dum[1], &(dum[2])); + + dum[0] = randlc(&(dum[1]),&(dum[2])); + ///////////////////////////////// + for (i=0;i<2*nk;i++) { + x[i] = -1e99; + } + Mops = log(sqrt(abs(1))); + + /* + c--------------------------------------------------------------------- + c Synchronize before placing time stamp + c--------------------------------------------------------------------- + */ + MPI_Barrier( MPI_COMM_WORLD ); + + + TRACE_smpi_set_category ("ep"); + + timer_clear(&(elapsed[1])); + timer_clear(&(elapsed[2])); + timer_clear(&(elapsed[3])); + timer_start(&(start[1])); + + t1 = a; + //fprintf(stdout,("(ep.f:160) t1 = " + t1); + t1 = vranlc(0, t1, a, x); + //fprintf(stdout,("(ep.f:161) t1 = " + t1); + + +/* c Compute AN = A ^ (2 * NK) (mod 2^46). */ + + t1 = a; + //fprintf(stdout,("(ep.f:165) t1 = " + t1); + for (i=1; i <= mk+1; i++) { + t2 = randlc(&t1, &t1); + //fprintf(stdout,("(ep.f:168)[loop i=" + i +"] t1 = " + t1); + } + an = t1; + //fprintf(stdout,("(ep.f:172) s = " + s); + tt = s; + gc = 0.; + sx = 0.; + sy = 0.; + for (i=0; i < nq ; i++) { + q[i] = 0.; + } + +/* + Each instance of this loop may be performed independently. We compute + the k offsets separately to take into account the fact that some nodes + have more numbers to generate than others +*/ + + if (np_add == 1) + k_offset = node * np -1; + else + k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1; + + int stop = false; + for(k = 1; k <= np; k++) { + stop = false; + kk = k_offset + k ; + t1 = s; + //fprintf(stdout,("(ep.f:193) t1 = " + t1); + t2 = an; + +// Find starting seed t1 for this kk. + + for (i=1;i<=100 && !stop;i++) { + ik = kk / 2; + //fprintf(stdout,("(ep.f:199) ik = " +ik+", kk = " + kk); + if (2 * ik != kk) { + t3 = randlc(&t1, &t2); + //fprintf(stdout,("(ep.f:200) t1= " +t1 ); + } + if (ik==0) + stop = true; + else { + t3 = randlc(&t2, &t2); + kk = ik; + } + } +// Compute uniform pseudorandom numbers. 
+ + //if (timers_enabled) timer_start(3); + timer_start(&(start[3])); + //call vranlc(2 * nk, t1, a, x) --> t1 and y are modified + + //fprintf(stdout,">>>>>>>>>>>Before vranlc(l.210)<<<<<<<<<<<<<"); + //fprintf(stdout,"2*nk = " + (2*nk)); + //fprintf(stdout,"t1 = " + t1); + //fprintf(stdout,"a = " + a); + //fprintf(stdout,"x[0] = " + x[0]); + //fprintf(stdout,">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<"); + + t1 = vranlc(2 * nk, t1, a, x); + + //fprintf(stdout,(">>>>>>>>>>>After Enter vranlc (l.210)<<<<<<"); + //fprintf(stdout,("2*nk = " + (2*nk)); + //fprintf(stdout,("t1 = " + t1); + //fprintf(stdout,("a = " + a); + //fprintf(stdout,("x[0] = " + x[0]); + //fprintf(stdout,(">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<"); + + //if (timers_enabled) timer_stop(3); + timer_stop(3,elapsed,start); + +/* Compute Gaussian deviates by acceptance-rejection method and + * tally counts in concentric square annuli. This loop is not + * vectorizable. + */ + //if (timers_enabled) timer_start(2); + timer_start(&(start[2])); + for(i=1; i<=nk;i++) { + x1 = 2. * x[2*i-2] -1.0; + x2 = 2. * x[2*i-1] - 1.0; + t1 = x1*x1 + x2*x2; + if (t1 <= 1.) { + t2 = sqrt(-2. * log(t1) / t1); + t3 = (x1 * t2); + t4 = (x2 * t2); + l = (int)(abs(t3) > abs(t4) ? abs(t3) : abs(t4)); + q[l] = q[l] + 1.; + sx = sx + t3; + sy = sy + t4; + } + /* + if(i == 1) { + fprintf(stdout,"x1 = " + x1); + fprintf(stdout,"x2 = " + x2); + fprintf(stdout,"t1 = " + t1); + fprintf(stdout,"t2 = " + t2); + fprintf(stdout,"t3 = " + t3); + fprintf(stdout,"t4 = " + t4); + fprintf(stdout,"l = " + l); + fprintf(stdout,"q[l] = " + q[l]); + fprintf(stdout,"sx = " + sx); + fprintf(stdout,"sy = " + sy); + } + */ + } + //if (timers_enabled) timer_stop(2); + timer_stop(2,elapsed,start); + } + + TRACE_smpi_set_category ("finalize"); + + //int MPI_Allreduce(void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm) + MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + sx = x[0]; //FIXME : x[0] or x[1] => x[0] because fortran starts with 1 + MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + sy = x[0]; + MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + + for(i = 0; i < nq; i++) { + q[i] = x[i]; + } + for(i = 0; i < nq; i++) { + gc += q[i]; + } + + timer_stop(1,elapsed,start); + tm = timer_read(1,elapsed); + MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + tm = x[0]; + + if(node == root) { + nit = 0; + verified = true; + + if(m == 24) { + sx_verify_value = -3.247834652034740E3; + sy_verify_value = -6.958407078382297E3; + } else if(m == 25) { + sx_verify_value = -2.863319731645753E3; + sy_verify_value = -6.320053679109499E3; + } else if(m == 28) { + sx_verify_value = -4.295875165629892E3; + sy_verify_value = -1.580732573678431E4; + } else if(m == 30) { + sx_verify_value = 4.033815542441498E4; + sy_verify_value = -2.660669192809235E4; + } else if(m == 32) { + sx_verify_value = 4.764367927995374E4; + sy_verify_value = -8.084072988043731E4; + } else if(m == 36) { + sx_verify_value = 1.982481200946593E5; + sy_verify_value = -1.020596636361769E5; + } else { + verified = false; + } + + /* + fprintf(stdout,("sx = " + sx); + fprintf(stdout,("sx_verify = " + sx_verify_value); + fprintf(stdout,("sy = " + sy); + fprintf(stdout,("sy_verify = " + sy_verify_value); + */ + if(verified) { + sx_err = abs((sx - sx_verify_value)/sx_verify_value); + sy_err = abs((sy - sy_verify_value)/sy_verify_value); + /* + fprintf(stdout,("sx_err = " + sx_err); + fprintf(stdout,("sy_err = " + sx_err); + 
fprintf(stdout,("epsilon= " + epsilon); + */ + verified = ((sx_err < epsilon) && (sy_err < epsilon)); + } + + Mops = (pow(2.0, m+1))/tm/1000; + + fprintf(stdout,"EP Benchmark Results:\n"); + fprintf(stdout,"CPU Time=%d\n",tm); + fprintf(stdout,"N = 2^%d\n",m); + fprintf(stdout,"No. Gaussain Pairs =%d\n",gc); + fprintf(stdout,"Sum = %lf %ld\n",sx,sy); + fprintf(stdout,"Count:"); + for(i = 0; i < nq; i++) { + fprintf(stdout,"%d\t %ld\n",i,q[i]); + } + + /* + print_results("EP", _class, m+1, 0, 0, nit, npm, no_nodes, tm, Mops, + "Random numbers generated", verified, npbversion, + compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) */ + fprintf(stdout,"\nEP Benchmark Completed\n"); + fprintf(stdout,"Class = %s\n", _class); + fprintf(stdout,"Size = %s\n", size); + fprintf(stdout,"Iteration = %d\n", nit); + fprintf(stdout,"Time in seconds = %lf\n",(tm/1000)); + fprintf(stdout,"Total processes = %d\n",no_nodes); + fprintf(stdout,"Mops/s total = %lf\n",Mops); + fprintf(stdout,"Mops/s/process = %lf\n", Mops/no_nodes); + fprintf(stdout,"Operation type = Random number generated\n"); + if(verified) { + fprintf(stdout,"Verification = SUCCESSFUL\n"); + } else { + fprintf(stdout,"Verification = UNSUCCESSFUL\n"); + } + fprintf(stdout,"Total time: %lf\n",(timer_read(1,elapsed)/1000)); + fprintf(stdout,"Gaussian pairs: %lf\n",(timer_read(2,elapsed)/1000)); + fprintf(stdout,"Random numbers: %lf\n",(timer_read(3,elapsed)/1000)); + } +#ifdef USE_MPE + MPE_Finish_log(argv[0]); +#endif + + MPI_Finalize(); + } + + int main(int argc, char **argv) { + doTest(argc,argv); + } diff --git a/examples/smpi/NAS/EP-trace/mpinpb.h b/examples/smpi/NAS/EP-trace/mpinpb.h new file mode 100644 index 0000000000..1f13637ae4 --- /dev/null +++ b/examples/smpi/NAS/EP-trace/mpinpb.h @@ -0,0 +1,9 @@ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + include 'mpif.h' + + integer me, nprocs, root, dp_type + common /mpistuff/ me, nprocs, root, dp_type + diff --git a/examples/smpi/NAS/EP-trace/randlc.c b/examples/smpi/NAS/EP-trace/randlc.c new file mode 100644 index 0000000000..624b800df8 --- /dev/null +++ b/examples/smpi/NAS/EP-trace/randlc.c @@ -0,0 +1,107 @@ + +/* + * FUNCTION RANDLC (X, A) + * + * This routine returns a uniform pseudorandom double precision number in the + * range (0, 1) by using the linear congruential generator + * + * x_{k+1} = a x_k (mod 2^46) + * + * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers + * before repeating. The argument A is the same as 'a' in the above formula, + * and X is the same as x_0. A and X must be odd double precision integers + * in the range (1, 2^46). The returned value RANDLC is normalized to be + * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain + * the new seed x_1, so that subsequent calls to RANDLC using the same + * arguments will generate a continuous sequence. + * + * This routine should produce the same results on any computer with at least + * 48 mantissa bits in double precision floating point data. On Cray systems, + * double precision should be disabled. + * + * David H. Bailey October 26, 1990 + * + * IMPLICIT DOUBLE PRECISION (A-H, O-Z) + * SAVE KS, R23, R46, T23, T46 + * DATA KS/0/ + * + * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46, + * T23 = 2 ^ 23, and T46 = 2 ^ 46. 
These are computed in loops, rather than + * by merely using the ** operator, in order to insure that the results are + * exact on all systems. This code assumes that 0.5D0 is represented exactly. + */ + + +/*****************************************************************/ +/************* R A N D L C ************/ +/************* ************/ +/************* portable random number generator ************/ +/*****************************************************************/ + +double randlc( double *X, double *A ) +{ + static int KS=0; + static double R23, R46, T23, T46; + double T1, T2, T3, T4; + double A1; + double A2; + double X1; + double X2; + double Z; + int i, j; + + if (KS == 0) + { + R23 = 1.0; + R46 = 1.0; + T23 = 1.0; + T46 = 1.0; + + for (i=1; i<=23; i++) + { + R23 = 0.50 * R23; + T23 = 2.0 * T23; + } + for (i=1; i<=46; i++) + { + R46 = 0.50 * R46; + T46 = 2.0 * T46; + } + KS = 1; + } + +/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */ + + T1 = R23 * *A; + j = T1; + A1 = j; + A2 = *A - T23 * A1; + +/* Break X into two parts such that X = 2^23 * X1 + X2, compute + Z = A1 * X2 + A2 * X1 (mod 2^23), and then + X = 2^23 * Z + A2 * X2 (mod 2^46). */ + + T1 = R23 * *X; + j = T1; + X1 = j; + X2 = *X - T23 * X1; + T1 = A1 * X2 + A2 * X1; + + j = R23 * T1; + T2 = j; + Z = T1 - T23 * T2; + T3 = T23 * Z + A2 * X2; + j = R46 * T3; + T4 = j; + *X = T3 - T46 * T4; + return(R46 * *X); +} + + + +/*****************************************************************/ +/************ F I N D _ M Y _ S E E D ************/ +/************ ************/ +/************ returns parallel random number seq seed ************/ +/*****************************************************************/ + diff --git a/examples/smpi/NAS/EP-trace/randlc.h b/examples/smpi/NAS/EP-trace/randlc.h new file mode 100644 index 0000000000..aff84d341a --- /dev/null +++ b/examples/smpi/NAS/EP-trace/randlc.h @@ -0,0 +1,3 @@ + +double randlc( double *X, double *A ); + diff --git a/examples/smpi/NAS/IS-trace/Makefile b/examples/smpi/NAS/IS-trace/Makefile new file mode 100644 index 0000000000..4911d0ddd5 --- /dev/null +++ b/examples/smpi/NAS/IS-trace/Makefile @@ -0,0 +1,22 @@ +SHELL=/bin/sh +BENCHMARK=is +BENCHMARKU=IS + +include ../config/make.def + +include ../sys/make.common + +OBJS = is-trace.o ${COMMON}/c_print_results.o + + +${PROGRAM}: config ${OBJS} + ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${CMPI_LIB} + +.c.o: + ${CCOMPILE} $< + +is-trace.o: is-trace.c npbparams.h + +clean: + - rm -f *.o *~ mputil* + - rm -f is-trace npbparams.h core diff --git a/examples/smpi/NAS/IS-trace/is-trace.c b/examples/smpi/NAS/IS-trace/is-trace.c new file mode 100644 index 0000000000..f85c7c15a0 --- /dev/null +++ b/examples/smpi/NAS/IS-trace/is-trace.c @@ -0,0 +1,1158 @@ +/************************************************************************* + * * + * N A S P A R A L L E L B E N C H M A R K S 3.3 * + * * + * I S * + * * + ************************************************************************* + * * + * This benchmark is part of the NAS Parallel Benchmark 3.3 suite. * + * It is described in NAS Technical Report 95-020. * + * * + * Permission to use, copy, distribute and modify this software * + * for any purpose with or without fee is hereby granted. We * + * request, however, that all derived work reference the NAS * + * Parallel Benchmarks 3.3. This software is provided "as is" * + * without express or implied warranty. 
* + * * + * Information on NPB 3.3, including the technical report, the * + * original specifications, source code, results and information * + * on how to submit new results, is available at: * + * * + * http://www.nas.nasa.gov/Software/NPB * + * * + * Send comments or suggestions to npb@nas.nasa.gov * + * Send bug reports to npb-bugs@nas.nasa.gov * + * * + * NAS Parallel Benchmarks Group * + * NASA Ames Research Center * + * Mail Stop: T27A-1 * + * Moffett Field, CA 94035-1000 * + * * + * E-mail: npb@nas.nasa.gov * + * Fax: (650) 604-3957 * + * * + ************************************************************************* + * * + * Author: M. Yarrow * + * H. Jin * + * * + *************************************************************************/ + +#include "mpi.h" +#include "npbparams.h" +#include +#include + +#include "instr/instr.h" //TRACE_ + +/******************/ +/* default values */ +/******************/ +#ifndef CLASS +#define CLASS 'S' +#define NUM_PROCS 1 +#endif +#define MIN_PROCS 1 + + +/*************/ +/* CLASS S */ +/*************/ +#if CLASS == 'S' +#define TOTAL_KEYS_LOG_2 16 +#define MAX_KEY_LOG_2 11 +#define NUM_BUCKETS_LOG_2 9 +#endif + + +/*************/ +/* CLASS W */ +/*************/ +#if CLASS == 'W' +#define TOTAL_KEYS_LOG_2 20 +#define MAX_KEY_LOG_2 16 +#define NUM_BUCKETS_LOG_2 10 +#endif + +/*************/ +/* CLASS A */ +/*************/ +#if CLASS == 'A' +#define TOTAL_KEYS_LOG_2 23 +#define MAX_KEY_LOG_2 19 +#define NUM_BUCKETS_LOG_2 10 +#endif + + +/*************/ +/* CLASS B */ +/*************/ +#if CLASS == 'B' +#define TOTAL_KEYS_LOG_2 25 +#define MAX_KEY_LOG_2 21 +#define NUM_BUCKETS_LOG_2 10 +#endif + + +/*************/ +/* CLASS C */ +/*************/ +#if CLASS == 'C' +#define TOTAL_KEYS_LOG_2 27 +#define MAX_KEY_LOG_2 23 +#define NUM_BUCKETS_LOG_2 10 +#endif + + +/*************/ +/* CLASS D */ +/*************/ +#if CLASS == 'D' +#define TOTAL_KEYS_LOG_2 29 +#define MAX_KEY_LOG_2 27 +#define NUM_BUCKETS_LOG_2 10 +#undef MIN_PROCS +#define MIN_PROCS 4 +#endif + + +#define TOTAL_KEYS (1 << TOTAL_KEYS_LOG_2) +#define MAX_KEY (1 << MAX_KEY_LOG_2) +#define NUM_BUCKETS (1 << NUM_BUCKETS_LOG_2) +#define NUM_KEYS (TOTAL_KEYS/NUM_PROCS*MIN_PROCS) + +/*****************************************************************/ +/* On larger number of processors, since the keys are (roughly) */ +/* gaussian distributed, the first and last processor sort keys */ +/* in a large interval, requiring array sizes to be larger. Note */ +/* that for large NUM_PROCS, NUM_KEYS is, however, a small number*/ +/* The required array size also depends on the bucket size used. */ +/* The following values are validated for the 1024-bucket setup. */ +/*****************************************************************/ +#if NUM_PROCS < 256 +#define SIZE_OF_BUFFERS 3*NUM_KEYS/2 +#elif NUM_PROCS < 512 +#define SIZE_OF_BUFFERS 5*NUM_KEYS/2 +#elif NUM_PROCS < 1024 +#define SIZE_OF_BUFFERS 4*NUM_KEYS +#else +#define SIZE_OF_BUFFERS 13*NUM_KEYS/2 +#endif + +/*****************************************************************/ +/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */ +/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. 
INCREASE */ +/* MAX_PROCS AT YOUR PERIL */ +/*****************************************************************/ +#if CLASS == 'S' +#define MAX_PROCS 128 +#else +#define MAX_PROCS 1024 +#endif + +#define MAX_ITERATIONS 10 +#define TEST_ARRAY_SIZE 5 + + +/***********************************/ +/* Enable separate communication, */ +/* computation timing and printout */ +/***********************************/ +/* #define TIMING_ENABLED */ + + +/*************************************/ +/* Typedef: if necessary, change the */ +/* size of int here by changing the */ +/* int type to, say, long */ +/*************************************/ +typedef int INT_TYPE; +typedef long INT_TYPE2; +#define MP_KEY_TYPE MPI_INT + + +typedef struct { + +/********************/ +/* MPI properties: */ +/********************/ +int my_rank, + comm_size; + + +/********************/ +/* Some global info */ +/********************/ +INT_TYPE *key_buff_ptr_global, /* used by full_verify to get */ + total_local_keys, /* copies of rank info */ + total_lesser_keys; + + +int passed_verification; + + + +/************************************/ +/* These are the three main arrays. */ +/* See SIZE_OF_BUFFERS def above */ +/************************************/ +INT_TYPE key_array[SIZE_OF_BUFFERS], + key_buff1[SIZE_OF_BUFFERS], + key_buff2[SIZE_OF_BUFFERS], + bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE], /* Top 5 elements for */ + bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals */ + bucket_ptrs[NUM_BUCKETS], + process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE], + process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE]; +int send_count[MAX_PROCS], recv_count[MAX_PROCS], + send_displ[MAX_PROCS], recv_displ[MAX_PROCS]; + + +/**********************/ +/* Partial verif info */ +/**********************/ +INT_TYPE2 test_index_array[TEST_ARRAY_SIZE], + test_rank_array[TEST_ARRAY_SIZE]; + +/**********/ +/* Timers */ +/**********/ +double start[64], elapsed[64]; + +} global_data; + + +const INT_TYPE2 + S_test_index_array[TEST_ARRAY_SIZE] = + {48427,17148,23627,62548,4431}, + S_test_rank_array[TEST_ARRAY_SIZE] = + {0,18,346,64917,65463}, + + W_test_index_array[TEST_ARRAY_SIZE] = + {357773,934767,875723,898999,404505}, + W_test_rank_array[TEST_ARRAY_SIZE] = + {1249,11698,1039987,1043896,1048018}, + + A_test_index_array[TEST_ARRAY_SIZE] = + {2112377,662041,5336171,3642833,4250760}, + A_test_rank_array[TEST_ARRAY_SIZE] = + {104,17523,123928,8288932,8388264}, + + B_test_index_array[TEST_ARRAY_SIZE] = + {41869,812306,5102857,18232239,26860214}, + B_test_rank_array[TEST_ARRAY_SIZE] = + {33422937,10244,59149,33135281,99}, + + C_test_index_array[TEST_ARRAY_SIZE] = + {44172927,72999161,74326391,129606274,21736814}, + C_test_rank_array[TEST_ARRAY_SIZE] = + {61147,882988,266290,133997595,133525895}, + + D_test_index_array[TEST_ARRAY_SIZE] = + {1317351170,995930646,1157283250,1503301535,1453734525}, + D_test_rank_array[TEST_ARRAY_SIZE] = + {1,36538729,1978098519,2145192618,2147425337}; + + + +/***********************/ +/* function prototypes */ +/***********************/ +double randlc( double *X, double *A ); + +void full_verify( global_data* gd ); + +void c_print_results( char *name, + char class, + int n1, + int n2, + int n3, + int niter, + int nprocs_compiled, + int nprocs_total, + double t, + double mops, + char *optype, + int passed_verification, + char *npbversion, + char *compiletime, + char *mpicc, + char *clink, + char *cmpi_lib, + char *cmpi_inc, + char *cflags, + char *clinkflags ); + +void timer_clear(global_data* gd, int n 
); +void timer_start(global_data* gd, int n ); +void timer_stop(global_data* gd, int n ); +double timer_read(global_data* gd, int n ); + +void timer_clear(global_data* gd, int n ) { + gd->elapsed[n] = 0.0; +} + +void timer_start(global_data* gd, int n ) { + gd->start[n] = MPI_Wtime(); +} + +void timer_stop(global_data* gd, int n ) { + gd->elapsed[n] += MPI_Wtime() - gd->start[n]; +} + +double timer_read(global_data* gd, int n ) { + return gd->elapsed[n]; +} + + +/* + * FUNCTION RANDLC (X, A) + * + * This routine returns a uniform pseudorandom double precision number in the + * range (0, 1) by using the linear congruential generator + * + * x_{k+1} = a x_k (mod 2^46) + * + * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers + * before repeating. The argument A is the same as 'a' in the above formula, + * and X is the same as x_0. A and X must be odd double precision integers + * in the range (1, 2^46). The returned value RANDLC is normalized to be + * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain + * the new seed x_1, so that subsequent calls to RANDLC using the same + * arguments will generate a continuous sequence. + * + * This routine should produce the same results on any computer with at least + * 48 mantissa bits in double precision floating point data. On Cray systems, + * double precision should be disabled. + * + * David H. Bailey October 26, 1990 + * + * IMPLICIT DOUBLE PRECISION (A-H, O-Z) + * SAVE KS, R23, R46, T23, T46 + * DATA KS/0/ + * + * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46, + * T23 = 2 ^ 23, and T46 = 2 ^ 46. These are computed in loops, rather than + * by merely using the ** operator, in order to insure that the results are + * exact on all systems. This code assumes that 0.5D0 is represented exactly. + */ + + +/*****************************************************************/ +/************* R A N D L C ************/ +/************* ************/ +/************* portable random number generator ************/ +/*****************************************************************/ + +double randlc( double *X, double *A ) +{ + static int KS=0; + static double R23, R46, T23, T46; + double T1, T2, T3, T4; + double A1; + double A2; + double X1; + double X2; + double Z; + int i, j; + + if (KS == 0) + { + R23 = 1.0; + R46 = 1.0; + T23 = 1.0; + T46 = 1.0; + + for (i=1; i<=23; i++) + { + R23 = 0.50 * R23; + T23 = 2.0 * T23; + } + for (i=1; i<=46; i++) + { + R46 = 0.50 * R46; + T46 = 2.0 * T46; + } + KS = 1; + } + +/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */ + + T1 = R23 * *A; + j = T1; + A1 = j; + A2 = *A - T23 * A1; + +/* Break X into two parts such that X = 2^23 * X1 + X2, compute + Z = A1 * X2 + A2 * X1 (mod 2^23), and then + X = 2^23 * Z + A2 * X2 (mod 2^46). */ + + T1 = R23 * *X; + j = T1; + X1 = j; + X2 = *X - T23 * X1; + T1 = A1 * X2 + A2 * X1; + + j = R23 * T1; + T2 = j; + Z = T1 - T23 * T2; + T3 = T23 * Z + A2 * X2; + j = R46 * T3; + T4 = j; + *X = T3 - T46 * T4; + return(R46 * *X); +} + + + +/*****************************************************************/ +/************ F I N D _ M Y _ S E E D ************/ +/************ ************/ +/************ returns parallel random number seq seed ************/ +/*****************************************************************/ + +/* + * Create a random number sequence of total length nn residing + * on np number of processors. Each processor will therefore have a + * subsequence of length nn/np. 
This routine returns that random + * number which is the first random number for the subsequence belonging + * to processor rank kn, and which is used as seed for proc kn ran # gen. + */ + +double find_my_seed( int kn, /* my processor rank, 0<=kn<=num procs */ + int np, /* np = num procs */ + long nn, /* total num of ran numbers, all procs */ + double s, /* Ran num seed, for ex.: 314159265.00 */ + double a ) /* Ran num gen mult, try 1220703125.00 */ +{ + + long i; + + double t1,t2,t3,an; + long mq,nq,kk,ik; + + + + nq = nn / np; + + for( mq=0; nq>1; mq++,nq/=2 ) + ; + + t1 = a; + + for( i=1; i<=mq; i++ ) + t2 = randlc( &t1, &t1 ); + + an = t1; + + kk = kn; + t1 = s; + t2 = an; + + for( i=1; i<=100; i++ ) + { + ik = kk / 2; + if( 2 * ik != kk ) + t3 = randlc( &t1, &t2 ); + if( ik == 0 ) + break; + t3 = randlc( &t2, &t2 ); + kk = ik; + } + + return( t1 ); + +} + + + + +/*****************************************************************/ +/************* C R E A T E _ S E Q ************/ +/*****************************************************************/ + +void create_seq( global_data* gd, double seed, double a ) +{ + double x; + int i, k; + + k = MAX_KEY/4; + + for (i=0; ikey_array[i] = k*x; + } +} + + + + +/*****************************************************************/ +/************* F U L L _ V E R I F Y ************/ +/*****************************************************************/ + + +void full_verify( global_data* gd ) +{ + MPI_Status status; + MPI_Request request; + + INT_TYPE i, j; + INT_TYPE k, last_local_key; + + +/* Now, finally, sort the keys: */ + for( i=0; itotal_local_keys; i++ ) + gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]- + gd->total_lesser_keys] = gd->key_buff2[i]; + last_local_key = (gd->total_local_keys<1)? 
0 : (gd->total_local_keys-1); + +/* Send largest key value to next processor */ + if( gd->my_rank > 0 ) + MPI_Irecv( &k, + 1, + MP_KEY_TYPE, + gd->my_rank-1, + 1000, + MPI_COMM_WORLD, + &request ); + if( gd->my_rank < gd->comm_size-1 ) + MPI_Send( &gd->key_array[last_local_key], + 1, + MP_KEY_TYPE, + gd->my_rank+1, + 1000, + MPI_COMM_WORLD ); + if( gd->my_rank > 0 ) + MPI_Wait( &request, &status ); + +/* Confirm that neighbor's greatest key value + is not greater than my least key value */ + j = 0; + if( gd->my_rank > 0 && gd->total_local_keys > 0 ) + if( k > gd->key_array[0] ) + j++; + + +/* Confirm keys correctly sorted: count incorrectly sorted keys, if any */ + for( i=1; itotal_local_keys; i++ ) + if( gd->key_array[i-1] > gd->key_array[i] ) + j++; + + + if( j != 0 ) + { + printf( "Processor %d: Full_verify: number of keys out of sort: %d\n", + gd->my_rank, j ); + } + else + gd->passed_verification++; + + +} + + + + +/*****************************************************************/ +/************* R A N K ****************/ +/*****************************************************************/ + + +void rank( global_data* gd, int iteration ) +{ + + INT_TYPE i, k; + + INT_TYPE shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2; + INT_TYPE key; + INT_TYPE2 bucket_sum_accumulator, j, m; + INT_TYPE local_bucket_sum_accumulator; + INT_TYPE min_key_val, max_key_val; + INT_TYPE *key_buff_ptr; + + + + +/* Iteration alteration of keys */ + if(gd->my_rank == 0 ) + { + gd->key_array[iteration] = iteration; + gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration; + } + + +/* Initialize */ + for( i=0; ibucket_size[i] = 0; + gd->bucket_size_totals[i] = 0; + gd->process_bucket_distrib_ptr1[i] = 0; + gd->process_bucket_distrib_ptr2[i] = 0; + } + + +/* Determine where the partial verify test keys are, load into */ +/* top of array bucket_size */ + for( i=0; itest_index_array[i]/NUM_KEYS) == gd->my_rank ) + gd->bucket_size[NUM_BUCKETS+i] = + gd->key_array[gd->test_index_array[i] % NUM_KEYS]; + + +/* Determine the number of keys in each bucket */ + for( i=0; ibucket_size[gd->key_array[i] >> shift]++; + + +/* Accumulative bucket sizes are the bucket pointers */ + gd->bucket_ptrs[0] = 0; + for( i=1; i< NUM_BUCKETS; i++ ) + gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1]; + + +/* Sort into appropriate bucket */ + for( i=0; ikey_array[i]; + gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key; + } + +#ifdef TIMING_ENABLED + timer_stop(gd, 2 ); + timer_start(gd, 3 ); +#endif + +/* Get the bucket size totals for the entire problem. These + will be used to determine the redistribution of keys */ + MPI_Allreduce( gd->bucket_size, + gd->bucket_size_totals, + NUM_BUCKETS+TEST_ARRAY_SIZE, + MP_KEY_TYPE, + MPI_SUM, + MPI_COMM_WORLD ); + +#ifdef TIMING_ENABLED + timer_stop(gd, 3 ); + timer_start(gd, 2 ); +#endif + +/* Determine Redistibution of keys: accumulate the bucket size totals + till this number surpasses NUM_KEYS (which the average number of keys + per processor). Then all keys in these buckets go to processor 0. + Continue accumulating again until supassing 2*NUM_KEYS. All keys + in these buckets go to processor 1, etc. This algorithm guarantees + that all processors have work ranking; no processors are left idle. 
+ The optimum number of buckets, however, does not result in as high + a degree of load balancing (as even a distribution of keys as is + possible) as is obtained from increasing the number of buckets, but + more buckets results in more computation per processor so that the + optimum number of buckets turns out to be 1024 for machines tested. + Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket + number of first and last bucket which each processor will have after + the redistribution is done. */ + + bucket_sum_accumulator = 0; + local_bucket_sum_accumulator = 0; + gd->send_displ[0] = 0; + gd->process_bucket_distrib_ptr1[0] = 0; + for( i=0, j=0; ibucket_size_totals[i]; + local_bucket_sum_accumulator += gd->bucket_size[i]; + if( bucket_sum_accumulator >= (j+1)*NUM_KEYS ) + { + gd->send_count[j] = local_bucket_sum_accumulator; + if( j != 0 ) + { + gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1]; + gd->process_bucket_distrib_ptr1[j] = + gd->process_bucket_distrib_ptr2[j-1]+1; + } + gd->process_bucket_distrib_ptr2[j++] = i; + local_bucket_sum_accumulator = 0; + } + } + +/* When NUM_PROCS approaching NUM_BUCKETS, it is highly possible + that the last few processors don't get any buckets. So, we + need to set counts properly in this case to avoid any fallouts. */ + while( j < gd->comm_size ) + { + gd->send_count[j] = 0; + gd->process_bucket_distrib_ptr1[j] = 1; + j++; + } + +#ifdef TIMING_ENABLED + timer_stop(gd, 2 ); + timer_start(gd, 3 ); +#endif + +/* This is the redistribution section: first find out how many keys + each processor will send to every other processor: */ + MPI_Alltoall( gd->send_count, + 1, + MPI_INT, + gd->recv_count, + 1, + MPI_INT, + MPI_COMM_WORLD ); + +/* Determine the receive array displacements for the buckets */ + gd->recv_displ[0] = 0; + for( i=1; icomm_size; i++ ) + gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1]; + + +/* Now send the keys to respective processors */ + MPI_Alltoallv( gd->key_buff1, + gd->send_count, + gd->send_displ, + MP_KEY_TYPE, + gd->key_buff2, + gd->recv_count, + gd->recv_displ, + MP_KEY_TYPE, + MPI_COMM_WORLD ); + +#ifdef TIMING_ENABLED + timer_stop(gd, 3 ); + timer_start(gd, 2 ); +#endif + +/* The starting and ending bucket numbers on each processor are + multiplied by the interval size of the buckets to obtain the + smallest possible min and greatest possible max value of any + key on each processor */ + min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift; + max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1; + +/* Clear the work array */ + for( i=0; ikey_buff1[i] = 0; + +/* Determine the total number of keys on all other + processors holding keys of lesser value */ + m = 0; + for( k=0; kmy_rank; k++ ) + for( i= gd->process_bucket_distrib_ptr1[k]; + i<=gd->process_bucket_distrib_ptr2[k]; + i++ ) + m += gd->bucket_size_totals[i]; /* m has total # of lesser keys */ + +/* Determine total number of keys on this processor */ + j = 0; + for( i= gd->process_bucket_distrib_ptr1[gd->my_rank]; + i<=gd->process_bucket_distrib_ptr2[gd->my_rank]; + i++ ) + j += gd->bucket_size_totals[i]; /* j has total # of local keys */ + + +/* Ranking of all keys occurs in this section: */ +/* shift it backwards so no subtractions are necessary in loop */ + key_buff_ptr = gd->key_buff1 - min_key_val; + +/* In this section, the keys themselves are used as their + own indexes to determine how many of each there are: their + individual population */ + for( i=0; ikey_buff2[i]]++; /* Now 
they have individual key */ + /* population */ + +/* To obtain ranks of each key, successively add the individual key + population, not forgetting the total of lesser keys, m. + NOTE: Since the total of lesser keys would be subtracted later + in verification, it is no longer added to the first key population + here, but still needed during the partial verify test. This is to + ensure that 32-bit key_buff can still be used for class D. */ +/* key_buff_ptr[min_key_val] += m; */ + for( i=min_key_val; ibucket_size_totals[i+NUM_BUCKETS]; /* Keys were hidden here */ + if( min_key_val <= k && k <= max_key_val ) + { + /* Add the total of lesser keys, m, here */ + INT_TYPE2 key_rank = key_buff_ptr[k-1] + m; + int failed = 0; + + switch( CLASS ) + { + case 'S': + if( i <= 2 ) + { + if( key_rank != gd->test_rank_array[i]+iteration ) + failed = 1; + else + gd->passed_verification++; + } + else + { + if( key_rank != gd->test_rank_array[i]-iteration ) + failed = 1; + else + gd->passed_verification++; + } + break; + case 'W': + if( i < 2 ) + { + if( key_rank != gd->test_rank_array[i]+(iteration-2) ) + failed = 1; + else + gd->passed_verification++; + } + else + { + if( key_rank != gd->test_rank_array[i]-iteration ) + failed = 1; + else + gd->passed_verification++; + } + break; + case 'A': + if( i <= 2 ) + { + if( key_rank != gd->test_rank_array[i]+(iteration-1) ) + failed = 1; + else + gd->passed_verification++; + } + else + { + if( key_rank != gd->test_rank_array[i]-(iteration-1) ) + failed = 1; + else + gd->passed_verification++; + } + break; + case 'B': + if( i == 1 || i == 2 || i == 4 ) + { + if( key_rank != gd->test_rank_array[i]+iteration ) + failed = 1; + else + gd->passed_verification++; + } + else + { + if( key_rank != gd->test_rank_array[i]-iteration ) + failed = 1; + else + gd->passed_verification++; + } + break; + case 'C': + if( i <= 2 ) + { + if( key_rank != gd->test_rank_array[i]+iteration ) + failed = 1; + else + gd->passed_verification++; + } + else + { + if( key_rank != gd->test_rank_array[i]-iteration ) + failed = 1; + else + gd->passed_verification++; + } + break; + case 'D': + if( i < 2 ) + { + if( key_rank != gd->test_rank_array[i]+iteration ) + failed = 1; + else + gd->passed_verification++; + } + else + { + if( key_rank != gd->test_rank_array[i]-iteration ) + failed = 1; + else + gd->passed_verification++; + } + break; + } + if( failed == 1 ) + printf( "Failed partial verification: " + "iteration %d, processor %d, test key %d\n", + iteration, gd->my_rank, (int)i ); + } + } + + + + +/* Make copies of rank info for use by full_verify: these variables + in rank are local; making them global slows down the code, probably + since they cannot be made register by compiler */ + + if( iteration == MAX_ITERATIONS ) + { + gd->key_buff_ptr_global = key_buff_ptr; + gd->total_local_keys = j; + gd->total_lesser_keys = 0; /* no longer set to 'm', see note above */ + } + +} + + +/*****************************************************************/ +/************* M A I N ****************/ +/*****************************************************************/ + +int main( int argc, char **argv ) +{ + + int i, iteration, itemp; + + double timecounter, maxtime; + + TRACE_smpi_set_category ("start"); + + global_data* gd = malloc(sizeof(global_data)); +/* Initialize MPI */ + MPI_Init( &argc, &argv ); + MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank ); + MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size ); + +/* Initialize the verification arrays if a valid class */ + for( i=0; itest_index_array[i] = 
S_test_index_array[i]; + gd->test_rank_array[i] = S_test_rank_array[i]; + break; + case 'A': + gd->test_index_array[i] = A_test_index_array[i]; + gd->test_rank_array[i] = A_test_rank_array[i]; + break; + case 'W': + gd->test_index_array[i] = W_test_index_array[i]; + gd->test_rank_array[i] = W_test_rank_array[i]; + break; + case 'B': + gd->test_index_array[i] = B_test_index_array[i]; + gd->test_rank_array[i] = B_test_rank_array[i]; + break; + case 'C': + gd->test_index_array[i] = C_test_index_array[i]; + gd->test_rank_array[i] = C_test_rank_array[i]; + break; + case 'D': + gd->test_index_array[i] = D_test_index_array[i]; + gd->test_rank_array[i] = D_test_rank_array[i]; + break; + }; + + + +/* Printout initial NPB info */ + if( gd->my_rank == 0 ) + { + printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" ); + printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS ); + printf( " Iterations: %d\n", MAX_ITERATIONS ); + printf( " Number of processes: %d\n",gd->comm_size ); + } + +/* Check that actual and compiled number of processors agree */ + if( gd->comm_size != NUM_PROCS ) + { + if( gd->my_rank == 0 ) + printf( "\n ERROR: compiled for %d processes\n" + " Number of active processes: %d\n" + " Exiting program!\n\n", NUM_PROCS, gd->comm_size ); + MPI_Finalize(); + exit( 1 ); + } + +/* Check to see whether total number of processes is within bounds. + This could in principle be checked in setparams.c, but it is more + convenient to do it here */ + if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS) + { + if( gd->my_rank == 0 ) + printf( "\n ERROR: number of processes %d not within range %d-%d" + "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS); + MPI_Finalize(); + exit( 1 ); + } + + +/* Generate random number sequence and subsequent keys on all procs */ + create_seq(gd, find_my_seed( gd->my_rank, + gd->comm_size, + 4*(long)TOTAL_KEYS*MIN_PROCS, + 314159265.00, /* Random number gen seed */ + 1220703125.00 ), /* Random number gen mult */ + 1220703125.00 ); /* Random number gen mult */ + +/* Do one interation for free (i.e., untimed) to guarantee initialization of + all data and code pages and respective tables */ + rank(gd, 1 ); + +/* Start verification counter */ + gd->passed_verification = 0; + + if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n iteration\n" ); + +/* Initialize timer */ + timer_clear(gd, 0 ); + +/* Initialize separate communication, computation timing */ +#ifdef TIMING_ENABLED + for( i=1; i<=3; i++ ) timer_clear(gd, i ); +#endif + +/* Start timer */ + timer_start(gd, 0 ); + +#ifdef TIMING_ENABLED + timer_start(gd, 1 ); + timer_start(gd, 2 ); +#endif + +/* This is the main iteration */ + for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) + { + char it_str[100]; + snprintf (it_str, 100, "i%d", iteration); + TRACE_smpi_set_category (it_str); + + if( gd->my_rank == 0 && CLASS != 'S' ) printf( " %d\n", iteration ); + rank(gd, iteration ); + } + + TRACE_smpi_set_category ("finalize"); + + +#ifdef TIMING_ENABLED + timer_stop(gd, 2 ); + timer_stop(gd, 1 ); +#endif + +/* Stop timer, obtain time for processors */ + timer_stop(gd, 0 ); + + timecounter = timer_read(gd, 0 ); + +/* End of timing, obtain maximum time of all processors */ + MPI_Reduce( &timecounter, + &maxtime, + 1, + MPI_DOUBLE, + MPI_MAX, + 0, + MPI_COMM_WORLD ); + +#ifdef TIMING_ENABLED + { + double tmin, tsum, tmax; + + if( my_rank == 0 ) + { + printf( "\ntimer 1/2/3 = total/computation/communication time\n"); + printf( " min avg max\n" ); + } + for( i=1; i<=3; i++ ) + 
{ + timecounter = timer_read(gd, i ); + MPI_Reduce( &timecounter, + &tmin, + 1, + MPI_DOUBLE, + MPI_MIN, + 0, + MPI_COMM_WORLD ); + MPI_Reduce( &timecounter, + &tsum, + 1, + MPI_DOUBLE, + MPI_SUM, + 0, + MPI_COMM_WORLD ); + MPI_Reduce( &timecounter, + &tmax, + 1, + MPI_DOUBLE, + MPI_MAX, + 0, + MPI_COMM_WORLD ); + if( my_rank == 0 ) + printf( "timer %d: %f %f %f\n", + i, tmin, tsum/((double) comm_size), tmax ); + } + if( my_rank == 0 ) + printf( "\n" ); + } +#endif + +/* This tests that keys are in sequence: sorting of last ranked key seq + occurs here, but is an untimed operation */ + full_verify(gd); + + +/* Obtain verification counter sum */ + itemp =gd->passed_verification; + MPI_Reduce( &itemp, + &gd->passed_verification, + 1, + MPI_INT, + MPI_SUM, + 0, + MPI_COMM_WORLD ); + + + +/* The final printout */ + if( gd->my_rank == 0 ) + { + if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size ) + gd->passed_verification = 0; + c_print_results( "IS", + CLASS, + (int)(TOTAL_KEYS), + MIN_PROCS, + 0, + MAX_ITERATIONS, + NUM_PROCS, + gd->comm_size, + maxtime, + ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS) + /maxtime/1000000., + "keys ranked", + gd->passed_verification, + NPBVERSION, + COMPILETIME, + MPICC, + CLINK, + CMPI_LIB, + CMPI_INC, + CFLAGS, + CLINKFLAGS ); + } + + MPI_Finalize(); + free(gd); + + return 0; + /**************************/ +} /* E N D P R O G R A M */ + /**************************/ diff --git a/examples/smpi/NAS/Makefile b/examples/smpi/NAS/Makefile index 8f356aa71d..0386e2d556 100644 --- a/examples/smpi/NAS/Makefile +++ b/examples/smpi/NAS/Makefile @@ -32,6 +32,10 @@ IS: is is: header cd IS; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) +IS-trace: is-trace +is-trace: header + cd IS-trace; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) + CG: cg cg: header cd CG; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) @@ -40,10 +44,18 @@ EP: ep ep: header cd EP; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) +EP-trace: ep-trace +ep-trace: header + cd EP-trace; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) + DT: dt dt: header cd DT; $(MAKE) CLASS=$(CLASS) +DT-trace: dt-trace +dt-trace: header + cd DT-trace; $(MAKE) CLASS=$(CLASS) + # Awk script courtesy cmg@cray.com, modified by Haoqiang Jin suite: @ awk -f sys/suite.awk SMAKE=$(MAKE) $(SFILE) | $(SHELL) -- 2.20.1