--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "DGraph.h"
+
+/* Allocate a new arc from tail node tl to head node hd.
+ * All scalar fields are zero-initialized.  BUG FIX: the old code left
+ * them uninitialized, and AttachArc's parallel-arc check compares
+ * probe->length == nar->length, which was an uninitialized read. */
+DGArc *newArc(DGNode *tl,DGNode *hd){
+ DGArc *ar=(DGArc *)malloc(sizeof(DGArc));
+ ar->id=0;
+ ar->tail=tl;
+ ar->head=hd;
+ ar->length=0;
+ ar->width=0;
+ ar->attribute=0;
+ ar->maxWidth=0;
+ return ar;
+}
+/* Print a one-line description of arc ar to stderr:
+ * "<id>. |<tail name> -><head name>". */
+void arcShow(DGArc *ar){
+ DGNode *src=(DGNode *)ar->tail;
+ DGNode *dst=(DGNode *)ar->head;
+ fprintf(stderr,"%d. |%s ->%s\n",ar->id,src->name,dst->name);
+}
+
+/* Allocate and initialize a graph node named nm.  The name is copied
+ * with strdup; the in/out arc tables start with capacity
+ * SMALL_BLOCK_SIZE and are grown later by AttachArc/AttachNode. */
+DGNode *newNode(char *nm){
+ DGNode *nd=(DGNode *)malloc(sizeof(DGNode));
+ nd->attribute=0;
+ nd->color=0;
+ nd->inDegree=0;
+ nd->outDegree=0;
+ nd->maxInDegree=SMALL_BLOCK_SIZE;
+ nd->maxOutDegree=SMALL_BLOCK_SIZE;
+ nd->inArc=(DGArc **)malloc(nd->maxInDegree*sizeof(DGArc*));
+ nd->outArc=(DGArc **)malloc(nd->maxOutDegree*sizeof(DGArc*));
+ nd->name=strdup(nm);
+ nd->feat=NULL;
+ /* NOTE(review): id, depth, height, width, address and verified are
+  * left uninitialized here; id/address are assigned by AttachNode and
+  * the driver -- confirm the remaining fields are unused before read. */
+ return nd;
+}
+/* Print one summary line for node nd to stderr:
+ * "<id>.<name>: (<inDegree>,<outDegree>)". */
+void nodeShow(DGNode* nd){
+ fprintf( stderr,"%3d.%s: (%d,%d)\n",
+ nd->id,nd->name,nd->inDegree,nd->outDegree);
+/*
+ if(nd->verified==1) fprintf(stderr,"%ld.%s\t: usable.",nd->id,nd->name);
+ else if(nd->verified==0) fprintf(stderr,"%ld.%s\t: unusable.",nd->id,nd->name);
+ else fprintf(stderr,"%ld.%s\t: notverified.",nd->id,nd->name);
+*/
+}
+
+/* Allocate an empty graph named nm with initial capacity BLOCK_SIZE
+ * for both the node and the arc tables (grown on demand by
+ * AttachNode/AttachArc).  The name is copied with strdup. */
+DGraph* newDGraph(char* nm){
+ DGraph *dg=(DGraph *)malloc(sizeof(DGraph));
+ dg->numNodes=0;
+ dg->numArcs=0;
+ dg->maxNodes=BLOCK_SIZE;
+ dg->maxArcs=BLOCK_SIZE;
+ dg->node=(DGNode **)malloc(dg->maxNodes*sizeof(DGNode*));
+ dg->arc=(DGArc **)malloc(dg->maxArcs*sizeof(DGArc*));
+ dg->name=strdup(nm);
+ /* NOTE(review): dg->id is never initialized here but is printed by
+  * graphShow -- confirm callers do not rely on its value. */
+ return dg;
+}
+/* Add node nd to graph dg.  If a node with the same name already
+ * exists, nd's in/out arcs are merged into the existing node (arc
+ * endpoints re-targeted), nd is released, and the existing node's
+ * index is returned.  Otherwise nd is appended and its new id is
+ * returned.  The node table is grown by BLOCK_SIZE when full.
+ * Ownership: on the merge path this function takes ownership of nd
+ * and frees it (assumes nd came from newNode, so name/inArc/outArc
+ * are heap allocations). */
+int AttachNode(DGraph* dg, DGNode* nd) {
+ int i=0,j,len=0;
+ DGNode **nds =NULL, *tmpnd=NULL;
+ DGArc **ar=NULL;
+
+ if (dg->numNodes == dg->maxNodes-1 ) {
+ /* Grow the graph-wide node table. */
+ dg->maxNodes += BLOCK_SIZE;
+ nds =(DGNode **) calloc(dg->maxNodes,sizeof(DGNode*));
+ memcpy(nds,dg->node,(dg->maxNodes-BLOCK_SIZE)*sizeof(DGNode*));
+ free(dg->node);
+ dg->node=nds;
+ }
+
+ len = strlen( nd->name);
+ for (i = 0; i < dg->numNodes; i++) {
+ tmpnd =dg->node[ i];
+ ar=NULL;
+ if ( strlen( tmpnd->name) != len ) continue;
+ if ( strncmp( nd->name, tmpnd->name, len) ) continue;
+ /* Same name: merge nd's arcs into tmpnd. */
+ if ( nd->inDegree > 0 ) {
+ tmpnd->maxInDegree += nd->maxInDegree;
+ ar =(DGArc **) calloc(tmpnd->maxInDegree,sizeof(DGArc*));
+ memcpy(ar,tmpnd->inArc,(tmpnd->inDegree)*sizeof(DGArc*));
+ free(tmpnd->inArc);
+ tmpnd->inArc=ar;
+ for (j = 0; j < nd->inDegree; j++ ) {
+ nd->inArc[ j]->head = tmpnd;
+ }
+ memcpy( &(tmpnd->inArc[ tmpnd->inDegree]), nd->inArc, nd->inDegree*sizeof( DGArc *));
+ tmpnd->inDegree += nd->inDegree;
+ }
+ if ( nd->outDegree > 0 ) {
+ tmpnd->maxOutDegree += nd->maxOutDegree;
+ ar =(DGArc **) calloc(tmpnd->maxOutDegree,sizeof(DGArc*));
+ memcpy(ar,tmpnd->outArc,(tmpnd->outDegree)*sizeof(DGArc*));
+ free(tmpnd->outArc);
+ tmpnd->outArc=ar;
+ for (j = 0; j < nd->outDegree; j++ ) {
+ nd->outArc[ j]->tail = tmpnd;
+ }
+ memcpy( &(tmpnd->outArc[tmpnd->outDegree]),nd->outArc,nd->outDegree*sizeof( DGArc *));
+ tmpnd->outDegree += nd->outDegree;
+ }
+ /* BUG FIX: the old code freed only the node struct, leaking its
+  * strdup'ed name and its arc-pointer arrays (the arcs themselves
+  * now belong to tmpnd, so only the arrays are freed here). */
+ free(nd->name);
+ free(nd->inArc);
+ free(nd->outArc);
+ free(nd);
+ return i;
+ }
+ nd->id = dg->numNodes;
+ dg->node[dg->numNodes] = nd;
+ dg->numNodes++;
+return nd->id;
+}
+/* Register arc nar in graph dg and link it into the head node's in-arc
+ * table and the tail node's out-arc table (each grown by
+ * SMALL_BLOCK_SIZE when it fills up).  Returns the new arc id, the id
+ * of an existing parallel arc (same head and length; nar is then
+ * freed), or -1 when either endpoint is missing. */
+int AttachArc(DGraph *dg,DGArc* nar){
+int arcId = -1;
+int i=0,newNumber=0;
+DGNode *head = nar->head,
+ *tail = nar->tail;
+DGArc **ars=NULL,*probe=NULL;
+/*fprintf(stderr,"AttachArc %ld\n",dg->numArcs); */
+ if ( !tail || !head ) return arcId;
+ if ( dg->numArcs == dg->maxArcs-1 ) {
+ /* Grow the graph-wide arc table. */
+ dg->maxArcs += BLOCK_SIZE;
+ ars =(DGArc **) calloc(dg->maxArcs,sizeof(DGArc*));
+ memcpy(ars,dg->arc,(dg->maxArcs-BLOCK_SIZE)*sizeof(DGArc*));
+ free(dg->arc);
+ dg->arc=ars;
+ }
+ for(i = 0; i < tail->outDegree; i++ ) { /* parallel arc */
+ probe = tail->outArc[ i];
+ if(probe->head == head
+ &&
+ probe->length == nar->length
+ ){
+ /* Duplicate: discard nar and reuse the existing arc's id. */
+ free(nar);
+ return probe->id;
+ }
+ }
+
+ nar->id = dg->numArcs;
+ arcId=dg->numArcs;
+ dg->arc[dg->numArcs] = nar;
+ dg->numArcs++;
+
+ head->inArc[ head->inDegree] = nar;
+ head->inDegree++;
+ if ( head->inDegree >= head->maxInDegree ) {
+ /* Grow the head node's in-arc table. */
+ newNumber = head->maxInDegree + SMALL_BLOCK_SIZE;
+ ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
+ memcpy(ars,head->inArc,(head->inDegree)*sizeof(DGArc*));
+ free(head->inArc);
+ head->inArc=ars;
+ head->maxInDegree = newNumber;
+ }
+ tail->outArc[ tail->outDegree] = nar;
+ tail->outDegree++;
+ if(tail->outDegree >= tail->maxOutDegree ) {
+ /* Grow the tail node's out-arc table. */
+ newNumber = tail->maxOutDegree + SMALL_BLOCK_SIZE;
+ ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
+ memcpy(ars,tail->outArc,(tail->outDegree)*sizeof(DGArc*));
+ free(tail->outArc);
+ tail->outArc=ars;
+ tail->maxOutDegree = newNumber;
+ }
+/*fprintf(stderr,"AttachArc: head->in=%d tail->out=%ld\n",head->inDegree,tail->outDegree);*/
+return arcId;
+}
+/* Dump graph dg to stderr.  DetailsLevel 0 prints only the graph
+ * header line; level 1 adds one line per node; level >= 2 also lists
+ * each node's in- and out-neighbors. */
+void graphShow(DGraph *dg,int DetailsLevel){
+ int n,k;
+ fprintf(stderr,"%d.%s: (%d,%d)\n",dg->id,dg->name,dg->numNodes,dg->numArcs);
+ if(DetailsLevel<1) return;
+ for(n=0;n<dg->numNodes;n++){
+  DGNode *cur=dg->node[n];
+  if(DetailsLevel>=2){
+   for(k=0;k<cur->inDegree;k++){
+    fprintf(stderr,"\t ");
+    nodeShow(cur->inArc[k]->tail);
+   }
+  }
+  nodeShow(cur);
+  if(DetailsLevel<2) continue;
+  for(k=0;k<cur->outDegree;k++){
+   fprintf(stderr,"\t ");
+   nodeShow(cur->outArc[k]->head);
+  }
+  fprintf(stderr,"---\n");
+ }
+ fprintf(stderr,"----------------------------------------\n");
+ if(DetailsLevel<3) return;
+}
+
+
+
--- /dev/null
+#ifndef _DGRAPH
+#define _DGRAPH
+
+#define BLOCK_SIZE 128
+#define SMALL_BLOCK_SIZE 32
+
+/* Directed arc (edge).  tail and head are stored as void* and cast
+ * back to DGNode* by users; length participates in AttachArc's
+ * duplicate-arc check. */
+typedef struct{
+ int id;
+ void *tail,*head;
+ int length,width,attribute,maxWidth;
+}DGArc;
+
+/* Graph node.  inArc/outArc are growable tables of arc pointers with
+ * capacities maxInDegree/maxOutDegree; feat is an opaque per-node
+ * payload (the DT benchmark stores an Arr* of features there). */
+typedef struct{
+ int maxInDegree,maxOutDegree;
+ int inDegree,outDegree;
+ int id;
+ char *name;
+ DGArc **inArc,**outArc;
+ int depth,height,width;
+ int color,attribute,address,verified;
+ void *feat;
+}DGNode;
+
+/* Directed graph: growable tables of node and arc pointers, with
+ * current counts (numNodes/numArcs) and capacities (maxNodes/maxArcs). */
+typedef struct{
+ int maxNodes,maxArcs;
+ int id;
+ char *name;
+ int numNodes,numArcs;
+ DGNode **node;
+ DGArc **arc;
+} DGraph;
+
+DGArc *newArc(DGNode *tl,DGNode *hd);
+void arcShow(DGArc *ar);
+DGNode *newNode(char *nm);
+void nodeShow(DGNode* nd);
+
+DGraph* newDGraph(char *nm);
+int AttachNode(DGraph *dg,DGNode *nd);
+int AttachArc(DGraph *dg,DGArc* nar);
+void graphShow(DGraph *dg,int DetailsLevel);
+
+#endif
--- /dev/null
+SHELL=/bin/sh
+BENCHMARK=dt
+BENCHMARKU=DT
+
+include ../config/make.def
+
+include ../sys/make.common
+#Override PROGRAM
+DTPROGRAM = $(BINDIR)/$(BENCHMARK).$(CLASS)
+
+OBJS = dt-trace.o DGraph.o \
+ ${COMMON}/c_print_results.o ${COMMON}/c_timers.o ${COMMON}/c_randdp.o
+
+
+${PROGRAM}: config ${OBJS}
+ ${CLINK} ${CLINKFLAGS} -o ${DTPROGRAM} ${OBJS} ${CMPI_LIB}
+
+.c.o:
+ ${CCOMPILE} $<
+
+dt-trace.o: dt-trace.c npbparams.h
+DGraph.o: DGraph.c DGraph.h
+
+clean:
+ - rm -f *.o *~ mputil*
+ - rm -f dt npbparams.h core
--- /dev/null
+Data Traffic benchmark DT is new in the NPB suite
+(released as part of NPB3.x-MPI package).
+----------------------------------------------------
+
+DT is written in C and the same executable can run on any number of processors,
+provided this number is not less than the number of nodes in the communication
+graph. DT benchmark takes one argument: BH, WH, or SH. This argument
+specifies the communication graph Black Hole, White Hole, or SHuffle
+respectively. The current release contains verification numbers for
+CLASSES S, W, A, and B only. Classes C and D are defined, but verification
+numbers are not provided in this release.
+
+The following table summarizes the number of nodes in the communication
+graph based on CLASS and graph TYPE.
+
+CLASS N_Source N_Nodes(BH,WH) N_Nodes(SH)
+ S 4 5 12
+ W 8 11 32
+ A 16 21 80
+ B 32 43 192
+ C 64 85 448
+ D 128 171 1024
--- /dev/null
+/*************************************************************************
+ * *
+ * N A S P A R A L L E L B E N C H M A R K S 3.3 *
+ * *
+ * D T *
+ * *
+ *************************************************************************
+ * *
+ * This benchmark is part of the NAS Parallel Benchmark 3.3 suite. *
+ * *
+ * Permission to use, copy, distribute and modify this software *
+ * for any purpose with or without fee is hereby granted. We *
+ * request, however, that all derived work reference the NAS *
+ * Parallel Benchmarks 3.3. This software is provided "as is" *
+ * without express or implied warranty. *
+ * *
+ * Information on NPB 3.3, including the technical report, the *
+ * original specifications, source code, results and information *
+ * on how to submit new results, is available at: *
+ * *
+ * http://www.nas.nasa.gov/Software/NPB *
+ * *
+ * Send comments or suggestions to npb@nas.nasa.gov *
+ * Send bug reports to npb-bugs@nas.nasa.gov *
+ * *
+ * NAS Parallel Benchmarks Group *
+ * NASA Ames Research Center *
+ * Mail Stop: T27A-1 *
+ * Moffett Field, CA 94035-1000 *
+ * *
+ * E-mail: npb@nas.nasa.gov *
+ * Fax: (650) 604-3957 *
+ * *
+ *************************************************************************
+ * *
+ * Author: M. Frumkin * *
+ * *
+ *************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "mpi.h"
+#include "npbparams.h"
+
+#include "instr/instr.h" //TRACE_
+
+#ifndef CLASS
+#define CLASS 'S'
+#define NUM_PROCS 1
+#endif
+
+//int passed_verification;
+extern double randlc( double *X, double *A );
+extern
+void c_print_results( char *name,
+ char class,
+ int n1,
+ int n2,
+ int n3,
+ int niter,
+ int nprocs_compiled,
+ int nprocs_total,
+ double t,
+ double mops,
+ char *optype,
+ int passed_verification,
+ char *npbversion,
+ char *compiletime,
+ char *mpicc,
+ char *clink,
+ char *cmpi_lib,
+ char *cmpi_inc,
+ char *cflags,
+ char *clinkflags );
+
+void timer_clear( int n );
+void timer_start( int n );
+void timer_stop( int n );
+double timer_read( int n );
+int timer_on=0,timers_tot=64;
+
+/* Compare the computed checksum rnm2 against the reference value for
+ * the graph named in bmname (contains "BH", "WH", or "SH") and the
+ * compile-time CLASS.  Returns 1 = verified, 0 = verification failed,
+ * -1 = no verification performed (unknown class, or classes C/D whose
+ * reference values are not provided in this release). */
+int verify(char *bmname,double rnm2){
+ double verify_value=0.0;
+ double epsilon=1.0E-8;
+ char cls=CLASS;
+ int verified=-1;
+ if (cls != 'U') {
+ if(cls=='S') {
+ if(strstr(bmname,"BH")){
+ verify_value=30892725.0;
+ }else if(strstr(bmname,"WH")){
+ verify_value=67349758.0;
+ }else if(strstr(bmname,"SH")){
+ verify_value=58875767.0;
+ }else{
+ fprintf(stderr,"No such benchmark as %s.\n",bmname);
+ }
+ verified = 0;
+ }else if(cls=='W') {
+ if(strstr(bmname,"BH")){
+ verify_value = 4102461.0;
+ }else if(strstr(bmname,"WH")){
+ verify_value = 204280762.0;
+ }else if(strstr(bmname,"SH")){
+ verify_value = 186944764.0;
+ }else{
+ fprintf(stderr,"No such benchmark as %s.\n",bmname);
+ }
+ verified = 0;
+ }else if(cls=='A') {
+ if(strstr(bmname,"BH")){
+ verify_value = 17809491.0;
+ }else if(strstr(bmname,"WH")){
+ verify_value = 1289925229.0;
+ }else if(strstr(bmname,"SH")){
+ verify_value = 610856482.0;
+ }else{
+ fprintf(stderr,"No such benchmark as %s.\n",bmname);
+ }
+ verified = 0;
+ }else if(cls=='B') {
+ if(strstr(bmname,"BH")){
+ verify_value = 4317114.0;
+ }else if(strstr(bmname,"WH")){
+ verify_value = 7877279917.0;
+ }else if(strstr(bmname,"SH")){
+ verify_value = 1836863082.0;
+ }else{
+ fprintf(stderr,"No such benchmark as %s.\n",bmname);
+ }
+ /* BUG FIX: "verified = 0" was previously inside the else-branch
+  * above, so a valid class-B run never compared against its
+  * reference value and always reported "No verification was
+  * performed".  Moved out to match the S/W/A cases. */
+ verified = 0;
+ }else if(cls=='C') {
+ if(strstr(bmname,"BH")){
+ verify_value = 0.0;
+ }else if(strstr(bmname,"WH")){
+ verify_value = 0.0;
+ }else if(strstr(bmname,"SH")){
+ verify_value = 0.0;
+ }else{
+ fprintf(stderr,"No such benchmark as %s.\n",bmname);
+ }
+ verified = -1; /* no reference values for class C in this release */
+ }else if(cls=='D') {
+ if(strstr(bmname,"BH")){
+ verify_value = 0.0;
+ }else if(strstr(bmname,"WH")){
+ verify_value = 0.0;
+ }else if(strstr(bmname,"SH")){
+ verify_value = 0.0;
+ }else{
+ fprintf(stderr,"No such benchmark as %s.\n",bmname);
+ }
+ verified = -1; /* no reference values for class D in this release */
+ }else{
+ fprintf(stderr,"No such class as %c.\n",cls);
+ }
+ fprintf(stderr," %s L2 Norm = %f\n",bmname,rnm2);
+ if(verified==-1){
+ fprintf(stderr," No verification was performed.\n");
+ }else if( rnm2 - verify_value < epsilon &&
+ rnm2 - verify_value > -epsilon) { /* abs here does not work on ALTIX */
+ verified = 1;
+ fprintf(stderr," Deviation = %f\n",(rnm2 - verify_value));
+ }else{
+ verified = 0;
+ fprintf(stderr," The correct verification value = %f\n",verify_value);
+ fprintf(stderr," Got value = %f\n",rnm2);
+ }
+ }else{
+ verified = -1;
+ }
+ return verified;
+ }
+
+/* Modular exponentiation: returns a^n mod md using square-and-multiply.
+ * A negative exponent is rejected with a warning and its absolute
+ * value is used instead ("temp fix" from the reference code); a
+ * non-positive modulus returns 1.
+ * NOTE(review): intermediate products q*q and r*q are computed in int
+ * and can overflow for large md -- kept as in the reference
+ * implementation, since verification values depend on it. */
+int ipowMod(int a,long long int n,int md){
+ int seed=1,q=a,r=1;
+ if(n<0){
+ fprintf(stderr,"ipowMod: exponent must be nonnegative exp=%lld\n",n);
+ n=-n; /* temp fix */
+/* return 1; */
+ }
+ if(md<=0){
+ fprintf(stderr,"ipowMod: module must be positive mod=%d",md);
+ return 1;
+ }
+ if(n==0) return 1;
+ while(n>1){
+ int n2 = n/2;
+ if (n2*2==n){
+ seed = (q*q)%md;
+ q=seed;
+ n = n2;
+ }else{
+ seed = (r*q)%md;
+ r=seed;
+ n = n-1;
+ }
+ }
+ seed = (r*q)%md;
+ return seed;
+}
+
+#include "DGraph.h"
+/* Build the SHuffle communication graph for class cls: a layer of
+ * NUM_SOURCES source nodes, log2(NUM_SOURCES)-1 layers of comparators
+ * wired in a butterfly/shuffle pattern (each comparator reads from the
+ * pair of previous-layer nodes whose indices differ in bit j), and a
+ * final layer of sinks.  Nodes are attached in topological order. */
+DGraph *buildSH(char cls){
+/*
+ Nodes of the graph must be topologically sorted
+ to avoid MPI deadlock.
+*/
+ DGraph *dg;
+ int numSources=NUM_SOURCES; /* must be power of 2 */
+ int numOfLayers=0,tmpS=numSources>>1;
+ int firstLayerNode=0;
+ DGArc *ar=NULL;
+ DGNode *nd=NULL;
+ int mask=0x0,ndid=0,ndoff=0;
+ int i=0,j=0;
+ char nm[BLOCK_SIZE];
+
+ sprintf(nm,"DT_SH.%c",cls);
+ dg=newDGraph(nm);
+
+ /* numOfLayers = log2(numSources) - 1 */
+ while(tmpS>1){
+ numOfLayers++;
+ tmpS>>=1;
+ }
+ for(i=0;i<numSources;i++){
+ sprintf(nm,"Source.%d",i);
+ nd=newNode(nm);
+ AttachNode(dg,nd);
+ }
+ for(j=0;j<numOfLayers;j++){
+ mask=0x00000001<<j;
+ for(i=0;i<numSources;i++){
+ sprintf(nm,"Comparator.%d",(i+j*firstLayerNode));
+ nd=newNode(nm);
+ AttachNode(dg,nd);
+ /* Two in-arcs: the previous-layer pair differing in bit j. */
+ ndoff=i&(~mask);
+ ndid=firstLayerNode+ndoff;
+ ar=newArc(dg->node[ndid],nd);
+ AttachArc(dg,ar);
+ ndoff+=mask;
+ ndid=firstLayerNode+ndoff;
+ ar=newArc(dg->node[ndid],nd);
+ AttachArc(dg,ar);
+ }
+ firstLayerNode+=numSources;
+ }
+ mask=0x00000001<<numOfLayers;
+ for(i=0;i<numSources;i++){
+ sprintf(nm,"Sink.%d",i);
+ nd=newNode(nm);
+ AttachNode(dg,nd);
+ ndoff=i&(~mask);
+ ndid=firstLayerNode+ndoff;
+ ar=newArc(dg->node[ndid],nd);
+ AttachArc(dg,ar);
+ ndoff+=mask;
+ ndid=firstLayerNode+ndoff;
+ ar=newArc(dg->node[ndid],nd);
+ AttachArc(dg,ar);
+ }
+return dg;
+}
+/* Build the White Hole communication graph for class cls: one source
+ * fans out through shrinking layers of comparators (each with up to
+ * maxInDeg=4 children) to NUM_SOURCES sinks.  The graph is built from
+ * the sinks backwards and then reversed in place so the node array is
+ * topologically sorted (source first). */
+DGraph *buildWH(char cls){
+/*
+ Nodes of the graph must be topologically sorted
+ to avoid MPI deadlock.
+*/
+ int i=0,j=0;
+ int numSources=NUM_SOURCES,maxInDeg=4;
+ int numLayerNodes=numSources,firstLayerNode=0;
+ int totComparators=0;
+ int numPrevLayerNodes=numLayerNodes;
+ int id=0,sid=0;
+ DGraph *dg;
+ DGNode *nd=NULL,*source=NULL,*tmp=NULL,*snd=NULL;
+ DGArc *ar=NULL;
+ char nm[BLOCK_SIZE];
+
+ sprintf(nm,"DT_WH.%c",cls);
+ dg=newDGraph(nm);
+
+ for(i=0;i<numSources;i++){
+ sprintf(nm,"Sink.%d",i);
+ nd=newNode(nm);
+ AttachNode(dg,nd);
+ }
+ totComparators=0;
+ numPrevLayerNodes=numLayerNodes;
+ while(numLayerNodes>maxInDeg){
+ /* Next layer is ~1/maxInDeg the size of the previous one. */
+ numLayerNodes=numLayerNodes/maxInDeg;
+ if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
+ for(i=0;i<numLayerNodes;i++){
+ sprintf(nm,"Comparator.%d",totComparators);
+ totComparators++;
+ nd=newNode(nm);
+ id=AttachNode(dg,nd);
+ for(j=0;j<maxInDeg;j++){
+ sid=i*maxInDeg+j;
+ if(sid>=numPrevLayerNodes) break;
+ snd=dg->node[firstLayerNode+sid];
+ ar=newArc(dg->node[id],snd);
+ AttachArc(dg,ar);
+ }
+ }
+ firstLayerNode+=numPrevLayerNodes;
+ numPrevLayerNodes=numLayerNodes;
+ }
+ source=newNode("Source");
+ AttachNode(dg,source);
+ for(i=0;i<numPrevLayerNodes;i++){
+ nd=dg->node[firstLayerNode+i];
+ ar=newArc(source,nd);
+ AttachArc(dg,ar);
+ }
+
+ for(i=0;i<dg->numNodes/2;i++){ /* Topological sorting */
+ tmp=dg->node[i];
+ dg->node[i]=dg->node[dg->numNodes-1-i];
+ dg->node[i]->id=i;
+ dg->node[dg->numNodes-1-i]=tmp;
+ dg->node[dg->numNodes-1-i]->id=dg->numNodes-1-i;
+ }
+return dg;
+}
+/* Build the Black Hole communication graph for class cls: NUM_SOURCES
+ * source nodes funnel through shrinking layers of comparators (fan-in
+ * up to maxInDeg=4) into a single sink.  Nodes are attached in
+ * topological order (sources first, sink last). */
+DGraph *buildBH(char cls){
+/*
+ Nodes of the graph must be topologically sorted
+ to avoid MPI deadlock.
+*/
+ int i=0,j=0;
+ int numSources=NUM_SOURCES,maxInDeg=4;
+ int numLayerNodes=numSources,firstLayerNode=0;
+ DGraph *dg;
+ DGNode *nd=NULL, *snd=NULL, *sink=NULL;
+ DGArc *ar=NULL;
+ int totComparators=0;
+ int numPrevLayerNodes=numLayerNodes;
+ int id=0, sid=0;
+ char nm[BLOCK_SIZE];
+
+ sprintf(nm,"DT_BH.%c",cls);
+ dg=newDGraph(nm);
+
+ for(i=0;i<numSources;i++){
+ sprintf(nm,"Source.%d",i);
+ nd=newNode(nm);
+ AttachNode(dg,nd);
+ }
+ while(numLayerNodes>maxInDeg){
+ /* Next layer is ~1/maxInDeg the size of the previous one. */
+ numLayerNodes=numLayerNodes/maxInDeg;
+ if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
+ for(i=0;i<numLayerNodes;i++){
+ sprintf(nm,"Comparator.%d",totComparators);
+ totComparators++;
+ nd=newNode(nm);
+ id=AttachNode(dg,nd);
+ for(j=0;j<maxInDeg;j++){
+ sid=i*maxInDeg+j;
+ if(sid>=numPrevLayerNodes) break;
+ snd=dg->node[firstLayerNode+sid];
+ ar=newArc(snd,dg->node[id]);
+ AttachArc(dg,ar);
+ }
+ }
+ firstLayerNode+=numPrevLayerNodes;
+ numPrevLayerNodes=numLayerNodes;
+ }
+ sink=newNode("Sink");
+ AttachNode(dg,sink);
+ for(i=0;i<numPrevLayerNodes;i++){
+ nd=dg->node[firstLayerNode+i];
+ ar=newArc(nd,sink);
+ AttachArc(dg,ar);
+ }
+return dg;
+}
+
+/* Growable feature array: len doubles in val (owned by the Arr). */
+typedef struct{
+ int len;
+ double* val;
+} Arr;
+/* Allocate an Arr of len uninitialized doubles.  Caller owns both the
+ * struct and its val buffer. */
+Arr *newArr(int len){
+ Arr *a=(Arr *)malloc(sizeof(Arr));
+ a->len=len;
+ a->val=(double *)malloc(len*sizeof(double));
+ return a;
+}
+/* Print the length of array a to stderr, or a NULL marker. */
+void arrShow(Arr* a){
+ if(a==NULL){
+  fprintf(stderr,"-- NULL array\n");
+ }else{
+  fprintf(stderr,"-- length=%d\n",a->len);
+ }
+}
+/* Return the mean of squared feature values: sum(val[k]^2 / len).
+ * The division stays inside the loop, as in the reference code
+ * ("The truncation does not work since result will be 0 for large
+ * len"). */
+double CheckVal(Arr *feat){
+ double acc=0.0;
+ int k;
+ for(k=0;k<feat->len;k++){
+  acc+=feat->val[k]*feat->val[k]/feat->len;
+ }
+ return acc;
+}
+/* Report the feature-count distribution parameters: mean and standard
+ * deviation, taken from npbparams.h.  Always returns 0. */
+int GetFNumDPar(int* mean, int* stdev){
+ *mean=NUM_SAMPLES;
+ *stdev=STD_DEVIATION;
+ return 0;
+}
+/* Pseudo-randomly choose the number of features for node id:
+ * mean - stdev + |ipowMod(...)|, where the modular power is seeded
+ * from the NPB randlc generator.  Deterministic for a given id. */
+int GetFeatureNum(char *mbname,int id){
+ double tran=314159265.0;
+ double A=2*id+1;
+ double denom=randlc(&tran,&A);
+ char cval='S';
+ int mean=NUM_SAMPLES,stdev=128;
+ int rtfs=0,len=0;
+ GetFNumDPar(&mean,&stdev);
+ rtfs=ipowMod((int)(1/denom)*(int)cval,(long long int) (2*id+1),2*stdev);
+ if(rtfs<0) rtfs=-rtfs;
+ len=mean-stdev+rtfs;
+ return len;
+}
+/* Generate the deterministic pseudo-random feature stream for source
+ * node id: len = GetFeatureNum(...)*fdim doubles produced by four
+ * small multiplicative congruential generators (x,y,z,f channels).
+ * NOTE(review): the loop body stores val[i]..val[i+3], so it assumes
+ * fdim == 4 (the only value used in this file, see fielddim). */
+Arr* RandomFeatures(char *bmname,int fdim,int id){
+ int len=GetFeatureNum(bmname,id)*fdim;
+ Arr* feat=newArr(len);
+ int nxg=2,nyg=2,nzg=2,nfg=5;
+ int nx=421,ny=419,nz=1427,nf=3527;
+ long long int expon=(len*(id+1))%3141592;
+ int seedx=ipowMod(nxg,expon,nx),
+ seedy=ipowMod(nyg,expon,ny),
+ seedz=ipowMod(nzg,expon,nz),
+ seedf=ipowMod(nfg,expon,nf);
+ int i=0;
+ if(timer_on){
+ timer_clear(id+1);
+ timer_start(id+1);
+ }
+ for(i=0;i<len;i+=fdim){
+ seedx=(seedx*nxg)%nx;
+ seedy=(seedy*nyg)%ny;
+ seedz=(seedz*nzg)%nz;
+ seedf=(seedf*nfg)%nf;
+ feat->val[i]=seedx;
+ feat->val[i+1]=seedy;
+ feat->val[i+2]=seedz;
+ feat->val[i+3]=seedf;
+ }
+ if(timer_on){
+ timer_stop(id+1);
+ fprintf(stderr,"** RandomFeatures time in node %d = %f\n",id,timer_read(id+1));
+ }
+ return feat;
+}
+/* Stretch array a to length blen by spreading each interior sample
+ * over a proportional window of the new buffer; the first and last
+ * samples are copied directly.  a->val is replaced in place.
+ * NOTE(review): blen/a->len is integer division, as in the reference
+ * implementation -- kept as-is since verification values depend on it.
+ * BUG FIX: removed an unused "Arr *tmp=newArr(10)" allocation that was
+ * leaked on every call. */
+void Resample(Arr *a,int blen){
+ long long int i=0,j=0,jlo=0,jhi=0;
+ double avval=0.0;
+ double *nval=(double *)malloc(blen*sizeof(double));
+ for(i=0;i<blen;i++) nval[i]=0.0;
+ for(i=1;i<a->len-1;i++){
+ jlo=(int)(0.5*(2*i-1)*(blen/a->len));
+ jhi=(int)(0.5*(2*i+1)*(blen/a->len));
+
+ avval=a->val[i]/(jhi-jlo+1);
+ for(j=jlo;j<=jhi;j++){
+ nval[j]+=avval;
+ }
+ }
+ nval[0]=a->val[0];
+ nval[blen-1]=a->val[a->len-1];
+ free(a->val);
+ a->val=nval;
+ a->len=blen;
+}
+/* Number of feature channels per sample (x,y,z,f). */
+#define fielddim 4
+/* Filter stream b into accumulator a, one fielddim-sized sample at a
+ * time: for each interior sample, pick the aligned, next, or previous
+ * sample of b (whichever has the smallest squared distance to a) and
+ * blend it into a with weight (w+1)/(w+2).  Both arrays are first
+ * resampled to the same length.  Returns a (modified in place). */
+Arr* WindowFilter(Arr *a, Arr* b,int w){
+ int i=0,j=0,k=0;
+ double rms0=0.0,rms1=0.0,rmsm1=0.0;
+ double weight=((double) (w+1))/(w+2);
+
+ w+=1;
+ if(timer_on){
+ timer_clear(w);
+ timer_start(w);
+ }
+ if(a->len<b->len) Resample(a,b->len);
+ if(a->len>b->len) Resample(b,a->len);
+ for(i=fielddim;i<a->len-fielddim;i+=fielddim){
+ /* Squared distance to the aligned sample of b... */
+ rms0=(a->val[i]-b->val[i])*(a->val[i]-b->val[i])
+ +(a->val[i+1]-b->val[i+1])*(a->val[i+1]-b->val[i+1])
+ +(a->val[i+2]-b->val[i+2])*(a->val[i+2]-b->val[i+2])
+ +(a->val[i+3]-b->val[i+3])*(a->val[i+3]-b->val[i+3]);
+ /* ...to the next sample... */
+ j=i+fielddim;
+ rms1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
+ +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
+ +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
+ +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
+ /* ...and to the previous sample. */
+ j=i-fielddim;
+ rmsm1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
+ +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
+ +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
+ +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
+ k=0;
+ if(rms1<rms0){
+ k=1;
+ rms0=rms1;
+ }
+ if(rmsm1<rms0) k=-1;
+ if(k==0){
+ j=i+fielddim;
+ a->val[i]=weight*b->val[i];
+ a->val[i+1]=weight*b->val[i+1];
+ a->val[i+2]=weight*b->val[i+2];
+ a->val[i+3]=weight*b->val[i+3];
+ }else if(k==1){
+ j=i+fielddim;
+ a->val[i]=weight*b->val[j];
+ a->val[i+1]=weight*b->val[j+1];
+ a->val[i+2]=weight*b->val[j+2];
+ a->val[i+3]=weight*b->val[j+3];
+ }else { /*if(k==-1)*/
+ j=i-fielddim;
+ a->val[i]=weight*b->val[j];
+ a->val[i+1]=weight*b->val[j+1];
+ a->val[i+2]=weight*b->val[j+2];
+ a->val[i+3]=weight*b->val[j+3];
+ }
+ }
+ if(timer_on){
+ timer_stop(w);
+ fprintf(stderr,"** WindowFilter time in node %d = %f\n",(w-1),timer_read(w));
+ }
+ return a;
+}
+
+/* Send the feature array feat along every out-arc of nd whose head
+ * lives on a different MPI rank (length first, then the data, both
+ * tagged with the arc id).  Same-rank consumers read nd->feat
+ * directly, so nothing is sent to them.  Returns 0 if feat is NULL,
+ * 1 otherwise. */
+int SendResults(DGraph *dg,DGNode *nd,Arr *feat){
+ int i=0,tag=0;
+ DGArc *ar=NULL;
+ DGNode *head=NULL;
+ if(!feat) return 0;
+ for(i=0;i<nd->outDegree;i++){
+ ar=nd->outArc[i];
+ if(ar->tail!=nd) continue;
+ head=ar->head;
+ tag=ar->id;
+ if(head->address!=nd->address){
+ MPI_Send(&feat->len,1,MPI_INT,head->address,tag,MPI_COMM_WORLD);
+ MPI_Send(feat->val,feat->len,MPI_DOUBLE,head->address,tag,MPI_COMM_WORLD);
+ }
+ }
+ return 1;
+}
+/* Gather one feature stream per in-arc of nd (received via MPI from
+ * remote ranks, copied from the tail node's feat for local ones),
+ * fold each into an accumulator with WindowFilter, average by the
+ * in-degree, and store/return the result as nd->feat.  Returns NULL
+ * for source nodes (inDegree == 0). */
+Arr* CombineStreams(DGraph *dg,DGNode *nd){
+ Arr *resfeat=newArr(NUM_SAMPLES*fielddim);
+ int i=0,len=0,tag=0;
+ DGArc *ar=NULL;
+ DGNode *tail=NULL;
+ MPI_Status status;
+ Arr *feat=NULL,*featp=NULL;
+
+ if(nd->inDegree==0) return NULL;
+ for(i=0;i<nd->inDegree;i++){
+ ar=nd->inArc[i];
+ if(ar->head!=nd) continue;
+ tail=ar->tail;
+ if(tail->address!=nd->address){
+ len=0;
+ tag=ar->id;
+ MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
+ feat=newArr(len);
+ MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
+ resfeat=WindowFilter(resfeat,feat,nd->id);
+ /* BUG FIX: free the value buffer too; the old code leaked it. */
+ free(feat->val);
+ free(feat);
+ }else{
+ featp=(Arr *)tail->feat;
+ feat=newArr(featp->len);
+ memcpy(feat->val,featp->val,featp->len*sizeof(double));
+ resfeat=WindowFilter(resfeat,feat,nd->id);
+ /* BUG FIX: free the value buffer too; the old code leaked it. */
+ free(feat->val);
+ free(feat);
+ }
+ }
+ for(i=0;i<resfeat->len;i++) resfeat->val[i]=((int)resfeat->val[i])/nd->inDegree;
+ nd->feat=resfeat;
+ return nd->feat;
+}
+/* Reduce array a to a single value: w * CheckVal(a), truncated to int
+ * (the cast makes the result node- and array-dependent for
+ * verification).  w doubles as the per-node timer index. */
+double Reduce(Arr *a,int w){
+ double retv=0.0;
+ if(timer_on){
+ timer_clear(w);
+ timer_start(w);
+ }
+ retv=(int)(w*CheckVal(a));/* The casting needed for node
+ and array dependent verifcation */
+ if(timer_on){
+ timer_stop(w);
+ fprintf(stderr,"** Reduce time in node %d = %f\n",(w-1),timer_read(w));
+ }
+ return retv;
+}
+
+/* Reduce every incoming stream of sink node nd (received via MPI from
+ * remote ranks, or read directly from the local tail's feat) through
+ * Reduce(), average the sum over the in-degree with 64-bit integer
+ * truncation, and scale by (nd->id+1). */
+double ReduceStreams(DGraph *dg,DGNode *nd){
+ double csum=0.0;
+ int i=0,len=0,tag=0;
+ DGArc *ar=NULL;
+ DGNode *tail=NULL;
+ Arr *feat=NULL;
+ double retv=0.0;
+
+ for(i=0;i<nd->inDegree;i++){
+ ar=nd->inArc[i];
+ if(ar->head!=nd) continue;
+ tail=ar->tail;
+ if(tail->address!=nd->address){
+ MPI_Status status;
+ len=0;
+ tag=ar->id;
+ MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
+ feat=newArr(len);
+ MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
+ csum+=Reduce(feat,(nd->id+1));
+ /* BUG FIX: free the value buffer too; the old code leaked it. */
+ free(feat->val);
+ free(feat);
+ }else{
+ csum+=Reduce(tail->feat,(nd->id+1));
+ }
+ }
+ if(nd->inDegree>0)csum=(((long long int)csum)/nd->inDegree);
+ retv=(nd->id+1)*csum;
+ return retv;
+}
+
+/* Run the work assigned to MPI rank me: for each graph node placed on
+ * this rank, generate (Source), forward (Comparator), or reduce (Sink)
+ * the feature stream.  Sinks send their checksum to rank 0, which
+ * collects all sink checksums and verifies the total.  Returns the
+ * verification result on rank 0, 0 elsewhere. */
+int ProcessNodes(DGraph *dg,int me){
+ double chksum=0.0;
+ Arr *feat=NULL;
+ int i=0,verified=0,tag;
+ DGNode *nd=NULL;
+ double rchksum=0.0;
+ MPI_Status status;
+
+ for(i=0;i<dg->numNodes;i++){
+ nd=dg->node[i];
+ if(nd->address!=me) continue;
+ if(strstr(nd->name,"Source")){
+ nd->feat=RandomFeatures(dg->name,fielddim,nd->id);
+ SendResults(dg,nd,nd->feat);
+ }else if(strstr(nd->name,"Sink")){
+ chksum=ReduceStreams(dg,nd);
+ tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */
+ MPI_Send(&chksum,1,MPI_DOUBLE,0,tag,MPI_COMM_WORLD);
+ }else{
+ feat=CombineStreams(dg,nd);
+ SendResults(dg,nd,feat);
+ }
+ }
+ if(me==0){ /* Report node */
+ rchksum=0.0;
+ chksum=0.0;
+ for(i=0;i<dg->numNodes;i++){
+ nd=dg->node[i];
+ if(!strstr(nd->name,"Sink")) continue;
+ tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */
+ MPI_Recv(&rchksum,1,MPI_DOUBLE,nd->address,tag,MPI_COMM_WORLD,&status);
+ chksum+=rchksum;
+ }
+ verified=verify(dg->name,chksum);
+ }
+return verified;
+}
+
+/* DT benchmark driver: parse the graph-type argument (BH/WH/SH), build
+ * the communication graph, assign node i to MPI rank i, run
+ * ProcessNodes, and have rank 0 time the run and print the results.
+ * BUG FIX: corrected two typos in user-facing messages ("be be less",
+ * "timeing"); no other behavior changed. */
+int main(int argc,char **argv ){
+ int my_rank,comm_size;
+ int i;
+ DGraph *dg=NULL;
+ int verified=0, featnum=0;
+ double bytes_sent=2.0,tot_time=0.0;
+
+
+ TRACE_smpi_set_category ("start");
+
+ MPI_Init( &argc, &argv );
+ MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
+ MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
+
+ if(argc!=2||
+ ( strncmp(argv[1],"BH",2)!=0
+ &&strncmp(argv[1],"WH",2)!=0
+ &&strncmp(argv[1],"SH",2)!=0
+ )
+ ){
+ if(my_rank==0){
+ fprintf(stderr,"** Usage: mpirun -np N ../bin/dt.S GraphName\n");
+ fprintf(stderr,"** Where \n - N is integer number of MPI processes\n");
+ fprintf(stderr," - S is the class S, W, or A \n");
+ fprintf(stderr," - GraphName is the communication graph name BH, WH, or SH.\n");
+ fprintf(stderr," - the number of MPI processes N should not be less than \n");
+ fprintf(stderr," the number of nodes in the graph\n");
+ }
+ MPI_Finalize();
+ exit(0);
+ }
+ if(strncmp(argv[1],"BH",2)==0){
+ dg=buildBH(CLASS);
+ }else if(strncmp(argv[1],"WH",2)==0){
+ dg=buildWH(CLASS);
+ }else if(strncmp(argv[1],"SH",2)==0){
+ dg=buildSH(CLASS);
+ }
+
+ if(timer_on&&dg->numNodes+1>timers_tot){
+ timer_on=0;
+ if(my_rank==0)
+ fprintf(stderr,"Not enough timers. Node timing is off. \n");
+ }
+ if(dg->numNodes>comm_size){
+ if(my_rank==0){
+ fprintf(stderr,"** The number of MPI processes should not be less than \n");
+ fprintf(stderr,"** the number of nodes in the graph\n");
+ fprintf(stderr,"** Number of MPI processes = %d\n",comm_size);
+ fprintf(stderr,"** Number nodes in the graph = %d\n",dg->numNodes);
+ }
+ MPI_Finalize();
+ exit(0);
+ }
+ /* Node i runs on MPI rank i. */
+ for(i=0;i<dg->numNodes;i++){
+ dg->node[i]->address=i;
+ }
+ if( my_rank == 0 ){
+ printf( "\n\n NAS Parallel Benchmarks 3.3 -- DT Benchmark\n\n" );
+ graphShow(dg,0);
+ timer_clear(0);
+ timer_start(0);
+ }
+
+ TRACE_smpi_set_category ("dt");
+ verified=ProcessNodes(dg,my_rank);
+ TRACE_smpi_set_category ("finalize");
+
+ featnum=NUM_SAMPLES*fielddim;
+ bytes_sent=featnum*dg->numArcs;
+ bytes_sent/=1048576;
+ if(my_rank==0){
+ timer_stop(0);
+ tot_time=timer_read(0);
+ c_print_results( dg->name,
+ CLASS,
+ featnum,
+ 0,
+ 0,
+ dg->numNodes,
+ 0,
+ comm_size,
+ tot_time,
+ bytes_sent/tot_time,
+ "bytes transmitted",
+ verified,
+ NPBVERSION,
+ COMPILETIME,
+ MPICC,
+ CLINK,
+ CMPI_LIB,
+ CMPI_INC,
+ CFLAGS,
+ CLINKFLAGS );
+ }
+ MPI_Finalize();
+ /* NOTE(review): returns 1 unconditionally, as in the reference
+  * implementation -- harnesses checking the exit status will see a
+  * nonzero code even on success; confirm before changing. */
+ return 1;
+}
--- /dev/null
+SHELL=/bin/sh
+BENCHMARK=ep
+BENCHMARKU=EP
+
+include ../config/make.def
+
+#OBJS = ep-trace.o ${COMMON}/print_results.o ${COMMON}/${RAND}.o ${COMMON}/timers.o
+OBJS = ep-trace.o randlc.o
+
+include ../sys/make.common
+
+${PROGRAM}: config ${OBJS}
+# ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
+ ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${CMPI_LIB}
+
+
+#ep-trace.o: ep-trace.f mpinpb.h npbparams.h
+# ${FCOMPILE} ep-trace.f
+
+ep-trace.o: ep-trace.c randlc.c mpinpb.h npbparams.h
+ ${CCOMPILE} ep-trace.c
+
+clean:
+ - rm -f *.o *~
+ - rm -f npbparams.h core
+
+
+
--- /dev/null
+This code implements the random-number generator described in the
+NAS Parallel Benchmark document RNR Technical Report RNR-94-007.
+The code is "embarrassingly" parallel in that no communication is
+required for the generation of the random numbers itself. There is
+no special requirement on the number of processors used for running
+the benchmark.
--- /dev/null
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#include "mpi.h"
+#include "npbparams.h"
+
+#include "instr/instr.h" //TRACE_
+
+#include "randlc.h"
+
+#ifndef CLASS
+#define CLASS 'S'
+#define NUM_PROCS 1
+#endif
+#define true 1
+#define false 0
+
+
+//---NOTE : all the timers function have been modified to
+// avoid global timers (privatize these).
+ // ----------------------- timers ---------------------
+ /* Reset a single caller-owned timer accumulator to zero
+  * (privatized replacement for the global-timer version). */
+ void timer_clear(double *onetimer) {
+ //elapsed[n] = 0.0;
+ *onetimer = 0.0;
+ }
+
+ /* Record the current wall-clock time (MPI_Wtime) in the given
+  * caller-owned start slot. */
+ void timer_start(double *onetimer) {
+ *onetimer = MPI_Wtime();
+ }
+
+ /* Accumulate into elapsed[n] the wall-clock time since start[n] was
+  * set.  Note the asymmetric interface: timer_start takes a pointer
+  * to one slot while timer_stop takes an index plus both arrays (see
+  * the privatization note above this timer section). */
+ void timer_stop(int n,double *elapsed,double *start) {
+ double t, now;
+
+ now = MPI_Wtime();
+ t = now - start[n];
+ elapsed[n] += t;
+ }
+
+ /* Return the accumulated time in elapsed[n]. */
+ double timer_read(int n, double *elapsed) { /* ok, useless, but jsut to keep function call */
+ return(elapsed[n]);
+ }
+ /********************************************************************
+ ***************** V R A N L C ******************
+ ***************** *****************/
+ /* Generate n pseudorandom numbers into y[] using the NPB linear
+  * congruential generator x_{k+1} = a * x_k mod 2^46, scaled into
+  * (0,1) by d2m46 = 2^-46.  The 46-bit modulus is applied with the
+  * 64-bit mask i246m1.  Returns the final seed as a double.
+  * NOTE(review): the seed update "x = (double)LLx" modifies only the
+  * local copy of x -- callers receive the new seed solely through the
+  * return value. */
+ double vranlc(int n, double x, double a, double *y)
+ {
+ int i;
+ long i246m1=0x00003FFFFFFFFFFF;
+ long LLx, Lx, La;
+ double d2m46;
+
+// This doesn't work, because the compiler does the calculation in 32
+// bits and overflows. No standard way (without f90 stuff) to specify
+// that the rhs should be done in 64 bit arithmetic.
+// parameter(i246m1=2**46-1)
+
+ d2m46=pow(0.5,46);
+
+// c Note that the v6 compiler on an R8000 does something stupid with
+// c the above. Using the following instead (or various other things)
+// c makes the calculation run almost 10 times as fast.
+//
+// c save d2m46
+// c data d2m46/0.0d0/
+// c if (d2m46 .eq. 0.0d0) then
+// c d2m46 = 0.5d0**46
+// c endif
+
+ Lx = (long)x;
+ La = (long)a;
+ //fprintf(stdout,("================== Vranlc ================");
+ //fprintf(stdout,("Before Loop: Lx = " + Lx + ", La = " + La);
+ LLx = Lx;
+ for (i=0; i< n; i++) {
+ Lx = Lx*La & i246m1 ;
+ LLx = Lx;
+ y[i] = d2m46 * (double)LLx;
+ /*
+ if(i == 0) {
+ fprintf(stdout,("After loop 0:");
+ fprintf(stdout,("Lx = " + Lx + ", La = " + La);
+ fprintf(stdout,("d2m46 = " + d2m46);
+ fprintf(stdout,("LLX(Lx) = " + LLX.doubleValue());
+ fprintf(stdout,("Y[0]" + y[0]);
+ }
+ */
+ }
+
+ x = (double)LLx;
+ /*
+ fprintf(stdout,("Change: Lx = " + Lx);
+ fprintf(stdout,("=============End Vranlc ================");
+ */
+ return x;
+ }
+
+
+
+//-------------- the core (unique function) -----------
+ void doTest(int argc, char **argv) {
+ double dum[3] = {1.,1.,1.};
+ double x1, x2, sx, sy, tm, an, tt, gc;
+ double Mops;
+ double epsilon=1.0E-8, a = 1220703125., s=271828183.;
+ double t1, t2, t3, t4;
+ double sx_verify_value, sy_verify_value, sx_err, sy_err;
+
+#include "npbparams.h"
+ int mk=16,
+ // --> set by make : in npbparams.h
+ //m=28, // for CLASS=A
+ //m=30, // for CLASS=B
+ //npm=2, // NPROCS
+ mm = m-mk,
+ nn = (int)(pow(2,mm)),
+ nk = (int)(pow(2,mk)),
+ nq=10,
+ np,
+ node,
+ no_nodes,
+ i,
+ ik,
+ kk,
+ l,
+ k, nit, no_large_nodes,
+ np_add, k_offset, j;
+ int me, nprocs, root=0, dp_type;
+ int verified,
+ timers_enabled=true;
+ char size[500]; // mind the size of the string to represent a big number
+
+ //Use in randlc..
+ int KS = 0;
+ double R23, R46, T23, T46;
+
+ double *qq = (double *) malloc (10000*sizeof(double));
+ double *start = (double *) malloc (64*sizeof(double));
+ double *elapsed = (double *) malloc (64*sizeof(double));
+
+ double *x = (double *) malloc (2*nk*sizeof(double));
+ double *q = (double *) malloc (nq*sizeof(double));
+
+ TRACE_smpi_set_category ("start");
+
+ MPI_Init( &argc, &argv );
+ MPI_Comm_size( MPI_COMM_WORLD, &no_nodes);
+ MPI_Comm_rank( MPI_COMM_WORLD, &node);
+
+#ifdef USE_MPE
+ MPE_Init_log();
+#endif
+ root = 0;
+ if (node == root ) {
+
+ /* Because the size of the problem is too large to store in a 32-bit
+ * integer for some classes, we put it into a string (for printing).
+ * Have to strip off the decimal point put in there by the floating
+ * point print statement (internal file)
+ */
+ fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark");
+ sprintf(size,"%d",pow(2,m+1));
+ //size = size.replace('.', ' ');
+ fprintf(stdout," Number of random numbers generated: %s\n",size);
+ fprintf(stdout," Number of active processes: %d\n",no_nodes);
+
+ }
+ verified = false;
+
+ /* c Compute the number of "batches" of random number pairs generated
+ c per processor. Adjust if the number of processors does not evenly
+ c divide the total number
+*/
+
+ np = nn / no_nodes;
+ no_large_nodes = nn % no_nodes;
+ if (node < no_large_nodes) np_add = 1;
+ else np_add = 0;
+ np = np + np_add;
+
+ if (np == 0) {
+ fprintf(stdout,"Too many nodes: %d %d",no_nodes,nn);
+ MPI_Abort(MPI_COMM_WORLD,1);
+ exit(0);
+ }
+
+/* c Call the random number generator functions and initialize
+ c the x-array to reduce the effects of paging on the timings.
+ c Also, call all mathematical functions that are used. Make
+ c sure these initializations cannot be eliminated as dead code.
+*/
+
+ //call vranlc(0, dum[1], dum[2], dum[3]);
+ // Array indexes start at 1 in Fortran, 0 in Java
+ vranlc(0, dum[0], dum[1], &(dum[2]));
+
+ dum[0] = randlc(&(dum[1]),&(dum[2]));
+ /////////////////////////////////
+ for (i=0;i<2*nk;i++) {
+ x[i] = -1e99;
+ }
+ Mops = log(sqrt(abs(1)));
+
+ /*
+ c---------------------------------------------------------------------
+ c Synchronize before placing time stamp
+ c---------------------------------------------------------------------
+ */
+ MPI_Barrier( MPI_COMM_WORLD );
+
+
+ TRACE_smpi_set_category ("ep");
+
+ timer_clear(&(elapsed[1]));
+ timer_clear(&(elapsed[2]));
+ timer_clear(&(elapsed[3]));
+ timer_start(&(start[1]));
+
+ t1 = a;
+ //fprintf(stdout,("(ep.f:160) t1 = " + t1);
+ t1 = vranlc(0, t1, a, x);
+ //fprintf(stdout,("(ep.f:161) t1 = " + t1);
+
+
+/* c Compute AN = A ^ (2 * NK) (mod 2^46). */
+
+ t1 = a;
+ //fprintf(stdout,("(ep.f:165) t1 = " + t1);
+ for (i=1; i <= mk+1; i++) {
+ t2 = randlc(&t1, &t1);
+ //fprintf(stdout,("(ep.f:168)[loop i=" + i +"] t1 = " + t1);
+ }
+ an = t1;
+ //fprintf(stdout,("(ep.f:172) s = " + s);
+ tt = s;
+ gc = 0.;
+ sx = 0.;
+ sy = 0.;
+ for (i=0; i < nq ; i++) {
+ q[i] = 0.;
+ }
+
+/*
+ Each instance of this loop may be performed independently. We compute
+ the k offsets separately to take into account the fact that some nodes
+ have more numbers to generate than others
+*/
+
+ if (np_add == 1)
+ k_offset = node * np -1;
+ else
+ k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1;
+
+ int stop = false;
+ for(k = 1; k <= np; k++) {
+ stop = false;
+ kk = k_offset + k ;
+ t1 = s;
+ //fprintf(stdout,("(ep.f:193) t1 = " + t1);
+ t2 = an;
+
+// Find starting seed t1 for this kk.
+
+ for (i=1;i<=100 && !stop;i++) {
+ ik = kk / 2;
+ //fprintf(stdout,("(ep.f:199) ik = " +ik+", kk = " + kk);
+ if (2 * ik != kk) {
+ t3 = randlc(&t1, &t2);
+ //fprintf(stdout,("(ep.f:200) t1= " +t1 );
+ }
+ if (ik==0)
+ stop = true;
+ else {
+ t3 = randlc(&t2, &t2);
+ kk = ik;
+ }
+ }
+// Compute uniform pseudorandom numbers.
+
+ //if (timers_enabled) timer_start(3);
+ timer_start(&(start[3]));
+ //call vranlc(2 * nk, t1, a, x) --> t1 and y are modified
+
+ //fprintf(stdout,">>>>>>>>>>>Before vranlc(l.210)<<<<<<<<<<<<<");
+ //fprintf(stdout,"2*nk = " + (2*nk));
+ //fprintf(stdout,"t1 = " + t1);
+ //fprintf(stdout,"a = " + a);
+ //fprintf(stdout,"x[0] = " + x[0]);
+ //fprintf(stdout,">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
+
+ t1 = vranlc(2 * nk, t1, a, x);
+
+ //fprintf(stdout,(">>>>>>>>>>>After Enter vranlc (l.210)<<<<<<");
+ //fprintf(stdout,("2*nk = " + (2*nk));
+ //fprintf(stdout,("t1 = " + t1);
+ //fprintf(stdout,("a = " + a);
+ //fprintf(stdout,("x[0] = " + x[0]);
+ //fprintf(stdout,(">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
+
+ //if (timers_enabled) timer_stop(3);
+ timer_stop(3,elapsed,start);
+
+/* Compute Gaussian deviates by acceptance-rejection method and
+ * tally counts in concentric square annuli. This loop is not
+ * vectorizable.
+ */
+ //if (timers_enabled) timer_start(2);
+ timer_start(&(start[2]));
+ for(i=1; i<=nk;i++) {
+ x1 = 2. * x[2*i-2] -1.0;
+ x2 = 2. * x[2*i-1] - 1.0;
+ t1 = x1*x1 + x2*x2;
+ if (t1 <= 1.) {
+ t2 = sqrt(-2. * log(t1) / t1);
+ t3 = (x1 * t2);
+ t4 = (x2 * t2);
+ l = (int)(abs(t3) > abs(t4) ? abs(t3) : abs(t4));
+ q[l] = q[l] + 1.;
+ sx = sx + t3;
+ sy = sy + t4;
+ }
+ /*
+ if(i == 1) {
+ fprintf(stdout,"x1 = " + x1);
+ fprintf(stdout,"x2 = " + x2);
+ fprintf(stdout,"t1 = " + t1);
+ fprintf(stdout,"t2 = " + t2);
+ fprintf(stdout,"t3 = " + t3);
+ fprintf(stdout,"t4 = " + t4);
+ fprintf(stdout,"l = " + l);
+ fprintf(stdout,"q[l] = " + q[l]);
+ fprintf(stdout,"sx = " + sx);
+ fprintf(stdout,"sy = " + sy);
+ }
+ */
+ }
+ //if (timers_enabled) timer_stop(2);
+ timer_stop(2,elapsed,start);
+ }
+
+ TRACE_smpi_set_category ("finalize");
+
+ //int MPI_Allreduce(void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
+ MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+ sx = x[0]; //FIXME : x[0] or x[1] => x[0] because fortran starts with 1
+ MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+ sy = x[0];
+ MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+ for(i = 0; i < nq; i++) {
+ q[i] = x[i];
+ }
+ for(i = 0; i < nq; i++) {
+ gc += q[i];
+ }
+
+ timer_stop(1,elapsed,start);
+ tm = timer_read(1,elapsed);
+ MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+ tm = x[0];
+
+ if(node == root) {
+ nit = 0;
+ verified = true;
+
+ if(m == 24) {
+ sx_verify_value = -3.247834652034740E3;
+ sy_verify_value = -6.958407078382297E3;
+ } else if(m == 25) {
+ sx_verify_value = -2.863319731645753E3;
+ sy_verify_value = -6.320053679109499E3;
+ } else if(m == 28) {
+ sx_verify_value = -4.295875165629892E3;
+ sy_verify_value = -1.580732573678431E4;
+ } else if(m == 30) {
+ sx_verify_value = 4.033815542441498E4;
+ sy_verify_value = -2.660669192809235E4;
+ } else if(m == 32) {
+ sx_verify_value = 4.764367927995374E4;
+ sy_verify_value = -8.084072988043731E4;
+ } else if(m == 36) {
+ sx_verify_value = 1.982481200946593E5;
+ sy_verify_value = -1.020596636361769E5;
+ } else {
+ verified = false;
+ }
+
+ /*
+ fprintf(stdout,("sx = " + sx);
+ fprintf(stdout,("sx_verify = " + sx_verify_value);
+ fprintf(stdout,("sy = " + sy);
+ fprintf(stdout,("sy_verify = " + sy_verify_value);
+ */
+ if(verified) {
+ sx_err = abs((sx - sx_verify_value)/sx_verify_value);
+ sy_err = abs((sy - sy_verify_value)/sy_verify_value);
+ /*
+ fprintf(stdout,("sx_err = " + sx_err);
+ fprintf(stdout,("sy_err = " + sx_err);
+ fprintf(stdout,("epsilon= " + epsilon);
+ */
+ verified = ((sx_err < epsilon) && (sy_err < epsilon));
+ }
+
+ Mops = (pow(2.0, m+1))/tm/1000;
+
+ fprintf(stdout,"EP Benchmark Results:\n");
+ fprintf(stdout,"CPU Time=%d\n",tm);
+ fprintf(stdout,"N = 2^%d\n",m);
+ fprintf(stdout,"No. Gaussain Pairs =%d\n",gc);
+ fprintf(stdout,"Sum = %lf %ld\n",sx,sy);
+ fprintf(stdout,"Count:");
+ for(i = 0; i < nq; i++) {
+ fprintf(stdout,"%d\t %ld\n",i,q[i]);
+ }
+
+ /*
+ print_results("EP", _class, m+1, 0, 0, nit, npm, no_nodes, tm, Mops,
+ "Random numbers generated", verified, npbversion,
+ compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) */
+ fprintf(stdout,"\nEP Benchmark Completed\n");
+ fprintf(stdout,"Class = %s\n", _class);
+ fprintf(stdout,"Size = %s\n", size);
+ fprintf(stdout,"Iteration = %d\n", nit);
+ fprintf(stdout,"Time in seconds = %lf\n",(tm/1000));
+ fprintf(stdout,"Total processes = %d\n",no_nodes);
+ fprintf(stdout,"Mops/s total = %lf\n",Mops);
+ fprintf(stdout,"Mops/s/process = %lf\n", Mops/no_nodes);
+ fprintf(stdout,"Operation type = Random number generated\n");
+ if(verified) {
+ fprintf(stdout,"Verification = SUCCESSFUL\n");
+ } else {
+ fprintf(stdout,"Verification = UNSUCCESSFUL\n");
+ }
+ fprintf(stdout,"Total time: %lf\n",(timer_read(1,elapsed)/1000));
+ fprintf(stdout,"Gaussian pairs: %lf\n",(timer_read(2,elapsed)/1000));
+ fprintf(stdout,"Random numbers: %lf\n",(timer_read(3,elapsed)/1000));
+ }
+#ifdef USE_MPE
+ MPE_Finish_log(argv[0]);
+#endif
+
+ MPI_Finalize();
+ }
+
/*
 * Program entry point: runs the EP benchmark once. MPI initialization
 * and finalization both happen inside doTest. Returns 0 on completion.
 */
int main(int argc, char **argv) {
  doTest(argc, argv);
  return 0;  /* explicit success status (previously fell off the end) */
}
--- /dev/null
+
c---------------------------------------------------------------------
c---------------------------------------------------------------------

c MPI bookkeeping shared between routines via common /mpistuff/:
c   me      - rank of this process
c   nprocs  - total number of MPI processes
c   root    - rank designated as root
c   dp_type - MPI datatype handle used for double precision data
 include 'mpif.h'

 integer me, nprocs, root, dp_type
 common /mpistuff/ me, nprocs, root, dp_type

--- /dev/null
+
+/*
+ * FUNCTION RANDLC (X, A)
+ *
+ * This routine returns a uniform pseudorandom double precision number in the
+ * range (0, 1) by using the linear congruential generator
+ *
+ * x_{k+1} = a x_k (mod 2^46)
+ *
+ * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
+ * before repeating. The argument A is the same as 'a' in the above formula,
+ * and X is the same as x_0. A and X must be odd double precision integers
+ * in the range (1, 2^46). The returned value RANDLC is normalized to be
+ * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
+ * the new seed x_1, so that subsequent calls to RANDLC using the same
+ * arguments will generate a continuous sequence.
+ *
+ * This routine should produce the same results on any computer with at least
+ * 48 mantissa bits in double precision floating point data. On Cray systems,
+ * double precision should be disabled.
+ *
+ * David H. Bailey October 26, 1990
+ *
+ * IMPLICIT DOUBLE PRECISION (A-H, O-Z)
+ * SAVE KS, R23, R46, T23, T46
+ * DATA KS/0/
+ *
+ * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
+ * T23 = 2 ^ 23, and T46 = 2 ^ 46. These are computed in loops, rather than
+ * by merely using the ** operator, in order to insure that the results are
+ * exact on all systems. This code assumes that 0.5D0 is represented exactly.
+ */
+
+
+/*****************************************************************/
+/************* R A N D L C ************/
+/************* ************/
+/************* portable random number generator ************/
+/*****************************************************************/
+
/*
 * randlc: portable linear congruential generator,
 *     x_{k+1} = a * x_k (mod 2^46),
 * returning 2^-46 * x_{k+1} in (0,1) and updating *X to the new seed.
 * Both *X and *A must be odd double-precision integers in (1, 2^46).
 * The 46-bit product is formed exactly in double precision by splitting
 * each operand into 23-bit halves; requires >= 48 mantissa bits.
 */
double randlc( double *X, double *A )
{
    static int    initialized = 0;
    static double r23, r46, t23, t46;   /* 2^-23, 2^-46, 2^23, 2^46 */
    double a_hi, a_lo, x_hi, x_lo;
    double mid, carry, full;
    int    b, trunc;

    /* One-time setup: build the powers of two by repeated halving and
       doubling so the values are exact on every conforming system. */
    if (!initialized)
    {
        r23 = r46 = t23 = t46 = 1.0;
        for (b = 0; b < 23; b++)
        {
            r23 = 0.50 * r23;
            t23 = 2.0 * t23;
        }
        for (b = 0; b < 46; b++)
        {
            r46 = 0.50 * r46;
            t46 = 2.0 * t46;
        }
        initialized = 1;
    }

    /* Split A = 2^23 * a_hi + a_lo (truncation toward zero via int). */
    trunc = (int)(r23 * *A);
    a_hi  = trunc;
    a_lo  = *A - t23 * a_hi;

    /* Split X = 2^23 * x_hi + x_lo, then compute
       Z = a_hi*x_lo + a_lo*x_hi (mod 2^23) and
       X = 2^23*Z + a_lo*x_lo (mod 2^46). */
    trunc = (int)(r23 * *X);
    x_hi  = trunc;
    x_lo  = *X - t23 * x_hi;

    mid   = a_hi * x_lo + a_lo * x_hi;
    trunc = (int)(r23 * mid);
    carry = trunc;
    full  = t23 * (mid - t23 * carry) + a_lo * x_lo;
    trunc = (int)(r46 * full);
    *X    = full - t46 * (double)trunc;
    return (r46 * *X);
}
+
+
+
+/*****************************************************************/
+/************ F I N D _ M Y _ S E E D ************/
+/************ ************/
+/************ returns parallel random number seq seed ************/
+/*****************************************************************/
+
--- /dev/null
+
+double randlc( double *X, double *A );
+
--- /dev/null
SHELL=/bin/sh
BENCHMARK=is
BENCHMARKU=IS

# Site/compiler configuration: CC, CLINK, flags, MPI lib/include paths.
include ../config/make.def

# Shared NPB build rules (defines PROGRAM, COMMON, the `config` target).
include ../sys/make.common

OBJS = is-trace.o ${COMMON}/c_print_results.o


# Link the traced IS benchmark from its objects and the MPI library.
${PROGRAM}: config ${OBJS}
	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${CMPI_LIB}

# Default compile rule for .c sources.
.c.o:
	${CCOMPILE} $<

is-trace.o: is-trace.c npbparams.h

# Remove objects, editor backups, and generated files.
clean:
	- rm -f *.o *~ mputil*
	- rm -f is-trace npbparams.h core
--- /dev/null
+/*************************************************************************
+ * *
+ * N A S P A R A L L E L B E N C H M A R K S 3.3 *
+ * *
+ * I S *
+ * *
+ *************************************************************************
+ * *
+ * This benchmark is part of the NAS Parallel Benchmark 3.3 suite. *
+ * It is described in NAS Technical Report 95-020. *
+ * *
+ * Permission to use, copy, distribute and modify this software *
+ * for any purpose with or without fee is hereby granted. We *
+ * request, however, that all derived work reference the NAS *
+ * Parallel Benchmarks 3.3. This software is provided "as is" *
+ * without express or implied warranty. *
+ * *
+ * Information on NPB 3.3, including the technical report, the *
+ * original specifications, source code, results and information *
+ * on how to submit new results, is available at: *
+ * *
+ * http://www.nas.nasa.gov/Software/NPB *
+ * *
+ * Send comments or suggestions to npb@nas.nasa.gov *
+ * Send bug reports to npb-bugs@nas.nasa.gov *
+ * *
+ * NAS Parallel Benchmarks Group *
+ * NASA Ames Research Center *
+ * Mail Stop: T27A-1 *
+ * Moffett Field, CA 94035-1000 *
+ * *
+ * E-mail: npb@nas.nasa.gov *
+ * Fax: (650) 604-3957 *
+ * *
+ *************************************************************************
+ * *
+ * Author: M. Yarrow *
+ * H. Jin *
+ * *
+ *************************************************************************/
+
+#include "mpi.h"
+#include "npbparams.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "instr/instr.h" //TRACE_
+
+/******************/
+/* default values */
+/******************/
+#ifndef CLASS
+#define CLASS 'S'
+#define NUM_PROCS 1
+#endif
+#define MIN_PROCS 1
+
+
+/*************/
+/* CLASS S */
+/*************/
+#if CLASS == 'S'
+#define TOTAL_KEYS_LOG_2 16
+#define MAX_KEY_LOG_2 11
+#define NUM_BUCKETS_LOG_2 9
+#endif
+
+
+/*************/
+/* CLASS W */
+/*************/
+#if CLASS == 'W'
+#define TOTAL_KEYS_LOG_2 20
+#define MAX_KEY_LOG_2 16
+#define NUM_BUCKETS_LOG_2 10
+#endif
+
+/*************/
+/* CLASS A */
+/*************/
+#if CLASS == 'A'
+#define TOTAL_KEYS_LOG_2 23
+#define MAX_KEY_LOG_2 19
+#define NUM_BUCKETS_LOG_2 10
+#endif
+
+
+/*************/
+/* CLASS B */
+/*************/
+#if CLASS == 'B'
+#define TOTAL_KEYS_LOG_2 25
+#define MAX_KEY_LOG_2 21
+#define NUM_BUCKETS_LOG_2 10
+#endif
+
+
+/*************/
+/* CLASS C */
+/*************/
+#if CLASS == 'C'
+#define TOTAL_KEYS_LOG_2 27
+#define MAX_KEY_LOG_2 23
+#define NUM_BUCKETS_LOG_2 10
+#endif
+
+
+/*************/
+/* CLASS D */
+/*************/
+#if CLASS == 'D'
+#define TOTAL_KEYS_LOG_2 29
+#define MAX_KEY_LOG_2 27
+#define NUM_BUCKETS_LOG_2 10
+#undef MIN_PROCS
+#define MIN_PROCS 4
+#endif
+
+
+#define TOTAL_KEYS (1 << TOTAL_KEYS_LOG_2)
+#define MAX_KEY (1 << MAX_KEY_LOG_2)
+#define NUM_BUCKETS (1 << NUM_BUCKETS_LOG_2)
+#define NUM_KEYS (TOTAL_KEYS/NUM_PROCS*MIN_PROCS)
+
+/*****************************************************************/
+/* On larger number of processors, since the keys are (roughly) */
+/* gaussian distributed, the first and last processor sort keys */
+/* in a large interval, requiring array sizes to be larger. Note */
+/* that for large NUM_PROCS, NUM_KEYS is, however, a small number*/
+/* The required array size also depends on the bucket size used. */
+/* The following values are validated for the 1024-bucket setup. */
+/*****************************************************************/
+#if NUM_PROCS < 256
+#define SIZE_OF_BUFFERS 3*NUM_KEYS/2
+#elif NUM_PROCS < 512
+#define SIZE_OF_BUFFERS 5*NUM_KEYS/2
+#elif NUM_PROCS < 1024
+#define SIZE_OF_BUFFERS 4*NUM_KEYS
+#else
+#define SIZE_OF_BUFFERS 13*NUM_KEYS/2
+#endif
+
+/*****************************************************************/
+/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */
+/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE */
+/* MAX_PROCS AT YOUR PERIL */
+/*****************************************************************/
+#if CLASS == 'S'
+#define MAX_PROCS 128
+#else
+#define MAX_PROCS 1024
+#endif
+
+#define MAX_ITERATIONS 10
+#define TEST_ARRAY_SIZE 5
+
+
+/***********************************/
+/* Enable separate communication, */
+/* computation timing and printout */
+/***********************************/
+/* #define TIMING_ENABLED */
+
+
+/*************************************/
+/* Typedef: if necessary, change the */
+/* size of int here by changing the */
+/* int type to, say, long */
+/*************************************/
+typedef int INT_TYPE;
+typedef long INT_TYPE2;
+#define MP_KEY_TYPE MPI_INT
+
+
/* All formerly file-global IS benchmark state, gathered into one struct
   (one instance per MPI rank) so the traced/SMPI build can host several
   ranks in a single address space without sharing statics. */
typedef struct {

/********************/
/* MPI properties: */
/********************/
int my_rank,
 comm_size;


/********************/
/* Some global info */
/********************/
INT_TYPE *key_buff_ptr_global, /* used by full_verify to get */
 total_local_keys, /* copies of rank info */
 total_lesser_keys;


/* Count of partial/full verification checks passed so far. */
int passed_verification;



/************************************/
/* These are the three main arrays. */
/* See SIZE_OF_BUFFERS def above */
/************************************/
INT_TYPE key_array[SIZE_OF_BUFFERS],
 key_buff1[SIZE_OF_BUFFERS],
 key_buff2[SIZE_OF_BUFFERS],
 bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE], /* Top 5 elements for */
 bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals */
 bucket_ptrs[NUM_BUCKETS],
 process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE],
 process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE];
/* Alltoallv bookkeeping for the key redistribution phase. */
int send_count[MAX_PROCS], recv_count[MAX_PROCS],
 send_displ[MAX_PROCS], recv_displ[MAX_PROCS];


/**********************/
/* Partial verif info */
/**********************/
INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
 test_rank_array[TEST_ARRAY_SIZE];

/**********/
/* Timers */
/**********/
/* Indexed timer slots; see timer_clear/start/stop/read below. */
double start[64], elapsed[64];

} global_data;
+
+
/* Partial-verification reference tables, one pair per problem class:
   the key positions probed (..._test_index_array) and the expected
   global rank of the key found at each position (..._test_rank_array). */
const INT_TYPE2
 S_test_index_array[TEST_ARRAY_SIZE] =
 {48427,17148,23627,62548,4431},
 S_test_rank_array[TEST_ARRAY_SIZE] =
 {0,18,346,64917,65463},

 W_test_index_array[TEST_ARRAY_SIZE] =
 {357773,934767,875723,898999,404505},
 W_test_rank_array[TEST_ARRAY_SIZE] =
 {1249,11698,1039987,1043896,1048018},

 A_test_index_array[TEST_ARRAY_SIZE] =
 {2112377,662041,5336171,3642833,4250760},
 A_test_rank_array[TEST_ARRAY_SIZE] =
 {104,17523,123928,8288932,8388264},

 B_test_index_array[TEST_ARRAY_SIZE] =
 {41869,812306,5102857,18232239,26860214},
 B_test_rank_array[TEST_ARRAY_SIZE] =
 {33422937,10244,59149,33135281,99},

 C_test_index_array[TEST_ARRAY_SIZE] =
 {44172927,72999161,74326391,129606274,21736814},
 C_test_rank_array[TEST_ARRAY_SIZE] =
 {61147,882988,266290,133997595,133525895},

 D_test_index_array[TEST_ARRAY_SIZE] =
 {1317351170,995930646,1157283250,1503301535,1453734525},
 D_test_rank_array[TEST_ARRAY_SIZE] =
 {1,36538729,1978098519,2145192618,2147425337};
+
+
+
+/***********************/
+/* function prototypes */
+/***********************/
+double randlc( double *X, double *A );
+
+void full_verify( global_data* gd );
+
+void c_print_results( char *name,
+ char class,
+ int n1,
+ int n2,
+ int n3,
+ int niter,
+ int nprocs_compiled,
+ int nprocs_total,
+ double t,
+ double mops,
+ char *optype,
+ int passed_verification,
+ char *npbversion,
+ char *compiletime,
+ char *mpicc,
+ char *clink,
+ char *cmpi_lib,
+ char *cmpi_inc,
+ char *cflags,
+ char *clinkflags );
+
+void timer_clear(global_data* gd, int n );
+void timer_start(global_data* gd, int n );
+void timer_stop(global_data* gd, int n );
+double timer_read(global_data* gd, int n );
+
/* Reset timer slot n: zero its accumulated elapsed time. */
void timer_clear(global_data* gd, int n ) {
 gd->elapsed[n] = 0.0;
}
+
/* Record the current wall-clock time as the start of timer slot n. */
void timer_start(global_data* gd, int n ) {
 gd->start[n] = MPI_Wtime();
}
+
/* Accumulate the wall-clock time since the matching timer_start
   into timer slot n. */
void timer_stop(global_data* gd, int n ) {
 gd->elapsed[n] += MPI_Wtime() - gd->start[n];
}
+
/* Return the accumulated elapsed wall-clock seconds of timer slot n. */
double timer_read(global_data* gd, int n ) {
 return gd->elapsed[n];
}
+
+
+/*
+ * FUNCTION RANDLC (X, A)
+ *
+ * This routine returns a uniform pseudorandom double precision number in the
+ * range (0, 1) by using the linear congruential generator
+ *
+ * x_{k+1} = a x_k (mod 2^46)
+ *
+ * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
+ * before repeating. The argument A is the same as 'a' in the above formula,
+ * and X is the same as x_0. A and X must be odd double precision integers
+ * in the range (1, 2^46). The returned value RANDLC is normalized to be
+ * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
+ * the new seed x_1, so that subsequent calls to RANDLC using the same
+ * arguments will generate a continuous sequence.
+ *
+ * This routine should produce the same results on any computer with at least
+ * 48 mantissa bits in double precision floating point data. On Cray systems,
+ * double precision should be disabled.
+ *
+ * David H. Bailey October 26, 1990
+ *
+ * IMPLICIT DOUBLE PRECISION (A-H, O-Z)
+ * SAVE KS, R23, R46, T23, T46
+ * DATA KS/0/
+ *
+ * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
+ * T23 = 2 ^ 23, and T46 = 2 ^ 46. These are computed in loops, rather than
+ * by merely using the ** operator, in order to insure that the results are
+ * exact on all systems. This code assumes that 0.5D0 is represented exactly.
+ */
+
+
+/*****************************************************************/
+/************* R A N D L C ************/
+/************* ************/
+/************* portable random number generator ************/
+/*****************************************************************/
+
/*
 * randlc: portable linear congruential generator,
 *     x_{k+1} = a * x_k (mod 2^46),
 * returning 2^-46 * x_{k+1} in (0,1) and updating *X to the new seed.
 * Both *X and *A must be odd double-precision integers in (1, 2^46).
 * The 46-bit product is formed exactly in double precision by splitting
 * each operand into 23-bit halves; requires >= 48 mantissa bits.
 */
double randlc( double *X, double *A )
{
    static int    initialized = 0;
    static double r23, r46, t23, t46;   /* 2^-23, 2^-46, 2^23, 2^46 */
    double a_hi, a_lo, x_hi, x_lo;
    double mid, carry, full;
    int    b, trunc;

    /* One-time setup: build the powers of two by repeated halving and
       doubling so the values are exact on every conforming system. */
    if (!initialized)
    {
        r23 = r46 = t23 = t46 = 1.0;
        for (b = 0; b < 23; b++)
        {
            r23 = 0.50 * r23;
            t23 = 2.0 * t23;
        }
        for (b = 0; b < 46; b++)
        {
            r46 = 0.50 * r46;
            t46 = 2.0 * t46;
        }
        initialized = 1;
    }

    /* Split A = 2^23 * a_hi + a_lo (truncation toward zero via int). */
    trunc = (int)(r23 * *A);
    a_hi  = trunc;
    a_lo  = *A - t23 * a_hi;

    /* Split X = 2^23 * x_hi + x_lo, then compute
       Z = a_hi*x_lo + a_lo*x_hi (mod 2^23) and
       X = 2^23*Z + a_lo*x_lo (mod 2^46). */
    trunc = (int)(r23 * *X);
    x_hi  = trunc;
    x_lo  = *X - t23 * x_hi;

    mid   = a_hi * x_lo + a_lo * x_hi;
    trunc = (int)(r23 * mid);
    carry = trunc;
    full  = t23 * (mid - t23 * carry) + a_lo * x_lo;
    trunc = (int)(r46 * full);
    *X    = full - t46 * (double)trunc;
    return (r46 * *X);
}
+
+
+
+/*****************************************************************/
+/************ F I N D _ M Y _ S E E D ************/
+/************ ************/
+/************ returns parallel random number seq seed ************/
+/*****************************************************************/
+
+/*
+ * Create a random number sequence of total length nn residing
+ * on np number of processors. Each processor will therefore have a
+ * subsequence of length nn/np. This routine returns that random
+ * number which is the first random number for the subsequence belonging
+ * to processor rank kn, and which is used as seed for proc kn ran # gen.
+ */
+
/*
 * find_my_seed: for a random sequence of total length nn split across
 * np processors, return the seed of the first number belonging to the
 * subsequence of processor kn (0 <= kn < np). s is the global seed and
 * a the generator multiplier. Uses binary exponentiation over randlc
 * so the cost is O(log nn) rather than O(nn).
 */
double find_my_seed( int kn, /* my processor rank, 0<=kn<=num procs */
                     int np, /* np = num procs */
                     long nn, /* total num of ran numbers, all procs */
                     double s, /* Ran num seed, for ex.: 314159265.00 */
                     double a ) /* Ran num gen mult, try 1220703125.00 */
{
    double seed, power, scratch, an;
    long   doublings, chunk, pos, half, step;

    /* doublings = floor(log2(nn/np)): bits needed to span one chunk. */
    chunk = nn / np;
    for (doublings = 0; chunk > 1; doublings++, chunk /= 2)
        ;

    /* an = a^(2^doublings) (mod 2^46) by repeated squaring. */
    power = a;
    for (step = 1; step <= doublings; step++)
        scratch = randlc(&power, &power);
    an = power;

    /* Advance the global seed by kn chunks: walk the bits of kn,
       squaring 'power' each round and folding it into 'seed' on
       set bits (classic binary exponentiation). */
    pos   = kn;
    seed  = s;
    power = an;
    for (step = 1; step <= 100; step++)
    {
        half = pos / 2;
        if (2 * half != pos)
            scratch = randlc(&seed, &power);
        if (half == 0)
            break;
        scratch = randlc(&power, &power);
        pos = half;
    }
    (void)scratch;  /* randlc results above are advanced-state side effects */

    return seed;
}
+
+
+
+
+/*****************************************************************/
+/************* C R E A T E _ S E Q ************/
+/*****************************************************************/
+
+void create_seq( global_data* gd, double seed, double a )
+{
+ double x;
+ int i, k;
+
+ k = MAX_KEY/4;
+
+ for (i=0; i<NUM_KEYS; i++)
+ {
+ x = randlc(&seed, &a);
+ x += randlc(&seed, &a);
+ x += randlc(&seed, &a);
+ x += randlc(&seed, &a);
+
+ gd->key_array[i] = k*x;
+ }
+}
+
+
+
+
+/*****************************************************************/
+/************* F U L L _ V E R I F Y ************/
+/*****************************************************************/
+
+
/*
 * full_verify: final correctness check for the IS benchmark.
 * Places each locally held key at its globally ranked position in
 * key_array (using the rank info cached in key_buff_ptr_global by the
 * ranking phase), then confirms (a) the previous rank's largest key is
 * not greater than our smallest, and (b) the local array is sorted in
 * non-decreasing order. Increments gd->passed_verification when clean,
 * otherwise prints the number of out-of-order keys.
 */
void full_verify( global_data* gd )
{
 MPI_Status status;
 MPI_Request request;

 INT_TYPE i, j;
 INT_TYPE k, last_local_key;


/* Now, finally, sort the keys: */
/* Counting-sort placement: pre-decrement the key's global rank, then
   subtract the number of keys on lower ranks to get a local index. */
 for( i=0; i<gd->total_local_keys; i++ )
 gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]-
 gd->total_lesser_keys] = gd->key_buff2[i];
 last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);

/* Send largest key value to next processor */
/* Boundary exchange (tag 1000): receive predecessor's max key into k,
   send our own max to the successor; Irecv first avoids deadlock. */
 if( gd->my_rank > 0 )
 MPI_Irecv( &k,
 1,
 MP_KEY_TYPE,
 gd->my_rank-1,
 1000,
 MPI_COMM_WORLD,
 &request );
 if( gd->my_rank < gd->comm_size-1 )
 MPI_Send( &gd->key_array[last_local_key],
 1,
 MP_KEY_TYPE,
 gd->my_rank+1,
 1000,
 MPI_COMM_WORLD );
 if( gd->my_rank > 0 )
 MPI_Wait( &request, &status );

/* Confirm that neighbor's greatest key value
 is not greater than my least key value */
/* j counts every ordering violation found below. */
 j = 0;
 if( gd->my_rank > 0 && gd->total_local_keys > 0 )
 if( k > gd->key_array[0] )
 j++;


/* Confirm keys correctly sorted: count incorrectly sorted keys, if any */
 for( i=1; i<gd->total_local_keys; i++ )
 if( gd->key_array[i-1] > gd->key_array[i] )
 j++;


 if( j != 0 )
 {
 printf( "Processor %d: Full_verify: number of keys out of sort: %d\n",
 gd->my_rank, j );
 }
 else
 gd->passed_verification++;


}
+
+
+
+
+/*****************************************************************/
+/************* R A N K ****************/
+/*****************************************************************/
+
+
+void rank( global_data* gd, int iteration )
+{
+
+ INT_TYPE i, k;
+
+ INT_TYPE shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
+ INT_TYPE key;
+ INT_TYPE2 bucket_sum_accumulator, j, m;
+ INT_TYPE local_bucket_sum_accumulator;
+ INT_TYPE min_key_val, max_key_val;
+ INT_TYPE *key_buff_ptr;
+
+
+
+
+/* Iteration alteration of keys */
+ if(gd->my_rank == 0 )
+ {
+ gd->key_array[iteration] = iteration;
+ gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
+ }
+
+
+/* Initialize */
+ for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )
+ {
+ gd->bucket_size[i] = 0;
+ gd->bucket_size_totals[i] = 0;
+ gd->process_bucket_distrib_ptr1[i] = 0;
+ gd->process_bucket_distrib_ptr2[i] = 0;
+ }
+
+
+/* Determine where the partial verify test keys are, load into */
+/* top of array bucket_size */
+ for( i=0; i<TEST_ARRAY_SIZE; i++ )
+ if( (gd->test_index_array[i]/NUM_KEYS) == gd->my_rank )
+ gd->bucket_size[NUM_BUCKETS+i] =
+ gd->key_array[gd->test_index_array[i] % NUM_KEYS];
+
+
+/* Determine the number of keys in each bucket */
+ for( i=0; i<NUM_KEYS; i++ )
+ gd->bucket_size[gd->key_array[i] >> shift]++;
+
+
+/* Accumulative bucket sizes are the bucket pointers */
+ gd->bucket_ptrs[0] = 0;
+ for( i=1; i< NUM_BUCKETS; i++ )
+ gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];
+
+
+/* Sort into appropriate bucket */
+ for( i=0; i<NUM_KEYS; i++ )
+ {
+ key = gd->key_array[i];
+ gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
+ }
+
+#ifdef TIMING_ENABLED
+ timer_stop(gd, 2 );
+ timer_start(gd, 3 );
+#endif
+
+/* Get the bucket size totals for the entire problem. These
+ will be used to determine the redistribution of keys */
+ MPI_Allreduce( gd->bucket_size,
+ gd->bucket_size_totals,
+ NUM_BUCKETS+TEST_ARRAY_SIZE,
+ MP_KEY_TYPE,
+ MPI_SUM,
+ MPI_COMM_WORLD );
+
+#ifdef TIMING_ENABLED
+ timer_stop(gd, 3 );
+ timer_start(gd, 2 );
+#endif
+
+/* Determine Redistibution of keys: accumulate the bucket size totals
+ till this number surpasses NUM_KEYS (which the average number of keys
+ per processor). Then all keys in these buckets go to processor 0.
+ Continue accumulating again until supassing 2*NUM_KEYS. All keys
+ in these buckets go to processor 1, etc. This algorithm guarantees
+ that all processors have work ranking; no processors are left idle.
+ The optimum number of buckets, however, does not result in as high
+ a degree of load balancing (as even a distribution of keys as is
+ possible) as is obtained from increasing the number of buckets, but
+ more buckets results in more computation per processor so that the
+ optimum number of buckets turns out to be 1024 for machines tested.
+ Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket
+ number of first and last bucket which each processor will have after
+ the redistribution is done. */
+
+ bucket_sum_accumulator = 0;
+ local_bucket_sum_accumulator = 0;
+ gd->send_displ[0] = 0;
+ gd->process_bucket_distrib_ptr1[0] = 0;
+ for( i=0, j=0; i<NUM_BUCKETS; i++ )
+ {
+ bucket_sum_accumulator += gd->bucket_size_totals[i];
+ local_bucket_sum_accumulator += gd->bucket_size[i];
+ if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )
+ {
+ gd->send_count[j] = local_bucket_sum_accumulator;
+ if( j != 0 )
+ {
+ gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
+ gd->process_bucket_distrib_ptr1[j] =
+ gd->process_bucket_distrib_ptr2[j-1]+1;
+ }
+ gd->process_bucket_distrib_ptr2[j++] = i;
+ local_bucket_sum_accumulator = 0;
+ }
+ }
+
+/* When NUM_PROCS approaching NUM_BUCKETS, it is highly possible
+ that the last few processors don't get any buckets. So, we
+ need to set counts properly in this case to avoid any fallouts. */
+ while( j < gd->comm_size )
+ {
+ gd->send_count[j] = 0;
+ gd->process_bucket_distrib_ptr1[j] = 1;
+ j++;
+ }
+
+#ifdef TIMING_ENABLED
+ timer_stop(gd, 2 );
+ timer_start(gd, 3 );
+#endif
+
+/* This is the redistribution section: first find out how many keys
+ each processor will send to every other processor: */
+ MPI_Alltoall( gd->send_count,
+ 1,
+ MPI_INT,
+ gd->recv_count,
+ 1,
+ MPI_INT,
+ MPI_COMM_WORLD );
+
+/* Determine the receive array displacements for the buckets */
+ gd->recv_displ[0] = 0;
+ for( i=1; i<gd->comm_size; i++ )
+ gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];
+
+
+/* Now send the keys to respective processors */
+ MPI_Alltoallv( gd->key_buff1,
+ gd->send_count,
+ gd->send_displ,
+ MP_KEY_TYPE,
+ gd->key_buff2,
+ gd->recv_count,
+ gd->recv_displ,
+ MP_KEY_TYPE,
+ MPI_COMM_WORLD );
+
+#ifdef TIMING_ENABLED
+ timer_stop(gd, 3 );
+ timer_start(gd, 2 );
+#endif
+
+/* The starting and ending bucket numbers on each processor are
+ multiplied by the interval size of the buckets to obtain the
+ smallest possible min and greatest possible max value of any
+ key on each processor */
+ min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
+ max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;
+
+/* Clear the work array */
+ for( i=0; i<max_key_val-min_key_val+1; i++ )
+ gd->key_buff1[i] = 0;
+
+/* Determine the total number of keys on all other
+ processors holding keys of lesser value */
+ m = 0;
+ for( k=0; k<gd->my_rank; k++ )
+ for( i= gd->process_bucket_distrib_ptr1[k];
+ i<=gd->process_bucket_distrib_ptr2[k];
+ i++ )
+ m += gd->bucket_size_totals[i]; /* m has total # of lesser keys */
+
+/* Determine total number of keys on this processor */
+ j = 0;
+ for( i= gd->process_bucket_distrib_ptr1[gd->my_rank];
+ i<=gd->process_bucket_distrib_ptr2[gd->my_rank];
+ i++ )
+ j += gd->bucket_size_totals[i]; /* j has total # of local keys */
+
+
+/* Ranking of all keys occurs in this section: */
+/* shift it backwards so no subtractions are necessary in loop */
+ key_buff_ptr = gd->key_buff1 - min_key_val;
+
+/* In this section, the keys themselves are used as their
+ own indexes to determine how many of each there are: their
+ individual population */
+ for( i=0; i<j; i++ )
+ key_buff_ptr[gd->key_buff2[i]]++; /* Now they have individual key */
+ /* population */
+
+/* To obtain ranks of each key, successively add the individual key
+ population, not forgetting the total of lesser keys, m.
+ NOTE: Since the total of lesser keys would be subtracted later
+ in verification, it is no longer added to the first key population
+ here, but still needed during the partial verify test. This is to
+ ensure that 32-bit key_buff can still be used for class D. */
+/* key_buff_ptr[min_key_val] += m; */
+ for( i=min_key_val; i<max_key_val; i++ )
+ key_buff_ptr[i+1] += key_buff_ptr[i];
+
+
+/* This is the partial verify test section */
+/* Observe that test_rank_array vals are */
+/* shifted differently for different cases */
+ for( i=0; i<TEST_ARRAY_SIZE; i++ )
+ {
+ k = gd->bucket_size_totals[i+NUM_BUCKETS]; /* Keys were hidden here */
+ if( min_key_val <= k && k <= max_key_val )
+ {
+ /* Add the total of lesser keys, m, here */
+ INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
+ int failed = 0;
+
+ switch( CLASS )
+ {
+ case 'S':
+ if( i <= 2 )
+ {
+ if( key_rank != gd->test_rank_array[i]+iteration )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ else
+ {
+ if( key_rank != gd->test_rank_array[i]-iteration )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ break;
+ case 'W':
+ if( i < 2 )
+ {
+ if( key_rank != gd->test_rank_array[i]+(iteration-2) )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ else
+ {
+ if( key_rank != gd->test_rank_array[i]-iteration )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ break;
+ case 'A':
+ if( i <= 2 )
+ {
+ if( key_rank != gd->test_rank_array[i]+(iteration-1) )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ else
+ {
+ if( key_rank != gd->test_rank_array[i]-(iteration-1) )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ break;
+ case 'B':
+ if( i == 1 || i == 2 || i == 4 )
+ {
+ if( key_rank != gd->test_rank_array[i]+iteration )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ else
+ {
+ if( key_rank != gd->test_rank_array[i]-iteration )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ break;
+ case 'C':
+ if( i <= 2 )
+ {
+ if( key_rank != gd->test_rank_array[i]+iteration )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ else
+ {
+ if( key_rank != gd->test_rank_array[i]-iteration )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ break;
+ case 'D':
+ if( i < 2 )
+ {
+ if( key_rank != gd->test_rank_array[i]+iteration )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ else
+ {
+ if( key_rank != gd->test_rank_array[i]-iteration )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ break;
+ }
+ if( failed == 1 )
+ printf( "Failed partial verification: "
+ "iteration %d, processor %d, test key %d\n",
+ iteration, gd->my_rank, (int)i );
+ }
+ }
+
+
+
+
+/* Make copies of rank info for use by full_verify: these variables
+ in rank are local; making them global slows down the code, probably
+ since they cannot be made register by compiler */
+
+ if( iteration == MAX_ITERATIONS )
+ {
+ gd->key_buff_ptr_global = key_buff_ptr;
+ gd->total_local_keys = j;
+ gd->total_lesser_keys = 0; /* no longer set to 'm', see note above */
+ }
+
+}
+
+
+/*****************************************************************/
+/************* M A I N ****************/
+/*****************************************************************/
+
+int main( int argc, char **argv )
+{
+
+    int             i, iteration, itemp;
+
+    double          timecounter, maxtime;
+
+    TRACE_smpi_set_category ("start");
+
+    /* All former file-scope state lives in one heap-allocated struct. */
+    global_data* gd = malloc(sizeof(global_data));
+/*  Initialize MPI */
+    MPI_Init( &argc, &argv );
+    MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
+    MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );
+
+/*  Initialize the verification arrays if a valid class */
+    for( i=0; i<TEST_ARRAY_SIZE; i++ )
+        switch( CLASS )
+        {
+            case 'S':
+                gd->test_index_array[i] = S_test_index_array[i];
+                gd->test_rank_array[i]  = S_test_rank_array[i];
+                break;
+            case 'A':
+                gd->test_index_array[i] = A_test_index_array[i];
+                gd->test_rank_array[i]  = A_test_rank_array[i];
+                break;
+            case 'W':
+                gd->test_index_array[i] = W_test_index_array[i];
+                gd->test_rank_array[i]  = W_test_rank_array[i];
+                break;
+            case 'B':
+                gd->test_index_array[i] = B_test_index_array[i];
+                gd->test_rank_array[i]  = B_test_rank_array[i];
+                break;
+            case 'C':
+                gd->test_index_array[i] = C_test_index_array[i];
+                gd->test_rank_array[i]  = C_test_rank_array[i];
+                break;
+            case 'D':
+                gd->test_index_array[i] = D_test_index_array[i];
+                gd->test_rank_array[i]  = D_test_rank_array[i];
+                break;
+        }
+
+
+
+/*  Printout initial NPB info */
+    if( gd->my_rank == 0 )
+    {
+        printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
+        printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS );
+        printf( " Iterations:   %d\n", MAX_ITERATIONS );
+        printf( " Number of processes:     %d\n",gd->comm_size );
+    }
+
+/*  Check that actual and compiled number of processors agree */
+    if( gd->comm_size != NUM_PROCS )
+    {
+        if( gd->my_rank == 0 )
+            printf( "\n ERROR: compiled for %d processes\n"
+                    " Number of active processes: %d\n"
+                    " Exiting program!\n\n", NUM_PROCS, gd->comm_size );
+        MPI_Finalize();
+        exit( 1 );
+    }
+
+/*  Check to see whether total number of processes is within bounds.
+    This could in principle be checked in setparams.c, but it is more
+    convenient to do it here                                               */
+    if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS)
+    {
+       if( gd->my_rank == 0 )
+           printf( "\n ERROR: number of processes %d not within range %d-%d"
+                   "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS);
+       MPI_Finalize();
+       exit( 1 );
+    }
+
+
+/*  Generate random number sequence and subsequent keys on all procs */
+    create_seq(gd,  find_my_seed( gd->my_rank,
+                                  gd->comm_size,
+                                  4*(long)TOTAL_KEYS*MIN_PROCS,
+                                  314159265.00,      /* Random number gen seed */
+                                  1220703125.00 ),   /* Random number gen mult */
+                    1220703125.00 );                 /* Random number gen mult */
+
+/*  Do one interation for free (i.e., untimed) to guarantee initialization of
+    all data and code pages and respective tables */
+    rank(gd,  1 );
+
+/*  Start verification counter */
+    gd->passed_verification = 0;
+
+    if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n   iteration\n" );
+
+/*  Initialize timer  */
+    timer_clear(gd,  0 );
+
+/*  Initialize separate communication, computation timing */
+#ifdef  TIMING_ENABLED
+    for( i=1; i<=3; i++ ) timer_clear(gd,  i );
+#endif
+
+/*  Start timer  */
+    timer_start(gd,  0 );
+
+#ifdef  TIMING_ENABLED
+    timer_start(gd,  1 );
+    timer_start(gd,  2 );
+#endif
+
+/*  This is the main iteration */
+    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
+    {
+        char it_str[100];
+        snprintf (it_str, 100, "i%d", iteration);
+        TRACE_smpi_set_category (it_str);
+
+        if( gd->my_rank == 0 && CLASS != 'S' ) printf( "        %d\n", iteration );
+        rank(gd,  iteration );
+    }
+
+    TRACE_smpi_set_category ("finalize");
+
+
+#ifdef  TIMING_ENABLED
+    timer_stop(gd,  2 );
+    timer_stop(gd,  1 );
+#endif
+
+/*  Stop timer, obtain time for processors */
+    timer_stop(gd,  0 );
+
+    timecounter = timer_read(gd,  0 );
+
+/*  End of timing, obtain maximum time of all processors */
+    MPI_Reduce( &timecounter,
+                &maxtime,
+                1,
+                MPI_DOUBLE,
+                MPI_MAX,
+                0,
+                MPI_COMM_WORLD );
+
+#ifdef  TIMING_ENABLED
+    {
+        double    tmin, tsum, tmax;
+
+        /* my_rank/comm_size moved into gd by this refactoring. */
+        if( gd->my_rank == 0 )
+        {
+            printf( "\ntimer 1/2/3 = total/computation/communication time\n");
+            printf( "              min                avg                max\n" );
+        }
+        for( i=1; i<=3; i++ )
+        {
+            timecounter = timer_read(gd,  i );
+            MPI_Reduce( &timecounter,
+                        &tmin,
+                        1,
+                        MPI_DOUBLE,
+                        MPI_MIN,
+                        0,
+                        MPI_COMM_WORLD );
+            MPI_Reduce( &timecounter,
+                        &tsum,
+                        1,
+                        MPI_DOUBLE,
+                        MPI_SUM,
+                        0,
+                        MPI_COMM_WORLD );
+            MPI_Reduce( &timecounter,
+                        &tmax,
+                        1,
+                        MPI_DOUBLE,
+                        MPI_MAX,
+                        0,
+                        MPI_COMM_WORLD );
+            if( gd->my_rank == 0 )
+                printf( "timer %d:    %f           %f            %f\n",
+                        i, tmin, tsum/((double) gd->comm_size), tmax );
+        }
+        if( gd->my_rank == 0 )
+            printf( "\n" );
+    }
+#endif
+
+/*  This tests that keys are in sequence: sorting of last ranked key seq
+    occurs here, but is an untimed operation                             */
+    full_verify(gd);
+
+
+/*  Obtain verification counter sum */
+    itemp =gd->passed_verification;
+    MPI_Reduce( &itemp,
+                &gd->passed_verification,
+                1,
+                MPI_INT,
+                MPI_SUM,
+                0,
+                MPI_COMM_WORLD );
+
+
+
+/*  The final printout  */
+    if( gd->my_rank == 0 )
+    {
+        if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
+            gd->passed_verification = 0;
+        c_print_results( "IS",
+                         CLASS,
+                         (int)(TOTAL_KEYS),
+                         MIN_PROCS,
+                         0,
+                         MAX_ITERATIONS,
+                         NUM_PROCS,
+                         gd->comm_size,
+                         maxtime,
+                         ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS)
+                                                      /maxtime/1000000.,
+                         "keys ranked",
+                         gd->passed_verification,
+                         NPBVERSION,
+                         COMPILETIME,
+                         MPICC,
+                         CLINK,
+                         CMPI_LIB,
+                         CMPI_INC,
+                         CFLAGS,
+                         CLINKFLAGS );
+    }
+
+    MPI_Finalize();
+    free(gd);
+
+    return 0;
+         /**************************/
+}        /*  E N D  P R O G R A M  */
+         /**************************/
is: header
cd IS; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
+IS-trace: is-trace
+is-trace: header
+ cd IS-trace; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
+
CG: cg
cg: header
cd CG; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
ep: header
cd EP; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
+EP-trace: ep-trace
+ep-trace: header
+ cd EP-trace; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
+
DT: dt
dt: header
cd DT; $(MAKE) CLASS=$(CLASS)
+DT-trace: dt-trace
+dt-trace: header
+ cd DT-trace; $(MAKE) CLASS=$(CLASS)
+
# Awk script courtesy cmg@cray.com, modified by Haoqiang Jin
suite:
@ awk -f sys/suite.awk SMAKE=$(MAKE) $(SFILE) | $(SHELL)