+++ /dev/null
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "DGraph.h"
-
-DGArc *newArc(DGNode *tl,DGNode *hd){
- DGArc *ar=(DGArc *)malloc(sizeof(DGArc));
- ar->tail=tl;
- ar->head=hd;
- return ar;
-}
-void arcShow(DGArc *ar){
- DGNode *tl=(DGNode *)ar->tail,
- *hd=(DGNode *)ar->head;
- fprintf(stderr,"%d. |%s ->%s\n",ar->id,tl->name,hd->name);
-}
-
-DGNode *newNode(char *nm){
- DGNode *nd=(DGNode *)malloc(sizeof(DGNode));
- nd->attribute=0;
- nd->color=0;
- nd->inDegree=0;
- nd->outDegree=0;
- nd->maxInDegree=SMALL_BLOCK_SIZE;
- nd->maxOutDegree=SMALL_BLOCK_SIZE;
- nd->inArc=(DGArc **)malloc(nd->maxInDegree*sizeof(DGArc*));
- nd->outArc=(DGArc **)malloc(nd->maxOutDegree*sizeof(DGArc*));
- nd->name=strdup(nm);
- nd->feat=NULL;
- return nd;
-}
-void nodeShow(DGNode* nd){
- fprintf( stderr,"%3d.%s: (%d,%d)\n",
- nd->id,nd->name,nd->inDegree,nd->outDegree);
-/*
- if(nd->verified==1) fprintf(stderr,"%ld.%s\t: usable.",nd->id,nd->name);
- else if(nd->verified==0) fprintf(stderr,"%ld.%s\t: unusable.",nd->id,nd->name);
- else fprintf(stderr,"%ld.%s\t: notverified.",nd->id,nd->name);
-*/
-}
-
-DGraph* newDGraph(char* nm){
- DGraph *dg=(DGraph *)malloc(sizeof(DGraph));
- dg->numNodes=0;
- dg->numArcs=0;
- dg->maxNodes=BLOCK_SIZE;
- dg->maxArcs=BLOCK_SIZE;
- dg->node=(DGNode **)malloc(dg->maxNodes*sizeof(DGNode*));
- dg->arc=(DGArc **)malloc(dg->maxArcs*sizeof(DGArc*));
- dg->name=strdup(nm);
- return dg;
-}
-int AttachNode(DGraph* dg, DGNode* nd) {
- int i=0,j,len=0;
- DGNode **nds =NULL, *tmpnd=NULL;
- DGArc **ar=NULL;
-
- if (dg->numNodes == dg->maxNodes-1 ) {
- dg->maxNodes += BLOCK_SIZE;
- nds =(DGNode **) calloc(dg->maxNodes,sizeof(DGNode*));
- memcpy(nds,dg->node,(dg->maxNodes-BLOCK_SIZE)*sizeof(DGNode*));
- free(dg->node);
- dg->node=nds;
- }
-
- len = strlen( nd->name);
- for (i = 0; i < dg->numNodes; i++) {
- tmpnd =dg->node[ i];
- ar=NULL;
- if ( strlen( tmpnd->name) != len ) continue;
- if ( strncmp( nd->name, tmpnd->name, len) ) continue;
- if ( nd->inDegree > 0 ) {
- tmpnd->maxInDegree += nd->maxInDegree;
- ar =(DGArc **) calloc(tmpnd->maxInDegree,sizeof(DGArc*));
- memcpy(ar,tmpnd->inArc,(tmpnd->inDegree)*sizeof(DGArc*));
- free(tmpnd->inArc);
- tmpnd->inArc=ar;
- for (j = 0; j < nd->inDegree; j++ ) {
- nd->inArc[ j]->head = tmpnd;
- }
- memcpy( &(tmpnd->inArc[ tmpnd->inDegree]), nd->inArc, nd->inDegree*sizeof( DGArc *));
- tmpnd->inDegree += nd->inDegree;
- }
- if ( nd->outDegree > 0 ) {
- tmpnd->maxOutDegree += nd->maxOutDegree;
- ar =(DGArc **) calloc(tmpnd->maxOutDegree,sizeof(DGArc*));
- memcpy(ar,tmpnd->outArc,(tmpnd->outDegree)*sizeof(DGArc*));
- free(tmpnd->outArc);
- tmpnd->outArc=ar;
- for (j = 0; j < nd->outDegree; j++ ) {
- nd->outArc[ j]->tail = tmpnd;
- }
- memcpy( &(tmpnd->outArc[tmpnd->outDegree]),nd->outArc,nd->outDegree*sizeof( DGArc *));
- tmpnd->outDegree += nd->outDegree;
- }
- free(nd);
- return i;
- }
- nd->id = dg->numNodes;
- dg->node[dg->numNodes] = nd;
- dg->numNodes++;
-return nd->id;
-}
-int AttachArc(DGraph *dg,DGArc* nar){
-int arcId = -1;
-int i=0,newNumber=0;
-DGNode *head = nar->head,
- *tail = nar->tail;
-DGArc **ars=NULL,*probe=NULL;
-/*fprintf(stderr,"AttachArc %ld\n",dg->numArcs); */
- if ( !tail || !head ) return arcId;
- if ( dg->numArcs == dg->maxArcs-1 ) {
- dg->maxArcs += BLOCK_SIZE;
- ars =(DGArc **) calloc(dg->maxArcs,sizeof(DGArc*));
- memcpy(ars,dg->arc,(dg->maxArcs-BLOCK_SIZE)*sizeof(DGArc*));
- free(dg->arc);
- dg->arc=ars;
- }
- for(i = 0; i < tail->outDegree; i++ ) { /* parallel arc */
- probe = tail->outArc[ i];
- if(probe->head == head
- &&
- probe->length == nar->length
- ){
- free(nar);
- return probe->id;
- }
- }
-
- nar->id = dg->numArcs;
- arcId=dg->numArcs;
- dg->arc[dg->numArcs] = nar;
- dg->numArcs++;
-
- head->inArc[ head->inDegree] = nar;
- head->inDegree++;
- if ( head->inDegree >= head->maxInDegree ) {
- newNumber = head->maxInDegree + SMALL_BLOCK_SIZE;
- ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
- memcpy(ars,head->inArc,(head->inDegree)*sizeof(DGArc*));
- free(head->inArc);
- head->inArc=ars;
- head->maxInDegree = newNumber;
- }
- tail->outArc[ tail->outDegree] = nar;
- tail->outDegree++;
- if(tail->outDegree >= tail->maxOutDegree ) {
- newNumber = tail->maxOutDegree + SMALL_BLOCK_SIZE;
- ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
- memcpy(ars,tail->outArc,(tail->outDegree)*sizeof(DGArc*));
- free(tail->outArc);
- tail->outArc=ars;
- tail->maxOutDegree = newNumber;
- }
-/*fprintf(stderr,"AttachArc: head->in=%d tail->out=%ld\n",head->inDegree,tail->outDegree);*/
-return arcId;
-}
-void graphShow(DGraph *dg,int DetailsLevel){
- int i=0,j=0;
- fprintf(stderr,"%d.%s: (%d,%d)\n",dg->id,dg->name,dg->numNodes,dg->numArcs);
- if ( DetailsLevel < 1) return;
- for (i = 0; i < dg->numNodes; i++ ) {
- DGNode *focusNode = dg->node[ i];
- if(DetailsLevel >= 2) {
- for (j = 0; j < focusNode->inDegree; j++ ) {
- fprintf(stderr,"\t ");
- nodeShow(focusNode->inArc[ j]->tail);
- }
- }
- nodeShow(focusNode);
- if ( DetailsLevel < 2) continue;
- for (j = 0; j < focusNode->outDegree; j++ ) {
- fprintf(stderr, "\t ");
- nodeShow(focusNode->outArc[ j]->head);
- }
- fprintf(stderr, "---\n");
- }
- fprintf(stderr,"----------------------------------------\n");
- if ( DetailsLevel < 3) return;
-}
-
-
-
+++ /dev/null
-#ifndef _DGRAPH
-#define _DGRAPH
-
-#define BLOCK_SIZE 128
-#define SMALL_BLOCK_SIZE 32
-
-typedef struct{
- int id;
- void *tail,*head;
- int length,width,attribute,maxWidth;
-}DGArc;
-
-typedef struct{
- int maxInDegree,maxOutDegree;
- int inDegree,outDegree;
- int id;
- char *name;
- DGArc **inArc,**outArc;
- int depth,height,width;
- int color,attribute,address,verified;
- void *feat;
-}DGNode;
-
-typedef struct{
- int maxNodes,maxArcs;
- int id;
- char *name;
- int numNodes,numArcs;
- DGNode **node;
- DGArc **arc;
-} DGraph;
-
-DGArc *newArc(DGNode *tl,DGNode *hd);
-void arcShow(DGArc *ar);
-DGNode *newNode(char *nm);
-void nodeShow(DGNode* nd);
-
-DGraph* newDGraph(char *nm);
-int AttachNode(DGraph *dg,DGNode *nd);
-int AttachArc(DGraph *dg,DGArc* nar);
-void graphShow(DGraph *dg,int DetailsLevel);
-
-#endif
+++ /dev/null
-SHELL=/bin/sh
-BENCHMARK=dt
-BENCHMARKU=DT
-
-include ../config/make.def
-
-include ../sys/make.common
-#Override PROGRAM
-DTPROGRAM = $(BINDIR)/$(BENCHMARK)-trace.$(CLASS)
-
-OBJS = dt.o DGraph.o \
- ${COMMON}/c_print_results.o ${COMMON}/c_timers.o ${COMMON}/c_randdp.o
-
-
-${PROGRAM}: config ${OBJS}
- ${CLINK} ${CLINKFLAGS} -o ${DTPROGRAM} ${OBJS} ${CMPI_LIB}
-
-.c.o:
- ${CCOMPILE} $<
-
-dt.o: dt.c npbparams.h
-DGraph.o: DGraph.c DGraph.h
-
-clean:
- - rm -f *.o *~ mputil*
- - rm -f dt npbparams.h core
+++ /dev/null
-Data Traffic benchmark DT is new in the NPB suite
-(released as part of NPB3.x-MPI package).
-----------------------------------------------------
-
-DT is written in C and same executable can run on any number of processors,
-provided this number is not less than the number of nodes in the communication
-graph. DT benchmark takes one argument: BH, WH, or SH. This argument
-specifies the communication graph Black Hole, White Hole, or SHuffle
-respectively. The current release contains verification numbers for
-CLASSES S, W, A, and B only. Classes C and D are defined, but verification
-numbers are not provided in this release.
-
-The following table summarizes the number of nodes in the communication
-graph based on CLASS and graph TYPE.
-
-CLASS N_Source N_Nodes(BH,WH) N_Nodes(SH)
- S 4 5 12
- W 8 11 32
- A 16 21 80
- B 32 43 192
- C 64 85 448
- D 128 171 1024
+++ /dev/null
-/*************************************************************************
- * *
- * N A S P A R A L L E L B E N C H M A R K S 3.3 *
- * *
- * D T *
- * *
- *************************************************************************
- * *
- * This benchmark is part of the NAS Parallel Benchmark 3.3 suite. *
- * *
- * Permission to use, copy, distribute and modify this software *
- * for any purpose with or without fee is hereby granted. We *
- * request, however, that all derived work reference the NAS *
- * Parallel Benchmarks 3.3. This software is provided "as is" *
- * without express or implied warranty. *
- * *
- * Information on NPB 3.3, including the technical report, the *
- * original specifications, source code, results and information *
- * on how to submit new results, is available at: *
- * *
- * http: www.nas.nasa.gov/Software/NPB *
- * *
- * Send comments or suggestions to npb@nas.nasa.gov *
- * Send bug reports to npb-bugs@nas.nasa.gov *
- * *
- * NAS Parallel Benchmarks Group *
- * NASA Ames Research Center *
- * Mail Stop: T27A-1 *
- * Moffett Field, CA 94035-1000 *
- * *
- * E-mail: npb@nas.nasa.gov *
- * Fax: (650) 604-3957 *
- * *
- *************************************************************************
- * *
- * Author: M. Frumkin * *
- * *
- *************************************************************************/
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "mpi.h"
-#include "npbparams.h"
-
-#include "simgrid/instr.h" //TRACE_
-
-#ifndef CLASS
-#define CLASS 'S'
-#define NUM_PROCS 1
-#endif
-
-//int passed_verification;
-extern double randlc( double *X, double *A );
-extern
-void c_print_results( char *name,
- char class,
- int n1,
- int n2,
- int n3,
- int niter,
- int nprocs_compiled,
- int nprocs_total,
- double t,
- double mops,
- char *optype,
- int passed_verification,
- char *npbversion,
- char *compiletime,
- char *mpicc,
- char *clink,
- char *cmpi_lib,
- char *cmpi_inc,
- char *cflags,
- char *clinkflags );
-
-void timer_clear( int n );
-void timer_start( int n );
-void timer_stop( int n );
-double timer_read( int n );
-int timer_on=0,timers_tot=64;
-
-int verify(char *bmname,double rnm2){
- double verify_value=0.0;
- double epsilon=1.0E-8;
- char cls=CLASS;
- int verified=-1;
- if (cls != 'U') {
- if(cls=='S') {
- if(strstr(bmname,"BH")){
- verify_value=30892725.0;
- }else if(strstr(bmname,"WH")){
- verify_value=67349758.0;
- }else if(strstr(bmname,"SH")){
- verify_value=58875767.0;
- }else{
- fprintf(stderr,"No such benchmark as %s.\n",bmname);
- }
- verified = 0;
- }else if(cls=='W') {
- if(strstr(bmname,"BH")){
- verify_value = 4102461.0;
- }else if(strstr(bmname,"WH")){
- verify_value = 204280762.0;
- }else if(strstr(bmname,"SH")){
- verify_value = 186944764.0;
- }else{
- fprintf(stderr,"No such benchmark as %s.\n",bmname);
- }
- verified = 0;
- }else if(cls=='A') {
- if(strstr(bmname,"BH")){
- verify_value = 17809491.0;
- }else if(strstr(bmname,"WH")){
- verify_value = 1289925229.0;
- }else if(strstr(bmname,"SH")){
- verify_value = 610856482.0;
- }else{
- fprintf(stderr,"No such benchmark as %s.\n",bmname);
- }
- verified = 0;
- }else if(cls=='B') {
- if(strstr(bmname,"BH")){
- verify_value = 4317114.0;
- }else if(strstr(bmname,"WH")){
- verify_value = 7877279917.0;
- }else if(strstr(bmname,"SH")){
- verify_value = 1836863082.0;
- }else{
- fprintf(stderr,"No such benchmark as %s.\n",bmname);
- verified = 0;
- }
- }else if(cls=='C') {
- if(strstr(bmname,"BH")){
- verify_value = 0.0;
- }else if(strstr(bmname,"WH")){
- verify_value = 0.0;
- }else if(strstr(bmname,"SH")){
- verify_value = 0.0;
- }else{
- fprintf(stderr,"No such benchmark as %s.\n",bmname);
- verified = -1;
- }
- }else if(cls=='D') {
- if(strstr(bmname,"BH")){
- verify_value = 0.0;
- }else if(strstr(bmname,"WH")){
- verify_value = 0.0;
- }else if(strstr(bmname,"SH")){
- verify_value = 0.0;
- }else{
- fprintf(stderr,"No such benchmark as %s.\n",bmname);
- }
- verified = -1;
- }else{
- fprintf(stderr,"No such class as %c.\n",cls);
- }
- fprintf(stderr," %s L2 Norm = %f\n",bmname,rnm2);
- if(verified==-1){
- fprintf(stderr," No verification was performed.\n");
- }else if( rnm2 - verify_value < epsilon &&
- rnm2 - verify_value > -epsilon) { /* abs here does not work on ALTIX */
- verified = 1;
- fprintf(stderr," Deviation = %f\n",(rnm2 - verify_value));
- }else{
- verified = 0;
- fprintf(stderr," The correct verification value = %f\n",verify_value);
- fprintf(stderr," Got value = %f\n",rnm2);
- }
- }else{
- verified = -1;
- }
- return verified;
- }
-
-int ipowMod(int a,long long int n,int md){
- int seed=1,q=a,r=1;
- if(n<0){
- fprintf(stderr,"ipowMod: exponent must be nonnegative exp=%lld\n",n);
- n=-n; /* temp fix */
-/* return 1; */
- }
- if(md<=0){
- fprintf(stderr,"ipowMod: module must be positive mod=%d",md);
- return 1;
- }
- if(n==0) return 1;
- while(n>1){
- int n2 = n/2;
- if (n2*2==n){
- seed = (q*q)%md;
- q=seed;
- n = n2;
- }else{
- seed = (r*q)%md;
- r=seed;
- n = n-1;
- }
- }
- seed = (r*q)%md;
- return seed;
-}
-
-#include "DGraph.h"
-DGraph *buildSH(char cls){
-/*
- Nodes of the graph must be topologically sorted
- to avoid MPI deadlock.
-*/
- DGraph *dg;
- int numSources=NUM_SOURCES; /* must be power of 2 */
- int numOfLayers=0,tmpS=numSources>>1;
- int firstLayerNode=0;
- DGArc *ar=NULL;
- DGNode *nd=NULL;
- int mask=0x0,ndid=0,ndoff=0;
- int i=0,j=0;
- char nm[BLOCK_SIZE];
-
- sprintf(nm,"DT_SH.%c",cls);
- dg=newDGraph(nm);
-
- while(tmpS>1){
- numOfLayers++;
- tmpS>>=1;
- }
- for(i=0;i<numSources;i++){
- sprintf(nm,"Source.%d",i);
- nd=newNode(nm);
- AttachNode(dg,nd);
- }
- for(j=0;j<numOfLayers;j++){
- mask=0x00000001<<j;
- for(i=0;i<numSources;i++){
- sprintf(nm,"Comparator.%d",(i+j*firstLayerNode));
- nd=newNode(nm);
- AttachNode(dg,nd);
- ndoff=i&(~mask);
- ndid=firstLayerNode+ndoff;
- ar=newArc(dg->node[ndid],nd);
- AttachArc(dg,ar);
- ndoff+=mask;
- ndid=firstLayerNode+ndoff;
- ar=newArc(dg->node[ndid],nd);
- AttachArc(dg,ar);
- }
- firstLayerNode+=numSources;
- }
- mask=0x00000001<<numOfLayers;
- for(i=0;i<numSources;i++){
- sprintf(nm,"Sink.%d",i);
- nd=newNode(nm);
- AttachNode(dg,nd);
- ndoff=i&(~mask);
- ndid=firstLayerNode+ndoff;
- ar=newArc(dg->node[ndid],nd);
- AttachArc(dg,ar);
- ndoff+=mask;
- ndid=firstLayerNode+ndoff;
- ar=newArc(dg->node[ndid],nd);
- AttachArc(dg,ar);
- }
-return dg;
-}
-DGraph *buildWH(char cls){
-/*
- Nodes of the graph must be topologically sorted
- to avoid MPI deadlock.
-*/
- int i=0,j=0;
- int numSources=NUM_SOURCES,maxInDeg=4;
- int numLayerNodes=numSources,firstLayerNode=0;
- int totComparators=0;
- int numPrevLayerNodes=numLayerNodes;
- int id=0,sid=0;
- DGraph *dg;
- DGNode *nd=NULL,*source=NULL,*tmp=NULL,*snd=NULL;
- DGArc *ar=NULL;
- char nm[BLOCK_SIZE];
-
- sprintf(nm,"DT_WH.%c",cls);
- dg=newDGraph(nm);
-
- for(i=0;i<numSources;i++){
- sprintf(nm,"Sink.%d",i);
- nd=newNode(nm);
- AttachNode(dg,nd);
- }
- totComparators=0;
- numPrevLayerNodes=numLayerNodes;
- while(numLayerNodes>maxInDeg){
- numLayerNodes=numLayerNodes/maxInDeg;
- if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
- for(i=0;i<numLayerNodes;i++){
- sprintf(nm,"Comparator.%d",totComparators);
- totComparators++;
- nd=newNode(nm);
- id=AttachNode(dg,nd);
- for(j=0;j<maxInDeg;j++){
- sid=i*maxInDeg+j;
- if(sid>=numPrevLayerNodes) break;
- snd=dg->node[firstLayerNode+sid];
- ar=newArc(dg->node[id],snd);
- AttachArc(dg,ar);
- }
- }
- firstLayerNode+=numPrevLayerNodes;
- numPrevLayerNodes=numLayerNodes;
- }
- source=newNode("Source");
- AttachNode(dg,source);
- for(i=0;i<numPrevLayerNodes;i++){
- nd=dg->node[firstLayerNode+i];
- ar=newArc(source,nd);
- AttachArc(dg,ar);
- }
-
- for(i=0;i<dg->numNodes/2;i++){ /* Topological sorting */
- tmp=dg->node[i];
- dg->node[i]=dg->node[dg->numNodes-1-i];
- dg->node[i]->id=i;
- dg->node[dg->numNodes-1-i]=tmp;
- dg->node[dg->numNodes-1-i]->id=dg->numNodes-1-i;
- }
-return dg;
-}
-DGraph *buildBH(char cls){
-/*
- Nodes of the graph must be topologically sorted
- to avoid MPI deadlock.
-*/
- int i=0,j=0;
- int numSources=NUM_SOURCES,maxInDeg=4;
- int numLayerNodes=numSources,firstLayerNode=0;
- DGraph *dg;
- DGNode *nd=NULL, *snd=NULL, *sink=NULL;
- DGArc *ar=NULL;
- int totComparators=0;
- int numPrevLayerNodes=numLayerNodes;
- int id=0, sid=0;
- char nm[BLOCK_SIZE];
-
- sprintf(nm,"DT_BH.%c",cls);
- dg=newDGraph(nm);
-
- for(i=0;i<numSources;i++){
- sprintf(nm,"Source.%d",i);
- nd=newNode(nm);
- AttachNode(dg,nd);
- }
- while(numLayerNodes>maxInDeg){
- numLayerNodes=numLayerNodes/maxInDeg;
- if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
- for(i=0;i<numLayerNodes;i++){
- sprintf(nm,"Comparator.%d",totComparators);
- totComparators++;
- nd=newNode(nm);
- id=AttachNode(dg,nd);
- for(j=0;j<maxInDeg;j++){
- sid=i*maxInDeg+j;
- if(sid>=numPrevLayerNodes) break;
- snd=dg->node[firstLayerNode+sid];
- ar=newArc(snd,dg->node[id]);
- AttachArc(dg,ar);
- }
- }
- firstLayerNode+=numPrevLayerNodes;
- numPrevLayerNodes=numLayerNodes;
- }
- sink=newNode("Sink");
- AttachNode(dg,sink);
- for(i=0;i<numPrevLayerNodes;i++){
- nd=dg->node[firstLayerNode+i];
- ar=newArc(nd,sink);
- AttachArc(dg,ar);
- }
-return dg;
-}
-
-typedef struct{
- int len;
- double* val;
-} Arr;
-Arr *newArr(int len){
- Arr *arr=(Arr *)SMPI_SHARED_MALLOC(sizeof(Arr));
- arr->len=len;
- arr->val=(double *)SMPI_SHARED_MALLOC(len*sizeof(double));
- return arr;
-}
-void arrShow(Arr* a){
- if(!a) fprintf(stderr,"-- NULL array\n");
- else{
- fprintf(stderr,"-- length=%d\n",a->len);
- }
-}
-double CheckVal(Arr *feat){
- double csum=0.0;
- int i=0;
- for(i=0;i<feat->len;i++){
- csum+=feat->val[i]*feat->val[i]/feat->len; /* The truncation does not work since
- result will be 0 for large len */
- }
- return csum;
-}
-int GetFNumDPar(int* mean, int* stdev){
- *mean=NUM_SAMPLES;
- *stdev=STD_DEVIATION;
- return 0;
-}
-int GetFeatureNum(char *mbname,int id){
- double tran=314159265.0;
- double A=2*id+1;
- double denom=randlc(&tran,&A);
- char cval='S';
- int mean=NUM_SAMPLES,stdev=128;
- int rtfs=0,len=0;
- GetFNumDPar(&mean,&stdev);
- rtfs=ipowMod((int)(1/denom)*(int)cval,(long long int) (2*id+1),2*stdev);
- if(rtfs<0) rtfs=-rtfs;
- len=mean-stdev+rtfs;
- return len;
-}
-Arr* RandomFeatures(char *bmname,int fdim,int id){
- int len=GetFeatureNum(bmname,id)*fdim;
- Arr* feat=newArr(len);
- int nxg=2,nyg=2,nzg=2,nfg=5;
- int nx=421,ny=419,nz=1427,nf=3527;
- long long int expon=(len*(id+1))%3141592;
- int seedx=ipowMod(nxg,expon,nx),
- seedy=ipowMod(nyg,expon,ny),
- seedz=ipowMod(nzg,expon,nz),
- seedf=ipowMod(nfg,expon,nf);
- int i=0;
- if(timer_on){
- timer_clear(id+1);
- timer_start(id+1);
- }
- for(i=0;i<len;i+=fdim){
- seedx=(seedx*nxg)%nx;
- seedy=(seedy*nyg)%ny;
- seedz=(seedz*nzg)%nz;
- seedf=(seedf*nfg)%nf;
- feat->val[i]=seedx;
- feat->val[i+1]=seedy;
- feat->val[i+2]=seedz;
- feat->val[i+3]=seedf;
- }
- if(timer_on){
- timer_stop(id+1);
- fprintf(stderr,"** RandomFeatures time in node %d = %f\n",id,timer_read(id+1));
- }
- return feat;
-}
-void Resample(Arr *a,int blen){
- long long int i=0,j=0,jlo=0,jhi=0;
- double avval=0.0;
- double *nval=(double *)SMPI_SHARED_MALLOC(blen*sizeof(double));
- Arr *tmp=newArr(10);
- for(i=0;i<blen;i++) nval[i]=0.0;
- for(i=1;i<a->len-1;i++){
- jlo=(int)(0.5*(2*i-1)*(blen/a->len));
- jhi=(int)(0.5*(2*i+1)*(blen/a->len));
-
- avval=a->val[i]/(jhi-jlo+1);
- for(j=jlo;j<=jhi;j++){
- nval[j]+=avval;
- }
- }
- nval[0]=a->val[0];
- nval[blen-1]=a->val[a->len-1];
- SMPI_SHARED_FREE(a->val);
- a->val=nval;
- a->len=blen;
-}
-#define fielddim 4
-Arr* WindowFilter(Arr *a, Arr* b,int w){
- int i=0,j=0,k=0;
- double rms0=0.0,rms1=0.0,rmsm1=0.0;
- double weight=((double) (w+1))/(w+2);
-
- w+=1;
- if(timer_on){
- timer_clear(w);
- timer_start(w);
- }
- if(a->len<b->len) Resample(a,b->len);
- if(a->len>b->len) Resample(b,a->len);
- for(i=fielddim;i<a->len-fielddim;i+=fielddim){
- rms0=(a->val[i]-b->val[i])*(a->val[i]-b->val[i])
- +(a->val[i+1]-b->val[i+1])*(a->val[i+1]-b->val[i+1])
- +(a->val[i+2]-b->val[i+2])*(a->val[i+2]-b->val[i+2])
- +(a->val[i+3]-b->val[i+3])*(a->val[i+3]-b->val[i+3]);
- j=i+fielddim;
- rms1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
- +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
- +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
- +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
- j=i-fielddim;
- rmsm1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
- +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
- +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
- +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
- k=0;
- if(rms1<rms0){
- k=1;
- rms0=rms1;
- }
- if(rmsm1<rms0) k=-1;
- if(k==0){
- j=i+fielddim;
- a->val[i]=weight*b->val[i];
- a->val[i+1]=weight*b->val[i+1];
- a->val[i+2]=weight*b->val[i+2];
- a->val[i+3]=weight*b->val[i+3];
- }else if(k==1){
- j=i+fielddim;
- a->val[i]=weight*b->val[j];
- a->val[i+1]=weight*b->val[j+1];
- a->val[i+2]=weight*b->val[j+2];
- a->val[i+3]=weight*b->val[j+3];
- }else { /*if(k==-1)*/
- j=i-fielddim;
- a->val[i]=weight*b->val[j];
- a->val[i+1]=weight*b->val[j+1];
- a->val[i+2]=weight*b->val[j+2];
- a->val[i+3]=weight*b->val[j+3];
- }
- }
- if(timer_on){
- timer_stop(w);
- fprintf(stderr,"** WindowFilter time in node %d = %f\n",(w-1),timer_read(w));
- }
- return a;
-}
-
-int SendResults(DGraph *dg,DGNode *nd,Arr *feat){
- int i=0,tag=0;
- DGArc *ar=NULL;
- DGNode *head=NULL;
- if(!feat) return 0;
- TRACE_smpi_set_category ("SendResults");
- for(i=0;i<nd->outDegree;i++){
- ar=nd->outArc[i];
- if(ar->tail!=nd) continue;
- head=ar->head;
- tag=ar->id;
- if(head->address!=nd->address){
- MPI_Send(&feat->len,1,MPI_INT,head->address,tag,MPI_COMM_WORLD);
- MPI_Send(feat->val,feat->len,MPI_DOUBLE,head->address,tag,MPI_COMM_WORLD);
- }
- }
- TRACE_smpi_set_category (NULL);
- return 1;
-}
-Arr* CombineStreams(DGraph *dg,DGNode *nd){
- Arr *resfeat=newArr(NUM_SAMPLES*fielddim);
- int i=0,len=0,tag=0;
- DGArc *ar=NULL;
- DGNode *tail=NULL;
- MPI_Status status;
- Arr *feat=NULL,*featp=NULL;
-
- if(nd->inDegree==0) return NULL;
- for(i=0;i<nd->inDegree;i++){
- ar=nd->inArc[i];
- if(ar->head!=nd) continue;
- tail=ar->tail;
- if(tail->address!=nd->address){
- len=0;
- tag=ar->id;
- MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
- feat=newArr(len);
- MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
- resfeat=WindowFilter(resfeat,feat,nd->id);
- SMPI_SHARED_FREE(feat);
- }else{
- featp=(Arr *)tail->feat;
- feat=newArr(featp->len);
- memcpy(feat->val,featp->val,featp->len*sizeof(double));
- resfeat=WindowFilter(resfeat,feat,nd->id);
- SMPI_SHARED_FREE(feat);
- }
- }
- for(i=0;i<resfeat->len;i++) resfeat->val[i]=((int)resfeat->val[i])/nd->inDegree;
- nd->feat=resfeat;
- return nd->feat;
-}
-double Reduce(Arr *a,int w){
- double retv=0.0;
- if(timer_on){
- timer_clear(w);
- timer_start(w);
- }
- retv=(int)(w*CheckVal(a));/* The casting needed for node
- and array dependent verifcation */
- if(timer_on){
- timer_stop(w);
- fprintf(stderr,"** Reduce time in node %d = %f\n",(w-1),timer_read(w));
- }
- return retv;
-}
-
-double ReduceStreams(DGraph *dg,DGNode *nd){
- double csum=0.0;
- int i=0,len=0,tag=0;
- DGArc *ar=NULL;
- DGNode *tail=NULL;
- Arr *feat=NULL;
- double retv=0.0;
-
- TRACE_smpi_set_category ("ReduceStreams");
-
- for(i=0;i<nd->inDegree;i++){
- ar=nd->inArc[i];
- if(ar->head!=nd) continue;
- tail=ar->tail;
- if(tail->address!=nd->address){
- MPI_Status status;
- len=0;
- tag=ar->id;
- MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
- feat=newArr(len);
- MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
- csum+=Reduce(feat,(nd->id+1));
- SMPI_SHARED_FREE(feat);
- }else{
- csum+=Reduce(tail->feat,(nd->id+1));
- }
- }
- if(nd->inDegree>0)csum=(((long long int)csum)/nd->inDegree);
- retv=(nd->id+1)*csum;
- return retv;
-}
-
-int ProcessNodes(DGraph *dg,int me){
- double chksum=0.0;
- Arr *feat=NULL;
- int i=0,verified=0,tag;
- DGNode *nd=NULL;
- double rchksum=0.0;
- MPI_Status status;
-
- TRACE_smpi_set_category ("ProcessNodes");
-
-
- for(i=0;i<dg->numNodes;i++){
- nd=dg->node[i];
- if(nd->address!=me) continue;
- if(strstr(nd->name,"Source")){
- nd->feat=RandomFeatures(dg->name,fielddim,nd->id);
- SendResults(dg,nd,nd->feat);
- }else if(strstr(nd->name,"Sink")){
- chksum=ReduceStreams(dg,nd);
- tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */
- MPI_Send(&chksum,1,MPI_DOUBLE,0,tag,MPI_COMM_WORLD);
- }else{
- feat=CombineStreams(dg,nd);
- SendResults(dg,nd,feat);
- }
- }
-
- TRACE_smpi_set_category ("ProcessNodes");
-
-
- if(me==0){ /* Report node */
- rchksum=0.0;
- chksum=0.0;
- for(i=0;i<dg->numNodes;i++){
- nd=dg->node[i];
- if(!strstr(nd->name,"Sink")) continue;
- tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */
- MPI_Recv(&rchksum,1,MPI_DOUBLE,nd->address,tag,MPI_COMM_WORLD,&status);
- chksum+=rchksum;
- }
- verified=verify(dg->name,chksum);
- }
-return verified;
-}
-
-int main(int argc,char **argv ){
- int my_rank,comm_size;
- int i;
- DGraph *dg=NULL;
- int verified=0, featnum=0;
- double bytes_sent=2.0,tot_time=0.0;
-
-
-
- MPI_Init( &argc, &argv );
- MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
- MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
- TRACE_smpi_set_category ("begin");
-
- if(argc!=2||
- ( strncmp(argv[1],"BH",2)!=0
- &&strncmp(argv[1],"WH",2)!=0
- &&strncmp(argv[1],"SH",2)!=0
- )
- ){
- if(my_rank==0){
- fprintf(stderr,"** Usage: mpirun -np N ../bin/dt.S GraphName\n");
- fprintf(stderr,"** Where \n - N is integer number of MPI processes\n");
- fprintf(stderr," - S is the class S, W, or A \n");
- fprintf(stderr," - GraphName is the communication graph name BH, WH, or SH.\n");
- fprintf(stderr," - the number of MPI processes N should not be be less than \n");
- fprintf(stderr," the number of nodes in the graph\n");
- }
- MPI_Finalize();
- exit(0);
- }
- if(strncmp(argv[1],"BH",2)==0){
- dg=buildBH(CLASS);
- }else if(strncmp(argv[1],"WH",2)==0){
- dg=buildWH(CLASS);
- }else if(strncmp(argv[1],"SH",2)==0){
- dg=buildSH(CLASS);
- }
-
- if(timer_on&&dg->numNodes+1>timers_tot){
- timer_on=0;
- if(my_rank==0)
- fprintf(stderr,"Not enough timers. Node timeing is off. \n");
- }
- if(dg->numNodes>comm_size){
- if(my_rank==0){
- fprintf(stderr,"** The number of MPI processes should not be less than \n");
- fprintf(stderr,"** the number of nodes in the graph\n");
- fprintf(stderr,"** Number of MPI processes = %d\n",comm_size);
- fprintf(stderr,"** Number nodes in the graph = %d\n",dg->numNodes);
- }
- MPI_Finalize();
- exit(0);
- }
- for(i=0;i<dg->numNodes;i++){
- dg->node[i]->address=i;
- }
- if( my_rank == 0 ){
- printf( "\n\n NAS Parallel Benchmarks 3.3 -- DT Benchmark\n\n" );
- graphShow(dg,0);
- timer_clear(0);
- timer_start(0);
- }
-
- verified=ProcessNodes(dg,my_rank);
- TRACE_smpi_set_category ("end");
-
- featnum=NUM_SAMPLES*fielddim;
- bytes_sent=featnum*dg->numArcs;
- bytes_sent/=1048576;
- if(my_rank==0){
- timer_stop(0);
- tot_time=timer_read(0);
- c_print_results( dg->name,
- CLASS,
- featnum,
- 0,
- 0,
- dg->numNodes,
- 0,
- comm_size,
- tot_time,
- bytes_sent/tot_time,
- "bytes transmitted",
- verified,
- NPBVERSION,
- COMPILETIME,
- MPICC,
- CLINK,
- CMPI_LIB,
- CMPI_INC,
- CFLAGS,
- CLINKFLAGS );
- }
- MPI_Finalize();
- return 1;
-}
#include "mpi.h"
#include "npbparams.h"
+#include "simgrid/instr.h" //TRACE_
+
#ifndef CLASS
#define CLASS 'S'
#define NUM_PROCS 1
DGArc *ar=NULL;
DGNode *head=NULL;
if(!feat) return 0;
+ TRACE_smpi_set_category ("SendResults");
for(i=0;i<nd->outDegree;i++){
ar=nd->outArc[i];
if(ar->tail!=nd) continue;
MPI_Send(feat->val,feat->len,MPI_DOUBLE,head->address,tag,MPI_COMM_WORLD);
}
}
+ TRACE_smpi_set_category (NULL);
return 1;
}
Arr* CombineStreams(DGraph *dg,DGNode *nd){
Arr *feat=NULL;
double retv=0.0;
+ TRACE_smpi_set_category ("ReduceStreams");
+
for(i=0;i<nd->inDegree;i++){
ar=nd->inArc[i];
if(ar->head!=nd) continue;
double rchksum=0.0;
MPI_Status status;
+ TRACE_smpi_set_category ("ProcessNodes");
+
for(i=0;i<dg->numNodes;i++){
nd=dg->node[i];
if(nd->address!=me) continue;
SendResults(dg,nd,feat);
}
}
+
+ TRACE_smpi_set_category ("ProcessNodes");
+
if(me==0){ /* Report node */
rchksum=0.0;
chksum=0.0;
MPI_Init( &argc, &argv );
MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
+ TRACE_smpi_set_category ("begin");
if(argc!=2||
( strncmp(argv[1],"BH",2)!=0
timer_start(0);
}
verified=ProcessNodes(dg,my_rank);
-
+ TRACE_smpi_set_category ("end");
+
featnum=NUM_SAMPLES*fielddim;
bytes_sent=featnum*dg->numArcs;
bytes_sent/=1048576;
+++ /dev/null
-SHELL=/bin/sh
-BENCHMARK=ep
-BENCHMARKU=EP
-
-include ../config/make.def
-
-#OBJS = ep-trace.o ${COMMON}/print_results.o ${COMMON}/${RAND}.o ${COMMON}/timers.o
-OBJS = ep-trace.o randlc.o
-
-include ../sys/make.common
-
-${PROGRAM}: config ${OBJS}
-# ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
- ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${CMPI_LIB}
-
-
-#ep-trace.o: ep-trace.f mpinpb.h npbparams.h
-# ${FCOMPILE} ep-trace.f
-
-ep-trace.o: ep-trace.c randlc.c mpinpb.h npbparams.h
- ${CCOMPILE} ep-trace.c
-
-clean:
- - rm -f *.o *~
- - rm -f npbparams.h core
-
-
-
+++ /dev/null
-This code implements the random-number generator described in the
-NAS Parallel Benchmark document RNR Technical Report RNR-94-007.
-The code is "embarrassingly" parallel in that no communication is
-required for the generation of the random numbers itself. There is
-no special requirement on the number of processors used for running
-the benchmark.
+++ /dev/null
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-
-#include "mpi.h"
-#include "npbparams.h"
-
-#include "simgrid/instr.h" //TRACE_
-
-#include "randlc.h"
-
-#ifndef CLASS
-#define CLASS 'S'
-#define NUM_PROCS 1
-#endif
-#define true 1
-#define false 0
-
-
-//---NOTE : all the timers function have been modified to
-// avoid global timers (privatize these).
- // ----------------------- timers ---------------------
- void timer_clear(double *onetimer) {
- //elapsed[n] = 0.0;
- *onetimer = 0.0;
- }
-
- void timer_start(double *onetimer) {
- *onetimer = MPI_Wtime();
- }
-
- void timer_stop(int n,double *elapsed,double *start) {
- double t, now;
-
- now = MPI_Wtime();
- t = now - start[n];
- elapsed[n] += t;
- }
-
- double timer_read(int n, double *elapsed) { /* ok, useless, but jsut to keep function call */
- return(elapsed[n]);
- }
- /********************************************************************
- ***************** V R A N L C ******************
- ***************** *****************/
- double vranlc(int n, double x, double a, double *y)
- {
- int i;
- long i246m1=0x00003FFFFFFFFFFF;
- long LLx, Lx, La;
- double d2m46;
-
-// This doesn't work, because the compiler does the calculation in 32
-// bits and overflows. No standard way (without f90 stuff) to specify
-// that the rhs should be done in 64 bit arithmetic.
-// parameter(i246m1=2**46-1)
-
- d2m46=pow(0.5,46);
-
-// c Note that the v6 compiler on an R8000 does something stupid with
-// c the above. Using the following instead (or various other things)
-// c makes the calculation run almost 10 times as fast.
-//
-// c save d2m46
-// c data d2m46/0.0d0/
-// c if (d2m46 .eq. 0.0d0) then
-// c d2m46 = 0.5d0**46
-// c endif
-
- Lx = (long)x;
- La = (long)a;
- //fprintf(stdout,("================== Vranlc ================");
- //fprintf(stdout,("Before Loop: Lx = " + Lx + ", La = " + La);
- LLx = Lx;
- for (i=0; i< n; i++) {
- Lx = Lx*La & i246m1 ;
- LLx = Lx;
- y[i] = d2m46 * (double)LLx;
- /*
- if(i == 0) {
- fprintf(stdout,("After loop 0:");
- fprintf(stdout,("Lx = " + Lx + ", La = " + La);
- fprintf(stdout,("d2m46 = " + d2m46);
- fprintf(stdout,("LLX(Lx) = " + LLX.doubleValue());
- fprintf(stdout,("Y[0]" + y[0]);
- }
- */
- }
-
- x = (double)LLx;
- /*
- fprintf(stdout,("Change: Lx = " + Lx);
- fprintf(stdout,("=============End Vranlc ================");
- */
- return x;
- }
-
-
-
-//-------------- the core (unique function) -----------
- void doTest(int argc, char **argv) {
- double dum[3] = {1.,1.,1.};
- double x1, x2, sx, sy, tm, an, tt, gc;
- double Mops;
- double epsilon=1.0E-8, a = 1220703125., s=271828183.;
- double t1, t2, t3, t4;
- double sx_verify_value, sy_verify_value, sx_err, sy_err;
-
-#include "npbparams.h"
- int mk=16,
- // --> set by make : in npbparams.h
- //m=28, // for CLASS=A
- //m=30, // for CLASS=B
- //npm=2, // NPROCS
- mm = m-mk,
- nn = (int)(pow(2,mm)),
- nk = (int)(pow(2,mk)),
- nq=10,
- np,
- node,
- no_nodes,
- i,
- ik,
- kk,
- l,
- k, nit, no_large_nodes,
- np_add, k_offset, j;
- int me, nprocs, root=0, dp_type;
- int verified,
- timers_enabled=true;
- char size[500]; // mind the size of the string to represent a big number
-
- //Use in randlc..
- int KS = 0;
- double R23, R46, T23, T46;
-
- double *qq = (double *) malloc (10000*sizeof(double));
- double *start = (double *) malloc (64*sizeof(double));
- double *elapsed = (double *) malloc (64*sizeof(double));
-
- double *x = (double *) malloc (2*nk*sizeof(double));
- double *q = (double *) malloc (nq*sizeof(double));
-
- TRACE_smpi_set_category ("start");
-
- MPI_Init( &argc, &argv );
- MPI_Comm_size( MPI_COMM_WORLD, &no_nodes);
- MPI_Comm_rank( MPI_COMM_WORLD, &node);
-
-#ifdef USE_MPE
- MPE_Init_log();
-#endif
- root = 0;
- if (node == root ) {
-
- /* Because the size of the problem is too large to store in a 32-bit
- * integer for some classes, we put it into a string (for printing).
- * Have to strip off the decimal point put in there by the floating
- * point print statement (internal file)
- */
- fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark");
- sprintf(size,"%d",pow(2,m+1));
- //size = size.replace('.', ' ');
- fprintf(stdout," Number of random numbers generated: %s\n",size);
- fprintf(stdout," Number of active processes: %d\n",no_nodes);
-
- }
- verified = false;
-
- /* c Compute the number of "batches" of random number pairs generated
- c per processor. Adjust if the number of processors does not evenly
- c divide the total number
-*/
-
- np = nn / no_nodes;
- no_large_nodes = nn % no_nodes;
- if (node < no_large_nodes) np_add = 1;
- else np_add = 0;
- np = np + np_add;
-
- if (np == 0) {
- fprintf(stdout,"Too many nodes: %d %d",no_nodes,nn);
- MPI_Abort(MPI_COMM_WORLD,1);
- exit(0);
- }
-
-/* c Call the random number generator functions and initialize
- c the x-array to reduce the effects of paging on the timings.
- c Also, call all mathematical functions that are used. Make
- c sure these initializations cannot be eliminated as dead code.
-*/
-
- //call vranlc(0, dum[1], dum[2], dum[3]);
- // Array indexes start at 1 in Fortran, 0 in Java
- vranlc(0, dum[0], dum[1], &(dum[2]));
-
- dum[0] = randlc(&(dum[1]),&(dum[2]));
- /////////////////////////////////
- for (i=0;i<2*nk;i++) {
- x[i] = -1e99;
- }
- Mops = log(sqrt(abs(1)));
-
- /*
- c---------------------------------------------------------------------
- c Synchronize before placing time stamp
- c---------------------------------------------------------------------
- */
- MPI_Barrier( MPI_COMM_WORLD );
-
-
- TRACE_smpi_set_category ("ep");
-
- timer_clear(&(elapsed[1]));
- timer_clear(&(elapsed[2]));
- timer_clear(&(elapsed[3]));
- timer_start(&(start[1]));
-
- t1 = a;
- //fprintf(stdout,("(ep.f:160) t1 = " + t1);
- t1 = vranlc(0, t1, a, x);
- //fprintf(stdout,("(ep.f:161) t1 = " + t1);
-
-
-/* c Compute AN = A ^ (2 * NK) (mod 2^46). */
-
- t1 = a;
- //fprintf(stdout,("(ep.f:165) t1 = " + t1);
- for (i=1; i <= mk+1; i++) {
- t2 = randlc(&t1, &t1);
- //fprintf(stdout,("(ep.f:168)[loop i=" + i +"] t1 = " + t1);
- }
- an = t1;
- //fprintf(stdout,("(ep.f:172) s = " + s);
- tt = s;
- gc = 0.;
- sx = 0.;
- sy = 0.;
- for (i=0; i < nq ; i++) {
- q[i] = 0.;
- }
-
-/*
- Each instance of this loop may be performed independently. We compute
- the k offsets separately to take into account the fact that some nodes
- have more numbers to generate than others
-*/
-
- if (np_add == 1)
- k_offset = node * np -1;
- else
- k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1;
-
- int stop = false;
- for(k = 1; k <= np; k++) {
- stop = false;
- kk = k_offset + k ;
- t1 = s;
- //fprintf(stdout,("(ep.f:193) t1 = " + t1);
- t2 = an;
-
-// Find starting seed t1 for this kk.
-
- for (i=1;i<=100 && !stop;i++) {
- ik = kk / 2;
- //fprintf(stdout,("(ep.f:199) ik = " +ik+", kk = " + kk);
- if (2 * ik != kk) {
- t3 = randlc(&t1, &t2);
- //fprintf(stdout,("(ep.f:200) t1= " +t1 );
- }
- if (ik==0)
- stop = true;
- else {
- t3 = randlc(&t2, &t2);
- kk = ik;
- }
- }
-// Compute uniform pseudorandom numbers.
-
- //if (timers_enabled) timer_start(3);
- timer_start(&(start[3]));
- //call vranlc(2 * nk, t1, a, x) --> t1 and y are modified
-
- //fprintf(stdout,">>>>>>>>>>>Before vranlc(l.210)<<<<<<<<<<<<<");
- //fprintf(stdout,"2*nk = " + (2*nk));
- //fprintf(stdout,"t1 = " + t1);
- //fprintf(stdout,"a = " + a);
- //fprintf(stdout,"x[0] = " + x[0]);
- //fprintf(stdout,">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
-
- t1 = vranlc(2 * nk, t1, a, x);
-
- //fprintf(stdout,(">>>>>>>>>>>After Enter vranlc (l.210)<<<<<<");
- //fprintf(stdout,("2*nk = " + (2*nk));
- //fprintf(stdout,("t1 = " + t1);
- //fprintf(stdout,("a = " + a);
- //fprintf(stdout,("x[0] = " + x[0]);
- //fprintf(stdout,(">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
-
- //if (timers_enabled) timer_stop(3);
- timer_stop(3,elapsed,start);
-
-/* Compute Gaussian deviates by acceptance-rejection method and
- * tally counts in concentric square annuli. This loop is not
- * vectorizable.
- */
- //if (timers_enabled) timer_start(2);
- timer_start(&(start[2]));
- for(i=1; i<=nk;i++) {
- x1 = 2. * x[2*i-2] -1.0;
- x2 = 2. * x[2*i-1] - 1.0;
- t1 = x1*x1 + x2*x2;
- if (t1 <= 1.) {
- t2 = sqrt(-2. * log(t1) / t1);
- t3 = (x1 * t2);
- t4 = (x2 * t2);
- l = (int)(abs(t3) > abs(t4) ? abs(t3) : abs(t4));
- q[l] = q[l] + 1.;
- sx = sx + t3;
- sy = sy + t4;
- }
- /*
- if(i == 1) {
- fprintf(stdout,"x1 = " + x1);
- fprintf(stdout,"x2 = " + x2);
- fprintf(stdout,"t1 = " + t1);
- fprintf(stdout,"t2 = " + t2);
- fprintf(stdout,"t3 = " + t3);
- fprintf(stdout,"t4 = " + t4);
- fprintf(stdout,"l = " + l);
- fprintf(stdout,"q[l] = " + q[l]);
- fprintf(stdout,"sx = " + sx);
- fprintf(stdout,"sy = " + sy);
- }
- */
- }
- //if (timers_enabled) timer_stop(2);
- timer_stop(2,elapsed,start);
- }
-
- TRACE_smpi_set_category ("finalize");
-
- //int MPI_Allreduce(void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
- MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
- sx = x[0]; //FIXME : x[0] or x[1] => x[0] because fortran starts with 1
- MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
- sy = x[0];
- MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-
- for(i = 0; i < nq; i++) {
- q[i] = x[i];
- }
- for(i = 0; i < nq; i++) {
- gc += q[i];
- }
-
- timer_stop(1,elapsed,start);
- tm = timer_read(1,elapsed);
- MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
- tm = x[0];
-
- if(node == root) {
- nit = 0;
- verified = true;
-
- if(m == 24) {
- sx_verify_value = -3.247834652034740E3;
- sy_verify_value = -6.958407078382297E3;
- } else if(m == 25) {
- sx_verify_value = -2.863319731645753E3;
- sy_verify_value = -6.320053679109499E3;
- } else if(m == 28) {
- sx_verify_value = -4.295875165629892E3;
- sy_verify_value = -1.580732573678431E4;
- } else if(m == 30) {
- sx_verify_value = 4.033815542441498E4;
- sy_verify_value = -2.660669192809235E4;
- } else if(m == 32) {
- sx_verify_value = 4.764367927995374E4;
- sy_verify_value = -8.084072988043731E4;
- } else if(m == 36) {
- sx_verify_value = 1.982481200946593E5;
- sy_verify_value = -1.020596636361769E5;
- } else {
- verified = false;
- }
-
- /*
- fprintf(stdout,("sx = " + sx);
- fprintf(stdout,("sx_verify = " + sx_verify_value);
- fprintf(stdout,("sy = " + sy);
- fprintf(stdout,("sy_verify = " + sy_verify_value);
- */
- if(verified) {
- sx_err = abs((sx - sx_verify_value)/sx_verify_value);
- sy_err = abs((sy - sy_verify_value)/sy_verify_value);
- /*
- fprintf(stdout,("sx_err = " + sx_err);
- fprintf(stdout,("sy_err = " + sx_err);
- fprintf(stdout,("epsilon= " + epsilon);
- */
- verified = ((sx_err < epsilon) && (sy_err < epsilon));
- }
-
- Mops = (pow(2.0, m+1))/tm/1000;
-
- fprintf(stdout,"EP Benchmark Results:\n");
- fprintf(stdout,"CPU Time=%d\n",tm);
- fprintf(stdout,"N = 2^%d\n",m);
- fprintf(stdout,"No. Gaussain Pairs =%d\n",gc);
- fprintf(stdout,"Sum = %f %ld\n",sx,sy);
- fprintf(stdout,"Count:");
- for(i = 0; i < nq; i++) {
- fprintf(stdout,"%d\t %ld\n",i,q[i]);
- }
-
- /*
- print_results("EP", _class, m+1, 0, 0, nit, npm, no_nodes, tm, Mops,
- "Random numbers generated", verified, npbversion,
- compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) */
- fprintf(stdout,"\nEP Benchmark Completed\n");
- fprintf(stdout,"Class = %s\n", _class);
- fprintf(stdout,"Size = %s\n", size);
- fprintf(stdout,"Iteration = %d\n", nit);
- fprintf(stdout,"Time in seconds = %f\n",(tm/1000));
- fprintf(stdout,"Total processes = %d\n",no_nodes);
- fprintf(stdout,"Mops/s total = %f\n",Mops);
- fprintf(stdout,"Mops/s/process = %f\n", Mops/no_nodes);
- fprintf(stdout,"Operation type = Random number generated\n");
- if(verified) {
- fprintf(stdout,"Verification = SUCCESSFUL\n");
- } else {
- fprintf(stdout,"Verification = UNSUCCESSFUL\n");
- }
- fprintf(stdout,"Total time: %f\n",(timer_read(1,elapsed)/1000));
- fprintf(stdout,"Gaussian pairs: %f\n",(timer_read(2,elapsed)/1000));
- fprintf(stdout,"Random numbers: %f\n",(timer_read(3,elapsed)/1000));
- }
-#ifdef USE_MPE
- MPE_Finish_log(argv[0]);
-#endif
-
- MPI_Finalize();
- }
-
- int main(int argc, char **argv) {
- doTest(argc,argv);
- }
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'mpif.h'
-
- integer me, nprocs, root, dp_type
- common /mpistuff/ me, nprocs, root, dp_type
-
+++ /dev/null
-
-/*
- * FUNCTION RANDLC (X, A)
- *
- * This routine returns a uniform pseudorandom double precision number in the
- * range (0, 1) by using the linear congruential generator
- *
- * x_{k+1} = a x_k (mod 2^46)
- *
- * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
- * before repeating. The argument A is the same as 'a' in the above formula,
- * and X is the same as x_0. A and X must be odd double precision integers
- * in the range (1, 2^46). The returned value RANDLC is normalized to be
- * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
- * the new seed x_1, so that subsequent calls to RANDLC using the same
- * arguments will generate a continuous sequence.
- *
- * This routine should produce the same results on any computer with at least
- * 48 mantissa bits in double precision floating point data. On Cray systems,
- * double precision should be disabled.
- *
- * David H. Bailey October 26, 1990
- *
- * IMPLICIT DOUBLE PRECISION (A-H, O-Z)
- * SAVE KS, R23, R46, T23, T46
- * DATA KS/0/
- *
- * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
- * T23 = 2 ^ 23, and T46 = 2 ^ 46. These are computed in loops, rather than
- * by merely using the ** operator, in order to insure that the results are
- * exact on all systems. This code assumes that 0.5D0 is represented exactly.
- */
-
-
-/*****************************************************************/
-/************* R A N D L C ************/
-/************* ************/
-/************* portable random number generator ************/
-/*****************************************************************/
-
-double randlc( double *X, double *A )
-{
- static int KS=0;
- static double R23, R46, T23, T46;
- double T1, T2, T3, T4;
- double A1;
- double A2;
- double X1;
- double X2;
- double Z;
- int i, j;
-
- if (KS == 0)
- {
- R23 = 1.0;
- R46 = 1.0;
- T23 = 1.0;
- T46 = 1.0;
-
- for (i=1; i<=23; i++)
- {
- R23 = 0.50 * R23;
- T23 = 2.0 * T23;
- }
- for (i=1; i<=46; i++)
- {
- R46 = 0.50 * R46;
- T46 = 2.0 * T46;
- }
- KS = 1;
- }
-
-/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */
-
- T1 = R23 * *A;
- j = T1;
- A1 = j;
- A2 = *A - T23 * A1;
-
-/* Break X into two parts such that X = 2^23 * X1 + X2, compute
- Z = A1 * X2 + A2 * X1 (mod 2^23), and then
- X = 2^23 * Z + A2 * X2 (mod 2^46). */
-
- T1 = R23 * *X;
- j = T1;
- X1 = j;
- X2 = *X - T23 * X1;
- T1 = A1 * X2 + A2 * X1;
-
- j = R23 * T1;
- T2 = j;
- Z = T1 - T23 * T2;
- T3 = T23 * Z + A2 * X2;
- j = R46 * T3;
- T4 = j;
- *X = T3 - T46 * T4;
- return(R46 * *X);
-}
-
-
-
-/*****************************************************************/
-/************ F I N D _ M Y _ S E E D ************/
-/************ ************/
-/************ returns parallel random number seq seed ************/
-/*****************************************************************/
-
+++ /dev/null
-
-double randlc( double *X, double *A );
-
#include "mpi.h"
#include "npbparams.h"
+#include "simgrid/instr.h" //TRACE_
+
#include "randlc.h"
#ifndef CLASS
double *x = (double *) malloc (2*nk*sizeof(double));
double *q = (double *) malloc (nq*sizeof(double));
+ TRACE_smpi_set_category ("start");
+
MPI_Init( &argc, &argv );
MPI_Comm_size( MPI_COMM_WORLD, &no_nodes);
MPI_Comm_rank( MPI_COMM_WORLD, &node);
*/
MPI_Barrier( MPI_COMM_WORLD );
+ TRACE_smpi_set_category ("ep");
+
timer_clear(&(elapsed[1]));
timer_clear(&(elapsed[2]));
timer_clear(&(elapsed[3]));
timer_stop(2,elapsed,start);
}
+ TRACE_smpi_set_category ("finalize");
+
//int MPI_Allreduce(void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
sx = x[0]; //FIXME : x[0] or x[1] => x[0] because fortran starts with 1
+++ /dev/null
-SHELL=/bin/sh
-BENCHMARK=is
-BENCHMARKU=IS
-
-include ../config/make.def
-
-include ../sys/make.common
-
-OBJS = is-trace.o ${COMMON}/c_print_results.o
-
-
-${PROGRAM}: config ${OBJS}
- ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${CMPI_LIB}
-
-.c.o:
- ${CCOMPILE} $<
-
-is-trace.o: is-trace.c npbparams.h
-
-clean:
- - rm -f *.o *~ mputil*
- - rm -f is-trace npbparams.h core
+++ /dev/null
-/*************************************************************************
- * *
- * N A S P A R A L L E L B E N C H M A R K S 3.3 *
- * *
- * I S *
- * *
- *************************************************************************
- * *
- * This benchmark is part of the NAS Parallel Benchmark 3.3 suite. *
- * It is described in NAS Technical Report 95-020. *
- * *
- * Permission to use, copy, distribute and modify this software *
- * for any purpose with or without fee is hereby granted. We *
- * request, however, that all derived work reference the NAS *
- * Parallel Benchmarks 3.3. This software is provided "as is" *
- * without express or implied warranty. *
- * *
- * Information on NPB 3.3, including the technical report, the *
- * original specifications, source code, results and information *
- * on how to submit new results, is available at: *
- * *
- * http://www.nas.nasa.gov/Software/NPB *
- * *
- * Send comments or suggestions to npb@nas.nasa.gov *
- * Send bug reports to npb-bugs@nas.nasa.gov *
- * *
- * NAS Parallel Benchmarks Group *
- * NASA Ames Research Center *
- * Mail Stop: T27A-1 *
- * Moffett Field, CA 94035-1000 *
- * *
- * E-mail: npb@nas.nasa.gov *
- * Fax: (650) 604-3957 *
- * *
- *************************************************************************
- * *
- * Author: M. Yarrow *
- * H. Jin *
- * *
- *************************************************************************/
-
-#include "mpi.h"
-#include "npbparams.h"
-#include <stdlib.h>
-#include <stdio.h>
-
-#include "simgrid/instr.h" //TRACE_
-
-/******************/
-/* default values */
-/******************/
-#ifndef CLASS
-#define CLASS 'S'
-#define NUM_PROCS 1
-#endif
-#define MIN_PROCS 1
-
-
-/*************/
-/* CLASS S */
-/*************/
-#if CLASS == 'S'
-#define TOTAL_KEYS_LOG_2 16
-#define MAX_KEY_LOG_2 11
-#define NUM_BUCKETS_LOG_2 9
-#endif
-
-
-/*************/
-/* CLASS W */
-/*************/
-#if CLASS == 'W'
-#define TOTAL_KEYS_LOG_2 20
-#define MAX_KEY_LOG_2 16
-#define NUM_BUCKETS_LOG_2 10
-#endif
-
-/*************/
-/* CLASS A */
-/*************/
-#if CLASS == 'A'
-#define TOTAL_KEYS_LOG_2 23
-#define MAX_KEY_LOG_2 19
-#define NUM_BUCKETS_LOG_2 10
-#endif
-
-
-/*************/
-/* CLASS B */
-/*************/
-#if CLASS == 'B'
-#define TOTAL_KEYS_LOG_2 25
-#define MAX_KEY_LOG_2 21
-#define NUM_BUCKETS_LOG_2 10
-#endif
-
-
-/*************/
-/* CLASS C */
-/*************/
-#if CLASS == 'C'
-#define TOTAL_KEYS_LOG_2 27
-#define MAX_KEY_LOG_2 23
-#define NUM_BUCKETS_LOG_2 10
-#endif
-
-
-/*************/
-/* CLASS D */
-/*************/
-#if CLASS == 'D'
-#define TOTAL_KEYS_LOG_2 29
-#define MAX_KEY_LOG_2 27
-#define NUM_BUCKETS_LOG_2 10
-#undef MIN_PROCS
-#define MIN_PROCS 4
-#endif
-
-
-#define TOTAL_KEYS (1 << TOTAL_KEYS_LOG_2)
-#define MAX_KEY (1 << MAX_KEY_LOG_2)
-#define NUM_BUCKETS (1 << NUM_BUCKETS_LOG_2)
-#define NUM_KEYS (TOTAL_KEYS/NUM_PROCS*MIN_PROCS)
-
-/*****************************************************************/
-/* On larger number of processors, since the keys are (roughly) */
-/* gaussian distributed, the first and last processor sort keys */
-/* in a large interval, requiring array sizes to be larger. Note */
-/* that for large NUM_PROCS, NUM_KEYS is, however, a small number*/
-/* The required array size also depends on the bucket size used. */
-/* The following values are validated for the 1024-bucket setup. */
-/*****************************************************************/
-#if NUM_PROCS < 256
-#define SIZE_OF_BUFFERS 3*NUM_KEYS/2
-#elif NUM_PROCS < 512
-#define SIZE_OF_BUFFERS 5*NUM_KEYS/2
-#elif NUM_PROCS < 1024
-#define SIZE_OF_BUFFERS 4*NUM_KEYS
-#else
-#define SIZE_OF_BUFFERS 13*NUM_KEYS/2
-#endif
-
-/*****************************************************************/
-/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */
-/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE */
-/* MAX_PROCS AT YOUR PERIL */
-/*****************************************************************/
-#if CLASS == 'S'
-#define MAX_PROCS 128
-#else
-#define MAX_PROCS 1024
-#endif
-
-#define MAX_ITERATIONS 10
-#define TEST_ARRAY_SIZE 5
-
-
-/***********************************/
-/* Enable separate communication, */
-/* computation timing and printout */
-/***********************************/
-/* #define TIMING_ENABLED */
-
-
-/*************************************/
-/* Typedef: if necessary, change the */
-/* size of int here by changing the */
-/* int type to, say, long */
-/*************************************/
-typedef int INT_TYPE;
-typedef long INT_TYPE2;
-#define MP_KEY_TYPE MPI_INT
-
-
-typedef struct {
-
-/********************/
-/* MPI properties: */
-/********************/
-int my_rank,
- comm_size;
-
-
-/********************/
-/* Some global info */
-/********************/
-INT_TYPE *key_buff_ptr_global, /* used by full_verify to get */
- total_local_keys, /* copies of rank info */
- total_lesser_keys;
-
-
-int passed_verification;
-
-
-
-/************************************/
-/* These are the three main arrays. */
-/* See SIZE_OF_BUFFERS def above */
-/************************************/
-INT_TYPE key_array[SIZE_OF_BUFFERS],
- key_buff1[SIZE_OF_BUFFERS],
- key_buff2[SIZE_OF_BUFFERS],
- bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE], /* Top 5 elements for */
- bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals */
- bucket_ptrs[NUM_BUCKETS],
- process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE],
- process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE];
-int send_count[MAX_PROCS], recv_count[MAX_PROCS],
- send_displ[MAX_PROCS], recv_displ[MAX_PROCS];
-
-
-/**********************/
-/* Partial verif info */
-/**********************/
-INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
- test_rank_array[TEST_ARRAY_SIZE];
-
-/**********/
-/* Timers */
-/**********/
-double start[64], elapsed[64];
-
-} global_data;
-
-
-const INT_TYPE2
- S_test_index_array[TEST_ARRAY_SIZE] =
- {48427,17148,23627,62548,4431},
- S_test_rank_array[TEST_ARRAY_SIZE] =
- {0,18,346,64917,65463},
-
- W_test_index_array[TEST_ARRAY_SIZE] =
- {357773,934767,875723,898999,404505},
- W_test_rank_array[TEST_ARRAY_SIZE] =
- {1249,11698,1039987,1043896,1048018},
-
- A_test_index_array[TEST_ARRAY_SIZE] =
- {2112377,662041,5336171,3642833,4250760},
- A_test_rank_array[TEST_ARRAY_SIZE] =
- {104,17523,123928,8288932,8388264},
-
- B_test_index_array[TEST_ARRAY_SIZE] =
- {41869,812306,5102857,18232239,26860214},
- B_test_rank_array[TEST_ARRAY_SIZE] =
- {33422937,10244,59149,33135281,99},
-
- C_test_index_array[TEST_ARRAY_SIZE] =
- {44172927,72999161,74326391,129606274,21736814},
- C_test_rank_array[TEST_ARRAY_SIZE] =
- {61147,882988,266290,133997595,133525895},
-
- D_test_index_array[TEST_ARRAY_SIZE] =
- {1317351170,995930646,1157283250,1503301535,1453734525},
- D_test_rank_array[TEST_ARRAY_SIZE] =
- {1,36538729,1978098519,2145192618,2147425337};
-
-
-
-/***********************/
-/* function prototypes */
-/***********************/
-double randlc( double *X, double *A );
-
-void full_verify( global_data* gd );
-
-void c_print_results( char *name,
- char class,
- int n1,
- int n2,
- int n3,
- int niter,
- int nprocs_compiled,
- int nprocs_total,
- double t,
- double mops,
- char *optype,
- int passed_verification,
- char *npbversion,
- char *compiletime,
- char *mpicc,
- char *clink,
- char *cmpi_lib,
- char *cmpi_inc,
- char *cflags,
- char *clinkflags );
-
-void timer_clear(global_data* gd, int n );
-void timer_start(global_data* gd, int n );
-void timer_stop(global_data* gd, int n );
-double timer_read(global_data* gd, int n );
-
-void timer_clear(global_data* gd, int n ) {
- gd->elapsed[n] = 0.0;
-}
-
-void timer_start(global_data* gd, int n ) {
- gd->start[n] = MPI_Wtime();
-}
-
-void timer_stop(global_data* gd, int n ) {
- gd->elapsed[n] += MPI_Wtime() - gd->start[n];
-}
-
-double timer_read(global_data* gd, int n ) {
- return gd->elapsed[n];
-}
-
-
-/*
- * FUNCTION RANDLC (X, A)
- *
- * This routine returns a uniform pseudorandom double precision number in the
- * range (0, 1) by using the linear congruential generator
- *
- * x_{k+1} = a x_k (mod 2^46)
- *
- * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
- * before repeating. The argument A is the same as 'a' in the above formula,
- * and X is the same as x_0. A and X must be odd double precision integers
- * in the range (1, 2^46). The returned value RANDLC is normalized to be
- * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
- * the new seed x_1, so that subsequent calls to RANDLC using the same
- * arguments will generate a continuous sequence.
- *
- * This routine should produce the same results on any computer with at least
- * 48 mantissa bits in double precision floating point data. On Cray systems,
- * double precision should be disabled.
- *
- * David H. Bailey October 26, 1990
- *
- * IMPLICIT DOUBLE PRECISION (A-H, O-Z)
- * SAVE KS, R23, R46, T23, T46
- * DATA KS/0/
- *
- * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
- * T23 = 2 ^ 23, and T46 = 2 ^ 46. These are computed in loops, rather than
- * by merely using the ** operator, in order to insure that the results are
- * exact on all systems. This code assumes that 0.5D0 is represented exactly.
- */
-
-
-/*****************************************************************/
-/************* R A N D L C ************/
-/************* ************/
-/************* portable random number generator ************/
-/*****************************************************************/
-
-double randlc( double *X, double *A )
-{
- static int KS=0;
- static double R23, R46, T23, T46;
- double T1, T2, T3, T4;
- double A1;
- double A2;
- double X1;
- double X2;
- double Z;
- int i, j;
-
- if (KS == 0)
- {
- R23 = 1.0;
- R46 = 1.0;
- T23 = 1.0;
- T46 = 1.0;
-
- for (i=1; i<=23; i++)
- {
- R23 = 0.50 * R23;
- T23 = 2.0 * T23;
- }
- for (i=1; i<=46; i++)
- {
- R46 = 0.50 * R46;
- T46 = 2.0 * T46;
- }
- KS = 1;
- }
-
-/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */
-
- T1 = R23 * *A;
- j = T1;
- A1 = j;
- A2 = *A - T23 * A1;
-
-/* Break X into two parts such that X = 2^23 * X1 + X2, compute
- Z = A1 * X2 + A2 * X1 (mod 2^23), and then
- X = 2^23 * Z + A2 * X2 (mod 2^46). */
-
- T1 = R23 * *X;
- j = T1;
- X1 = j;
- X2 = *X - T23 * X1;
- T1 = A1 * X2 + A2 * X1;
-
- j = R23 * T1;
- T2 = j;
- Z = T1 - T23 * T2;
- T3 = T23 * Z + A2 * X2;
- j = R46 * T3;
- T4 = j;
- *X = T3 - T46 * T4;
- return(R46 * *X);
-}
-
-
-
-/*****************************************************************/
-/************ F I N D _ M Y _ S E E D ************/
-/************ ************/
-/************ returns parallel random number seq seed ************/
-/*****************************************************************/
-
-/*
- * Create a random number sequence of total length nn residing
- * on np number of processors. Each processor will therefore have a
- * subsequence of length nn/np. This routine returns that random
- * number which is the first random number for the subsequence belonging
- * to processor rank kn, and which is used as seed for proc kn ran # gen.
- */
-
-double find_my_seed( int kn, /* my processor rank, 0<=kn<=num procs */
- int np, /* np = num procs */
- long nn, /* total num of ran numbers, all procs */
- double s, /* Ran num seed, for ex.: 314159265.00 */
- double a ) /* Ran num gen mult, try 1220703125.00 */
-{
-
- long i;
-
- double t1,t2,t3,an;
- long mq,nq,kk,ik;
-
-
-
- nq = nn / np;
-
- for( mq=0; nq>1; mq++,nq/=2 )
- ;
-
- t1 = a;
-
- for( i=1; i<=mq; i++ )
- t2 = randlc( &t1, &t1 );
-
- an = t1;
-
- kk = kn;
- t1 = s;
- t2 = an;
-
- for( i=1; i<=100; i++ )
- {
- ik = kk / 2;
- if( 2 * ik != kk )
- t3 = randlc( &t1, &t2 );
- if( ik == 0 )
- break;
- t3 = randlc( &t2, &t2 );
- kk = ik;
- }
-
- return( t1 );
-
-}
-
-
-
-
-/*****************************************************************/
-/************* C R E A T E _ S E Q ************/
-/*****************************************************************/
-
-void create_seq( global_data* gd, double seed, double a )
-{
- double x;
- int i, k;
-
- k = MAX_KEY/4;
-
- for (i=0; i<NUM_KEYS; i++)
- {
- x = randlc(&seed, &a);
- x += randlc(&seed, &a);
- x += randlc(&seed, &a);
- x += randlc(&seed, &a);
-
- gd->key_array[i] = k*x;
- }
-}
-
-
-
-
-/*****************************************************************/
-/************* F U L L _ V E R I F Y ************/
-/*****************************************************************/
-
-
-void full_verify( global_data* gd )
-{
- MPI_Status status;
- MPI_Request request;
-
- INT_TYPE i, j;
- INT_TYPE k, last_local_key;
-
-
-/* Now, finally, sort the keys: */
- for( i=0; i<gd->total_local_keys; i++ )
- gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]-
- gd->total_lesser_keys] = gd->key_buff2[i];
- last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);
-
-/* Send largest key value to next processor */
- if( gd->my_rank > 0 )
- MPI_Irecv( &k,
- 1,
- MP_KEY_TYPE,
- gd->my_rank-1,
- 1000,
- MPI_COMM_WORLD,
- &request );
- if( gd->my_rank < gd->comm_size-1 )
- MPI_Send( &gd->key_array[last_local_key],
- 1,
- MP_KEY_TYPE,
- gd->my_rank+1,
- 1000,
- MPI_COMM_WORLD );
- if( gd->my_rank > 0 )
- MPI_Wait( &request, &status );
-
-/* Confirm that neighbor's greatest key value
- is not greater than my least key value */
- j = 0;
- if( gd->my_rank > 0 && gd->total_local_keys > 0 )
- if( k > gd->key_array[0] )
- j++;
-
-
-/* Confirm keys correctly sorted: count incorrectly sorted keys, if any */
- for( i=1; i<gd->total_local_keys; i++ )
- if( gd->key_array[i-1] > gd->key_array[i] )
- j++;
-
-
- if( j != 0 )
- {
- printf( "Processor %d: Full_verify: number of keys out of sort: %d\n",
- gd->my_rank, j );
- }
- else
- gd->passed_verification++;
-
-
-}
-
-
-
-
-/*****************************************************************/
-/************* R A N K ****************/
-/*****************************************************************/
-
-
-void rank( global_data* gd, int iteration )
-{
-
- INT_TYPE i, k;
-
- INT_TYPE shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
- INT_TYPE key;
- INT_TYPE2 bucket_sum_accumulator, j, m;
- INT_TYPE local_bucket_sum_accumulator;
- INT_TYPE min_key_val, max_key_val;
- INT_TYPE *key_buff_ptr;
-
-
-
-
-/* Iteration alteration of keys */
- if(gd->my_rank == 0 )
- {
- gd->key_array[iteration] = iteration;
- gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
- }
-
-
-/* Initialize */
- for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )
- {
- gd->bucket_size[i] = 0;
- gd->bucket_size_totals[i] = 0;
- gd->process_bucket_distrib_ptr1[i] = 0;
- gd->process_bucket_distrib_ptr2[i] = 0;
- }
-
-
-/* Determine where the partial verify test keys are, load into */
-/* top of array bucket_size */
- for( i=0; i<TEST_ARRAY_SIZE; i++ )
- if( (gd->test_index_array[i]/NUM_KEYS) == gd->my_rank )
- gd->bucket_size[NUM_BUCKETS+i] =
- gd->key_array[gd->test_index_array[i] % NUM_KEYS];
-
-
-/* Determine the number of keys in each bucket */
- for( i=0; i<NUM_KEYS; i++ )
- gd->bucket_size[gd->key_array[i] >> shift]++;
-
-
-/* Accumulative bucket sizes are the bucket pointers */
- gd->bucket_ptrs[0] = 0;
- for( i=1; i< NUM_BUCKETS; i++ )
- gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];
-
-
-/* Sort into appropriate bucket */
- for( i=0; i<NUM_KEYS; i++ )
- {
- key = gd->key_array[i];
- gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
- }
-
-#ifdef TIMING_ENABLED
- timer_stop(gd, 2 );
- timer_start(gd, 3 );
-#endif
-
-/* Get the bucket size totals for the entire problem. These
- will be used to determine the redistribution of keys */
- MPI_Allreduce( gd->bucket_size,
- gd->bucket_size_totals,
- NUM_BUCKETS+TEST_ARRAY_SIZE,
- MP_KEY_TYPE,
- MPI_SUM,
- MPI_COMM_WORLD );
-
-#ifdef TIMING_ENABLED
- timer_stop(gd, 3 );
- timer_start(gd, 2 );
-#endif
-
-/* Determine Redistibution of keys: accumulate the bucket size totals
- till this number surpasses NUM_KEYS (which the average number of keys
- per processor). Then all keys in these buckets go to processor 0.
- Continue accumulating again until supassing 2*NUM_KEYS. All keys
- in these buckets go to processor 1, etc. This algorithm guarantees
- that all processors have work ranking; no processors are left idle.
- The optimum number of buckets, however, does not result in as high
- a degree of load balancing (as even a distribution of keys as is
- possible) as is obtained from increasing the number of buckets, but
- more buckets results in more computation per processor so that the
- optimum number of buckets turns out to be 1024 for machines tested.
- Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket
- number of first and last bucket which each processor will have after
- the redistribution is done. */
-
- bucket_sum_accumulator = 0;
- local_bucket_sum_accumulator = 0;
- gd->send_displ[0] = 0;
- gd->process_bucket_distrib_ptr1[0] = 0;
- for( i=0, j=0; i<NUM_BUCKETS; i++ )
- {
- bucket_sum_accumulator += gd->bucket_size_totals[i];
- local_bucket_sum_accumulator += gd->bucket_size[i];
- if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )
- {
- gd->send_count[j] = local_bucket_sum_accumulator;
- if( j != 0 )
- {
- gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
- gd->process_bucket_distrib_ptr1[j] =
- gd->process_bucket_distrib_ptr2[j-1]+1;
- }
- gd->process_bucket_distrib_ptr2[j++] = i;
- local_bucket_sum_accumulator = 0;
- }
- }
-
-/* When NUM_PROCS approaching NUM_BUCKETS, it is highly possible
- that the last few processors don't get any buckets. So, we
- need to set counts properly in this case to avoid any fallouts. */
- while( j < gd->comm_size )
- {
- gd->send_count[j] = 0;
- gd->process_bucket_distrib_ptr1[j] = 1;
- j++;
- }
-
-#ifdef TIMING_ENABLED
- timer_stop(gd, 2 );
- timer_start(gd, 3 );
-#endif
-
-/* This is the redistribution section: first find out how many keys
- each processor will send to every other processor: */
- MPI_Alltoall( gd->send_count,
- 1,
- MPI_INT,
- gd->recv_count,
- 1,
- MPI_INT,
- MPI_COMM_WORLD );
-
-/* Determine the receive array displacements for the buckets */
- gd->recv_displ[0] = 0;
- for( i=1; i<gd->comm_size; i++ )
- gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];
-
-
-/* Now send the keys to respective processors */
- MPI_Alltoallv( gd->key_buff1,
- gd->send_count,
- gd->send_displ,
- MP_KEY_TYPE,
- gd->key_buff2,
- gd->recv_count,
- gd->recv_displ,
- MP_KEY_TYPE,
- MPI_COMM_WORLD );
-
-#ifdef TIMING_ENABLED
- timer_stop(gd, 3 );
- timer_start(gd, 2 );
-#endif
-
-/* The starting and ending bucket numbers on each processor are
- multiplied by the interval size of the buckets to obtain the
- smallest possible min and greatest possible max value of any
- key on each processor */
- min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
- max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;
-
-/* Clear the work array */
- for( i=0; i<max_key_val-min_key_val+1; i++ )
- gd->key_buff1[i] = 0;
-
-/* Determine the total number of keys on all other
- processors holding keys of lesser value */
- m = 0;
- for( k=0; k<gd->my_rank; k++ )
- for( i= gd->process_bucket_distrib_ptr1[k];
- i<=gd->process_bucket_distrib_ptr2[k];
- i++ )
- m += gd->bucket_size_totals[i]; /* m has total # of lesser keys */
-
-/* Determine total number of keys on this processor */
- j = 0;
- for( i= gd->process_bucket_distrib_ptr1[gd->my_rank];
- i<=gd->process_bucket_distrib_ptr2[gd->my_rank];
- i++ )
- j += gd->bucket_size_totals[i]; /* j has total # of local keys */
-
-
-/* Ranking of all keys occurs in this section: */
-/* shift it backwards so no subtractions are necessary in loop */
- key_buff_ptr = gd->key_buff1 - min_key_val;
-
-/* In this section, the keys themselves are used as their
- own indexes to determine how many of each there are: their
- individual population */
- for( i=0; i<j; i++ )
- key_buff_ptr[gd->key_buff2[i]]++; /* Now they have individual key */
- /* population */
-
-/* To obtain ranks of each key, successively add the individual key
- population, not forgetting the total of lesser keys, m.
- NOTE: Since the total of lesser keys would be subtracted later
- in verification, it is no longer added to the first key population
- here, but still needed during the partial verify test. This is to
- ensure that 32-bit key_buff can still be used for class D. */
-/* key_buff_ptr[min_key_val] += m; */
- for( i=min_key_val; i<max_key_val; i++ )
- key_buff_ptr[i+1] += key_buff_ptr[i];
-
-
-/* This is the partial verify test section */
-/* Observe that test_rank_array vals are */
-/* shifted differently for different cases */
- for( i=0; i<TEST_ARRAY_SIZE; i++ )
- {
- k = gd->bucket_size_totals[i+NUM_BUCKETS]; /* Keys were hidden here */
- if( min_key_val <= k && k <= max_key_val )
- {
- /* Add the total of lesser keys, m, here */
- INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
- int failed = 0;
-
- switch( CLASS )
- {
- case 'S':
- if( i <= 2 )
- {
- if( key_rank != gd->test_rank_array[i]+iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- else
- {
- if( key_rank != gd->test_rank_array[i]-iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- break;
- case 'W':
- if( i < 2 )
- {
- if( key_rank != gd->test_rank_array[i]+(iteration-2) )
- failed = 1;
- else
- gd->passed_verification++;
- }
- else
- {
- if( key_rank != gd->test_rank_array[i]-iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- break;
- case 'A':
- if( i <= 2 )
- {
- if( key_rank != gd->test_rank_array[i]+(iteration-1) )
- failed = 1;
- else
- gd->passed_verification++;
- }
- else
- {
- if( key_rank != gd->test_rank_array[i]-(iteration-1) )
- failed = 1;
- else
- gd->passed_verification++;
- }
- break;
- case 'B':
- if( i == 1 || i == 2 || i == 4 )
- {
- if( key_rank != gd->test_rank_array[i]+iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- else
- {
- if( key_rank != gd->test_rank_array[i]-iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- break;
- case 'C':
- if( i <= 2 )
- {
- if( key_rank != gd->test_rank_array[i]+iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- else
- {
- if( key_rank != gd->test_rank_array[i]-iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- break;
- case 'D':
- if( i < 2 )
- {
- if( key_rank != gd->test_rank_array[i]+iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- else
- {
- if( key_rank != gd->test_rank_array[i]-iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- break;
- }
- if( failed == 1 )
- printf( "Failed partial verification: "
- "iteration %d, processor %d, test key %d\n",
- iteration, gd->my_rank, (int)i );
- }
- }
-
-
-
-
-/* Make copies of rank info for use by full_verify: these variables
- in rank are local; making them global slows down the code, probably
- since they cannot be made register by compiler */
-
- if( iteration == MAX_ITERATIONS )
- {
- gd->key_buff_ptr_global = key_buff_ptr;
- gd->total_local_keys = j;
- gd->total_lesser_keys = 0; /* no longer set to 'm', see note above */
- }
-
-}
-
-
-/*****************************************************************/
-/************* M A I N ****************/
-/*****************************************************************/
-
-int main( int argc, char **argv )
-{
-
- int i, iteration, itemp;
-
- double timecounter, maxtime;
-
- global_data* gd = malloc(sizeof(global_data));
-/* Initialize MPI */
- MPI_Init( &argc, &argv );
- MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
- MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );
-
-/* Initialize the verification arrays if a valid class */
- for( i=0; i<TEST_ARRAY_SIZE; i++ )
- switch( CLASS )
- {
- case 'S':
- gd->test_index_array[i] = S_test_index_array[i];
- gd->test_rank_array[i] = S_test_rank_array[i];
- break;
- case 'A':
- gd->test_index_array[i] = A_test_index_array[i];
- gd->test_rank_array[i] = A_test_rank_array[i];
- break;
- case 'W':
- gd->test_index_array[i] = W_test_index_array[i];
- gd->test_rank_array[i] = W_test_rank_array[i];
- break;
- case 'B':
- gd->test_index_array[i] = B_test_index_array[i];
- gd->test_rank_array[i] = B_test_rank_array[i];
- break;
- case 'C':
- gd->test_index_array[i] = C_test_index_array[i];
- gd->test_rank_array[i] = C_test_rank_array[i];
- break;
- case 'D':
- gd->test_index_array[i] = D_test_index_array[i];
- gd->test_rank_array[i] = D_test_rank_array[i];
- break;
- };
-
-
-
-/* Printout initial NPB info */
- if( gd->my_rank == 0 )
- {
- printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
- printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS );
- printf( " Iterations: %d\n", MAX_ITERATIONS );
- printf( " Number of processes: %d\n",gd->comm_size );
- }
-
-/* Check that actual and compiled number of processors agree */
- if( gd->comm_size != NUM_PROCS )
- {
- if( gd->my_rank == 0 )
- printf( "\n ERROR: compiled for %d processes\n"
- " Number of active processes: %d\n"
- " Exiting program!\n\n", NUM_PROCS, gd->comm_size );
- MPI_Finalize();
- exit( 1 );
- }
-
-/* Check to see whether total number of processes is within bounds.
- This could in principle be checked in setparams.c, but it is more
- convenient to do it here */
- if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS)
- {
- if( gd->my_rank == 0 )
- printf( "\n ERROR: number of processes %d not within range %d-%d"
- "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS);
- MPI_Finalize();
- exit( 1 );
- }
-
-
-/* Generate random number sequence and subsequent keys on all procs */
- create_seq(gd, find_my_seed( gd->my_rank,
- gd->comm_size,
- 4*(long)TOTAL_KEYS*MIN_PROCS,
- 314159265.00, /* Random number gen seed */
- 1220703125.00 ), /* Random number gen mult */
- 1220703125.00 ); /* Random number gen mult */
-
-/* Do one interation for free (i.e., untimed) to guarantee initialization of
- all data and code pages and respective tables */
- rank(gd, 1 );
-
-/* Start verification counter */
- gd->passed_verification = 0;
-
- if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n iteration\n" );
-
-/* Initialize timer */
- timer_clear(gd, 0 );
-
-/* Initialize separate communication, computation timing */
-#ifdef TIMING_ENABLED
- for( i=1; i<=3; i++ ) timer_clear(gd, i );
-#endif
-
-/* Start timer */
- timer_start(gd, 0 );
-
-#ifdef TIMING_ENABLED
- timer_start(gd, 1 );
- timer_start(gd, 2 );
-#endif
-
- char smpi_category[100];
- snprintf (smpi_category, 100, "%d", gd->my_rank);
- TRACE_smpi_set_category (smpi_category);
-
-/* This is the main iteration */
- for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
- {
- if( gd->my_rank == 0 && CLASS != 'S' ) printf( " %d\n", iteration );
- rank(gd, iteration );
- }
- TRACE_smpi_set_category (NULL);
-
-#ifdef TIMING_ENABLED
- timer_stop(gd, 2 );
- timer_stop(gd, 1 );
-#endif
-
-/* Stop timer, obtain time for processors */
- timer_stop(gd, 0 );
-
- timecounter = timer_read(gd, 0 );
-
-/* End of timing, obtain maximum time of all processors */
- MPI_Reduce( &timecounter,
- &maxtime,
- 1,
- MPI_DOUBLE,
- MPI_MAX,
- 0,
- MPI_COMM_WORLD );
-
-#ifdef TIMING_ENABLED
- {
- double tmin, tsum, tmax;
-
- if( my_rank == 0 )
- {
- printf( "\ntimer 1/2/3 = total/computation/communication time\n");
- printf( " min avg max\n" );
- }
- for( i=1; i<=3; i++ )
- {
- timecounter = timer_read(gd, i );
- MPI_Reduce( &timecounter,
- &tmin,
- 1,
- MPI_DOUBLE,
- MPI_MIN,
- 0,
- MPI_COMM_WORLD );
- MPI_Reduce( &timecounter,
- &tsum,
- 1,
- MPI_DOUBLE,
- MPI_SUM,
- 0,
- MPI_COMM_WORLD );
- MPI_Reduce( &timecounter,
- &tmax,
- 1,
- MPI_DOUBLE,
- MPI_MAX,
- 0,
- MPI_COMM_WORLD );
- if( my_rank == 0 )
- printf( "timer %d: %f %f %f\n",
- i, tmin, tsum/((double) comm_size), tmax );
- }
- if( my_rank == 0 )
- printf( "\n" );
- }
-#endif
-
-/* This tests that keys are in sequence: sorting of last ranked key seq
- occurs here, but is an untimed operation */
- full_verify(gd);
-
-
-/* Obtain verification counter sum */
- itemp =gd->passed_verification;
- MPI_Reduce( &itemp,
- &gd->passed_verification,
- 1,
- MPI_INT,
- MPI_SUM,
- 0,
- MPI_COMM_WORLD );
-
-
-
-/* The final printout */
- if( gd->my_rank == 0 )
- {
- if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
- gd->passed_verification = 0;
- c_print_results( "IS",
- CLASS,
- (int)(TOTAL_KEYS),
- MIN_PROCS,
- 0,
- MAX_ITERATIONS,
- NUM_PROCS,
- gd->comm_size,
- maxtime,
- ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS)
- /maxtime/1000000.,
- "keys ranked",
- gd->passed_verification,
- NPBVERSION,
- COMPILETIME,
- MPICC,
- CLINK,
- CMPI_LIB,
- CMPI_INC,
- CFLAGS,
- CLINKFLAGS );
- }
-
- MPI_Finalize();
- free(gd);
-
- return 0;
- /**************************/
-} /* E N D P R O G R A M */
- /**************************/
#include <stdlib.h>
#include <stdio.h>
+#include "simgrid/instr.h" /* provides TRACE_smpi_set_category() for per-rank tracing */
+
/******************/
/* default values */
/******************/
timer_start(gd, 2 );
#endif
+ char smpi_category[100];
+ snprintf (smpi_category, 100, "%d", gd->my_rank);
+ TRACE_smpi_set_category (smpi_category);
+
/* This is the main iteration */
for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
{
if( gd->my_rank == 0 && CLASS != 'S' ) printf( " %d\n", iteration );
rank(gd, iteration );
}
-
+ TRACE_smpi_set_category (NULL);
#ifdef TIMING_ENABLED
timer_stop(gd, 2 );
+++ /dev/null
-This directory contains examples of make.def files that were used
-by the NPB team in testing the benchmarks on different platforms.
-They can be used as starting points for make.def files for your
-own platform, but you may need to taylor them for best performance
-on your installation. A clean template can be found in directory
-`config'.
-Some examples of suite.def files are also provided.
\ No newline at end of file
+++ /dev/null
-#This is for a DEC Alpha 8400. The code will execute on a
-#single processor
-#Warning: parallel make does not work properly in general
-MPIF77 = f77
-FLINK = f77
-#Optimization -O5 breaks SP; works fine for all other codes
-FFLAGS = -O4
-
-MPICC = cc
-CLINK = cc
-CFLAGS = -O5
-
-include ../config/make.dummy
-
-CC = cc -g
-BINDIR = ../bin
-
-RAND = randi8
+++ /dev/null
-#This is for a generic single-processor SGI workstation
-MPIF77 = f77
-FLINK = f77
-FFLAGS = -O3
-
-MPICC = cc
-CLINK = cc
-CFLAGS = -O3
-
-include ../config/make.dummy
-
-CC = cc -g
-BINDIR = ../bin
-
-RAND = randi8
-
+++ /dev/null
-# This is for a an SGI Origin 2000 or 3000 with vendor MPI. The Fortran
-# record length is specified, so it can be used for the I/O benchmark.
-# as well
-MPIF77 = f77
-FMPI_LIB = -lmpi
-FLINK = f77 -64
-FFLAGS = -O3 -64
-
-MPICC = cc
-CMPI_LIB = -lmpi
-CLINK = cc
-CFLAGS = -O3
-
-CC = cc -g
-BINDIR = ../bin
-
-RAND = randi8
-
-CONVERTFLAG = -DFORTRAN_REC_SIZE=4
-
+++ /dev/null
-# This is for the SGI PowerChallenge Array at NASA Ames. mrf77 and
-# mrcc are local scripts that invoke the proper MPI library.
-MPIF77 = mrf77
-FLINK = mrf77
-FFLAGS = -O3 -OPT:fold_arith_limit=1204
-
-MPICC = mrcc
-CLINK = mrcc
-CFLAGS = -O3 -OPT:fold_arith_limit=1204
-
-CC = cc -g
-BINDIR = ../bin
-
-RAND = randi8
-
-
+++ /dev/null
-#This is for the IBM SP2 at Ames; mrf77 and mrcc are local scripts
-MPIF77 = mrf77
-FLINK = mrf77
-FFLAGS = -O3
-FLINKFLAGS = -bmaxdata:0x60000000
-
-MPICC = mrcc
-CLINK = mrcc
-CFLAGS = -O3
-CLINKFLAGS = -bmaxdata:0x60000000
-
-CC = cc -g
-
-BINDIR = ../bin
-
-RAND = randi8
-
+++ /dev/null
-# This is for a Sun SparcCenter or UltraEnterprise machine
-MPIF77 = f77
-FLINK = f77
-FMPI_LIB = -L<your mpich installation tree>/lib/solaris/ch_lfshmem -lmpi
-FMPI_INC = -I<your mpich installation tree>/include
-# sparc10,20 SparcCenter{1,2}000 (uname -m returns sun4m)
-# and f77 -V returns 4.0 or greater
-# FFLAGS = -fast -xtarget=super -xO4 -depend
-# Ultra1,2, UltraEnterprise servers (uname -m returns sun4u)
-FFLAGS = -fast -xtarget=ultra -xarch=v8plus -xO4 -depend
-FLINKFLAGS = -lmopt -lcopt -lsunmath
-
-MPICC = cc
-CLINK = cc
-CMPI_LIB = -L<your mpich installation tree>/lib/solaris/ch_lfshmem -lmpi
-CMPI_INC = -I<your mpich installation tree>/include
-# sparc10,20 SparcCenter{1,2}000 (uname -m returns sun4m)
-# and cc -V returns 4.0 or greater
-#CFLAGS = -fast -xtarget=super -xO4 -xdepend
-# Ultra1,2, UltraEnterprise servers (uname -m returns sun4u)
-CFLAGS = -fast -xtarget=ultra -xarch=v8plus -xO4 -xdepend
-CLINKFLAGS = -fast
-
-CC = cc -g
-
-BINDIR = ../bin
-
-# Cannot use randi8 or randi8-safe on a 32-but machine. Use double precision
-RAND = randdp
-
+++ /dev/null
-#This is for the Cray T3D at the Jet Propulsion Laboratory
-MPIF77 = cf77
-FLINK = cf77
-FMPI_LIB = -L/usr/local/mpp/lib -lmpi
-FMPI_INC = -I/usr/local/mpp/lib/include/mpp
-FFLAGS = -dp -Wf-onoieeedivide -C cray-t3d
-#The following flags provide more effective optimization, but may
-#cause the random number generator randi8(_safe) to break in EP
-#FFLAGS = -dp -Wf-oaggress -Wf-onoieeedivide -C cray-t3d
-FLINKFLAGS = -Wl-Drdahead=on -C cray-t3d
-
-MPICC = cc
-CLINK = cc
-CMPI_LIB = -L/usr/local/mpp/lib -lmpi
-CMPI_INC = -I/usr/local/mpp/lib/include/mpp
-CFLAGS = -O3 -Tcray-t3d
-CLINKFLAGS = -Tcray-t3d
-
-CC = cc -g -Tcray-ymp
-BINDIR = ../bin
-
-CONVERTFLAG= -DCONVERTDOUBLE
-
-RAND = randi8
-
+++ /dev/null
-#---------------------------------------------------------------------------
-#
-# SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS.
-#
-#---------------------------------------------------------------------------
-
-#---------------------------------------------------------------------------
-# Items in this file will need to be changed for each platform.
-# (Note these definitions are inconsistent with NPB2.1.)
-#---------------------------------------------------------------------------
-
-#---------------------------------------------------------------------------
-# Parallel Fortran:
-#
-# For CG, EP, FT, MG, LU, SP and BT, which are in Fortran, the following must
-# be defined:
-#
-# MPIF77 - Fortran compiler
-# FFLAGS - Fortran compilation arguments
-# FMPI_INC - any -I arguments required for compiling MPI/Fortran
-# FLINK - Fortran linker
-# FLINKFLAGS - Fortran linker arguments
-# FMPI_LIB - any -L and -l arguments required for linking MPI/Fortran
-#
-# compilations are done with $(MPIF77) $(FMPI_INC) $(FFLAGS) or
-# $(MPIF77) $(FFLAGS)
-# linking is done with $(FLINK) $(FMPI_LIB) $(FLINKFLAGS)
-#---------------------------------------------------------------------------
-
-#---------------------------------------------------------------------------
-# This is the fortran compiler used for MPI programs
-#---------------------------------------------------------------------------
-MPIF77 = mpif77
-# This links MPI fortran programs; usually the same as ${MPIF77}
-FLINK = $(MPIF77)
-
-#---------------------------------------------------------------------------
-# These macros are passed to the linker to help link with MPI correctly
-#---------------------------------------------------------------------------
-FMPI_LIB =
-
-#---------------------------------------------------------------------------
-# These macros are passed to the compiler to help find 'mpif.h'
-#---------------------------------------------------------------------------
-FMPI_INC =
-
-#---------------------------------------------------------------------------
-# Global *compile time* flags for Fortran programs
-#---------------------------------------------------------------------------
-FFLAGS = -fast
-# FFLAGS = -g
-
-#---------------------------------------------------------------------------
-# Global *link time* flags. Flags for increasing maximum executable
-# size usually go here.
-#---------------------------------------------------------------------------
-FLINKFLAGS = -fast
-
-
-#---------------------------------------------------------------------------
-# Parallel C:
-#
-# For IS, which is in C, the following must be defined:
-#
-# MPICC - C compiler
-# CFLAGS - C compilation arguments
-# CMPI_INC - any -I arguments required for compiling MPI/C
-# CLINK - C linker
-# CLINKFLAGS - C linker flags
-# CMPI_LIB - any -L and -l arguments required for linking MPI/C
-#
-# compilations are done with $(MPICC) $(CMPI_INC) $(CFLAGS) or
-# $(MPICC) $(CFLAGS)
-# linking is done with $(CLINK) $(CMPI_LIB) $(CLINKFLAGS)
-#---------------------------------------------------------------------------
-
-#---------------------------------------------------------------------------
-# This is the C compiler used for MPI programs
-#---------------------------------------------------------------------------
-MPICC = mpicc
-# This links MPI C programs; usually the same as ${MPICC}
-CLINK = $(MPICC)
-
-#---------------------------------------------------------------------------
-# These macros are passed to the linker to help link with MPI correctly
-#---------------------------------------------------------------------------
-CMPI_LIB =
-
-#---------------------------------------------------------------------------
-# These macros are passed to the compiler to help find 'mpi.h'
-#---------------------------------------------------------------------------
-CMPI_INC =
-
-#---------------------------------------------------------------------------
-# Global *compile time* flags for C programs
-#---------------------------------------------------------------------------
-CFLAGS = -fast
-# CFLAGS = -g
-
-#---------------------------------------------------------------------------
-# Global *link time* flags. Flags for increasing maximum executable
-# size usually go here.
-#---------------------------------------------------------------------------
-CLINKFLAGS = -fast
-
-
-#---------------------------------------------------------------------------
-# MPI dummy library:
-#
-# Uncomment if you want to use the MPI dummy library supplied by NAS instead
-# of the true message-passing library. The include file redefines several of
-# the above macros. It also invokes make in subdirectory MPI_dummy. Make
-# sure that no spaces or tabs precede include.
-#---------------------------------------------------------------------------
-# include ../config/make.dummy
-
-
-#---------------------------------------------------------------------------
-# Utilities C:
-#
-# This is the C compiler used to compile C utilities. Flags required by
-# this compiler go here also; typically there are few flags required; hence
-# there are no separate macros provided for such flags.
-#---------------------------------------------------------------------------
-CC = cc -g
-
-
-#---------------------------------------------------------------------------
-# Destination of executables, relative to subdirs of the main directory. .
-#---------------------------------------------------------------------------
-BINDIR = ../bin
-
-
-#---------------------------------------------------------------------------
-# Some machines (e.g. Crays) have 128-bit DOUBLE PRECISION numbers, which
-# is twice the precision required for the NPB suite. A compiler flag
-# (e.g. -dp) can usually be used to change DOUBLE PRECISION variables to
-# 64 bits, but the MPI library may continue to send 128 bits. Short of
-# recompiling MPI, the solution is to use MPI_REAL to send these 64-bit
-# numbers, and MPI_COMPLEX to send their complex counterparts. Uncomment
-# the following line to enable this substitution.
-#
-# NOTE: IF THE I/O BENCHMARK IS BEING BUILT, WE USE CONVERTFLAG TO
-# SPECIFIY THE FORTRAN RECORD LENGTH UNIT. IT IS A SYSTEM-SPECIFIC
-# VALUE (USUALLY 1 OR 4). UNCOMMENT THE SECOND LINE AND SUBSTITUTE
-# THE CORRECT VALUE FOR "length".
-# IF BOTH 128-BIT DOUBLE PRECISION NUMBERS AND I/O ARE TO BE ENABLED,
-# UNCOMMENT THE THIRD LINE AND SUBSTITUTE THE CORRECT VALUE FOR
-# "length"
-#---------------------------------------------------------------------------
-# CONVERTFLAG = -DCONVERTDOUBLE
-CONVERTFLAG = -DFORTRAN_REC_SIZE=1
-# CONVERTFLAG = -DCONVERTDOUBLE -DFORTRAN_REC_SIZE=length
-
-
-#---------------------------------------------------------------------------
-# The variable RAND controls which random number generator
-# is used. It is described in detail in Doc/README.install.
-# Use "randi8" unless there is a reason to use another one.
-# Other allowed values are "randi8_safe", "randdp" and "randdpvec"
-#---------------------------------------------------------------------------
-RAND = randi8
-# The following is highly reliable but may be slow:
-# RAND = randdp
-
+++ /dev/null
-bt S 1
-bt S 4
-bt S 9
-bt S 16
-bt A 1
-bt A 4
-bt A 9
-bt A 16
-bt A 25
-bt A 36
-bt A 49
-bt A 64
-bt A 81
-bt A 100
-bt A 121
-bt B 1
-bt B 4
-bt B 9
-bt B 16
-bt B 25
-bt B 36
-bt B 49
-bt B 64
-bt B 81
-bt B 100
-bt B 121
-bt C 1
-bt C 4
-bt C 9
-bt C 16
-bt C 25
-bt C 36
-bt C 49
-bt C 64
-bt C 81
-bt C 100
-bt C 121
+++ /dev/null
-cg S 1
-cg S 2
-cg S 4
-cg S 8
-cg S 16
-cg A 1
-cg A 2
-cg A 4
-cg A 8
-cg A 16
-cg A 32
-cg A 64
-cg A 128
-cg B 1
-cg B 2
-cg B 4
-cg B 8
-cg B 16
-cg B 32
-cg B 64
-cg B 128
-cg C 1
-cg C 2
-cg C 4
-cg C 8
-cg C 16
-cg C 32
-cg C 64
-cg C 128
+++ /dev/null
-ft S 1
-ft S 2
-ft S 4
-ft S 8
-ft S 16
-ft A 1
-ft A 2
-ft A 4
-ft A 8
-ft A 16
-ft A 32
-ft A 64
-ft A 128
-ft B 1
-ft B 2
-ft B 4
-ft B 8
-ft B 16
-ft B 32
-ft B 64
-ft B 128
-ft C 1
-ft C 2
-ft C 4
-ft C 8
-ft C 16
-ft C 32
-ft C 64
-ft C 128
+++ /dev/null
-lu S 1
-lu S 2
-lu S 4
-lu S 8
-lu S 16
-lu A 1
-lu A 2
-lu A 4
-lu A 8
-lu A 16
-lu A 32
-lu A 64
-lu A 128
-lu B 1
-lu B 2
-lu B 4
-lu B 8
-lu B 16
-lu B 32
-lu B 64
-lu B 128
-lu C 1
-lu C 2
-lu C 4
-lu C 8
-lu C 16
-lu C 32
-lu C 64
-lu C 128
+++ /dev/null
-mg S 1
-mg S 2
-mg S 4
-mg S 8
-mg S 16
-mg A 1
-mg A 2
-mg A 4
-mg A 8
-mg A 16
-mg A 32
-mg A 64
-mg A 128
-mg B 1
-mg B 2
-mg B 4
-mg B 8
-mg B 16
-mg B 32
-mg B 64
-mg B 128
-mg C 1
-mg C 2
-mg C 4
-mg C 8
-mg C 16
-mg C 32
-mg C 64
-mg C 128
+++ /dev/null
-bt S 1
-cg S 1
-ep S 1
-ft S 1
-is S 1
-lu S 1
-mg S 1
-sp S 1
+++ /dev/null
-sp S 1
-sp S 4
-sp S 9
-sp S 16
-sp A 1
-sp A 4
-sp A 9
-sp A 16
-sp A 25
-sp A 36
-sp A 49
-sp A 64
-sp A 81
-sp A 100
-sp A 121
-sp B 1
-sp B 4
-sp B 9
-sp B 16
-sp B 25
-sp B 36
-sp B 49
-sp B 64
-sp B 81
-sp B 100
-sp B 121
-sp C 1
-sp C 4
-sp C 9
-sp C 16
-sp C 25
-sp C 36
-sp C 49
-sp C 64
-sp C 81
-sp C 100
-sp C 121
# Typing "make suite" in the main directory will build all the benchmarks
# specified in this file.
# Each line of this file contains a benchmark name, class, and number
-# of nodes. The name is one of "cg", "is", "ep", mg", "ft", "sp", "bt",
-# "lu", and "dt".
+# of nodes. The name is one of "is", "ep", and "dt".
# The class is one of "S", "W", "A", "B", "C", "D", and "E"
# (except that no classes C, D and E for DT, and no class E for IS).
# The number of nodes must be a legal number for a particular
# Comments start with "#" as the first character on a line.
# No blank lines.
# The following example builds 1 processor sample sizes of all benchmarks.
-ft S 1
-mg S 1
-sp S 1
-lu S 1
-bt S 1
is S 1
ep S 1
-cg S 1
dt S 1