examples/smpi/mc/mutual_exclusion
examples/smpi/mc/non_deterministic
examples/smpi/mc/send_deterministic
+examples/smpi/NAS/dt
+examples/smpi/NAS/ep
+examples/smpi/NAS/is
examples/smpi/mvmul
examples/smpi/replay_multiple/replay_multiple
examples/smpi/replay/one_trace
--- /dev/null
+# Build rules for the NAS example programs (is, ep, dt).
+# The executables are only defined when enable_smpi is set; the source and
+# text-file lists at the bottom are exported to the parent scope either way.
+if(enable_smpi)
+ # WIN32: force-include smpi_main.h into every C source through the compiler's
+ # -include option. Elsewhere: compile with the smpicc wrapper script found in
+ # the build tree.
+ # NOTE(review): this assigns CMAKE_C_FLAGS outright, so C flags set earlier
+ # are not carried into this directory -- confirm that this is intended.
+ if(WIN32)
+ set(CMAKE_C_FLAGS "-include ${CMAKE_HOME_DIRECTORY}/include/smpi/smpi_main.h")
+ else()
+ set(CMAKE_C_COMPILER "${CMAKE_BINARY_DIR}/smpi_script/bin/smpicc")
+ endif()
+
+ # Put the SMPI headers ahead of every other include directory.
+ include_directories(BEFORE "${CMAKE_HOME_DIRECTORY}/include/smpi")
+ # Each program shares nas_common.c; dt additionally needs DGraph.c.
+ # All of them link against simgrid and the math library.
+ add_executable (is is.c nas_common.c)
+ target_link_libraries(is simgrid m)
+ add_executable (ep ep.c nas_common.c)
+ target_link_libraries(ep simgrid m)
+ add_executable (dt dt.c nas_common.c DGraph.c)
+ target_link_libraries(dt simgrid m)
+endif()
+
+# Append this directory's sources and headers to examples_src, and its
+# README.install to txt_files, in the parent scope.
+# (Presumably the parent uses these lists for packaging -- verify there.)
+set(examples_src ${examples_src} ${CMAKE_CURRENT_SOURCE_DIR}/nas_common.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/nas_common.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/is.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/dt.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/ep.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/DGraph.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/DGraph.h
+ PARENT_SCOPE)
+set(txt_files ${txt_files} ${CMAKE_CURRENT_SOURCE_DIR}/README.install PARENT_SCOPE)
return nd;
}
void nodeShow(DGNode* nd){
- fprintf( stderr,"%3d.%s: (%d,%d)\n",
- nd->id,nd->name,nd->inDegree,nd->outDegree);
+ fprintf( stderr,"%3d.%s: (%d,%d)\n", nd->id,nd->name,nd->inDegree,nd->outDegree);
/*
if(nd->verified==1) fprintf(stderr,"%ld.%s\t: usable.",nd->id,nd->name);
else if(nd->verified==0) fprintf(stderr,"%ld.%s\t: unusable.",nd->id,nd->name);
dg->name=strdup(nm);
return dg;
}
+
int AttachNode(DGraph* dg, DGNode* nd) {
int i=0,j,len=0;
DGNode **nds =NULL, *tmpnd=NULL;
if (dg->numNodes == dg->maxNodes-1 ) {
dg->maxNodes += BLOCK_SIZE;
- nds =(DGNode **) calloc(dg->maxNodes,sizeof(DGNode*));
+ nds =(DGNode **) calloc(dg->maxNodes,sizeof(DGNode*));
memcpy(nds,dg->node,(dg->maxNodes-BLOCK_SIZE)*sizeof(DGNode*));
free(dg->node);
dg->node=nds;
}
- len = strlen( nd->name);
+ len = strlen( nd->name);
for (i = 0; i < dg->numNodes; i++) {
tmpnd =dg->node[ i];
ar=NULL;
if ( strncmp( nd->name, tmpnd->name, len) ) continue;
if ( nd->inDegree > 0 ) {
tmpnd->maxInDegree += nd->maxInDegree;
- ar =(DGArc **) calloc(tmpnd->maxInDegree,sizeof(DGArc*));
+ ar =(DGArc **) calloc(tmpnd->maxInDegree,sizeof(DGArc*));
memcpy(ar,tmpnd->inArc,(tmpnd->inDegree)*sizeof(DGArc*));
free(tmpnd->inArc);
tmpnd->inArc=ar;
}
if ( nd->outDegree > 0 ) {
tmpnd->maxOutDegree += nd->maxOutDegree;
- ar =(DGArc **) calloc(tmpnd->maxOutDegree,sizeof(DGArc*));
+ ar =(DGArc **) calloc(tmpnd->maxOutDegree,sizeof(DGArc*));
memcpy(ar,tmpnd->outArc,(tmpnd->outDegree)*sizeof(DGArc*));
free(tmpnd->outArc);
tmpnd->outArc=ar;
for (j = 0; j < nd->outDegree; j++ ) {
nd->outArc[ j]->tail = tmpnd;
- }
+ }
memcpy( &(tmpnd->outArc[tmpnd->outDegree]),nd->outArc,nd->outDegree*sizeof( DGArc *));
tmpnd->outDegree += nd->outDegree;
- }
+ }
free(nd);
return i;
}
nd->id = dg->numNodes;
dg->node[dg->numNodes] = nd;
dg->numNodes++;
-return nd->id;
+ return nd->id;
}
+
int AttachArc(DGraph *dg,DGArc* nar){
-int arcId = -1;
-int i=0,newNumber=0;
-DGNode *head = nar->head,
- *tail = nar->tail;
-DGArc **ars=NULL,*probe=NULL;
-/*fprintf(stderr,"AttachArc %ld\n",dg->numArcs); */
+ int arcId = -1;
+ int i=0,newNumber=0;
+ DGNode *head = nar->head,
+ *tail = nar->tail;
+ DGArc **ars=NULL,*probe=NULL;
+ /*fprintf(stderr,"AttachArc %ld\n",dg->numArcs); */
if ( !tail || !head ) return arcId;
if ( dg->numArcs == dg->maxArcs-1 ) {
dg->maxArcs += BLOCK_SIZE;
- ars =(DGArc **) calloc(dg->maxArcs,sizeof(DGArc*));
+ ars =(DGArc **) calloc(dg->maxArcs,sizeof(DGArc*));
memcpy(ars,dg->arc,(dg->maxArcs-BLOCK_SIZE)*sizeof(DGArc*));
free(dg->arc);
dg->arc=ars;
}
for(i = 0; i < tail->outDegree; i++ ) { /* parallel arc */
probe = tail->outArc[ i];
- if(probe->head == head
- &&
- probe->length == nar->length
- ){
- free(nar);
- return probe->id;
+ if(probe->head == head && probe->length == nar->length){
+ free(nar);
+ return probe->id;
}
}
-
+
nar->id = dg->numArcs;
arcId=dg->numArcs;
dg->arc[dg->numArcs] = nar;
dg->numArcs++;
-
+
head->inArc[ head->inDegree] = nar;
head->inDegree++;
if ( head->inDegree >= head->maxInDegree ) {
newNumber = head->maxInDegree + SMALL_BLOCK_SIZE;
- ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
+ ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
memcpy(ars,head->inArc,(head->inDegree)*sizeof(DGArc*));
free(head->inArc);
head->inArc=ars;
tail->outDegree++;
if(tail->outDegree >= tail->maxOutDegree ) {
newNumber = tail->maxOutDegree + SMALL_BLOCK_SIZE;
- ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
+ ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
memcpy(ars,tail->outArc,(tail->outDegree)*sizeof(DGArc*));
free(tail->outArc);
tail->outArc=ars;
tail->maxOutDegree = newNumber;
}
/*fprintf(stderr,"AttachArc: head->in=%d tail->out=%ld\n",head->inDegree,tail->outDegree);*/
-return arcId;
+ return arcId;
}
+
void graphShow(DGraph *dg,int DetailsLevel){
int i=0,j=0;
fprintf(stderr,"%d.%s: (%d,%d)\n",dg->id,dg->name,dg->numNodes,dg->numArcs);
DGNode *focusNode = dg->node[ i];
if(DetailsLevel >= 2) {
for (j = 0; j < focusNode->inDegree; j++ ) {
- fprintf(stderr,"\t ");
- nodeShow(focusNode->inArc[ j]->tail);
+ fprintf(stderr,"\t ");
+ nodeShow(focusNode->inArc[ j]->tail);
}
}
nodeShow(focusNode);
for (j = 0; j < focusNode->outDegree; j++ ) {
fprintf(stderr, "\t ");
nodeShow(focusNode->outArc[ j]->head);
- }
+ }
fprintf(stderr, "---\n");
}
fprintf(stderr,"----------------------------------------\n");
if ( DetailsLevel < 3) return;
}
-
-
-
+++ /dev/null
-BENCHMARK=dt
-include ../config/make.def
-include ../sys/make.common
-
-OBJS = DGraph.o ${COMMON}/c_print_results.o ${COMMON}/c_timers.o ${COMMON}/randdp.o
-
-${PROGRAM}: config dt.o dt-folding.o ${OBJS}
- ${CLINK} ${CLINKFLAGS} -o $(BINDIR)/dt.${CLASS} dt.o ${OBJS} ${CMPI_LIB}
- ${CLINK} ${CLINKFLAGS} -o ${BINDIR}/dt-folding.${CLASS} dt-folding.o ${OBJS} ${CMPI_LIB}
-
-.c.o:
- ${CCOMPILE} $<
-
-dt.o: dt.c npbparams.h
-dt-folding.o: dt-folding.c npbparams.h
-DGraph.o: DGraph.c DGraph.h
-
-clean:
- - rm -f *.o *~ npbparams.h
+++ /dev/null
-Data Traffic benchmark DT is new in the NPB suite
-(released as part of NPB3.x-MPI package).
-----------------------------------------------------
-
-DT is written in C and same executable can run on any number of processors,
-provided this number is not less than the number of nodes in the communication
-graph. DT benchmark takes one argument: BH, WH, or SH. This argument
-specifies the communication graph Black Hole, White Hole, or SHuffle
-respectively. The current release contains verification numbers for
-CLASSES S, W, A, and B only. Classes C and D are defined, but verification
-numbers are not provided in this release.
-
-The following table summarizes the number of nodes in the communication
-graph based on CLASS and graph TYPE.
-
-CLASS N_Source N_Nodes(BH,WH) N_Nodes(SH)
- S 4 5 12
- W 8 11 32
- A 16 21 80
- B 32 43 192
- C 64 85 448
- D 128 171 1024
+++ /dev/null
-/*************************************************************************
- * *
- * N A S P A R A L L E L B E N C H M A R K S 3.3 *
- * *
- * D T *
- * *
- *************************************************************************
- * *
- * This benchmark is part of the NAS Parallel Benchmark 3.3 suite. *
- * *
- * Permission to use, copy, distribute and modify this software *
- * for any purpose with or without fee is hereby granted. We *
- * request, however, that all derived work reference the NAS *
- * Parallel Benchmarks 3.3. This software is provided "as is" *
- * without express or implied warranty. *
- * *
- * Information on NPB 3.3, including the technical report, the *
- * original specifications, source code, results and information *
- * on how to submit new results, is available at: *
- * *
- * http: www.nas.nasa.gov/Software/NPB *
- * *
- * Send comments or suggestions to npb@nas.nasa.gov *
- * Send bug reports to npb-bugs@nas.nasa.gov *
- * *
- * NAS Parallel Benchmarks Group *
- * NASA Ames Research Center *
- * Mail Stop: T27A-1 *
- * Moffett Field, CA 94035-1000 *
- * *
- * E-mail: npb@nas.nasa.gov *
- * Fax: (650) 604-3957 *
- * *
- *************************************************************************
- * *
- * Author: M. Frumkin * *
- * *
- *************************************************************************/
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "mpi.h"
-#include "npbparams.h"
-
-#ifndef CLASS
-#define CLASS 'S'
-#define NUM_PROCS 1
-#endif
-
-//int passed_verification;
-extern double randlc( double *X, double *A );
-extern
-void c_print_results( char *name,
- char class,
- int n1,
- int n2,
- int n3,
- int niter,
- int nprocs_compiled,
- int nprocs_total,
- double t,
- double mops,
- char *optype,
- int passed_verification,
- char *npbversion,
- char *compiletime,
- char *mpicc,
- char *clink,
- char *cmpi_lib,
- char *cmpi_inc,
- char *cflags,
- char *clinkflags );
-
-void timer_clear( int n );
-void timer_start( int n );
-void timer_stop( int n );
-double timer_read( int n );
-int timer_on=0,timers_tot=64;
-
-int verify(char *bmname,double rnm2){
- double verify_value=0.0;
- double epsilon=1.0E-8;
- char cls=CLASS;
- int verified=-1;
- if (cls != 'U') {
- if(cls=='S') {
- if(strstr(bmname,"BH")){
- verify_value=30892725.0;
- }else if(strstr(bmname,"WH")){
- verify_value=67349758.0;
- }else if(strstr(bmname,"SH")){
- verify_value=58875767.0;
- }else{
- fprintf(stderr,"No such benchmark as %s.\n",bmname);
- }
- verified = 0;
- }else if(cls=='W') {
- if(strstr(bmname,"BH")){
- verify_value = 4102461.0;
- }else if(strstr(bmname,"WH")){
- verify_value = 204280762.0;
- }else if(strstr(bmname,"SH")){
- verify_value = 186944764.0;
- }else{
- fprintf(stderr,"No such benchmark as %s.\n",bmname);
- }
- verified = 0;
- }else if(cls=='A') {
- if(strstr(bmname,"BH")){
- verify_value = 17809491.0;
- }else if(strstr(bmname,"WH")){
- verify_value = 1289925229.0;
- }else if(strstr(bmname,"SH")){
- verify_value = 610856482.0;
- }else{
- fprintf(stderr,"No such benchmark as %s.\n",bmname);
- }
- verified = 0;
- }else if(cls=='B') {
- if(strstr(bmname,"BH")){
- verify_value = 4317114.0;
- }else if(strstr(bmname,"WH")){
- verify_value = 7877279917.0;
- }else if(strstr(bmname,"SH")){
- verify_value = 1836863082.0;
- }else{
- fprintf(stderr,"No such benchmark as %s.\n",bmname);
- verified = 0;
- }
- }else if(cls=='C') {
- if(strstr(bmname,"BH")){
- verify_value = 0.0;
- }else if(strstr(bmname,"WH")){
- verify_value = 0.0;
- }else if(strstr(bmname,"SH")){
- verify_value = 0.0;
- }else{
- fprintf(stderr,"No such benchmark as %s.\n",bmname);
- verified = -1;
- }
- }else if(cls=='D') {
- if(strstr(bmname,"BH")){
- verify_value = 0.0;
- }else if(strstr(bmname,"WH")){
- verify_value = 0.0;
- }else if(strstr(bmname,"SH")){
- verify_value = 0.0;
- }else{
- fprintf(stderr,"No such benchmark as %s.\n",bmname);
- }
- verified = -1;
- }else{
- fprintf(stderr,"No such class as %c.\n",cls);
- }
- fprintf(stderr," %s L2 Norm = %f\n",bmname,rnm2);
- if(verified==-1){
- fprintf(stderr," No verification was performed.\n");
- }else if( rnm2 - verify_value < epsilon &&
- rnm2 - verify_value > -epsilon) { /* abs here does not work on ALTIX */
- verified = 1;
- fprintf(stderr," Deviation = %f\n",(rnm2 - verify_value));
- }else{
- verified = 0;
- fprintf(stderr," The correct verification value = %f\n",verify_value);
- fprintf(stderr," Got value = %f\n",rnm2);
- }
- }else{
- verified = -1;
- }
- return verified;
- }
-
-int ipowMod(int a,long long int n,int md){
- int seed=1,q=a,r=1;
- if(n<0){
- fprintf(stderr,"ipowMod: exponent must be nonnegative exp=%lld\n",n);
- n=-n; /* temp fix */
-/* return 1; */
- }
- if(md<=0){
- fprintf(stderr,"ipowMod: module must be positive mod=%d",md);
- return 1;
- }
- if(n==0) return 1;
- while(n>1){
- int n2 = n/2;
- if (n2*2==n){
- seed = (q*q)%md;
- q=seed;
- n = n2;
- }else{
- seed = (r*q)%md;
- r=seed;
- n = n-1;
- }
- }
- seed = (r*q)%md;
- return seed;
-}
-
-#include "DGraph.h"
-DGraph *buildSH(char cls){
-/*
- Nodes of the graph must be topologically sorted
- to avoid MPI deadlock.
-*/
- DGraph *dg;
- int numSources=NUM_SOURCES; /* must be power of 2 */
- int numOfLayers=0,tmpS=numSources>>1;
- int firstLayerNode=0;
- DGArc *ar=NULL;
- DGNode *nd=NULL;
- int mask=0x0,ndid=0,ndoff=0;
- int i=0,j=0;
- char nm[BLOCK_SIZE];
-
- sprintf(nm,"DT_SH.%c",cls);
- dg=newDGraph(nm);
-
- while(tmpS>1){
- numOfLayers++;
- tmpS>>=1;
- }
- for(i=0;i<numSources;i++){
- sprintf(nm,"Source.%d",i);
- nd=newNode(nm);
- AttachNode(dg,nd);
- }
- for(j=0;j<numOfLayers;j++){
- mask=0x00000001<<j;
- for(i=0;i<numSources;i++){
- sprintf(nm,"Comparator.%d",(i+j*firstLayerNode));
- nd=newNode(nm);
- AttachNode(dg,nd);
- ndoff=i&(~mask);
- ndid=firstLayerNode+ndoff;
- ar=newArc(dg->node[ndid],nd);
- AttachArc(dg,ar);
- ndoff+=mask;
- ndid=firstLayerNode+ndoff;
- ar=newArc(dg->node[ndid],nd);
- AttachArc(dg,ar);
- }
- firstLayerNode+=numSources;
- }
- mask=0x00000001<<numOfLayers;
- for(i=0;i<numSources;i++){
- sprintf(nm,"Sink.%d",i);
- nd=newNode(nm);
- AttachNode(dg,nd);
- ndoff=i&(~mask);
- ndid=firstLayerNode+ndoff;
- ar=newArc(dg->node[ndid],nd);
- AttachArc(dg,ar);
- ndoff+=mask;
- ndid=firstLayerNode+ndoff;
- ar=newArc(dg->node[ndid],nd);
- AttachArc(dg,ar);
- }
-return dg;
-}
-DGraph *buildWH(char cls){
-/*
- Nodes of the graph must be topologically sorted
- to avoid MPI deadlock.
-*/
- int i=0,j=0;
- int numSources=NUM_SOURCES,maxInDeg=4;
- int numLayerNodes=numSources,firstLayerNode=0;
- int totComparators=0;
- int numPrevLayerNodes=numLayerNodes;
- int id=0,sid=0;
- DGraph *dg;
- DGNode *nd=NULL,*source=NULL,*tmp=NULL,*snd=NULL;
- DGArc *ar=NULL;
- char nm[BLOCK_SIZE];
-
- sprintf(nm,"DT_WH.%c",cls);
- dg=newDGraph(nm);
-
- for(i=0;i<numSources;i++){
- sprintf(nm,"Sink.%d",i);
- nd=newNode(nm);
- AttachNode(dg,nd);
- }
- totComparators=0;
- numPrevLayerNodes=numLayerNodes;
- while(numLayerNodes>maxInDeg){
- numLayerNodes=numLayerNodes/maxInDeg;
- if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
- for(i=0;i<numLayerNodes;i++){
- sprintf(nm,"Comparator.%d",totComparators);
- totComparators++;
- nd=newNode(nm);
- id=AttachNode(dg,nd);
- for(j=0;j<maxInDeg;j++){
- sid=i*maxInDeg+j;
- if(sid>=numPrevLayerNodes) break;
- snd=dg->node[firstLayerNode+sid];
- ar=newArc(dg->node[id],snd);
- AttachArc(dg,ar);
- }
- }
- firstLayerNode+=numPrevLayerNodes;
- numPrevLayerNodes=numLayerNodes;
- }
- source=newNode("Source");
- AttachNode(dg,source);
- for(i=0;i<numPrevLayerNodes;i++){
- nd=dg->node[firstLayerNode+i];
- ar=newArc(source,nd);
- AttachArc(dg,ar);
- }
-
- for(i=0;i<dg->numNodes/2;i++){ /* Topological sorting */
- tmp=dg->node[i];
- dg->node[i]=dg->node[dg->numNodes-1-i];
- dg->node[i]->id=i;
- dg->node[dg->numNodes-1-i]=tmp;
- dg->node[dg->numNodes-1-i]->id=dg->numNodes-1-i;
- }
-return dg;
-}
-DGraph *buildBH(char cls){
-/*
- Nodes of the graph must be topologically sorted
- to avoid MPI deadlock.
-*/
- int i=0,j=0;
- int numSources=NUM_SOURCES,maxInDeg=4;
- int numLayerNodes=numSources,firstLayerNode=0;
- DGraph *dg;
- DGNode *nd=NULL, *snd=NULL, *sink=NULL;
- DGArc *ar=NULL;
- int totComparators=0;
- int numPrevLayerNodes=numLayerNodes;
- int id=0, sid=0;
- char nm[BLOCK_SIZE];
-
- sprintf(nm,"DT_BH.%c",cls);
- dg=newDGraph(nm);
-
- for(i=0;i<numSources;i++){
- sprintf(nm,"Source.%d",i);
- nd=newNode(nm);
- AttachNode(dg,nd);
- }
- while(numLayerNodes>maxInDeg){
- numLayerNodes=numLayerNodes/maxInDeg;
- if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
- for(i=0;i<numLayerNodes;i++){
- sprintf(nm,"Comparator.%d",totComparators);
- totComparators++;
- nd=newNode(nm);
- id=AttachNode(dg,nd);
- for(j=0;j<maxInDeg;j++){
- sid=i*maxInDeg+j;
- if(sid>=numPrevLayerNodes) break;
- snd=dg->node[firstLayerNode+sid];
- ar=newArc(snd,dg->node[id]);
- AttachArc(dg,ar);
- }
- }
- firstLayerNode+=numPrevLayerNodes;
- numPrevLayerNodes=numLayerNodes;
- }
- sink=newNode("Sink");
- AttachNode(dg,sink);
- for(i=0;i<numPrevLayerNodes;i++){
- nd=dg->node[firstLayerNode+i];
- ar=newArc(nd,sink);
- AttachArc(dg,ar);
- }
-return dg;
-}
-
-typedef struct{
- int len;
- double* val;
-} Arr;
-Arr *newArr(int len){
- Arr *arr=(Arr *)SMPI_SHARED_MALLOC(sizeof(Arr));
- arr->len=len;
- arr->val=(double *)SMPI_SHARED_MALLOC(len*sizeof(double));
- return arr;
-}
-void arrShow(Arr* a){
- if(!a) fprintf(stderr,"-- NULL array\n");
- else{
- fprintf(stderr,"-- length=%d\n",a->len);
- }
-}
-double CheckVal(Arr *feat){
- double csum=0.0;
- int i=0;
- for(i=0;i<feat->len;i++){
- csum+=feat->val[i]*feat->val[i]/feat->len; /* The truncation does not work since
- result will be 0 for large len */
- }
- return csum;
-}
-int GetFNumDPar(int* mean, int* stdev){
- *mean=NUM_SAMPLES;
- *stdev=STD_DEVIATION;
- return 0;
-}
-int GetFeatureNum(char *mbname,int id){
- double tran=314159265.0;
- double A=2*id+1;
- double denom=randlc(&tran,&A);
- char cval='S';
- int mean=NUM_SAMPLES,stdev=128;
- int rtfs=0,len=0;
- GetFNumDPar(&mean,&stdev);
- rtfs=ipowMod((int)(1/denom)*(int)cval,(long long int) (2*id+1),2*stdev);
- if(rtfs<0) rtfs=-rtfs;
- len=mean-stdev+rtfs;
- return len;
-}
-Arr* RandomFeatures(char *bmname,int fdim,int id){
- int len=GetFeatureNum(bmname,id)*fdim;
- Arr* feat=newArr(len);
- int nxg=2,nyg=2,nzg=2,nfg=5;
- int nx=421,ny=419,nz=1427,nf=3527;
- long long int expon=(len*(id+1))%3141592;
- int seedx=ipowMod(nxg,expon,nx),
- seedy=ipowMod(nyg,expon,ny),
- seedz=ipowMod(nzg,expon,nz),
- seedf=ipowMod(nfg,expon,nf);
- int i=0;
- if(timer_on){
- timer_clear(id+1);
- timer_start(id+1);
- }
- for(i=0;i<len;i+=fdim){
- seedx=(seedx*nxg)%nx;
- seedy=(seedy*nyg)%ny;
- seedz=(seedz*nzg)%nz;
- seedf=(seedf*nfg)%nf;
- feat->val[i]=seedx;
- feat->val[i+1]=seedy;
- feat->val[i+2]=seedz;
- feat->val[i+3]=seedf;
- }
- if(timer_on){
- timer_stop(id+1);
- fprintf(stderr,"** RandomFeatures time in node %d = %f\n",id,timer_read(id+1));
- }
- return feat;
-}
-void Resample(Arr *a,int blen){
- long long int i=0,j=0,jlo=0,jhi=0;
- double avval=0.0;
- double *nval=(double *)SMPI_SHARED_MALLOC(blen*sizeof(double));
- Arr *tmp=newArr(10);
- for(i=0;i<blen;i++) nval[i]=0.0;
- for(i=1;i<a->len-1;i++){
- jlo=(int)(0.5*(2*i-1)*(blen/a->len));
- jhi=(int)(0.5*(2*i+1)*(blen/a->len));
-
- avval=a->val[i]/(jhi-jlo+1);
- for(j=jlo;j<=jhi;j++){
- nval[j]+=avval;
- }
- }
- nval[0]=a->val[0];
- nval[blen-1]=a->val[a->len-1];
- SMPI_SHARED_FREE(a->val);
- a->val=nval;
- a->len=blen;
-}
-#define fielddim 4
-Arr* WindowFilter(Arr *a, Arr* b,int w){
- int i=0,j=0,k=0;
- double rms0=0.0,rms1=0.0,rmsm1=0.0;
- double weight=((double) (w+1))/(w+2);
-
- w+=1;
- if(timer_on){
- timer_clear(w);
- timer_start(w);
- }
- if(a->len<b->len) Resample(a,b->len);
- if(a->len>b->len) Resample(b,a->len);
- for(i=fielddim;i<a->len-fielddim;i+=fielddim){
- rms0=(a->val[i]-b->val[i])*(a->val[i]-b->val[i])
- +(a->val[i+1]-b->val[i+1])*(a->val[i+1]-b->val[i+1])
- +(a->val[i+2]-b->val[i+2])*(a->val[i+2]-b->val[i+2])
- +(a->val[i+3]-b->val[i+3])*(a->val[i+3]-b->val[i+3]);
- j=i+fielddim;
- rms1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
- +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
- +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
- +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
- j=i-fielddim;
- rmsm1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
- +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
- +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
- +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
- k=0;
- if(rms1<rms0){
- k=1;
- rms0=rms1;
- }
- if(rmsm1<rms0) k=-1;
- if(k==0){
- j=i+fielddim;
- a->val[i]=weight*b->val[i];
- a->val[i+1]=weight*b->val[i+1];
- a->val[i+2]=weight*b->val[i+2];
- a->val[i+3]=weight*b->val[i+3];
- }else if(k==1){
- j=i+fielddim;
- a->val[i]=weight*b->val[j];
- a->val[i+1]=weight*b->val[j+1];
- a->val[i+2]=weight*b->val[j+2];
- a->val[i+3]=weight*b->val[j+3];
- }else { /*if(k==-1)*/
- j=i-fielddim;
- a->val[i]=weight*b->val[j];
- a->val[i+1]=weight*b->val[j+1];
- a->val[i+2]=weight*b->val[j+2];
- a->val[i+3]=weight*b->val[j+3];
- }
- }
- if(timer_on){
- timer_stop(w);
- fprintf(stderr,"** WindowFilter time in node %d = %f\n",(w-1),timer_read(w));
- }
- return a;
-}
-
-int SendResults(DGraph *dg,DGNode *nd,Arr *feat){
- int i=0,tag=0;
- DGArc *ar=NULL;
- DGNode *head=NULL;
- if(!feat) return 0;
- for(i=0;i<nd->outDegree;i++){
- ar=nd->outArc[i];
- if(ar->tail!=nd) continue;
- head=ar->head;
- tag=ar->id;
- if(head->address!=nd->address){
- MPI_Send(&feat->len,1,MPI_INT,head->address,tag,MPI_COMM_WORLD);
- MPI_Send(feat->val,feat->len,MPI_DOUBLE,head->address,tag,MPI_COMM_WORLD);
- }
- }
- return 1;
-}
-Arr* CombineStreams(DGraph *dg,DGNode *nd){
- Arr *resfeat=newArr(NUM_SAMPLES*fielddim);
- int i=0,len=0,tag=0;
- DGArc *ar=NULL;
- DGNode *tail=NULL;
- MPI_Status status;
- Arr *feat=NULL,*featp=NULL;
-
- if(nd->inDegree==0) return NULL;
- for(i=0;i<nd->inDegree;i++){
- ar=nd->inArc[i];
- if(ar->head!=nd) continue;
- tail=ar->tail;
- if(tail->address!=nd->address){
- len=0;
- tag=ar->id;
- MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
- feat=newArr(len);
- MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
- resfeat=WindowFilter(resfeat,feat,nd->id);
- SMPI_SHARED_FREE(feat);
- }else{
- featp=(Arr *)tail->feat;
- feat=newArr(featp->len);
- memcpy(feat->val,featp->val,featp->len*sizeof(double));
- resfeat=WindowFilter(resfeat,feat,nd->id);
- SMPI_SHARED_FREE(feat);
- }
- }
- for(i=0;i<resfeat->len;i++) resfeat->val[i]=((int)resfeat->val[i])/nd->inDegree;
- nd->feat=resfeat;
- return nd->feat;
-}
-double Reduce(Arr *a,int w){
- double retv=0.0;
- if(timer_on){
- timer_clear(w);
- timer_start(w);
- }
- retv=(int)(w*CheckVal(a));/* The casting needed for node
- and array dependent verifcation */
- if(timer_on){
- timer_stop(w);
- fprintf(stderr,"** Reduce time in node %d = %f\n",(w-1),timer_read(w));
- }
- return retv;
-}
-
-double ReduceStreams(DGraph *dg,DGNode *nd){
- double csum=0.0;
- int i=0,len=0,tag=0;
- DGArc *ar=NULL;
- DGNode *tail=NULL;
- Arr *feat=NULL;
- double retv=0.0;
-
- for(i=0;i<nd->inDegree;i++){
- ar=nd->inArc[i];
- if(ar->head!=nd) continue;
- tail=ar->tail;
- if(tail->address!=nd->address){
- MPI_Status status;
- len=0;
- tag=ar->id;
- MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
- feat=newArr(len);
- MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
- csum+=Reduce(feat,(nd->id+1));
- SMPI_SHARED_FREE(feat);
- }else{
- csum+=Reduce(tail->feat,(nd->id+1));
- }
- }
- if(nd->inDegree>0)csum=(((long long int)csum)/nd->inDegree);
- retv=(nd->id+1)*csum;
- return retv;
-}
-
-int ProcessNodes(DGraph *dg,int me){
- double chksum=0.0;
- Arr *feat=NULL;
- int i=0,verified=0,tag;
- DGNode *nd=NULL;
- double rchksum=0.0;
- MPI_Status status;
-
- for(i=0;i<dg->numNodes;i++){
- nd=dg->node[i];
- if(nd->address!=me) continue;
- if(strstr(nd->name,"Source")){
- nd->feat=RandomFeatures(dg->name,fielddim,nd->id);
- SendResults(dg,nd,nd->feat);
- }else if(strstr(nd->name,"Sink")){
- chksum=ReduceStreams(dg,nd);
- tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */
- MPI_Send(&chksum,1,MPI_DOUBLE,0,tag,MPI_COMM_WORLD);
- }else{
- feat=CombineStreams(dg,nd);
- SendResults(dg,nd,feat);
- }
- }
- if(me==0){ /* Report node */
- rchksum=0.0;
- chksum=0.0;
- for(i=0;i<dg->numNodes;i++){
- nd=dg->node[i];
- if(!strstr(nd->name,"Sink")) continue;
- tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */
- MPI_Recv(&rchksum,1,MPI_DOUBLE,nd->address,tag,MPI_COMM_WORLD,&status);
- chksum+=rchksum;
- }
- verified=verify(dg->name,chksum);
- }
-return verified;
-}
-
-int main(int argc,char **argv ){
- int my_rank,comm_size;
- int i;
- DGraph *dg=NULL;
- int verified=0, featnum=0;
- double bytes_sent=2.0,tot_time=0.0;
-
- MPI_Init( &argc, &argv );
- MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
- MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
-
- if(argc!=2||
- ( strncmp(argv[1],"BH",2)!=0
- &&strncmp(argv[1],"WH",2)!=0
- &&strncmp(argv[1],"SH",2)!=0
- )
- ){
- if(my_rank==0){
- fprintf(stderr,"** Usage: mpirun -np N ../bin/dt.S GraphName\n");
- fprintf(stderr,"** Where \n - N is integer number of MPI processes\n");
- fprintf(stderr," - S is the class S, W, or A \n");
- fprintf(stderr," - GraphName is the communication graph name BH, WH, or SH.\n");
- fprintf(stderr," - the number of MPI processes N should not be be less than \n");
- fprintf(stderr," the number of nodes in the graph\n");
- }
- MPI_Finalize();
- exit(0);
- }
- if(strncmp(argv[1],"BH",2)==0){
- dg=buildBH(CLASS);
- }else if(strncmp(argv[1],"WH",2)==0){
- dg=buildWH(CLASS);
- }else if(strncmp(argv[1],"SH",2)==0){
- dg=buildSH(CLASS);
- }
-
- if(timer_on&&dg->numNodes+1>timers_tot){
- timer_on=0;
- if(my_rank==0)
- fprintf(stderr,"Not enough timers. Node timeing is off. \n");
- }
- if(dg->numNodes>comm_size){
- if(my_rank==0){
- fprintf(stderr,"** The number of MPI processes should not be less than \n");
- fprintf(stderr,"** the number of nodes in the graph\n");
- fprintf(stderr,"** Number of MPI processes = %d\n",comm_size);
- fprintf(stderr,"** Number nodes in the graph = %d\n",dg->numNodes);
- }
- MPI_Finalize();
- exit(0);
- }
- for(i=0;i<dg->numNodes;i++){
- dg->node[i]->address=i;
- }
- if( my_rank == 0 ){
- printf( "\n\n NAS Parallel Benchmarks 3.3 -- DT Benchmark\n\n" );
- graphShow(dg,0);
- timer_clear(0);
- timer_start(0);
- }
- verified=ProcessNodes(dg,my_rank);
-
- featnum=NUM_SAMPLES*fielddim;
- bytes_sent=featnum*dg->numArcs;
- bytes_sent/=1048576;
- if(my_rank==0){
- timer_stop(0);
- tot_time=timer_read(0);
- c_print_results( dg->name,
- CLASS,
- featnum,
- 0,
- 0,
- dg->numNodes,
- 0,
- comm_size,
- tot_time,
- bytes_sent/tot_time,
- "bytes transmitted",
- verified,
- NPBVERSION,
- COMPILETIME,
- MPICC,
- CLINK,
- CMPI_LIB,
- CMPI_INC,
- CFLAGS,
- CLINKFLAGS );
- }
- MPI_Finalize();
- return 1;
-}
+++ /dev/null
-BENCHMARK=ep
-include ../config/make.def
-include ../sys/make.common
-
-${PROGRAM}: config ep.o ep-sampling.o ../common/randdp.o
- ${CLINK} ${CLINKFLAGS} -o ${BINDIR}/ep.${CLASS}.${NPROCS} ep.o ../common/randdp.o ${CMPI_LIB} -lm
- ${CLINK} ${CLINKFLAGS} -o ${BINDIR}/ep-sampling.${CLASS}.${NPROCS} ep-sampling.o ../common/randdp.o ${CMPI_LIB} -lm
-
-ep.o: ep.c npbparams.h
- ${CCOMPILE} ep.c
-ep-sampling.o: ep-sampling.c npbparams.h
- ${CCOMPILE} ep-sampling.c
-
-clean:
- - rm -f *.o *~ npbparams.h
+++ /dev/null
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-
-#include "mpi.h"
-#include "npbparams.h"
-
-#ifndef CLASS
-#define CLASS 'S'
-#define NUM_PROCS 1
-#endif
-#define true 1
-#define false 0
-
-
-//---NOTE : all the timers function have been modified to
-// avoid global timers (privatize these).
- // ----------------------- timers ---------------------
- void timer_clear(double *onetimer) {
- //elapsed[n] = 0.0;
- *onetimer = 0.0;
- }
-
- void timer_start(double *onetimer) {
- *onetimer = MPI_Wtime();
- }
-
- void timer_stop(int n,double *elapsed,double *start) {
- double t, now;
-
- now = MPI_Wtime();
- t = now - start[n];
- elapsed[n] += t;
- }
-
- double timer_read(int n, double *elapsed) { /* ok, useless, but jsut to keep function call */
- return(elapsed[n]);
- }
- /********************************************************************
- ***************** V R A N L C ******************
- ***************** *****************/
- double vranlc(int n, double x, double a, double *y)
- {
- int i;
- long i246m1=0x00003FFFFFFFFFFF;
- long LLx, Lx, La;
- double d2m46;
-
-// This doesn't work, because the compiler does the calculation in 32
-// bits and overflows. No standard way (without f90 stuff) to specify
-// that the rhs should be done in 64 bit arithmetic.
-// parameter(i246m1=2**46-1)
-
- d2m46=pow(0.5,46);
-
-// c Note that the v6 compiler on an R8000 does something stupid with
-// c the above. Using the following instead (or various other things)
-// c makes the calculation run almost 10 times as fast.
-//
-// c save d2m46
-// c data d2m46/0.0d0/
-// c if (d2m46 .eq. 0.0d0) then
-// c d2m46 = 0.5d0**46
-// c endif
-
- Lx = (long)x;
- La = (long)a;
- //fprintf(stdout,("================== Vranlc ================");
- //fprintf(stdout,("Before Loop: Lx = " + Lx + ", La = " + La);
- LLx = Lx;
- for (i=0; i< n; i++) {
- Lx = Lx*La & i246m1 ;
- LLx = Lx;
- y[i] = d2m46 * (double)LLx;
- /*
- if(i == 0) {
- fprintf(stdout,("After loop 0:");
- fprintf(stdout,("Lx = " + Lx + ", La = " + La);
- fprintf(stdout,("d2m46 = " + d2m46);
- fprintf(stdout,("LLX(Lx) = " + LLX.doubleValue());
- fprintf(stdout,("Y[0]" + y[0]);
- }
- */
- }
-
- x = (double)LLx;
- /*
- fprintf(stdout,("Change: Lx = " + Lx);
- fprintf(stdout,("=============End Vranlc ================");
- */
- return x;
- }
-
-
-
-//-------------- the core (unique function) -----------
- void doTest(int argc, char **argv) {
- double dum[3] = {1.,1.,1.};
- double x1, x2, sx, sy, tm, an, tt, gc;
- double Mops;
- double epsilon=1.0E-8, a = 1220703125., s=271828183.;
- double t1, t2, t3, t4;
- double sx_verify_value, sy_verify_value, sx_err, sy_err;
-
-#include "npbparams.h"
- int mk=16,
- // --> set by make : in npbparams.h
- //m=28, // for CLASS=A
- //m=30, // for CLASS=B
- //npm=2, // NPROCS
- mm = m-mk,
- nn = (int)(pow(2,mm)),
- nk = (int)(pow(2,mk)),
- nq=10,
- np,
- node,
- no_nodes,
- i,
- ik,
- kk,
- l,
- k, nit, no_large_nodes,
- np_add, k_offset, j;
- int me, nprocs, root=0, dp_type;
- int verified,
- timers_enabled=true;
- char size[500]; // mind the size of the string to represent a big number
-
- //Use in randlc..
- int KS = 0;
- double R23, R46, T23, T46;
-
- double *qq = (double *) malloc (10000*sizeof(double));
- double *start = (double *) malloc (64*sizeof(double));
- double *elapsed = (double *) malloc (64*sizeof(double));
-
- double *x = (double *) malloc (2*nk*sizeof(double));
- double *q = (double *) malloc (nq*sizeof(double));
-
- MPI_Init( &argc, &argv );
- MPI_Comm_size( MPI_COMM_WORLD, &no_nodes);
- MPI_Comm_rank( MPI_COMM_WORLD, &node);
-
-#ifdef USE_MPE
- MPE_Init_log();
-#endif
- root = 0;
- if (node == root ) {
-
- /* Because the size of the problem is too large to store in a 32-bit
- * integer for some classes, we put it into a string (for printing).
- * Have to strip off the decimal point put in there by the floating
- * point print statement (internal file)
- */
- fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark");
- sprintf(size,"%d",(int) pow(2,m+1));
- //size = size.replace('.', ' ');
- fprintf(stdout," Number of random numbers generated: %s\n",size);
- fprintf(stdout," Number of active processes: %d\n",no_nodes);
-
- }
- verified = false;
-
- /* c Compute the number of "batches" of random number pairs generated
- c per processor. Adjust if the number of processors does not evenly
- c divide the total number
-*/
-
- np = nn / no_nodes;
- no_large_nodes = nn % no_nodes;
- if (node < no_large_nodes) np_add = 1;
- else np_add = 0;
- np = np + np_add;
-
- if (np == 0) {
- fprintf(stdout,"Too many nodes: %d %d",no_nodes,nn);
- MPI_Abort(MPI_COMM_WORLD,1);
- exit(0);
- }
-
-/* c Call the random number generator functions and initialize
- c the x-array to reduce the effects of paging on the timings.
- c Also, call all mathematical functions that are used. Make
- c sure these initializations cannot be eliminated as dead code.
-*/
-
- //call vranlc(0, dum[1], dum[2], dum[3]);
- // Array indexes start at 1 in Fortran, 0 in Java
- vranlc(0, dum[0], dum[1], &(dum[2]));
-
- dum[0] = randlc(&(dum[1]),&(dum[2]));
- /////////////////////////////////
- for (i=0;i<2*nk;i++) {
- x[i] = -1e99;
- }
- Mops = log(sqrt(abs(1)));
-
- /*
- c---------------------------------------------------------------------
- c Synchronize before placing time stamp
- c---------------------------------------------------------------------
- */
- MPI_Barrier( MPI_COMM_WORLD );
-
- timer_clear(&(elapsed[1]));
- timer_clear(&(elapsed[2]));
- timer_clear(&(elapsed[3]));
- timer_start(&(start[1]));
-
- t1 = a;
- //fprintf(stdout,("(ep.f:160) t1 = " + t1);
- t1 = vranlc(0, t1, a, x);
- //fprintf(stdout,("(ep.f:161) t1 = " + t1);
-
-
-/* c Compute AN = A ^ (2 * NK) (mod 2^46). */
-
- t1 = a;
- //fprintf(stdout,("(ep.f:165) t1 = " + t1);
- for (i=1; i <= mk+1; i++) {
- t2 = randlc(&t1, &t1);
- //fprintf(stdout,("(ep.f:168)[loop i=" + i +"] t1 = " + t1);
- }
- an = t1;
- //fprintf(stdout,("(ep.f:172) s = " + s);
- tt = s;
- gc = 0.;
- sx = 0.;
- sy = 0.;
- for (i=0; i < nq ; i++) {
- q[i] = 0.;
- }
-
-/*
- Each instance of this loop may be performed independently. We compute
- the k offsets separately to take into account the fact that some nodes
- have more numbers to generate than others
-*/
-
- if (np_add == 1)
- k_offset = node * np -1;
- else
- k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1;
-
- int stop = false;
- for(k = 1; k <= np; k++) SMPI_SAMPLE_LOCAL(0.25 * np, 0.03) {
- stop = false;
- kk = k_offset + k ;
- t1 = s;
- //fprintf(stdout,("(ep.f:193) t1 = " + t1);
- t2 = an;
-
-// Find starting seed t1 for this kk.
-
- for (i=1;i<=100 && !stop;i++) {
- ik = kk / 2;
- //fprintf(stdout,("(ep.f:199) ik = " +ik+", kk = " + kk);
- if (2 * ik != kk) {
- t3 = randlc(&t1, &t2);
- //fprintf(stdout,("(ep.f:200) t1= " +t1 );
- }
- if (ik==0)
- stop = true;
- else {
- t3 = randlc(&t2, &t2);
- kk = ik;
- }
- }
-// Compute uniform pseudorandom numbers.
-
- //if (timers_enabled) timer_start(3);
- timer_start(&(start[3]));
- //call vranlc(2 * nk, t1, a, x) --> t1 and y are modified
-
- //fprintf(stdout,">>>>>>>>>>>Before vranlc(l.210)<<<<<<<<<<<<<");
- //fprintf(stdout,"2*nk = " + (2*nk));
- //fprintf(stdout,"t1 = " + t1);
- //fprintf(stdout,"a = " + a);
- //fprintf(stdout,"x[0] = " + x[0]);
- //fprintf(stdout,">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
-
- t1 = vranlc(2 * nk, t1, a, x);
-
- //fprintf(stdout,(">>>>>>>>>>>After Enter vranlc (l.210)<<<<<<");
- //fprintf(stdout,("2*nk = " + (2*nk));
- //fprintf(stdout,("t1 = " + t1);
- //fprintf(stdout,("a = " + a);
- //fprintf(stdout,("x[0] = " + x[0]);
- //fprintf(stdout,(">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
-
- //if (timers_enabled) timer_stop(3);
- timer_stop(3,elapsed,start);
-
-/* Compute Gaussian deviates by acceptance-rejection method and
- * tally counts in concentric square annuli. This loop is not
- * vectorizable.
- */
- //if (timers_enabled) timer_start(2);
- timer_start(&(start[2]));
- for(i=1; i<=nk;i++) {
- x1 = 2. * x[2*i-2] -1.0;
- x2 = 2. * x[2*i-1] - 1.0;
- t1 = x1*x1 + x2*x2;
- if (t1 <= 1.) {
- t2 = sqrt(-2. * log(t1) / t1);
- t3 = (x1 * t2);
- t4 = (x2 * t2);
- l = (int)(abs(t3) > abs(t4) ? abs(t3) : abs(t4));
- q[l] = q[l] + 1.;
- sx = sx + t3;
- sy = sy + t4;
- }
- /*
- if(i == 1) {
- fprintf(stdout,"x1 = " + x1);
- fprintf(stdout,"x2 = " + x2);
- fprintf(stdout,"t1 = " + t1);
- fprintf(stdout,"t2 = " + t2);
- fprintf(stdout,"t3 = " + t3);
- fprintf(stdout,"t4 = " + t4);
- fprintf(stdout,"l = " + l);
- fprintf(stdout,"q[l] = " + q[l]);
- fprintf(stdout,"sx = " + sx);
- fprintf(stdout,"sy = " + sy);
- }
- */
- }
- //if (timers_enabled) timer_stop(2);
- timer_stop(2,elapsed,start);
- }
-
- //int MPI_Allreduce(void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
- MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
- sx = x[0]; //FIXME : x[0] or x[1] => x[0] because fortran starts with 1
- MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
- sy = x[0];
- MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-
- for(i = 0; i < nq; i++) {
- q[i] = x[i];
- }
- for(i = 0; i < nq; i++) {
- gc += q[i];
- }
-
- timer_stop(1,elapsed,start);
- tm = timer_read(1,elapsed);
- MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
- tm = x[0];
-
- if(node == root) {
- nit = 0;
- verified = true;
-
- if(m == 24) {
- sx_verify_value = -3.247834652034740E3;
- sy_verify_value = -6.958407078382297E3;
- } else if(m == 25) {
- sx_verify_value = -2.863319731645753E3;
- sy_verify_value = -6.320053679109499E3;
- } else if(m == 28) {
- sx_verify_value = -4.295875165629892E3;
- sy_verify_value = -1.580732573678431E4;
- } else if(m == 30) {
- sx_verify_value = 4.033815542441498E4;
- sy_verify_value = -2.660669192809235E4;
- } else if(m == 32) {
- sx_verify_value = 4.764367927995374E4;
- sy_verify_value = -8.084072988043731E4;
- } else if(m == 36) {
- sx_verify_value = 1.982481200946593E5;
- sy_verify_value = -1.020596636361769E5;
- } else {
- verified = false;
- }
-
- /*
- fprintf(stdout,("sx = " + sx);
- fprintf(stdout,("sx_verify = " + sx_verify_value);
- fprintf(stdout,("sy = " + sy);
- fprintf(stdout,("sy_verify = " + sy_verify_value);
- */
- if(verified) {
- sx_err = abs((sx - sx_verify_value)/sx_verify_value);
- sy_err = abs((sy - sy_verify_value)/sy_verify_value);
- /*
- fprintf(stdout,("sx_err = " + sx_err);
- fprintf(stdout,("sy_err = " + sx_err);
- fprintf(stdout,("epsilon= " + epsilon);
- */
- verified = ((sx_err < epsilon) && (sy_err < epsilon));
- }
-
- Mops = (pow(2.0, m+1))/tm/1000;
-
- fprintf(stdout,"EP Benchmark Results:\n");
- fprintf(stdout,"CPU Time=%d\n",(int) tm);
- fprintf(stdout,"N = 2^%d\n",m);
- fprintf(stdout,"No. Gaussain Pairs =%d\n",(int) gc);
- fprintf(stdout,"Sum = %f %ld\n",sx,(long) sy);
- fprintf(stdout,"Count:");
- for(i = 0; i < nq; i++) {
- fprintf(stdout,"%d\t %ld\n",i,(long) q[i]);
- }
-
- /*
- print_results("EP", _class, m+1, 0, 0, nit, npm, no_nodes, tm, Mops,
- "Random numbers generated", verified, npbversion,
- compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) */
- fprintf(stdout,"\nEP Benchmark Completed\n");
- fprintf(stdout,"Class = %s\n", _class);
- fprintf(stdout,"Size = %s\n", size);
- fprintf(stdout,"Iteration = %d\n", nit);
- fprintf(stdout,"Time in seconds = %f\n",(tm/1000));
- fprintf(stdout,"Total processes = %d\n",no_nodes);
- fprintf(stdout,"Mops/s total = %f\n",Mops);
- fprintf(stdout,"Mops/s/process = %f\n", Mops/no_nodes);
- fprintf(stdout,"Operation type = Random number generated\n");
- if(verified) {
- fprintf(stdout,"Verification = SUCCESSFUL\n");
- } else {
- fprintf(stdout,"Verification = UNSUCCESSFUL\n");
- }
- fprintf(stdout,"Total time: %f\n",(timer_read(1,elapsed)/1000));
- fprintf(stdout,"Gaussian pairs: %f\n",(timer_read(2,elapsed)/1000));
- fprintf(stdout,"Random numbers: %f\n",(timer_read(3,elapsed)/1000));
- }
-#ifdef USE_MPE
- MPE_Finish_log(argv[0]);
-#endif
-
- MPI_Finalize();
- }
-
- /*
-  * Program entry point: runs the EP kernel through doTest() and reports
-  * success to the environment with an explicit exit status.
-  */
- int main(int argc, char **argv) {
-   doTest(argc,argv);
-   return 0;
- }
+++ /dev/null
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-
-#include "mpi.h"
-#include "npbparams.h"
-
-#include "simgrid/instr.h" //TRACE_
-
-#ifndef CLASS
-#define CLASS 'S'
-#define NUM_PROCS 1
-#endif
-#define true 1
-#define false 0
-
-//---NOTE: all the timer functions have been modified to avoid global
-// timers; each caller passes in its own (private) timer storage.
- // ----------------------- timers ---------------------
- /* Reset the timer value that onetimer points to back to zero. */
- void timer_clear(double *onetimer) {
-   onetimer[0] = 0.0;
- }
-
- /* Store the current MPI_Wtime() reading in the slot onetimer points to. */
- void timer_start(double *onetimer) {
-   const double now = MPI_Wtime();
-   *onetimer = now;
- }
-
- /*
-  * Add the time elapsed since start[n] was recorded to elapsed[n].
-  *
-  * n:       index of the timer in both arrays.
-  * elapsed: accumulated durations, one per timer.
-  * start:   start timestamps, one per timer.
-  */
- void timer_stop(int n,double *elapsed,double *start) {
-   const double finished = MPI_Wtime();
-   const double delta = finished - start[n];
-
-   elapsed[n] = elapsed[n] + delta;
- }
-
- /*
-  * Return the accumulated value of timer n. A trivial accessor, kept only so
-  * that call sites keep the shape of the original function-based timer API.
-  */
- double timer_read(int n, double *elapsed) {
-   const double *slot = elapsed + n;
-   return *slot;
- }
- /********************************************************************
- ***************** V R A N L C ******************
- ***************** *****************/
- /*
-  * vranlc - fill y[0..n-1] with uniform pseudorandom numbers produced by the
-  * linear congruential recurrence  x_{k+1} = a * x_k (mod 2^46).
-  *
-  * n: how many numbers to generate; with n <= 0 nothing is written to y.
-  * x: current seed, truncated to an integer before use.
-  * a: multiplier, truncated to an integer before use.
-  * y: output array receiving x_k * 2^-46 for each generated x_k.
-  *
-  * Returns the last seed generated, as a double (the integer part of x when
-  * n <= 0), so the caller can continue the sequence.
-  *
-  * The product is formed in unsigned 64-bit arithmetic: a 46-bit seed times a
-  * 31-bit multiplier overflows a signed long (undefined behaviour), and long
-  * is only 32 bits wide on some ABIs. Unsigned arithmetic wraps modulo 2^64,
-  * which leaves the low 46 bits that the mask keeps exact.
-  */
- double vranlc(int n, double x, double a, double *y)
- {
-   /* 2^46 - 1: keeping these bits is the reduction modulo 2^46. */
-   const unsigned long long i246m1 = 0x00003FFFFFFFFFFFULL;
-   const double d2m46 = pow(0.5,46);
-   unsigned long long Lx, La;
-   int i;
-
-   /* No iteration: hand back the truncated seed, as the loop-based code did. */
-   if (n <= 0)
-     return (double)(long long)x;
-
-   Lx = (unsigned long long)(long long)x;
-   La = (unsigned long long)(long long)a;
-
-   for (i=0; i< n; i++) {
-     Lx = (Lx * La) & i246m1;
-     y[i] = d2m46 * (double)Lx;
-   }
-
-   return (double)Lx;
- }
-
-
-
-//-------------- the core (unique function) -----------
- /*
-  * doTest - the whole EP (Embarrassingly Parallel) kernel, run by every rank.
-  *
-  * Initializes MPI, splits the nn batches of random numbers between the
-  * ranks, generates 2*nk uniform deviates per batch with vranlc(), converts
-  * them to Gaussian pairs by acceptance-rejection, reduces the sums (sx, sy)
-  * and the annulus counts q[] over MPI_COMM_WORLD, lets rank 0 print the
-  * report and verification verdict, then finalizes MPI.
-  *
-  * argc, argv: program arguments, forwarded to MPI_Init().
-  *
-  * m and _class are not declared in this function; presumably they come from
-  * npbparams.h, which is textually included in the body below.
-  *
-  * NOTE(review): randlc() has no prototype in the visible part of this file;
-  * confirm that a declaration returning double is in scope.
-  */
- void doTest(int argc, char **argv) {
-   double dum[3] = {1.,1.,1.};
-   double x1, x2, sx, sy, tm, an, gc;
-   double Mops;
-   double epsilon=1.0E-8, a = 1220703125., s=271828183.;
-   double t1, t2, t3, t4;
-   double sx_verify_value, sy_verify_value, sx_err, sy_err;
-
-#include "npbparams.h"
-   int mk=16,
-       mm = m-mk,
-       nn = (int)(pow(2,mm)),
-       nk = (int)(pow(2,mk)),
-       nq=10,
-       np,
-       node,
-       no_nodes,
-       i,
-       ik,
-       kk,
-       l,
-       k, nit, no_large_nodes,
-       np_add, k_offset;
-   int root=0;
-   int verified;
-   int stop;
-   char size[500]; // mind the size of the string to represent a big number
-
-   /* Per-rank timer storage and work arrays; released before MPI_Finalize. */
-   double *start = (double *) malloc (64*sizeof(double));
-   double *elapsed = (double *) malloc (64*sizeof(double));
-   double *x = (double *) malloc (2*nk*sizeof(double));
-   double *q = (double *) malloc (nq*sizeof(double));
-
-   TRACE_smpi_set_category ("start");
-
-   MPI_Init( &argc, &argv );
-   MPI_Comm_size( MPI_COMM_WORLD, &no_nodes);
-   MPI_Comm_rank( MPI_COMM_WORLD, &node);
-
-   if (start == NULL || elapsed == NULL || x == NULL || q == NULL) {
-     fprintf(stderr,"EP: out of memory\n");
-     MPI_Abort(MPI_COMM_WORLD,1);
-     exit(1);
-   }
-
-#ifdef USE_MPE
-   MPE_Init_log();
-#endif
-   root = 0;
-   if (node == root ) {
-     /* The problem size does not fit a 32-bit integer for the larger
-      * classes, so it is formatted from the double directly ("%.0f" prints
-      * no decimal point) instead of being cast to int first.
-      */
-     fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark");
-     sprintf(size,"%.0f",pow(2,m+1));
-     fprintf(stdout," Number of random numbers generated: %s\n",size);
-     fprintf(stdout," Number of active processes: %d\n",no_nodes);
-   }
-   verified = false;
-
-   /* Compute the number of "batches" of random number pairs generated per
-    * processor. Adjust if the number of processors does not evenly divide
-    * the total number.
-    */
-   np = nn / no_nodes;
-   no_large_nodes = nn % no_nodes;
-   if (node < no_large_nodes) np_add = 1;
-   else np_add = 0;
-   np = np + np_add;
-
-   if (np == 0) {
-     fprintf(stdout,"Too many nodes: %d %d",no_nodes,nn);
-     MPI_Abort(MPI_COMM_WORLD,1);
-     exit(0);
-   }
-
-   /* Call the random number generator functions and initialize the x-array
-    * to reduce the effects of paging on the timings. Also, call all
-    * mathematical functions that are used. Make sure these initializations
-    * cannot be eliminated as dead code.
-    */
-   vranlc(0, dum[0], dum[1], &(dum[2]));
-   dum[0] = randlc(&(dum[1]),&(dum[2]));
-   for (i=0;i<2*nk;i++) {
-     x[i] = -1e99;
-   }
-   Mops = log(sqrt(fabs(1.0)));
-
-   /* Synchronize before placing time stamp */
-   MPI_Barrier( MPI_COMM_WORLD );
-
-   TRACE_smpi_set_category ("ep");
-
-   timer_clear(&(elapsed[1]));
-   timer_clear(&(elapsed[2]));
-   timer_clear(&(elapsed[3]));
-   timer_start(&(start[1]));
-
-   t1 = a;
-   t1 = vranlc(0, t1, a, x);
-
-   /* Compute AN = A ^ (2 * NK) (mod 2^46). */
-   t1 = a;
-   for (i=1; i <= mk+1; i++) {
-     t2 = randlc(&t1, &t1);
-   }
-   an = t1;
-   gc = 0.;
-   sx = 0.;
-   sy = 0.;
-   for (i=0; i < nq ; i++) {
-     q[i] = 0.;
-   }
-
-   /* Each instance of this loop may be performed independently. We compute
-    * the k offsets separately to take into account the fact that some nodes
-    * have more numbers to generate than others.
-    */
-   if (np_add == 1)
-     k_offset = node * np -1;
-   else
-     k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1;
-
-   for(k = 1; k <= np; k++) {
-     stop = false;
-     kk = k_offset + k ;
-     t1 = s;
-     t2 = an;
-
-     /* Find starting seed t1 for this kk. */
-     for (i=1;i<=100 && !stop;i++) {
-       ik = kk / 2;
-       if (2 * ik != kk) {
-         t3 = randlc(&t1, &t2);
-       }
-       if (ik==0)
-         stop = true;
-       else {
-         t3 = randlc(&t2, &t2);
-         kk = ik;
-       }
-     }
-
-     /* Compute uniform pseudorandom numbers. */
-     timer_start(&(start[3]));
-     t1 = vranlc(2 * nk, t1, a, x);
-     timer_stop(3,elapsed,start);
-
-     /* Compute Gaussian deviates by acceptance-rejection method and tally
-      * counts in concentric square annuli. This loop is not vectorizable.
-      */
-     timer_start(&(start[2]));
-     for(i=1; i<=nk;i++) {
-       x1 = 2. * x[2*i-2] -1.0;
-       x2 = 2. * x[2*i-1] - 1.0;
-       t1 = x1*x1 + x2*x2;
-       if (t1 <= 1.) {
-         t2 = sqrt(-2. * log(t1) / t1);
-         t3 = (x1 * t2);
-         t4 = (x2 * t2);
-         /* fabs, not abs: abs() takes an int and would truncate first. */
-         l = (int)(fabs(t3) > fabs(t4) ? fabs(t3) : fabs(t4));
-         q[l] = q[l] + 1.;
-         sx = sx + t3;
-         sy = sy + t4;
-       }
-     }
-     timer_stop(2,elapsed,start);
-   }
-
-   TRACE_smpi_set_category ("finalize");
-
-   /* x is reused as the receive buffer of the reductions. */
-   MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-   sx = x[0];
-   MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-   sy = x[0];
-   MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-
-   for(i = 0; i < nq; i++) {
-     q[i] = x[i];
-   }
-   for(i = 0; i < nq; i++) {
-     gc += q[i];
-   }
-
-   timer_stop(1,elapsed,start);
-   tm = timer_read(1,elapsed);
-   MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-   tm = x[0];
-
-   if(node == root) {
-     nit = 0;
-     verified = true;
-
-     /* Reference sums for the problem sizes that have known values. */
-     if(m == 24) {
-       sx_verify_value = -3.247834652034740E3;
-       sy_verify_value = -6.958407078382297E3;
-     } else if(m == 25) {
-       sx_verify_value = -2.863319731645753E3;
-       sy_verify_value = -6.320053679109499E3;
-     } else if(m == 28) {
-       sx_verify_value = -4.295875165629892E3;
-       sy_verify_value = -1.580732573678431E4;
-     } else if(m == 30) {
-       sx_verify_value = 4.033815542441498E4;
-       sy_verify_value = -2.660669192809235E4;
-     } else if(m == 32) {
-       sx_verify_value = 4.764367927995374E4;
-       sy_verify_value = -8.084072988043731E4;
-     } else if(m == 36) {
-       sx_verify_value = 1.982481200946593E5;
-       sy_verify_value = -1.020596636361769E5;
-     } else {
-       verified = false;
-     }
-
-     if(verified) {
-       /* Relative errors must be taken with fabs: the integer abs()
-        * truncated every error below 1 to 0, so the check always passed.
-        */
-       sx_err = fabs((sx - sx_verify_value)/sx_verify_value);
-       sy_err = fabs((sy - sy_verify_value)/sy_verify_value);
-       verified = ((sx_err < epsilon) && (sy_err < epsilon));
-     }
-
-     /* NOTE(review): the /1000 factors here and below look inherited from a
-      * millisecond-based timer, while the timers in this file store
-      * MPI_Wtime() readings; confirm the intended unit before changing it.
-      */
-     Mops = (pow(2.0, m+1))/tm/1000;
-
-     fprintf(stdout,"EP Benchmark Results:\n");
-     fprintf(stdout,"CPU Time=%d\n",(int) tm);
-     fprintf(stdout,"N = 2^%d\n",m);
-     fprintf(stdout,"No. Gaussain Pairs =%d\n",(int) gc);
-     fprintf(stdout,"Sum = %f %ld\n",sx,(long) sy);
-     fprintf(stdout,"Count:");
-     for(i = 0; i < nq; i++) {
-       fprintf(stdout,"%d\t %ld\n",i,(long) q[i]);
-     }
-
-     fprintf(stdout,"\nEP Benchmark Completed\n");
-     fprintf(stdout,"Class = %s\n", _class);
-     fprintf(stdout,"Size = %s\n", size);
-     fprintf(stdout,"Iteration = %d\n", nit);
-     fprintf(stdout,"Time in seconds = %f\n",(tm/1000));
-     fprintf(stdout,"Total processes = %d\n",no_nodes);
-     fprintf(stdout,"Mops/s total = %f\n",Mops);
-     fprintf(stdout,"Mops/s/process = %f\n", Mops/no_nodes);
-     fprintf(stdout,"Operation type = Random number generated\n");
-     if(verified) {
-       fprintf(stdout,"Verification = SUCCESSFUL\n");
-     } else {
-       fprintf(stdout,"Verification = UNSUCCESSFUL\n");
-     }
-     fprintf(stdout,"Total time: %f\n",(timer_read(1,elapsed)/1000));
-     fprintf(stdout,"Gaussian pairs: %f\n",(timer_read(2,elapsed)/1000));
-     fprintf(stdout,"Random numbers: %f\n",(timer_read(3,elapsed)/1000));
-   }
-#ifdef USE_MPE
-   MPE_Finish_log(argv[0]);
-#endif
-
-   free(q);
-   free(x);
-   free(elapsed);
-   free(start);
-
-   MPI_Finalize();
- }
-
- /*
-  * Program entry point: runs the EP kernel through doTest() and reports
-  * success to the environment with an explicit exit status.
-  */
- int main(int argc, char **argv) {
-   doTest(argc,argv);
-   return 0;
- }
+++ /dev/null
-# Benchmark name; set before the shared makefile fragments are included.
-BENCHMARK=is
-
-# Shared configuration and common rules; the variables used below that are
-# not assigned in this file (PROGRAM, COMMON, CLINK, ...) presumably come
-# from these two includes.
-include ../config/make.def
-include ../sys/make.common
-
-# Object files linked into the benchmark executable.
-OBJS = is.o ${COMMON}/c_print_results.o
-
-# Link the benchmark once the `config` prerequisite and all objects are up
-# to date.
-${PROGRAM}: config ${OBJS}
- ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${CMPI_LIB}
-
-# Suffix rule: compile each C source into an object file.
-.c.o:
- ${CCOMPILE} $<
-
-# is.o must be rebuilt whenever the generated parameter header changes.
-is.o: is.c npbparams.h
-
-# Remove build products; the leading `-` tells make to ignore rm failures.
-clean:
- - rm -f *.o *~ is npbparams.h
+++ /dev/null
-/*************************************************************************
- * *
- * N A S P A R A L L E L B E N C H M A R K S 3.3 *
- * *
- * I S *
- * *
- *************************************************************************
- * *
- * This benchmark is part of the NAS Parallel Benchmark 3.3 suite. *
- * It is described in NAS Technical Report 95-020. *
- * *
- * Permission to use, copy, distribute and modify this software *
- * for any purpose with or without fee is hereby granted. We *
- * request, however, that all derived work reference the NAS *
- * Parallel Benchmarks 3.3. This software is provided "as is" *
- * without express or implied warranty. *
- * *
- * Information on NPB 3.3, including the technical report, the *
- * original specifications, source code, results and information *
- * on how to submit new results, is available at: *
- * *
- * http://www.nas.nasa.gov/Software/NPB *
- * *
- * Send comments or suggestions to npb@nas.nasa.gov *
- * Send bug reports to npb-bugs@nas.nasa.gov *
- * *
- * NAS Parallel Benchmarks Group *
- * NASA Ames Research Center *
- * Mail Stop: T27A-1 *
- * Moffett Field, CA 94035-1000 *
- * *
- * E-mail: npb@nas.nasa.gov *
- * Fax: (650) 604-3957 *
- * *
- *************************************************************************
- * *
- * Author: M. Yarrow *
- * H. Jin *
- * *
- *************************************************************************/
-
-#include "mpi.h"
-#include "npbparams.h"
-#include <stdlib.h>
-#include <stdio.h>
-
-#include "simgrid/instr.h" //TRACE_
-
-/******************/
-/* default values */
-/******************/
-#ifndef CLASS
-#define CLASS 'S'
-#define NUM_PROCS 1
-#endif
-#define MIN_PROCS 1
-
-
-/*************/
-/* CLASS S */
-/*************/
-#if CLASS == 'S'
-#define TOTAL_KEYS_LOG_2 16
-#define MAX_KEY_LOG_2 11
-#define NUM_BUCKETS_LOG_2 9
-#endif
-
-
-/*************/
-/* CLASS W */
-/*************/
-#if CLASS == 'W'
-#define TOTAL_KEYS_LOG_2 20
-#define MAX_KEY_LOG_2 16
-#define NUM_BUCKETS_LOG_2 10
-#endif
-
-/*************/
-/* CLASS A */
-/*************/
-#if CLASS == 'A'
-#define TOTAL_KEYS_LOG_2 23
-#define MAX_KEY_LOG_2 19
-#define NUM_BUCKETS_LOG_2 10
-#endif
-
-
-/*************/
-/* CLASS B */
-/*************/
-#if CLASS == 'B'
-#define TOTAL_KEYS_LOG_2 25
-#define MAX_KEY_LOG_2 21
-#define NUM_BUCKETS_LOG_2 10
-#endif
-
-
-/*************/
-/* CLASS C */
-/*************/
-#if CLASS == 'C'
-#define TOTAL_KEYS_LOG_2 27
-#define MAX_KEY_LOG_2 23
-#define NUM_BUCKETS_LOG_2 10
-#endif
-
-
-/*************/
-/* CLASS D */
-/*************/
-#if CLASS == 'D'
-#define TOTAL_KEYS_LOG_2 29
-#define MAX_KEY_LOG_2 27
-#define NUM_BUCKETS_LOG_2 10
-#undef MIN_PROCS
-#define MIN_PROCS 4
-#endif
-
-
-#define TOTAL_KEYS (1 << TOTAL_KEYS_LOG_2)
-#define MAX_KEY (1 << MAX_KEY_LOG_2)
-#define NUM_BUCKETS (1 << NUM_BUCKETS_LOG_2)
-#define NUM_KEYS (TOTAL_KEYS/NUM_PROCS*MIN_PROCS)
-
-/*****************************************************************/
-/* On larger number of processors, since the keys are (roughly) */
-/* gaussian distributed, the first and last processor sort keys */
-/* in a large interval, requiring array sizes to be larger. Note */
-/* that for large NUM_PROCS, NUM_KEYS is, however, a small number*/
-/* The required array size also depends on the bucket size used. */
-/* The following values are validated for the 1024-bucket setup. */
-/*****************************************************************/
-/* Each expansion is parenthesized so that SIZE_OF_BUFFERS behaves as a single
-   value wherever it is used; unparenthesized, an expression such as
-   n / SIZE_OF_BUFFERS would associate as (n / 3) * NUM_KEYS / 2. */
-#if NUM_PROCS < 256
-#define SIZE_OF_BUFFERS (3*NUM_KEYS/2)
-#elif NUM_PROCS < 512
-#define SIZE_OF_BUFFERS (5*NUM_KEYS/2)
-#elif NUM_PROCS < 1024
-#define SIZE_OF_BUFFERS (4*NUM_KEYS)
-#else
-#define SIZE_OF_BUFFERS (13*NUM_KEYS/2)
-#endif
-
-/*****************************************************************/
-/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */
-/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE */
-/* MAX_PROCS AT YOUR PERIL */
-/*****************************************************************/
-#if CLASS == 'S'
-#define MAX_PROCS 128
-#else
-#define MAX_PROCS 1024
-#endif
-
-#define MAX_ITERATIONS 10
-#define TEST_ARRAY_SIZE 5
-
-
-/***********************************/
-/* Enable separate communication, */
-/* computation timing and printout */
-/***********************************/
-/* #define TIMING_ENABLED */
-
-
-/*************************************/
-/* Typedef: if necessary, change the */
-/* size of int here by changing the */
-/* int type to, say, long */
-/*************************************/
-typedef int INT_TYPE;
-typedef long INT_TYPE2;
-#define MP_KEY_TYPE MPI_INT
-
-
-/*
- * All state of one IS run, gathered in a single structure that is passed
- * explicitly to every routine instead of living in file-scope globals.
- */
-typedef struct {
-    /* MPI properties */
-    int my_rank;
-    int comm_size;
-
-    /* Some global info: used by full_verify to get copies of rank info */
-    INT_TYPE *key_buff_ptr_global;
-    INT_TYPE total_local_keys;
-    INT_TYPE total_lesser_keys;
-
-    int passed_verification;
-
-    /* The three main arrays; see the SIZE_OF_BUFFERS definition */
-    INT_TYPE key_array[SIZE_OF_BUFFERS];
-    INT_TYPE key_buff1[SIZE_OF_BUFFERS];
-    INT_TYPE key_buff2[SIZE_OF_BUFFERS];
-
-    /* The top TEST_ARRAY_SIZE elements carry the partial verification values */
-    INT_TYPE bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE];
-    INT_TYPE bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE];
-    INT_TYPE bucket_ptrs[NUM_BUCKETS];
-    INT_TYPE process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE];
-    INT_TYPE process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE];
-
-    /* Per-process send/receive counts and displacements */
-    int send_count[MAX_PROCS];
-    int recv_count[MAX_PROCS];
-    int send_displ[MAX_PROCS];
-    int recv_displ[MAX_PROCS];
-
-    /* Partial verification info */
-    INT_TYPE2 test_index_array[TEST_ARRAY_SIZE];
-    INT_TYPE2 test_rank_array[TEST_ARRAY_SIZE];
-
-    /* Timers: start timestamps and accumulated durations */
-    double start[64];
-    double elapsed[64];
-} global_data;
-
-
-/* Partial verification tables: for each problem class, the key indices that
-   are probed and the matching reference rank values. */
-
-/* Class S */
-const INT_TYPE2 S_test_index_array[TEST_ARRAY_SIZE] =
-    {48427,17148,23627,62548,4431};
-const INT_TYPE2 S_test_rank_array[TEST_ARRAY_SIZE] =
-    {0,18,346,64917,65463};
-
-/* Class W */
-const INT_TYPE2 W_test_index_array[TEST_ARRAY_SIZE] =
-    {357773,934767,875723,898999,404505};
-const INT_TYPE2 W_test_rank_array[TEST_ARRAY_SIZE] =
-    {1249,11698,1039987,1043896,1048018};
-
-/* Class A */
-const INT_TYPE2 A_test_index_array[TEST_ARRAY_SIZE] =
-    {2112377,662041,5336171,3642833,4250760};
-const INT_TYPE2 A_test_rank_array[TEST_ARRAY_SIZE] =
-    {104,17523,123928,8288932,8388264};
-
-/* Class B */
-const INT_TYPE2 B_test_index_array[TEST_ARRAY_SIZE] =
-    {41869,812306,5102857,18232239,26860214};
-const INT_TYPE2 B_test_rank_array[TEST_ARRAY_SIZE] =
-    {33422937,10244,59149,33135281,99};
-
-/* Class C */
-const INT_TYPE2 C_test_index_array[TEST_ARRAY_SIZE] =
-    {44172927,72999161,74326391,129606274,21736814};
-const INT_TYPE2 C_test_rank_array[TEST_ARRAY_SIZE] =
-    {61147,882988,266290,133997595,133525895};
-
-/* Class D */
-const INT_TYPE2 D_test_index_array[TEST_ARRAY_SIZE] =
-    {1317351170,995930646,1157283250,1503301535,1453734525};
-const INT_TYPE2 D_test_rank_array[TEST_ARRAY_SIZE] =
-    {1,36538729,1978098519,2145192618,2147425337};
-
-
-
-/***********************/
-/* function prototypes */
-/***********************/
-/* Portable linear congruential generator: updates *X in place and returns
-   the new seed scaled by 2^-46. */
-double randlc( double *X, double *A );
-
-/* Places the local keys in sorted order and checks the ordering; bumps
-   gd->passed_verification when no key is out of order. */
-void full_verify( global_data* gd );
-
-/* Result report printer; not defined in the visible part of this file
-   (presumably provided by c_print_results.c). */
-void c_print_results( char *name,
- char class,
- int n1,
- int n2,
- int n3,
- int niter,
- int nprocs_compiled,
- int nprocs_total,
- double t,
- double mops,
- char *optype,
- int passed_verification,
- char *npbversion,
- char *compiletime,
- char *mpicc,
- char *clink,
- char *cmpi_lib,
- char *cmpi_inc,
- char *cflags,
- char *clinkflags );
-
-/* Timers: n indexes gd->start[] and gd->elapsed[]. */
-void timer_clear(global_data* gd, int n );
-void timer_start(global_data* gd, int n );
-void timer_stop(global_data* gd, int n );
-double timer_read(global_data* gd, int n );
-
-/* Reset the accumulated duration of timer n to zero. */
-void timer_clear(global_data* gd, int n ) {
-    double *slot = &gd->elapsed[n];
-    *slot = 0.0;
-}
-
-/* Record the current MPI_Wtime() reading as the start of timer n. */
-void timer_start(global_data* gd, int n ) {
-    const double now = MPI_Wtime();
-    gd->start[n] = now;
-}
-
-/* Add the time elapsed since timer n was started to its accumulated total. */
-void timer_stop(global_data* gd, int n ) {
-    const double delta = MPI_Wtime() - gd->start[n];
-    gd->elapsed[n] = gd->elapsed[n] + delta;
-}
-
-/* Return the accumulated duration of timer n. */
-double timer_read(global_data* gd, int n ) {
-    const double *slot = gd->elapsed + n;
-    return *slot;
-}
-
-
-/*
- * FUNCTION RANDLC (X, A)
- *
- * This routine returns a uniform pseudorandom double precision number in the
- * range (0, 1) by using the linear congruential generator
- *
- * x_{k+1} = a x_k (mod 2^46)
- *
- * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
- * before repeating. The argument A is the same as 'a' in the above formula,
- * and X is the same as x_0. A and X must be odd double precision integers
- * in the range (1, 2^46). The returned value RANDLC is normalized to be
- * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
- * the new seed x_1, so that subsequent calls to RANDLC using the same
- * arguments will generate a continuous sequence.
- *
- * This routine should produce the same results on any computer with at least
- * 48 mantissa bits in double precision floating point data. On Cray systems,
- * double precision should be disabled.
- *
- * David H. Bailey October 26, 1990
- *
- * IMPLICIT DOUBLE PRECISION (A-H, O-Z)
- * SAVE KS, R23, R46, T23, T46
- * DATA KS/0/
- *
- * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
- * T23 = 2 ^ 23, and T46 = 2 ^ 46. These are computed in loops, rather than
- * by merely using the ** operator, in order to insure that the results are
- * exact on all systems. This code assumes that 0.5D0 is represented exactly.
- */
-
-
-/*****************************************************************/
-/************* R A N D L C ************/
-/************* ************/
-/************* portable random number generator ************/
-/*****************************************************************/
-
-/*
- * Advance the linear congruential generator x_{k+1} = a x_k (mod 2^46).
- *
- * X: in/out seed; replaced by the next value of the sequence.
- * A: multiplier; only read.
- *
- * Returns the new seed scaled by 2^-46.
- *
- * The 46-bit product is assembled from 23-bit halves so that every
- * intermediate value stays exactly representable in a double. The four
- * power-of-two constants are computed once, on the first call, and cached in
- * static storage.
- */
-double randlc( double *X, double *A )
-{
-    static int initialized = 0;
-    static double R23, R46, T23, T46;
-    double a_hi, a_lo, x_hi, x_lo;
-    double cross, z, low46;
-    int step, whole;
-
-    if (!initialized)
-    {
-        /* 2^-23 and 2^23 by repeated halving/doubling (exact). */
-        R23 = 1.0;
-        T23 = 1.0;
-        for (step = 0; step < 23; step++)
-        {
-            R23 *= 0.50;
-            T23 *= 2.0;
-        }
-        /* Squaring a power of two is exact as well: 2^-46 and 2^46. */
-        R46 = R23 * R23;
-        T46 = T23 * T23;
-        initialized = 1;
-    }
-
-    /* Split A so that A = 2^23 * a_hi + a_lo. */
-    whole = R23 * *A;
-    a_hi = whole;
-    a_lo = *A - T23 * a_hi;
-
-    /* Split X so that X = 2^23 * x_hi + x_lo. */
-    whole = R23 * *X;
-    x_hi = whole;
-    x_lo = *X - T23 * x_hi;
-
-    /* z = a_hi * x_lo + a_lo * x_hi (mod 2^23). */
-    cross = a_hi * x_lo + a_lo * x_hi;
-    whole = R23 * cross;
-    z = cross - T23 * (double) whole;
-
-    /* X = 2^23 * z + a_lo * x_lo (mod 2^46). */
-    low46 = T23 * z + a_lo * x_lo;
-    whole = R46 * low46;
-    *X = low46 - T46 * (double) whole;
-
-    return R46 * *X;
-}
-
-
-
-/*****************************************************************/
-/************ F I N D _ M Y _ S E E D ************/
-/************ ************/
-/************ returns parallel random number seq seed ************/
-/*****************************************************************/
-
-/*
- * Create a random number sequence of total length nn residing
- * on np number of processors. Each processor will therefore have a
- * subsequence of length nn/np. This routine returns that random
- * number which is the first random number for the subsequence belonging
- * to processor rank kn, and which is used as seed for proc kn ran # gen.
- */
-
-/*
- * Compute the seed from which processor kn starts its share of a random
- * sequence of nn numbers spread over np processors.
- *
- * kn: processor rank.
- * np: number of processors.
- * nn: total count of random numbers, all processors together.
- * s:  seed of the whole sequence, e.g. 314159265.00.
- * a:  generator multiplier, e.g. 1220703125.00.
- *
- * Returns the seed for rank kn. First the multiplier is squared once per
- * halving of nn/np; then the seed is advanced by binary exponentiation on
- * kn, at most 100 rounds.
- */
-double find_my_seed( int kn,
-                     int np,
-                     long nn,
-                     double s,
-                     double a )
-{
-    double seed, jump;
-    long squarings = 0;
-    long remaining, half, round;
-
-    /* Count how many times nn/np can be halved before reaching 1. */
-    for (remaining = nn / np; remaining > 1; remaining /= 2)
-        squarings++;
-
-    /* Square the multiplier that many times. */
-    jump = a;
-    for (round = 0; round < squarings; round++)
-        randlc( &jump, &jump );
-
-    /* Advance the seed by walking the bits of kn, lowest bit first. */
-    seed = s;
-    remaining = kn;
-    for (round = 0; round < 100; round++)
-    {
-        half = remaining / 2;
-        if (remaining != 2 * half)
-            randlc( &seed, &jump );
-        if (half == 0)
-            break;
-        randlc( &jump, &jump );
-        remaining = half;
-    }
-
-    return seed;
-}
-
-
-
-
-/*****************************************************************/
-/************* C R E A T E _ S E Q ************/
-/*****************************************************************/
-
-/*
- * Fill gd->key_array[0..NUM_KEYS-1] with pseudorandom keys. Each key is the
- * sum of four consecutive randlc() draws scaled by MAX_KEY/4.
- *
- * gd:   run state whose key_array is written.
- * seed: starting seed of the generator (advanced locally only).
- * a:    generator multiplier.
- */
-void create_seq( global_data* gd, double seed, double a )
-{
-    const int scale = MAX_KEY/4;
-    int slot, draw;
-    double sum;
-
-    for (slot = 0; slot < NUM_KEYS; slot++)
-    {
-        sum = randlc(&seed, &a);
-        for (draw = 1; draw < 4; draw++)
-            sum += randlc(&seed, &a);
-
-        gd->key_array[slot] = scale*sum;
-    }
-}
-
-
-
-
-/*****************************************************************/
-/************* F U L L _ V E R I F Y ************/
-/*****************************************************************/
-
-
-/*
- * Put the local keys in their final sorted positions and verify the result.
- *
- * Every key of key_buff2 is stored into key_array at the position given by
- * the pre-decremented entry of key_buff_ptr_global for that key, made local
- * by subtracting total_lesser_keys. The largest local key is then passed to
- * the next rank so each rank can compare its left neighbour's maximum with
- * its own first key. If no key is out of order, gd->passed_verification is
- * incremented; otherwise the count of misplaced keys is printed.
- */
-void full_verify( global_data* gd )
-{
-    MPI_Status status;
-    MPI_Request request;
-
-    INT_TYPE pos, misplaced;
-    INT_TYPE left_max, last_local_key;
-    const int has_left = gd->my_rank > 0;
-    const int has_right = gd->my_rank < gd->comm_size-1;
-
-    /* Now, finally, sort the keys */
-    for (pos = 0; pos < gd->total_local_keys; pos++)
-    {
-        INT_TYPE key = gd->key_buff2[pos];
-        INT_TYPE slot = --gd->key_buff_ptr_global[key] - gd->total_lesser_keys;
-        gd->key_array[slot] = key;
-    }
-
-    if (gd->total_local_keys < 1)
-        last_local_key = 0;
-    else
-        last_local_key = gd->total_local_keys - 1;
-
-    /* Send largest key value to next processor: post the receive first,
-       then send, then wait for the receive to complete. */
-    if (has_left)
-        MPI_Irecv( &left_max, 1, MP_KEY_TYPE, gd->my_rank-1, 1000,
-                   MPI_COMM_WORLD, &request );
-    if (has_right)
-        MPI_Send( &gd->key_array[last_local_key], 1, MP_KEY_TYPE,
-                  gd->my_rank+1, 1000, MPI_COMM_WORLD );
-    if (has_left)
-        MPI_Wait( &request, &status );
-
-    /* Confirm that neighbor's greatest key value is not greater than my
-       least key value */
-    misplaced = 0;
-    if (has_left && gd->total_local_keys > 0 && left_max > gd->key_array[0])
-        misplaced++;
-
-    /* Confirm keys correctly sorted: count incorrectly sorted keys, if any */
-    for (pos = 1; pos < gd->total_local_keys; pos++)
-    {
-        if (gd->key_array[pos-1] > gd->key_array[pos])
-            misplaced++;
-    }
-
-    if (misplaced == 0)
-    {
-        gd->passed_verification++;
-    }
-    else
-    {
-        printf( "Processor %d: Full_verify: number of keys out of sort: %d\n",
-                gd->my_rank, misplaced );
-    }
-}
-
-
-
-
-/*****************************************************************/
-/************* R A N K ****************/
-/*****************************************************************/
-
-
-void rank( global_data* gd, int iteration )
-{
-
- INT_TYPE i, k;
-
- INT_TYPE shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
- INT_TYPE key;
- INT_TYPE2 bucket_sum_accumulator, j, m;
- INT_TYPE local_bucket_sum_accumulator;
- INT_TYPE min_key_val, max_key_val;
- INT_TYPE *key_buff_ptr;
-
-
-
-
-/* Iteration alteration of keys */
- if(gd->my_rank == 0 )
- {
- gd->key_array[iteration] = iteration;
- gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
- }
-
-
-/* Initialize */
- for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )
- {
- gd->bucket_size[i] = 0;
- gd->bucket_size_totals[i] = 0;
- gd->process_bucket_distrib_ptr1[i] = 0;
- gd->process_bucket_distrib_ptr2[i] = 0;
- }
-
-
-/* Determine where the partial verify test keys are, load into */
-/* top of array bucket_size */
- for( i=0; i<TEST_ARRAY_SIZE; i++ )
- if( (gd->test_index_array[i]/NUM_KEYS) == gd->my_rank )
- gd->bucket_size[NUM_BUCKETS+i] =
- gd->key_array[gd->test_index_array[i] % NUM_KEYS];
-
-
-/* Determine the number of keys in each bucket */
- for( i=0; i<NUM_KEYS; i++ )
- gd->bucket_size[gd->key_array[i] >> shift]++;
-
-
-/* Accumulative bucket sizes are the bucket pointers */
- gd->bucket_ptrs[0] = 0;
- for( i=1; i< NUM_BUCKETS; i++ )
- gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];
-
-
-/* Sort into appropriate bucket */
- for( i=0; i<NUM_KEYS; i++ )
- {
- key = gd->key_array[i];
- gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
- }
-
-#ifdef TIMING_ENABLED
- timer_stop(gd, 2 );
- timer_start(gd, 3 );
-#endif
-
-/* Get the bucket size totals for the entire problem. These
- will be used to determine the redistribution of keys */
- MPI_Allreduce( gd->bucket_size,
- gd->bucket_size_totals,
- NUM_BUCKETS+TEST_ARRAY_SIZE,
- MP_KEY_TYPE,
- MPI_SUM,
- MPI_COMM_WORLD );
-
-#ifdef TIMING_ENABLED
- timer_stop(gd, 3 );
- timer_start(gd, 2 );
-#endif
-
-/* Determine Redistibution of keys: accumulate the bucket size totals
- till this number surpasses NUM_KEYS (which the average number of keys
- per processor). Then all keys in these buckets go to processor 0.
- Continue accumulating again until supassing 2*NUM_KEYS. All keys
- in these buckets go to processor 1, etc. This algorithm guarantees
- that all processors have work ranking; no processors are left idle.
- The optimum number of buckets, however, does not result in as high
- a degree of load balancing (as even a distribution of keys as is
- possible) as is obtained from increasing the number of buckets, but
- more buckets results in more computation per processor so that the
- optimum number of buckets turns out to be 1024 for machines tested.
- Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket
- number of first and last bucket which each processor will have after
- the redistribution is done. */
-
- bucket_sum_accumulator = 0;
- local_bucket_sum_accumulator = 0;
- gd->send_displ[0] = 0;
- gd->process_bucket_distrib_ptr1[0] = 0;
- for( i=0, j=0; i<NUM_BUCKETS; i++ )
- {
- bucket_sum_accumulator += gd->bucket_size_totals[i];
- local_bucket_sum_accumulator += gd->bucket_size[i];
- if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )
- {
- gd->send_count[j] = local_bucket_sum_accumulator;
- if( j != 0 )
- {
- gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
- gd->process_bucket_distrib_ptr1[j] =
- gd->process_bucket_distrib_ptr2[j-1]+1;
- }
- gd->process_bucket_distrib_ptr2[j++] = i;
- local_bucket_sum_accumulator = 0;
- }
- }
-
-/* When NUM_PROCS approaching NUM_BUCKETS, it is highly possible
- that the last few processors don't get any buckets. So, we
- need to set counts properly in this case to avoid any fallouts. */
- while( j < gd->comm_size )
- {
- gd->send_count[j] = 0;
- gd->process_bucket_distrib_ptr1[j] = 1;
- j++;
- }
-
-#ifdef TIMING_ENABLED
- timer_stop(gd, 2 );
- timer_start(gd, 3 );
-#endif
-
-/* This is the redistribution section: first find out how many keys
- each processor will send to every other processor: */
- MPI_Alltoall( gd->send_count,
- 1,
- MPI_INT,
- gd->recv_count,
- 1,
- MPI_INT,
- MPI_COMM_WORLD );
-
-/* Determine the receive array displacements for the buckets */
- gd->recv_displ[0] = 0;
- for( i=1; i<gd->comm_size; i++ )
- gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];
-
-
-/* Now send the keys to respective processors */
- MPI_Alltoallv( gd->key_buff1,
- gd->send_count,
- gd->send_displ,
- MP_KEY_TYPE,
- gd->key_buff2,
- gd->recv_count,
- gd->recv_displ,
- MP_KEY_TYPE,
- MPI_COMM_WORLD );
-
-#ifdef TIMING_ENABLED
- timer_stop(gd, 3 );
- timer_start(gd, 2 );
-#endif
-
-/* The starting and ending bucket numbers on each processor are
- multiplied by the interval size of the buckets to obtain the
- smallest possible min and greatest possible max value of any
- key on each processor */
- min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
- max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;
-
-/* Clear the work array */
- for( i=0; i<max_key_val-min_key_val+1; i++ )
- gd->key_buff1[i] = 0;
-
-/* Determine the total number of keys on all other
- processors holding keys of lesser value */
- m = 0;
- for( k=0; k<gd->my_rank; k++ )
- for( i= gd->process_bucket_distrib_ptr1[k];
- i<=gd->process_bucket_distrib_ptr2[k];
- i++ )
- m += gd->bucket_size_totals[i]; /* m has total # of lesser keys */
-
-/* Determine total number of keys on this processor */
- j = 0;
- for( i= gd->process_bucket_distrib_ptr1[gd->my_rank];
- i<=gd->process_bucket_distrib_ptr2[gd->my_rank];
- i++ )
- j += gd->bucket_size_totals[i]; /* j has total # of local keys */
-
-
-/* Ranking of all keys occurs in this section: */
-/* shift it backwards so no subtractions are necessary in loop */
- key_buff_ptr = gd->key_buff1 - min_key_val;
-
-/* In this section, the keys themselves are used as their
- own indexes to determine how many of each there are: their
- individual population */
- for( i=0; i<j; i++ )
- key_buff_ptr[gd->key_buff2[i]]++; /* Now they have individual key */
- /* population */
-
-/* To obtain ranks of each key, successively add the individual key
- population, not forgetting the total of lesser keys, m.
- NOTE: Since the total of lesser keys would be subtracted later
- in verification, it is no longer added to the first key population
- here, but still needed during the partial verify test. This is to
- ensure that 32-bit key_buff can still be used for class D. */
-/* key_buff_ptr[min_key_val] += m; */
- for( i=min_key_val; i<max_key_val; i++ )
- key_buff_ptr[i+1] += key_buff_ptr[i];
-
-
-/* This is the partial verify test section */
-/* Observe that test_rank_array vals are */
-/* shifted differently for different cases */
- for( i=0; i<TEST_ARRAY_SIZE; i++ )
- {
- k = gd->bucket_size_totals[i+NUM_BUCKETS]; /* Keys were hidden here */
- if( min_key_val <= k && k <= max_key_val )
- {
- /* Add the total of lesser keys, m, here */
- INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
- int failed = 0;
-
- switch( CLASS )
- {
- case 'S':
- if( i <= 2 )
- {
- if( key_rank != gd->test_rank_array[i]+iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- else
- {
- if( key_rank != gd->test_rank_array[i]-iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- break;
- case 'W':
- if( i < 2 )
- {
- if( key_rank != gd->test_rank_array[i]+(iteration-2) )
- failed = 1;
- else
- gd->passed_verification++;
- }
- else
- {
- if( key_rank != gd->test_rank_array[i]-iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- break;
- case 'A':
- if( i <= 2 )
- {
- if( key_rank != gd->test_rank_array[i]+(iteration-1) )
- failed = 1;
- else
- gd->passed_verification++;
- }
- else
- {
- if( key_rank != gd->test_rank_array[i]-(iteration-1) )
- failed = 1;
- else
- gd->passed_verification++;
- }
- break;
- case 'B':
- if( i == 1 || i == 2 || i == 4 )
- {
- if( key_rank != gd->test_rank_array[i]+iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- else
- {
- if( key_rank != gd->test_rank_array[i]-iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- break;
- case 'C':
- if( i <= 2 )
- {
- if( key_rank != gd->test_rank_array[i]+iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- else
- {
- if( key_rank != gd->test_rank_array[i]-iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- break;
- case 'D':
- if( i < 2 )
- {
- if( key_rank != gd->test_rank_array[i]+iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- else
- {
- if( key_rank != gd->test_rank_array[i]-iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- break;
- }
- if( failed == 1 )
- printf( "Failed partial verification: "
- "iteration %d, processor %d, test key %d\n",
- iteration, gd->my_rank, (int)i );
- }
- }
-
-
-
-
-/* Make copies of rank info for use by full_verify: these variables
- in rank are local; making them global slows down the code, probably
- since they cannot be made register by compiler */
-
- if( iteration == MAX_ITERATIONS )
- {
- gd->key_buff_ptr_global = key_buff_ptr;
- gd->total_local_keys = j;
- gd->total_lesser_keys = 0; /* no longer set to 'm', see note above */
- }
-
-}
-
-
-/*****************************************************************/
-/************* M A I N ****************/
-/*****************************************************************/
-
-int main( int argc, char **argv )
-{
-
- int i, iteration, itemp;
-
- double timecounter, maxtime;
-
- global_data* gd = malloc(sizeof(global_data));
-/* Initialize MPI */
- MPI_Init( &argc, &argv );
- MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
- MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );
-
-/* Initialize the verification arrays if a valid class */
- for( i=0; i<TEST_ARRAY_SIZE; i++ )
- switch( CLASS )
- {
- case 'S':
- gd->test_index_array[i] = S_test_index_array[i];
- gd->test_rank_array[i] = S_test_rank_array[i];
- break;
- case 'A':
- gd->test_index_array[i] = A_test_index_array[i];
- gd->test_rank_array[i] = A_test_rank_array[i];
- break;
- case 'W':
- gd->test_index_array[i] = W_test_index_array[i];
- gd->test_rank_array[i] = W_test_rank_array[i];
- break;
- case 'B':
- gd->test_index_array[i] = B_test_index_array[i];
- gd->test_rank_array[i] = B_test_rank_array[i];
- break;
- case 'C':
- gd->test_index_array[i] = C_test_index_array[i];
- gd->test_rank_array[i] = C_test_rank_array[i];
- break;
- case 'D':
- gd->test_index_array[i] = D_test_index_array[i];
- gd->test_rank_array[i] = D_test_rank_array[i];
- break;
- };
-
-
-
-/* Printout initial NPB info */
- if( gd->my_rank == 0 )
- {
- printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
- printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS );
- printf( " Iterations: %d\n", MAX_ITERATIONS );
- printf( " Number of processes: %d\n",gd->comm_size );
- }
-
-/* Check that actual and compiled number of processors agree */
- if( gd->comm_size != NUM_PROCS )
- {
- if( gd->my_rank == 0 )
- printf( "\n ERROR: compiled for %d processes\n"
- " Number of active processes: %d\n"
- " Exiting program!\n\n", NUM_PROCS, gd->comm_size );
- MPI_Finalize();
- exit( 1 );
- }
-
-/* Check to see whether total number of processes is within bounds.
- This could in principle be checked in setparams.c, but it is more
- convenient to do it here */
- if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS)
- {
- if( gd->my_rank == 0 )
- printf( "\n ERROR: number of processes %d not within range %d-%d"
- "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS);
- MPI_Finalize();
- exit( 1 );
- }
-
-
-/* Generate random number sequence and subsequent keys on all procs */
- create_seq(gd, find_my_seed( gd->my_rank,
- gd->comm_size,
- 4*(long)TOTAL_KEYS*MIN_PROCS,
- 314159265.00, /* Random number gen seed */
- 1220703125.00 ), /* Random number gen mult */
- 1220703125.00 ); /* Random number gen mult */
-
-/* Do one interation for free (i.e., untimed) to guarantee initialization of
- all data and code pages and respective tables */
- rank(gd, 1 );
-
-/* Start verification counter */
- gd->passed_verification = 0;
-
- if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n iteration\n" );
-
-/* Initialize timer */
- timer_clear(gd, 0 );
-
-/* Initialize separate communication, computation timing */
-#ifdef TIMING_ENABLED
- for( i=1; i<=3; i++ ) timer_clear(gd, i );
-#endif
-
-/* Start timer */
- timer_start(gd, 0 );
-
-#ifdef TIMING_ENABLED
- timer_start(gd, 1 );
- timer_start(gd, 2 );
-#endif
-
- char smpi_category[100];
- snprintf (smpi_category, 100, "%d", gd->my_rank);
- TRACE_smpi_set_category (smpi_category);
-
-/* This is the main iteration */
- for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
- {
- if( gd->my_rank == 0 && CLASS != 'S' ) printf( " %d\n", iteration );
- rank(gd, iteration );
- }
- TRACE_smpi_set_category (NULL);
-
-#ifdef TIMING_ENABLED
- timer_stop(gd, 2 );
- timer_stop(gd, 1 );
-#endif
-
-/* Stop timer, obtain time for processors */
- timer_stop(gd, 0 );
-
- timecounter = timer_read(gd, 0 );
-
-/* End of timing, obtain maximum time of all processors */
- MPI_Reduce( &timecounter,
- &maxtime,
- 1,
- MPI_DOUBLE,
- MPI_MAX,
- 0,
- MPI_COMM_WORLD );
-
-#ifdef TIMING_ENABLED
- {
- double tmin, tsum, tmax;
-
- if( my_rank == 0 )
- {
- printf( "\ntimer 1/2/3 = total/computation/communication time\n");
- printf( " min avg max\n" );
- }
- for( i=1; i<=3; i++ )
- {
- timecounter = timer_read(gd, i );
- MPI_Reduce( &timecounter,
- &tmin,
- 1,
- MPI_DOUBLE,
- MPI_MIN,
- 0,
- MPI_COMM_WORLD );
- MPI_Reduce( &timecounter,
- &tsum,
- 1,
- MPI_DOUBLE,
- MPI_SUM,
- 0,
- MPI_COMM_WORLD );
- MPI_Reduce( &timecounter,
- &tmax,
- 1,
- MPI_DOUBLE,
- MPI_MAX,
- 0,
- MPI_COMM_WORLD );
- if( my_rank == 0 )
- printf( "timer %d: %f %f %f\n",
- i, tmin, tsum/((double) comm_size), tmax );
- }
- if( my_rank == 0 )
- printf( "\n" );
- }
-#endif
-
-/* This tests that keys are in sequence: sorting of last ranked key seq
- occurs here, but is an untimed operation */
- full_verify(gd);
-
-
-/* Obtain verification counter sum */
- itemp =gd->passed_verification;
- MPI_Reduce( &itemp,
- &gd->passed_verification,
- 1,
- MPI_INT,
- MPI_SUM,
- 0,
- MPI_COMM_WORLD );
-
-
-
-/* The final printout */
- if( gd->my_rank == 0 )
- {
- if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
- gd->passed_verification = 0;
- c_print_results( "IS",
- CLASS,
- (int)(TOTAL_KEYS),
- MIN_PROCS,
- 0,
- MAX_ITERATIONS,
- NUM_PROCS,
- gd->comm_size,
- maxtime,
- ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS)
- /maxtime/1000000.,
- "keys ranked",
- gd->passed_verification,
- NPBVERSION,
- COMPILETIME,
- MPICC,
- CLINK,
- CMPI_LIB,
- CMPI_INC,
- CFLAGS,
- CLINKFLAGS );
- }
-
- MPI_Finalize();
- free(gd);
-
- return 0;
- /**************************/
-} /* E N D P R O G R A M */
- /**************************/
+++ /dev/null
-SHELL=/bin/sh
-CLASS=S
-NPROCS=1
-default:
- @ sys/print_instructions
-
-IS: is
-is:
- cd IS; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
-
-EP: ep
-ep:
- cd EP; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
-
-DT: dt
-dt:
- cd DT; $(MAKE) CLASS=$(CLASS)
-
-clean:
- - rm -f *~ */*~ */*.o */npbparams.h
- - rm -f sys/setparams sys/setparams.h
-
-veryclean: clean
- - rm -f bin/*
\ No newline at end of file
+++ /dev/null
-#include <stdlib.h>
-#include <stdio.h>
-
-void c_print_results(char *name, char class, int n1, int n2, int n3, int niter, int nprocs_compiled, int nprocs_total,
- double t, double mops, char *optype, int passed_verification, char *npbversion, char *compiletime,
- char *mpicc, char *clink, char *cmpi_lib, char *cmpi_inc, char *cflags, char *clinkflags)
-{
- printf( "\n\n %s Benchmark Completed\n", name );
- printf( " Class = %c\n", class );
-
- if( n3 == 0 ) {
- long nn = n1;
- if ( n2 != 0 ) nn *= n2;
- printf( " Size = %12ld\n", nn ); /* as in IS */
- }
- else
- printf( " Size = %3dx %3dx %3d\n", n1,n2,n3 );
-
- printf( " Iterations = %12d\n", niter );
- printf( " Time in seconds = %12.2f\n", t );
- printf( " Total processes = %12d\n", nprocs_total );
-
- if ( nprocs_compiled != 0 )
- printf( " Compiled procs = %12d\n", nprocs_compiled );
-
- printf( " Mop/s total = %12.2f\n", mops );
- printf( " Mop/s/process = %12.2f\n", mops/((float) nprocs_total) );
- printf( " Operation type = %24s\n", optype);
-
- if( passed_verification )
- printf( " Verification = SUCCESSFUL\n" );
- else
- printf( " Verification = UNSUCCESSFUL\n" );
-
- printf( " Version = %12s\n", npbversion );
- printf( " Compile date = %12s\n", compiletime );
- printf( "\n Compile options:\n" );
- printf( " MPICC = %s\n", mpicc );
- printf( " CLINK = %s\n", clink );
- printf( " CMPI_LIB = %s\n", cmpi_lib );
- printf( " CMPI_INC = %s\n", cmpi_inc );
- printf( " CFLAGS = %s\n", cflags );
- printf( " CLINKFLAGS = %s\n", clinkflags );
- printf( "\n\n" );
- printf( " Please send the results of this run to:\n\n" );
- printf( " NPB Development Team\n" );
- printf( " Internet: npb@nas.nasa.gov\n \n" );
- printf( " If email is not available, send this to:\n\n" );
- printf( " MS T27A-1\n" );
- printf( " NASA Ames Research Center\n" );
- printf( " Moffett Field, CA 94035-1000\n\n" );
- printf( " Fax: 650-604-3957\n\n" );
-}
+++ /dev/null
-#include "mpi.h"
-
-double start[64], elapsed[64];
-
-void timer_clear( int n )
-{
- elapsed[n] = 0.0;
-}
-
-void timer_start( int n )
-{
- start[n] = MPI_Wtime();
-}
-
-void timer_stop( int n )
-{
- double t, now;
- now = MPI_Wtime();
- t = now - start[n];
- elapsed[n] += t;
-}
-
-double timer_read( int n )
-{
- return( elapsed[n] );
-}
-
+++ /dev/null
-/*
- * FUNCTION RANDLC (X, A)
- *
- * This routine returns a uniform pseudorandom double precision number in the
- * range (0, 1) by using the linear congruential generator
- *
- * x_{k+1} = a x_k (mod 2^46)
- *
- * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
- * before repeating. The argument A is the same as 'a' in the above formula,
- * and X is the same as x_0. A and X must be odd double precision integers
- * in the range (1, 2^46). The returned value RANDLC is normalized to be
- * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
- * the new seed x_1, so that subsequent calls to RANDLC using the same
- * arguments will generate a continuous sequence.
- *
- * This routine should produce the same results on any computer with at least
- * 48 mantissa bits in double precision floating point data. On Cray systems,
- * double precision should be disabled.
- *
- * David H. Bailey October 26, 1990
- *
- * IMPLICIT DOUBLE PRECISION (A-H, O-Z)
- * SAVE KS, R23, R46, T23, T46
- * DATA KS/0/
- *
- * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
- * T23 = 2 ^ 23, and T46 = 2 ^ 46. These are computed in loops, rather than
- * by merely using the ** operator, in order to insure that the results are
- * exact on all systems. This code assumes that 0.5D0 is represented exactly.
- */
-double randlc(double *X, double*A)
-{
- static int KS=0;
- static double R23, R46, T23, T46;
- double T1, T2, T3, T4;
- double A1, A2;
- double X1, X2;
- double Z;
- int i, j;
-
- if (KS == 0) {
- R23 = 1.0;
- R46 = 1.0;
- T23 = 1.0;
- T46 = 1.0;
-
- for (i=1; i<=23; i++) {
- R23 = 0.50 * R23;
- T23 = 2.0 * T23;
- }
- for (i=1; i<=46; i++) {
- R46 = 0.50 * R46;
- T46 = 2.0 * T46;
- }
- KS = 1;
- }
-
-/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */
- T1 = R23 * *A;
- j = T1;
- A1 = j;
- A2 = *A - T23 * A1;
-
-/* Break X into two parts such that X = 2^23 * X1 + X2, compute
- Z = A1 * X2 + A2 * X1 (mod 2^23), and then X = 2^23 * Z + A2 * X2 (mod 2^46). */
- T1 = R23 * *X;
- j = T1;
- X1 = j;
- X2 = *X - T23 * X1;
- T1 = A1 * X2 + A2 * X1;
-
- j = R23 * T1;
- T2 = j;
- Z = T1 - T23 * T2;
- T3 = T23 * Z + A2 * X2;
- j = R46 * T3;
- T4 = j;
- *X = T3 - T46 * T4;
- return(R46 * *X);
-}
+++ /dev/null
-#---------------------------------------------------------------------------
-#
-# SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS.
-#
-#---------------------------------------------------------------------------
-
-#---------------------------------------------------------------------------
-# Items in this file will need to be changed for each platform.
-#---------------------------------------------------------------------------
-
-#---------------------------------------------------------------------------
-# Parallel C:
-#
-# MPICC - C compiler
-# CFLAGS - C compilation arguments
-# CMPI_INC - any -I arguments required for compiling MPI/C
-# CLINK - C linker
-# CLINKFLAGS - C linker flags
-# CMPI_LIB - any -L and -l arguments required for linking MPI/C
-#
-# compilations are done with $(MPICC) $(CMPI_INC) $(CFLAGS) or
-# $(MPICC) $(CFLAGS)
-# linking is done with $(CLINK) $(CMPI_LIB) $(CLINKFLAGS)
-#---------------------------------------------------------------------------
-
-#---------------------------------------------------------------------------
-# This is the C compiler used for MPI programs
-#---------------------------------------------------------------------------
-MPICC = smpicc
-# This links MPI C programs; usually the same as ${MPICC}
-CLINK = $(MPICC)
-
-#---------------------------------------------------------------------------
-# These macros are passed to the linker to help link with MPI correctly
-#---------------------------------------------------------------------------
-CMPI_LIB =
-
-#---------------------------------------------------------------------------
-# These macros are passed to the compiler to help find 'mpi.h'
-#---------------------------------------------------------------------------
-CMPI_INC =
-
-#---------------------------------------------------------------------------
-# Global *compile time* flags for C programs
-#---------------------------------------------------------------------------
-CFLAGS = -O2
-
-#---------------------------------------------------------------------------
-# Global *link time* flags. Flags for increasing maximum executable
-# size usually go here.
-#---------------------------------------------------------------------------
-CLINKFLAGS = -O2
-
-#---------------------------------------------------------------------------
-# Utilities C:
-#
-# This is the C compiler used to compile C utilities. Flags required by
-# this compiler go here also; typically there are few flags required; hence
-# there are no separate macros provided for such flags.
-#---------------------------------------------------------------------------
-CC = gcc -g
-
-#---------------------------------------------------------------------------
-# Destination of executables, relative to subdirs of the main directory. .
-#---------------------------------------------------------------------------
-BINDIR = ../bin
-
-#---------------------------------------------------------------------------
-# The variable RAND controls which random number generator
-# is used. It is described in detail in README.install.
-# Use "randi8" unless there is a reason to use another one.
-#---------------------------------------------------------------------------
-RAND = randi8
#include <stdio.h>
#include <string.h>
-#include "mpi.h"
-#include "npbparams.h"
-
+#include "smpi/mpi.h"
+#include "nas_common.h"
#include "simgrid/instr.h" //TRACE_
-#ifndef CLASS
-#define CLASS 'S'
-#define NUM_PROCS 1
-#endif
-
-//int passed_verification;
-extern double randlc( double *X, double *A );
-extern
-void c_print_results( char *name,
- char class,
- int n1,
- int n2,
- int n3,
- int niter,
- int nprocs_compiled,
- int nprocs_total,
- double t,
- double mops,
- char *optype,
- int passed_verification,
- char *npbversion,
- char *compiletime,
- char *mpicc,
- char *clink,
- char *cmpi_lib,
- char *cmpi_inc,
- char *cflags,
- char *clinkflags );
-
-void timer_clear( int n );
-void timer_start( int n );
-void timer_stop( int n );
-double timer_read( int n );
+
int timer_on=0,timers_tot=64;
+double start[64], elapsed[64];
-int verify(char *bmname,double rnm2){
+char class;
+int nprocs;
+int num_samples;
+int deviation;
+int num_sources;
+
+static int verify(char *bmname,double rnm2){
double verify_value=0.0;
double epsilon=1.0E-8;
- char cls=CLASS;
int verified=-1;
- if (cls != 'U') {
- if(cls=='S') {
+ if (class != 'U') {
+ if(class=='S') {
if(strstr(bmname,"BH")){
verify_value=30892725.0;
}else if(strstr(bmname,"WH")){
fprintf(stderr,"No such benchmark as %s.\n",bmname);
}
verified = 0;
- }else if(cls=='W') {
+ }else if(class=='W') {
if(strstr(bmname,"BH")){
verify_value = 4102461.0;
}else if(strstr(bmname,"WH")){
- verify_value = 204280762.0;
+ verify_value = 204280762.0;
}else if(strstr(bmname,"SH")){
verify_value = 186944764.0;
}else{
fprintf(stderr,"No such benchmark as %s.\n",bmname);
}
verified = 0;
- }else if(cls=='A') {
+ }else if(class=='A') {
if(strstr(bmname,"BH")){
verify_value = 17809491.0;
}else if(strstr(bmname,"WH")){
fprintf(stderr,"No such benchmark as %s.\n",bmname);
}
verified = 0;
- }else if(cls=='B') {
+ }else if(class=='B') {
if(strstr(bmname,"BH")){
verify_value = 4317114.0;
}else if(strstr(bmname,"WH")){
fprintf(stderr,"No such benchmark as %s.\n",bmname);
verified = 0;
}
- }else if(cls=='C') {
+ }else if(class=='C') {
if(strstr(bmname,"BH")){
verify_value = 0.0;
}else if(strstr(bmname,"WH")){
fprintf(stderr,"No such benchmark as %s.\n",bmname);
verified = -1;
}
- }else if(cls=='D') {
+ }else if(class=='D') {
if(strstr(bmname,"BH")){
verify_value = 0.0;
}else if(strstr(bmname,"WH")){
}
verified = -1;
}else{
- fprintf(stderr,"No such class as %c.\n",cls);
+ fprintf(stderr,"No such class as %c.\n",class);
}
fprintf(stderr," %s L2 Norm = %f\n",bmname,rnm2);
if(verified==-1){
return verified;
}
-int ipowMod(int a,long long int n,int md){
+static int ipowMod(int a,long long int n,int md){
int seed=1,q=a,r=1;
if(n<0){
fprintf(stderr,"ipowMod: exponent must be nonnegative exp=%lld\n",n);
}
#include "DGraph.h"
-DGraph *buildSH(char cls){
+static DGraph *buildSH(const char cls){
/*
Nodes of the graph must be topologically sorted
to avoid MPI deadlock.
*/
DGraph *dg;
- int numSources=NUM_SOURCES; /* must be power of 2 */
+ int numSources=num_sources; /* must be power of 2 */
int numOfLayers=0,tmpS=numSources>>1;
int firstLayerNode=0;
DGArc *ar=NULL;
}
return dg;
}
-DGraph *buildWH(char cls){
-/*
- Nodes of the graph must be topologically sorted
- to avoid MPI deadlock.
-*/
+static DGraph *buildWH(const char cls){
+/* Nodes of the graph must be topologically sorted to avoid MPI deadlock. */
int i=0,j=0;
- int numSources=NUM_SOURCES,maxInDeg=4;
+ int numSources=num_sources,maxInDeg=4;
int numLayerNodes=numSources,firstLayerNode=0;
int totComparators=0;
int numPrevLayerNodes=numLayerNodes;
firstLayerNode+=numPrevLayerNodes;
numPrevLayerNodes=numLayerNodes;
}
- source=newNode("Source");
+ source=newNode((char*)"Source");
AttachNode(dg,source);
for(i=0;i<numPrevLayerNodes;i++){
nd=dg->node[firstLayerNode+i];
}
return dg;
}
-DGraph *buildBH(char cls){
-/*
- Nodes of the graph must be topologically sorted
- to avoid MPI deadlock.
-*/
+static DGraph *buildBH(const char cls){
+/* Nodes of the graph must be topologically sorted to avoid MPI deadlock.*/
int i=0,j=0;
- int numSources=NUM_SOURCES,maxInDeg=4;
+ int numSources=num_sources,maxInDeg=4;
int numLayerNodes=numSources,firstLayerNode=0;
DGraph *dg;
DGNode *nd=NULL, *snd=NULL, *sink=NULL;
firstLayerNode+=numPrevLayerNodes;
numPrevLayerNodes=numLayerNodes;
}
- sink=newNode("Sink");
+ sink=newNode((char*)"Sink");
AttachNode(dg,sink);
for(i=0;i<numPrevLayerNodes;i++){
nd=dg->node[firstLayerNode+i];
int len;
double* val;
} Arr;
-Arr *newArr(int len){
- Arr *arr=(Arr *)malloc(sizeof(Arr));
+
+static Arr *newArr(int len){
+ Arr *arr=(Arr *)malloc(sizeof(Arr)); //Arr *arr=(Arr *)SMPI_SHARED_MALLOC(sizeof(Arr));
arr->len=len;
- arr->val=(double *)malloc(len*sizeof(double));
+ arr->val=(double *)malloc(len*sizeof(double)); //arr->val=(double *)SMPI_SHARED_MALLOC(len*sizeof(double));
return arr;
}
-void arrShow(Arr* a){
+
+static void arrShow(Arr* a){
if(!a) fprintf(stderr,"-- NULL array\n");
else{
fprintf(stderr,"-- length=%d\n",a->len);
}
}
-double CheckVal(Arr *feat){
+
+static double CheckVal(Arr *feat){
double csum=0.0;
int i=0;
for(i=0;i<feat->len;i++){
- csum+=feat->val[i]*feat->val[i]/feat->len; /* The truncation does not work since
- result will be 0 for large len */
+ csum+=feat->val[i]*feat->val[i]/feat->len; /* The truncation does not work since result will be 0 for large len */
}
- return csum;
+ return csum;
}
-int GetFNumDPar(int* mean, int* stdev){
- *mean=NUM_SAMPLES;
- *stdev=STD_DEVIATION;
+
+static int GetFNumDPar(int* mean, int* stdev){
+ *mean=num_samples;
+ *stdev=deviation;
return 0;
}
-int GetFeatureNum(char *mbname,int id){
+
+static int GetFeatureNum(char *mbname,int id){
double tran=314159265.0;
double A=2*id+1;
double denom=randlc(&tran,&A);
char cval='S';
- int mean=NUM_SAMPLES,stdev=128;
+ int mean=num_samples,stdev=128;
int rtfs=0,len=0;
GetFNumDPar(&mean,&stdev);
rtfs=ipowMod((int)(1/denom)*(int)cval,(long long int) (2*id+1),2*stdev);
len=mean-stdev+rtfs;
return len;
}
-Arr* RandomFeatures(char *bmname,int fdim,int id){
+
+static Arr* RandomFeatures(char *bmname,int fdim,int id){
int len=GetFeatureNum(bmname,id)*fdim;
Arr* feat=newArr(len);
int nxg=2,nyg=2,nzg=2,nfg=5;
timer_stop(id+1);
fprintf(stderr,"** RandomFeatures time in node %d = %f\n",id,timer_read(id+1));
}
- return feat;
+ return feat;
}
-void Resample(Arr *a,int blen){
+
+static void Resample(Arr *a,int blen){
long long int i=0,j=0,jlo=0,jhi=0;
double avval=0.0;
double *nval=(double *)malloc(blen*sizeof(double));
- Arr *tmp=newArr(10);
+ //double *nval=(double *)SMPI_SHARED_MALLOC(blen*sizeof(double));
for(i=0;i<blen;i++) nval[i]=0.0;
for(i=1;i<a->len-1;i++){
jlo=(int)(0.5*(2*i-1)*(blen/a->len));
jhi=(int)(0.5*(2*i+1)*(blen/a->len));
- avval=a->val[i]/(jhi-jlo+1);
+ avval=a->val[i]/(jhi-jlo+1);
for(j=jlo;j<=jhi;j++){
nval[j]+=avval;
}
}
nval[0]=a->val[0];
nval[blen-1]=a->val[a->len-1];
- free(a->val);
+ free(a->val); //SMPI_SHARED_FREE(a->val);
a->val=nval;
a->len=blen;
}
+
#define fielddim 4
-Arr* WindowFilter(Arr *a, Arr* b,int w){
+static Arr* WindowFilter(Arr *a, Arr* b,int w){
int i=0,j=0,k=0;
double rms0=0.0,rms1=0.0,rmsm1=0.0;
double weight=((double) (w+1))/(w+2);
return a;
}
-int SendResults(DGraph *dg,DGNode *nd,Arr *feat){
+static int SendResults(DGraph *dg,DGNode *nd,Arr *feat){
int i=0,tag=0;
DGArc *ar=NULL;
DGNode *head=NULL;
TRACE_smpi_set_category (NULL);
return 1;
}
-Arr* CombineStreams(DGraph *dg,DGNode *nd){
- Arr *resfeat=newArr(NUM_SAMPLES*fielddim);
+static Arr* CombineStreams(DGraph *dg,DGNode *nd){
+ Arr *resfeat=newArr(num_samples*fielddim);
int i=0,len=0,tag=0;
DGArc *ar=NULL;
DGNode *tail=NULL;
feat=newArr(len);
MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
resfeat=WindowFilter(resfeat,feat,nd->id);
- free(feat);
+ free(feat);//SMPI_SHARED_FREE(feat);
}else{
featp=(Arr *)tail->feat;
feat=newArr(featp->len);
memcpy(feat->val,featp->val,featp->len*sizeof(double));
resfeat=WindowFilter(resfeat,feat,nd->id);
- free(feat);
+ free(feat);//SMPI_SHARED_FREE(feat);
}
}
for(i=0;i<resfeat->len;i++) resfeat->val[i]=((int)resfeat->val[i])/nd->inDegree;
nd->feat=resfeat;
return nd->feat;
}
-double Reduce(Arr *a,int w){
+
+static double Reduce(Arr *a,int w){
double retv=0.0;
if(timer_on){
timer_clear(w);
timer_start(w);
}
- retv=(int)(w*CheckVal(a));/* The casting needed for node
- and array dependent verifcation */
+ retv=(int)(w*CheckVal(a));/* The casting needed for node and array dependent verifcation */
if(timer_on){
timer_stop(w);
fprintf(stderr,"** Reduce time in node %d = %f\n",(w-1),timer_read(w));
return retv;
}
-double ReduceStreams(DGraph *dg,DGNode *nd){
+static double ReduceStreams(DGraph *dg,DGNode *nd){
double csum=0.0;
int i=0,len=0,tag=0;
DGArc *ar=NULL;
feat=newArr(len);
MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
csum+=Reduce(feat,(nd->id+1));
- free(feat);
+ free(feat);//SMPI_SHARED_FREE(feat);
}else{
csum+=Reduce(tail->feat,(nd->id+1));
}
return retv;
}
-int ProcessNodes(DGraph *dg,int me){
+static int ProcessNodes(DGraph *dg,int me){
double chksum=0.0;
Arr *feat=NULL;
int i=0,verified=0,tag;
int verified=0, featnum=0;
double bytes_sent=2.0,tot_time=0.0;
- MPI_Init( &argc, &argv );
- MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
- MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
- TRACE_smpi_set_category ("begin");
-
- if(argc!=2||
- ( strncmp(argv[1],"BH",2)!=0
- &&strncmp(argv[1],"WH",2)!=0
- &&strncmp(argv[1],"SH",2)!=0
- )
- ){
+ MPI_Init( &argc, &argv );
+ MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
+ MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
+
+ TRACE_smpi_set_category ("begin");
+ get_info(argc, argv, &nprocs, &class);
+ check_info(DT, nprocs, class);
+
+ if (class == 'S') { num_samples=1728; deviation=128; num_sources=4; }
+ else if (class == 'W') { num_samples=1728*8; deviation=128*2; num_sources=4*2; }
+ else if (class == 'A') { num_samples=1728*64; deviation=128*4; num_sources=4*4; }
+ else if (class == 'B') { num_samples=1728*512; deviation=128*8; num_sources=4*8; }
+ else if (class == 'C') { num_samples=1728*4096; deviation=128*16; num_sources=4*16; }
+ else if (class == 'D') { num_samples=1728*4096*8; deviation=128*32; num_sources=4*32; }
+ else {
+ printf("setparams: Internal error: invalid class type %c\n", class);
+ exit(1);
+ }
+
+
+ if(argc!=2|| ( strncmp(argv[1],"BH",2)!=0 && strncmp(argv[1],"WH",2)!=0 &&strncmp(argv[1],"SH",2)!=0)){
if(my_rank==0){
fprintf(stderr,"** Usage: mpirun -np N ../bin/dt.S GraphName\n");
fprintf(stderr,"** Where \n - N is integer number of MPI processes\n");
exit(0);
}
if(strncmp(argv[1],"BH",2)==0){
- dg=buildBH(CLASS);
+ dg=buildBH(class);
}else if(strncmp(argv[1],"WH",2)==0){
- dg=buildWH(CLASS);
+ dg=buildWH(class);
}else if(strncmp(argv[1],"SH",2)==0){
- dg=buildSH(CLASS);
+ dg=buildSH(class);
}
if(timer_on&&dg->numNodes+1>timers_tot){
verified=ProcessNodes(dg,my_rank);
TRACE_smpi_set_category ("end");
- featnum=NUM_SAMPLES*fielddim;
+ featnum=num_samples*fielddim;
bytes_sent=featnum*dg->numArcs;
bytes_sent/=1048576;
if(my_rank==0){
timer_stop(0);
tot_time=timer_read(0);
- c_print_results( dg->name,
- CLASS,
- featnum,
- 0,
- 0,
- dg->numNodes,
- 0,
- comm_size,
- tot_time,
- bytes_sent/tot_time,
- "bytes transmitted",
- verified,
- NPBVERSION,
- COMPILETIME,
- MPICC,
- CLINK,
- CMPI_LIB,
- CMPI_INC,
- CFLAGS,
- CLINKFLAGS );
+ c_print_results( dg->name, class, featnum, 0, 0, dg->numNodes, 0, comm_size, tot_time, bytes_sent/tot_time,
+ "bytes transmitted", verified);
}
MPI_Finalize();
return 1;
--- /dev/null
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "smpi/mpi.h"
+#include "nas_common.h"
+#include "simgrid/instr.h" //TRACE_
+
+/* Problem class letter and expected process count; both are handed to get_info() in main() to be filled in. */
+char class;
+int nprocs;
+
+/* Local boolean constants used for the `verified` and `stop` flags in main(). */
+#define true 1
+#define false 0
+
+/*
+ * EP ("embarrassingly parallel") NAS kernel.
+ *
+ * Every rank generates its share of uniform pseudo-random pairs with vranlc(), converts the accepted pairs into
+ * Gaussian deviates by the acceptance-rejection method, and tallies them into nq concentric square annuli. The
+ * per-rank sums (sx, sy) and annulus counts (q) are then combined with MPI_Allreduce, and rank 0 compares the global
+ * sums with the reference values of the selected class before printing the report.
+ *
+ * `class` and `nprocs` are handed to get_info()/check_info() (declared in nas_common.h) to be filled in and checked.
+ *
+ * Returns 0 on normal completion. Exits with status 1 on an unknown class or an allocation failure, and aborts the
+ * MPI job when there are more processes than batches of random numbers.
+ */
+int main(int argc, char **argv) {
+  double dum[3] = {1.,1.,1.};
+  double x1, x2, sx, sy, tm, an, gc;
+  double Mops;
+  double epsilon=1.0E-8, a = 1220703125., s=271828183.;
+  double t1, t2, t3, t4;
+  double sx_verify_value = 0.0, sy_verify_value = 0.0, sx_err, sy_err;
+
+  int m, mk=16, mm, nn, nk = (int)(pow(2,mk)), nq=10, np, node, no_nodes, i, ik, kk, l, k, nit, no_large_nodes,
+      np_add, k_offset;
+  int root=0;
+  int verified;
+  int stop;
+  char size[500]; // mind the size of the string to represent a big number
+
+  double *start   = (double *) malloc (64*sizeof(double));
+  double *elapsed = (double *) malloc (64*sizeof(double));
+
+  double *x = (double *) malloc (2*nk*sizeof(double));
+  double *q = (double *) malloc (nq*sizeof(double));
+
+  if (start == NULL || elapsed == NULL || x == NULL || q == NULL) {
+    fprintf(stderr,"EP: unable to allocate work arrays\n");
+    exit(1);
+  }
+
+  MPI_Init( &argc, &argv );
+  MPI_Comm_size( MPI_COMM_WORLD, &no_nodes);
+  MPI_Comm_rank( MPI_COMM_WORLD, &node);
+
+  TRACE_smpi_set_category ("start");
+
+  get_info(argc, argv, &nprocs, &class);
+  check_info(EP, nprocs, class);
+
+  /* m is the log2 of the number of random pairs for the selected class */
+  if      (class == 'S') { m = 24; }
+  else if (class == 'W') { m = 25; }
+  else if (class == 'A') { m = 28; }
+  else if (class == 'B') { m = 30; }
+  else if (class == 'C') { m = 32; }
+  else if (class == 'D') { m = 36; }
+  else if (class == 'E') { m = 40; }
+  else {
+    printf("EP: Internal error: invalid class type %c\n", class);
+    exit(1);
+  }
+  mm = m - mk;
+  nn = (int)(pow(2,mm));
+
+  root = 0;
+  if (node == root ) {
+    /* The size of the problem is too large to store in a 32-bit integer for some classes, so it is formatted into a
+     * string for printing. "%.0f" prints the power of two exactly and without a decimal point, whatever the width of
+     * `long` on the platform. */
+    fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark");
+    snprintf(size, sizeof(size), "%.0f", pow(2.0, m+1));
+    fprintf(stdout," Number of random numbers generated: %s\n",size);
+    fprintf(stdout," Number of active processes: %d\n",no_nodes);
+  }
+  verified = false;
+
+  /* Compute the number of "batches" of random number pairs generated per processor. Adjust if the number of processors
+   * does not evenly divide the total number
+   */
+  np = nn / no_nodes;
+  no_large_nodes = nn % no_nodes;
+  if (node < no_large_nodes)
+    np_add = 1;
+  else
+    np_add = 0;
+  np = np + np_add;
+
+  if (np == 0) {
+    fprintf(stdout,"Too many nodes: %d %d",no_nodes,nn);
+    MPI_Abort(MPI_COMM_WORLD,1);
+    exit(0);
+  }
+
+  /* Call the random number generator functions and initialize the x-array to reduce the effects of paging the timings.
+     Also, call all mathematical functions that are used. Make sure initializations cannot be eliminated as dead code. */
+  vranlc(0, dum[0], dum[1], &(dum[2]));
+
+  dum[0] = randlc(&(dum[1]),&(dum[2]));
+  for (i=0;i<2*nk;i++) {
+    x[i] = -1e99;
+  }
+  Mops = log(sqrt(fabs(1.0)));
+
+  /* Synchronize before placing time stamp */
+  MPI_Barrier( MPI_COMM_WORLD );
+
+  TRACE_smpi_set_category ("ep");
+
+  time_clear(&(elapsed[1]));
+  time_clear(&(elapsed[2]));
+  time_clear(&(elapsed[3]));
+  time_start(&(start[1]));
+
+  t1 = a;
+  t1 = vranlc(0, t1, a, x);
+
+  /* Compute AN = A ^ (2 * NK) (mod 2^46). */
+  t1 = a;
+  for (i=1; i <= mk+1; i++) {
+    t2 = randlc(&t1, &t1);
+  }
+  an = t1;
+  gc = 0.;
+  sx = 0.;
+  sy = 0.;
+  for (i=0; i < nq ; i++) {
+    q[i] = 0.;
+  }
+
+  /* Each instance of this loop may be performed independently. We compute the k offsets separately to take into
+   * account the fact that some nodes have more numbers to generate than others */
+  if (np_add == 1)
+    k_offset = node * np -1;
+  else
+    k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1;
+
+  for(k = 1; k <= np; k++) {// SMPI_SAMPLE_LOCAL(0.25 * np, 0.03) {
+    stop = false;
+    kk = k_offset + k ;
+    t1 = s;
+    t2 = an;
+
+    // Find starting seed t1 for this kk.
+    for (i=1;i<=100 && !stop;i++) {
+      ik = kk / 2;
+      if (2 * ik != kk) {
+        t3 = randlc(&t1, &t2);
+      }
+      if (ik==0)
+        stop = true;
+      else {
+        t3 = randlc(&t2, &t2);
+        kk = ik;
+      }
+    }
+
+    // Compute uniform pseudorandom numbers (timer 3).
+    time_start(&(start[3]));
+    t1 = vranlc(2 * nk, t1, a, x);
+    time_stop(3,elapsed,start);
+
+    /* Compute Gaussian deviates by acceptance-rejection method and tally counts in concentric square annuli.
+     * This loop is not vectorizable. (timer 2) */
+    time_start(&(start[2]));
+    for(i=1; i<=nk;i++) {
+      x1 = 2. * x[2*i-2] - 1.0;
+      x2 = 2. * x[2*i-1] - 1.0;
+      t1 = x1*x1 + x2*x2;
+      if (t1 <= 1.) {
+        t2 = sqrt(-2. * log(t1) / t1);
+        t3 = (x1 * t2);
+        t4 = (x2 * t2);
+        /* annulus index: integer part of the larger absolute deviate */
+        l = (int)(fabs(t3) > fabs(t4) ? fabs(t3) : fabs(t4));
+        q[l] = q[l] + 1.;
+        sx = sx + t3;
+        sy = sy + t4;
+      }
+    }
+    time_stop(2,elapsed,start);
+  }
+
+  TRACE_smpi_set_category ("finalize");
+
+  /* x is reused as the receive buffer of the reductions; results start at index 0 */
+  MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  sx = x[0];
+  MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  sy = x[0];
+  MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+  for(i = 0; i < nq; i++) {
+    q[i] = x[i];
+  }
+  for(i = 0; i < nq; i++) {
+    gc += q[i];
+  }
+
+  time_stop(1,elapsed,start);
+  tm = time_read(1,elapsed);
+  MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  tm = x[0];
+
+  if(node == root) {
+    nit = 0;
+    verified = true;
+
+    /* Reference sums per problem size; sizes without a reference (e.g. m == 40) are reported as unverified */
+    if(m == 24) {
+      sx_verify_value = -3.247834652034740E3;
+      sy_verify_value = -6.958407078382297E3;
+    } else if(m == 25) {
+      sx_verify_value = -2.863319731645753E3;
+      sy_verify_value = -6.320053679109499E3;
+    } else if(m == 28) {
+      sx_verify_value = -4.295875165629892E3;
+      sy_verify_value = -1.580732573678431E4;
+    } else if(m == 30) {
+      sx_verify_value =  4.033815542441498E4;
+      sy_verify_value = -2.660669192809235E4;
+    } else if(m == 32) {
+      sx_verify_value =  4.764367927995374E4;
+      sy_verify_value = -8.084072988043731E4;
+    } else if(m == 36) {
+      sx_verify_value =  1.982481200946593E5;
+      sy_verify_value = -1.020596636361769E5;
+    } else {
+      verified = false;
+    }
+
+    if(verified) {
+      /* fabs(), not abs(): the integer abs() would truncate any relative error below 1 to 0 and always "verify" */
+      sx_err = fabs((sx - sx_verify_value)/sx_verify_value);
+      sy_err = fabs((sy - sy_verify_value)/sy_verify_value);
+      verified = ((sx_err < epsilon) && (sy_err < epsilon));
+    }
+
+    Mops = (pow(2.0, m+1))/tm/1000;
+
+    fprintf(stdout,"EP Benchmark Results:\n");
+    fprintf(stdout,"CPU Time=%d\n",(int) tm);
+    fprintf(stdout,"N = 2^%d\n",m);
+    fprintf(stdout,"No. Gaussain Pairs =%d\n",(int) gc);
+    fprintf(stdout,"Sum = %f %ld\n",sx,(long) sy);
+    fprintf(stdout,"Count:");
+    for(i = 0; i < nq; i++) {
+      fprintf(stdout,"%d\t %ld\n",i,(long) q[i]);
+    }
+    c_print_results("EP", class, m+1, 0, 0, nit, nprocs, no_nodes, tm, Mops, "Random number generated",verified);
+
+    fprintf(stdout,"Total time: %f\n",(time_read(1,elapsed)/1000));
+    fprintf(stdout,"Gaussian pairs: %f\n",(time_read(2,elapsed)/1000));
+    fprintf(stdout,"Random numbers: %f\n",(time_read(3,elapsed)/1000));
+  }
+
+  MPI_Finalize();
+
+  free(start);
+  free(elapsed);
+  free(x);
+  free(q);
+  return 0;
+}
--- /dev/null
+/*************************************************************************
+ * *
+ * N A S P A R A L L E L B E N C H M A R K S 3.3 *
+ * *
+ * I S *
+ * *
+ *************************************************************************
+ * *
+ * This benchmark is part of the NAS Parallel Benchmark 3.3 suite. *
+ * It is described in NAS Technical Report 95-020. *
+ * *
+ * Permission to use, copy, distribute and modify this software *
+ * for any purpose with or without fee is hereby granted. We *
+ * request, however, that all derived work reference the NAS *
+ * Parallel Benchmarks 3.3. This software is provided "as is" *
+ * without express or implied warranty. *
+ * *
+ * Information on NPB 3.3, including the technical report, the *
+ * original specifications, source code, results and information *
+ * on how to submit new results, is available at: *
+ * *
+ * http://www.nas.nasa.gov/Software/NPB *
+ * *
+ * Send comments or suggestions to npb@nas.nasa.gov *
+ * Send bug reports to npb-bugs@nas.nasa.gov *
+ * *
+ * NAS Parallel Benchmarks Group *
+ * NASA Ames Research Center *
+ * Mail Stop: T27A-1 *
+ * Moffett Field, CA 94035-1000 *
+ * *
+ * E-mail: npb@nas.nasa.gov *
+ * Fax: (650) 604-3957 *
+ * *
+ *************************************************************************
+ * *
+ * Author: M. Yarrow *
+ * H. Jin *
+ * *
+ *************************************************************************/
+
+#include "smpi/mpi.h"
+#include "nas_common.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "simgrid/instr.h" //TRACE_
+
+/* Run-time problem description. main() passes `class` and `nprocs` to get_info(), then sets the log2 sizes (and, for
+ * some classes, the process bounds) according to the class letter. */
+char class;
+int nprocs;
+int total_keys_log2;
+int max_key_log_2;
+int num_bucket_log_2;
+int min_procs=1;
+/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024.
+ * INCREASE max_procs AT YOUR PERIL
+ */
+int max_procs=1024;
+
+/* Derived sizes, computed in main(): total_keys, max_key and num_buckets are 1 << of the matching log2 value;
+ * num_keys is total_keys/nprocs*min_procs; size_of_buffers is a multiple of num_keys chosen from nprocs. */
+int total_keys;
+int max_key;
+int num_buckets;
+int num_keys;
+long size_of_buffers;
+
+#define MAX_ITERATIONS 10
+#define TEST_ARRAY_SIZE 5
+
+/* Typedef: if necessary, change the size of int here by changing the int type to, say, long */
+typedef int INT_TYPE;
+typedef long INT_TYPE2;
+#define MP_KEY_TYPE MPI_INT
+
+/* Per-process state of the IS benchmark; a single instance is allocated in main() and passed to every routine. */
+typedef struct {
+/* MPI properties: rank of this process and size of MPI_COMM_WORLD */
+int my_rank, comm_size;
+/* Copies of values local to rank(), saved on its last iteration for use by full_verify() */
+INT_TYPE *key_buff_ptr_global, /* used by full_verify to get */
+ total_local_keys, /* copies of rank info */
+ total_lesser_keys;
+
+/* Number of verification checks passed so far (incremented by rank() and full_verify()) */
+int passed_verification;
+/* These are the three main arrays; main() allocates each of them with size_of_buffers elements */
+INT_TYPE *key_array, *key_buff1, *key_buff2,
+ *bucket_size, /* Top 5 elements for */
+ *bucket_size_totals, /* part. ver. vals */
+ *bucket_ptrs, *process_bucket_distrib_ptr1, *process_bucket_distrib_ptr2;
+/* Per-destination counts and displacements for MPI_Alltoall/MPI_Alltoallv; fixed at 1024 entries
+ * NOTE(review): presumably sized after the default max_procs (1024) -- confirm before raising max_procs */
+int send_count[1024], recv_count[1024], send_displ[1024], recv_displ[1024];
+
+/* Partial verif info: copied in main() from the <class>_test_*_array table of the selected class */
+INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
+ test_rank_array[TEST_ARRAY_SIZE];
+} global_data;
+
+/* Partial-verification reference tables, one pair per problem class (S, W, A, B, C, D). For the selected class,
+ * main() copies the pair into gd->test_index_array / gd->test_rank_array: rank() uses the index values to pick the
+ * test keys out of key_array and compares their computed ranks with the rank values, adjusted by the iteration. */
+const INT_TYPE2
+ S_test_index_array[TEST_ARRAY_SIZE] = {48427,17148,23627,62548,4431},
+ S_test_rank_array[TEST_ARRAY_SIZE] = {0,18,346,64917,65463},
+ W_test_index_array[TEST_ARRAY_SIZE] = {357773,934767,875723,898999,404505},
+ W_test_rank_array[TEST_ARRAY_SIZE] = {1249,11698,1039987,1043896,1048018},
+
+ A_test_index_array[TEST_ARRAY_SIZE] = {2112377,662041,5336171,3642833,4250760},
+ A_test_rank_array[TEST_ARRAY_SIZE] = {104,17523,123928,8288932,8388264},
+
+ B_test_index_array[TEST_ARRAY_SIZE] = {41869,812306,5102857,18232239,26860214},
+ B_test_rank_array[TEST_ARRAY_SIZE] = {33422937,10244,59149,33135281,99},
+
+ C_test_index_array[TEST_ARRAY_SIZE] = {44172927,72999161,74326391,129606274,21736814},
+ C_test_rank_array[TEST_ARRAY_SIZE] = {61147,882988,266290,133997595,133525895},
+
+ D_test_index_array[TEST_ARRAY_SIZE] = {1317351170,995930646,1157283250,1503301535,1453734525},
+ D_test_rank_array[TEST_ARRAY_SIZE] = {1,36538729,1978098519,2145192618,2147425337};
+
+void full_verify( global_data* gd );
+
+/************ returns parallel random number seq seed ************/
+/*
+ * Create a random number sequence of total length nn residing on np number of processors. Each processor will
+ * therefore have a subsequence of length nn/np. This routine returns that random number which is the first random
+ * number for the subsequence belonging to processor rank kn, and which is used as seed for proc kn ran # gen.
+ */
+/*
+ * Returns the seed from which processor `kn` starts its own sub-sequence of random numbers.
+ *
+ * The multiplier `a` is first advanced by repeated self-application of randlc() (mq times, where mq is the number of
+ * halvings needed to bring nn/np down to 1); the seed `s` is then advanced according to the binary digits of kn.
+ * Only the side effects of randlc() on its arguments are used here, so its return values are discarded.
+ */
+static double find_my_seed( int kn,   /* my processor rank, 0<=kn<=num procs */
+                            int np,   /* np = num procs */
+                            long nn,  /* total num of ran numbers, all procs */
+                            double s, /* Ran num seed, for ex.: 314159265.00 */
+                            double a ) /* Ran num gen mult, try 1220703125.00 */
+{
+  long i;
+  double t1,t2;
+  long mq,nq,kk,ik;
+
+  nq = nn / np;
+
+  /* mq = number of times nq can be halved before reaching 1 */
+  for( mq=0; nq>1; mq++,nq/=2);
+
+  t1 = a;
+
+  for( i=1; i<=mq; i++ )
+    (void) randlc( &t1, &t1 );
+
+  kk = kn;
+  t2 = t1;   /* advanced multiplier */
+  t1 = s;
+
+  /* Walk the bits of kn from the least significant one: advance the seed on every set bit and advance the
+   * multiplier between bits. */
+  for( i=1; i<=100; i++ ){
+    ik = kk / 2;
+    if( 2 * ik != kk )
+      (void) randlc( &t1, &t2 );
+    if( ik == 0 )
+      break;
+    (void) randlc( &t2, &t2 );
+    kk = ik;
+  }
+
+  return t1;
+}
+
+/*
+ * Fills gd->key_array[0 .. num_keys-1] with the initial keys: each key is max_key/4 times the sum of four
+ * consecutive randlc() draws, converted to INT_TYPE on assignment. `seed` and `a` are local copies that randlc()
+ * works on through the pointers it receives.
+ */
+static void create_seq( global_data* gd, double seed, double a )
+{
+  const int scale = max_key/4;
+  int idx;
+
+  for (idx = 0; idx < num_keys; idx++) {
+    double sum = randlc(&seed, &a);
+    int draw;
+
+    for (draw = 1; draw < 4; draw++)
+      sum += randlc(&seed, &a);
+
+    gd->key_array[idx] = scale*sum;
+  }
+}
+
+/*
+ * Final (untimed) verification: places the keys of key_buff2 into key_array using the counts saved by rank(), then
+ * checks that the local keys are in non-decreasing order and that the last key of the previous rank is not greater
+ * than the first local key. Increments gd->passed_verification on success, prints a diagnostic otherwise.
+ */
+void full_verify( global_data* gd )
+{
+  MPI_Status recv_status;
+  MPI_Request recv_request;
+
+  INT_TYPE idx, tail_idx;
+  INT_TYPE prev_max;            /* only meaningful on ranks that have a predecessor */
+  INT_TYPE misplaced = 0;
+  const int has_prev = gd->my_rank > 0;
+  const int has_next = gd->my_rank < gd->comm_size-1;
+
+  /* Now, finally, sort the keys: each key goes to the slot given by its (pre-decremented) counter */
+  for (idx = 0; idx < gd->total_local_keys; idx++) {
+    INT_TYPE key = gd->key_buff2[idx];
+    INT_TYPE slot = --gd->key_buff_ptr_global[key] - gd->total_lesser_keys;
+    gd->key_array[slot] = key;
+  }
+  tail_idx = (gd->total_local_keys < 1) ? 0 : (gd->total_local_keys - 1);
+
+  /* Send largest key value to next processor; the receive is posted first and completed afterwards */
+  if (has_prev)
+    MPI_Irecv( &prev_max, 1, MP_KEY_TYPE, gd->my_rank-1, 1000, MPI_COMM_WORLD, &recv_request );
+  if (has_next)
+    MPI_Send( &gd->key_array[tail_idx], 1, MP_KEY_TYPE, gd->my_rank+1, 1000, MPI_COMM_WORLD );
+  if (has_prev)
+    MPI_Wait( &recv_request, &recv_status );
+
+  /* Confirm that neighbor's greatest key value is not greater than my least key value */
+  if (has_prev && gd->total_local_keys > 0 && prev_max > gd->key_array[0])
+    misplaced++;
+
+  /* Confirm keys correctly sorted: count incorrectly sorted keys, if any */
+  for (idx = 1; idx < gd->total_local_keys; idx++) {
+    if (gd->key_array[idx-1] > gd->key_array[idx])
+      misplaced++;
+  }
+
+  if (misplaced == 0)
+    gd->passed_verification++;
+  else
+    printf( "Processor %d: Full_verify: number of keys out of sort: %d\n", gd->my_rank, misplaced );
+}
+
+/*
+ * One ranking pass of the integer sort.
+ *
+ * The local keys are sorted into buckets, the bucket sizes are summed over all processes, contiguous bucket ranges
+ * are assigned to processes, the keys are exchanged accordingly (MPI_Alltoall + MPI_Alltoallv) and the received keys
+ * are counted to obtain their ranks. The five test keys of the class are then checked against their expected ranks
+ * (partial verification). On the last iteration the information needed by full_verify() is saved in gd.
+ */
+static void rank( global_data* gd, int iteration )
+{
+  INT_TYPE i, k;
+  INT_TYPE key;
+  INT_TYPE bucket_shift = max_key_log_2 - num_bucket_log_2;
+  INT_TYPE local_acc;
+  INT_TYPE min_key_val, max_key_val;
+  INT_TYPE *key_buff_ptr;
+  INT_TYPE2 global_acc, proc, local_keys, lesser_keys;
+
+  /* Iteration alteration of keys */
+  if (gd->my_rank == 0) {
+    gd->key_array[iteration] = iteration;
+    gd->key_array[iteration+MAX_ITERATIONS] = max_key - iteration;
+  }
+
+  /* Initialize */
+  for (i = 0; i < num_buckets+TEST_ARRAY_SIZE; i++) {
+    gd->bucket_size[i] = 0;
+    gd->bucket_size_totals[i] = 0;
+    gd->process_bucket_distrib_ptr1[i] = 0;
+    gd->process_bucket_distrib_ptr2[i] = 0;
+  }
+
+  /* Determine where the partial verify test keys are, load into top of array bucket_size */
+  for (i = 0; i < TEST_ARRAY_SIZE; i++) {
+    if ((gd->test_index_array[i]/num_keys) == gd->my_rank)
+      gd->bucket_size[num_buckets+i] = gd->key_array[gd->test_index_array[i] % num_keys];
+  }
+
+  /* Determine the number of keys in each bucket */
+  for (i = 0; i < num_keys; i++)
+    gd->bucket_size[gd->key_array[i] >> bucket_shift]++;
+
+  /* Accumulative bucket sizes are the bucket pointers */
+  gd->bucket_ptrs[0] = 0;
+  for (i = 1; i < num_buckets; i++)
+    gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];
+
+  /* Sort into appropriate bucket */
+  for (i = 0; i < num_keys; i++) {
+    key = gd->key_array[i];
+    gd->key_buff1[gd->bucket_ptrs[key >> bucket_shift]++] = key;
+  }
+
+  /* Get the bucket size totals for the entire problem. These will be used to determine the redistribution of keys */
+  MPI_Allreduce(gd->bucket_size, gd->bucket_size_totals, num_buckets+TEST_ARRAY_SIZE, MP_KEY_TYPE, MPI_SUM,
+                MPI_COMM_WORLD);
+
+  /* Determine redistribution of keys: accumulate the bucket size totals until this number surpasses num_keys (which
+   * is the average number of keys per processor). Then all keys in these buckets go to processor 0. Continue
+   * accumulating again until surpassing 2*num_keys. All keys in these buckets go to processor 1, etc. This algorithm
+   * guarantees that all processors have work ranking; no processors are left idle.
+   * The optimum number of buckets, however, does not result in as high a degree of load balancing (as even a
+   * distribution of keys as is possible) as is obtained from increasing the number of buckets, but more buckets
+   * results in more computation per processor so that the optimum number of buckets turns out to be 1024 for machines
+   * tested.
+   * Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket number of first and last bucket which each
+   * processor will have after the redistribution is done.
+   */
+  global_acc = 0;
+  local_acc = 0;
+  gd->send_displ[0] = 0;
+  gd->process_bucket_distrib_ptr1[0] = 0;
+  proc = 0;
+  for (i = 0; i < num_buckets; i++) {
+    global_acc += gd->bucket_size_totals[i];
+    local_acc += gd->bucket_size[i];
+    if (global_acc < (proc+1)*num_keys)
+      continue;
+
+    /* bucket i closes the range of process `proc` */
+    gd->send_count[proc] = local_acc;
+    if (proc != 0) {
+      gd->send_displ[proc] = gd->send_displ[proc-1] + gd->send_count[proc-1];
+      gd->process_bucket_distrib_ptr1[proc] = gd->process_bucket_distrib_ptr2[proc-1]+1;
+    }
+    gd->process_bucket_distrib_ptr2[proc] = i;
+    proc++;
+    local_acc = 0;
+  }
+
+  /* When nprocs approaching num_buckets, it is highly possible that the last few processors don't get any buckets.
+   * So, we need to set counts properly in this case to avoid any fallouts. */
+  for (; proc < gd->comm_size; proc++) {
+    gd->send_count[proc] = 0;
+    gd->process_bucket_distrib_ptr1[proc] = 1;
+  }
+
+  /* This is the redistribution section: first find out how many keys each processor will send to every other
+   * processor: */
+  MPI_Alltoall( gd->send_count, 1, MPI_INT, gd->recv_count, 1, MPI_INT, MPI_COMM_WORLD );
+
+  /* Determine the receive array displacements for the buckets */
+  gd->recv_displ[0] = 0;
+  for (i = 1; i < gd->comm_size; i++)
+    gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];
+
+  /* Now send the keys to respective processors */
+  MPI_Alltoallv(gd->key_buff1, gd->send_count, gd->send_displ, MP_KEY_TYPE, gd->key_buff2, gd->recv_count,
+                gd->recv_displ, MP_KEY_TYPE, MPI_COMM_WORLD );
+
+  /* The starting and ending bucket numbers on each processor are multiplied by the interval size of the buckets to
+   * obtain the smallest possible min and greatest possible max value of any key on each processor
+   */
+  min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << bucket_shift;
+  max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << bucket_shift)-1;
+
+  /* Clear the work array */
+  for (i = 0; i < max_key_val-min_key_val+1; i++)
+    gd->key_buff1[i] = 0;
+
+  /* Determine the total number of keys on all other processors holding keys of lesser value */
+  lesser_keys = 0;
+  for (k = 0; k < gd->my_rank; k++) {
+    for (i = gd->process_bucket_distrib_ptr1[k]; i <= gd->process_bucket_distrib_ptr2[k]; i++)
+      lesser_keys += gd->bucket_size_totals[i];
+  }
+
+  /* Determine total number of keys on this processor */
+  local_keys = 0;
+  for (i = gd->process_bucket_distrib_ptr1[gd->my_rank]; i <= gd->process_bucket_distrib_ptr2[gd->my_rank]; i++)
+    local_keys += gd->bucket_size_totals[i];
+
+  /* Ranking of all keys occurs in this section: */
+  /* shift it backwards so no subtractions are necessary in loop */
+  key_buff_ptr = gd->key_buff1 - min_key_val;
+
+  /* In this section, the keys themselves are used as their own indexes to determine how many of each there are:
+   * their individual population */
+  for (i = 0; i < local_keys; i++)
+    key_buff_ptr[gd->key_buff2[i]]++;
+
+  /* To obtain ranks of each key, successively add the individual key population, not forgetting the total of lesser
+   * keys.
+   * NOTE: Since the total of lesser keys would be subtracted later in verification, it is no longer added to the
+   * first key population here, but still needed during the partial verify test. This is to ensure that 32-bit
+   * key_buff can still be used for class D. */
+  for (i = min_key_val; i < max_key_val; i++)
+    key_buff_ptr[i+1] += key_buff_ptr[i];
+
+  /* This is the partial verify test section */
+  /* Observe that test_rank_array vals are shifted differently for different cases */
+  for (i = 0; i < TEST_ARRAY_SIZE; i++) {
+    INT_TYPE2 key_rank, reference, expected = 0;
+    int known_class = 1;
+
+    k = gd->bucket_size_totals[i+num_buckets]; /* Keys were hidden here */
+    if (k < min_key_val || k > max_key_val)
+      continue;                                /* this test key is ranked by another process */
+
+    /* Add the total of lesser keys here */
+    key_rank = key_buff_ptr[k-1] + lesser_keys;
+    reference = gd->test_rank_array[i];
+
+    switch (class) {
+    case 'S':
+    case 'C':
+      expected = (i <= 2) ? reference+iteration : reference-iteration;
+      break;
+    case 'W':
+      expected = (i < 2) ? reference+(iteration-2) : reference-iteration;
+      break;
+    case 'A':
+      expected = (i <= 2) ? reference+(iteration-1) : reference-(iteration-1);
+      break;
+    case 'B':
+      expected = (i == 1 || i == 2 || i == 4) ? reference+iteration : reference-iteration;
+      break;
+    case 'D':
+      expected = (i < 2) ? reference+iteration : reference-iteration;
+      break;
+    default:
+      known_class = 0;                         /* no reference for this class: nothing is counted */
+      break;
+    }
+
+    if (known_class) {
+      if (key_rank != expected)
+        printf( "Failed partial verification: iteration %d, processor %d, test key %d\n",
+                iteration, gd->my_rank, (int)i );
+      else
+        gd->passed_verification++;
+    }
+  }
+
+  /* Make copies of rank info for use by full_verify: these variables in rank are local; making them global slows
+   * down the code, probably since they cannot be made register by compiler */
+  if (iteration == MAX_ITERATIONS) {
+    gd->key_buff_ptr_global = key_buff_ptr;
+    gd->total_local_keys = local_keys;
+    gd->total_lesser_keys = 0; /* no longer set to the total of lesser keys, see note above */
+  }
+}
+
+int main( int argc, char **argv )
+{
+ int i, iteration, itemp;
+ double timecounter, maxtime;
+
+ global_data* gd = malloc(sizeof(global_data));
+/* Initialize MPI */
+ MPI_Init( &argc, &argv );
+ MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
+ MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );
+
+ get_info(argc, argv, &nprocs, &class);
+ check_info(IS, nprocs, class);
+/* Initialize the verification arrays if a valid class */
+ for( i=0; i<TEST_ARRAY_SIZE; i++ )
+
+ switch( class ) {
+ case 'S':
+ total_keys_log2 = 16;
+ max_key_log_2 = 11;
+ num_bucket_log_2 = 9;
+ max_procs = 128;
+ gd->test_index_array[i] = S_test_index_array[i];
+ gd->test_rank_array[i] = S_test_rank_array[i];
+ break;
+ case 'A':
+ total_keys_log2 = 23;
+ max_key_log_2 = 19;
+ num_bucket_log_2 = 10;
+ gd->test_index_array[i] = A_test_index_array[i];
+ gd->test_rank_array[i] = A_test_rank_array[i];
+ break;
+ case 'W':
+ total_keys_log2 = 20;
+ max_key_log_2 = 16;
+ num_bucket_log_2 = 10;
+ gd->test_index_array[i] = W_test_index_array[i];
+ gd->test_rank_array[i] = W_test_rank_array[i];
+ break;
+ case 'B':
+ total_keys_log2 = 25;
+ max_key_log_2 = 21;
+ num_bucket_log_2 = 10;
+ gd->test_index_array[i] = B_test_index_array[i];
+ gd->test_rank_array[i] = B_test_rank_array[i];
+ break;
+ case 'C':
+ total_keys_log2 = 27;
+ max_key_log_2 = 23;
+ num_bucket_log_2 = 10;
+ gd->test_index_array[i] = C_test_index_array[i];
+ gd->test_rank_array[i] = C_test_rank_array[i];
+ break;
+ case 'D':
+ total_keys_log2 = 29;
+ max_key_log_2 = 27;
+ num_bucket_log_2 = 10;
+ min_procs = 4;
+ gd->test_index_array[i] = D_test_index_array[i];
+ gd->test_rank_array[i] = D_test_rank_array[i];
+ break;
+ };
+
+ total_keys = (1 << total_keys_log2);
+ max_key = (1 << max_key_log_2);
+ num_buckets = (1 << num_bucket_log_2);
+ num_keys = (total_keys/nprocs*min_procs);
+
+  /* On larger numbers of processors, since the keys are (roughly) Gaussian distributed, the first and last processors
+   * sort keys in a large interval, requiring array sizes to be larger. Note that for a large number of processes,
+   * num_keys is, however, a small number. The required array size also depends on the bucket size used. The following
+   * values are validated for the 1024-bucket setup. */
+ if (nprocs < 256)
+ size_of_buffers = 3*num_keys/2;
+ else if (nprocs < 512)
+ size_of_buffers = 5*num_keys/2;
+ else if (nprocs < 1024)
+ size_of_buffers = 4*num_keys/2;
+ else
+ size_of_buffers = 13*num_keys/2;
+
+ gd->key_array = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
+ gd->key_buff1 = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
+ gd->key_buff2 = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
+ gd->bucket_size = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE)); /* Top 5 elements for */
+ gd->bucket_size_totals = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE)); /* part. ver. vals */
+ gd->bucket_ptrs = (INT_TYPE*)malloc(num_buckets*sizeof(INT_TYPE));
+ gd->process_bucket_distrib_ptr1 = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));
+ gd->process_bucket_distrib_ptr2 = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));
+// int send_count[max_procs], recv_count[max_procs],
+// send_displ[max_procs], recv_displ[max_procs];
+
+/* Printout initial NPB info */
+ if( gd->my_rank == 0 ){
+ printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
+ printf( " Size: %ld (class %c)\n", (long)total_keys*min_procs, class);
+ printf( " Iterations: %d\n", MAX_ITERATIONS );
+ printf( " Number of processes: %d\n",gd->comm_size );
+ }
+
+/* Check that actual and compiled number of processors agree */
+ if( gd->comm_size != nprocs) {
+ if( gd->my_rank == 0 )
+ printf( "\n ERROR: compiled for %d processes\n"
+ " Number of active processes: %d\n"
+ " Exiting program!\n\n", nprocs, gd->comm_size );
+ MPI_Finalize();
+ exit( 1 );
+ }
+
+/* Check to see whether total number of processes is within bounds.
+ This could in principle be checked in setparams.c, but it is more convenient to do it here */
+ if( gd->comm_size < min_procs || gd->comm_size > max_procs){
+ if( gd->my_rank == 0 )
+ printf( "\n ERROR: number of processes %d not within range %d-%d"
+ "\n Exiting program!\n\n", gd->comm_size, min_procs, max_procs);
+ MPI_Finalize();
+ exit( 1 );
+ }
+
+/* Generate random number sequence and subsequent keys on all procs */
+ create_seq(gd, find_my_seed( gd->my_rank, gd->comm_size, 4*(long)total_keys*min_procs,
+ 314159265.00, /* Random number gen seed */
+ 1220703125.00 ), /* Random number gen mult */
+ 1220703125.00 ); /* Random number gen mult */
+
+/* Do one iteration for free (i.e., untimed) to guarantee initialization of
+ all data and code pages and respective tables */
+ rank(gd, 1 );
+
+/* Start verification counter */
+ gd->passed_verification = 0;
+
+ if( gd->my_rank == 0 && class != 'S' ) printf( "\n iteration\n" );
+
+/* Initialize timer */
+ timer_clear(0);
+
+/* Start timer */
+ timer_start(0);
+
+ char smpi_category[100];
+ snprintf (smpi_category, 100, "%d", gd->my_rank);
+ TRACE_smpi_set_category (smpi_category);
+
+/* This is the main iteration */
+ for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) {
+ if( gd->my_rank == 0 && class != 'S' ) printf( " %d\n", iteration );
+ rank(gd, iteration );
+ }
+ TRACE_smpi_set_category (NULL);
+
+/* Stop timer, obtain time for processors */
+ timer_stop(0);
+
+ timecounter = timer_read(0);
+
+/* End of timing, obtain maximum time of all processors */
+ MPI_Reduce( &timecounter, &maxtime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD );
+
+/* This tests that keys are in sequence: sorting of last ranked key seq occurs here, but is an untimed operation */
+ full_verify(gd);
+
+/* Obtain verification counter sum */
+ itemp =gd->passed_verification;
+ MPI_Reduce( &itemp, &gd->passed_verification, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD );
+
+/* The final printout */
+ if( gd->my_rank == 0 ) {
+ if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
+ gd->passed_verification = 0;
+ c_print_results("IS", class, (int)(total_keys), min_procs, 0, MAX_ITERATIONS, nprocs, gd->comm_size, maxtime,
+ ((double) (MAX_ITERATIONS)*total_keys*min_procs)/maxtime/1000000., "keys ranked",
+ gd->passed_verification);
+ }
+
+ MPI_Finalize();
+ free(gd);
+
+ return 0;
+}
--- /dev/null
+/* Copyright (c) 2016. The SimGrid Team.
+ * All rights reserved. */
+
+/* This program is free software; you can redistribute it and/or modify it
+ * under the terms of the license (GNU LGPL) which comes with this package. */
+#include "nas_common.h"
+
+static double start[64], elapsed[64];
+
+/* Integer base-2 logarithm. Yields k when i == 2^k for some 0 <= k < 20, and -1 for every other argument:
+ * non-positive values, values that are not a power of two, and powers of two from 2^20 upwards. */
+int ilog2(int i)
+{
+  int exponent = 0;
+  int power    = 1; /* always equal to 2^exponent */
+
+  if (i <= 0)
+    return -1;
+
+  while (exponent < 20) {
+    if (power == i)
+      return exponent;
+    power += power;
+    exponent++;
+  }
+  return -1;
+}
+
+/* get_info(): extract the benchmark parameters from the command line.
+ * argv[1] is converted with atoi() and stored in *nprocsp; the first character of argv[2] is stored in *classp.
+ * When fewer than two parameters are given, a usage line is printed and the program exits with status 1. */
+void get_info(int argc, char *argv[], int *nprocsp, char *classp)
+{
+  if (argc >= 3) {
+    *nprocsp = atoi(argv[1]);
+    *classp  = argv[2][0];
+    return;
+  }
+
+  printf("Usage: %s (%d) nprocs class\n", argv[0], argc);
+  exit(1);
+}
+
+/* check_info(): validate the command-line settings for the benchmark identified by `type` (IS, EP or DT).
+ * Every violation prints a "setparams:" diagnostic on stdout and terminates the program with exit status 1,
+ * so the function only returns when nprocs and class are acceptable for that benchmark. */
+void check_info(int type, int nprocs, char class)
+{
+  /* check number of processors */
+  if (nprocs <= 0) {
+    printf("setparams: Number of processors must be greater than zero\n");
+    exit(1);
+  }
+
+  if (type == IS) {
+    /* ilog2() is negative unless nprocs is a power of two that it recognizes */
+    if (ilog2(nprocs) < 0) {
+      printf("setparams: Number of processors must be a power of two (1,2,4,...) for this benchmark\n");
+      exit(1);
+    }
+  } else if (type != EP && type != DT) {
+    /* never should have gotten this far with a bad name */
+    printf("setparams: (Internal Error) Benchmark type %d unknown to this program\n", type);
+    exit(1);
+  }
+
+  /* check class */
+  switch (class) {
+  case 'S':
+  case 'W':
+  case 'A':
+  case 'B':
+  case 'C':
+  case 'D':
+  case 'E':
+    break;
+  default:
+    printf("setparams: Unknown benchmark class %c\n", class);
+    printf("setparams: Allowed classes are \"S\", \"W\", and \"A\" through \"E\"\n");
+    exit(1);
+  }
+
+  /* class E only exists for EP */
+  if ((type == IS || type == DT) && class == 'E') {
+    printf("setparams: Benchmark class %c not defined for IS or DT\n", class);
+    exit(1);
+  }
+
+  if (type == IS && class == 'D' && nprocs < 4) {
+    printf("setparams: IS class D size cannot be run on less than 4 processors\n");
+    exit(1);
+  }
+}
+
+/* Reset the caller-owned timer value pointed to by onetimer to zero. */
+void time_clear(double *onetimer) {
+  onetimer[0] = 0.0;
+}
+
+/* Store the current MPI wall-clock time (MPI_Wtime) in the caller-owned timer value. */
+void time_start(double *onetimer) {
+  double now = MPI_Wtime();
+  *onetimer = now;
+}
+
+/* Write into elapsed[n] the time that passed since start[n], measured with MPI_Wtime.
+ * Both arrays are supplied by the caller; the previous content of elapsed[n] is overwritten, not accumulated. */
+void time_stop(int n,double *elapsed,double *start) {
+  double now = MPI_Wtime();
+  elapsed[n] = now - start[n];
+}
+
+/* Return entry n of the caller-supplied elapsed array (a plain accessor, kept so call sites stay uniform). */
+double time_read(int n, double *elapsed) {
+  return *(elapsed + n);
+}
+
+/* Zero the accumulated time of slot n in the file-scope elapsed[] array (64 slots, index not range-checked). */
+void timer_clear(int n)
+{
+  double *slot = &elapsed[n];
+  *slot = 0.0;
+}
+
+/* Record the current MPI wall-clock time as the start of slot n in the file-scope start[] array. */
+void timer_start(int n)
+{
+  double now = MPI_Wtime();
+  start[n] = now;
+}
+
+/* Add the time that passed since timer_start(n) to the running total of slot n. */
+void timer_stop(int n)
+{
+  double now = MPI_Wtime();
+  elapsed[n] += now - start[n];
+}
+
+/* Return the time accumulated so far in slot n of the file-scope elapsed[] array. */
+double timer_read(int n)
+{
+  const double *slot = &elapsed[n];
+  return *slot;
+}
+
+/* vranlc(): generate n pseudorandom numbers with the linear congruential recurrence
+ *
+ *     x_{k+1} = a * x_k  (mod 2^46)
+ *
+ * n: number of values to produce (nothing is written to y when n <= 0)
+ * x: seed x_0, an integral value passed as a double
+ * a: multiplier, an integral value passed as a double
+ * y: output array of at least n doubles; y[k] receives 2^-46 * x_{k+1}
+ *
+ * Returns the last state x_n as a double, or the integer part of x when n <= 0.
+ *
+ * The recurrence is evaluated on unsigned 64-bit integers: `long` is only 32 bits wide on some platforms, which
+ * cannot hold the 46-bit state, and the product can exceed 2^63, which is undefined for signed types. Unsigned
+ * arithmetic wraps modulo 2^64, and since 2^46 divides 2^64 the masked result is the exact product modulo 2^46. */
+double vranlc(int n, double x, double a, double *y)
+{
+  const unsigned long long i246m1 = 0x00003FFFFFFFFFFFULL; /* 2^46 - 1: mask reducing modulo 2^46 */
+  const double d2m46              = ldexp(1.0, -46);       /* 2^-46, exact */
+  /* Go through a signed type first so that the double -> integer conversion is defined for negative inputs */
+  long long LLx               = (long long)x;
+  const unsigned long long La = (unsigned long long)(long long)a;
+  unsigned long long Lx       = (unsigned long long)LLx;
+  int i;
+
+  for (i = 0; i < n; i++) {
+    Lx   = (Lx * La) & i246m1;
+    LLx  = (long long)Lx; /* Lx < 2^46 here, so this conversion is lossless */
+    y[i] = d2m46 * (double)LLx;
+  }
+
+  return (double)LLx;
+}
+
+/*
+ * FUNCTION RANDLC (X, A)
+ *
+ * This routine returns a uniform pseudorandom double precision number in the
+ * range (0, 1) by using the linear congruential generator
+ *
+ * x_{k+1} = a x_k (mod 2^46)
+ *
+ * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
+ * before repeating. The argument A is the same as 'a' in the above formula,
+ * and X is the same as x_0. A and X must be odd double precision integers
+ * in the range (1, 2^46). The returned value RANDLC is normalized to be
+ * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
+ * the new seed x_1, so that subsequent calls to RANDLC using the same
+ * arguments will generate a continuous sequence.
+ *
+ * This routine should produce the same results on any computer with at least
+ * 48 mantissa bits in double precision floating point data. On Cray systems,
+ * double precision should be disabled.
+ *
+ * David H. Bailey October 26, 1990
+ *
+ * IMPLICIT DOUBLE PRECISION (A-H, O-Z)
+ * SAVE KS, R23, R46, T23, T46
+ * DATA KS/0/
+ *
+ * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
+ * T23 = 2 ^ 23, and T46 = 2 ^ 46. These are computed in loops, rather than
+ * by merely using the ** operator, in order to insure that the results are
+ * exact on all systems. This code assumes that 0.5D0 is represented exactly.
+ */
+/* One step of x <- a * x (mod 2^46) carried out entirely in double precision.
+ * *A is the multiplier and is only read; *X holds the current seed and is overwritten with the new one.
+ * The return value is the new seed scaled by 2^-46. Both operands are split into 23-bit halves so that every
+ * intermediate product stays exactly representable in a double. */
+double randlc(double *X, double*A)
+{
+  static int constants_ready = 0;
+  static double r23, r46, t23, t46; /* 2^-23, 2^-46, 2^23, 2^46 */
+  double a_hi, a_lo;
+  double x_hi, x_lo;
+  double cross, z, full;
+  int k;
+
+  /* First call only: build the powers of two by repeated exact halving/doubling */
+  if (!constants_ready) {
+    r23 = r46 = t23 = t46 = 1.0;
+    for (k = 0; k < 46; k++) {
+      if (k < 23) {
+        r23 *= 0.50;
+        t23 *= 2.0;
+      }
+      r46 *= 0.50;
+      t46 *= 2.0;
+    }
+    constants_ready = 1;
+  }
+
+  /* Split the multiplier: A = 2^23 * a_hi + a_lo */
+  a_hi = (double)(int)(r23 * *A);
+  a_lo = *A - t23 * a_hi;
+
+  /* Split the seed: X = 2^23 * x_hi + x_lo */
+  x_hi = (double)(int)(r23 * *X);
+  x_lo = *X - t23 * x_hi;
+
+  /* z = a_hi * x_lo + a_lo * x_hi (mod 2^23) */
+  cross = a_hi * x_lo + a_lo * x_hi;
+  z     = cross - t23 * (double)(int)(r23 * cross);
+
+  /* X = 2^23 * z + a_lo * x_lo (mod 2^46) */
+  full = t23 * z + a_lo * x_lo;
+  *X   = full - t46 * (double)(int)(r46 * full);
+
+  return r46 * *X;
+}
+
+/* Print the closing benchmark report on stdout.
+ * name/class identify the run. When n3 is zero the problem size is printed as a single number, n1 multiplied
+ * by n2 if n2 is non-zero; otherwise it is printed as the three dimensions n1 x n2 x n3.
+ * nprocs_compiled is only reported when non-zero. mops is printed as given and also divided by nprocs_total.
+ * A non-zero passed_verification is reported as a successful verification. */
+void c_print_results(const char *name, char class, int n1, int n2, int n3, int niter, int nprocs_compiled,
+                     int nprocs_total, double t, double mops, const char *optype, int passed_verification)
+{
+  double mops_per_process = mops/((float) nprocs_total);
+
+  printf( "\n\n %s Benchmark Completed\n", name );
+  printf( " Class = %c\n", class );
+
+  if( n3 != 0 ) {
+    printf( " Size = %3dx %3dx %3d\n", n1,n2,n3 );
+  } else {
+    /* as in IS: a one-dimensional size, computed in long */
+    long total = n1;
+    if( n2 != 0 )
+      total *= n2;
+    printf( " Size = %12ld\n", total );
+  }
+
+  printf( " Iterations = %12d\n", niter );
+  printf( " Time in seconds = %12.2f\n", t );
+  printf( " Total processes = %12d\n", nprocs_total );
+
+  if( nprocs_compiled != 0 )
+    printf( " Compiled procs = %12d\n", nprocs_compiled );
+
+  printf( " Mop/s total = %12.2f\n", mops );
+  printf( " Mop/s/process = %12.2f\n", mops_per_process );
+  printf( " Operation type = %24s\n", optype);
+
+  if( !passed_verification )
+    printf( " Verification = UNSUCCESSFUL\n" );
+  else
+    printf( " Verification = SUCCESSFUL\n" );
+}
--- /dev/null
+/* Copyright (c) 2016. The SimGrid Team.
+ * All rights reserved. */
+
+/* This program is free software; you can redistribute it and/or modify it
+ * under the terms of the license (GNU LGPL) which comes with this package. */
+#ifndef NAS_COMMON_H
+#define NAS_COMMON_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "mpi.h"
+
+/* Benchmark identifiers, used as the `type` argument of check_info() */
+enum benchmark_types {IS, DT, EP};
+
+/* Base-2 logarithm of an exact power of two below 2^20; returns -1 for any other value */
+int ilog2(int i);
+
+/* Timers kept in arrays private to nas_common.c (64 slots) and addressed by slot number n.
+ * timer_stop() adds the time since timer_start() to the slot's total; timer_read() returns that total. */
+void timer_clear(int n);
+void timer_start(int n);
+void timer_stop(int n);
+double timer_read(int n);
+
+/* Timers whose storage is owned by the caller.
+ * time_stop() overwrites elapsed[n] with the time since start[n]; time_read() returns elapsed[n]. */
+void time_clear(double *onetimer);
+void time_start(double *onetimer);
+void time_stop(int n,double *elapsed,double *start);
+double time_read(int n, double *elapsed);
+
+/* Linear congruential generators x <- a*x (mod 2^46).
+ * vranlc() fills y[0..n-1] with values scaled by 2^-46 and returns the final state.
+ * randlc() performs one step, updates *X in place and returns the new state scaled by 2^-46. */
+double vranlc(int n, double x, double a, double *y);
+double randlc(double *X, double*A);
+
+/* Print the final benchmark report (class, size, timing, Mop/s, verification status) on stdout */
+void c_print_results(const char *name, char class, int n1, int n2, int n3, int niter, int nprocs_compiled,
+ int nprocs_total, double t, double mops, const char *optype, int passed_verification);
+
+/* get_info() reads nprocs (argv[1]) and the class letter (argv[2]) from the command line;
+ * check_info() validates them for the given benchmark type. Both exit(1) after printing a message on error. */
+void get_info(int argc, char *argv[], int *nprocsp, char *classp);
+void check_info(int type, int nprocs, char class);
+
+#endif
+++ /dev/null
-include ../config/make.def
-all: setparams
-
-# setparams creates an npbparam.h file for each benchmark configuration.
-# npbparams.h also contains info about how a benchmark was compiled and linked
-
-setparams: setparams.c ../config/make.def
- $(CC) ${CONVERTFLAG} -o setparams setparams.c
-
-clean:
- -rm -f setparams setparams.h npbparams.h *~ *.o
-
+++ /dev/null
-This directory contains utilities and files used by the
-build process. You should not need to change anything
-in this directory.
-
-Original Files
---------------
-setparams.c:
- Source for the setparams program. This program is used internally
- in the build process to create the file "npbparams.h" for each
- benchmark. npbparams.h contains Fortran or C parameters to build a
- benchmark for a specific class and number of nodes. The setparams
- program is never run directly by a user. Its invocation syntax is
- "setparams benchmark-name nprocs class".
- It examines the file "npbparams.h" in the current directory. If
- the specified parameters are the same as those in the npbparams.h
- file, nothing it changed. If the file does not exist or corresponds
- to a different class/number of nodes, it is (re)built.
- One of the more complicated things in npbparams.h is that it
- contains, in a Fortran string, the compiler flags used to build a
- benchmark, so that a benchmark can print out how it was compiled.
-
-make.common
- A makefile segment that is included in each individual benchmark
- program makefile. It sets up some standard macros (COMPILE, etc)
- and makes sure everything is configured correctly (npbparams.h)
-
-Makefile
- Builds setparams
-
-README
- This file.
-
-Created files
--------------
-
-setparams
- See descriptions above
-
+++ /dev/null
-PROGRAM = $(BINDIR)/$(BENCHMARK).$(CLASS).$(NPROCS)
-CCOMPILE = $(MPICC) -c $(CMPI_INC) $(CFLAGS)
-
-# Class "U" is used internally by the setparams program to mean
-# "unknown". This means that if you don't specify CLASS=
-# on the command line, you'll get an error. It would be nice
-# to be able to avoid this, but we'd have to get information
-# from the setparams back to the make program, which isn't easy.
-CLASS=U
-NPROCS=1
-
-default:: ${PROGRAM}
-
-# This makes sure the configuration utility setparams
-# is up to date.
-# Note that this must be run every time, which is why the
-# target does not exist and is not created.
-# If you create a file called "config" you will break things.
-config:
- @cd ../sys; ${MAKE} all
- ../sys/setparams ${BENCHMARK} ${NPROCS} ${CLASS} ${SUBTYPE}
-
-COMMON=../common
-$${COMMON}/c_randdp.o: ${COMMON}/randdp.c
- cd ${COMMON}; ${CCOMPILE} -o c_randdp.o randdp.c
-
-${COMMON}/c_print_results.o: ${COMMON}/c_print_results.c
- cd ${COMMON}; ${CCOMPILE} c_print_results.c
-
-${COMMON}/c_timers.o: ${COMMON}/c_timers.c
- cd ${COMMON}; ${CCOMPILE} c_timers.c
-
-# Normally setparams updates npbparams.h only if the settings (CLASS/NPROCS)
-# have changed. However, we also want to update if the compile options
-# may have changed (set in ../config/make.def).
-npbparams.h: ../config/make.def
- @ echo make.def modified. Rebuilding npbparams.h just in case
- rm -f npbparams.h
- ../sys/setparams ${BENCHMARK} ${NPROCS} ${CLASS} ${SUBTYPE}
-
-# So that "make benchmark-name" works
-${BENCHMARK}: default
-${BENCHMARKU}: default
-
-
+++ /dev/null
-echo ''
-echo ' To make a NAS benchmark type '
-echo ''
-echo ' make <benchmark-name> NPROCS=<number> CLASS=<class>'
-echo ''
-echo ' where <benchmark-name> is "ep", "dt", or "is"
-echo ' <number> is the number of processors'
-echo ' <class> is "S", "W", "A", "B", "C", or "D"'
-echo ''
-
+++ /dev/null
-/*
- * This utility configures a NPB to be built for a specific number
- * of nodes and a specific class. It creates a file "npbparams.h"
- * in the source directory. This file keeps state information about
- * which size of benchmark is currently being built (so that nothing
- * if unnecessarily rebuilt) and defines (through PARAMETER statements)
- * the number of nodes and class for which a benchmark is being built.
-
- * The utility takes 3 arguments:
- * setparams benchmark-name nprocs class
- * benchmark-name is "ep", "dt", or "is"
- * nprocs is the number of processors to run on
- * class is the size of the benchmark
- * These parameters are checked for the current benchmark. If they
- * are invalid, this program prints a message and aborts.
- * If the parameters are ok, the current npbsize.h (actually just
- * the first line) is read in. If the new parameters are the same as
- * the old, nothing is done, but an exit code is returned to force the
- * user to specify (otherwise the make procedure succeeds but builds a
- * binary of the wrong name). Otherwise the file is rewritten.
- * Errors write a message (to stdout) and abort.
- *
- * This program makes use of two extra benchmark "classes"
- * class "X" means an invalid specification. It is returned if
- * there is an error parsing the config file.
- * class "U" is an external specification meaning "unknown class"
- *
- * Unfortunately everything has to be case sensitive. This is
- * because we can always convert lower to upper or v.v. but
- * can't feed this information back to the makefile, so typing
- * make CLASS=a and make CLASS=A will produce different binaries.
- *
- *
- */
-
-#include <sys/types.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <ctype.h>
-#include <string.h>
-#include <time.h>
-
-/*
- * This is the master version number for this set of NPB benchmarks. It is in an obscure place so people
- * won't accidentally change it.
- */
-
-#define VERSION "3.3"
-
-/* controls verbose output from setparams */
-/* #define VERBOSE */
-
-#define FILENAME "npbparams.h"
-#define DESC_LINE "c NPROCS = %d CLASS = %c\n"
-#define DEF_CLASS_LINE "#define CLASS '%c'\n"
-#define DEF_NUM_PROCS_LINE "#define NUM_PROCS %d\n"
-#define FINDENT " "
-#define CONTINUE " > "
-
-void get_info(int argc, char *argv[], int *typep, int *nprocsp, char *classp, int* subtypep);
-void check_info(int type, int nprocs, char class);
-void read_info(int type, int *nprocsp, char *classp, int *subtypep);
-void write_info(int type, int nprocs, char class, int subtype);
-void write_ep_info_C(FILE *fp, int nprocs, char class); /* after C translation */
-void write_is_info(FILE *fp, int nprocs, char class);
-void write_dt_info(FILE *fp, int nprocs, char class);
-void write_compiler_info(int type, FILE *fp);
-void check_line(char *line, char *label, char *val);
-int check_include_line(char *line, char *filename);
-void put_string(FILE *fp, char *name, char *val);
-void put_def_string(FILE *fp, char *name, char *val);
-void put_def_variable(FILE *fp, char *name, char *val);
-int isqrt(int i);
-int ilog2(int i);
-int ipow2(int i);
-
-enum benchmark_types {IS, DT, EP};
-
-int main(int argc, char *argv[])
-{
- int nprocs, nprocs_old, type;
- char class, class_old;
- int subtype = -1, old_subtype = -1;
-
- /* Get command line arguments. Make sure they're ok. */
- get_info(argc, argv, &type, &nprocs, &class, &subtype);
- if (class != 'U') {
-#ifdef VERBOSE
- printf("setparams: For benchmark %s: number of processors = %d class = %c\n",
- argv[1], nprocs, class);
-#endif
- check_info(type, nprocs, class);
- }
-
- /* Get old information. */
- read_info(type, &nprocs_old, &class_old, &old_subtype);
- if (class != 'U') {
- if (class_old != 'X') {
-#ifdef VERBOSE
- printf("setparams: old settings: number of processors = %d class = %c\n",
- nprocs_old, class_old);
-#endif
- }
- } else {
- printf("setparams:\n\
- *********************************************************************\n\
- * You must specify NPROCS and CLASS to build this benchmark *\n\
- * For example, to build a class A benchmark for 4 processors, type *\n\
- * make {benchmark-name} NPROCS=4 CLASS=A *\n\
- *********************************************************************\n\n");
-
- if (class_old != 'X') {
-#ifdef VERBOSE
- printf("setparams: Previous settings were CLASS=%c NPROCS=%d\n",
- class_old, nprocs_old);
-#endif
- }
- exit(1); /* exit on class==U */
- }
-
- /* Write out new information if it's different. */
- if (nprocs != nprocs_old || class != class_old || subtype != old_subtype) {
-#ifdef VERBOSE
- printf("setparams: Writing %s\n", FILENAME);
-#endif
- write_info(type, nprocs, class, subtype);
- } else {
-#ifdef VERBOSE
- printf("setparams: Settings unchanged. %s unmodified\n", FILENAME);
-#endif
- }
-
- return 0;
-}
-
-/* get_info(): Get parameters from command line */
-void get_info(int argc, char *argv[], int *typep, int *nprocsp, char *classp, int *subtypep)
-{
- if (argc < 4) {
- printf("Usage: %s (%d) benchmark-name nprocs class\n", argv[0], argc);
- exit(1);
- }
-
- *nprocsp = atoi(argv[2]);
- *classp = *argv[3];
-
- if (!strcmp(argv[1], "is") || !strcmp(argv[1], "IS")) *typep = IS;
- else if (!strcmp(argv[1], "dt") || !strcmp(argv[1], "DT")) *typep = DT;
- else if (!strcmp(argv[1], "ep") || !strcmp(argv[1], "EP")) *typep = EP;
- else {
- printf("setparams: Error: unknown benchmark type %s\n", argv[1]);
- exit(1);
- }
-}
-
-/*
- * check_info(): Make sure command line data is ok for this benchmark
- */
-
-void check_info(int type, int nprocs, char class)
-{
- int rootprocs, logprocs;
-
- /* check number of processors */
- if (nprocs <= 0) {
- printf("setparams: Number of processors must be greater than zero\n");
- exit(1);
- }
- switch(type) {
- case IS:
- logprocs = ilog2(nprocs);
- if (logprocs < 0) {
- printf("setparams: Number of processors must be a power of two (1,2,4,...) for this benchmark\n");
- exit(1);
- }
- break;
-
- case EP:
- case DT:
- break;
-
- default:
- /* never should have gotten this far with a bad name */
- printf("setparams: (Internal Error) Benchmark type %d unknown to this program\n", type);
- exit(1);
- }
-
- /* check class */
- if (class != 'S' &&
- class != 'W' &&
- class != 'A' &&
- class != 'B' &&
- class != 'C' &&
- class != 'D' &&
- class != 'E') {
- printf("setparams: Unknown benchmark class %c\n", class);
- printf("setparams: Allowed classes are \"S\", \"W\", and \"A\" through \"E\"\n");
- exit(1);
- }
-
- if (class == 'E' && (type == IS || type == DT)) {
- printf("setparams: Benchmark class %c not defined for IS or DT\n", class);
- exit(1);
- }
-
- if (class == 'D' && type == IS && nprocs < 4) {
- printf("setparams: IS class D size cannot be run on less than 4 processors\n");
- exit(1);
- }
-}
-
-/*
- * read_info(): Read previous information from file.
- * Not an error if file doesn't exist, because this may be the first time we're running.
- * Assumes the first two lines of the file is in a special format that we understand (since we wrote it).
- */
-
-void read_info(int type, int *nprocsp, char *classp, int *subtypep)
-{
- int nread = 0;
- FILE *fp;
- fp = fopen(FILENAME, "r");
- if (fp == NULL) {
-#ifdef VERBOSE
- printf("setparams: INFO: configuration file %s does not exist (yet)\n", FILENAME);
-#endif
- goto abort;
- }
-
- /* first two lines of file contains info */
- nread = fscanf(fp, DEF_CLASS_LINE, classp);
- nread += fscanf(fp, DEF_NUM_PROCS_LINE, nprocsp);
- if (nread != 2) {
- printf("setparams: Error line %d parsing config file %s. Ignoring previous settings\n", __LINE__,FILENAME);
- goto abort;
- }
-
- fclose(fp);
- return;
-
- abort:
- *nprocsp = -1;
- *classp = 'X';
- *subtypep = -1;
- return;
-}
-
-/*
- * write_info(): Write new information to config file.
- * First line is in a special format so we can read
- * it in again. Then comes a warning. The rest is all
- * specific to a particular benchmark.
- */
-
-void write_info(int type, int nprocs, char class, int subtype)
-{
- FILE *fp;
- char *BT_TYPES[] = {"NONE", "FULL", "SIMPLE", "EPIO", "FORTRAN"};
-
- fp = fopen(FILENAME, "w");
- if (fp == NULL) {
- printf("setparams: Can't open file %s for writing\n", FILENAME);
- exit(1);
- }
-
- fprintf(fp, DEF_CLASS_LINE, class);
- fprintf(fp, DEF_NUM_PROCS_LINE, nprocs);
- fprintf(fp, "\
-/*\n\
- This file is generated automatically by the setparams utility.\n\
- It sets the number of processors and the class of the NPB\n\
- in this directory. Do not modify it by hand. */\n\
- \n");
-
- /* Now do benchmark-specific stuff */
- switch(type) {
- case IS:
- write_is_info(fp, nprocs, class);
- break;
- case DT:
- write_dt_info(fp, nprocs, class);
- break;
- case EP:
- write_ep_info_C(fp, nprocs, class);
- break;
- default:
- printf("setparams: (Internal error): Unknown benchmark type %d\n", type);
- exit(1);
- }
- write_compiler_info(type, fp);
- fclose(fp);
- return;
-}
-
-/* write_dt_info(): Write DT specific info to config file */
-
-void write_dt_info(FILE *fp, int nprocs, char class)
-{
- int num_samples,deviation,num_sources;
- if (class == 'S') { num_samples=1728; deviation=128; num_sources=4; }
- else if (class == 'W') { num_samples=1728*8; deviation=128*2; num_sources=4*2; }
- else if (class == 'A') { num_samples=1728*64; deviation=128*4; num_sources=4*4; }
- else if (class == 'B') { num_samples=1728*512; deviation=128*8; num_sources=4*8; }
- else if (class == 'C') { num_samples=1728*4096; deviation=128*16; num_sources=4*16; }
- else if (class == 'D') { num_samples=1728*4096*8; deviation=128*32; num_sources=4*32; }
- else {
- printf("setparams: Internal error: invalid class type %c\n", class);
- exit(1);
- }
- fprintf(fp, "#define NUM_SAMPLES %d\n", num_samples);
- fprintf(fp, "#define STD_DEVIATION %d\n", deviation);
- fprintf(fp, "#define NUM_SOURCES %d\n", num_sources);
-}
-
-/* write_is_info(): Write IS specific info to config file */
-void write_is_info(FILE *fp, int nprocs, char class)
-{
- if( class != 'S' && class != 'W' && class != 'A' && class != 'B' && class != 'C' && class != 'D' )
- {
- printf("setparams: Internal error: invalid class type %c\n", class);
- exit(1);
- }
-}
-
-/* write_ep_info_C(): Write EP specific info to config file */
-void write_ep_info_C(FILE *fp, int nprocs, char class)
-{
- /* easiest way (given the way the benchmark is written) is to specify log of number of grid points in each
- * direction m1, m2, m3. nt is the number of iterations
- */
- int m;
- if (class == 'S') { m = 24; }
- else if (class == 'W') { m = 25; }
- else if (class == 'A') { m = 28; }
- else if (class == 'B') { m = 30; }
- else if (class == 'C') { m = 32; }
- else if (class == 'D') { m = 36; }
- else if (class == 'E') { m = 40; }
- else {
- printf("setparams: Internal error: invalid class type %c\n", class);
- exit(1);
- }
-
- /* number of processors given by "npm" */
- fprintf(fp, "%schar *_class=\"%c\";\n",FINDENT,class);
- fprintf(fp, "%sint m=%d;\n", FINDENT,m);
- fprintf(fp, "%sint npm=%d;\n", FINDENT,nprocs);
-}
-
-/*
- * This is a gross hack to allow the benchmarks to print out how they were compiled. Various other ways
- * of doing this have been tried and they all fail on some machine - due to a broken "make" program, or
- * F77 limitations, of whatever. Hopefully this will always work because it uses very portable C. Unfortunately
- * it relies on parsing the make.def file - YUK.
- * If your machine doesn't have <string.h> or <ctype.h>, happy hacking!
- */
-
-#define VERBOSE
-#define LL 400
-#include <stdio.h>
-#define DEFFILE "../config/make.def"
-#define DEFAULT_MESSAGE "(none)"
-FILE *deffile;
-void write_compiler_info(int type, FILE *fp)
-{
- char line[LL];
- char compiletime[LL], randfile[LL];
- char mpicc[LL], cflags[LL], clink[LL], clinkflags[LL],
- cmpi_lib[LL], cmpi_inc[LL];
- struct tm *tmp;
- time_t t;
- deffile = fopen(DEFFILE, "r");
- if (deffile == NULL) {
- printf("\n\
-setparams: File %s doesn't exist. To build the NAS benchmarks\n\
- you need to create is according to the instructions\n\
- in the README in the main directory and comments in \n\
- the file config/make.def.template\n", DEFFILE);
- exit(1);
- }
- strcpy(randfile, DEFAULT_MESSAGE);
- strcpy(mpicc, DEFAULT_MESSAGE);
- strcpy(cflags, DEFAULT_MESSAGE);
- strcpy(clink, DEFAULT_MESSAGE);
- strcpy(clinkflags, DEFAULT_MESSAGE);
- strcpy(cmpi_lib, DEFAULT_MESSAGE);
- strcpy(cmpi_inc, DEFAULT_MESSAGE);
-
- while (fgets(line, LL, deffile) != NULL) {
- if (*line == '#') continue;
- /* yes, this is inefficient. but it's simple! */
- check_line(line, "RAND", randfile);
- check_line(line, "MPICC", mpicc);
- check_line(line, "CFLAGS", cflags);
- check_line(line, "CLINK", clink);
- check_line(line, "CLINKFLAGS", clinkflags);
- check_line(line, "CMPI_LIB", cmpi_lib);
- check_line(line, "CMPI_INC", cmpi_inc);
- }
-
- (void) time(&t);
- tmp = localtime(&t);
- (void) strftime(compiletime, (size_t)LL, "%d %b %Y", tmp);
-
- put_def_string(fp, "COMPILETIME", compiletime);
- put_def_string(fp, "NPBVERSION", VERSION);
- put_def_string(fp, "MPICC", mpicc);
- put_def_string(fp, "CFLAGS", cflags);
- put_def_string(fp, "CLINK", clink);
- put_def_string(fp, "CLINKFLAGS", clinkflags);
- put_def_string(fp, "CMPI_LIB", cmpi_lib);
- put_def_string(fp, "CMPI_INC", cmpi_inc);
-}
-
-void check_line(char *line, char *label, char *val)
-{
- char *original_line;
- int n;
- original_line = line;
- /* compare beginning of line and label */
- while (*label != '\0' && *line == *label) {
- line++; label++;
- }
- /* if *label is not EOS, we must have had a mismatch */
- if (*label != '\0') return;
- /* if *line is not a space, actual label is longer than test label */
- if (!isspace(*line) && *line != '=') return ;
- /* skip over white space */
- while (isspace(*line)) line++;
- /* next char should be '=' */
- if (*line != '=') return;
- /* skip over white space */
- while (isspace(*++line));
- /* if EOS, nothing was specified */
- if (*line == '\0') return;
- /* finally we've come to the value */
- strcpy(val, line);
- /* chop off the newline at the end */
- n = strlen(val)-1;
- if (n >= 0 && val[n] == '\n')
- val[n--] = '\0';
- if (n >= 0 && val[n] == '\r')
- val[n--] = '\0';
- /* treat continuation */
- while (val[n] == '\\' && fgets(original_line, LL, deffile)) {
- line = original_line;
- while (isspace(*line)) line++;
- if (isspace(*original_line)) val[n++] = ' ';
- while (*line && *line != '\n' && *line != '\r' && n < LL-1)
- val[n++] = *line++;
- val[n] = '\0';
- n--;
- }
-/* if (val[strlen(val) - 1] == '\\') {
- printf("\n\
-setparams: Error in file make.def. Because of the way in which\n\
- command line arguments are incorporated into the\n\
- executable benchmark, you can't have any continued\n\
- lines in the file make.def, that is, lines ending\n\
- with the character \"\\\". Although it may be ugly, \n\
- you should be able to reformat without continuation\n\
- lines. The offending line is\n\
- %s\n", original_line);
- exit(1);
- } */
-}
-
-int check_include_line(char *line, char *filename)
-{
- char *include_string = "include";
- /* compare beginning of line and "include" */
- while (*include_string != '\0' && *line == *include_string) {
- line++; include_string++;
- }
- /* if *include_string is not EOS, we must have had a mismatch */
- if (*include_string != '\0') return(0);
- /* if *line is not a space, first word is not "include" */
- if (!isspace(*line)) return(0);
- /* skip over white space */
- while (isspace(*++line));
- /* if EOS, nothing was specified */
- if (*line == '\0') return(0);
- /* next keyword should be name of include file in *filename */
- while (*filename != '\0' && *line == *filename) {
- line++; filename++;
- }
- if (*filename != '\0' ||
- (*line != ' ' && *line != '\0' && *line !='\n')) return(0);
- else return(1);
-}
-
-#define MAXL 46
-void put_string(FILE *fp, char *name, char *val)
-{
- int len;
- len = strlen(val);
- if (len > MAXL) {
- val[MAXL] = '\0';
- val[MAXL-1] = '.';
- val[MAXL-2] = '.';
- val[MAXL-3] = '.';
- len = MAXL;
- }
- fprintf(fp, "%scharacter*%d %s\n", FINDENT, len, name);
- fprintf(fp, "%sparameter (%s=\'%s\')\n", FINDENT, name, val);
-}
-
-/* NOTE: is the ... stuff necessary in C? */
-void put_def_string(FILE *fp, char *name, char *val)
-{
- int len;
- len = strlen(val);
- if (len > MAXL) {
- val[MAXL] = '\0';
- val[MAXL-1] = '.';
- val[MAXL-2] = '.';
- val[MAXL-3] = '.';
- len = MAXL;
- }
- fprintf(fp, "#define %s \"%s\"\n", name, val);
-}
-
-void put_def_variable(FILE *fp, char *name, char *val)
-{
- int len;
- len = strlen(val);
- if (len > MAXL) {
- val[MAXL] = '\0';
- val[MAXL-1] = '.';
- val[MAXL-2] = '.';
- val[MAXL-3] = '.';
- len = MAXL;
- }
- fprintf(fp, "#define %s %s\n", name, val);
-}
-
-#if 0
-/* this version allows arbitrarily long lines but some compilers don't like that and they're rarely useful */
-
-#define LINELEN 65
-void put_string(FILE *fp, char *name, char *val)
-{
- int len, nlines, pos, i;
- char line[100];
- len = strlen(val);
- nlines = len/LINELEN;
- if (nlines*LINELEN < len) nlines++;
- fprintf(fp, "%scharacter*%d %s\n", FINDENT, nlines*LINELEN, name);
- fprintf(fp, "%sparameter (%s = \n", FINDENT, name);
- for (i = 0; i < nlines; i++) {
- pos = i*LINELEN;
- if (i == 0) fprintf(fp, "%s\'", CONTINUE);
- else fprintf(fp, "%s", CONTINUE);
- /* number should be same as LINELEN */
- fprintf(fp, "%.65s", val+pos);
- if (i == nlines-1) fprintf(fp, "\')\n");
- else fprintf(fp, "\n");
- }
-}
-#endif
-
-
-/* integer square root. Return error if argument isn't a perfect square or is less than or equal to zero */
-int isqrt(int i)
-{
- int root, square;
- if (i <= 0) return(-1);
- square = 0;
- for (root = 1; square <= i; root++) {
- square = root*root;
- if (square == i) return(root);
- }
- return(-1);
-}
-
-/* integer log base two. Return error is argument isn't a power of two or is less than or equal to zero */
-int ilog2(int i)
-{
- int log2;
- int exp2 = 1;
- if (i <= 0) return(-1);
-
- for (log2 = 0; log2 < 20; log2++) {
- if (exp2 == i) return(log2);
- exp2 *= 2;
- }
- return(-1);
-}
-
-int ipow2(int i)
-{
- int pow2 = 1;
- if (i < 0) return(-1);
- if (i == 0) return(1);
- while(i--) pow2 *= 2;
- return(pow2);
-}
#include <string>
#include <unordered_map>
+#include <memory>
#include <vector>
#include <xbt/base.h>
+#include "src/xbt/memory_map.hpp"
#include "src/mc/mc_forward.h"
#include "src/mc/Type.hpp"
#include "src/mc/Frame.hpp"
const char* name, const char* scope);
};
-
+XBT_PRIVATE std::shared_ptr<ObjectInformation> createObjectInformation(
+ std::vector<simgrid::xbt::VmMap> const& maps, const char* name);
+XBT_PRIVATE void postProcessObjectInformation(
+ simgrid::mc::Process* process, simgrid::mc::ObjectInformation* info);
}
}
#include <xbt/base.h>
#include <xbt/mmalloc.h>
-#include "src/mc/mc_object_info.h"
#include "src/mc/mc_unw.h"
#include "src/mc/mc_snapshot.h"
#include "src/mc/mc_ignore.h"
static char* get_lib_name(const char* pathname, struct s_mc_memory_map_re* res)
{
- const char* map_basename = xbt_basename((char*) pathname);
+ char* map_basename = xbt_basename(pathname);
regmatch_t match;
- if(regexec(&res->so_re, map_basename, 1, &match, 0))
+ if(regexec(&res->so_re, map_basename, 1, &match, 0)) {
+ free(map_basename);
return nullptr;
+ }
char* libname = strndup(map_basename, match.rm_so);
+ free(map_basename);
+ map_basename = nullptr;
// Strip the version suffix:
if(libname && !regexec(&res->version_re, libname, 1, &match, 0)) {
const char* current_name = nullptr;
- this->object_infos.resize(0);
+ this->object_infos.clear();
for (size_t i=0; i < maps.size(); i++) {
simgrid::xbt::VmMap const& reg = maps[i];
}
std::shared_ptr<simgrid::mc::ObjectInformation> info =
- MC_find_object_info(this->memory_map_, pathname);
+ simgrid::mc::createObjectInformation(this->memory_map_, pathname);
this->object_infos.push_back(info);
if (is_executable)
this->binary_info = info;
// Resolve time (including accross differents objects):
for (auto const& object_info : this->object_infos)
- MC_post_process_object_info(this, object_info.get());
+ postProcessObjectInformation(this, object_info.get());
xbt_assert(this->maestro_stack_start_, "Did not find maestro_stack_start");
xbt_assert(this->maestro_stack_end_, "Did not find maestro_stack_end");
#include "src/mc/mc_private.h"
#include "xbt/module.h"
#include <xbt/mmalloc.h>
+#include <xbt/memory.hpp>
#include "src/smpi/private.h"
#include "src/xbt/mmalloc/mmprivate.h"
std::vector<simgrid::xbt::VmMap> const& maps,
simgrid::mc::ObjectInformation* result)
{
- char* file_name = xbt_strdup(result->file_name.c_str());
- const char *name = xbt_basename(file_name);
+ char* name = xbt_basename(result->file_name.c_str());
+
for (size_t i = 0; i < maps.size(); ++i) {
simgrid::xbt::VmMap const& reg = maps[i];
- if (maps[i].pathname.empty()
- || strcmp(xbt_basename(maps[i].pathname.c_str()), name)) {
- // Nothing to do
- } else if ((reg.prot & PROT_WRITE)) {
+ if (maps[i].pathname.empty())
+ continue;
+ char* map_basename = xbt_basename(maps[i].pathname.c_str());
+ if (strcmp(name, map_basename) != 0) {
+ free(map_basename);
+ continue;
+ }
+ free(map_basename);
+ if ((reg.prot & PROT_WRITE)) {
xbt_assert(!result->start_rw,
"Multiple read-write segments for %s, not supported",
maps[i].pathname.c_str());
xbt_assert(result->start_rw);
xbt_assert(result->start_exec);
- free(file_name);
+
+ free(name);
}
/************************************* Take Snapshot ************************************/
continue;
// If dot_output enabled, do not handle the corresponding file
- if (dot_output != nullptr && strcmp(xbt_basename(link), _sg_mc_dot_output_file) == 0)
- continue;
+ if (dot_output != nullptr) {
+ char* link_basename = xbt_basename(link);
+ if (strcmp(link_basename, _sg_mc_dot_output_file) == 0) {
+ free(link_basename);
+ continue;
+ }
+ free(link_basename);
+ }
// This is probably a shared memory used by lttng-ust:
if(strncmp("/dev/shm/ust-shm-tmp-", link, 21)==0)
#include "src/mc/mc_private.h"
#include "src/mc/mc_dwarf.hpp"
-#include "src/mc/mc_object_info.h"
#include "src/mc/Process.hpp"
#include "src/mc/ObjectInformation.hpp"
#include "src/mc/Variable.hpp"
}
/** Base directories for external debug files */
+static
const char* debug_paths[] = {
"/usr/lib/debug/",
"/usr/local/lib/debug/",
}
}
+namespace simgrid {
+namespace mc {
+
/** \brief Finds informations about a given shared object/executable */
-std::shared_ptr<simgrid::mc::ObjectInformation> MC_find_object_info(
+std::shared_ptr<simgrid::mc::ObjectInformation> createObjectInformation(
std::vector<simgrid::xbt::VmMap> const& maps, const char *name)
{
std::shared_ptr<simgrid::mc::ObjectInformation> result =
/*************************************************************************/
-void MC_post_process_object_info(simgrid::mc::Process* process, simgrid::mc::ObjectInformation* info)
+void postProcessObjectInformation(simgrid::mc::Process* process, simgrid::mc::ObjectInformation* info)
{
for (auto& i : info->types) {
}
}
+}
+}
+
namespace simgrid {
namespace dwarf {
+++ /dev/null
-/* Copyright (c) 2007-2015. The SimGrid Team.
- * All rights reserved. */
-
-/* This program is free software; you can redistribute it and/or modify it
- * under the terms of the license (GNU LGPL) which comes with this package. */
-
-#ifndef SIMGRID_MC_OBJECT_INFO_H
-#define SIMGRID_MC_OBJECT_INFO_H
-
-#include <vector>
-#include <memory>
-
-#include <xbt/base.h>
-
-#include "src/mc/mc_forward.hpp"
-#include "src/xbt/memory_map.hpp"
-
-XBT_PRIVATE std::shared_ptr<simgrid::mc::ObjectInformation> MC_find_object_info(
- std::vector<simgrid::xbt::VmMap> const& maps, const char* name);
-XBT_PRIVATE void MC_post_process_object_info(simgrid::mc::Process* process, simgrid::mc::ObjectInformation* info);
-
-#endif
}
if (!acyclic_graph_detail(result)){
- XBT_ERROR("The DAX described in %s is not a DAG. It contains a cycle.", xbt_basename(filename));
+ char* base = xbt_basename(filename);
+ XBT_ERROR("The DAX described in %s is not a DAG. It contains a cycle.", base);
+ free(base);
xbt_dynar_foreach(result, cpt, file)
SD_task_destroy(file);
xbt_dynar_free_container(&result);
}
if (result && !acyclic_graph_detail(result)) {
- XBT_ERROR("The DOT described in %s is not a DAG. It contains a cycle.", xbt_basename((char*)filename));
+ char* base = xbt_basename(filename);
+ XBT_ERROR("The DOT described in %s is not a DAG. It contains a cycle.", base);
+ free(base);
xbt_dynar_free(&result);
result = NULL;
}
#include <mc/mc.h>
#include "mc/datatypes.h"
-#include "src/mc/mc_object_info.h"
#include "src/mc/mc_private.h"
#include "src/mc/Process.hpp"
#include <stdlib.h>
#include "src/mc/mc_private.h"
-#include "src/mc/mc_object_info.h"
#include "src/mc/Process.hpp"
#include "src/mc/Type.hpp"
src/mc/mc_unw.h
src/mc/mc_unw.cpp
src/mc/mc_unw_vmread.cpp
- src/mc/mc_object_info.h
src/mc/mc_checkpoint.cpp
src/mc/mc_snapshot.h
src/mc/mc_snapshot.cpp
examples/simdag/scheduling/CMakeLists.txt
examples/smpi/CMakeLists.txt
+ examples/smpi/NAS/CMakeLists.txt
examples/smpi/smpi_msg_masterslave/CMakeLists.txt
examples/smpi/replay_multiple/CMakeLists.txt
examples/smpi/energy/CMakeLists.txt
endif()
if(VALGRIND_EXE)
- execute_process(COMMAND "${VALGRIND_EXE} --version " OUTPUT_VARIABLE "VALGRIND_VERSION")
+ execute_process(COMMAND ${VALGRIND_EXE} --version OUTPUT_VARIABLE "VALGRIND_VERSION")
string(REGEX MATCH "[0-9]+.[0-9]+.[0-9]+" NEW_VALGRIND_VERSION "${VALGRIND_VERSION}")
if(NEW_VALGRIND_VERSION)
message(STATUS "Valgrind version: ${NEW_VALGRIND_VERSION}")
+ contrib/.*
-+ examples/smpi/NAS/.*
-
+ src/simix/README_attempt_without_stack
+ src/simix/simix_network\.tla