+++ /dev/null
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "DGraph.h"
-
-DGArc *newArc(DGNode *tl,DGNode *hd){
- DGArc *ar=(DGArc *)malloc(sizeof(DGArc));
- ar->tail=tl;
- ar->head=hd;
- return ar;
-}
-void arcShow(DGArc *ar){
- DGNode *tl=(DGNode *)ar->tail,
- *hd=(DGNode *)ar->head;
- fprintf(stderr,"%d. |%s ->%s\n",ar->id,tl->name,hd->name);
-}
-
-DGNode *newNode(char *nm){
- DGNode *nd=(DGNode *)malloc(sizeof(DGNode));
- nd->attribute=0;
- nd->color=0;
- nd->inDegree=0;
- nd->outDegree=0;
- nd->maxInDegree=SMALL_BLOCK_SIZE;
- nd->maxOutDegree=SMALL_BLOCK_SIZE;
- nd->inArc=(DGArc **)malloc(nd->maxInDegree*sizeof(DGArc*));
- nd->outArc=(DGArc **)malloc(nd->maxOutDegree*sizeof(DGArc*));
- nd->name=strdup(nm);
- nd->feat=NULL;
- return nd;
-}
-void nodeShow(DGNode* nd){
- fprintf( stderr,"%3d.%s: (%d,%d)\n",
- nd->id,nd->name,nd->inDegree,nd->outDegree);
-/*
- if(nd->verified==1) fprintf(stderr,"%ld.%s\t: usable.",nd->id,nd->name);
- else if(nd->verified==0) fprintf(stderr,"%ld.%s\t: unusable.",nd->id,nd->name);
- else fprintf(stderr,"%ld.%s\t: notverified.",nd->id,nd->name);
-*/
-}
-
-DGraph* newDGraph(char* nm){
- DGraph *dg=(DGraph *)malloc(sizeof(DGraph));
- dg->numNodes=0;
- dg->numArcs=0;
- dg->maxNodes=BLOCK_SIZE;
- dg->maxArcs=BLOCK_SIZE;
- dg->node=(DGNode **)malloc(dg->maxNodes*sizeof(DGNode*));
- dg->arc=(DGArc **)malloc(dg->maxArcs*sizeof(DGArc*));
- dg->name=strdup(nm);
- return dg;
-}
-int AttachNode(DGraph* dg, DGNode* nd) {
- int i=0,j,len=0;
- DGNode **nds =NULL, *tmpnd=NULL;
- DGArc **ar=NULL;
-
- if (dg->numNodes == dg->maxNodes-1 ) {
- dg->maxNodes += BLOCK_SIZE;
- nds =(DGNode **) calloc(dg->maxNodes,sizeof(DGNode*));
- memcpy(nds,dg->node,(dg->maxNodes-BLOCK_SIZE)*sizeof(DGNode*));
- free(dg->node);
- dg->node=nds;
- }
-
- len = strlen( nd->name);
- for (i = 0; i < dg->numNodes; i++) {
- tmpnd =dg->node[ i];
- ar=NULL;
- if ( strlen( tmpnd->name) != len ) continue;
- if ( strncmp( nd->name, tmpnd->name, len) ) continue;
- if ( nd->inDegree > 0 ) {
- tmpnd->maxInDegree += nd->maxInDegree;
- ar =(DGArc **) calloc(tmpnd->maxInDegree,sizeof(DGArc*));
- memcpy(ar,tmpnd->inArc,(tmpnd->inDegree)*sizeof(DGArc*));
- free(tmpnd->inArc);
- tmpnd->inArc=ar;
- for (j = 0; j < nd->inDegree; j++ ) {
- nd->inArc[ j]->head = tmpnd;
- }
- memcpy( &(tmpnd->inArc[ tmpnd->inDegree]), nd->inArc, nd->inDegree*sizeof( DGArc *));
- tmpnd->inDegree += nd->inDegree;
- }
- if ( nd->outDegree > 0 ) {
- tmpnd->maxOutDegree += nd->maxOutDegree;
- ar =(DGArc **) calloc(tmpnd->maxOutDegree,sizeof(DGArc*));
- memcpy(ar,tmpnd->outArc,(tmpnd->outDegree)*sizeof(DGArc*));
- free(tmpnd->outArc);
- tmpnd->outArc=ar;
- for (j = 0; j < nd->outDegree; j++ ) {
- nd->outArc[ j]->tail = tmpnd;
- }
- memcpy( &(tmpnd->outArc[tmpnd->outDegree]),nd->outArc,nd->outDegree*sizeof( DGArc *));
- tmpnd->outDegree += nd->outDegree;
- }
- free(nd);
- return i;
- }
- nd->id = dg->numNodes;
- dg->node[dg->numNodes] = nd;
- dg->numNodes++;
-return nd->id;
-}
-int AttachArc(DGraph *dg,DGArc* nar){
-int arcId = -1;
-int i=0,newNumber=0;
-DGNode *head = nar->head,
- *tail = nar->tail;
-DGArc **ars=NULL,*probe=NULL;
-/*fprintf(stderr,"AttachArc %ld\n",dg->numArcs); */
- if ( !tail || !head ) return arcId;
- if ( dg->numArcs == dg->maxArcs-1 ) {
- dg->maxArcs += BLOCK_SIZE;
- ars =(DGArc **) calloc(dg->maxArcs,sizeof(DGArc*));
- memcpy(ars,dg->arc,(dg->maxArcs-BLOCK_SIZE)*sizeof(DGArc*));
- free(dg->arc);
- dg->arc=ars;
- }
- for(i = 0; i < tail->outDegree; i++ ) { /* parallel arc */
- probe = tail->outArc[ i];
- if(probe->head == head
- &&
- probe->length == nar->length
- ){
- free(nar);
- return probe->id;
- }
- }
-
- nar->id = dg->numArcs;
- arcId=dg->numArcs;
- dg->arc[dg->numArcs] = nar;
- dg->numArcs++;
-
- head->inArc[ head->inDegree] = nar;
- head->inDegree++;
- if ( head->inDegree >= head->maxInDegree ) {
- newNumber = head->maxInDegree + SMALL_BLOCK_SIZE;
- ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
- memcpy(ars,head->inArc,(head->inDegree)*sizeof(DGArc*));
- free(head->inArc);
- head->inArc=ars;
- head->maxInDegree = newNumber;
- }
- tail->outArc[ tail->outDegree] = nar;
- tail->outDegree++;
- if(tail->outDegree >= tail->maxOutDegree ) {
- newNumber = tail->maxOutDegree + SMALL_BLOCK_SIZE;
- ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
- memcpy(ars,tail->outArc,(tail->outDegree)*sizeof(DGArc*));
- free(tail->outArc);
- tail->outArc=ars;
- tail->maxOutDegree = newNumber;
- }
-/*fprintf(stderr,"AttachArc: head->in=%d tail->out=%ld\n",head->inDegree,tail->outDegree);*/
-return arcId;
-}
-void graphShow(DGraph *dg,int DetailsLevel){
- int i=0,j=0;
- fprintf(stderr,"%d.%s: (%d,%d)\n",dg->id,dg->name,dg->numNodes,dg->numArcs);
- if ( DetailsLevel < 1) return;
- for (i = 0; i < dg->numNodes; i++ ) {
- DGNode *focusNode = dg->node[ i];
- if(DetailsLevel >= 2) {
- for (j = 0; j < focusNode->inDegree; j++ ) {
- fprintf(stderr,"\t ");
- nodeShow(focusNode->inArc[ j]->tail);
- }
- }
- nodeShow(focusNode);
- if ( DetailsLevel < 2) continue;
- for (j = 0; j < focusNode->outDegree; j++ ) {
- fprintf(stderr, "\t ");
- nodeShow(focusNode->outArc[ j]->head);
- }
- fprintf(stderr, "---\n");
- }
- fprintf(stderr,"----------------------------------------\n");
- if ( DetailsLevel < 3) return;
-}
-
-
-
+++ /dev/null
-#ifndef _DGRAPH
-#define _DGRAPH
-
-#define BLOCK_SIZE 128
-#define SMALL_BLOCK_SIZE 32
-
-typedef struct{
- int id;
- void *tail,*head;
- int length,width,attribute,maxWidth;
-}DGArc;
-
-typedef struct{
- int maxInDegree,maxOutDegree;
- int inDegree,outDegree;
- int id;
- char *name;
- DGArc **inArc,**outArc;
- int depth,height,width;
- int color,attribute,address,verified;
- void *feat;
-}DGNode;
-
-typedef struct{
- int maxNodes,maxArcs;
- int id;
- char *name;
- int numNodes,numArcs;
- DGNode **node;
- DGArc **arc;
-} DGraph;
-
-DGArc *newArc(DGNode *tl,DGNode *hd);
-void arcShow(DGArc *ar);
-DGNode *newNode(char *nm);
-void nodeShow(DGNode* nd);
-
-DGraph* newDGraph(char *nm);
-int AttachNode(DGraph *dg,DGNode *nd);
-int AttachArc(DGraph *dg,DGArc* nar);
-void graphShow(DGraph *dg,int DetailsLevel);
-
-#endif
+++ /dev/null
-SHELL=/bin/sh
-BENCHMARK=dt
-BENCHMARKU=DT
-
-include ../config/make.def
-
-include ../sys/make.common
-#Override PROGRAM
-DTPROGRAM = $(BINDIR)/$(BENCHMARK)-folding.$(CLASS)
-
-OBJS = dt.o DGraph.o \
- ${COMMON}/c_print_results.o ${COMMON}/c_timers.o ${COMMON}/c_randdp.o
-
-
-${PROGRAM}: config ${OBJS}
- ${CLINK} ${CLINKFLAGS} -o ${DTPROGRAM} ${OBJS} ${CMPI_LIB}
-
-.c.o:
- ${CCOMPILE} $<
-
-dt.o: dt.c npbparams.h
-DGraph.o: DGraph.c DGraph.h
-
-clean:
- - rm -f *.o *~ mputil*
- - rm -f dt npbparams.h core
+++ /dev/null
-The Data Traffic benchmark (DT) is new in the NPB suite
-(released as part of the NPB3.x-MPI package).
-----------------------------------------------------
-
-DT is written in C, and the same executable can run on any number of processors,
-provided this number is not less than the number of nodes in the communication
-graph. The DT benchmark takes one argument: BH, WH, or SH. This argument
-specifies the communication graph: Black Hole, White Hole, or SHuffle,
-respectively. The current release contains verification numbers for
-classes S, W, A, and B only. Classes C and D are defined, but verification
-numbers are not provided in this release.
-
-The following table summarizes the number of nodes in the communication
-graph based on CLASS and graph TYPE.
-
-CLASS N_Source N_Nodes(BH,WH) N_Nodes(SH)
- S 4 5 12
- W 8 11 32
- A 16 21 80
- B 32 43 192
- C 64 85 448
- D 128 171 1024
SHELL=/bin/sh
BENCHMARK=dt
-BENCHMARKU=DT
include ../config/make.def
#Override PROGRAM
DTPROGRAM = $(BINDIR)/$(BENCHMARK).$(CLASS)
-OBJS = dt.o DGraph.o \
+OBJS = dt.o DGraph.o \
+ ${COMMON}/c_print_results.o ${COMMON}/c_timers.o ${COMMON}/c_randdp.o
+
+OBJS-F = dt-folding.o DGraph.o \
${COMMON}/c_print_results.o ${COMMON}/c_timers.o ${COMMON}/c_randdp.o
-${PROGRAM}: config ${OBJS}
+${PROGRAM}: config ${OBJS} ${OBJS-F}
${CLINK} ${CLINKFLAGS} -o ${DTPROGRAM} ${OBJS} ${CMPI_LIB}
+ ${CLINK} ${CLINKFLAGS} -o ${DTPROGRAM}-folding ${OBJS-F} ${CMPI_LIB}
.c.o:
${CCOMPILE} $<
dt.o: dt.c npbparams.h
+dt-folding.o: dt-folding.c npbparams.h
DGraph.o: DGraph.c DGraph.h
clean:
- rm -f *.o *~ mputil*
- - rm -f dt npbparams.h core
+ - rm -f dt dt-folding npbparams.h
include ../config/make.def
-#OBJS = ep.o ${COMMON}/print_results.o ${COMMON}/${RAND}.o ${COMMON}/timers.o
OBJS = ep.o randlc.o
include ../sys/make.common
${PROGRAM}: config ${OBJS}
-# ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${CMPI_LIB}
-
-#ep.o: ep.f mpinpb.h npbparams.h
-# ${FCOMPILE} ep.f
-
ep.o: ep.c randlc.c mpinpb.h npbparams.h
${CCOMPILE} ep.c
+++ /dev/null
-!-------------------------------------------------------------------------!
-! !
-! N A S P A R A L L E L B E N C H M A R K S 3.3 !
-! !
-! E P !
-! !
-!-------------------------------------------------------------------------!
-! !
-! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
-! It is described in NAS Technical Reports 95-020 and 02-007 !
-! !
-! Permission to use, copy, distribute and modify this software !
-! for any purpose with or without fee is hereby granted. We !
-! request, however, that all derived work reference the NAS !
-! Parallel Benchmarks 3.3. This software is provided "as is" !
-! without express or implied warranty. !
-! !
-! Information on NPB 3.3, including the technical report, the !
-! original specifications, source code, results and information !
-! on how to submit new results, is available at: !
-! !
-! http://www.nas.nasa.gov/Software/NPB/ !
-! !
-! Send comments or suggestions to npb@nas.nasa.gov !
-! !
-! NAS Parallel Benchmarks Group !
-! NASA Ames Research Center !
-! Mail Stop: T27A-1 !
-! Moffett Field, CA 94035-1000 !
-! !
-! E-mail: npb@nas.nasa.gov !
-! Fax: (650) 604-3957 !
-! !
-!-------------------------------------------------------------------------!
-
-
-c---------------------------------------------------------------------
-c
-c Authors: P. O. Frederickson
-c D. H. Bailey
-c A. C. Woo
-c R. F. Van der Wijngaart
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
- program EMBAR
-c---------------------------------------------------------------------
-C
-c This is the MPI version of the APP Benchmark 1,
-c the "embarrassingly parallel" benchmark.
-c
-c
-c M is the Log_2 of the number of complex pairs of uniform (0, 1) random
-c numbers. MK is the Log_2 of the size of each batch of uniform random
-c numbers. MK can be set for convenience on a given system, since it does
-c not affect the results.
-
- implicit none
-
- include 'npbparams.h'
- include 'mpinpb.h'
-
- double precision Mops, epsilon, a, s, t1, t2, t3, t4, x, x1,
- > x2, q, sx, sy, tm, an, tt, gc, dum(3),
- > timer_read
- double precision sx_verify_value, sy_verify_value, sx_err, sy_err
- integer mk, mm, nn, nk, nq, np, ierr, node, no_nodes,
- > i, ik, kk, l, k, nit, ierrcode, no_large_nodes,
- > np_add, k_offset, j
- logical verified, timers_enabled
- parameter (timers_enabled = .false.)
- external randlc, timer_read
- double precision randlc, qq
- character*15 size
-
- parameter (mk = 16, mm = m - mk, nn = 2 ** mm,
- > nk = 2 ** mk, nq = 10, epsilon=1.d-8,
- > a = 1220703125.d0, s = 271828183.d0)
-
- common/storage/ x(2*nk), q(0:nq-1), qq(10000)
- data dum /1.d0, 1.d0, 1.d0/
-
- call mpi_init(ierr)
- call mpi_comm_rank(MPI_COMM_WORLD,node,ierr)
- call mpi_comm_size(MPI_COMM_WORLD,no_nodes,ierr)
-
- root = 0
-
- if (.not. convertdouble) then
- dp_type = MPI_DOUBLE_PRECISION
- else
- dp_type = MPI_REAL
- endif
-
- if (node.eq.root) then
-
-c Because the size of the problem is too large to store in a 32-bit
-c integer for some classes, we put it into a string (for printing).
-c Have to strip off the decimal point put in there by the floating
-c point print statement (internal file)
-
- write(*, 1000)
- write(size, '(f15.0)' ) 2.d0**(m+1)
- j = 15
- if (size(j:j) .eq. '.') j = j - 1
- write (*,1001) size(1:j)
- write(*, 1003) no_nodes
-
- 1000 format(/,' NAS Parallel Benchmarks 3.3 -- EP Benchmark',/)
- 1001 format(' Number of random numbers generated: ', a15)
- 1003 format(' Number of active processes: ', 2x, i13, /)
-
- endif
-
- verified = .false.
-
-c Compute the number of "batches" of random number pairs generated
-c per processor. Adjust if the number of processors does not evenly
-c divide the total number
-
- np = nn / no_nodes
- no_large_nodes = mod(nn, no_nodes)
- if (node .lt. no_large_nodes) then
- np_add = 1
- else
- np_add = 0
- endif
- np = np + np_add
-
- if (np .eq. 0) then
- write (6, 1) no_nodes, nn
- 1 format ('Too many nodes:',2i6)
- call mpi_abort(MPI_COMM_WORLD,ierrcode,ierr)
- stop
- endif
-
-c Call the random number generator functions and initialize
-c the x-array to reduce the effects of paging on the timings.
-c Also, call all mathematical functions that are used. Make
-c sure these initializations cannot be eliminated as dead code.
-
- call vranlc(0, dum(1), dum(2), dum(3))
- dum(1) = randlc(dum(2), dum(3))
- do 5 i = 1, 2*nk
- x(i) = -1.d99
- 5 continue
- Mops = log(sqrt(abs(max(1.d0,1.d0))))
-
-c---------------------------------------------------------------------
-c Synchronize before placing time stamp
-c---------------------------------------------------------------------
- call mpi_barrier(MPI_COMM_WORLD, ierr)
-
- call timer_clear(1)
- call timer_clear(2)
- call timer_clear(3)
- call timer_start(1)
-
- t1 = a
- call vranlc(0, t1, a, x)
-
-c Compute AN = A ^ (2 * NK) (mod 2^46).
-
- t1 = a
-
- do 100 i = 1, mk + 1
- t2 = randlc(t1, t1)
- 100 continue
-
- an = t1
- tt = s
- gc = 0.d0
- sx = 0.d0
- sy = 0.d0
-
- do 110 i = 0, nq - 1
- q(i) = 0.d0
- 110 continue
-
-c Each instance of this loop may be performed independently. We compute
-c the k offsets separately to take into account the fact that some nodes
-c have more numbers to generate than others
-
- if (np_add .eq. 1) then
- k_offset = node * np -1
- else
- k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1
- endif
-
- do 150 k = 1, np
- kk = k_offset + k
- t1 = s
- t2 = an
-
-c Find starting seed t1 for this kk.
-
- do 120 i = 1, 100
- ik = kk / 2
- if (2 * ik .ne. kk) t3 = randlc(t1, t2)
- if (ik .eq. 0) goto 130
- t3 = randlc(t2, t2)
- kk = ik
- 120 continue
-
-c Compute uniform pseudorandom numbers.
- 130 continue
-
- if (timers_enabled) call timer_start(3)
- call vranlc(2 * nk, t1, a, x)
- if (timers_enabled) call timer_stop(3)
-
-c Compute Gaussian deviates by acceptance-rejection method and
-c tally counts in concentric square annuli. This loop is not
-c vectorizable.
-
- if (timers_enabled) call timer_start(2)
-
- do 140 i = 1, nk
- x1 = 2.d0 * x(2*i-1) - 1.d0
- x2 = 2.d0 * x(2*i) - 1.d0
- t1 = x1 ** 2 + x2 ** 2
- if (t1 .le. 1.d0) then
- t2 = sqrt(-2.d0 * log(t1) / t1)
- t3 = (x1 * t2)
- t4 = (x2 * t2)
- l = max(abs(t3), abs(t4))
- q(l) = q(l) + 1.d0
- sx = sx + t3
- sy = sy + t4
- endif
- 140 continue
-
- if (timers_enabled) call timer_stop(2)
-
- 150 continue
-
- call mpi_allreduce(sx, x, 1, dp_type,
- > MPI_SUM, MPI_COMM_WORLD, ierr)
- sx = x(1)
- call mpi_allreduce(sy, x, 1, dp_type,
- > MPI_SUM, MPI_COMM_WORLD, ierr)
- sy = x(1)
- call mpi_allreduce(q, x, nq, dp_type,
- > MPI_SUM, MPI_COMM_WORLD, ierr)
-
- do i = 1, nq
- q(i-1) = x(i)
- enddo
-
- do 160 i = 0, nq - 1
- gc = gc + q(i)
- 160 continue
-
- call timer_stop(1)
- tm = timer_read(1)
-
- call mpi_allreduce(tm, x, 1, dp_type,
- > MPI_MAX, MPI_COMM_WORLD, ierr)
- tm = x(1)
-
- if (node.eq.root) then
- nit=0
- verified = .true.
- if (m.eq.24) then
- sx_verify_value = -3.247834652034740D+3
- sy_verify_value = -6.958407078382297D+3
- elseif (m.eq.25) then
- sx_verify_value = -2.863319731645753D+3
- sy_verify_value = -6.320053679109499D+3
- elseif (m.eq.28) then
- sx_verify_value = -4.295875165629892D+3
- sy_verify_value = -1.580732573678431D+4
- elseif (m.eq.30) then
- sx_verify_value = 4.033815542441498D+4
- sy_verify_value = -2.660669192809235D+4
- elseif (m.eq.32) then
- sx_verify_value = 4.764367927995374D+4
- sy_verify_value = -8.084072988043731D+4
- elseif (m.eq.36) then
- sx_verify_value = 1.982481200946593D+5
- sy_verify_value = -1.020596636361769D+5
- elseif (m.eq.40) then
- sx_verify_value = -5.319717441530D+05
- sy_verify_value = -3.688834557731D+05
- else
- verified = .false.
- endif
- if (verified) then
- sx_err = abs((sx - sx_verify_value)/sx_verify_value)
- sy_err = abs((sy - sy_verify_value)/sy_verify_value)
- verified = ((sx_err.le.epsilon) .and. (sy_err.le.epsilon))
- endif
- Mops = 2.d0**(m+1)/tm/1000000.d0
-
- write (6,11) tm, m, gc, sx, sy, (i, q(i), i = 0, nq - 1)
- 11 format ('EP Benchmark Results:'//'CPU Time =',f10.4/'N = 2^',
- > i5/'No. Gaussian Pairs =',f15.0/'Sums = ',1p,2d25.15/
- > 'Counts:'/(i3,0p,f15.0))
-
- call print_results('EP', class, m+1, 0, 0, nit, npm,
- > no_nodes, tm, Mops,
- > 'Random numbers generated',
- > verified, npbversion, compiletime, cs1,
- > cs2, cs3, cs4, cs5, cs6, cs7)
-
- endif
-
- if (timers_enabled .and. (node .eq. root)) then
- print *, 'Total time: ', timer_read(1)
- print *, 'Gaussian pairs: ', timer_read(2)
- print *, 'Random numbers: ', timer_read(3)
- endif
-
- call mpi_finalize(ierr)
-
- end
+++ /dev/null
-# Makefile for MPI dummy library.
-# Must be edited for a specific machine. Does NOT read in
-# the make.def file of NPB 2.3
-F77 = f77
-CC = cc
-AR = ar
-
-# Enable if either Cray or IBM: (no such flag for most machines: see wtime.h)
-# MACHINE = -DCRAY
-# MACHINE = -DIBM
-
-libmpi.a: mpi_dummy.o mpi_dummy_c.o wtime.o
- $(AR) r libmpi.a mpi_dummy.o mpi_dummy_c.o wtime.o
-
-mpi_dummy.o: mpi_dummy.f mpif.h
- $(F77) -c mpi_dummy.f
-# For a Cray C90, try:
-# cf77 -dp -c mpi_dummy.f
-# For an IBM 590, try:
-# xlf -c mpi_dummy.f
-
-mpi_dummy_c.o: mpi_dummy.c mpi.h
- $(CC) -c ${MACHINE} -o mpi_dummy_c.o mpi_dummy.c
-
-wtime.o: wtime.c
-# For most machines or CRAY or IBM
- $(CC) -c ${MACHINE} wtime.c
-# For a precise timer on an SGI Power Challenge, try:
-# $(CC) -o wtime.o -c wtime_sgi64.c
-
-test: test.f
- $(F77) -o test -I. test.f -L. -lmpi
-
-
-
-clean:
- - rm -f *~ *.o
- - rm -f test libmpi.a
+++ /dev/null
-###########################################
-# NAS Parallel Benchmarks 2&3 #
-# MPI/F77/C #
-# Revision 3.3 #
-# NASA Ames Research Center #
-# npb@nas.nasa.gov #
-# http://www.nas.nasa.gov/Software/NPB/ #
-###########################################
-
-MPI Dummy Library
-
-
-The MPI dummy library is supplied as a convenience for people who do
-not have an MPI library but would like to try running on one processor
-anyway. The NPB 2.x/3.x benchmarks are designed so that they do not
-actually try to do any message passing when run on one node. The MPI
-dummy library is just that - a set of dummy MPI routines which don't
-do anything, but allow you to link the benchmarks. Actually they do a
-few things, but nothing important. Note that the dummy library is
-sufficient only for the NPB 2.x/3.x benchmarks. It probably won't be
-useful for anything else because it implements only a handful of
-functions.
-
-Because the dummy library is just an extra goody, and since we don't
-have an infinite amount of time, it may be a bit trickier to configure
-than the rest of the benchmarks. You need to:
-
-1. Find out how C and Fortran interact on your machine. On most machines,
-the Fortran function foo(x) is declared in C as foo_(xp) where xp is
-a pointer, not a value. On IBMs, it's just foo(xp). On Cray C90s, it's
-FOO(xp). You can define CRAY or IBM to get these, or you need to
-edit wtime.c if you've got something else.
-
-2. Edit the Makefile to compile mpi_dummy.f and wtime.c correctly
-for your machine (including -DCRAY or -DIBM if necessary).
-
-3. The substitute MPI timer gives wall clock time, not CPU time.
-If you're running on a timeshared machine, you may want to
-use a CPU timer. Edit the function mpi_wtime() in mpi_dummy.f
-to change this timer. (NOTE: for official benchmark results,
-ONLY wall clock times are valid. Using a CPU timer is ok
-if you want to get things running, but don't report any results
-measured with a CPU timer. )
-
-TROUBLESHOOTING
-
-o Compiling or linking of the benchmark aborts because the dummy MPI
- header file or the dummy MPI library cannot be found.
- - the file make.dummy in subdirectory config relies on the use
- of the -I"path" and -L"path" -l"library" constructs to pass
- information to the compilers and linkers. Edit this file to conform
- to your system.
+++ /dev/null
-#define MPI_DOUBLE 1
-#define MPI_INT 2
-#define MPI_BYTE 3
-#define MPI_FLOAT 4
-#define MPI_LONG 5
-
-#define MPI_COMM_WORLD 0
-
-#define MPI_MAX 1
-#define MPI_SUM 2
-#define MPI_MIN 3
-
-#define MPI_SUCCESS 0
-#define MPI_ANY_SOURCE -1
-#define MPI_ERR_OTHER -1
-#define MPI_STATUS_SIZE 3
-
-
-/*
- Status object. It is the only user-visible MPI data-structure.
- The "count" field is PRIVATE; use MPI_Get_count to access it.
- */
-typedef struct {
- int count;
- int MPI_SOURCE;
- int MPI_TAG;
- int MPI_ERROR;
-} MPI_Status;
-
-
-/* MPI request objects */
-typedef int MPI_Request;
-
-/* MPI datatype */
-typedef int MPI_Datatype;
-
-/* MPI comm */
-typedef int MPI_Comm;
-
-/* MPI operation */
-typedef int MPI_Op;
-
-
-
-/* Prototypes: */
-void mpi_error( void );
-
-int MPI_Irecv( void *buf,
- int count,
- MPI_Datatype datatype,
- int source,
- int tag,
- MPI_Comm comm,
- MPI_Request *request );
-
-int MPI_Send( void *buf,
- int count,
- MPI_Datatype datatype,
- int dest,
- int tag,
- MPI_Comm comm );
-
-int MPI_Wait( MPI_Request *request,
- MPI_Status *status );
-
-int MPI_Init( int *argc,
- char ***argv );
-
-int MPI_Comm_rank( MPI_Comm comm,
- int *rank );
-
-int MPI_Comm_size( MPI_Comm comm,
- int *size );
-
-double MPI_Wtime( void );
-
-int MPI_Barrier( MPI_Comm comm );
-
-int MPI_Finalize( void );
-
-int MPI_Allreduce( void *sendbuf,
- void *recvbuf,
- int nitems,
- MPI_Datatype type,
- MPI_Op op,
- MPI_Comm comm );
-
-int MPI_Reduce( void *sendbuf,
- void *recvbuf,
- int nitems,
- MPI_Datatype type,
- MPI_Op op,
- int root,
- MPI_Comm comm );
-
-int MPI_Alltoall( void *sendbuf,
- int sendcount,
- MPI_Datatype sendtype,
- void *recvbuf,
- int recvcount,
- MPI_Datatype recvtype,
- MPI_Comm comm );
-
-int MPI_Alltoallv( void *sendbuf,
- int *sendcounts,
- int *senddispl,
- MPI_Datatype sendtype,
- void *recvbuf,
- int *recvcounts,
- int *recvdispl,
- MPI_Datatype recvtype,
- MPI_Comm comm );
+++ /dev/null
-#include <stdlib.h>
-#include "mpi.h"
-#include "wtime.h"
-
-void mpi_error( void )
-{
- printf( "mpi_error called\n" );
- abort();
-}
-
-
-
-
-int MPI_Irecv( void *buf,
- int count,
- MPI_Datatype datatype,
- int source,
- int tag,
- MPI_Comm comm,
- MPI_Request *request )
-{
- mpi_error();
- return( MPI_ERR_OTHER );
-}
-
-
-
-
-int MPI_Recv( void *buf,
- int count,
- MPI_Datatype datatype,
- int source,
- int tag,
- MPI_Comm comm,
- MPI_Status *status )
-{
- mpi_error();
- return( MPI_ERR_OTHER );
-}
-
-
-
-
-int MPI_Send( void *buf,
- int count,
- MPI_Datatype datatype,
- int dest,
- int tag,
- MPI_Comm comm )
-{
- mpi_error();
- return( MPI_ERR_OTHER );
-}
-
-
-
-
-int MPI_Wait( MPI_Request *request,
- MPI_Status *status )
-{
- mpi_error();
- return( MPI_ERR_OTHER );
-}
-
-
-
-
-int MPI_Init( int *argc,
- char ***argv )
-{
- return( MPI_SUCCESS );
-}
-
-
-
-
-int MPI_Comm_rank( MPI_Comm comm,
- int *rank )
-{
- *rank = 0;
- return( MPI_SUCCESS );
-}
-
-
-
-
-int MPI_Comm_size( MPI_Comm comm,
- int *size )
-{
- *size = 1;
- return( MPI_SUCCESS );
-}
-
-
-
-
-double MPI_Wtime( void )
-{
- void wtime();
-
- double t;
- wtime( &t );
- return( t );
-}
-
-
-
-
-int MPI_Barrier( MPI_Comm comm )
-{
- return( MPI_SUCCESS );
-}
-
-
-
-
-int MPI_Finalize( void )
-{
- return( MPI_SUCCESS );
-}
-
-
-
-
-int MPI_Allreduce( void *sendbuf,
- void *recvbuf,
- int nitems,
- MPI_Datatype type,
- MPI_Op op,
- MPI_Comm comm )
-{
- int i;
- if( type == MPI_INT )
- {
- int *pd_sendbuf, *pd_recvbuf;
- pd_sendbuf = (int *) sendbuf;
- pd_recvbuf = (int *) recvbuf;
- for( i=0; i<nitems; i++ )
- *(pd_recvbuf+i) = *(pd_sendbuf+i);
- }
- if( type == MPI_LONG )
- {
- long *pd_sendbuf, *pd_recvbuf;
- pd_sendbuf = (long *) sendbuf;
- pd_recvbuf = (long *) recvbuf;
- for( i=0; i<nitems; i++ )
- *(pd_recvbuf+i) = *(pd_sendbuf+i);
- }
- if( type == MPI_DOUBLE )
- {
- double *pd_sendbuf, *pd_recvbuf;
- pd_sendbuf = (double *) sendbuf;
- pd_recvbuf = (double *) recvbuf;
- for( i=0; i<nitems; i++ )
- *(pd_recvbuf+i) = *(pd_sendbuf+i);
- }
- return( MPI_SUCCESS );
-}
-
-
-
-
-int MPI_Reduce( void *sendbuf,
- void *recvbuf,
- int nitems,
- MPI_Datatype type,
- MPI_Op op,
- int root,
- MPI_Comm comm )
-{
- int i;
- if( type == MPI_INT )
- {
- int *pi_sendbuf, *pi_recvbuf;
- pi_sendbuf = (int *) sendbuf;
- pi_recvbuf = (int *) recvbuf;
- for( i=0; i<nitems; i++ )
- *(pi_recvbuf+i) = *(pi_sendbuf+i);
- }
- if( type == MPI_LONG )
- {
- long *pi_sendbuf, *pi_recvbuf;
- pi_sendbuf = (long *) sendbuf;
- pi_recvbuf = (long *) recvbuf;
- for( i=0; i<nitems; i++ )
- *(pi_recvbuf+i) = *(pi_sendbuf+i);
- }
- if( type == MPI_DOUBLE )
- {
- double *pd_sendbuf, *pd_recvbuf;
- pd_sendbuf = (double *) sendbuf;
- pd_recvbuf = (double *) recvbuf;
- for( i=0; i<nitems; i++ )
- *(pd_recvbuf+i) = *(pd_sendbuf+i);
- }
- return( MPI_SUCCESS );
-}
-
-
-
-
-int MPI_Alltoall( void *sendbuf,
- int sendcount,
- MPI_Datatype sendtype,
- void *recvbuf,
- int recvcount,
- MPI_Datatype recvtype,
- MPI_Comm comm )
-{
- int i;
- if( recvtype == MPI_INT )
- {
- int *pd_sendbuf, *pd_recvbuf;
- pd_sendbuf = (int *) sendbuf;
- pd_recvbuf = (int *) recvbuf;
- for( i=0; i<sendcount; i++ )
- *(pd_recvbuf+i) = *(pd_sendbuf+i);
- }
- if( recvtype == MPI_LONG )
- {
- long *pd_sendbuf, *pd_recvbuf;
- pd_sendbuf = (long *) sendbuf;
- pd_recvbuf = (long *) recvbuf;
- for( i=0; i<sendcount; i++ )
- *(pd_recvbuf+i) = *(pd_sendbuf+i);
- }
- return( MPI_SUCCESS );
-}
-
-
-
-
-int MPI_Alltoallv( void *sendbuf,
- int *sendcounts,
- int *senddispl,
- MPI_Datatype sendtype,
- void *recvbuf,
- int *recvcounts,
- int *recvdispl,
- MPI_Datatype recvtype,
- MPI_Comm comm )
-{
- int i;
- if( recvtype == MPI_INT )
- {
- int *pd_sendbuf, *pd_recvbuf;
- pd_sendbuf = (int *) sendbuf;
- pd_recvbuf = (int *) recvbuf;
- for( i=0; i<sendcounts[0]; i++ )
- *(pd_recvbuf+i+recvdispl[0]) = *(pd_sendbuf+i+senddispl[0]);
- }
- if( recvtype == MPI_LONG )
- {
- long *pd_sendbuf, *pd_recvbuf;
- pd_sendbuf = (long *) sendbuf;
- pd_recvbuf = (long *) recvbuf;
- for( i=0; i<sendcounts[0]; i++ )
- *(pd_recvbuf+i+recvdispl[0]) = *(pd_sendbuf+i+senddispl[0]);
- }
- return( MPI_SUCCESS );
-}
-
-
-
-
+++ /dev/null
- subroutine mpi_isend(buf,count,datatype,source,
- & tag,comm,request,ierror)
- integer buf(*), count,datatype,source,tag,comm,
- & request,ierror
- call mpi_error()
- return
- end
-
- subroutine mpi_irecv(buf,count,datatype,source,
- & tag,comm,request,ierror)
- integer buf(*), count,datatype,source,tag,comm,
- & request,ierror
- call mpi_error()
- return
- end
-
- subroutine mpi_send(buf,count,datatype,dest,tag,comm,ierror)
- integer buf(*), count,datatype,dest,tag,comm,ierror
- call mpi_error()
- return
- end
-
- subroutine mpi_recv(buf,count,datatype,source,
- & tag,comm,status,ierror)
- integer buf(*), count,datatype,source,tag,comm,
- & status(*),ierror
- call mpi_error()
- return
- end
-
- subroutine mpi_comm_split(comm,color,key,newcomm,ierror)
- integer comm,color,key,newcomm,ierror
- return
- end
-
- subroutine mpi_comm_rank(comm, rank,ierr)
- implicit none
- integer comm, rank,ierr
- rank = 0
- return
- end
-
- subroutine mpi_comm_size(comm, size, ierr)
- implicit none
- integer comm, size, ierr
- size = 1
- return
- end
-
- double precision function mpi_wtime()
- implicit none
- double precision t
-c This function must measure wall clock time, not CPU time.
-c Since there is no portable timer in Fortran (77)
-c we call a routine compiled in C (though the C source may have
-c to be tweaked).
- call wtime(t)
-c The following is not ok for "official" results because it reports
-c CPU time not wall clock time. It may be useful for developing/testing
-c on timeshared Crays, though.
-c call second(t)
-
- mpi_wtime = t
-
- return
- end
-
-
-c may be valid to call this in single processor case
- subroutine mpi_barrier(comm,ierror)
- return
- end
-
-c may be valid to call this in single processor case
- subroutine mpi_bcast(buf, nitems, type, root, comm, ierr)
- implicit none
- integer buf(*), nitems, type, root, comm, ierr
- return
- end
-
- subroutine mpi_comm_dup(oldcomm, newcomm,ierror)
- integer oldcomm, newcomm,ierror
- newcomm= oldcomm
- return
- end
-
- subroutine mpi_error()
- print *, 'mpi_error called'
- stop
- end
-
- subroutine mpi_abort(comm, errcode, ierr)
- implicit none
- integer comm, errcode, ierr
- print *, 'mpi_abort called'
- stop
- end
-
- subroutine mpi_finalize(ierr)
- return
- end
-
- subroutine mpi_init(ierr)
- return
- end
-
-
-c assume double precision, which is all SP uses
- subroutine mpi_reduce(inbuf, outbuf, nitems,
- $ type, op, root, comm, ierr)
- implicit none
- include 'mpif.h'
- integer nitems, type, op, root, comm, ierr
- double precision inbuf(*), outbuf(*)
-
- if (type .eq. mpi_double_precision) then
- call mpi_reduce_dp(inbuf, outbuf, nitems,
- $ type, op, root, comm, ierr)
- else if (type .eq. mpi_double_complex) then
- call mpi_reduce_dc(inbuf, outbuf, nitems,
- $ type, op, root, comm, ierr)
- else if (type .eq. mpi_complex) then
- call mpi_reduce_complex(inbuf, outbuf, nitems,
- $ type, op, root, comm, ierr)
- else if (type .eq. mpi_real) then
- call mpi_reduce_real(inbuf, outbuf, nitems,
- $ type, op, root, comm, ierr)
- else if (type .eq. mpi_integer) then
- call mpi_reduce_int(inbuf, outbuf, nitems,
- $ type, op, root, comm, ierr)
- else
- print *, 'mpi_reduce: unknown type ', type
- end if
- return
- end
-
-
- subroutine mpi_reduce_real(inbuf, outbuf, nitems,
- $ type, op, root, comm, ierr)
- implicit none
- integer nitems, type, op, root, comm, ierr, i
- real inbuf(*), outbuf(*)
- do i = 1, nitems
- outbuf(i) = inbuf(i)
- end do
-
- return
- end
-
- subroutine mpi_reduce_dp(inbuf, outbuf, nitems,
- $ type, op, root, comm, ierr)
- implicit none
- integer nitems, type, op, root, comm, ierr, i
- double precision inbuf(*), outbuf(*)
- do i = 1, nitems
- outbuf(i) = inbuf(i)
- end do
-
- return
- end
-
- subroutine mpi_reduce_dc(inbuf, outbuf, nitems,
- $ type, op, root, comm, ierr)
- implicit none
- integer nitems, type, op, root, comm, ierr, i
- double complex inbuf(*), outbuf(*)
- do i = 1, nitems
- outbuf(i) = inbuf(i)
- end do
-
- return
- end
-
-
- subroutine mpi_reduce_complex(inbuf, outbuf, nitems,
- $ type, op, root, comm, ierr)
- implicit none
- integer nitems, type, op, root, comm, ierr, i
- complex inbuf(*), outbuf(*)
- do i = 1, nitems
- outbuf(i) = inbuf(i)
- end do
-
- return
- end
-
- subroutine mpi_reduce_int(inbuf, outbuf, nitems,
- $ type, op, root, comm, ierr)
- implicit none
- integer nitems, type, op, root, comm, ierr, i
- integer inbuf(*), outbuf(*)
- do i = 1, nitems
- outbuf(i) = inbuf(i)
- end do
-
- return
- end
-
- subroutine mpi_allreduce(inbuf, outbuf, nitems,
- $ type, op, comm, ierr)
- implicit none
- integer nitems, type, op, comm, ierr
- double precision inbuf(*), outbuf(*)
-
- call mpi_reduce(inbuf, outbuf, nitems,
- $ type, op, 0, comm, ierr)
- return
- end
-
- subroutine mpi_alltoall(inbuf, nitems, type, outbuf, nitems_dum,
- $ type_dum, comm, ierr)
- implicit none
- include 'mpif.h'
- integer nitems, type, comm, ierr, nitems_dum, type_dum
- double precision inbuf(*), outbuf(*)
- if (type .eq. mpi_double_precision) then
- call mpi_alltoall_dp(inbuf, outbuf, nitems,
- $ type, comm, ierr)
- else if (type .eq. mpi_double_complex) then
- call mpi_alltoall_dc(inbuf, outbuf, nitems,
- $ type, comm, ierr)
- else if (type .eq. mpi_complex) then
- call mpi_alltoall_complex(inbuf, outbuf, nitems,
- $ type, comm, ierr)
- else if (type .eq. mpi_real) then
- call mpi_alltoall_real(inbuf, outbuf, nitems,
- $ type, comm, ierr)
- else if (type .eq. mpi_integer) then
- call mpi_alltoall_int(inbuf, outbuf, nitems,
- $ type, comm, ierr)
- else
- print *, 'mpi_alltoall: unknown type ', type
- end if
- return
- end
-
- subroutine mpi_alltoall_dc(inbuf, outbuf, nitems,
- $ type, comm, ierr)
- implicit none
- integer nitems, type, comm, ierr, i
- double complex inbuf(*), outbuf(*)
- do i = 1, nitems
- outbuf(i) = inbuf(i)
- end do
-
- return
- end
-
-
- subroutine mpi_alltoall_complex(inbuf, outbuf, nitems,
- $ type, comm, ierr)
- implicit none
- integer nitems, type, comm, ierr, i
- double complex inbuf(*), outbuf(*)
- do i = 1, nitems
- outbuf(i) = inbuf(i)
- end do
-
- return
- end
-
- subroutine mpi_alltoall_dp(inbuf, outbuf, nitems,
- $ type, comm, ierr)
- implicit none
- integer nitems, type, comm, ierr, i
- double precision inbuf(*), outbuf(*)
- do i = 1, nitems
- outbuf(i) = inbuf(i)
- end do
-
- return
- end
-
- subroutine mpi_alltoall_real(inbuf, outbuf, nitems,
- $ type, comm, ierr)
- implicit none
- integer nitems, type, comm, ierr, i
- real inbuf(*), outbuf(*)
- do i = 1, nitems
- outbuf(i) = inbuf(i)
- end do
-
- return
- end
-
- subroutine mpi_alltoall_int(inbuf, outbuf, nitems,
- $ type, comm, ierr)
- implicit none
- integer nitems, type, comm, ierr, i
- integer inbuf(*), outbuf(*)
- do i = 1, nitems
- outbuf(i) = inbuf(i)
- end do
-
- return
- end
-
- subroutine mpi_wait(request,status,ierror)
- integer request,status,ierror
- call mpi_error()
- return
- end
-
- subroutine mpi_waitall(count,requests,status,ierror)
- integer count,requests(*),status(*),ierror
- call mpi_error()
- return
- end
-
+++ /dev/null
- integer mpi_comm_world
- parameter (mpi_comm_world = 0)
-
- integer mpi_max, mpi_min, mpi_sum
- parameter (mpi_max = 1, mpi_sum = 2, mpi_min = 3)
-
- integer mpi_byte, mpi_integer, mpi_real,
- > mpi_double_precision, mpi_complex,
- > mpi_double_complex
- parameter (mpi_double_precision = 1,
- $ mpi_integer = 2,
- $ mpi_byte = 3,
- $ mpi_real= 4,
- $ mpi_complex = 5,
- $ mpi_double_complex = 6)
-
- integer mpi_any_source
- parameter (mpi_any_source = -1)
-
- integer mpi_err_other
- parameter (mpi_err_other = -1)
-
- double precision mpi_wtime
- external mpi_wtime
-
- integer mpi_status_size
- parameter (mpi_status_size=3)
+++ /dev/null
- program
- implicit none
- double precision t, mpi_wtime
- external mpi_wtime
- t = 0.0
- t = mpi_wtime()
- print *, t
- t = mpi_wtime()
- print *, t
- end
+++ /dev/null
-#include "wtime.h"
-#include <sys/time.h>
-
-void wtime(double *t)
-{
- static int sec = -1;
- struct timeval tv;
- gettimeofday(&tv, (void *)0);
- if (sec < 0) sec = tv.tv_sec;
- *t = (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec;
-}
-
-
+++ /dev/null
- subroutine wtime(tim)
- real*8 tim
- dimension tarray(2)
- call etime(tarray)
- tim = tarray(1)
- return
- end
-
-
-
-
-
+++ /dev/null
-/* C/Fortran interface is different on different machines.
- * You may need to tweak this.
- */
-
-
-#if defined(IBM)
-#define wtime wtime
-#elif defined(CRAY)
-#define wtime WTIME
-#else
-#define wtime wtime_
-#endif
+++ /dev/null
-#include <sys/types.h>
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <sys/syssgi.h>
-#include <sys/immu.h>
-#include <errno.h>
-#include <stdio.h>
-
-/* The following works on SGI Power Challenge systems */
-
-typedef unsigned long iotimer_t;
-
-unsigned int cycleval;
-volatile iotimer_t *iotimer_addr, base_counter;
-double resolution;
-
-/* address_t is an integer type big enough to hold an address */
-typedef unsigned long address_t;
-
-
-
-void timer_init()
-{
-
- int fd;
- char *virt_addr;
- address_t phys_addr, page_offset, pagemask, pagebase_addr;
-
- pagemask = getpagesize() - 1;
- errno = 0;
- phys_addr = syssgi(SGI_QUERY_CYCLECNTR, &cycleval);
- if (errno != 0) {
- perror("SGI_QUERY_CYCLECNTR");
- exit(1);
- }
- /* rel_addr = page offset of physical address */
- page_offset = phys_addr & pagemask;
- pagebase_addr = phys_addr - page_offset;
- fd = open("/dev/mmem", O_RDONLY);
-
- virt_addr = mmap(0, pagemask, PROT_READ, MAP_PRIVATE, fd, pagebase_addr);
- virt_addr = virt_addr + page_offset;
- iotimer_addr = (iotimer_t *)virt_addr;
- /* cycleval in picoseconds to this gives resolution in seconds */
- resolution = 1.0e-12*cycleval;
- base_counter = *iotimer_addr;
-}
-
-void wtime_(double *time)
-{
- static int initialized = 0;
- volatile iotimer_t counter_value;
- if (!initialized) {
- timer_init();
- initialized = 1;
- }
- counter_value = *iotimer_addr - base_counter;
- *time = (double)counter_value * resolution;
-}
-
-
-void wtime(double *time)
-{
- static int initialized = 0;
- volatile iotimer_t counter_value;
- if (!initialized) {
- timer_init();
- initialized = 1;
- }
- counter_value = *iotimer_addr - base_counter;
- *time = (double)counter_value * resolution;
-}
-
-
SHELL=/bin/sh
-CLASS=U
+CLASS=S
NPROCS=1
SUBTYPE=
VERSION=
is: header
cd IS; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
-IS-trace: is-trace
-is-trace: header
- cd IS-trace; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
EP: ep
ep: header
cd EP; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
-EP-trace: ep-trace
-ep-trace: header
- cd EP-trace; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
-
EP-sampling: ep-sampling
ep-sampling: header
cd EP-sampling; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
dt: header
cd DT; $(MAKE) CLASS=$(CLASS)
-DT-trace: dt-trace
-dt-trace: header
- cd DT-trace; $(MAKE) CLASS=$(CLASS)
-
-DT-folding: dt-folding
-dt-folding: header
- cd DT-folding; $(MAKE) CLASS=$(CLASS)
-
# Awk script courtesy cmg@cray.com, modified by Haoqiang Jin
suite:
@ awk -f sys/suite.awk SMAKE=$(MAKE) $(SFILE) | $(SHELL)
# are defined) but on a really clean system this will won't work
# because those makefiles need config/make.def
clean:
- - rm -f core
- - rm -f *~ */core */*~ */*.o */npbparams.h */*.obj */*.exe
- - rm -f MPI_dummy/test MPI_dummy/libmpi.a
+ - rm -f *~ */*~ */*.o */npbparams.h
- rm -f sys/setparams sys/makesuite sys/setparams.h
- - rm -f btio.*.out*
veryclean: clean
- rm -f config/make.def config/suite.def
- - rm -f bin/sp.* bin/lu.* bin/mg.* bin/ft.* bin/bt.* bin/is.*
- - rm -f bin/ep.* bin/cg.* bin/dt.*
+ - rm -f bin/is.* bin/ep.* bin/dt.*
header:
@ sys/print_header
+++ /dev/null
-
- subroutine print_results(name, class, n1, n2, n3, niter,
- > nprocs_compiled, nprocs_total,
- > t, mops, optype, verified, npbversion,
- > compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7)
-
- implicit none
- character*2 name
- character*1 class
- integer n1, n2, n3, niter, nprocs_compiled, nprocs_total, j
- double precision t, mops
- character optype*24, size*15
- logical verified
- character*(*) npbversion, compiletime,
- > cs1, cs2, cs3, cs4, cs5, cs6, cs7
-
- write (*, 2) name
- 2 format(//, ' ', A2, ' Benchmark Completed.')
-
- write (*, 3) Class
- 3 format(' Class = ', 12x, a12)
-
-c If this is not a grid-based problem (EP, FT, CG), then
-c we only print n1, which contains some measure of the
-c problem size. In that case, n2 and n3 are both zero.
-c Otherwise, we print the grid size n1xn2xn3
-
- if ((n2 .eq. 0) .and. (n3 .eq. 0)) then
- if (name(1:2) .eq. 'EP') then
- write(size, '(f15.0)' ) 2.d0**n1
- j = 15
- if (size(j:j) .eq. '.') j = j - 1
- write (*,42) size(1:j)
- 42 format(' Size = ',9x, a15)
- else
- write (*,44) n1
- 44 format(' Size = ',12x, i12)
- endif
- else
- write (*, 4) n1,n2,n3
- 4 format(' Size = ',9x, i4,'x',i4,'x',i4)
- endif
-
- write (*, 5) niter
- 5 format(' Iterations = ', 12x, i12)
-
- write (*, 6) t
- 6 format(' Time in seconds = ',12x, f12.2)
-
- write (*,7) nprocs_total
- 7 format(' Total processes = ', 12x, i12)
-
- write (*,8) nprocs_compiled
- 8 format(' Compiled procs = ', 12x, i12)
-
- write (*,9) mops
- 9 format(' Mop/s total = ',12x, f12.2)
-
- write (*,10) mops/float( nprocs_total )
- 10 format(' Mop/s/process = ', 12x, f12.2)
-
- write(*, 11) optype
- 11 format(' Operation type = ', a24)
-
- if (verified) then
- write(*,12) ' SUCCESSFUL'
- else
- write(*,12) 'UNSUCCESSFUL'
- endif
- 12 format(' Verification = ', 12x, a)
-
- write(*,13) npbversion
- 13 format(' Version = ', 12x, a12)
-
- write(*,14) compiletime
- 14 format(' Compile date = ', 12x, a12)
-
-
- write (*,121) cs1
- 121 format(/, ' Compile options:', /,
- > ' MPIF77 = ', A)
-
- write (*,122) cs2
- 122 format(' FLINK = ', A)
-
- write (*,123) cs3
- 123 format(' FMPI_LIB = ', A)
-
- write (*,124) cs4
- 124 format(' FMPI_INC = ', A)
-
- write (*,125) cs5
- 125 format(' FFLAGS = ', A)
-
- write (*,126) cs6
- 126 format(' FLINKFLAGS = ', A)
-
- write(*, 127) cs7
- 127 format(' RAND = ', A)
-
- write (*,130)
- 130 format(//' Please send the results of this run to:'//
- > ' NPB Development Team '/
- > ' Internet: npb@nas.nasa.gov'/
- > ' '/
- > ' If email is not available, send this to:'//
- > ' MS T27A-1'/
- > ' NASA Ames Research Center'/
- > ' Moffett Field, CA 94035-1000'//
- > ' Fax: 650-604-3957'//)
-
-
- return
- end
-
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- double precision function randlc (x, a)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c This routine returns a uniform pseudorandom double precision number in the
-c range (0, 1) by using the linear congruential generator
-c
-c x_{k+1} = a x_k (mod 2^46)
-c
-c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
-c before repeating. The argument A is the same as 'a' in the above formula,
-c and X is the same as x_0. A and X must be odd double precision integers
-c in the range (1, 2^46). The returned value RANDLC is normalized to be
-c between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
-c the new seed x_1, so that subsequent calls to RANDLC using the same
-c arguments will generate a continuous sequence.
-c
-c This routine should produce the same results on any computer with at least
-c 48 mantissa bits in double precision floating point data. On 64 bit
-c systems, double precision should be disabled.
-c
-c David H. Bailey October 26, 1990
-c
-c---------------------------------------------------------------------
-
- implicit none
-
- double precision r23,r46,t23,t46,a,x,t1,t2,t3,t4,a1,a2,x1,x2,z
- parameter (r23 = 0.5d0 ** 23, r46 = r23 ** 2, t23 = 2.d0 ** 23,
- > t46 = t23 ** 2)
-
-c---------------------------------------------------------------------
-c Break A into two parts such that A = 2^23 * A1 + A2.
-c---------------------------------------------------------------------
- t1 = r23 * a
- a1 = int (t1)
- a2 = a - t23 * a1
-
-c---------------------------------------------------------------------
-c Break X into two parts such that X = 2^23 * X1 + X2, compute
-c Z = A1 * X2 + A2 * X1 (mod 2^23), and then
-c X = 2^23 * Z + A2 * X2 (mod 2^46).
-c---------------------------------------------------------------------
- t1 = r23 * x
- x1 = int (t1)
- x2 = x - t23 * x1
- t1 = a1 * x2 + a2 * x1
- t2 = int (r23 * t1)
- z = t1 - t23 * t2
- t3 = t23 * z + a2 * x2
- t4 = int (r46 * t3)
- x = t3 - t46 * t4
- randlc = r46 * x
-
- return
- end
-
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine vranlc (n, x, a, y)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c This routine generates N uniform pseudorandom double precision numbers in
-c the range (0, 1) by using the linear congruential generator
-c
-c x_{k+1} = a x_k (mod 2^46)
-c
-c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
-c before repeating. The argument A is the same as 'a' in the above formula,
-c and X is the same as x_0. A and X must be odd double precision integers
-c in the range (1, 2^46). The N results are placed in Y and are normalized
-c to be between 0 and 1. X is updated to contain the new seed, so that
-c subsequent calls to VRANLC using the same arguments will generate a
-c continuous sequence. If N is zero, only initialization is performed, and
-c the variables X, A and Y are ignored.
-c
-c This routine is the standard version designed for scalar or RISC systems.
-c However, it should produce the same results on any single processor
-c computer with at least 48 mantissa bits in double precision floating point
-c data. On 64 bit systems, double precision should be disabled.
-c
-c---------------------------------------------------------------------
-
- implicit none
-
- integer i,n
- double precision y,r23,r46,t23,t46,a,x,t1,t2,t3,t4,a1,a2,x1,x2,z
- dimension y(*)
- parameter (r23 = 0.5d0 ** 23, r46 = r23 ** 2, t23 = 2.d0 ** 23,
- > t46 = t23 ** 2)
-
-
-c---------------------------------------------------------------------
-c Break A into two parts such that A = 2^23 * A1 + A2.
-c---------------------------------------------------------------------
- t1 = r23 * a
- a1 = int (t1)
- a2 = a - t23 * a1
-
-c---------------------------------------------------------------------
-c Generate N results. This loop is not vectorizable.
-c---------------------------------------------------------------------
- do i = 1, n
-
-c---------------------------------------------------------------------
-c Break X into two parts such that X = 2^23 * X1 + X2, compute
-c Z = A1 * X2 + A2 * X1 (mod 2^23), and then
-c X = 2^23 * Z + A2 * X2 (mod 2^46).
-c---------------------------------------------------------------------
- t1 = r23 * x
- x1 = int (t1)
- x2 = x - t23 * x1
- t1 = a1 * x2 + a2 * x1
- t2 = int (r23 * t1)
- z = t1 - t23 * t2
- t3 = t23 * z + a2 * x2
- t4 = int (r46 * t3)
- x = t3 - t46 * t4
- y(i) = r46 * x
- enddo
-
- return
- end
+++ /dev/null
-c---------------------------------------------------------------------
- double precision function randlc (x, a)
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c This routine returns a uniform pseudorandom double precision number in the
-c range (0, 1) by using the linear congruential generator
-c
-c x_{k+1} = a x_k (mod 2^46)
-c
-c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
-c before repeating. The argument A is the same as 'a' in the above formula,
-c and X is the same as x_0. A and X must be odd double precision integers
-c in the range (1, 2^46). The returned value RANDLC is normalized to be
-c between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
-c the new seed x_1, so that subsequent calls to RANDLC using the same
-c arguments will generate a continuous sequence.
-c
-c This routine should produce the same results on any computer with at least
-c 48 mantissa bits in double precision floating point data. On 64 bit
-c systems, double precision should be disabled.
-c
-c David H. Bailey October 26, 1990
-c
-c---------------------------------------------------------------------
-
- implicit none
-
- double precision r23,r46,t23,t46,a,x,t1,t2,t3,t4,a1,a2,x1,x2,z
- parameter (r23 = 0.5d0 ** 23, r46 = r23 ** 2, t23 = 2.d0 ** 23,
- > t46 = t23 ** 2)
-
-c---------------------------------------------------------------------
-c Break A into two parts such that A = 2^23 * A1 + A2.
-c---------------------------------------------------------------------
- t1 = r23 * a
- a1 = int (t1)
- a2 = a - t23 * a1
-
-c---------------------------------------------------------------------
-c Break X into two parts such that X = 2^23 * X1 + X2, compute
-c Z = A1 * X2 + A2 * X1 (mod 2^23), and then
-c X = 2^23 * Z + A2 * X2 (mod 2^46).
-c---------------------------------------------------------------------
- t1 = r23 * x
- x1 = int (t1)
- x2 = x - t23 * x1
-
-
- t1 = a1 * x2 + a2 * x1
- t2 = int (r23 * t1)
- z = t1 - t23 * t2
- t3 = t23 * z + a2 * x2
- t4 = int (r46 * t3)
- x = t3 - t46 * t4
- randlc = r46 * x
- return
- end
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine vranlc (n, x, a, y)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c This routine generates N uniform pseudorandom double precision numbers in
-c the range (0, 1) by using the linear congruential generator
-c
-c x_{k+1} = a x_k (mod 2^46)
-c
-c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
-c before repeating. The argument A is the same as 'a' in the above formula,
-c and X is the same as x_0. A and X must be odd double precision integers
-c in the range (1, 2^46). The N results are placed in Y and are normalized
-c to be between 0 and 1. X is updated to contain the new seed, so that
-c subsequent calls to RANDLC using the same arguments will generate a
-c continuous sequence.
-c
-c This routine generates the output sequence in batches of length NV, for
-c convenience on vector computers. This routine should produce the same
-c results on any computer with at least 48 mantissa bits in double precision
-c floating point data. On Cray systems, double precision should be disabled.
-c
-c David H. Bailey August 30, 1990
-c---------------------------------------------------------------------
-
- integer n
- double precision x, a, y(*)
-
- double precision r23, r46, t23, t46
- integer nv
- parameter (r23 = 2.d0 ** (-23), r46 = r23 * r23, t23 = 2.d0 ** 23,
- > t46 = t23 * t23, nv = 64)
- double precision xv(nv), t1, t2, t3, t4, an, a1, a2, x1, x2, yy
- integer n1, i, j
- external randlc
- double precision randlc
-
-c---------------------------------------------------------------------
-c Compute the first NV elements of the sequence using RANDLC.
-c---------------------------------------------------------------------
- t1 = x
- n1 = min (n, nv)
-
- do i = 1, n1
- xv(i) = t46 * randlc (t1, a)
- enddo
-
-c---------------------------------------------------------------------
-c It is not necessary to compute AN, A1 or A2 unless N is greater than NV.
-c---------------------------------------------------------------------
- if (n .gt. nv) then
-
-c---------------------------------------------------------------------
-c Compute AN = AA ^ NV (mod 2^46) using successive calls to RANDLC.
-c---------------------------------------------------------------------
- t1 = a
- t2 = r46 * a
-
- do i = 1, nv - 1
- t2 = randlc (t1, a)
- enddo
-
- an = t46 * t2
-
-c---------------------------------------------------------------------
-c Break AN into two parts such that AN = 2^23 * A1 + A2.
-c---------------------------------------------------------------------
- t1 = r23 * an
- a1 = aint (t1)
- a2 = an - t23 * a1
- endif
-
-c---------------------------------------------------------------------
-c Compute N pseudorandom results in batches of size NV.
-c---------------------------------------------------------------------
- do j = 0, n - 1, nv
- n1 = min (nv, n - j)
-
-c---------------------------------------------------------------------
-c Compute up to NV results based on the current seed vector XV.
-c---------------------------------------------------------------------
- do i = 1, n1
- y(i+j) = r46 * xv(i)
- enddo
-
-c---------------------------------------------------------------------
-c If this is the last pass through the 140 loop, it is not necessary to
-c update the XV vector.
-c---------------------------------------------------------------------
- if (j + n1 .eq. n) goto 150
-
-c---------------------------------------------------------------------
-c Update the XV vector by multiplying each element by AN (mod 2^46).
-c---------------------------------------------------------------------
- do i = 1, nv
- t1 = r23 * xv(i)
- x1 = aint (t1)
- x2 = xv(i) - t23 * x1
- t1 = a1 * x2 + a2 * x1
- t2 = aint (r23 * t1)
- yy = t1 - t23 * t2
- t3 = t23 * yy + a2 * x2
- t4 = aint (r46 * t3)
- xv(i) = t3 - t46 * t4
- enddo
-
- enddo
-
-c---------------------------------------------------------------------
-c Save the last seed in X so that subsequent calls to VRANLC will generate
-c a continuous sequence.
-c---------------------------------------------------------------------
- 150 x = xv(n1)
-
- return
- end
-
-c----- end of program ------------------------------------------------
-
+++ /dev/null
- double precision function randlc(x, a)
-
-c---------------------------------------------------------------------
-c
-c This routine returns a uniform pseudorandom double precision number in the
-c range (0, 1) by using the linear congruential generator
-c
-c x_{k+1} = a x_k (mod 2^46)
-c
-c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
-c before repeating. The argument A is the same as 'a' in the above formula,
-c and X is the same as x_0. A and X must be odd double precision integers
-c in the range (1, 2^46). The returned value RANDLC is normalized to be
-c between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
-c the new seed x_1, so that subsequent calls to RANDLC using the same
-c arguments will generate a continuous sequence.
-
- implicit none
- double precision x, a
- integer*8 i246m1, Lx, La
- double precision d2m46
-
- parameter(d2m46=0.5d0**46)
-
- save i246m1
- data i246m1/X'00003FFFFFFFFFFF'/
-
- Lx = X
- La = A
-
- Lx = iand(Lx*La,i246m1)
- randlc = d2m46*dble(Lx)
- x = dble(Lx)
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-
- SUBROUTINE VRANLC (N, X, A, Y)
- implicit none
- integer n, i
- double precision x, a, y(*)
- integer*8 i246m1, Lx, La
- double precision d2m46
-
-c This doesn't work, because the compiler does the calculation in 32
-c bits and overflows. No standard way (without f90 stuff) to specify
-c that the rhs should be done in 64 bit arithmetic.
-c parameter(i246m1=2**46-1)
-
- parameter(d2m46=0.5d0**46)
-
- save i246m1
- data i246m1/X'00003FFFFFFFFFFF'/
-
-c Note that the v6 compiler on an R8000 does something stupid with
-c the above. Using the following instead (or various other things)
-c makes the calculation run almost 10 times as fast.
-c
-c save d2m46
-c data d2m46/0.0d0/
-c if (d2m46 .eq. 0.0d0) then
-c d2m46 = 0.5d0**46
-c endif
-
- Lx = X
- La = A
- do i = 1, N
- Lx = iand(Lx*La,i246m1)
- y(i) = d2m46*dble(Lx)
- end do
- x = dble(Lx)
-
- return
- end
-
+++ /dev/null
- double precision function randlc(x, a)
-
-c---------------------------------------------------------------------
-c
-c This routine returns a uniform pseudorandom double precision number in the
-c range (0, 1) by using the linear congruential generator
-c
-c x_{k+1} = a x_k (mod 2^46)
-c
-c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
-c before repeating. The argument A is the same as 'a' in the above formula,
-c and X is the same as x_0. A and X must be odd double precision integers
-c in the range (1, 2^46). The returned value RANDLC is normalized to be
-c between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
-c the new seed x_1, so that subsequent calls to RANDLC using the same
-c arguments will generate a continuous sequence.
-
- implicit none
- double precision x, a
- integer*8 Lx, La, a1, a2, x1, x2, xa
- double precision d2m46
- parameter(d2m46=0.5d0**46)
-
- Lx = x
- La = A
- a1 = ibits(La, 23, 23)
- a2 = ibits(La, 0, 23)
- x1 = ibits(Lx, 23, 23)
- x2 = ibits(Lx, 0, 23)
- xa = ishft(ibits(a1*x2+a2*x1, 0, 23), 23) + a2*x2
- Lx = ibits(xa,0, 46)
- x = dble(Lx)
- randlc = d2m46*x
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-
- SUBROUTINE VRANLC (N, X, A, Y)
- implicit none
- integer n, i
- double precision x, a, y(*)
- integer*8 Lx, La, a1, a2, x1, x2, xa
- double precision d2m46
- parameter(d2m46=0.5d0**46)
-
- Lx = X
- La = A
- a1 = ibits(La, 23, 23)
- a2 = ibits(La, 0, 23)
- do i = 1, N
- x1 = ibits(Lx, 23, 23)
- x2 = ibits(Lx, 0, 23)
- xa = ishft(ibits(a1*x2+a2*x1, 0, 23), 23) + a2*x2
- Lx = ibits(xa,0, 46)
- y(i) = d2m46*dble(Lx)
- end do
- x = dble(Lx)
- return
- end
-
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine timer_clear(n)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- integer n
-
- double precision start(64), elapsed(64)
- common /tt/ start, elapsed
-
- elapsed(n) = 0.0
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine timer_start(n)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- integer n
- include 'mpif.h'
- double precision start(64), elapsed(64)
- common /tt/ start, elapsed
-
- start(n) = MPI_Wtime()
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine timer_stop(n)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- integer n
- include 'mpif.h'
- double precision start(64), elapsed(64)
- common /tt/ start, elapsed
- double precision t, now
- now = MPI_Wtime()
- t = now - start(n)
- elapsed(n) = elapsed(n) + t
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- double precision function timer_read(n)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- integer n
- double precision start(64), elapsed(64)
- common /tt/ start, elapsed
-
- timer_read = elapsed(n)
- return
- end
-
+++ /dev/null
-FMPI_LIB = -L../MPI_dummy -lmpi
-FMPI_INC = -I../MPI_dummy
-CMPI_LIB = -L../MPI_dummy -lmpi
-CMPI_INC = -I../MPI_dummy
-default:: ${PROGRAM} libmpi.a
-libmpi.a:
- cd ../MPI_dummy; $(MAKE) F77=$(MPIF77) CC=$(MPICC)