9 #include "simgrid/instr.h" //TRACE_
18 //---NOTE : all the timer functions have been modified to
19 // avoid global timers (timers are now private, passed in by the caller).
20 // ----------------------- timers ---------------------
21 void timer_clear(double *onetimer) {
26 void timer_start(double *onetimer) {
27 *onetimer = MPI_Wtime();
30 void timer_stop(int n,double *elapsed,double *start) {
38 double timer_read(int n, double *elapsed) { /* ok, useless, but just to keep the function call */
41 /********************************************************************
42 ***************** V R A N L C ******************
43 ***************** *****************/
44 double vranlc(int n, double x, double a, double *y)
47 long i246m1=0x00003FFFFFFFFFFF;
51 // This doesn't work, because the compiler does the calculation in 32
52 // bits and overflows. No standard way (without f90 stuff) to specify
53 // that the rhs should be done in 64 bit arithmetic.
54 // parameter(i246m1=2**46-1)
58 // c Note that the v6 compiler on an R8000 does something stupid with
59 // c the above. Using the following instead (or various other things)
60 // c makes the calculation run almost 10 times as fast.
63 // c data d2m46/0.0d0/
64 // c if (d2m46 .eq. 0.0d0) then
65 // c d2m46 = 0.5d0**46
70 //fprintf(stdout,("================== Vranlc ================");
71 //fprintf(stdout,("Before Loop: Lx = " + Lx + ", La = " + La);
73 for (i=0; i< n; i++) {
76 y[i] = d2m46 * (double)LLx;
79 fprintf(stdout,("After loop 0:");
80 fprintf(stdout,("Lx = " + Lx + ", La = " + La);
81 fprintf(stdout,("d2m46 = " + d2m46);
82 fprintf(stdout,("LLX(Lx) = " + LLX.doubleValue());
83 fprintf(stdout,("Y[0]" + y[0]);
90 fprintf(stdout,("Change: Lx = " + Lx);
91 fprintf(stdout,("=============End Vranlc ================");
98 //-------------- the core (unique function) -----------
99 void doTest(int argc, char **argv) {
100 double dum[3] = {1.,1.,1.};
101 double x1, x2, sx, sy, tm, an, tt, gc;
103 double epsilon=1.0E-8, a = 1220703125., s=271828183.;
104 double t1, t2, t3, t4;
105 double sx_verify_value, sy_verify_value, sx_err, sy_err;
107 #include "npbparams.h"
109 // --> set by make : in npbparams.h
110 //m=28, // for CLASS=A
111 //m=30, // for CLASS=B
114 nn = (int)(pow(2,mm)),
115 nk = (int)(pow(2,mk)),
124 k, nit, no_large_nodes,
126 int me, nprocs, root=0, dp_type;
129 char size[500]; // mind the size of the string to represent a big number
133 double R23, R46, T23, T46;
135 double *qq = (double *) malloc (10000*sizeof(double));
136 double *start = (double *) malloc (64*sizeof(double));
137 double *elapsed = (double *) malloc (64*sizeof(double));
139 double *x = (double *) malloc (2*nk*sizeof(double));
140 double *q = (double *) malloc (nq*sizeof(double));
142 TRACE_smpi_set_category ("start");
144 MPI_Init( &argc, &argv );
145 MPI_Comm_size( MPI_COMM_WORLD, &no_nodes);
146 MPI_Comm_rank( MPI_COMM_WORLD, &node);
154 /* Because the size of the problem is too large to store in a 32-bit
155 * integer for some classes, we put it into a string (for printing).
156 * Have to strip off the decimal point put in there by the floating
157 * point print statement (internal file)
159 fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark");
160 sprintf(size,"%d",(int)pow(2,m+1));
161 //size = size.replace('.', ' ');
162 fprintf(stdout," Number of random numbers generated: %s\n",size);
163 fprintf(stdout," Number of active processes: %d\n",no_nodes);
168 /* c Compute the number of "batches" of random number pairs generated
169 c per processor. Adjust if the number of processors does not evenly
170 c divide the total number
174 no_large_nodes = nn % no_nodes;
175 if (node < no_large_nodes) np_add = 1;
180 fprintf(stdout,"Too many nodes: %d %d",no_nodes,nn);
181 MPI_Abort(MPI_COMM_WORLD,1);
185 /* c Call the random number generator functions and initialize
186 c the x-array to reduce the effects of paging on the timings.
187 c Also, call all mathematical functions that are used. Make
188 c sure these initializations cannot be eliminated as dead code.
191 //call vranlc(0, dum[1], dum[2], dum[3]);
192 // Array indexes start at 1 in Fortran, 0 in C and Java
193 vranlc(0, dum[0], dum[1], &(dum[2]));
195 dum[0] = randlc(&(dum[1]),&(dum[2]));
196 /////////////////////////////////
197 for (i=0;i<2*nk;i++) {
200 Mops = log(sqrt(abs(1)));
203 c---------------------------------------------------------------------
204 c Synchronize before placing time stamp
205 c---------------------------------------------------------------------
207 MPI_Barrier( MPI_COMM_WORLD );
209 TRACE_smpi_set_category ("ep");
211 timer_clear(&(elapsed[1]));
212 timer_clear(&(elapsed[2]));
213 timer_clear(&(elapsed[3]));
214 timer_start(&(start[1]));
217 //fprintf(stdout,("(ep.f:160) t1 = " + t1);
218 t1 = vranlc(0, t1, a, x);
219 //fprintf(stdout,("(ep.f:161) t1 = " + t1);
222 /* c Compute AN = A ^ (2 * NK) (mod 2^46). */
225 //fprintf(stdout,("(ep.f:165) t1 = " + t1);
226 for (i=1; i <= mk+1; i++) {
227 t2 = randlc(&t1, &t1);
228 //fprintf(stdout,("(ep.f:168)[loop i=" + i +"] t1 = " + t1);
231 //fprintf(stdout,("(ep.f:172) s = " + s);
236 for (i=0; i < nq ; i++) {
241 Each instance of this loop may be performed independently. We compute
242 the k offsets separately to take into account the fact that some nodes
243 have more numbers to generate than others
247 k_offset = node * np -1;
249 k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1;
252 for(k = 1; k <= np; k++) {
256 //fprintf(stdout,("(ep.f:193) t1 = " + t1);
259 // Find starting seed t1 for this kk.
261 for (i=1;i<=100 && !stop;i++) {
263 //fprintf(stdout,("(ep.f:199) ik = " +ik+", kk = " + kk);
265 t3 = randlc(&t1, &t2);
266 //fprintf(stdout,("(ep.f:200) t1= " +t1 );
271 t3 = randlc(&t2, &t2);
275 // Compute uniform pseudorandom numbers.
277 //if (timers_enabled) timer_start(3);
278 timer_start(&(start[3]));
279 //call vranlc(2 * nk, t1, a, x) --> t1 and y are modified
281 //fprintf(stdout,">>>>>>>>>>>Before vranlc(l.210)<<<<<<<<<<<<<");
282 //fprintf(stdout,"2*nk = " + (2*nk));
283 //fprintf(stdout,"t1 = " + t1);
284 //fprintf(stdout,"a = " + a);
285 //fprintf(stdout,"x[0] = " + x[0]);
286 //fprintf(stdout,">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
288 t1 = vranlc(2 * nk, t1, a, x);
290 //fprintf(stdout,(">>>>>>>>>>>After Enter vranlc (l.210)<<<<<<");
291 //fprintf(stdout,("2*nk = " + (2*nk));
292 //fprintf(stdout,("t1 = " + t1);
293 //fprintf(stdout,("a = " + a);
294 //fprintf(stdout,("x[0] = " + x[0]);
295 //fprintf(stdout,(">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
297 //if (timers_enabled) timer_stop(3);
298 timer_stop(3,elapsed,start);
300 /* Compute Gaussian deviates by acceptance-rejection method and
301 * tally counts in concentric square annuli. This loop is not
304 //if (timers_enabled) timer_start(2);
305 timer_start(&(start[2]));
306 for(i=1; i<=nk;i++) {
307 x1 = 2. * x[2*i-2] -1.0;
308 x2 = 2. * x[2*i-1] - 1.0;
311 t2 = sqrt(-2. * log(t1) / t1);
314 l = (int)(abs(t3) > abs(t4) ? abs(t3) : abs(t4));
321 fprintf(stdout,"x1 = " + x1);
322 fprintf(stdout,"x2 = " + x2);
323 fprintf(stdout,"t1 = " + t1);
324 fprintf(stdout,"t2 = " + t2);
325 fprintf(stdout,"t3 = " + t3);
326 fprintf(stdout,"t4 = " + t4);
327 fprintf(stdout,"l = " + l);
328 fprintf(stdout,"q[l] = " + q[l]);
329 fprintf(stdout,"sx = " + sx);
330 fprintf(stdout,"sy = " + sy);
334 //if (timers_enabled) timer_stop(2);
335 timer_stop(2,elapsed,start);
338 TRACE_smpi_set_category ("finalize");
340 //int MPI_Allreduce(void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
341 MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
342 sx = x[0]; //FIXME : x[0] or x[1]? => x[0], because Fortran arrays start at index 1
343 MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
345 MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
347 for(i = 0; i < nq; i++) {
350 for(i = 0; i < nq; i++) {
354 timer_stop(1,elapsed,start);
355 tm = timer_read(1,elapsed);
356 MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
364 sx_verify_value = -3.247834652034740E3;
365 sy_verify_value = -6.958407078382297E3;
367 sx_verify_value = -2.863319731645753E3;
368 sy_verify_value = -6.320053679109499E3;
370 sx_verify_value = -4.295875165629892E3;
371 sy_verify_value = -1.580732573678431E4;
373 sx_verify_value = 4.033815542441498E4;
374 sy_verify_value = -2.660669192809235E4;
376 sx_verify_value = 4.764367927995374E4;
377 sy_verify_value = -8.084072988043731E4;
379 sx_verify_value = 1.982481200946593E5;
380 sy_verify_value = -1.020596636361769E5;
386 fprintf(stdout,("sx = " + sx);
387 fprintf(stdout,("sx_verify = " + sx_verify_value);
388 fprintf(stdout,("sy = " + sy);
389 fprintf(stdout,("sy_verify = " + sy_verify_value);
392 sx_err = abs((sx - sx_verify_value)/sx_verify_value);
393 sy_err = abs((sy - sy_verify_value)/sy_verify_value);
395 fprintf(stdout,("sx_err = " + sx_err);
396 fprintf(stdout,("sy_err = " + sx_err);
397 fprintf(stdout,("epsilon= " + epsilon);
399 verified = ((sx_err < epsilon) && (sy_err < epsilon));
402 Mops = (pow(2.0, m+1))/tm/1000;
404 fprintf(stdout,"EP Benchmark Results:\n");
405 fprintf(stdout,"CPU Time=%d\n",(int) tm);
406 fprintf(stdout,"N = 2^%d\n",m);
407 fprintf(stdout,"No. Gaussain Pairs =%d\n",(int) gc);
408 fprintf(stdout,"Sum = %f %ld\n",sx,(long) sy);
409 fprintf(stdout,"Count:");
410 for(i = 0; i < nq; i++) {
411 fprintf(stdout,"%d\t %ld\n",i,(long) q[i]);
415 print_results("EP", _class, m+1, 0, 0, nit, npm, no_nodes, tm, Mops,
416 "Random numbers generated", verified, npbversion,
417 compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) */
418 fprintf(stdout,"\nEP Benchmark Completed\n");
419 fprintf(stdout,"Class = %s\n", _class);
420 fprintf(stdout,"Size = %s\n", size);
421 fprintf(stdout,"Iteration = %d\n", nit);
422 fprintf(stdout,"Time in seconds = %f\n",(tm/1000));
423 fprintf(stdout,"Total processes = %d\n",no_nodes);
424 fprintf(stdout,"Mops/s total = %f\n",Mops);
425 fprintf(stdout,"Mops/s/process = %f\n", Mops/no_nodes);
426 fprintf(stdout,"Operation type = Random number generated\n");
428 fprintf(stdout,"Verification = SUCCESSFUL\n");
430 fprintf(stdout,"Verification = UNSUCCESSFUL\n");
432 fprintf(stdout,"Total time: %f\n",(timer_read(1,elapsed)/1000));
433 fprintf(stdout,"Gaussian pairs: %f\n",(timer_read(2,elapsed)/1000));
434 fprintf(stdout,"Random numbers: %f\n",(timer_read(3,elapsed)/1000));
437 MPE_Finish_log(argv[0]);
443 int main(int argc, char **argv) {