Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Added an example with iteration sampling.
[simgrid.git] / examples / smpi / NAS / EP-folding / ep.c
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <string.h>
4 #include <math.h>
5
6 #include "mpi.h"
7 #include "npbparams.h"
8
9 #include "randlc.h"
10
11 #ifndef CLASS
12 #define CLASS 'S'
13 #define NUM_PROCS            1                 
14 #endif
15 #define true 1
16 #define false 0
17
18
19 //---NOTE : all the timers function have been modified to
20 //          avoid global timers (privatize these). 
21       // ----------------------- timers ---------------------
22       void timer_clear(double *onetimer) {
23             //elapsed[n] = 0.0;
24             *onetimer = 0.0;
25       }
26
27       void timer_start(double *onetimer) {
28             *onetimer = MPI_Wtime();
29       }
30
31       void timer_stop(int n,double *elapsed,double *start) {
32             double t, now;
33
34             now = MPI_Wtime();
35             t = now - start[n];
36             elapsed[n] += t;
37       }
38
39       double timer_read(int n, double *elapsed) {  /* ok, useless, but jsut to keep function call */
40             return(elapsed[n]);
41       }
42       /********************************************************************
43        *****************            V R A N L C          ******************
44        *****************                                 *****************/           
45       double vranlc(int n, double x, double a, double *y)
46       {
47         int i;
48         long  i246m1=0x00003FFFFFFFFFFF;
49           long  LLx, Lx, La;
50         double d2m46;
51
52 // This doesn't work, because the compiler does the calculation in 32
53 // bits and overflows. No standard way (without f90 stuff) to specify
54 // that the rhs should be done in 64 bit arithmetic.
55 //     parameter(i246m1=2**46-1)
56
57       d2m46=pow(0.5,46);
58
59 // c Note that the v6 compiler on an R8000 does something stupid with
60 // c the above. Using the following instead (or various other things)
61 // c makes the calculation run almost 10 times as fast.
62 //
63 // c     save d2m46
64 // c      data d2m46/0.0d0/
65 // c      if (d2m46 .eq. 0.0d0) then
66 // c         d2m46 = 0.5d0**46
67 // c      endif
68
69       Lx = (long)x;
70       La = (long)a;
71       //fprintf(stdout,("================== Vranlc ================");
72       //fprintf(stdout,("Before Loop: Lx = " + Lx + ", La = " + La);
73         LLx = Lx;
74         for (i=0; i< n; i++) {
75                   Lx   = Lx*La & i246m1 ;
76                   LLx = Lx;
77                   y[i] = d2m46 * (double)LLx;
78                   /*
79                      if(i == 0) {
80                      fprintf(stdout,("After loop 0:");
81                      fprintf(stdout,("Lx = " + Lx + ", La = " + La);
82                      fprintf(stdout,("d2m46 = " + d2m46);
83                      fprintf(stdout,("LLX(Lx) = " + LLX.doubleValue());
84                      fprintf(stdout,("Y[0]" + y[0]);
85                      }
86                    */
87         }
88
89       x = (double)LLx;
90       /*
91       fprintf(stdout,("Change: Lx = " + Lx);
92       fprintf(stdout,("=============End   Vranlc ================");
93       */
94       return x;
95     }
96
97
98
99 //-------------- the core (unique function) -----------
100       void doTest(int argc, char **argv) {
101                   double dum[3] = {1.,1.,1.};
102                   double x1, x2, sx, sy, tm, an, tt, gc;
103                   double Mops;
104                   double epsilon=1.0E-8, a = 1220703125., s=271828183.;
105                   double t1, t2, t3, t4; 
106                   double sx_verify_value, sy_verify_value, sx_err, sy_err;
107
108 #include "npbparams.h"
109                   int    mk=16, 
110                            // --> set by make : in npbparams.h
111                            //m=28, // for CLASS=A
112                            //m=30, // for CLASS=B
113                            //npm=2, // NPROCS
114                            mm = m-mk, 
115                            nn = (int)(pow(2,mm)), 
116                            nk = (int)(pow(2,mk)), 
117                            nq=10, 
118                            np, 
119                            node, 
120                            no_nodes, 
121                            i, 
122                            ik, 
123                            kk, 
124                            l, 
125                            k, nit, no_large_nodes,
126                            np_add, k_offset, j;
127                   int    me, nprocs, root=0, dp_type;
128                   int verified, 
129                             timers_enabled=true;
130                   char  size[500]; // mind the size of the string to represent a big number
131
132                   //Use in randlc..
133                   int KS = 0;
134                   double R23, R46, T23, T46;
135
136                   double *qq = (double *) malloc (10000*sizeof(double));
137                   double *start = (double *) malloc (64*sizeof(double));
138                   double *elapsed = (double *) malloc (64*sizeof(double));
139
140                   double *x = (double *) malloc (2*nk*sizeof(double));
141                   double *q = (double *) malloc (nq*sizeof(double));
142
143                   MPI_Init( &argc, &argv );
144                   MPI_Comm_size( MPI_COMM_WORLD, &no_nodes);
145                   MPI_Comm_rank( MPI_COMM_WORLD, &node);
146
147 #ifdef USE_MPE
148     MPE_Init_log();
149 #endif
150                   root = 0;
151                   if (node == root ) {
152
153                             /*   Because the size of the problem is too large to store in a 32-bit
154                              *   integer for some classes, we put it into a string (for printing).
155                              *   Have to strip off the decimal point put in there by the floating
156                              *   point print statement (internal file)
157                              */
158                             fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark");
159                             sprintf(size,"%d",pow(2,m+1));
160                             //size = size.replace('.', ' ');
161                             fprintf(stdout," Number of random numbers generated: %s\n",size);
162                             fprintf(stdout," Number of active processes: %d\n",no_nodes);
163
164                   }
165                   verified = false;
166
167                   /* c   Compute the number of "batches" of random number pairs generated 
168                      c   per processor. Adjust if the number of processors does not evenly 
169                      c   divide the total number
170 */
171
172        np = nn / no_nodes;
173        no_large_nodes = nn % no_nodes;
174        if (node < no_large_nodes) np_add = 1;
175        else np_add = 0;
176        np = np + np_add;
177
178        if (np == 0) {
179              fprintf(stdout,"Too many nodes: %d  %d",no_nodes,nn);
180              MPI_Abort(MPI_COMM_WORLD,1);
181              exit(0); 
182        } 
183
184 /* c   Call the random number generator functions and initialize
185    c   the x-array to reduce the effects of paging on the timings.
186    c   Also, call all mathematical functions that are used. Make
187    c   sure these initializations cannot be eliminated as dead code.
188 */
189
190          //call vranlc(0, dum[1], dum[2], dum[3]);
191          // Array indexes start at 1 in Fortran, 0 in Java
192          vranlc(0, dum[0], dum[1], &(dum[2])); 
193
194          dum[0] = randlc(&(dum[1]),&(dum[2]));
195          /////////////////////////////////
196          for (i=0;i<2*nk;i++) {
197                    x[i] = -1e99;
198          }
199          Mops = log(sqrt(abs(1))); 
200
201          /*
202             c---------------------------------------------------------------------
203             c    Synchronize before placing time stamp
204             c---------------------------------------------------------------------
205           */
206         MPI_Barrier( MPI_COMM_WORLD );
207
208         timer_clear(&(elapsed[1]));
209         timer_clear(&(elapsed[2]));
210         timer_clear(&(elapsed[3]));
211         timer_start(&(start[1]));
212         
213         t1 = a;
214         //fprintf(stdout,("(ep.f:160) t1 = " + t1);
215         t1 = vranlc(0, t1, a, x);
216         //fprintf(stdout,("(ep.f:161) t1 = " + t1);
217         
218         
219 /* c   Compute AN = A ^ (2 * NK) (mod 2^46). */
220         
221         t1 = a;
222         //fprintf(stdout,("(ep.f:165) t1 = " + t1);
223         for (i=1; i <= mk+1; i++) {
224                t2 = randlc(&t1, &t1);
225                //fprintf(stdout,("(ep.f:168)[loop i=" + i +"] t1 = " + t1);
226         } 
227         an = t1;
228         //fprintf(stdout,("(ep.f:172) s = " + s);
229         tt = s;
230         gc = 0.;
231         sx = 0.;
232         sy = 0.;
233         for (i=0; i < nq ; i++) {
234                q[i] = 0.;
235         }
236
237 /*
238     Each instance of this loop may be performed independently. We compute
239     the k offsets separately to take into account the fact that some nodes
240     have more numbers to generate than others
241 */
242
243       if (np_add == 1)
244          k_offset = node * np -1;
245       else
246          k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1;
247      
248       int stop = false;
249       for(k = 1; k <= np; k++) SMPI_SAMPLE_LOCAL(0.25 * np) {
250          stop = false;
251          kk = k_offset + k ;
252          t1 = s;
253          //fprintf(stdout,("(ep.f:193) t1 = " + t1);
254          t2 = an;
255
256 //       Find starting seed t1 for this kk.
257
258          for (i=1;i<=100 && !stop;i++) {
259             ik = kk / 2;
260             //fprintf(stdout,("(ep.f:199) ik = " +ik+", kk = " + kk);
261             if (2 * ik != kk)  {
262                 t3 = randlc(&t1, &t2);
263                 //fprintf(stdout,("(ep.f:200) t1= " +t1 );
264             }
265             if (ik==0)
266                 stop = true;
267             else {
268                t3 = randlc(&t2, &t2);
269                kk = ik;
270            }
271          }
272 //       Compute uniform pseudorandom numbers.
273
274          //if (timers_enabled)  timer_start(3);
275          timer_start(&(start[3]));
276          //call vranlc(2 * nk, t1, a, x)  --> t1 and y are modified
277
278         //fprintf(stdout,">>>>>>>>>>>Before vranlc(l.210)<<<<<<<<<<<<<");
279         //fprintf(stdout,"2*nk = " + (2*nk));
280         //fprintf(stdout,"t1 = " + t1);
281         //fprintf(stdout,"a  = " + a);
282         //fprintf(stdout,"x[0] = " + x[0]);
283         //fprintf(stdout,">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
284         
285         t1 = vranlc(2 * nk, t1, a, x);
286
287         //fprintf(stdout,(">>>>>>>>>>>After  Enter vranlc (l.210)<<<<<<");
288         //fprintf(stdout,("2*nk = " + (2*nk));
289         //fprintf(stdout,("t1 = " + t1);
290         //fprintf(stdout,("a  = " + a);
291         //fprintf(stdout,("x[0] = " + x[0]);
292         //fprintf(stdout,(">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
293         
294          //if (timers_enabled)  timer_stop(3);
295          timer_stop(3,elapsed,start);
296
297 /*       Compute Gaussian deviates by acceptance-rejection method and 
298  *       tally counts in concentric square annuli.  This loop is not 
299  *       vectorizable. 
300  */
301          //if (timers_enabled) timer_start(2);
302          timer_start(&(start[2]));
303          for(i=1; i<=nk;i++) {
304             x1 = 2. * x[2*i-2] -1.0;
305             x2 = 2. * x[2*i-1] - 1.0;
306             t1 = x1*x1 + x2*x2;
307             if (t1 <= 1.) {
308                t2   = sqrt(-2. * log(t1) / t1);
309                t3   = (x1 * t2);
310                t4   = (x2 * t2);
311                l    = (int)(abs(t3) > abs(t4) ? abs(t3) : abs(t4));
312                q[l] = q[l] + 1.;
313                sx   = sx + t3;
314                sy   = sy + t4;
315              }
316                 /*
317              if(i == 1) {
318                 fprintf(stdout,"x1 = " + x1);
319                 fprintf(stdout,"x2 = " + x2);
320                 fprintf(stdout,"t1 = " + t1);
321                 fprintf(stdout,"t2 = " + t2);
322                 fprintf(stdout,"t3 = " + t3);
323                 fprintf(stdout,"t4 = " + t4);
324                 fprintf(stdout,"l = " + l);
325                 fprintf(stdout,"q[l] = " + q[l]);
326                 fprintf(stdout,"sx = " + sx);
327                 fprintf(stdout,"sy = " + sy);
328              }
329                 */
330            }
331          //if (timers_enabled)  timer_stop(2);
332          timer_stop(2,elapsed,start);
333       }
334
335       //int MPI_Allreduce(void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)   
336         MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
337         sx = x[0]; //FIXME :  x[0] or x[1] => x[0] because fortran starts with 1
338       MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
339       sy = x[0];
340       MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
341
342       for(i = 0; i < nq; i++) {
343                 q[i] = x[i];
344         }
345         for(i = 0; i < nq; i++) {
346                 gc += q[i];
347         }
348
349         timer_stop(1,elapsed,start);
350       tm = timer_read(1,elapsed);
351         MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
352         tm = x[0];
353
354         if(node == root) {
355                 nit = 0;
356                 verified = true;
357
358                 if(m == 24) {
359                         sx_verify_value = -3.247834652034740E3;
360                         sy_verify_value = -6.958407078382297E3;
361                 } else if(m == 25) {
362                         sx_verify_value = -2.863319731645753E3;
363                         sy_verify_value = -6.320053679109499E3;
364                 } else if(m == 28) {
365                         sx_verify_value = -4.295875165629892E3;
366                         sy_verify_value = -1.580732573678431E4;
367                 } else if(m == 30) {
368                         sx_verify_value =  4.033815542441498E4;
369                         sy_verify_value = -2.660669192809235E4;
370                 } else if(m == 32) {
371                         sx_verify_value =  4.764367927995374E4;
372                         sy_verify_value = -8.084072988043731E4;
373                 } else if(m == 36) {
374                         sx_verify_value =  1.982481200946593E5;
375                         sy_verify_value = -1.020596636361769E5;
376                 } else {
377                         verified = false;
378                 }
379
380                 /*
381                 fprintf(stdout,("sx        = " + sx);
382                 fprintf(stdout,("sx_verify = " + sx_verify_value);
383                 fprintf(stdout,("sy        = " + sy);
384                 fprintf(stdout,("sy_verify = " + sy_verify_value);
385                 */
386                 if(verified) {
387                         sx_err = abs((sx - sx_verify_value)/sx_verify_value);
388                         sy_err = abs((sy - sy_verify_value)/sy_verify_value);
389                         /*
390                         fprintf(stdout,("sx_err = " + sx_err);
391                         fprintf(stdout,("sy_err = " + sx_err);
392                         fprintf(stdout,("epsilon= " + epsilon);
393                         */
394                         verified = ((sx_err < epsilon) && (sy_err < epsilon));
395                 }
396
397                 Mops = (pow(2.0, m+1))/tm/1000;
398
399                 fprintf(stdout,"EP Benchmark Results:\n");
400                 fprintf(stdout,"CPU Time=%d\n",tm);
401                 fprintf(stdout,"N = 2^%d\n",m);
402                 fprintf(stdout,"No. Gaussain Pairs =%d\n",gc);
403                 fprintf(stdout,"Sum = %lf %ld\n",sx,sy);
404                 fprintf(stdout,"Count:");
405                 for(i = 0; i < nq; i++) {
406                         fprintf(stdout,"%d\t %ld\n",i,q[i]);
407                 }
408
409                 /*
410                 print_results("EP", _class, m+1, 0, 0, nit, npm, no_nodes, tm, Mops,
411                                 "Random numbers generated", verified, npbversion,
412                                 compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) */
413                 fprintf(stdout,"\nEP Benchmark Completed\n");
414             fprintf(stdout,"Class           = %s\n", _class);
415                 fprintf(stdout,"Size            = %s\n", size);
416                 fprintf(stdout,"Iteration       = %d\n", nit);
417                 fprintf(stdout,"Time in seconds = %lf\n",(tm/1000));
418                 fprintf(stdout,"Total processes = %d\n",no_nodes);
419                 fprintf(stdout,"Mops/s total    = %lf\n",Mops);
420                 fprintf(stdout,"Mops/s/process  = %lf\n", Mops/no_nodes);
421                 fprintf(stdout,"Operation type  = Random number generated\n");
422                 if(verified) {
423                         fprintf(stdout,"Verification    = SUCCESSFUL\n");
424                 } else {
425                         fprintf(stdout,"Verification    = UNSUCCESSFUL\n");
426                 }
427                 fprintf(stdout,"Total time:     %lf\n",(timer_read(1,elapsed)/1000));
428                 fprintf(stdout,"Gaussian pairs: %lf\n",(timer_read(2,elapsed)/1000));
429                 fprintf(stdout,"Random numbers: %lf\n",(timer_read(3,elapsed)/1000));
430         }
431 #ifdef USE_MPE
432     MPE_Finish_log(argv[0]);
433 #endif
434  
435        MPI_Finalize();
436       }
437
438     int main(int argc, char **argv) {
439        doTest(argc,argv);
440     }