examples/smpi/NAS/EP/ep-sampling.c

   1 #include <stdlib.h>
   2 #include <stdio.h>
   3 #include <string.h>
   4 #include <math.h>
   5
   6 #include "mpi.h"
   7 #include "npbparams.h"
   8
   9 #ifndef CLASS
  10 #define CLASS 'S'
  11 #define NUM_PROCS            1
  12 #endif
  13 #define true 1
  14 #define false 0
  15
  16
  17 //---NOTE : all the timers function have been modified to
  18 //          avoid global timers (privatize these).
  19       // ----------------------- timers ---------------------
  20       void timer_clear(double *onetimer) {
  21             //elapsed[n] = 0.0;
  22             *onetimer = 0.0;
  23       }
  24
  25       void timer_start(double *onetimer) {
  26             *onetimer = MPI_Wtime();
  27       }
  28
  29       void timer_stop(int n,double *elapsed,double *start) {
  30             double t, now;
  31
  32             now = MPI_Wtime();
  33             t = now - start[n];
  34             elapsed[n] += t;
  35       }
  36
  37       double timer_read(int n, double *elapsed) {  /* ok, useless, but jsut to keep function call */
  38             return(elapsed[n]);
  39       }
  40       /********************************************************************
  41        *****************            V R A N L C          ******************
  42        *****************                                 *****************/
  43       double vranlc(int n, double x, double a, double *y)
  44       {
  45         int i;
  46         long  i246m1=0x00003FFFFFFFFFFF;
  47     long  LLx, Lx, La;
  48         double d2m46;
  49
  50 // This doesn't work, because the compiler does the calculation in 32
  51 // bits and overflows. No standard way (without f90 stuff) to specify
  52 // that the rhs should be done in 64 bit arithmetic.
  53 //     parameter(i246m1=2**46-1)
  54
  55       d2m46=pow(0.5,46);
  56
  57 // c Note that the v6 compiler on an R8000 does something stupid with
  58 // c the above. Using the following instead (or various other things)
  59 // c makes the calculation run almost 10 times as fast.
  60 //
  61 // c     save d2m46
  62 // c      data d2m46/0.0d0/
  63 // c      if (d2m46 .eq. 0.0d0) then
  64 // c         d2m46 = 0.5d0**46
  65 // c      endif
  66
  67       Lx = (long)x;
  68       La = (long)a;
  69       //fprintf(stdout,("================== Vranlc ================");
  70       //fprintf(stdout,("Before Loop: Lx = " + Lx + ", La = " + La);
  71   LLx = Lx;
  72   for (i=0; i< n; i++) {
  73       Lx   = Lx*La & i246m1 ;
  74       LLx = Lx;
  75       y[i] = d2m46 * (double)LLx;
  76       /*
  77          if(i == 0) {
  78          fprintf(stdout,("After loop 0:");
  79          fprintf(stdout,("Lx = " + Lx + ", La = " + La);
  80          fprintf(stdout,("d2m46 = " + d2m46);
  81          fprintf(stdout,("LLX(Lx) = " + LLX.doubleValue());
  82          fprintf(stdout,("Y[0]" + y[0]);
  83          }
  84        */
  85   }
  86
  87       x = (double)LLx;
  88       /*
  89       fprintf(stdout,("Change: Lx = " + Lx);
  90       fprintf(stdout,("=============End   Vranlc ================");
  91       */
  92       return x;
  93     }
  94
  95
  96
  97 //-------------- the core (unique function) -----------
  98       void doTest(int argc, char **argv) {
  99       double dum[3] = {1.,1.,1.};
 100       double x1, x2, sx, sy, tm, an, tt, gc;
 101       double Mops;
 102       double epsilon=1.0E-8, a = 1220703125., s=271828183.;
 103       double t1, t2, t3, t4;
 104       double sx_verify_value, sy_verify_value, sx_err, sy_err;
 105
 106 #include "npbparams.h"
 107       int    mk=16,
 108          // --> set by make : in npbparams.h
 109          //m=28, // for CLASS=A
 110          //m=30, // for CLASS=B
 111          //npm=2, // NPROCS
 112          mm = m-mk,
 113          nn = (int)(pow(2,mm)),
 114          nk = (int)(pow(2,mk)),
 115          nq=10,
 116          np,
 117          node,
 118          no_nodes,
 119          i,
 120          ik,
 121          kk,
 122          l,
 123          k, nit, no_large_nodes,
 124          np_add, k_offset, j;
 125       int    me, nprocs, root=0, dp_type;
 126       int verified,
 127           timers_enabled=true;
 128       char  size[500]; // mind the size of the string to represent a big number
 129
 130       //Use in randlc..
 131       int KS = 0;
 132       double R23, R46, T23, T46;
 133
 134       double *qq = (double *) malloc (10000*sizeof(double));
 135       double *start = (double *) malloc (64*sizeof(double));
 136       double *elapsed = (double *) malloc (64*sizeof(double));
 137
 138       double *x = (double *) malloc (2*nk*sizeof(double));
 139       double *q = (double *) malloc (nq*sizeof(double));
 140
 141       MPI_Init( &argc, &argv );
 142       MPI_Comm_size( MPI_COMM_WORLD, &no_nodes);
 143       MPI_Comm_rank( MPI_COMM_WORLD, &node);
 144
 145 #ifdef USE_MPE
 146     MPE_Init_log();
 147 #endif
 148       root = 0;
 149       if (node == root ) {
 150
 151           /*   Because the size of the problem is too large to store in a 32-bit
 152            *   integer for some classes, we put it into a string (for printing).
 153            *   Have to strip off the decimal point put in there by the floating
 154            *   point print statement (internal file)
 155            */
 156           fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark");
 157           sprintf(size,"%d",(int) pow(2,m+1));
 158           //size = size.replace('.', ' ');
 159           fprintf(stdout," Number of random numbers generated: %s\n",size);
 160           fprintf(stdout," Number of active processes: %d\n",no_nodes);
 161
 162       }
 163       verified = false;
 164
 165       /* c   Compute the number of "batches" of random number pairs generated
 166          c   per processor. Adjust if the number of processors does not evenly
 167          c   divide the total number
 168 */
 169
 170        np = nn / no_nodes;
 171        no_large_nodes = nn % no_nodes;
 172        if (node < no_large_nodes) np_add = 1;
 173        else np_add = 0;
 174        np = np + np_add;
 175
 176        if (np == 0) {
 177              fprintf(stdout,"Too many nodes: %d  %d",no_nodes,nn);
 178              MPI_Abort(MPI_COMM_WORLD,1);
 179              exit(0);
 180        }
 181
 182 /* c   Call the random number generator functions and initialize
 183    c   the x-array to reduce the effects of paging on the timings.
 184    c   Also, call all mathematical functions that are used. Make
 185    c   sure these initializations cannot be eliminated as dead code.
 186 */
 187
 188    //call vranlc(0, dum[1], dum[2], dum[3]);
 189    // Array indexes start at 1 in Fortran, 0 in Java
 190    vranlc(0, dum[0], dum[1], &(dum[2]));
 191
 192    dum[0] = randlc(&(dum[1]),&(dum[2]));
 193    /////////////////////////////////
 194    for (i=0;i<2*nk;i++) {
 195        x[i] = -1e99;
 196    }
 197    Mops = log(sqrt(abs(1)));
 198
 199    /*
 200       c---------------------------------------------------------------------
 201       c    Synchronize before placing time stamp
 202       c---------------------------------------------------------------------
 203     */
 204         MPI_Barrier( MPI_COMM_WORLD );
 205
 206         timer_clear(&(elapsed[1]));
 207         timer_clear(&(elapsed[2]));
 208         timer_clear(&(elapsed[3]));
 209         timer_start(&(start[1]));
 210
 211         t1 = a;
 212   //fprintf(stdout,("(ep.f:160) t1 = " + t1);
 213         t1 = vranlc(0, t1, a, x);
 214   //fprintf(stdout,("(ep.f:161) t1 = " + t1);
 215
 216
 217 /* c   Compute AN = A ^ (2 * NK) (mod 2^46). */
 218
 219         t1 = a;
 220   //fprintf(stdout,("(ep.f:165) t1 = " + t1);
 221         for (i=1; i <= mk+1; i++) {
 222                t2 = randlc(&t1, &t1);
 223          //fprintf(stdout,("(ep.f:168)[loop i=" + i +"] t1 = " + t1);
 224         }
 225         an = t1;
 226   //fprintf(stdout,("(ep.f:172) s = " + s);
 227         tt = s;
 228         gc = 0.;
 229         sx = 0.;
 230         sy = 0.;
 231         for (i=0; i < nq ; i++) {
 232                q[i] = 0.;
 233         }
 234
 235 /*
 236     Each instance of this loop may be performed independently. We compute
 237     the k offsets separately to take into account the fact that some nodes
 238     have more numbers to generate than others
 239 */
 240
 241       if (np_add == 1)
 242          k_offset = node * np -1;
 243       else
 244          k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1;
 245
 246       int stop = false;
 247       for(k = 1; k <= np; k++) SMPI_SAMPLE_LOCAL(0.25 * np, 0.03) {
 248          stop = false;
 249          kk = k_offset + k ;
 250          t1 = s;
 251          //fprintf(stdout,("(ep.f:193) t1 = " + t1);
 252          t2 = an;
 253
 254 //       Find starting seed t1 for this kk.
 255
 256          for (i=1;i<=100 && !stop;i++) {
 257             ik = kk / 2;
 258       //fprintf(stdout,("(ep.f:199) ik = " +ik+", kk = " + kk);
 259             if (2 * ik != kk)  {
 260                 t3 = randlc(&t1, &t2);
 261                 //fprintf(stdout,("(ep.f:200) t1= " +t1 );
 262             }
 263             if (ik==0)
 264                 stop = true;
 265             else {
 266                t3 = randlc(&t2, &t2);
 267                kk = ik;
 268            }
 269          }
 270 //       Compute uniform pseudorandom numbers.
 271
 272          //if (timers_enabled)  timer_start(3);
 273    timer_start(&(start[3]));
 274          //call vranlc(2 * nk, t1, a, x)  --> t1 and y are modified
 275
 276   //fprintf(stdout,">>>>>>>>>>>Before vranlc(l.210)<<<<<<<<<<<<<");
 277   //fprintf(stdout,"2*nk = " + (2*nk));
 278   //fprintf(stdout,"t1 = " + t1);
 279   //fprintf(stdout,"a  = " + a);
 280   //fprintf(stdout,"x[0] = " + x[0]);
 281   //fprintf(stdout,">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
 282
 283   t1 = vranlc(2 * nk, t1, a, x);
 284
 285   //fprintf(stdout,(">>>>>>>>>>>After  Enter vranlc (l.210)<<<<<<");
 286   //fprintf(stdout,("2*nk = " + (2*nk));
 287   //fprintf(stdout,("t1 = " + t1);
 288   //fprintf(stdout,("a  = " + a);
 289   //fprintf(stdout,("x[0] = " + x[0]);
 290   //fprintf(stdout,(">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
 291
 292          //if (timers_enabled)  timer_stop(3);
 293    timer_stop(3,elapsed,start);
 294
 295 /*       Compute Gaussian deviates by acceptance-rejection method and
 296  *       tally counts in concentric square annuli.  This loop is not
 297  *       vectorizable.
 298  */
 299          //if (timers_enabled) timer_start(2);
 300     timer_start(&(start[2]));
 301          for(i=1; i<=nk;i++) {
 302             x1 = 2. * x[2*i-2] -1.0;
 303             x2 = 2. * x[2*i-1] - 1.0;
 304             t1 = x1*x1 + x2*x2;
 305             if (t1 <= 1.) {
 306                t2   = sqrt(-2. * log(t1) / t1);
 307                t3   = (x1 * t2);
 308                t4   = (x2 * t2);
 309                l    = (int)(abs(t3) > abs(t4) ? abs(t3) : abs(t4));
 310                q[l] = q[l] + 1.;
 311                sx   = sx + t3;
 312                sy   = sy + t4;
 313              }
 314     /*
 315        if(i == 1) {
 316                 fprintf(stdout,"x1 = " + x1);
 317                 fprintf(stdout,"x2 = " + x2);
 318                 fprintf(stdout,"t1 = " + t1);
 319                 fprintf(stdout,"t2 = " + t2);
 320                 fprintf(stdout,"t3 = " + t3);
 321                 fprintf(stdout,"t4 = " + t4);
 322                 fprintf(stdout,"l = " + l);
 323                 fprintf(stdout,"q[l] = " + q[l]);
 324                 fprintf(stdout,"sx = " + sx);
 325                 fprintf(stdout,"sy = " + sy);
 326        }
 327     */
 328            }
 329          //if (timers_enabled)  timer_stop(2);
 330     timer_stop(2,elapsed,start);
 331       }
 332
 333       //int MPI_Allreduce(void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
 334   MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 335   sx = x[0]; //FIXME :  x[0] or x[1] => x[0] because fortran starts with 1
 336       MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 337       sy = x[0];
 338       MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 339
 340       for(i = 0; i < nq; i++) {
 341     q[i] = x[i];
 342   }
 343   for(i = 0; i < nq; i++) {
 344     gc += q[i];
 345   }
 346
 347   timer_stop(1,elapsed,start);
 348       tm = timer_read(1,elapsed);
 349   MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
 350   tm = x[0];
 351
 352   if(node == root) {
 353     nit = 0;
 354     verified = true;
 355
 356     if(m == 24) {
 357              sx_verify_value = -3.247834652034740E3;
 358                   sy_verify_value = -6.958407078382297E3;
 359     } else if(m == 25) {
 360                 sx_verify_value = -2.863319731645753E3;
 361       sy_verify_value = -6.320053679109499E3;
 362     } else if(m == 28) {
 363             sx_verify_value = -4.295875165629892E3;
 364       sy_verify_value = -1.580732573678431E4;
 365     } else if(m == 30) {
 366             sx_verify_value =  4.033815542441498E4;
 367                   sy_verify_value = -2.660669192809235E4;
 368     } else if(m == 32) {
 369                   sx_verify_value =  4.764367927995374E4;
 370                      sy_verify_value = -8.084072988043731E4;
 371     } else if(m == 36) {
 372             sx_verify_value =  1.982481200946593E5;
 373             sy_verify_value = -1.020596636361769E5;
 374     } else {
 375       verified = false;
 376     }
 377
 378     /*
 379     fprintf(stdout,("sx        = " + sx);
 380     fprintf(stdout,("sx_verify = " + sx_verify_value);
 381     fprintf(stdout,("sy        = " + sy);
 382     fprintf(stdout,("sy_verify = " + sy_verify_value);
 383     */
 384     if(verified) {
 385       sx_err = abs((sx - sx_verify_value)/sx_verify_value);
 386       sy_err = abs((sy - sy_verify_value)/sy_verify_value);
 387       /*
 388       fprintf(stdout,("sx_err = " + sx_err);
 389       fprintf(stdout,("sy_err = " + sx_err);
 390       fprintf(stdout,("epsilon= " + epsilon);
 391       */
 392       verified = ((sx_err < epsilon) && (sy_err < epsilon));
 393     }
 394
 395     Mops = (pow(2.0, m+1))/tm/1000;
 396
 397     fprintf(stdout,"EP Benchmark Results:\n");
 398     fprintf(stdout,"CPU Time=%d\n",(int) tm);
 399     fprintf(stdout,"N = 2^%d\n",m);
 400     fprintf(stdout,"No. Gaussain Pairs =%d\n",(int) gc);
 401     fprintf(stdout,"Sum = %f %ld\n",sx,(long) sy);
 402     fprintf(stdout,"Count:");
 403     for(i = 0; i < nq; i++) {
 404       fprintf(stdout,"%d\t %ld\n",i,(long) q[i]);
 405     }
 406
 407     /*
 408     print_results("EP", _class, m+1, 0, 0, nit, npm, no_nodes, tm, Mops,
 409         "Random numbers generated", verified, npbversion,
 410         compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) */
 411     fprintf(stdout,"\nEP Benchmark Completed\n");
 412             fprintf(stdout,"Class           = %s\n", _class);
 413     fprintf(stdout,"Size            = %s\n", size);
 414     fprintf(stdout,"Iteration       = %d\n", nit);
 415     fprintf(stdout,"Time in seconds = %f\n",(tm/1000));
 416     fprintf(stdout,"Total processes = %d\n",no_nodes);
 417     fprintf(stdout,"Mops/s total    = %f\n",Mops);
 418     fprintf(stdout,"Mops/s/process  = %f\n", Mops/no_nodes);
 419     fprintf(stdout,"Operation type  = Random number generated\n");
 420     if(verified) {
 421       fprintf(stdout,"Verification    = SUCCESSFUL\n");
 422     } else {
 423       fprintf(stdout,"Verification    = UNSUCCESSFUL\n");
 424     }
 425              fprintf(stdout,"Total time:     %f\n",(timer_read(1,elapsed)/1000));
 426              fprintf(stdout,"Gaussian pairs: %f\n",(timer_read(2,elapsed)/1000));
 427              fprintf(stdout,"Random numbers: %f\n",(timer_read(3,elapsed)/1000));
 428          }
 429 #ifdef USE_MPE
 430     MPE_Finish_log(argv[0]);
 431 #endif
 432
 433        MPI_Finalize();
 434       }
 435
 436     int main(int argc, char **argv) {
 437        doTest(argc,argv);
 438     }