From 77ce8d32ddc82aa2c106a8e2401537d4c16d57c0 Mon Sep 17 00:00:00 2001
From: pini <pini@48e7efb5-ca39-0410-a469-dd3cf9ba447f>
Date: Tue, 30 Nov 2010 15:44:18 +0000
Subject: [PATCH] Added an example with iteration sampling.

git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/simgrid/simgrid/trunk@8761 48e7efb5-ca39-0410-a469-dd3cf9ba447f
---
 examples/smpi/NAS/EP-folding/README   |   6 +
 examples/smpi/NAS/EP-folding/ep.c     | 440 ++++++++++++++++++++++++++
 examples/smpi/NAS/EP-folding/randlc.c | 107 +++++++
 examples/smpi/NAS/EP-folding/randlc.h |   3 +
 examples/smpi/NAS/Makefile            |   4 +
 5 files changed, 560 insertions(+)
 create mode 100644 examples/smpi/NAS/EP-folding/README
 create mode 100644 examples/smpi/NAS/EP-folding/ep.c
 create mode 100644 examples/smpi/NAS/EP-folding/randlc.c
 create mode 100644 examples/smpi/NAS/EP-folding/randlc.h

diff --git a/examples/smpi/NAS/EP-folding/README b/examples/smpi/NAS/EP-folding/README
new file mode 100644
index 0000000000..6eb36571af
--- /dev/null
+++ b/examples/smpi/NAS/EP-folding/README
@@ -0,0 +1,6 @@
+This code implements the random-number generator described in the
+NAS Parallel Benchmark document RNR Technical Report RNR-94-007.
+The code is "embarrassingly" parallel in that no communication is
+required for the generation of the random numbers itself. There is
+no special requirement on the number of processors used for running
+the benchmark.
diff --git a/examples/smpi/NAS/EP-folding/ep.c b/examples/smpi/NAS/EP-folding/ep.c
new file mode 100644
index 0000000000..41de955d7b
--- /dev/null
+++ b/examples/smpi/NAS/EP-folding/ep.c
@@ -0,0 +1,440 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#include "mpi.h"
+#include "npbparams.h"
+
+#include "randlc.h"
+
+#ifndef CLASS
+#define CLASS 'S'
+#define NUM_PROCS            1                 
+#endif
+#define true 1
+#define false 0
+
+
+//---NOTE : all the timers function have been modified to
+//          avoid global timers (privatize these). 
+      // ----------------------- timers ---------------------
+      void timer_clear(double *onetimer) {
+            //elapsed[n] = 0.0;
+            *onetimer = 0.0;
+      }
+
+      void timer_start(double *onetimer) {
+            *onetimer = MPI_Wtime();
+      }
+
+      void timer_stop(int n,double *elapsed,double *start) {
+            double t, now;
+
+            now = MPI_Wtime();
+            t = now - start[n];
+            elapsed[n] += t;
+      }
+
+      double timer_read(int n, double *elapsed) {  /* ok, useless, but jsut to keep function call */
+            return(elapsed[n]);
+      }
+      /********************************************************************
+       *****************            V R A N L C          ******************
+       *****************                                 *****************/           
+      double vranlc(int n, double x, double a, double *y)
+      {
+        int i;
+        long  i246m1=0x00003FFFFFFFFFFF;
+	  long  LLx, Lx, La;
+        double d2m46;
+
+// This doesn't work, because the compiler does the calculation in 32
+// bits and overflows. No standard way (without f90 stuff) to specify
+// that the rhs should be done in 64 bit arithmetic.
+//     parameter(i246m1=2**46-1)
+
+      d2m46=pow(0.5,46);
+
+// c Note that the v6 compiler on an R8000 does something stupid with
+// c the above. Using the following instead (or various other things)
+// c makes the calculation run almost 10 times as fast.
+//
+// c     save d2m46
+// c      data d2m46/0.0d0/
+// c      if (d2m46 .eq. 0.0d0) then
+// c         d2m46 = 0.5d0**46
+// c      endif
+
+      Lx = (long)x;
+      La = (long)a;
+      //fprintf(stdout,("================== Vranlc ================");
+      //fprintf(stdout,("Before Loop: Lx = " + Lx + ", La = " + La);
+	LLx = Lx;
+	for (i=0; i< n; i++) {
+		  Lx   = Lx*La & i246m1 ;
+		  LLx = Lx;
+		  y[i] = d2m46 * (double)LLx;
+		  /*
+		     if(i == 0) {
+		     fprintf(stdout,("After loop 0:");
+		     fprintf(stdout,("Lx = " + Lx + ", La = " + La);
+		     fprintf(stdout,("d2m46 = " + d2m46);
+		     fprintf(stdout,("LLX(Lx) = " + LLX.doubleValue());
+		     fprintf(stdout,("Y[0]" + y[0]);
+		     }
+		   */
+	}
+
+      x = (double)LLx;
+      /*
+      fprintf(stdout,("Change: Lx = " + Lx);
+      fprintf(stdout,("=============End   Vranlc ================");
+      */
+      return x;
+    }
+
+
+
+//-------------- the core (unique function) -----------
+      void doTest(int argc, char **argv) {
+		  double dum[3] = {1.,1.,1.};
+		  double x1, x2, sx, sy, tm, an, tt, gc;
+		  double Mops;
+		  double epsilon=1.0E-8, a = 1220703125., s=271828183.;
+		  double t1, t2, t3, t4; 
+		  double sx_verify_value, sy_verify_value, sx_err, sy_err;
+
+#include "npbparams.h"
+		  int    mk=16, 
+			   // --> set by make : in npbparams.h
+			   //m=28, // for CLASS=A
+			   //m=30, // for CLASS=B
+			   //npm=2, // NPROCS
+			   mm = m-mk, 
+			   nn = (int)(pow(2,mm)), 
+			   nk = (int)(pow(2,mk)), 
+			   nq=10, 
+			   np, 
+			   node, 
+			   no_nodes, 
+			   i, 
+			   ik, 
+			   kk, 
+			   l, 
+			   k, nit, no_large_nodes,
+			   np_add, k_offset, j;
+		  int    me, nprocs, root=0, dp_type;
+		  int verified, 
+			    timers_enabled=true;
+		  char  size[500]; // mind the size of the string to represent a big number
+
+		  //Use in randlc..
+		  int KS = 0;
+		  double R23, R46, T23, T46;
+
+		  double *qq = (double *) malloc (10000*sizeof(double));
+		  double *start = (double *) malloc (64*sizeof(double));
+		  double *elapsed = (double *) malloc (64*sizeof(double));
+
+		  double *x = (double *) malloc (2*nk*sizeof(double));
+		  double *q = (double *) malloc (nq*sizeof(double));
+
+		  MPI_Init( &argc, &argv );
+		  MPI_Comm_size( MPI_COMM_WORLD, &no_nodes);
+		  MPI_Comm_rank( MPI_COMM_WORLD, &node);
+
+#ifdef USE_MPE
+    MPE_Init_log();
+#endif
+		  root = 0;
+		  if (node == root ) {
+
+			    /*   Because the size of the problem is too large to store in a 32-bit
+			     *   integer for some classes, we put it into a string (for printing).
+			     *   Have to strip off the decimal point put in there by the floating
+			     *   point print statement (internal file)
+			     */
+			    fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark");
+			    sprintf(size,"%d",pow(2,m+1));
+			    //size = size.replace('.', ' ');
+			    fprintf(stdout," Number of random numbers generated: %s\n",size);
+			    fprintf(stdout," Number of active processes: %d\n",no_nodes);
+
+		  }
+		  verified = false;
+
+		  /* c   Compute the number of "batches" of random number pairs generated 
+		     c   per processor. Adjust if the number of processors does not evenly 
+		     c   divide the total number
+*/
+
+       np = nn / no_nodes;
+       no_large_nodes = nn % no_nodes;
+       if (node < no_large_nodes) np_add = 1;
+       else np_add = 0;
+       np = np + np_add;
+
+       if (np == 0) {
+             fprintf(stdout,"Too many nodes: %d  %d",no_nodes,nn);
+             MPI_Abort(MPI_COMM_WORLD,1);
+             exit(0); 
+       } 
+
+/* c   Call the random number generator functions and initialize
+   c   the x-array to reduce the effects of paging on the timings.
+   c   Also, call all mathematical functions that are used. Make
+   c   sure these initializations cannot be eliminated as dead code.
+*/
+
+	 //call vranlc(0, dum[1], dum[2], dum[3]);
+	 // Array indexes start at 1 in Fortran, 0 in Java
+	 vranlc(0, dum[0], dum[1], &(dum[2])); 
+
+	 dum[0] = randlc(&(dum[1]),&(dum[2]));
+	 /////////////////////////////////
+	 for (i=0;i<2*nk;i++) {
+		   x[i] = -1e99;
+	 }
+	 Mops = log(sqrt(abs(1))); 
+
+	 /*
+	    c---------------------------------------------------------------------
+	    c    Synchronize before placing time stamp
+	    c---------------------------------------------------------------------
+	  */
+        MPI_Barrier( MPI_COMM_WORLD );
+
+        timer_clear(&(elapsed[1]));
+        timer_clear(&(elapsed[2]));
+        timer_clear(&(elapsed[3]));
+        timer_start(&(start[1]));
+        
+        t1 = a;
+	//fprintf(stdout,("(ep.f:160) t1 = " + t1);
+        t1 = vranlc(0, t1, a, x);
+	//fprintf(stdout,("(ep.f:161) t1 = " + t1);
+	
+        
+/* c   Compute AN = A ^ (2 * NK) (mod 2^46). */
+        
+        t1 = a;
+	//fprintf(stdout,("(ep.f:165) t1 = " + t1);
+        for (i=1; i <= mk+1; i++) {
+               t2 = randlc(&t1, &t1);
+	       //fprintf(stdout,("(ep.f:168)[loop i=" + i +"] t1 = " + t1);
+        } 
+        an = t1;
+	//fprintf(stdout,("(ep.f:172) s = " + s);
+        tt = s;
+        gc = 0.;
+        sx = 0.;
+        sy = 0.;
+        for (i=0; i < nq ; i++) {
+               q[i] = 0.;
+        }
+
+/*
+    Each instance of this loop may be performed independently. We compute
+    the k offsets separately to take into account the fact that some nodes
+    have more numbers to generate than others
+*/
+
+      if (np_add == 1)
+         k_offset = node * np -1;
+      else
+         k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1;
+     
+      int stop = false;
+      for(k = 1; k <= np; k++) SMPI_SAMPLE_LOCAL(0.25 * np) {
+         stop = false;
+         kk = k_offset + k ;
+         t1 = s;
+         //fprintf(stdout,("(ep.f:193) t1 = " + t1);
+         t2 = an;
+
+//       Find starting seed t1 for this kk.
+
+         for (i=1;i<=100 && !stop;i++) {
+            ik = kk / 2;
+	    //fprintf(stdout,("(ep.f:199) ik = " +ik+", kk = " + kk);
+            if (2 * ik != kk)  {
+                t3 = randlc(&t1, &t2);
+                //fprintf(stdout,("(ep.f:200) t1= " +t1 );
+            }
+            if (ik==0)
+                stop = true;
+            else {
+               t3 = randlc(&t2, &t2);
+               kk = ik;
+           }
+         }
+//       Compute uniform pseudorandom numbers.
+
+         //if (timers_enabled)  timer_start(3);
+	 timer_start(&(start[3]));
+         //call vranlc(2 * nk, t1, a, x)  --> t1 and y are modified
+
+	//fprintf(stdout,">>>>>>>>>>>Before vranlc(l.210)<<<<<<<<<<<<<");
+	//fprintf(stdout,"2*nk = " + (2*nk));
+	//fprintf(stdout,"t1 = " + t1);
+	//fprintf(stdout,"a  = " + a);
+	//fprintf(stdout,"x[0] = " + x[0]);
+	//fprintf(stdout,">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
+        
+	t1 = vranlc(2 * nk, t1, a, x);
+
+	//fprintf(stdout,(">>>>>>>>>>>After  Enter vranlc (l.210)<<<<<<");
+	//fprintf(stdout,("2*nk = " + (2*nk));
+	//fprintf(stdout,("t1 = " + t1);
+	//fprintf(stdout,("a  = " + a);
+	//fprintf(stdout,("x[0] = " + x[0]);
+	//fprintf(stdout,(">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
+        
+         //if (timers_enabled)  timer_stop(3);
+	 timer_stop(3,elapsed,start);
+
+/*       Compute Gaussian deviates by acceptance-rejection method and 
+ *       tally counts in concentric square annuli.  This loop is not 
+ *       vectorizable. 
+ */
+         //if (timers_enabled) timer_start(2);
+ 	 timer_start(&(start[2]));
+         for(i=1; i<=nk;i++) {
+            x1 = 2. * x[2*i-2] -1.0;
+            x2 = 2. * x[2*i-1] - 1.0;
+            t1 = x1*x1 + x2*x2;
+            if (t1 <= 1.) {
+               t2   = sqrt(-2. * log(t1) / t1);
+               t3   = (x1 * t2);
+               t4   = (x2 * t2);
+               l    = (int)(abs(t3) > abs(t4) ? abs(t3) : abs(t4));
+               q[l] = q[l] + 1.;
+               sx   = sx + t3;
+               sy   = sy + t4;
+             }
+		/*
+	     if(i == 1) {
+                fprintf(stdout,"x1 = " + x1);
+                fprintf(stdout,"x2 = " + x2);
+                fprintf(stdout,"t1 = " + t1);
+                fprintf(stdout,"t2 = " + t2);
+                fprintf(stdout,"t3 = " + t3);
+                fprintf(stdout,"t4 = " + t4);
+                fprintf(stdout,"l = " + l);
+                fprintf(stdout,"q[l] = " + q[l]);
+                fprintf(stdout,"sx = " + sx);
+                fprintf(stdout,"sy = " + sy);
+	     }
+		*/
+           }
+         //if (timers_enabled)  timer_stop(2);
+ 	 timer_stop(2,elapsed,start);
+      }
+
+      //int MPI_Allreduce(void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)   
+	MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+	sx = x[0]; //FIXME :  x[0] or x[1] => x[0] because fortran starts with 1
+      MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+      sy = x[0];
+      MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+      for(i = 0; i < nq; i++) {
+		q[i] = x[i];
+	}
+	for(i = 0; i < nq; i++) {
+		gc += q[i];
+	}
+
+	timer_stop(1,elapsed,start);
+      tm = timer_read(1,elapsed);
+	MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+	tm = x[0];
+
+	if(node == root) {
+		nit = 0;
+		verified = true;
+
+		if(m == 24) {
+         		sx_verify_value = -3.247834652034740E3;
+	                sy_verify_value = -6.958407078382297E3;
+		} else if(m == 25) {
+	            	sx_verify_value = -2.863319731645753E3;
+			sy_verify_value = -6.320053679109499E3;
+		} else if(m == 28) {
+		        sx_verify_value = -4.295875165629892E3;
+			sy_verify_value = -1.580732573678431E4;
+		} else if(m == 30) {
+	        	sx_verify_value =  4.033815542441498E4;
+	                sy_verify_value = -2.660669192809235E4;
+		} else if(m == 32) {
+	                sx_verify_value =  4.764367927995374E4;
+                   	sy_verify_value = -8.084072988043731E4;
+		} else if(m == 36) {
+		        sx_verify_value =  1.982481200946593E5;
+		        sy_verify_value = -1.020596636361769E5;
+		} else {
+			verified = false;
+		}
+
+		/*
+		fprintf(stdout,("sx        = " + sx);
+		fprintf(stdout,("sx_verify = " + sx_verify_value);
+		fprintf(stdout,("sy        = " + sy);
+		fprintf(stdout,("sy_verify = " + sy_verify_value);
+		*/
+		if(verified) {
+			sx_err = abs((sx - sx_verify_value)/sx_verify_value);
+			sy_err = abs((sy - sy_verify_value)/sy_verify_value);
+			/*
+			fprintf(stdout,("sx_err = " + sx_err);
+			fprintf(stdout,("sy_err = " + sx_err);
+			fprintf(stdout,("epsilon= " + epsilon);
+			*/
+			verified = ((sx_err < epsilon) && (sy_err < epsilon));
+		}
+
+		Mops = (pow(2.0, m+1))/tm/1000;
+
+		fprintf(stdout,"EP Benchmark Results:\n");
+		fprintf(stdout,"CPU Time=%d\n",tm);
+		fprintf(stdout,"N = 2^%d\n",m);
+		fprintf(stdout,"No. Gaussain Pairs =%d\n",gc);
+		fprintf(stdout,"Sum = %lf %ld\n",sx,sy);
+		fprintf(stdout,"Count:");
+		for(i = 0; i < nq; i++) {
+			fprintf(stdout,"%d\t %ld\n",i,q[i]);
+		}
+
+		/*
+		print_results("EP", _class, m+1, 0, 0, nit, npm, no_nodes, tm, Mops,
+				"Random numbers generated", verified, npbversion,
+				compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) */
+		fprintf(stdout,"\nEP Benchmark Completed\n");
+            fprintf(stdout,"Class           = %s\n", _class);
+		fprintf(stdout,"Size            = %s\n", size);
+		fprintf(stdout,"Iteration       = %d\n", nit);
+		fprintf(stdout,"Time in seconds = %lf\n",(tm/1000));
+		fprintf(stdout,"Total processes = %d\n",no_nodes);
+		fprintf(stdout,"Mops/s total    = %lf\n",Mops);
+		fprintf(stdout,"Mops/s/process  = %lf\n", Mops/no_nodes);
+		fprintf(stdout,"Operation type  = Random number generated\n");
+		if(verified) {
+			fprintf(stdout,"Verification    = SUCCESSFUL\n");
+		} else {
+			fprintf(stdout,"Verification    = UNSUCCESSFUL\n");
+		}
+           	fprintf(stdout,"Total time:     %lf\n",(timer_read(1,elapsed)/1000));
+           	fprintf(stdout,"Gaussian pairs: %lf\n",(timer_read(2,elapsed)/1000));
+           	fprintf(stdout,"Random numbers: %lf\n",(timer_read(3,elapsed)/1000));
+       	}
+#ifdef USE_MPE
+    MPE_Finish_log(argv[0]);
+#endif
+ 
+       MPI_Finalize();
+      }
+
+    int main(int argc, char **argv) {
+       doTest(argc,argv);
+    }
diff --git a/examples/smpi/NAS/EP-folding/randlc.c b/examples/smpi/NAS/EP-folding/randlc.c
new file mode 100644
index 0000000000..624b800df8
--- /dev/null
+++ b/examples/smpi/NAS/EP-folding/randlc.c
@@ -0,0 +1,107 @@
+
+/*
+ *    FUNCTION RANDLC (X, A)
+ *
+ *  This routine returns a uniform pseudorandom double precision number in the
+ *  range (0, 1) by using the linear congruential generator
+ *
+ *  x_{k+1} = a x_k  (mod 2^46)
+ *
+ *  where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
+ *  before repeating.  The argument A is the same as 'a' in the above formula,
+ *  and X is the same as x_0.  A and X must be odd double precision integers
+ *  in the range (1, 2^46).  The returned value RANDLC is normalized to be
+ *  between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
+ *  the new seed x_1, so that subsequent calls to RANDLC using the same
+ *  arguments will generate a continuous sequence.
+ *
+ *  This routine should produce the same results on any computer with at least
+ *  48 mantissa bits in double precision floating point data.  On Cray systems,
+ *  double precision should be disabled.
+ *
+ *  David H. Bailey     October 26, 1990
+ *
+ *     IMPLICIT DOUBLE PRECISION (A-H, O-Z)
+ *     SAVE KS, R23, R46, T23, T46
+ *     DATA KS/0/
+ *
+ *  If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
+ *  T23 = 2 ^ 23, and T46 = 2 ^ 46.  These are computed in loops, rather than
+ *  by merely using the ** operator, in order to insure that the results are
+ *  exact on all systems.  This code assumes that 0.5D0 is represented exactly.
+ */
+
+
+/*****************************************************************/
+/*************           R  A  N  D  L  C             ************/
+/*************                                        ************/
+/*************    portable random number generator    ************/
+/*****************************************************************/
+
+double	randlc( double *X, double *A )
+{
+      static int        KS=0;
+      static double	R23, R46, T23, T46;
+      double		T1, T2, T3, T4;
+      double		A1;
+      double		A2;
+      double		X1;
+      double		X2;
+      double		Z;
+      int     		i, j;
+
+      if (KS == 0) 
+      {
+        R23 = 1.0;
+        R46 = 1.0;
+        T23 = 1.0;
+        T46 = 1.0;
+    
+        for (i=1; i<=23; i++)
+        {
+          R23 = 0.50 * R23;
+          T23 = 2.0 * T23;
+        }
+        for (i=1; i<=46; i++)
+        {
+          R46 = 0.50 * R46;
+          T46 = 2.0 * T46;
+        }
+        KS = 1;
+      }
+
+/*  Break A into two parts such that A = 2^23 * A1 + A2 and set X = N.  */
+
+      T1 = R23 * *A;
+      j  = T1;
+      A1 = j;
+      A2 = *A - T23 * A1;
+
+/*  Break X into two parts such that X = 2^23 * X1 + X2, compute
+    Z = A1 * X2 + A2 * X1  (mod 2^23), and then
+    X = 2^23 * Z + A2 * X2  (mod 2^46).                            */
+
+      T1 = R23 * *X;
+      j  = T1;
+      X1 = j;
+      X2 = *X - T23 * X1;
+      T1 = A1 * X2 + A2 * X1;
+      
+      j  = R23 * T1;
+      T2 = j;
+      Z = T1 - T23 * T2;
+      T3 = T23 * Z + A2 * X2;
+      j  = R46 * T3;
+      T4 = j;
+      *X = T3 - T46 * T4;
+      return(R46 * *X);
+} 
+
+
+
+/*****************************************************************/
+/************   F  I  N  D  _  M  Y  _  S  E  E  D    ************/
+/************                                         ************/
+/************ returns parallel random number seq seed ************/
+/*****************************************************************/
+
diff --git a/examples/smpi/NAS/EP-folding/randlc.h b/examples/smpi/NAS/EP-folding/randlc.h
new file mode 100644
index 0000000000..aff84d341a
--- /dev/null
+++ b/examples/smpi/NAS/EP-folding/randlc.h
@@ -0,0 +1,3 @@
+
+double      randlc( double *X, double *A );
+
diff --git a/examples/smpi/NAS/Makefile b/examples/smpi/NAS/Makefile
index 794191c451..4fc35670f8 100644
--- a/examples/smpi/NAS/Makefile
+++ b/examples/smpi/NAS/Makefile
@@ -48,6 +48,10 @@ EP-trace: ep-trace
 ep-trace: header
 	cd EP-trace; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
 
+EP-folding: ep-folding
+ep-folding: header
+	cd EP-folding; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
+
 DT: dt
 dt: header
 	cd DT; $(MAKE) CLASS=$(CLASS)
-- 
2.20.1