6 #include "nas_common.h"
7 #include "simgrid/instr.h" //TRACE_
15 int main(int argc, char **argv) {
// Fragmented listing of the NAS Parallel Benchmarks EP (Embarrassingly
// Parallel) kernel, SMPI/SimGrid port: each MPI rank generates batches of
// pseudorandom pairs, tallies Gaussian deviates by acceptance-rejection,
// then the sums are reduced and checked against per-class reference values.
// NOTE(review): many original lines are missing from this excerpt; the
// comments below describe only what the visible code shows.
16 double dum[3] = {1.,1.,1.};
17 double x1, x2, sx, sy, tm, an, tt, gc;
// epsilon: verification tolerance; a, s: multiplier/seed for the NPB
// linear-congruential generator used by randlc/vranlc.
19 double epsilon=1.0E-8, a = 1220703125., s=271828183.;
20 double t1, t2, t3, t4;
21 double sx_verify_value, sy_verify_value, sx_err, sy_err;
25 nk = (int)(pow(2,mk)),
34 k, nit, no_large_nodes,
38 char size[500]; // mind the size of the string to represent a big number
// Timer scratch arrays (slots 1..3 used below) plus working buffers:
// x holds 2*nk random numbers per batch, q the nq annulus counters.
// NOTE(review): none of these malloc results are checked for NULL.
40 double *start = (double *) malloc (64*sizeof(double));
41 double *elapsed = (double *) malloc (64*sizeof(double));
43 double *x = (double *) malloc (2*nk*sizeof(double));
44 double *q = (double *) malloc (nq*sizeof(double));
46 MPI_Init( &argc, &argv );
47 MPI_Comm_size( MPI_COMM_WORLD, &no_nodes);
48 MPI_Comm_rank( MPI_COMM_WORLD, &node);
50 TRACE_smpi_set_category ("start");
52 get_info(argc, argv, &nprocs, &class);
53 check_info(EP, nprocs, class);
// Problem-size exponent m per NPB class; total numbers generated = 2^(m+1).
55 if (class == 'S') { m = 24; }
56 else if (class == 'W') { m = 25; }
57 else if (class == 'A') { m = 28; }
58 else if (class == 'B') { m = 30; }
59 else if (class == 'C') { m = 32; }
60 else if (class == 'D') { m = 36; }
61 else if (class == 'E') { m = 40; }
63 printf("EP: Internal error: invalid class type %c\n", class);
67 nn = (int)(pow(2,mm)),
71 /* Because the size of the problem is too large to store in a 32-bit integer for some classes, we put it into a
72 * string (for printing). Have to strip off the decimal point put in there by the floating point print statement
75 fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark");
// NOTE(review): "%zu" expects size_t but the argument is cast to unsigned
// long — formally mismatched on some platforms; confirm intended specifier.
76 sprintf(size,"%zu",(unsigned long)pow(2,m+1));
77 //size = size.replace('.', ' ');
78 fprintf(stdout," Number of random numbers generated: %s\n",size);
79 fprintf(stdout," Number of active processes: %d\n",no_nodes);
83 /* Compute the number of "batches" of random number pairs generated per processor. Adjust if the number of processors
84 * does not evenly divide the total number
// Ranks [0, nn % no_nodes) take one extra batch so all nn batches are covered.
88 no_large_nodes = nn % no_nodes;
89 if (node < no_large_nodes) np_add = 1;
94 fprintf(stdout,"Too many nodes: %d %d",no_nodes,nn);
95 MPI_Abort(MPI_COMM_WORLD,1);
99 /* Call the random number generator functions and initialize the x-array to reduce the effects of paging the timings.
100 Also, call all mathematical functions that are used. Make sure initializations cannot be eliminated as dead code. */
102 //call vranlc(0, dum[1], dum[2], dum[3]);
103 // Array indexes start at 1 in Fortran, 0 in Java
104 vranlc(0, dum[0], dum[1], &(dum[2]));
106 dum[0] = randlc(&(dum[1]),&(dum[2]));
107 for (i=0;i<2*nk;i++) {
// Warm-up call exercising the math library (log/sqrt/abs) so the first
// timed use does not pay library-initialization costs.
110 Mops = log(sqrt(abs(1)));
112 /* Synchronize before placing time stamp */
113 MPI_Barrier( MPI_COMM_WORLD );
115 TRACE_smpi_set_category ("ep");
// Timers: slot 1 = total, slot 2 = Gaussian-pair tally, slot 3 = random
// number generation (indices follow the Fortran original; slot 0 unused).
117 time_clear(&(elapsed[1]));
118 time_clear(&(elapsed[2]));
119 time_clear(&(elapsed[3]));
120 time_start(&(start[1]));
123 //fprintf(stdout,("(ep.f:160) t1 = " + t1);
124 t1 = vranlc(0, t1, a, x);
125 //fprintf(stdout,("(ep.f:161) t1 = " + t1);
127 /* Compute AN = A ^ (2 * NK) (mod 2^46). */
129 //fprintf(stdout,("(ep.f:165) t1 = " + t1);
// Repeated squaring: mk+1 squarings of t1 yield a^(2^(mk+1)) = a^(2*nk).
130 for (i=1; i <= mk+1; i++) {
131 t2 = randlc(&t1, &t1);
132 //fprintf(stdout,("(ep.f:168)[loop i=" + i +"] t1 = " + t1);
135 //fprintf(stdout,("(ep.f:172) s = " + s);
140 for (i=0; i < nq ; i++) {
144 /* Each instance of this loop may be performed independently. We compute the k offsets separately to take into account
145 * the fact that some nodes have more numbers to generate than others */
148 k_offset = node * np -1;
150 k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1;
// Main per-batch loop: this rank processes np batches of 2*nk numbers.
153 for(k = 1; k <= np; k++) {// SMPI_SAMPLE_LOCAL(0.25 * np, 0.03) {
157 //fprintf(stdout,("(ep.f:193) t1 = " + t1);
160 // Find starting seed t1 for this kk.
// Binary exponentiation on kk; capped at 100 iterations (kk < 2^100).
161 for (i=1;i<=100 && !stop;i++) {
163 //fprintf(stdout,("(ep.f:199) ik = " +ik+", kk = " + kk);
165 t3 = randlc(&t1, &t2);
166 //fprintf(stdout,("(ep.f:200) t1= " +t1 );
171 t3 = randlc(&t2, &t2);
175 // Compute uniform pseudorandom numbers.
177 //if (timers_enabled) timer_start(3);
178 time_start(&(start[3]));
179 //call vranlc(2 * nk, t1, a, x) --> t1 and y are modified
181 //fprintf(stdout,">>>>>>>>>>>Before vranlc(l.210)<<<<<<<<<<<<<");
182 //fprintf(stdout,"2*nk = " + (2*nk));
183 //fprintf(stdout,"t1 = " + t1);
184 //fprintf(stdout,"a = " + a);
185 //fprintf(stdout,"x[0] = " + x[0]);
186 //fprintf(stdout,">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
// Fill x with 2*nk uniform deviates; t1 advances to the batch-end seed.
187 t1 = vranlc(2 * nk, t1, a, x);
189 //fprintf(stdout,(">>>>>>>>>>>After Enter vranlc (l.210)<<<<<<");
190 //fprintf(stdout,("2*nk = " + (2*nk));
191 //fprintf(stdout,("t1 = " + t1);
192 //fprintf(stdout,("a = " + a);
193 //fprintf(stdout,("x[0] = " + x[0]);
194 //fprintf(stdout,(">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
196 //if (timers_enabled) timer_stop(3);
197 time_stop(3,elapsed,start);
199 /* Compute Gaussian deviates by acceptance-rejection method and tally counts in concentric square annuli.
200 * This loop is not vectorizable. */
201 //if (timers_enabled) timer_start(2);
202 time_start(&(start[2]));
203 for(i=1; i<=nk;i++) {
// Polar (Marsaglia) Box-Muller variant: map the pair into [-1,1]^2; the
// sqrt(-2 ln t1 / t1) factor converts accepted pairs to Gaussian deviates
// (the rejection branch is not visible in this excerpt).
204 x1 = 2. * x[2*i-2] -1.0;
205 x2 = 2. * x[2*i-1] - 1.0;
208 t2 = sqrt(-2. * log(t1) / t1);
// Annulus index = integer part of max(|t3|, |t4|).
211 l = (int)(fabs(t3) > fabs(t4) ? fabs(t3) : fabs(t4));
// NOTE(review): the debug prints below use Java-style string concatenation
// ("x1 = " + x1), which is not valid C (string literal + double) — leftover
// from the Java/Fortran port; they should be commented out or rewritten with
// printf format strings ("x1 = %f\n", x1). Flagged, not fixed, in this pass.
218 fprintf(stdout,"x1 = " + x1);
219 fprintf(stdout,"x2 = " + x2);
220 fprintf(stdout,"t1 = " + t1);
221 fprintf(stdout,"t2 = " + t2);
222 fprintf(stdout,"t3 = " + t3);
223 fprintf(stdout,"t4 = " + t4);
224 fprintf(stdout,"l = " + l);
225 fprintf(stdout,"q[l] = " + q[l]);
226 fprintf(stdout,"sx = " + sx);
227 fprintf(stdout,"sy = " + sy);
231 //if (timers_enabled) timer_stop(2);
232 time_stop(2,elapsed,start);
235 TRACE_smpi_set_category ("finalize");
// Global reductions of the deviate sums (sx, sy) and annulus counters q[],
// reusing x as the receive buffer.
237 MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
238 sx = x[0]; //FIXME : x[0] or x[1] => x[0] because fortran starts with 1
239 MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
241 MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
243 for(i = 0; i < nq; i++) {
246 for(i = 0; i < nq; i++) {
250 time_stop(1,elapsed,start);
251 tm = time_read(1,elapsed);
// Benchmark time is the slowest rank's elapsed time (MAX reduction).
252 MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
// Per-class reference sums (NPB specification); the class-dispatch lines
// selecting among these pairs are not visible in this excerpt.
260 sx_verify_value = -3.247834652034740E3;
261 sy_verify_value = -6.958407078382297E3;
263 sx_verify_value = -2.863319731645753E3;
264 sy_verify_value = -6.320053679109499E3;
266 sx_verify_value = -4.295875165629892E3;
267 sy_verify_value = -1.580732573678431E4;
269 sx_verify_value = 4.033815542441498E4;
270 sy_verify_value = -2.660669192809235E4;
272 sx_verify_value = 4.764367927995374E4;
273 sy_verify_value = -8.084072988043731E4;
275 sx_verify_value = 1.982481200946593E5;
276 sy_verify_value = -1.020596636361769E5;
// NOTE(review): same invalid Java-style "+"-concatenation in the prints
// below (also note the unbalanced "(" after stdout,).
282 fprintf(stdout,("sx = " + sx);
283 fprintf(stdout,("sx_verify = " + sx_verify_value);
284 fprintf(stdout,("sy = " + sy);
285 fprintf(stdout,("sy_verify = " + sy_verify_value);
// Relative error of each sum against its reference value.
288 sx_err = fabs((sx - sx_verify_value)/sx_verify_value);
289 sy_err = fabs((sy - sy_verify_value)/sy_verify_value);
291 fprintf(stdout,("sx_err = " + sx_err);
// NOTE(review): prints sx_err where sy_err is presumably intended — confirm.
292 fprintf(stdout,("sy_err = " + sx_err);
293 fprintf(stdout,("epsilon= " + epsilon);
// Verification passes only if both relative errors are within epsilon.
295 verified = ((sx_err < epsilon) && (sy_err < epsilon));
// Millions of operations/s: 2^(m+1) numbers / tm / 1000.
// NOTE(review): tm's unit (ms vs s) is not visible here — confirm the /1000.
298 Mops = (pow(2.0, m+1))/tm/1000;
300 fprintf(stdout,"EP Benchmark Results:\n");
301 fprintf(stdout,"CPU Time=%d\n",(int) tm);
302 fprintf(stdout,"N = 2^%d\n",m);
303 fprintf(stdout,"No. Gaussain Pairs =%d\n",(int) gc);
304 fprintf(stdout,"Sum = %f %ld\n",sx,(long) sy);
305 fprintf(stdout,"Count:");
306 for(i = 0; i < nq; i++) {
307 fprintf(stdout,"%d\t %ld\n",i,(long) q[i]);
309 c_print_results("EP", class, m+1, 0, 0, nit, nprocs, no_nodes, tm, Mops, "Random number generated",verified);
// Per-phase timing report; values scaled by /1000 from the timer's unit.
311 fprintf(stdout,"Total time: %f\n",(time_read(1,elapsed)/1000));
312 fprintf(stdout,"Gaussian pairs: %f\n",(time_read(2,elapsed)/1000));
313 fprintf(stdout,"Random numbers: %f\n",(time_read(3,elapsed)/1000));