6 #include "nas_common.h"
7 #include "simgrid/instr.h" //TRACE_
15 int main(int argc, char **argv) {
16 double dum[3] = {1.,1.,1.};
17 double x1, x2, sx, sy, tm, an, gc;
18 XBT_ATTRIB_UNUSED double tt;
20 double epsilon=1.0E-8, a = 1220703125., s=271828183.;
21 double t1, t2, t3, t4;
22 double sx_verify_value, sy_verify_value, sx_err, sy_err;
26 int nk = (int)(pow(2,mk)),
28 np, node, no_nodes, i, ik, kk, l, k, nit, no_large_nodes, np_add, k_offset;
30 char size[500]; // mind the size of the string to represent a big number
32 double *x = (double *) SMPI_SHARED_MALLOC (2*nk*sizeof(double));
33 double *q = (double *) SMPI_SHARED_MALLOC (nq*sizeof(double));
35 MPI_Init( &argc, &argv );
36 MPI_Comm_size( MPI_COMM_WORLD, &no_nodes);
37 MPI_Comm_rank( MPI_COMM_WORLD, &node);
39 TRACE_smpi_set_category ("start");
41 get_info(argc, argv, &nprocs, &class);
42 check_info(EP, nprocs, class);
46 else if (class == 'W')
48 else if (class == 'A')
50 else if (class == 'B')
52 else if (class == 'C')
54 else if (class == 'D')
56 else if (class == 'E')
59 printf("EP: Internal error: invalid class type %c\n", class);
64 int nn = (int)(pow(2,mm));
68 /* Because the size of the problem is too large to store in a 32-bit integer for some classes, we put it into a
69 * string (for printing). Have to strip off the decimal point put in there by the floating point print statement
71 fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark");
72 snprintf(size,500,"%lu",(unsigned long)pow(2,m+1));
73 fprintf(stdout," Number of random numbers generated: %s\n",size);
74 fprintf(stdout," Number of active processes: %d\n",no_nodes);
78 /* Compute the number of "batches" of random number pairs generated per processor. Adjust if the number of processors
79 * does not evenly divide the total number */
81 no_large_nodes = nn % no_nodes;
82 if (node < no_large_nodes)
89 fprintf(stdout,"Too many nodes: %d %d",no_nodes,nn);
90 MPI_Abort(MPI_COMM_WORLD,1);
94 /* Call the random number generator functions and initialize the x-array to reduce the effects of paging on the timings.
95 Also, call all mathematical functions that are used. Make sure initializations cannot be eliminated as dead code. */
96 vranlc(0, dum[0], dum[1], &(dum[2]));
98 dum[0] = randlc(&(dum[1]),&(dum[2]));
99 for (i=0;i<2*nk;i++) {
102 Mops = log(sqrt(abs(1)));
104 /* Synchronize before placing time stamp */
105 MPI_Barrier( MPI_COMM_WORLD );
107 TRACE_smpi_set_category ("ep");
115 t1 = vranlc(0, t1, a, x);
117 /* Compute AN = A ^ (2 * NK) (mod 2^46). */
119 for (i=1; i <= mk+1; i++) {
120 t2 = randlc(&t1, &t1);
128 for (i=0; i < nq ; i++) {
132 /* Each instance of this loop may be performed independently. We compute the k offsets separately to take into account
133 * the fact that some nodes have more numbers to generate than others */
135 k_offset = node * np -1;
137 k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1;
140 for(k = 1; k <= np; k++) { SMPI_SAMPLE_GLOBAL(0.25 * np, 0.03) {
146 // Find starting seed t1 for this kk.
147 for (i=1;i<=100 && !stop;i++) {
150 t3 = randlc(&t1, &t2);
155 t3 = randlc(&t2, &t2);
159 // Compute uniform pseudorandom numbers.
163 t1 = vranlc(2 * nk, t1, a, x);
167 /* Compute Gaussian deviates by acceptance-rejection method and tally counts in concentric square annuli.
168 * This loop is not vectorizable. */
171 for(i=1; i<=nk;i++) {
172 x1 = 2. * x[2*i-2] -1.0;
173 x2 = 2. * x[2*i-1] - 1.0;
176 t2 = sqrt(-2. * log(t1) / t1);
179 l = (int)(fabs(t3) > fabs(t4) ? fabs(t3) : fabs(t4));
188 TRACE_smpi_set_category ("finalize");
190 MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
192 MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
194 MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
196 for(i = 0; i < nq; i++) {
199 for(i = 0; i < nq; i++) {
205 MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
215 sx_verify_value = -3.247834652034740E3;
216 sy_verify_value = -6.958407078382297E3;
218 sx_verify_value = -2.863319731645753E3;
219 sy_verify_value = -6.320053679109499E3;
221 sx_verify_value = -4.295875165629892E3;
222 sy_verify_value = -1.580732573678431E4;
224 sx_verify_value = 4.033815542441498E4;
225 sy_verify_value = -2.660669192809235E4;
227 sx_verify_value = 4.764367927995374E4;
228 sy_verify_value = -8.084072988043731E4;
230 sx_verify_value = 1.982481200946593E5;
231 sy_verify_value = -1.020596636361769E5;
237 sx_err = fabs((sx - sx_verify_value)/sx_verify_value);
238 sy_err = fabs((sy - sy_verify_value)/sy_verify_value);
239 verified = ((sx_err < epsilon) && (sy_err < epsilon));
242 Mops = (pow(2.0, m+1))/tm/1000;
244 fprintf(stdout,"EP Benchmark Results:\n");
245 fprintf(stdout,"CPU Time=%d\n",(int) tm);
246 fprintf(stdout,"N = 2^%d\n",m);
247 fprintf(stdout,"No. Gaussain Pairs =%d\n",(int) gc);
248 fprintf(stdout,"Sum = %f %ld\n",sx,(long) sy);
249 fprintf(stdout,"Count:");
250 for(i = 0; i < nq; i++) {
251 fprintf(stdout,"%d\t %ld\n",i,(long) q[i]);
253 c_print_results("EP", class, m+1, 0, 0, nit, nprocs, no_nodes, tm, Mops, "Random number generated",verified);
255 fprintf(stdout,"Total time: %f\n",(timer_read(1)/1000));
256 fprintf(stdout,"Gaussian pairs: %f\n",(timer_read(2)/1000));
257 fprintf(stdout,"Random numbers: %f\n",(timer_read(3)/1000));