/* [simgrid.git] examples/smpi/NAS/IS-trace/is-trace.c                       */
/* Instrumented (traced) version of the NPB IS benchmark for SimGrid/SMPI.   */
/*************************************************************************
 *                                                                       *
 *        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3       *
 *                                                                       *
 *                                  I S                                  *
 *                                                                       *
 *************************************************************************
 *                                                                       *
 *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
 *   It is described in NAS Technical Report 95-020.                     *
 *                                                                       *
 *   Permission to use, copy, distribute and modify this software        *
 *   for any purpose with or without fee is hereby granted.  We          *
 *   request, however, that all derived work reference the NAS           *
 *   Parallel Benchmarks 3.3. This software is provided "as is"          *
 *   without express or implied warranty.                                *
 *                                                                       *
 *   Information on NPB 3.3, including the technical report, the         *
 *   original specifications, source code, results and information       *
 *   on how to submit new results, is available at:                      *
 *                                                                       *
 *          http://www.nas.nasa.gov/Software/NPB                         *
 *                                                                       *
 *   Send comments or suggestions to  npb@nas.nasa.gov                   *
 *   Send bug reports to              npb-bugs@nas.nasa.gov              *
 *                                                                       *
 *         NAS Parallel Benchmarks Group                                 *
 *         NASA Ames Research Center                                     *
 *         Mail Stop: T27A-1                                             *
 *         Moffett Field, CA   94035-1000                                *
 *                                                                       *
 *         E-mail:  npb@nas.nasa.gov                                     *
 *         Fax:     (650) 604-3957                                       *
 *                                                                       *
 *************************************************************************
 *                                                                       *
 *   Author: M. Yarrow                                                   *
 *           H. Jin                                                      *
 *                                                                       *
 *************************************************************************/

#include "mpi.h"
#include "npbparams.h"
#include <stdlib.h>
#include <stdio.h>

#include "instr/instr.h" //TRACE_

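/* TRACE_smpi_set_category() comes from SimGrid's tracing interface: it
   tags every subsequent MPI operation of the calling rank with a named
   category, so that the "start", per-iteration ("i1", "i2", ...) and
   "finalize" phases used below can be told apart in the generated trace.
   (The header providing this call has moved between SimGrid versions;
   "instr/instr.h" is the location used by this source tree.)             */
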
/******************/
/* default values */
/******************/
#ifndef CLASS
#define CLASS 'S'
#define NUM_PROCS            1
#endif
#define MIN_PROCS            1


/*************/
/*  CLASS S  */
/*************/
#if CLASS == 'S'
#define  TOTAL_KEYS_LOG_2    16
#define  MAX_KEY_LOG_2       11
#define  NUM_BUCKETS_LOG_2   9
#endif


/*************/
/*  CLASS W  */
/*************/
#if CLASS == 'W'
#define  TOTAL_KEYS_LOG_2    20
#define  MAX_KEY_LOG_2       16
#define  NUM_BUCKETS_LOG_2   10
#endif

/*************/
/*  CLASS A  */
/*************/
#if CLASS == 'A'
#define  TOTAL_KEYS_LOG_2    23
#define  MAX_KEY_LOG_2       19
#define  NUM_BUCKETS_LOG_2   10
#endif


/*************/
/*  CLASS B  */
/*************/
#if CLASS == 'B'
#define  TOTAL_KEYS_LOG_2    25
#define  MAX_KEY_LOG_2       21
#define  NUM_BUCKETS_LOG_2   10
#endif


/*************/
/*  CLASS C  */
/*************/
#if CLASS == 'C'
#define  TOTAL_KEYS_LOG_2    27
#define  MAX_KEY_LOG_2       23
#define  NUM_BUCKETS_LOG_2   10
#endif


/*************/
/*  CLASS D  */
/*************/
#if CLASS == 'D'
#define  TOTAL_KEYS_LOG_2    29
#define  MAX_KEY_LOG_2       27
#define  NUM_BUCKETS_LOG_2   10
#undef   MIN_PROCS
#define  MIN_PROCS           4
#endif


#define  TOTAL_KEYS          (1 << TOTAL_KEYS_LOG_2)
#define  MAX_KEY             (1 << MAX_KEY_LOG_2)
#define  NUM_BUCKETS         (1 << NUM_BUCKETS_LOG_2)
#define  NUM_KEYS            (TOTAL_KEYS/NUM_PROCS*MIN_PROCS)

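/* Example of the resulting sizes: compiled for class A with
   NUM_PROCS = 16, TOTAL_KEYS = 2^23 = 8388608 and MAX_KEY = 2^19, so
   each rank generates NUM_KEYS = 8388608/16 = 524288 keys (MIN_PROCS
   is 1 for every class except D).                                       */
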
/*****************************************************************/
/* On larger numbers of processors, since the keys are (roughly) */
/* Gaussian distributed, the first and last processors sort keys */
/* in a large interval, requiring array sizes to be larger. Note */
/* that for large NUM_PROCS, NUM_KEYS is, however, a small       */
/* number. The required array size also depends on the bucket    */
/* size used. The following values are validated for the         */
/* 1024-bucket setup.                                            */
/*****************************************************************/
#if   NUM_PROCS < 256
#define  SIZE_OF_BUFFERS     3*NUM_KEYS/2
#elif NUM_PROCS < 512
#define  SIZE_OF_BUFFERS     5*NUM_KEYS/2
#elif NUM_PROCS < 1024
#define  SIZE_OF_BUFFERS     4*NUM_KEYS
#else
#define  SIZE_OF_BUFFERS     13*NUM_KEYS/2
#endif

/*****************************************************************/
/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */
/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE     */
/* MAX_PROCS AT YOUR PERIL                                       */
/*****************************************************************/
#if CLASS == 'S'
#define  MAX_PROCS           128
#else
#define  MAX_PROCS           1024
#endif

#define  MAX_ITERATIONS      10
#define  TEST_ARRAY_SIZE     5


/***********************************/
/* Enable separate communication,  */
/* computation timing and printout */
/***********************************/
/* #define  TIMING_ENABLED         */


/*************************************/
/* Typedef: if necessary, change the */
/* size of int here by changing the  */
/* int type to, say, long            */
/*************************************/
typedef  int  INT_TYPE;
typedef  long INT_TYPE2;
#define MP_KEY_TYPE MPI_INT

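/* In the stock NPB sources the fields below are file-scope globals.
   This instrumented version gathers them into a struct that main()
   allocates and threads through every routine, presumably so that each
   simulated rank keeps private state even when SMPI folds all ranks
   into a single OS process.                                             */
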
typedef struct {

/********************/
/* MPI properties:  */
/********************/
int      my_rank,
         comm_size;


/********************/
/* Some global info */
/********************/
INT_TYPE *key_buff_ptr_global,         /* used by full_verify to get */
         total_local_keys,             /* copies of rank info        */
         total_lesser_keys;


int      passed_verification;


/************************************/
/* These are the three main arrays. */
/* See SIZE_OF_BUFFERS def above    */
/************************************/
INT_TYPE key_array[SIZE_OF_BUFFERS],
         key_buff1[SIZE_OF_BUFFERS],
         key_buff2[SIZE_OF_BUFFERS],
         bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE],     /* Top 5 elements for */
         bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals */
         bucket_ptrs[NUM_BUCKETS],
         process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE],
         process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE];
int      send_count[MAX_PROCS], recv_count[MAX_PROCS],
         send_displ[MAX_PROCS], recv_displ[MAX_PROCS];


/**********************/
/* Partial verif info */
/**********************/
INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
         test_rank_array[TEST_ARRAY_SIZE];

/**********/
/* Timers */
/**********/
double start[64], elapsed[64];

} global_data;


const INT_TYPE2
         S_test_index_array[TEST_ARRAY_SIZE] =
                             {48427,17148,23627,62548,4431},
         S_test_rank_array[TEST_ARRAY_SIZE] =
                             {0,18,346,64917,65463},

         W_test_index_array[TEST_ARRAY_SIZE] =
                             {357773,934767,875723,898999,404505},
         W_test_rank_array[TEST_ARRAY_SIZE] =
                             {1249,11698,1039987,1043896,1048018},

         A_test_index_array[TEST_ARRAY_SIZE] =
                             {2112377,662041,5336171,3642833,4250760},
         A_test_rank_array[TEST_ARRAY_SIZE] =
                             {104,17523,123928,8288932,8388264},

         B_test_index_array[TEST_ARRAY_SIZE] =
                             {41869,812306,5102857,18232239,26860214},
         B_test_rank_array[TEST_ARRAY_SIZE] =
                             {33422937,10244,59149,33135281,99},

         C_test_index_array[TEST_ARRAY_SIZE] =
                             {44172927,72999161,74326391,129606274,21736814},
         C_test_rank_array[TEST_ARRAY_SIZE] =
                             {61147,882988,266290,133997595,133525895},

         D_test_index_array[TEST_ARRAY_SIZE] =
                             {1317351170,995930646,1157283250,1503301535,1453734525},
         D_test_rank_array[TEST_ARRAY_SIZE] =
                             {1,36538729,1978098519,2145192618,2147425337};


/***********************/
/* function prototypes */
/***********************/
double  randlc( double *X, double *A );

void full_verify( global_data* gd );

void c_print_results( char   *name,
                      char   class,
                      int    n1,
                      int    n2,
                      int    n3,
                      int    niter,
                      int    nprocs_compiled,
                      int    nprocs_total,
                      double t,
                      double mops,
                      char   *optype,
                      int    passed_verification,
                      char   *npbversion,
                      char   *compiletime,
                      char   *mpicc,
                      char   *clink,
                      char   *cmpi_lib,
                      char   *cmpi_inc,
                      char   *cflags,
                      char   *clinkflags );

void    timer_clear(global_data* gd, int n );
void    timer_start(global_data* gd, int n );
void    timer_stop(global_data* gd, int n );
double  timer_read(global_data* gd, int n );

void    timer_clear(global_data* gd, int n ) {
   gd->elapsed[n] = 0.0;
}

void    timer_start(global_data* gd, int n ) {
   gd->start[n] = MPI_Wtime();
}

void    timer_stop(global_data* gd, int n ) {
   gd->elapsed[n] += MPI_Wtime() - gd->start[n];
}

double  timer_read(global_data* gd, int n ) {
   return gd->elapsed[n];
}


/*
 *    FUNCTION RANDLC (X, A)
 *
 *  This routine returns a uniform pseudorandom double precision number in the
 *  range (0, 1) by using the linear congruential generator
 *
 *  x_{k+1} = a x_k  (mod 2^46)
 *
 *  where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
 *  before repeating.  The argument A is the same as 'a' in the above formula,
 *  and X is the same as x_0.  A and X must be odd double precision integers
 *  in the range (1, 2^46).  The returned value RANDLC is normalized to be
 *  between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
 *  the new seed x_1, so that subsequent calls to RANDLC using the same
 *  arguments will generate a continuous sequence.
 *
 *  This routine should produce the same results on any computer with at least
 *  48 mantissa bits in double precision floating point data.  On Cray systems,
 *  double precision should be disabled.
 *
 *  David H. Bailey     October 26, 1990
 *
 *     IMPLICIT DOUBLE PRECISION (A-H, O-Z)
 *     SAVE KS, R23, R46, T23, T46
 *     DATA KS/0/
 *
 *  If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
 *  T23 = 2 ^ 23, and T46 = 2 ^ 46.  These are computed in loops, rather than
 *  by merely using the ** operator, in order to ensure that the results are
 *  exact on all systems.  This code assumes that 0.5D0 is represented exactly.
 */


/*****************************************************************/
/*************           R  A  N  D  L  C             ************/
/*************                                        ************/
/*************    portable random number generator    ************/
/*****************************************************************/

double  randlc( double *X, double *A )
{
      static int        KS=0;
      static double     R23, R46, T23, T46;
      double            T1, T2, T3, T4;
      double            A1;
      double            A2;
      double            X1;
      double            X2;
      double            Z;
      int               i, j;

      if (KS == 0)
      {
        R23 = 1.0;
        R46 = 1.0;
        T23 = 1.0;
        T46 = 1.0;

        for (i=1; i<=23; i++)
        {
          R23 = 0.50 * R23;
          T23 = 2.0 * T23;
        }
        for (i=1; i<=46; i++)
        {
          R46 = 0.50 * R46;
          T46 = 2.0 * T46;
        }
        KS = 1;
      }

/*  Break A into two parts such that A = 2^23 * A1 + A2 and set X = N.  */

      T1 = R23 * *A;
      j  = T1;
      A1 = j;
      A2 = *A - T23 * A1;

/*  Break X into two parts such that X = 2^23 * X1 + X2, compute
    Z = A1 * X2 + A2 * X1  (mod 2^23), and then
    X = 2^23 * Z + A2 * X2  (mod 2^46).                            */

      T1 = R23 * *X;
      j  = T1;
      X1 = j;
      X2 = *X - T23 * X1;
      T1 = A1 * X2 + A2 * X1;

      j  = R23 * T1;
      T2 = j;
      Z = T1 - T23 * T2;
      T3 = T23 * Z + A2 * X2;
      j  = R46 * T3;
      T4 = j;
      *X = T3 - T46 * T4;
      return(R46 * *X);
}


/*****************************************************************/
/************   F  I  N  D  _  M  Y  _  S  E  E  D    ************/
/************                                         ************/
/************ returns parallel random number seq seed ************/
/*****************************************************************/

/*
 * Create a random number sequence of total length nn residing
 * on np number of processors.  Each processor will therefore have a
 * subsequence of length nn/np.  This routine returns the random
 * number which is the first random number for the subsequence belonging
 * to processor rank kn, and which is used as the seed for processor
 * kn's random number generator.
 */

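/* The body below effectively computes  t1 = s * a^(kn*nn/np)  (mod 2^46)
   without generating the kn*nn/np intervening values: "an" is first set
   to a^(nn/np) by repeated squaring, then the binary digits of kn select
   which successive squarings of "an" get multiplied into the seed t1.
   Each modular multiplication is carried out by one randlc() call, so
   the whole jump costs O(log(nn)) calls instead of O(nn).               */
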
double   find_my_seed( int  kn,       /* my processor rank, 0<=kn<=num procs */
                       int  np,       /* np = num procs                      */
                       long nn,       /* total num of ran numbers, all procs */
                       double s,      /* Ran num seed, for ex.: 314159265.00 */
                       double a )     /* Ran num gen mult, try 1220703125.00 */
{
      long   i;
      double t1,t2,t3,an;
      long   mq,nq,kk,ik;

      nq = nn / np;

      for( mq=0; nq>1; mq++,nq/=2 )
          ;

      t1 = a;

      for( i=1; i<=mq; i++ )
        t2 = randlc( &t1, &t1 );

      an = t1;

      kk = kn;
      t1 = s;
      t2 = an;

      for( i=1; i<=100; i++ )
      {
        ik = kk / 2;
        if( 2 * ik !=  kk )
            t3 = randlc( &t1, &t2 );
        if( ik == 0 )
            break;
        t3 = randlc( &t2, &t2 );
        kk = ik;
      }

      return( t1 );
}


/*****************************************************************/
/*************      C  R  E  A  T  E  _  S  E  Q      ************/
/*****************************************************************/

void    create_seq( global_data* gd, double seed, double a )
{
        double x;
        int    i, k;

        k = MAX_KEY/4;

        for (i=0; i<NUM_KEYS; i++)
        {
            x = randlc(&seed, &a);
            x += randlc(&seed, &a);
            x += randlc(&seed, &a);
            x += randlc(&seed, &a);

            gd->key_array[i] = k*x;
        }
}

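/* Each key is the sum of four uniform (0,1) variates scaled by MAX_KEY/4,
   i.e. an Irwin-Hall(4) sample stretched over [0, MAX_KEY).  This is what
   makes the key distribution "roughly Gaussian", as assumed above where
   SIZE_OF_BUFFERS is dimensioned.                                         */
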


/*****************************************************************/
/*************    F  U  L  L  _  V  E  R  I  F  Y     ************/
/*****************************************************************/

void full_verify( global_data* gd )
{
    MPI_Status  status;
    MPI_Request request;

    INT_TYPE    i, j;
    INT_TYPE    k, last_local_key;

/*  Now, finally, sort the keys:  */
    for( i=0; i<gd->total_local_keys; i++ )
        gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]-
                                 gd->total_lesser_keys] = gd->key_buff2[i];
    last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);

/*  Send largest key value to next processor  */
    if( gd->my_rank > 0 )
        MPI_Irecv( &k,
                   1,
                   MP_KEY_TYPE,
                   gd->my_rank-1,
                   1000,
                   MPI_COMM_WORLD,
                   &request );
    if( gd->my_rank < gd->comm_size-1 )
        MPI_Send( &gd->key_array[last_local_key],
                  1,
                  MP_KEY_TYPE,
                  gd->my_rank+1,
                  1000,
                  MPI_COMM_WORLD );
    if( gd->my_rank > 0 )
        MPI_Wait( &request, &status );

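/*  (Posting the nonblocking receive before the blocking send means every
    rank's send finds a matching receive already pending on its neighbor,
    so the rank-to-rank chain cannot deadlock even if MPI_Send does not
    buffer the message.)                                                  */
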
/*  Confirm that neighbor's greatest key value
    is not greater than my least key value       */
    j = 0;
    if( gd->my_rank > 0 && gd->total_local_keys > 0 )
        if( k > gd->key_array[0] )
            j++;


/*  Confirm keys correctly sorted: count incorrectly sorted keys, if any */
    for( i=1; i<gd->total_local_keys; i++ )
        if( gd->key_array[i-1] > gd->key_array[i] )
            j++;


    if( j != 0 )
    {
        printf( "Processor %d:  Full_verify: number of keys out of sort: %d\n",
                gd->my_rank, j );
    }
    else
        gd->passed_verification++;

}



/*****************************************************************/
/*************             R  A  N  K             ****************/
/*****************************************************************/

void rank( global_data* gd, int iteration )
{
    INT_TYPE    i, k;

    INT_TYPE    shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
    INT_TYPE    key;
    INT_TYPE2   bucket_sum_accumulator, j, m;
    INT_TYPE    local_bucket_sum_accumulator;
    INT_TYPE    min_key_val, max_key_val;
    INT_TYPE    *key_buff_ptr;


/*  Iteration alteration of keys */
    if(gd->my_rank == 0 )
    {
      gd->key_array[iteration] = iteration;
      gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
    }


/*  Initialize */
    for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )
    {
        gd->bucket_size[i] = 0;
        gd->bucket_size_totals[i] = 0;
        gd->process_bucket_distrib_ptr1[i] = 0;
        gd->process_bucket_distrib_ptr2[i] = 0;
    }


/*  Determine where the partial verify test keys are, load into  */
/*  top of array bucket_size                                     */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        if( (gd->test_index_array[i]/NUM_KEYS) == gd->my_rank )
            gd->bucket_size[NUM_BUCKETS+i] =
                          gd->key_array[gd->test_index_array[i] % NUM_KEYS];


/*  Determine the number of keys in each bucket */
    for( i=0; i<NUM_KEYS; i++ )
        gd->bucket_size[gd->key_array[i] >> shift]++;


/*  Accumulative bucket sizes are the bucket pointers */
    gd->bucket_ptrs[0] = 0;
    for( i=1; i< NUM_BUCKETS; i++ )
        gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];


/*  Sort into appropriate bucket */
    for( i=0; i<NUM_KEYS; i++ )
    {
        key = gd->key_array[i];
        gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
    }

#ifdef  TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_start(gd, 3 );
#endif

/*  Get the bucket size totals for the entire problem. These
    will be used to determine the redistribution of keys      */
    MPI_Allreduce( gd->bucket_size,
                   gd->bucket_size_totals,
                   NUM_BUCKETS+TEST_ARRAY_SIZE,
                   MP_KEY_TYPE,
                   MPI_SUM,
                   MPI_COMM_WORLD );

#ifdef  TIMING_ENABLED
    timer_stop(gd, 3 );
    timer_start(gd, 2 );
#endif

/*  Determine the redistribution of keys: accumulate the bucket size totals
    until this number surpasses NUM_KEYS (which is the average number of
    keys per processor).  Then all keys in these buckets go to processor 0.
    Continue accumulating again until surpassing 2*NUM_KEYS; all keys in
    these buckets go to processor 1, etc.  This algorithm guarantees that
    all processors have work ranking; no processors are left idle.
    Increasing the number of buckets improves the load balance (approaching
    an even distribution of keys), but also increases the computation done
    per processor, so the optimum bucket count turned out to be 1024 on the
    machines tested.
    Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket
    numbers of the first and last bucket which each processor will have
    after the redistribution is done.                                    */

    bucket_sum_accumulator = 0;
    local_bucket_sum_accumulator = 0;
    gd->send_displ[0] = 0;
    gd->process_bucket_distrib_ptr1[0] = 0;
    for( i=0, j=0; i<NUM_BUCKETS; i++ )
    {
        bucket_sum_accumulator       += gd->bucket_size_totals[i];
        local_bucket_sum_accumulator += gd->bucket_size[i];
        if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )
        {
            gd->send_count[j] = local_bucket_sum_accumulator;
            if( j != 0 )
            {
                gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
                gd->process_bucket_distrib_ptr1[j] =
                                        gd->process_bucket_distrib_ptr2[j-1]+1;
            }
            gd->process_bucket_distrib_ptr2[j++] = i;
            local_bucket_sum_accumulator = 0;
        }
    }
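
/*  Small illustration of the scan above: with comm_size = 4 and global
    bucket totals {30,40,50,20,60} against NUM_KEYS = 50, the running sum
    crosses 50 at bucket 1, 100 at bucket 2 and 150 at bucket 4, so rank 0
    gets buckets 0-1, rank 1 gets bucket 2 and rank 2 gets buckets 3-4,
    leaving rank 3 to be handled by the fix-up loop that follows.          */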

/*  When NUM_PROCS approaches NUM_BUCKETS, it is quite possible that the
    last few processors receive no buckets at all, so their counts must
    be set explicitly here to avoid fallout later on.                    */
    while( j < gd->comm_size )
    {
        gd->send_count[j] = 0;
        gd->process_bucket_distrib_ptr1[j] = 1;
        j++;
    }

#ifdef  TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_start(gd, 3 );
#endif

/*  This is the redistribution section:  first find out how many keys
    each processor will send to every other processor:                 */
    MPI_Alltoall( gd->send_count,
                  1,
                  MPI_INT,
                  gd->recv_count,
                  1,
                  MPI_INT,
                  MPI_COMM_WORLD );

/*  Determine the receive array displacements for the buckets */
    gd->recv_displ[0] = 0;
    for( i=1; i<gd->comm_size; i++ )
        gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];


/*  Now send the keys to respective processors  */
    MPI_Alltoallv( gd->key_buff1,
                   gd->send_count,
                   gd->send_displ,
                   MP_KEY_TYPE,
                   gd->key_buff2,
                   gd->recv_count,
                   gd->recv_displ,
                   MP_KEY_TYPE,
                   MPI_COMM_WORLD );

#ifdef  TIMING_ENABLED
    timer_stop(gd, 3 );
    timer_start(gd, 2 );
#endif

/*  The starting and ending bucket numbers on each processor are
    multiplied by the interval size of the buckets to obtain the
    smallest possible min and greatest possible max value of any
    key on each processor                                          */
    min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
    max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;

/*  Clear the work array */
    for( i=0; i<max_key_val-min_key_val+1; i++ )
        gd->key_buff1[i] = 0;

/*  Determine the total number of keys on all other
    processors holding keys of lesser value         */
    m = 0;
    for( k=0; k<gd->my_rank; k++ )
        for( i= gd->process_bucket_distrib_ptr1[k];
             i<=gd->process_bucket_distrib_ptr2[k];
             i++ )
            m += gd->bucket_size_totals[i]; /*  m has total # of lesser keys */

/*  Determine total number of keys on this processor */
    j = 0;
    for( i= gd->process_bucket_distrib_ptr1[gd->my_rank];
         i<=gd->process_bucket_distrib_ptr2[gd->my_rank];
         i++ )
        j += gd->bucket_size_totals[i];     /* j has total # of local keys   */


/*  Ranking of all keys occurs in this section:                 */
/*  shift it backwards so no subtractions are necessary in loop */
    key_buff_ptr = gd->key_buff1 - min_key_val;

/*  In this section, the keys themselves are used as their
    own indexes to determine how many of each there are: their
    individual population                                       */
    for( i=0; i<j; i++ )
        key_buff_ptr[gd->key_buff2[i]]++;  /* Now they have individual key   */
                                           /* population                     */

/*  To obtain ranks of each key, successively add the individual key
    population, not forgetting the total of lesser keys, m.
    NOTE: Since the total of lesser keys would be subtracted later
    in verification, it is no longer added to the first key population
    here, but is still needed during the partial verify test.  This is to
    ensure that the 32-bit key_buff can still be used for class D.        */
/*    key_buff_ptr[min_key_val] += m;    */
    for( i=min_key_val; i<max_key_val; i++ )
        key_buff_ptr[i+1] += key_buff_ptr[i];

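/*  Toy example of the two loops above: with local keys {5,3,5,7} and
    min_key_val = 3, the population counts for values 3..7 are
    {1,0,2,0,1}; after the running sum they become {1,1,3,3,4}, i.e.
    key_buff_ptr[v] = number of local keys <= v.  Adding m (the count of
    keys on lower-numbered ranks) then turns key_buff_ptr[v-1] into the
    global rank of a key of value v, which is exactly what the partial
    verify below checks.                                                 */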

/* This is the partial verify test section */
/* Observe that test_rank_array vals are   */
/* shifted differently for different cases */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
    {
        k = gd->bucket_size_totals[i+NUM_BUCKETS];    /* Keys were hidden here */
        if( min_key_val <= k  &&  k <= max_key_val )
        {
            /* Add the total of lesser keys, m, here */
            INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
            int failed = 0;

            switch( CLASS )
            {
                case 'S':
                    if( i <= 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'W':
                    if( i < 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+(iteration-2) )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'A':
                    if( i <= 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+(iteration-1) )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank !=  gd->test_rank_array[i]-(iteration-1) )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'B':
                    if( i == 1 || i == 2 || i == 4 )
                    {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'C':
                    if( i <= 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'D':
                    if( i < 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
            }
            if( failed == 1 )
                printf( "Failed partial verification: "
                        "iteration %d, processor %d, test key %d\n",
                         iteration, gd->my_rank, (int)i );
        }
    }


/*  Make copies of rank info for use by full_verify: these variables
    in rank are local; making them global slows down the code, probably
    since they cannot be made register by compiler                        */

    if( iteration == MAX_ITERATIONS )
    {
        gd->key_buff_ptr_global = key_buff_ptr;
        gd->total_local_keys    = j;
        gd->total_lesser_keys   = 0;  /* no longer set to 'm', see note above */
    }

}

/*****************************************************************/
/*************             M  A  I  N             ****************/
/*****************************************************************/

int main( int argc, char **argv )
{
    int             i, iteration, itemp;

    double          timecounter, maxtime;

    TRACE_smpi_set_category ("start");

    global_data* gd = malloc(sizeof(global_data));
/*  Initialize MPI */
    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
    MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );

/*  Initialize the verification arrays if a valid class */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        switch( CLASS )
        {
            case 'S':
                gd->test_index_array[i] = S_test_index_array[i];
                gd->test_rank_array[i]  = S_test_rank_array[i];
                break;
            case 'A':
                gd->test_index_array[i] = A_test_index_array[i];
                gd->test_rank_array[i]  = A_test_rank_array[i];
                break;
            case 'W':
                gd->test_index_array[i] = W_test_index_array[i];
                gd->test_rank_array[i]  = W_test_rank_array[i];
                break;
            case 'B':
                gd->test_index_array[i] = B_test_index_array[i];
                gd->test_rank_array[i]  = B_test_rank_array[i];
                break;
            case 'C':
                gd->test_index_array[i] = C_test_index_array[i];
                gd->test_rank_array[i]  = C_test_rank_array[i];
                break;
            case 'D':
                gd->test_index_array[i] = D_test_index_array[i];
                gd->test_rank_array[i]  = D_test_rank_array[i];
                break;
        }

/*  Printout initial NPB info */
    if( gd->my_rank == 0 )
    {
        printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
        printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS );
        printf( " Iterations:   %d\n", MAX_ITERATIONS );
        printf( " Number of processes:     %d\n",gd->comm_size );
    }

/*  Check that actual and compiled number of processors agree */
    if( gd->comm_size != NUM_PROCS )
    {
        if( gd->my_rank == 0 )
            printf( "\n ERROR: compiled for %d processes\n"
                    " Number of active processes: %d\n"
                    " Exiting program!\n\n", NUM_PROCS, gd->comm_size );
        MPI_Finalize();
        exit( 1 );
    }

/*  Check to see whether total number of processes is within bounds.
    This could in principle be checked in setparams.c, but it is more
    convenient to do it here                                               */
    if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS)
    {
       if( gd->my_rank == 0 )
           printf( "\n ERROR: number of processes %d not within range %d-%d"
                   "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS);
       MPI_Finalize();
       exit( 1 );
    }


/*  Generate random number sequence and subsequent keys on all procs */
    create_seq(gd,  find_my_seed( gd->my_rank,
                              gd->comm_size,
                              4*(long)TOTAL_KEYS*MIN_PROCS,
                              314159265.00,      /* Random number gen seed */
                              1220703125.00 ),   /* Random number gen mult */
                1220703125.00 );                 /* Random number gen mult */

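/*  The sequence length passed to find_my_seed is 4*TOTAL_KEYS*MIN_PROCS
    because create_seq consumes exactly four randlc() values per key, so
    each rank's seed sits 4*NUM_KEYS draws past its neighbor's.           */
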
/*  Do one iteration for free (i.e., untimed) to guarantee initialization of
    all data and code pages and respective tables */
    rank(gd, 1 );

/*  Start verification counter */
    gd->passed_verification = 0;

    if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n   iteration\n" );

/*  Initialize timer  */
    timer_clear(gd, 0 );

/*  Initialize separate communication, computation timing */
#ifdef  TIMING_ENABLED
    for( i=1; i<=3; i++ ) timer_clear(gd, i );
#endif

/*  Start timer  */
    timer_start(gd, 0 );

#ifdef  TIMING_ENABLED
    timer_start(gd, 1 );
    timer_start(gd, 2 );
#endif

/*  This is the main iteration */
    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
    {
        char it_str[100];
        snprintf (it_str, 100, "i%d", iteration);
        TRACE_smpi_set_category (it_str);

        if( gd->my_rank == 0 && CLASS != 'S' ) printf( "        %d\n", iteration );
        rank(gd,  iteration );
    }

    TRACE_smpi_set_category ("finalize");

#ifdef  TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_stop(gd, 1 );
#endif

/*  Stop timer, obtain time for processors */
    timer_stop(gd, 0 );

    timecounter = timer_read(gd, 0 );

/*  End of timing, obtain maximum time of all processors */
    MPI_Reduce( &timecounter,
                &maxtime,
                1,
                MPI_DOUBLE,
                MPI_MAX,
                0,
                MPI_COMM_WORLD );

#ifdef  TIMING_ENABLED
    {
        double    tmin, tsum, tmax;

        if( gd->my_rank == 0 )
        {
            printf( "\ntimer 1/2/3 = total/computation/communication time\n");
            printf( "              min                avg                max\n" );
        }
        for( i=1; i<=3; i++ )
        {
            timecounter = timer_read(gd, i );
            MPI_Reduce( &timecounter,
                        &tmin,
                        1,
                        MPI_DOUBLE,
                        MPI_MIN,
                        0,
                        MPI_COMM_WORLD );
            MPI_Reduce( &timecounter,
                        &tsum,
                        1,
                        MPI_DOUBLE,
                        MPI_SUM,
                        0,
                        MPI_COMM_WORLD );
            MPI_Reduce( &timecounter,
                        &tmax,
                        1,
                        MPI_DOUBLE,
                        MPI_MAX,
                        0,
                        MPI_COMM_WORLD );
            if( gd->my_rank == 0 )
                printf( "timer %d:    %f           %f            %f\n",
                        i, tmin, tsum/((double) gd->comm_size), tmax );
        }
        if( gd->my_rank == 0 )
            printf( "\n" );
    }
#endif


/*  This tests that keys are in sequence: sorting of last ranked key seq
    occurs here, but is an untimed operation                             */
    full_verify(gd);


/*  Obtain verification counter sum */
    itemp = gd->passed_verification;
    MPI_Reduce( &itemp,
                &gd->passed_verification,
                1,
                MPI_INT,
                MPI_SUM,
                0,
                MPI_COMM_WORLD );


/*  The final printout  */
    if( gd->my_rank == 0 )
    {
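        /*  Expected total: each of the MAX_ITERATIONS timed calls to
            rank() passes 5 partial-verification tests (spread over
            whichever ranks hold the test keys), and full_verify()
            contributes one success per rank, hence
            5*MAX_ITERATIONS + comm_size.                              */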
        if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
            gd->passed_verification = 0;
        c_print_results( "IS",
                         CLASS,
                         (int)(TOTAL_KEYS),
                         MIN_PROCS,
                         0,
                         MAX_ITERATIONS,
                         NUM_PROCS,
                         gd->comm_size,
                         maxtime,
                         ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS)
                                                      /maxtime/1000000.,
                         "keys ranked",
                         gd->passed_verification,
                         NPBVERSION,
                         COMPILETIME,
                         MPICC,
                         CLINK,
                         CMPI_LIB,
                         CMPI_INC,
                         CFLAGS,
                         CLINKFLAGS );
    }

    MPI_Finalize();
    free(gd);

    return 0;
         /**************************/
}        /*  E N D  P R O G R A M  */
         /**************************/