[simgrid.git] / examples / smpi / NAS / IS / is.c
Added our tweaked version of NAS benchmarks.

/*************************************************************************
 *                                                                       *
 *        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3       *
 *                                                                       *
 *                                  I S                                  *
 *                                                                       *
 *************************************************************************
 *                                                                       *
 *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
 *   It is described in NAS Technical Report 95-020.                     *
 *                                                                       *
 *   Permission to use, copy, distribute and modify this software        *
 *   for any purpose with or without fee is hereby granted.  We          *
 *   request, however, that all derived work reference the NAS           *
 *   Parallel Benchmarks 3.3. This software is provided "as is"          *
 *   without express or implied warranty.                                *
 *                                                                       *
 *   Information on NPB 3.3, including the technical report, the         *
 *   original specifications, source code, results and information       *
 *   on how to submit new results, is available at:                      *
 *                                                                       *
 *          http://www.nas.nasa.gov/Software/NPB                         *
 *                                                                       *
 *   Send comments or suggestions to  npb@nas.nasa.gov                   *
 *   Send bug reports to              npb-bugs@nas.nasa.gov              *
 *                                                                       *
 *         NAS Parallel Benchmarks Group                                 *
 *         NASA Ames Research Center                                     *
 *         Mail Stop: T27A-1                                             *
 *         Moffett Field, CA   94035-1000                                *
 *                                                                       *
 *         E-mail:  npb@nas.nasa.gov                                     *
 *         Fax:     (650) 604-3957                                       *
 *                                                                       *
 *************************************************************************
 *                                                                       *
 *   Author: M. Yarrow                                                   *
 *           H. Jin                                                      *
 *                                                                       *
 *************************************************************************/

#include "mpi.h"
#include "npbparams.h"
#include <stdlib.h>
#include <stdio.h>

/******************/
/* default values */
/******************/
#ifndef CLASS
#define CLASS 'S'
#define NUM_PROCS            1
#endif
#define MIN_PROCS            1


/*************/
/*  CLASS S  */
/*************/
#if CLASS == 'S'
#define  TOTAL_KEYS_LOG_2    16
#define  MAX_KEY_LOG_2       11
#define  NUM_BUCKETS_LOG_2   9
#endif


/*************/
/*  CLASS W  */
/*************/
#if CLASS == 'W'
#define  TOTAL_KEYS_LOG_2    20
#define  MAX_KEY_LOG_2       16
#define  NUM_BUCKETS_LOG_2   10
#endif

/*************/
/*  CLASS A  */
/*************/
#if CLASS == 'A'
#define  TOTAL_KEYS_LOG_2    23
#define  MAX_KEY_LOG_2       19
#define  NUM_BUCKETS_LOG_2   10
#endif


/*************/
/*  CLASS B  */
/*************/
#if CLASS == 'B'
#define  TOTAL_KEYS_LOG_2    25
#define  MAX_KEY_LOG_2       21
#define  NUM_BUCKETS_LOG_2   10
#endif


/*************/
/*  CLASS C  */
/*************/
#if CLASS == 'C'
#define  TOTAL_KEYS_LOG_2    27
#define  MAX_KEY_LOG_2       23
#define  NUM_BUCKETS_LOG_2   10
#endif


/*************/
/*  CLASS D  */
/*************/
#if CLASS == 'D'
#define  TOTAL_KEYS_LOG_2    29
#define  MAX_KEY_LOG_2       27
#define  NUM_BUCKETS_LOG_2   10
#undef   MIN_PROCS
#define  MIN_PROCS           4
#endif


#define  TOTAL_KEYS          (1 << TOTAL_KEYS_LOG_2)
#define  MAX_KEY             (1 << MAX_KEY_LOG_2)
#define  NUM_BUCKETS         (1 << NUM_BUCKETS_LOG_2)
#define  NUM_KEYS            (TOTAL_KEYS/NUM_PROCS*MIN_PROCS)
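
/* For example, compiling CLASS 'A' with NUM_PROCS == MIN_PROCS == 1:    */
/*   TOTAL_KEYS  = 2^23 = 8388608 keys in total,                         */
/*   MAX_KEY     = 2^19 = 524288  (key values lie in [0, MAX_KEY)),      */
/*   NUM_BUCKETS = 2^10 = 1024 buckets,                                  */
/*   NUM_KEYS    = 8388608 keys generated per process.                   */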

/*****************************************************************/
/* On larger numbers of processors, since the keys are (roughly) */
/* Gaussian distributed, the first and last processors sort keys */
/* in a large interval, requiring array sizes to be larger. Note */
/* that for large NUM_PROCS, NUM_KEYS is, however, a small       */
/* number. The required array size also depends on the bucket    */
/* size used. The following values are validated for the         */
/* 1024-bucket setup.                                            */
/*****************************************************************/
#if   NUM_PROCS < 256
#define  SIZE_OF_BUFFERS     3*NUM_KEYS/2
#elif NUM_PROCS < 512
#define  SIZE_OF_BUFFERS     5*NUM_KEYS/2
#elif NUM_PROCS < 1024
#define  SIZE_OF_BUFFERS     4*NUM_KEYS
#else
#define  SIZE_OF_BUFFERS     13*NUM_KEYS/2
#endif
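
/* E.g. below 256 processes the buffers hold 3*NUM_KEYS/2 entries,       */
/* i.e. 50% slack over the NUM_KEYS keys each process generates, which   */
/* absorbs the skew of the first and last processors described above.    */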

/*****************************************************************/
/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */
/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE     */
/* MAX_PROCS AT YOUR PERIL                                       */
/*****************************************************************/
#if CLASS == 'S'
#define  MAX_PROCS           128
#else
#define  MAX_PROCS           1024
#endif

#define  MAX_ITERATIONS      10
#define  TEST_ARRAY_SIZE     5


/***********************************/
/* Enable separate communication,  */
/* computation timing and printout */
/***********************************/
/* #define  TIMING_ENABLED         */


/*************************************/
/* Typedef: if necessary, change the */
/* size of int here by changing the  */
/* int type to, say, long            */
/*************************************/
typedef  int  INT_TYPE;
typedef  long INT_TYPE2;
#define MP_KEY_TYPE MPI_INT
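
/* (If INT_TYPE is widened to long, MP_KEY_TYPE must be changed to the   */
/* matching MPI datatype, e.g. MPI_LONG, so that the MPI calls below     */
/* keep transferring whole keys.)                                        */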


typedef struct {

/********************/
/* MPI properties:  */
/********************/
int      my_rank,
         comm_size;


/********************/
/* Some global info */
/********************/
INT_TYPE *key_buff_ptr_global,         /* used by full_verify to get */
         total_local_keys,             /* copies of rank info        */
         total_lesser_keys;


int      passed_verification;



/************************************/
/* These are the three main arrays. */
/* See SIZE_OF_BUFFERS def above    */
/************************************/
INT_TYPE key_array[SIZE_OF_BUFFERS],
         key_buff1[SIZE_OF_BUFFERS],
         key_buff2[SIZE_OF_BUFFERS],
         bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE],     /* Top 5 elements for */
         bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals */
         bucket_ptrs[NUM_BUCKETS],
         process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE],
         process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE];
int      send_count[MAX_PROCS], recv_count[MAX_PROCS],
         send_displ[MAX_PROCS], recv_displ[MAX_PROCS];


/**********************/
/* Partial verif info */
/**********************/
INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
         test_rank_array[TEST_ARRAY_SIZE];

/**********/
/* Timers */
/**********/
double start[64], elapsed[64];

} global_data;


const INT_TYPE2
         S_test_index_array[TEST_ARRAY_SIZE] =
                             {48427,17148,23627,62548,4431},
         S_test_rank_array[TEST_ARRAY_SIZE] =
                             {0,18,346,64917,65463},

         W_test_index_array[TEST_ARRAY_SIZE] =
                             {357773,934767,875723,898999,404505},
         W_test_rank_array[TEST_ARRAY_SIZE] =
                             {1249,11698,1039987,1043896,1048018},

         A_test_index_array[TEST_ARRAY_SIZE] =
                             {2112377,662041,5336171,3642833,4250760},
         A_test_rank_array[TEST_ARRAY_SIZE] =
                             {104,17523,123928,8288932,8388264},

         B_test_index_array[TEST_ARRAY_SIZE] =
                             {41869,812306,5102857,18232239,26860214},
         B_test_rank_array[TEST_ARRAY_SIZE] =
                             {33422937,10244,59149,33135281,99},

         C_test_index_array[TEST_ARRAY_SIZE] =
                             {44172927,72999161,74326391,129606274,21736814},
         C_test_rank_array[TEST_ARRAY_SIZE] =
                             {61147,882988,266290,133997595,133525895},

         D_test_index_array[TEST_ARRAY_SIZE] =
                             {1317351170,995930646,1157283250,1503301535,1453734525},
         D_test_rank_array[TEST_ARRAY_SIZE] =
                             {1,36538729,1978098519,2145192618,2147425337};


/***********************/
/* function prototypes */
/***********************/
double  randlc( double *X, double *A );

void full_verify( global_data* gd );

void c_print_results( char   *name,
                      char   class,
                      int    n1,
                      int    n2,
                      int    n3,
                      int    niter,
                      int    nprocs_compiled,
                      int    nprocs_total,
                      double t,
                      double mops,
                      char   *optype,
                      int    passed_verification,
                      char   *npbversion,
                      char   *compiletime,
                      char   *mpicc,
                      char   *clink,
                      char   *cmpi_lib,
                      char   *cmpi_inc,
                      char   *cflags,
                      char   *clinkflags );

void    timer_clear(global_data* gd, int n );
void    timer_start(global_data* gd, int n );
void    timer_stop(global_data* gd, int n );
double  timer_read(global_data* gd, int n );

void    timer_clear(global_data* gd, int n ) {
   gd->elapsed[n] = 0.0;
}

void    timer_start(global_data* gd, int n ) {
   gd->start[n] = MPI_Wtime();
}

void    timer_stop(global_data* gd, int n ) {
   gd->elapsed[n] += MPI_Wtime() - gd->start[n];
}

double  timer_read(global_data* gd, int n ) {
   return gd->elapsed[n];
}


/*
 *    FUNCTION RANDLC (X, A)
 *
 *  This routine returns a uniform pseudorandom double precision number in the
 *  range (0, 1) by using the linear congruential generator
 *
 *  x_{k+1} = a x_k  (mod 2^46)
 *
 *  where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
 *  before repeating.  The argument A is the same as 'a' in the above formula,
 *  and X is the same as x_0.  A and X must be odd double precision integers
 *  in the range (1, 2^46).  The returned value RANDLC is normalized to be
 *  between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
 *  the new seed x_1, so that subsequent calls to RANDLC using the same
 *  arguments will generate a continuous sequence.
 *
 *  This routine should produce the same results on any computer with at least
 *  48 mantissa bits in double precision floating point data.  On Cray systems,
 *  double precision should be disabled.
 *
 *  David H. Bailey     October 26, 1990
 *
 *     IMPLICIT DOUBLE PRECISION (A-H, O-Z)
 *     SAVE KS, R23, R46, T23, T46
 *     DATA KS/0/
 *
 *  If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
 *  T23 = 2 ^ 23, and T46 = 2 ^ 46.  These are computed in loops, rather than
 *  by merely using the ** operator, in order to insure that the results are
 *  exact on all systems.  This code assumes that 0.5D0 is represented exactly.
 */
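
/*  Typical usage, with the seed and multiplier values this benchmark
 *  actually uses (see find_my_seed and main below):
 *
 *      double x = 314159265.00;       // seed, advanced in place
 *      double a = 1220703125.00;      // multiplier
 *      double r = randlc( &x, &a );   // r in (0,1); x now holds x_1
 */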


/*****************************************************************/
/*************           R  A  N  D  L  C             ************/
/*************                                        ************/
/*************    portable random number generator    ************/
/*****************************************************************/

double  randlc( double *X, double *A )
{
      static int        KS=0;
      static double     R23, R46, T23, T46;
      double            T1, T2, T3, T4;
      double            A1;
      double            A2;
      double            X1;
      double            X2;
      double            Z;
      int               i, j;

      if (KS == 0)
      {
        R23 = 1.0;
        R46 = 1.0;
        T23 = 1.0;
        T46 = 1.0;

        for (i=1; i<=23; i++)
        {
          R23 = 0.50 * R23;
          T23 = 2.0 * T23;
        }
        for (i=1; i<=46; i++)
        {
          R46 = 0.50 * R46;
          T46 = 2.0 * T46;
        }
        KS = 1;
      }

/*  Break A into two parts such that A = 2^23 * A1 + A2 and set X = N.  */

      T1 = R23 * *A;
      j  = T1;
      A1 = j;
      A2 = *A - T23 * A1;

/*  Break X into two parts such that X = 2^23 * X1 + X2, compute
    Z = A1 * X2 + A2 * X1  (mod 2^23), and then
    X = 2^23 * Z + A2 * X2  (mod 2^46).                            */

      T1 = R23 * *X;
      j  = T1;
      X1 = j;
      X2 = *X - T23 * X1;
      T1 = A1 * X2 + A2 * X1;

      j  = R23 * T1;
      T2 = j;
      Z = T1 - T23 * T2;
      T3 = T23 * Z + A2 * X2;
      j  = R46 * T3;
      T4 = j;
      *X = T3 - T46 * T4;
      return(R46 * *X);
}


/*****************************************************************/
/************   F  I  N  D  _  M  Y  _  S  E  E  D    ************/
/************                                         ************/
/************ returns parallel random number seq seed ************/
/*****************************************************************/

/*
 * Create a random number sequence of total length nn residing
 * on np processors.  Each processor therefore has a subsequence
 * of length nn/np.  This routine returns the first random number
 * of the subsequence belonging to processor rank kn, which is then
 * used to seed that processor's own generator.
 */
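
/*
 * How it works: the first loop computes mq = log2(nn/np) (nn/np is a
 * power of two here); mq squarings of a then give an = a^(nn/np)
 * (mod 2^46); the final loop raises an to the power kn by binary
 * exponentiation.  The returned seed is thus s * a^(kn*nn/np)
 * (mod 2^46): the generator state after skipping the kn*(nn/np)
 * draws that belong to lower-ranked processors.
 */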

double   find_my_seed( int  kn,       /* my processor rank, 0<=kn<=num procs */
                       int  np,       /* np = num procs                      */
                       long nn,       /* total num of ran numbers, all procs */
                       double s,      /* Ran num seed, for ex.: 314159265.00 */
                       double a )     /* Ran num gen mult, try 1220703125.00 */
{

  long   i;

  double t1,t2,t3,an;
  long   mq,nq,kk,ik;



      nq = nn / np;

      for( mq=0; nq>1; mq++,nq/=2 )
          ;

      t1 = a;

      for( i=1; i<=mq; i++ )
        t2 = randlc( &t1, &t1 );

      an = t1;

      kk = kn;
      t1 = s;
      t2 = an;

      for( i=1; i<=100; i++ )
      {
        ik = kk / 2;
        if( 2 * ik !=  kk )
            t3 = randlc( &t1, &t2 );
        if( ik == 0 )
            break;
        t3 = randlc( &t2, &t2 );
        kk = ik;
      }

      return( t1 );

}




/*****************************************************************/
/*************      C  R  E  A  T  E  _  S  E  Q      ************/
/*****************************************************************/

void    create_seq( global_data* gd, double seed, double a )
{
        double x;
        int    i, k;

        k = MAX_KEY/4;

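        /* Each key is the sum of four uniform (0,1) draws scaled by  */
        /* k = MAX_KEY/4, which yields the (roughly) Gaussian key     */
        /* distribution assumed by the SIZE_OF_BUFFERS note above.    */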
        for (i=0; i<NUM_KEYS; i++)
        {
            x = randlc(&seed, &a);
            x += randlc(&seed, &a);
            x += randlc(&seed, &a);
            x += randlc(&seed, &a);

            gd->key_array[i] = k*x;
        }
}




/*****************************************************************/
/*************    F  U  L  L  _  V  E  R  I  F  Y     ************/
/*****************************************************************/


void full_verify( global_data* gd )
{
    MPI_Status  status;
    MPI_Request request;

    INT_TYPE    i, j;
    INT_TYPE    k, last_local_key;


/*  Now, finally, sort the keys:  */
    for( i=0; i<gd->total_local_keys; i++ )
        gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]-
                                 gd->total_lesser_keys] = gd->key_buff2[i];
    last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);

/*  Send largest key value to next processor  */
    if( gd->my_rank > 0 )
        MPI_Irecv( &k,
                   1,
                   MP_KEY_TYPE,
                   gd->my_rank-1,
                   1000,
                   MPI_COMM_WORLD,
                   &request );
    if( gd->my_rank < gd->comm_size-1 )
        MPI_Send( &gd->key_array[last_local_key],
                  1,
                  MP_KEY_TYPE,
                  gd->my_rank+1,
                  1000,
                  MPI_COMM_WORLD );
    if( gd->my_rank > 0 )
        MPI_Wait( &request, &status );
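    /* (Posting the receive nonblocking lets every rank send its boundary
       key immediately instead of completing its own receive first, so the
       exchange does not serialize along the chain of processors.)        */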

/*  Confirm that neighbor's greatest key value
    is not greater than my least key value       */
    j = 0;
    if( gd->my_rank > 0 && gd->total_local_keys > 0 )
        if( k > gd->key_array[0] )
            j++;


/*  Confirm keys correctly sorted: count incorrectly sorted keys, if any */
    for( i=1; i<gd->total_local_keys; i++ )
        if( gd->key_array[i-1] > gd->key_array[i] )
            j++;


    if( j != 0 )
    {
        printf( "Processor %d:  Full_verify: number of keys out of sort: %d\n",
                gd->my_rank, j );
    }
    else
        gd->passed_verification++;


}




/*****************************************************************/
/*************             R  A  N  K             ****************/
/*****************************************************************/


void rank( global_data* gd, int iteration )
{

    INT_TYPE    i, k;

    INT_TYPE    shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
    INT_TYPE    key;
    INT_TYPE2   bucket_sum_accumulator, j, m;
    INT_TYPE    local_bucket_sum_accumulator;
    INT_TYPE    min_key_val, max_key_val;
    INT_TYPE    *key_buff_ptr;




/*  Iteration alteration of keys */
    if( gd->my_rank == 0 )
    {
      gd->key_array[iteration] = iteration;
      gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
    }


/*  Initialize */
    for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )
    {
        gd->bucket_size[i] = 0;
        gd->bucket_size_totals[i] = 0;
        gd->process_bucket_distrib_ptr1[i] = 0;
        gd->process_bucket_distrib_ptr2[i] = 0;
    }


/*  Determine where the partial verify test keys are, load into  */
/*  top of array bucket_size                                     */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        if( (gd->test_index_array[i]/NUM_KEYS) == gd->my_rank )
            gd->bucket_size[NUM_BUCKETS+i] =
                          gd->key_array[gd->test_index_array[i] % NUM_KEYS];


/*  Determine the number of keys in each bucket */
    for( i=0; i<NUM_KEYS; i++ )
        gd->bucket_size[gd->key_array[i] >> shift]++;


/*  Accumulative bucket sizes are the bucket pointers */
    gd->bucket_ptrs[0] = 0;
    for( i=1; i< NUM_BUCKETS; i++ )
        gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];


/*  Sort into appropriate bucket */
    for( i=0; i<NUM_KEYS; i++ )
    {
        key = gd->key_array[i];
        gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
    }
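
/*  (E.g. for CLASS 'S', MAX_KEY_LOG_2 = 11 and NUM_BUCKETS_LOG_2 = 9 give
    shift = 2, so key >> shift keeps the top 9 bits of an 11-bit key and
    selects one of the 2^9 = 512 buckets.)                                 */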

#ifdef  TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_start(gd, 3 );
#endif

/*  Get the bucket size totals for the entire problem. These
    will be used to determine the redistribution of keys      */
    MPI_Allreduce( gd->bucket_size,
                   gd->bucket_size_totals,
                   NUM_BUCKETS+TEST_ARRAY_SIZE,
                   MP_KEY_TYPE,
                   MPI_SUM,
                   MPI_COMM_WORLD );

#ifdef  TIMING_ENABLED
    timer_stop(gd, 3 );
    timer_start(gd, 2 );
#endif

/*  Determine redistribution of keys: accumulate the bucket size totals
    until this number surpasses NUM_KEYS (which is the average number of
    keys per processor).  Then all keys in these buckets go to processor 0.
    Continue accumulating again until surpassing 2*NUM_KEYS; all keys
    in these buckets go to processor 1, etc.  This algorithm guarantees
    that all processors have ranking work; no processors are left idle.
    Fewer buckets would not distribute the keys (and hence the load) as
    evenly, while more buckets mean more computation per processor, so
    the optimum number of buckets turned out to be 1024 for the machines
    tested.
    Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket
    numbers of the first and last bucket which each processor will have
    after the redistribution is done.                                    */

    bucket_sum_accumulator = 0;
    local_bucket_sum_accumulator = 0;
    gd->send_displ[0] = 0;
    gd->process_bucket_distrib_ptr1[0] = 0;
    for( i=0, j=0; i<NUM_BUCKETS; i++ )
    {
        bucket_sum_accumulator       += gd->bucket_size_totals[i];
        local_bucket_sum_accumulator += gd->bucket_size[i];
        if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )
        {
            gd->send_count[j] = local_bucket_sum_accumulator;
            if( j != 0 )
            {
                gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
                gd->process_bucket_distrib_ptr1[j] =
                                        gd->process_bucket_distrib_ptr2[j-1]+1;
            }
            gd->process_bucket_distrib_ptr2[j++] = i;
            local_bucket_sum_accumulator = 0;
        }
    }

/*  When NUM_PROCS approaches NUM_BUCKETS, it is quite possible that the
    last few processors get no buckets at all, so their counts must be
    set explicitly here to avoid any fallout below.                      */
    while( j < gd->comm_size )
    {
        gd->send_count[j] = 0;
        gd->process_bucket_distrib_ptr1[j] = 1;
        j++;
    }
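    /* (process_bucket_distrib_ptr2[j] stays 0 for these idle processors,
       so their bucket ranges ptr1..ptr2 used below are empty, i.e. 1..0.) */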

#ifdef  TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_start(gd, 3 );
#endif

/*  This is the redistribution section:  first find out how many keys
    each processor will send to every other processor:                 */
    MPI_Alltoall( gd->send_count,
                  1,
                  MPI_INT,
                  gd->recv_count,
                  1,
                  MPI_INT,
                  MPI_COMM_WORLD );
    MPI_Wtime();

/*  Determine the receive array displacements for the buckets */
    gd->recv_displ[0] = 0;
    for( i=1; i<gd->comm_size; i++ )
        gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];


    MPI_Wtime();
/*  Now send the keys to respective processors  */
    MPI_Alltoallv( gd->key_buff1,
                   gd->send_count,
                   gd->send_displ,
                   MP_KEY_TYPE,
                   gd->key_buff2,
                   gd->recv_count,
                   gd->recv_displ,
                   MP_KEY_TYPE,
                   MPI_COMM_WORLD );
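
/*  (MPI_Alltoallv semantics: each rank sends send_count[k] keys, taken
    from key_buff1 starting at offset send_displ[k], to rank k; the
    blocks arriving from the other ranks land contiguously in key_buff2
    at the recv_displ offsets computed above.)                           */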

#ifdef  TIMING_ENABLED
    timer_stop(gd, 3 );
    timer_start(gd, 2 );
#endif

/*  The starting and ending bucket numbers on each processor are
    multiplied by the interval size of the buckets to obtain the
    smallest possible min and greatest possible max value of any
    key on each processor                                          */
    min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
    max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;

/*  Clear the work array */
    for( i=0; i<max_key_val-min_key_val+1; i++ )
        gd->key_buff1[i] = 0;

/*  Determine the total number of keys on all other
    processors holding keys of lesser value         */
    m = 0;
    for( k=0; k<gd->my_rank; k++ )
        for( i= gd->process_bucket_distrib_ptr1[k];
             i<=gd->process_bucket_distrib_ptr2[k];
             i++ )
            m += gd->bucket_size_totals[i]; /*  m has total # of lesser keys */

/*  Determine total number of keys on this processor */
    j = 0;
    for( i= gd->process_bucket_distrib_ptr1[gd->my_rank];
         i<=gd->process_bucket_distrib_ptr2[gd->my_rank];
         i++ )
        j += gd->bucket_size_totals[i];     /* j has total # of local keys   */


/*  Ranking of all keys occurs in this section:                 */
/*  shift it backwards so no subtractions are necessary in loop */
    key_buff_ptr = gd->key_buff1 - min_key_val;

/*  In this section, the keys themselves are used as their
    own indexes to determine how many of each there are: their
    individual population                                       */
    for( i=0; i<j; i++ )
        key_buff_ptr[gd->key_buff2[i]]++;  /* Now they have individual key   */
                                           /* population                     */

/*  To obtain ranks of each key, successively add the individual key
    population, not forgetting the total of lesser keys, m.
    NOTE: Since the total of lesser keys would be subtracted later
    in verification, it is no longer added to the first key population
    here, but is still needed during the partial verify test.  This is
    to ensure that the 32-bit key_buff can still be used for class D.    */
/*    key_buff_ptr[min_key_val] += m;    */
    for( i=min_key_val; i<max_key_val; i++ )
        key_buff_ptr[i+1] += key_buff_ptr[i];
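
/*  (After this prefix sum, key_buff_ptr[k] is the number of local keys
    with value <= k, i.e. the local rank of key value k; the global rank
    follows by adding m, the count of lesser keys held elsewhere.)       */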


/* This is the partial verify test section */
/* Observe that test_rank_array vals are   */
/* shifted differently for different cases */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
    {
        k = gd->bucket_size_totals[i+NUM_BUCKETS];    /* Keys were hidden here */
        if( min_key_val <= k  &&  k <= max_key_val )
        {
            /* Add the total of lesser keys, m, here */
            INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
            int failed = 0;

            switch( CLASS )
            {
                case 'S':
                    if( i <= 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'W':
                    if( i < 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+(iteration-2) )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'A':
                    if( i <= 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+(iteration-1) )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-(iteration-1) )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'B':
                    if( i == 1 || i == 2 || i == 4 )
                    {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'C':
                    if( i <= 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'D':
                    if( i < 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
            }
            if( failed == 1 )
                printf( "Failed partial verification: "
                        "iteration %d, processor %d, test key %d\n",
                         iteration, gd->my_rank, (int)i );
        }
    }




/*  Make copies of rank info for use by full_verify: these variables
    in rank are local; making them global slows down the code, probably
    since they cannot be made register by compiler                        */

    if( iteration == MAX_ITERATIONS )
    {
        gd->key_buff_ptr_global = key_buff_ptr;
        gd->total_local_keys    = j;
        gd->total_lesser_keys   = 0;  /* no longer set to 'm', see note above */
    }

}


/*****************************************************************/
/*************             M  A  I  N             ****************/
/*****************************************************************/

int main( int argc, char **argv )
{

    int             i, iteration, itemp;

    double          timecounter, maxtime;

    global_data* gd = malloc(sizeof(global_data));
/*  Initialize MPI */
    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
    MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );

/*  Initialize the verification arrays if a valid class */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        switch( CLASS )
        {
            case 'S':
                gd->test_index_array[i] = S_test_index_array[i];
                gd->test_rank_array[i]  = S_test_rank_array[i];
                break;
            case 'A':
                gd->test_index_array[i] = A_test_index_array[i];
                gd->test_rank_array[i]  = A_test_rank_array[i];
                break;
            case 'W':
                gd->test_index_array[i] = W_test_index_array[i];
                gd->test_rank_array[i]  = W_test_rank_array[i];
                break;
            case 'B':
                gd->test_index_array[i] = B_test_index_array[i];
                gd->test_rank_array[i]  = B_test_rank_array[i];
                break;
            case 'C':
                gd->test_index_array[i] = C_test_index_array[i];
                gd->test_rank_array[i]  = C_test_rank_array[i];
                break;
            case 'D':
                gd->test_index_array[i] = D_test_index_array[i];
                gd->test_rank_array[i]  = D_test_rank_array[i];
                break;
        }



/*  Printout initial NPB info */
    if( gd->my_rank == 0 )
    {
        printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
        printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS );
        printf( " Iterations:   %d\n", MAX_ITERATIONS );
        printf( " Number of processes:     %d\n", gd->comm_size );
    }

/*  Check that actual and compiled number of processors agree */
    if( gd->comm_size != NUM_PROCS )
    {
        if( gd->my_rank == 0 )
            printf( "\n ERROR: compiled for %d processes\n"
                    " Number of active processes: %d\n"
                    " Exiting program!\n\n", NUM_PROCS, gd->comm_size );
        MPI_Finalize();
        exit( 1 );
    }

/*  Check whether the total number of processes is within bounds.
    This could in principle be checked in setparams.c, but it is more
    convenient to do it here                                               */
    if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS )
    {
       if( gd->my_rank == 0 )
           printf( "\n ERROR: number of processes %d not within range %d-%d"
                   "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS );
       MPI_Finalize();
       exit( 1 );
    }


/*  Generate random number sequence and subsequent keys on all procs */
    create_seq(gd,  find_my_seed( gd->my_rank,
                              gd->comm_size,
                              4*(long)TOTAL_KEYS*MIN_PROCS,
                              314159265.00,      /* Random number gen seed */
                              1220703125.00 ),   /* Random number gen mult */
                1220703125.00 );                 /* Random number gen mult */
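
/*  (create_seq consumes four randlc draws per key, hence the sequence
    length 4*TOTAL_KEYS*MIN_PROCS passed to find_my_seed: each rank is
    seeded at the start of its own 4*NUM_KEYS-draw share of the stream.) */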

/*  Do one iteration for free (i.e., untimed) to guarantee initialization of
    all data and code pages and respective tables */
    rank(gd, 1 );

/*  Start verification counter */
    gd->passed_verification = 0;

    if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n   iteration\n" );

/*  Initialize timer  */
    timer_clear(gd, 0 );

/*  Initialize separate communication, computation timing */
#ifdef  TIMING_ENABLED
    for( i=1; i<=3; i++ ) timer_clear(gd, i );
#endif

/*  Start timer  */
    timer_start(gd, 0 );

#ifdef  TIMING_ENABLED
    timer_start(gd, 1 );
    timer_start(gd, 2 );
#endif

/*  This is the main iteration */
    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
    {
        if( gd->my_rank == 0 && CLASS != 'S' ) printf( "        %d\n", iteration );
        rank(gd,  iteration );
    }


#ifdef  TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_stop(gd, 1 );
#endif

/*  Stop timer, obtain time for processors */
    timer_stop(gd, 0 );

    timecounter = timer_read(gd, 0 );

/*  End of timing, obtain maximum time of all processors */
    MPI_Reduce( &timecounter,
                &maxtime,
                1,
                MPI_DOUBLE,
                MPI_MAX,
                0,
                MPI_COMM_WORLD );

#ifdef  TIMING_ENABLED
    {
        double    tmin, tsum, tmax;

        if( gd->my_rank == 0 )
        {
            printf( "\ntimer 1/2/3 = total/computation/communication time\n");
            printf( "              min                avg                max\n" );
        }
        for( i=1; i<=3; i++ )
        {
            timecounter = timer_read(gd, i );
            MPI_Reduce( &timecounter,
                        &tmin,
                        1,
                        MPI_DOUBLE,
                        MPI_MIN,
                        0,
                        MPI_COMM_WORLD );
            MPI_Reduce( &timecounter,
                        &tsum,
                        1,
                        MPI_DOUBLE,
                        MPI_SUM,
                        0,
                        MPI_COMM_WORLD );
            MPI_Reduce( &timecounter,
                        &tmax,
                        1,
                        MPI_DOUBLE,
                        MPI_MAX,
                        0,
                        MPI_COMM_WORLD );
            if( gd->my_rank == 0 )
                printf( "timer %d:    %f           %f            %f\n",
                        i, tmin, tsum/((double) gd->comm_size), tmax );
        }
        if( gd->my_rank == 0 )
            printf( "\n" );
    }
#endif

/*  This tests that keys are in sequence: sorting of last ranked key seq
    occurs here, but is an untimed operation                             */
    full_verify(gd);


/*  Obtain verification counter sum */
    itemp = gd->passed_verification;
    MPI_Reduce( &itemp,
                &gd->passed_verification,
                1,
                MPI_INT,
                MPI_SUM,
                0,
                MPI_COMM_WORLD );



/*  The final printout  */
    if( gd->my_rank == 0 )
    {
        if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
            gd->passed_verification = 0;
        c_print_results( "IS",
                         CLASS,
                         (int)(TOTAL_KEYS),
                         MIN_PROCS,
                         0,
                         MAX_ITERATIONS,
                         NUM_PROCS,
                         gd->comm_size,
                         maxtime,
                         ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS)
                                                      /maxtime/1000000.,
                         "keys ranked",
                         gd->passed_verification,
                         NPBVERSION,
                         COMPILETIME,
                         MPICC,
                         CLINK,
                         CMPI_LIB,
                         CMPI_INC,
                         CFLAGS,
                         CLINKFLAGS );
    }

    MPI_Finalize();
    free(gd);

    return 0;
         /**************************/
}        /*  E N D  P R O G R A M  */
         /**************************/