Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Solve white space conflicts
[simgrid.git] / examples / smpi / NAS / IS / is.c
1 /*************************************************************************
2  *                                                                       * 
3  *        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3       *
4  *                                                                       * 
5  *                                  I S                                  * 
6  *                                                                       * 
7  ************************************************************************* 
8  *                                                                       * 
9  *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
10  *   It is described in NAS Technical Report 95-020.                     * 
11  *                                                                       * 
12  *   Permission to use, copy, distribute and modify this software        * 
13  *   for any purpose with or without fee is hereby granted.  We          * 
14  *   request, however, that all derived work reference the NAS           * 
15  *   Parallel Benchmarks 3.3. This software is provided "as is"          *
16  *   without express or implied warranty.                                * 
17  *                                                                       * 
18  *   Information on NPB 3.3, including the technical report, the         *
19  *   original specifications, source code, results and information       * 
20  *   on how to submit new results, is available at:                      * 
21  *                                                                       * 
22  *          http://www.nas.nasa.gov/Software/NPB                         * 
23  *                                                                       * 
24  *   Send comments or suggestions to  npb@nas.nasa.gov                   * 
25  *   Send bug reports to              npb-bugs@nas.nasa.gov              * 
26  *                                                                       * 
27  *         NAS Parallel Benchmarks Group                                 * 
28  *         NASA Ames Research Center                                     * 
29  *         Mail Stop: T27A-1                                             * 
30  *         Moffett Field, CA   94035-1000                                * 
31  *                                                                       * 
32  *         E-mail:  npb@nas.nasa.gov                                     * 
33  *         Fax:     (650) 604-3957                                       * 
34  *                                                                       * 
35  ************************************************************************* 
36  *                                                                       * 
37  *   Author: M. Yarrow                                                   * 
38  *           H. Jin                                                      * 
39  *                                                                       * 
40  *************************************************************************/
41
42 #include "mpi.h"
43 #include "npbparams.h"
44 #include <stdlib.h>
45 #include <stdio.h>
46
/******************/
/* default values */
/******************/
#ifndef CLASS
#define CLASS 'S'
#define NUM_PROCS            1
#endif
#define MIN_PROCS            1


/*************/
/*  CLASS S  */
/*************/
#if CLASS == 'S'
#define  TOTAL_KEYS_LOG_2    16
#define  MAX_KEY_LOG_2       11
#define  NUM_BUCKETS_LOG_2   9
#endif


/*************/
/*  CLASS W  */
/*************/
#if CLASS == 'W'
#define  TOTAL_KEYS_LOG_2    20
#define  MAX_KEY_LOG_2       16
#define  NUM_BUCKETS_LOG_2   10
#endif

/*************/
/*  CLASS A  */
/*************/
#if CLASS == 'A'
#define  TOTAL_KEYS_LOG_2    23
#define  MAX_KEY_LOG_2       19
#define  NUM_BUCKETS_LOG_2   10
#endif


/*************/
/*  CLASS B  */
/*************/
#if CLASS == 'B'
#define  TOTAL_KEYS_LOG_2    25
#define  MAX_KEY_LOG_2       21
#define  NUM_BUCKETS_LOG_2   10
#endif


/*************/
/*  CLASS C  */
/*************/
#if CLASS == 'C'
#define  TOTAL_KEYS_LOG_2    27
#define  MAX_KEY_LOG_2       23
#define  NUM_BUCKETS_LOG_2   10
#endif


/*************/
/*  CLASS D  */
/*************/
#if CLASS == 'D'
#define  TOTAL_KEYS_LOG_2    29
#define  MAX_KEY_LOG_2       27
#define  NUM_BUCKETS_LOG_2   10
#undef   MIN_PROCS
#define  MIN_PROCS           4
#endif


#define  TOTAL_KEYS          (1 << TOTAL_KEYS_LOG_2)
#define  MAX_KEY             (1 << MAX_KEY_LOG_2)
#define  NUM_BUCKETS         (1 << NUM_BUCKETS_LOG_2)
#define  NUM_KEYS            (TOTAL_KEYS/NUM_PROCS*MIN_PROCS)

/*****************************************************************/
/* On larger number of processors, since the keys are (roughly)  */
/* gaussian distributed, the first and last processor sort keys  */
/* in a large interval, requiring array sizes to be larger. Note */
/* that for large NUM_PROCS, NUM_KEYS is, however, a small number*/
/* The required array size also depends on the bucket size used. */
/* The following values are validated for the 1024-bucket setup. */
/* Each variant is fully parenthesized so the macro expands      */
/* safely inside any surrounding expression (CERT PRE01-C).      */
/*****************************************************************/
#if   NUM_PROCS < 256
#define  SIZE_OF_BUFFERS     ((3*NUM_KEYS)/2)
#elif NUM_PROCS < 512
#define  SIZE_OF_BUFFERS     ((5*NUM_KEYS)/2)
#elif NUM_PROCS < 1024
#define  SIZE_OF_BUFFERS     (4*NUM_KEYS)
#else
#define  SIZE_OF_BUFFERS     ((13*NUM_KEYS)/2)
#endif

/*****************************************************************/
/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */
/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE     */
/* MAX_PROCS AT YOUR PERIL                                       */
/*****************************************************************/
#if CLASS == 'S'
#define  MAX_PROCS           128
#else
#define  MAX_PROCS           1024
#endif

#define  MAX_ITERATIONS      10
#define  TEST_ARRAY_SIZE     5


/***********************************/
/* Enable separate communication,  */
/* computation timing and printout */
/***********************************/
/* #define  TIMING_ENABLED         */


/*************************************/
/* Typedef: if necessary, change the */
/* size of int here by changing the  */
/* int type to, say, long            */
/*************************************/
typedef  int  INT_TYPE;
typedef  long INT_TYPE2;
#define MP_KEY_TYPE MPI_INT
171
172
/* All formerly-global benchmark state, gathered into one struct so that
   several simulated MPI processes can each carry their own copy.  A
   pointer to this struct is threaded through every function below. */
typedef struct {

/********************/
/* MPI properties:  */
/********************/
int      my_rank,        /* this process's rank in MPI_COMM_WORLD     */
         comm_size;      /* total number of processes in the job      */


/********************/
/* Some global info */
/********************/
/* NOTE: the '*' binds only to key_buff_ptr_global; the two counters
   below it are plain INT_TYPE values, not pointers. */
INT_TYPE *key_buff_ptr_global,         /* used by full_verify to get */
         total_local_keys,             /* copies of rank info        */
         total_lesser_keys;


int      passed_verification;   /* running count of verification successes */



/************************************/
/* These are the three main arrays. */
/* See SIZE_OF_BUFFERS def above    */
/************************************/
INT_TYPE key_array[SIZE_OF_BUFFERS],
         key_buff1[SIZE_OF_BUFFERS],
         key_buff2[SIZE_OF_BUFFERS],
         bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE],     /* Top 5 elements for */
         bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals */
         bucket_ptrs[NUM_BUCKETS],
         process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE],
         process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE];
/* Per-peer counts/offsets for the MPI_Alltoallv key redistribution. */
int      send_count[MAX_PROCS], recv_count[MAX_PROCS],
         send_displ[MAX_PROCS], recv_displ[MAX_PROCS];


/**********************/
/* Partial verif info */
/**********************/
INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
         test_rank_array[TEST_ARRAY_SIZE];

/**********/
/* Timers */
/**********/
double start[64], elapsed[64];   /* start stamps / accumulated seconds */

} global_data;
222
223
/* Partial-verification reference data, one pair of tables per class:
   X_test_index_array gives the positions of the five hidden test keys,
   X_test_rank_array the expected global rank of each of those keys
   (before the per-iteration +/- adjustment applied in rank()). */
const INT_TYPE2
         S_test_index_array[TEST_ARRAY_SIZE] = 
                             {48427,17148,23627,62548,4431},
         S_test_rank_array[TEST_ARRAY_SIZE] = 
                             {0,18,346,64917,65463},

         W_test_index_array[TEST_ARRAY_SIZE] = 
                             {357773,934767,875723,898999,404505},
         W_test_rank_array[TEST_ARRAY_SIZE] = 
                             {1249,11698,1039987,1043896,1048018},

         A_test_index_array[TEST_ARRAY_SIZE] = 
                             {2112377,662041,5336171,3642833,4250760},
         A_test_rank_array[TEST_ARRAY_SIZE] = 
                             {104,17523,123928,8288932,8388264},

         B_test_index_array[TEST_ARRAY_SIZE] = 
                             {41869,812306,5102857,18232239,26860214},
         B_test_rank_array[TEST_ARRAY_SIZE] = 
                             {33422937,10244,59149,33135281,99}, 

         C_test_index_array[TEST_ARRAY_SIZE] = 
                             {44172927,72999161,74326391,129606274,21736814},
         C_test_rank_array[TEST_ARRAY_SIZE] = 
                             {61147,882988,266290,133997595,133525895},

         D_test_index_array[TEST_ARRAY_SIZE] = 
                             {1317351170,995930646,1157283250,1503301535,1453734525},
         D_test_rank_array[TEST_ARRAY_SIZE] = 
                             {1,36538729,1978098519,2145192618,2147425337};
254
255
256
/***********************/
/* function prototypes */
/***********************/

/* Portable linear congruential pseudorandom generator (defined below). */
double  randlc( double *X, double *A );

/* Cross-processor final sortedness check (defined below). */
void full_verify( global_data* gd );

/* Benchmark result printout; definition is not in the visible part of
   this file (provided by the NPB common support code — verify). */
void c_print_results( char   *name,
                      char   class,
                      int    n1, 
                      int    n2,
                      int    n3,
                      int    niter,
                      int    nprocs_compiled,
                      int    nprocs_total,
                      double t,
                      double mops,
          char   *optype,
                      int    passed_verification,
                      char   *npbversion,
                      char   *compiletime,
                      char   *mpicc,
                      char   *clink,
                      char   *cmpi_lib,
                      char   *cmpi_inc,
                      char   *cflags,
                      char   *clinkflags );

/* Per-slot wall-clock timers stored in global_data (defined below). */
void    timer_clear(global_data* gd, int n );
void    timer_start(global_data* gd, int n );
void    timer_stop(global_data* gd, int n );
double  timer_read(global_data* gd, int n );
289
290 void    timer_clear(global_data* gd, int n ) {
291    gd->elapsed[n] = 0.0;
292 }
293
294 void    timer_start(global_data* gd, int n ) {
295    gd->start[n] = MPI_Wtime();
296 }
297
298 void    timer_stop(global_data* gd, int n ) {
299    gd->elapsed[n] += MPI_Wtime() - gd->start[n];
300 }
301
302 double  timer_read(global_data* gd, int n ) {
303    return gd->elapsed[n];
304 }
305
306
307 /*
308  *    FUNCTION RANDLC (X, A)
309  *
310  *  This routine returns a uniform pseudorandom double precision number in the
311  *  range (0, 1) by using the linear congruential generator
312  *
313  *  x_{k+1} = a x_k  (mod 2^46)
314  *
315  *  where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
316  *  before repeating.  The argument A is the same as 'a' in the above formula,
317  *  and X is the same as x_0.  A and X must be odd double precision integers
318  *  in the range (1, 2^46).  The returned value RANDLC is normalized to be
319  *  between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
320  *  the new seed x_1, so that subsequent calls to RANDLC using the same
321  *  arguments will generate a continuous sequence.
322  *
323  *  This routine should produce the same results on any computer with at least
324  *  48 mantissa bits in double precision floating point data.  On Cray systems,
325  *  double precision should be disabled.
326  *
327  *  David H. Bailey     October 26, 1990
328  *
329  *     IMPLICIT DOUBLE PRECISION (A-H, O-Z)
330  *     SAVE KS, R23, R46, T23, T46
331  *     DATA KS/0/
332  *
333  *  If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
334  *  T23 = 2 ^ 23, and T46 = 2 ^ 46.  These are computed in loops, rather than
335  *  by merely using the ** operator, in order to insure that the results are
336  *  exact on all systems.  This code assumes that 0.5D0 is represented exactly.
337  */
338
339
/*****************************************************************/
/*************           R  A  N  D  L  C             ************/
/*************                                        ************/
/*************    portable random number generator    ************/
/*****************************************************************/

/*  Returns a uniform pseudorandom double in (0,1) from the linear
 *  congruential recurrence  x_{k+1} = a * x_k (mod 2^46).
 *  *X is the current seed (updated in place to x_{k+1}); *A is the
 *  multiplier (read only).  Both must be odd integers in (1, 2^46).
 *  The 46-bit product is formed exactly in double precision by
 *  splitting each operand into 23-bit halves, so results are
 *  identical on any machine with >= 48 mantissa bits.
 *
 *  NOTE: the closing brace of this function had been lost in a
 *  whitespace-conflict merge; restored here.
 */
double  randlc( double *X, double *A )
{
      static int       KS = 0;
      static double    R23, R46, T23, T46;
      double    T1, T2, T3, T4;
      double    A1, A2;
      double    X1, X2;
      double    Z;
      int       i, j;

      /* One-time setup: R23 = 2^-23, R46 = 2^-46, T23 = 2^23,
         T46 = 2^46, built by repeated halving/doubling so the
         constants are exact on every system. */
      if (KS == 0)
      {
        R23 = 1.0;
        R46 = 1.0;
        T23 = 1.0;
        T46 = 1.0;

        for (i=1; i<=23; i++)
        {
          R23 = 0.50 * R23;
          T23 = 2.0 * T23;
        }
        for (i=1; i<=46; i++)
        {
          R46 = 0.50 * R46;
          T46 = 2.0 * T46;
        }
        KS = 1;
      }

/*  Break A into two parts such that A = 2^23 * A1 + A2.  */

      T1 = R23 * *A;
      j  = T1;
      A1 = j;
      A2 = *A - T23 * A1;

/*  Break X into two parts such that X = 2^23 * X1 + X2, compute
    Z = A1 * X2 + A2 * X1  (mod 2^23), and then
    X = 2^23 * Z + A2 * X2  (mod 2^46).                            */

      T1 = R23 * *X;
      j  = T1;
      X1 = j;
      X2 = *X - T23 * X1;
      T1 = A1 * X2 + A2 * X1;

      j  = R23 * T1;
      T2 = j;
      Z  = T1 - T23 * T2;
      T3 = T23 * Z + A2 * X2;
      j  = R46 * T3;
      T4 = j;
      *X = T3 - T46 * T4;

      /* Normalize the new seed into (0,1). */
      return (R46 * *X);
}
/*****************************************************************/
/************   F  I  N  D  _  M  Y  _  S  E  E  D    ************/
/************                                         ************/
/************ returns parallel random number seq seed ************/
/*****************************************************************/

/*
 * Conceptually there is one random sequence of total length nn shared
 * by np processors, each owning a contiguous chunk of nn/np numbers.
 * This routine jumps the generator ahead and returns the first random
 * number of processor kn's chunk, to be used as that processor's seed.
 */

double   find_my_seed( int  kn,       /* my processor rank, 0<=kn<=num procs */
                       int  np,       /* np = num procs                      */
                       long nn,       /* total num of ran numbers, all procs */
                       double s,      /* Ran num seed, for ex.: 314159265.00 */
                       double a )     /* Ran num gen mult, try 1220703125.00 */
{
  long   iter;
  long   steps;
  long   chunk;
  long   bits, half;
  double an;
  double seed, mult, scratch;

  /* steps = floor(log2(nn/np)): how many squarings are needed below. */
  chunk = nn / np;
  steps = 0;
  while (chunk > 1)
  {
      steps++;
      chunk /= 2;
  }

  /* an = a^(2^steps) mod 2^46, by repeated squaring through randlc. */
  an = a;
  for (iter = 1; iter <= steps; iter++)
      scratch = randlc(&an, &an);

  /* Binary powering over the bits of kn: fold the current multiplier
     into the seed whenever the low bit is set, then square it. */
  seed = s;
  mult = an;
  bits = kn;
  for (iter = 1; iter <= 100; iter++)
  {
      half = bits / 2;
      if (2 * half != bits)
          scratch = randlc(&seed, &mult);
      if (half == 0)
          break;
      scratch = randlc(&mult, &mult);
      bits = half;
  }

  return seed;
}
465
466
467
468
469 /*****************************************************************/
470 /*************      C  R  E  A  T  E  _  S  E  Q      ************/
471 /*****************************************************************/
472
473 void  create_seq( global_data* gd, double seed, double a )
474 {
475   double x;
476   int    i, k;
477
478         k = MAX_KEY/4;
479
480   for (i=0; i<NUM_KEYS; i++)
481   {
482       x = randlc(&seed, &a);
483       x += randlc(&seed, &a);
484           x += randlc(&seed, &a);
485       x += randlc(&seed, &a);  
486
487             gd->key_array[i] = k*x;
488   }
489 }
490
491
492
493
/*****************************************************************/
/*************    F  U  L  L  _  V  E  R  I  F  Y     ************/
/*****************************************************************/


/*  After the last rank() iteration, place every local key in its final
    sorted position, then confirm the global sort: each processor checks
    its own keys are nondecreasing and that its smallest key is not
    smaller than the largest key of the processor to its left.  On
    success gd->passed_verification is incremented; otherwise the number
    of out-of-order keys is printed.  */
void full_verify( global_data* gd )
{
    MPI_Status  status;
    MPI_Request request;
    
    INT_TYPE    i, j;
    INT_TYPE    k, last_local_key;

    
/*  Now, finally, sort the keys:  */
/*  key_buff_ptr_global[v] holds the running rank of key value v; the
    pre-decrement both consumes one slot for this occurrence of v and
    yields its 0-based global position, shifted into local coordinates
    by subtracting total_lesser_keys.  */
    for( i=0; i<gd->total_local_keys; i++ )
        gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]-
                                 gd->total_lesser_keys] = gd->key_buff2[i];
    /* Guard against a processor that ended up with zero keys. */
    last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);

/*  Send largest key value to next processor  */
/*  Nonblocking receive is posted before the send to avoid deadlock in
    this rank-(i) -> rank-(i+1) chain.  */
    if( gd->my_rank > 0 )
        MPI_Irecv( &k,
                   1,
                   MP_KEY_TYPE,
                   gd->my_rank-1,
                   1000,
                   MPI_COMM_WORLD,
                   &request );                   
    if( gd->my_rank < gd->comm_size-1 )
        MPI_Send( &gd->key_array[last_local_key],
                  1,
                  MP_KEY_TYPE,
                  gd->my_rank+1,
                  1000,
                  MPI_COMM_WORLD );
    if( gd->my_rank > 0 )
        MPI_Wait( &request, &status );

/*  Confirm that neighbor's greatest key value 
    is not greater than my least key value       */              
    j = 0;
    if( gd->my_rank > 0 && gd->total_local_keys > 0 )
        if( k > gd->key_array[0] )
            j++;


/*  Confirm keys correctly sorted: count incorrectly sorted keys, if any */
    for( i=1; i<gd->total_local_keys; i++ )
        if( gd->key_array[i-1] > gd->key_array[i] )
            j++;


    if( j != 0 )
    {
        printf( "Processor %d:  Full_verify: number of keys out of sort: %d\n",
                gd->my_rank, j );
    }
    else
        gd->passed_verification++;
           

}
557
558
559
560
/*****************************************************************/
/*************             R  A  N  K             ****************/
/*****************************************************************/


/*  One ranking iteration: count local keys into buckets, sum bucket
    sizes globally (MPI_Allreduce), assign contiguous bucket ranges to
    processors, redistribute the keys (MPI_Alltoall/Alltoallv), and
    compute each key's rank by prefix-summing key populations.  Runs
    the partial verification against the hidden test keys and, on the
    final iteration, saves the state full_verify() needs.  */
void rank( global_data* gd, int iteration )
{

    INT_TYPE    i, k;

    INT_TYPE    shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
    INT_TYPE    key;
    INT_TYPE2   bucket_sum_accumulator, j, m;
    INT_TYPE    local_bucket_sum_accumulator;
    INT_TYPE    min_key_val, max_key_val;
    INT_TYPE    *key_buff_ptr;




/*  Iteration alteration of keys */  
/*  Perturb two keys per iteration so each iteration ranks a slightly
    different sequence (and verification values shift with iteration). */
    if(gd->my_rank == 0 )                    
    {
      gd->key_array[iteration] = iteration;
      gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
    }


/*  Initialize */
    for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )  
    {
        gd->bucket_size[i] = 0;
        gd->bucket_size_totals[i] = 0;
        gd->process_bucket_distrib_ptr1[i] = 0;
        gd->process_bucket_distrib_ptr2[i] = 0;
    }


/*  Determine where the partial verify test keys are, load into  */
/*  top of array bucket_size                                     */
/*  (These extra slots ride along in the Allreduce below, which is how
    every processor learns the test key values.)                     */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        if( (gd->test_index_array[i]/NUM_KEYS) == gd->my_rank )
            gd->bucket_size[NUM_BUCKETS+i] = 
                          gd->key_array[gd->test_index_array[i] % NUM_KEYS];


/*  Determine the number of keys in each bucket */
/*  Bucket index is the key's high-order NUM_BUCKETS_LOG_2 bits. */
    for( i=0; i<NUM_KEYS; i++ )
        gd->bucket_size[gd->key_array[i] >> shift]++;


/*  Accumulative bucket sizes are the bucket pointers */
    gd->bucket_ptrs[0] = 0;
    for( i=1; i< NUM_BUCKETS; i++ )  
        gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];


/*  Sort into appropriate bucket */
    for( i=0; i<NUM_KEYS; i++ )  
    {
        key = gd->key_array[i];
        gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
    }

#ifdef  TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_start(gd, 3 );
#endif

/*  Get the bucket size totals for the entire problem. These 
    will be used to determine the redistribution of keys      */
    MPI_Allreduce( gd->bucket_size, 
                   gd->bucket_size_totals, 
                   NUM_BUCKETS+TEST_ARRAY_SIZE, 
                   MP_KEY_TYPE,
                   MPI_SUM,
                   MPI_COMM_WORLD );

#ifdef  TIMING_ENABLED
    timer_stop(gd, 3 );
    timer_start(gd, 2 );
#endif

/*  Determine Redistibution of keys: accumulate the bucket size totals 
    till this number surpasses NUM_KEYS (which the average number of keys
    per processor).  Then all keys in these buckets go to processor 0.
    Continue accumulating again until supassing 2*NUM_KEYS. All keys
    in these buckets go to processor 1, etc.  This algorithm guarantees
    that all processors have work ranking; no processors are left idle.
    The optimum number of buckets, however, does not result in as high
    a degree of load balancing (as even a distribution of keys as is
    possible) as is obtained from increasing the number of buckets, but
    more buckets results in more computation per processor so that the
    optimum number of buckets turns out to be 1024 for machines tested.
    Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket
    number of first and last bucket which each processor will have after   
    the redistribution is done.                                          */

    bucket_sum_accumulator = 0;
    local_bucket_sum_accumulator = 0;
    gd->send_displ[0] = 0;
    gd->process_bucket_distrib_ptr1[0] = 0;
    for( i=0, j=0; i<NUM_BUCKETS; i++ )  
    {
        bucket_sum_accumulator       += gd->bucket_size_totals[i];
        local_bucket_sum_accumulator += gd->bucket_size[i];
        if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )  
        {
            /* processor j takes all buckets up to and including i */
            gd->send_count[j] = local_bucket_sum_accumulator;
            if( j != 0 )
            {
                gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
                gd->process_bucket_distrib_ptr1[j] = 
                                        gd->process_bucket_distrib_ptr2[j-1]+1;
            }
            gd->process_bucket_distrib_ptr2[j++] = i;
            local_bucket_sum_accumulator = 0;
        }
    }

/*  When NUM_PROCS approaching NUM_BUCKETS, it is highly possible
    that the last few processors don't get any buckets.  So, we
    need to set counts properly in this case to avoid any fallouts.    */
/*  ptr1 is set to 1 while ptr2 stays 0, so ptr1 > ptr2 and the
    per-bucket loops below execute zero times for such processors.  */
    while( j < gd->comm_size )
    {
        gd->send_count[j] = 0;
        gd->process_bucket_distrib_ptr1[j] = 1;
        j++;
    }

#ifdef  TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_start(gd, 3 ); 
#endif

/*  This is the redistribution section:  first find out how many keys
    each processor will send to every other processor:                 */
    MPI_Alltoall( gd->send_count,
                  1,
                  MPI_INT,
                  gd->recv_count,
                  1,
                  MPI_INT,
                  MPI_COMM_WORLD );

/*  Determine the receive array displacements for the buckets */    
    gd->recv_displ[0] = 0;
    for( i=1; i<gd->comm_size; i++ )
        gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];


/*  Now send the keys to respective processors  */    
    MPI_Alltoallv( gd->key_buff1,
                   gd->send_count,
                   gd->send_displ,
                   MP_KEY_TYPE,
                   gd->key_buff2,
                   gd->recv_count,
                   gd->recv_displ,
                   MP_KEY_TYPE,
                   MPI_COMM_WORLD );

#ifdef  TIMING_ENABLED
    timer_stop(gd, 3 ); 
    timer_start(gd, 2 );
#endif

/*  The starting and ending bucket numbers on each processor are
    multiplied by the interval size of the buckets to obtain the 
    smallest possible min and greatest possible max value of any 
    key on each processor                                          */
    min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
    max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;

/*  Clear the work array */
    for( i=0; i<max_key_val-min_key_val+1; i++ )
        gd->key_buff1[i] = 0;

/*  Determine the total number of keys on all other 
    processors holding keys of lesser value         */
    m = 0;
    for( k=0; k<gd->my_rank; k++ )
        for( i= gd->process_bucket_distrib_ptr1[k];
             i<=gd->process_bucket_distrib_ptr2[k];
             i++ )  
            m += gd->bucket_size_totals[i]; /*  m has total # of lesser keys */

/*  Determine total number of keys on this processor */
    j = 0;                                 
    for( i= gd->process_bucket_distrib_ptr1[gd->my_rank];
         i<=gd->process_bucket_distrib_ptr2[gd->my_rank];
         i++ )  
        j += gd->bucket_size_totals[i];     /* j has total # of local keys   */


/*  Ranking of all keys occurs in this section:                 */
/*  shift it backwards so no subtractions are necessary in loop */
    key_buff_ptr = gd->key_buff1 - min_key_val;

/*  In this section, the keys themselves are used as their 
    own indexes to determine how many of each there are: their
    individual population                                       */
    for( i=0; i<j; i++ )
        key_buff_ptr[gd->key_buff2[i]]++;  /* Now they have individual key   */
                                       /* population                     */

/*  To obtain ranks of each key, successively add the individual key
    population, not forgetting the total of lesser keys, m.
    NOTE: Since the total of lesser keys would be subtracted later 
    in verification, it is no longer added to the first key population 
    here, but still needed during the partial verify test.  This is to 
    ensure that 32-bit key_buff can still be used for class D.           */
/*    key_buff_ptr[min_key_val] += m;    */
    for( i=min_key_val; i<max_key_val; i++ )   
        key_buff_ptr[i+1] += key_buff_ptr[i];  


/* This is the partial verify test section */
/* Observe that test_rank_array vals are   */
/* shifted differently for different cases */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
    {                                             
        k = gd->bucket_size_totals[i+NUM_BUCKETS];    /* Keys were hidden here */
        if( min_key_val <= k  &&  k <= max_key_val )
        {
            /* Add the total of lesser keys, m, here */
            INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
            int failed = 0;

            /* NOTE(review): no default case — CLASS is one of the
               compile-time constants 'S','W','A','B','C','D' above. */
            switch( CLASS )
            {
                case 'S':
                    if( i <= 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'W':
                    if( i < 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+(iteration-2) )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'A':
                    if( i <= 2 )
              {
                        if( key_rank != gd->test_rank_array[i]+(iteration-1) )
                            failed = 1;
                        else
                          gd->passed_verification++;
              }
                    else
                    {
                        if( key_rank !=  gd->test_rank_array[i]-(iteration-1) )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'B':
                    if( i == 1 || i == 2 || i == 4 )
              {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
              }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'C':
                    if( i <= 2 )
              {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
              }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'D':
                    if( i < 2 )
              {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
              }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
            }
            if( failed == 1 )
                printf( "Failed partial verification: "
                        "iteration %d, processor %d, test key %d\n", 
                         iteration, gd->my_rank, (int)i );
        }
    }




/*  Make copies of rank info for use by full_verify: these variables
    in rank are local; making them global slows down the code, probably
    since they cannot be made register by compiler                        */

    if( iteration == MAX_ITERATIONS ) 
    {
        gd->key_buff_ptr_global = key_buff_ptr;
        gd->total_local_keys    = j;
        gd->total_lesser_keys   = 0;  /* no longer set to 'm', see note above */
    }

}      
911
912
913 /*****************************************************************/
914 /*************             M  A  I  N             ****************/
915 /*****************************************************************/
916
917 int main( int argc, char **argv )
918 {
919
920     int             i, iteration, itemp;
921
922     double          timecounter, maxtime;
923
924     global_data* gd = malloc(sizeof(global_data));
925 /*  Initialize MPI */
926     MPI_Init( &argc, &argv );
927     MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
928     MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );
929
930 /*  Initialize the verification arrays if a valid class */
931     for( i=0; i<TEST_ARRAY_SIZE; i++ )
932         switch( CLASS )
933         {
934             case 'S':
935                 gd->test_index_array[i] = S_test_index_array[i];
936                 gd->test_rank_array[i]  = S_test_rank_array[i];
937                 break;
938             case 'A':
939                 gd->test_index_array[i] = A_test_index_array[i];
940                 gd->test_rank_array[i]  = A_test_rank_array[i];
941                 break;
942             case 'W':
943                 gd->test_index_array[i] = W_test_index_array[i];
944                 gd->test_rank_array[i]  = W_test_rank_array[i];
945                 break;
946             case 'B':
947                 gd->test_index_array[i] = B_test_index_array[i];
948                 gd->test_rank_array[i]  = B_test_rank_array[i];
949                 break;
950             case 'C':
951                 gd->test_index_array[i] = C_test_index_array[i];
952                 gd->test_rank_array[i]  = C_test_rank_array[i];
953                 break;
954             case 'D':
955                 gd->test_index_array[i] = D_test_index_array[i];
956                 gd->test_rank_array[i]  = D_test_rank_array[i];
957                 break;
958         };
959
960         
961
962 /*  Printout initial NPB info */
963     if( gd->my_rank == 0 )
964     {
965         printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
966         printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS );
967         printf( " Iterations:   %d\n", MAX_ITERATIONS );
968         printf( " Number of processes:     %d\n",gd->comm_size );
969     }
970
971 /*  Check that actual and compiled number of processors agree */
972     if( gd->comm_size != NUM_PROCS )
973     {
974         if( gd->my_rank == 0 )
975             printf( "\n ERROR: compiled for %d processes\n"
976                     " Number of active processes: %d\n"
977                     " Exiting program!\n\n", NUM_PROCS, gd->comm_size );
978         MPI_Finalize();
979         exit( 1 );
980     }
981
982 /*  Check to see whether total number of processes is within bounds.
983     This could in principle be checked in setparams.c, but it is more
984     convenient to do it here                                               */
985     if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS)
986     {
987        if( gd->my_rank == 0 )
988            printf( "\n ERROR: number of processes %d not within range %d-%d"
989                    "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS);
990        MPI_Finalize();
991        exit( 1 );
992     }
993
994
995 /*  Generate random number sequence and subsequent keys on all procs */
996     create_seq(gd,  find_my_seed( gd->my_rank, 
997                               gd->comm_size, 
998                               4*(long)TOTAL_KEYS*MIN_PROCS,
999                               314159265.00,      /* Random number gen seed */
1000                               1220703125.00 ),   /* Random number gen mult */
1001                 1220703125.00 );                 /* Random number gen mult */
1002
1003 /*  Do one interation for free (i.e., untimed) to guarantee initialization of  
1004     all data and code pages and respective tables */
1005     rank(gd, 1 );  
1006
1007 /*  Start verification counter */
1008     gd->passed_verification = 0;
1009
1010     if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n   iteration\n" );
1011
1012 /*  Initialize timer  */             
1013     timer_clear(gd, 0 );
1014
1015 /*  Initialize separate communication, computation timing */
1016 #ifdef  TIMING_ENABLED 
1017     for( i=1; i<=3; i++ ) timer_clear(gd, i );
1018 #endif
1019
1020 /*  Start timer  */             
1021     timer_start(gd, 0 );
1022
1023 #ifdef  TIMING_ENABLED
1024     timer_start(gd, 1 );
1025     timer_start(gd, 2 );
1026 #endif
1027
1028 /*  This is the main iteration */
1029     for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
1030     {
1031         if( gd->my_rank == 0 && CLASS != 'S' ) printf( "        %d\n", iteration );
1032         rank(gd,  iteration );
1033     }
1034
1035
1036 #ifdef  TIMING_ENABLED
1037     timer_stop(gd, 2 );
1038     timer_stop(gd, 1 );
1039 #endif
1040
1041 /*  Stop timer, obtain time for processors */
1042     timer_stop(gd, 0 );
1043
1044     timecounter = timer_read(gd, 0 );
1045
1046 /*  End of timing, obtain maximum time of all processors */
1047     MPI_Reduce( &timecounter,
1048                 &maxtime,
1049                 1,
1050                 MPI_DOUBLE,
1051                 MPI_MAX,
1052                 0,
1053                 MPI_COMM_WORLD );
1054
1055 #ifdef  TIMING_ENABLED
1056     {
1057         double    tmin, tsum, tmax;
1058     
1059         if( my_rank == 0 )
1060         {
1061             printf( "\ntimer 1/2/3 = total/computation/communication time\n");
1062             printf( "              min                avg                max\n" );
1063         }
1064         for( i=1; i<=3; i++ )
1065         {
1066             timecounter = timer_read(gd, i );
1067             MPI_Reduce( &timecounter,
1068                         &tmin,
1069                         1,
1070                         MPI_DOUBLE,
1071                         MPI_MIN,
1072                         0,
1073                         MPI_COMM_WORLD );
1074             MPI_Reduce( &timecounter,
1075                         &tsum,
1076                         1,
1077                         MPI_DOUBLE,
1078                         MPI_SUM,
1079                         0,
1080                         MPI_COMM_WORLD );
1081             MPI_Reduce( &timecounter,
1082                         &tmax,
1083                         1,
1084                         MPI_DOUBLE,
1085                         MPI_MAX,
1086                         0,
1087                         MPI_COMM_WORLD );
1088             if( my_rank == 0 )
1089                 printf( "timer %d:    %f           %f            %f\n",
1090                         i, tmin, tsum/((double) comm_size), tmax );
1091         }
1092         if( my_rank == 0 )
1093             printf( "\n" );
1094     }
1095 #endif
1096
1097 /*  This tests that keys are in sequence: sorting of last ranked key seq
1098     occurs here, but is an untimed operation                             */
1099     full_verify(gd);
1100
1101
1102 /*  Obtain verification counter sum */
1103     itemp =gd->passed_verification;
1104     MPI_Reduce( &itemp,
1105                 &gd->passed_verification,
1106                 1,
1107                 MPI_INT,
1108                 MPI_SUM,
1109                 0,
1110                 MPI_COMM_WORLD );
1111
1112
1113
1114 /*  The final printout  */
1115     if( gd->my_rank == 0 )
1116     {
1117         if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
1118             gd->passed_verification = 0;
1119         c_print_results( "IS",
1120                          CLASS,
1121                          (int)(TOTAL_KEYS),
1122                          MIN_PROCS,
1123                          0,
1124                          MAX_ITERATIONS,
1125                          NUM_PROCS,
1126                          gd->comm_size,
1127                          maxtime,
1128                          ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS)
1129                                                       /maxtime/1000000.,
1130                          "keys ranked", 
1131                          gd->passed_verification,
1132                          NPBVERSION,
1133                          COMPILETIME,
1134                          MPICC,
1135                          CLINK,
1136                          CMPI_LIB,
1137                          CMPI_INC,
1138                          CFLAGS,
1139                          CLINKFLAGS );
1140     }
1141                     
1142     MPI_Finalize();
1143     free(gd);
1144
1145     return 0;
1146          /**************************/
1147 }        /*  E N D  P R O G R A M  */
1148          /**************************/