/* [simgrid.git] examples/smpi/NAS/IS-trace/is-trace.c                       */
/* Instrumented (traced) version of the NPB IS benchmark for SimGrid/SMPI.   */
/*************************************************************************
 *                                                                       *
 *        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3       *
 *                                                                       *
 *                                  I S                                  *
 *                                                                       *
 *************************************************************************
 *                                                                       *
 *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
 *   It is described in NAS Technical Report 95-020.                     *
 *                                                                       *
 *   Permission to use, copy, distribute and modify this software        *
 *   for any purpose with or without fee is hereby granted.  We          *
 *   request, however, that all derived work reference the NAS           *
 *   Parallel Benchmarks 3.3. This software is provided "as is"          *
 *   without express or implied warranty.                                *
 *                                                                       *
 *   Information on NPB 3.3, including the technical report, the         *
 *   original specifications, source code, results and information       *
 *   on how to submit new results, is available at:                      *
 *                                                                       *
 *          http://www.nas.nasa.gov/Software/NPB                         *
 *                                                                       *
 *   Send comments or suggestions to  npb@nas.nasa.gov                   *
 *   Send bug reports to              npb-bugs@nas.nasa.gov              *
 *                                                                       *
 *         NAS Parallel Benchmarks Group                                 *
 *         NASA Ames Research Center                                     *
 *         Mail Stop: T27A-1                                             *
 *         Moffett Field, CA   94035-1000                                *
 *                                                                       *
 *         E-mail:  npb@nas.nasa.gov                                     *
 *         Fax:     (650) 604-3957                                       *
 *                                                                       *
 *************************************************************************
 *                                                                       *
 *   Author: M. Yarrow                                                   *
 *           H. Jin                                                      *
 *                                                                       *
 *************************************************************************/

#include "mpi.h"
#include "npbparams.h"
#include <stdlib.h>
#include <stdio.h>

#include "instr/instr.h" //TRACE_

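/* TRACE_smpi_set_category() comes from SimGrid's tracing interface: it
   tags every subsequent MPI operation of the calling rank with a named
   category, so that the "start", per-iteration ("i1", "i2", ...) and
   "finalize" phases used below can be told apart in the generated trace.
   (The header providing this call has moved between SimGrid versions;
   "instr/instr.h" is the location used by this source tree.)             */
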
/******************/
/* default values */
/******************/
#ifndef CLASS
#define CLASS 'S'
#define NUM_PROCS            1
#endif
#define MIN_PROCS            1


/*************/
/*  CLASS S  */
/*************/
#if CLASS == 'S'
#define  TOTAL_KEYS_LOG_2    16
#define  MAX_KEY_LOG_2       11
#define  NUM_BUCKETS_LOG_2   9
#endif


/*************/
/*  CLASS W  */
/*************/
#if CLASS == 'W'
#define  TOTAL_KEYS_LOG_2    20
#define  MAX_KEY_LOG_2       16
#define  NUM_BUCKETS_LOG_2   10
#endif

/*************/
/*  CLASS A  */
/*************/
#if CLASS == 'A'
#define  TOTAL_KEYS_LOG_2    23
#define  MAX_KEY_LOG_2       19
#define  NUM_BUCKETS_LOG_2   10
#endif


/*************/
/*  CLASS B  */
/*************/
#if CLASS == 'B'
#define  TOTAL_KEYS_LOG_2    25
#define  MAX_KEY_LOG_2       21
#define  NUM_BUCKETS_LOG_2   10
#endif


/*************/
/*  CLASS C  */
/*************/
#if CLASS == 'C'
#define  TOTAL_KEYS_LOG_2    27
#define  MAX_KEY_LOG_2       23
#define  NUM_BUCKETS_LOG_2   10
#endif


/*************/
/*  CLASS D  */
/*************/
#if CLASS == 'D'
#define  TOTAL_KEYS_LOG_2    29
#define  MAX_KEY_LOG_2       27
#define  NUM_BUCKETS_LOG_2   10
#undef   MIN_PROCS
#define  MIN_PROCS           4
#endif


#define  TOTAL_KEYS          (1 << TOTAL_KEYS_LOG_2)
#define  MAX_KEY             (1 << MAX_KEY_LOG_2)
#define  NUM_BUCKETS         (1 << NUM_BUCKETS_LOG_2)
#define  NUM_KEYS            (TOTAL_KEYS/NUM_PROCS*MIN_PROCS)

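/* Example of the resulting sizes: compiled for class A with
   NUM_PROCS = 16, TOTAL_KEYS = 2^23 = 8388608 and MAX_KEY = 2^19, so
   each rank generates NUM_KEYS = 8388608/16 = 524288 keys (MIN_PROCS
   is 1 for every class except D).                                       */
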
/*****************************************************************/
/* On larger numbers of processors, since the keys are (roughly) */
/* Gaussian distributed, the first and last processors sort keys */
/* in a large interval, requiring array sizes to be larger. Note */
/* that for large NUM_PROCS, NUM_KEYS is, however, a small       */
/* number. The required array size also depends on the bucket    */
/* size used. The following values are validated for the         */
/* 1024-bucket setup.                                            */
/*****************************************************************/
#if   NUM_PROCS < 256
#define  SIZE_OF_BUFFERS     3*NUM_KEYS/2
#elif NUM_PROCS < 512
#define  SIZE_OF_BUFFERS     5*NUM_KEYS/2
#elif NUM_PROCS < 1024
#define  SIZE_OF_BUFFERS     4*NUM_KEYS
#else
#define  SIZE_OF_BUFFERS     13*NUM_KEYS/2
#endif

/*****************************************************************/
/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */
/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE     */
/* MAX_PROCS AT YOUR PERIL                                       */
/*****************************************************************/
#if CLASS == 'S'
#define  MAX_PROCS           128
#else
#define  MAX_PROCS           1024
#endif

#define  MAX_ITERATIONS      10
#define  TEST_ARRAY_SIZE     5


/***********************************/
/* Enable separate communication,  */
/* computation timing and printout */
/***********************************/
/* #define  TIMING_ENABLED         */


/*************************************/
/* Typedef: if necessary, change the */
/* size of int here by changing the  */
/* int type to, say, long            */
/*************************************/
typedef  int  INT_TYPE;
typedef  long INT_TYPE2;
#define MP_KEY_TYPE MPI_INT

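/* In the stock NPB sources the fields below are file-scope globals.
   This instrumented version gathers them into a struct that main()
   allocates and threads through every routine, presumably so that each
   simulated rank keeps private state even when SMPI folds all ranks
   into a single OS process.                                             */
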
typedef struct {

/********************/
/* MPI properties:  */
/********************/
int      my_rank,
         comm_size;


/********************/
/* Some global info */
/********************/
INT_TYPE *key_buff_ptr_global,         /* used by full_verify to get */
         total_local_keys,             /* copies of rank info        */
         total_lesser_keys;


int      passed_verification;


/************************************/
/* These are the three main arrays. */
/* See SIZE_OF_BUFFERS def above    */
/************************************/
INT_TYPE key_array[SIZE_OF_BUFFERS],
         key_buff1[SIZE_OF_BUFFERS],
         key_buff2[SIZE_OF_BUFFERS],
         bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE],     /* Top 5 elements for */
         bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals */
         bucket_ptrs[NUM_BUCKETS],
         process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE],
         process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE];
int      send_count[MAX_PROCS], recv_count[MAX_PROCS],
         send_displ[MAX_PROCS], recv_displ[MAX_PROCS];


/**********************/
/* Partial verif info */
/**********************/
INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
         test_rank_array[TEST_ARRAY_SIZE];

/**********/
/* Timers */
/**********/
double start[64], elapsed[64];

} global_data;


const INT_TYPE2
         S_test_index_array[TEST_ARRAY_SIZE] =
                             {48427,17148,23627,62548,4431},
         S_test_rank_array[TEST_ARRAY_SIZE] =
                             {0,18,346,64917,65463},

         W_test_index_array[TEST_ARRAY_SIZE] =
                             {357773,934767,875723,898999,404505},
         W_test_rank_array[TEST_ARRAY_SIZE] =
                             {1249,11698,1039987,1043896,1048018},

         A_test_index_array[TEST_ARRAY_SIZE] =
                             {2112377,662041,5336171,3642833,4250760},
         A_test_rank_array[TEST_ARRAY_SIZE] =
                             {104,17523,123928,8288932,8388264},

         B_test_index_array[TEST_ARRAY_SIZE] =
                             {41869,812306,5102857,18232239,26860214},
         B_test_rank_array[TEST_ARRAY_SIZE] =
                             {33422937,10244,59149,33135281,99},

         C_test_index_array[TEST_ARRAY_SIZE] =
                             {44172927,72999161,74326391,129606274,21736814},
         C_test_rank_array[TEST_ARRAY_SIZE] =
                             {61147,882988,266290,133997595,133525895},

         D_test_index_array[TEST_ARRAY_SIZE] =
                             {1317351170,995930646,1157283250,1503301535,1453734525},
         D_test_rank_array[TEST_ARRAY_SIZE] =
                             {1,36538729,1978098519,2145192618,2147425337};


/***********************/
/* function prototypes */
/***********************/
double  randlc( double *X, double *A );

void full_verify( global_data* gd );

void c_print_results( char   *name,
                      char   class,
                      int    n1,
                      int    n2,
                      int    n3,
                      int    niter,
                      int    nprocs_compiled,
                      int    nprocs_total,
                      double t,
                      double mops,
                      char   *optype,
                      int    passed_verification,
                      char   *npbversion,
                      char   *compiletime,
                      char   *mpicc,
                      char   *clink,
                      char   *cmpi_lib,
                      char   *cmpi_inc,
                      char   *cflags,
                      char   *clinkflags );

void    timer_clear(global_data* gd, int n );
void    timer_start(global_data* gd, int n );
void    timer_stop(global_data* gd, int n );
double  timer_read(global_data* gd, int n );

void    timer_clear(global_data* gd, int n ) {
   gd->elapsed[n] = 0.0;
}

void    timer_start(global_data* gd, int n ) {
   gd->start[n] = MPI_Wtime();
}

void    timer_stop(global_data* gd, int n ) {
   gd->elapsed[n] += MPI_Wtime() - gd->start[n];
}

double  timer_read(global_data* gd, int n ) {
   return gd->elapsed[n];
}


/*
 *    FUNCTION RANDLC (X, A)
 *
 *  This routine returns a uniform pseudorandom double precision number in the
 *  range (0, 1) by using the linear congruential generator
 *
 *  x_{k+1} = a x_k  (mod 2^46)
 *
 *  where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
 *  before repeating.  The argument A is the same as 'a' in the above formula,
 *  and X is the same as x_0.  A and X must be odd double precision integers
 *  in the range (1, 2^46).  The returned value RANDLC is normalized to be
 *  between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
 *  the new seed x_1, so that subsequent calls to RANDLC using the same
 *  arguments will generate a continuous sequence.
 *
 *  This routine should produce the same results on any computer with at least
 *  48 mantissa bits in double precision floating point data.  On Cray systems,
 *  double precision should be disabled.
 *
 *  David H. Bailey     October 26, 1990
 *
 *     IMPLICIT DOUBLE PRECISION (A-H, O-Z)
 *     SAVE KS, R23, R46, T23, T46
 *     DATA KS/0/
 *
 *  If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
 *  T23 = 2 ^ 23, and T46 = 2 ^ 46.  These are computed in loops, rather than
 *  by merely using the ** operator, in order to ensure that the results are
 *  exact on all systems.  This code assumes that 0.5D0 is represented exactly.
 */


/*****************************************************************/
/*************           R  A  N  D  L  C             ************/
/*************                                        ************/
/*************    portable random number generator    ************/
/*****************************************************************/

double  randlc( double *X, double *A )
{
      static int        KS=0;
      static double     R23, R46, T23, T46;
      double            T1, T2, T3, T4;
      double            A1;
      double            A2;
      double            X1;
      double            X2;
      double            Z;
      int               i, j;

      if (KS == 0)
      {
        R23 = 1.0;
        R46 = 1.0;
        T23 = 1.0;
        T46 = 1.0;

        for (i=1; i<=23; i++)
        {
          R23 = 0.50 * R23;
          T23 = 2.0 * T23;
        }
        for (i=1; i<=46; i++)
        {
          R46 = 0.50 * R46;
          T46 = 2.0 * T46;
        }
        KS = 1;
      }

/*  Break A into two parts such that A = 2^23 * A1 + A2 and set X = N.  */

      T1 = R23 * *A;
      j  = T1;
      A1 = j;
      A2 = *A - T23 * A1;

/*  Break X into two parts such that X = 2^23 * X1 + X2, compute
    Z = A1 * X2 + A2 * X1  (mod 2^23), and then
    X = 2^23 * Z + A2 * X2  (mod 2^46).                            */

      T1 = R23 * *X;
      j  = T1;
      X1 = j;
      X2 = *X - T23 * X1;
      T1 = A1 * X2 + A2 * X1;

      j  = R23 * T1;
      T2 = j;
      Z = T1 - T23 * T2;
      T3 = T23 * Z + A2 * X2;
      j  = R46 * T3;
      T4 = j;
      *X = T3 - T46 * T4;
      return(R46 * *X);
}


/*****************************************************************/
/************   F  I  N  D  _  M  Y  _  S  E  E  D    ************/
/************                                         ************/
/************ returns parallel random number seq seed ************/
/*****************************************************************/

/*
 * Create a random number sequence of total length nn residing
 * on np number of processors.  Each processor will therefore have a
 * subsequence of length nn/np.  This routine returns the random
 * number which is the first random number for the subsequence belonging
 * to processor rank kn, and which is used as the seed for processor
 * kn's random number generator.
 */

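/* The body below effectively computes  t1 = s * a^(kn*nn/np)  (mod 2^46)
   without generating the kn*nn/np intervening values: "an" is first set
   to a^(nn/np) by repeated squaring, then the binary digits of kn select
   which successive squarings of "an" get multiplied into the seed t1.
   Each modular multiplication is carried out by one randlc() call, so
   the whole jump costs O(log(nn)) calls instead of O(nn).               */
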
double   find_my_seed( int  kn,       /* my processor rank, 0<=kn<=num procs */
                       int  np,       /* np = num procs                      */
                       long nn,       /* total num of ran numbers, all procs */
                       double s,      /* Ran num seed, for ex.: 314159265.00 */
                       double a )     /* Ran num gen mult, try 1220703125.00 */
{
      long   i;
      double t1,t2,t3,an;
      long   mq,nq,kk,ik;

      nq = nn / np;

      for( mq=0; nq>1; mq++,nq/=2 )
          ;

      t1 = a;

      for( i=1; i<=mq; i++ )
        t2 = randlc( &t1, &t1 );

      an = t1;

      kk = kn;
      t1 = s;
      t2 = an;

      for( i=1; i<=100; i++ )
      {
        ik = kk / 2;
        if( 2 * ik !=  kk )
            t3 = randlc( &t1, &t2 );
        if( ik == 0 )
            break;
        t3 = randlc( &t2, &t2 );
        kk = ik;
      }

      return( t1 );
}


/*****************************************************************/
/*************      C  R  E  A  T  E  _  S  E  Q      ************/
/*****************************************************************/

void    create_seq( global_data* gd, double seed, double a )
{
        double x;
        int    i, k;

        k = MAX_KEY/4;

        for (i=0; i<NUM_KEYS; i++)
        {
            x = randlc(&seed, &a);
            x += randlc(&seed, &a);
            x += randlc(&seed, &a);
            x += randlc(&seed, &a);

            gd->key_array[i] = k*x;
        }
}

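/* Each key is the sum of four uniform (0,1) variates scaled by MAX_KEY/4,
   i.e. an Irwin-Hall(4) sample stretched over [0, MAX_KEY).  This is what
   makes the key distribution "roughly Gaussian", as assumed above where
   SIZE_OF_BUFFERS is dimensioned.                                         */
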


/*****************************************************************/
/*************    F  U  L  L  _  V  E  R  I  F  Y     ************/
/*****************************************************************/

void full_verify( global_data* gd )
{
    MPI_Status  status;
    MPI_Request request;

    INT_TYPE    i, j;
    INT_TYPE    k, last_local_key;

/*  Now, finally, sort the keys:  */
    for( i=0; i<gd->total_local_keys; i++ )
        gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]-
                                 gd->total_lesser_keys] = gd->key_buff2[i];
    last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);

/*  Send largest key value to next processor  */
    if( gd->my_rank > 0 )
        MPI_Irecv( &k,
                   1,
                   MP_KEY_TYPE,
                   gd->my_rank-1,
                   1000,
                   MPI_COMM_WORLD,
                   &request );
    if( gd->my_rank < gd->comm_size-1 )
        MPI_Send( &gd->key_array[last_local_key],
                  1,
                  MP_KEY_TYPE,
                  gd->my_rank+1,
                  1000,
                  MPI_COMM_WORLD );
    if( gd->my_rank > 0 )
        MPI_Wait( &request, &status );

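/*  (Posting the nonblocking receive before the blocking send means every
    rank's send finds a matching receive already pending on its neighbor,
    so the rank-to-rank chain cannot deadlock even if MPI_Send does not
    buffer the message.)                                                  */
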
/*  Confirm that neighbor's greatest key value
    is not greater than my least key value       */
    j = 0;
    if( gd->my_rank > 0 && gd->total_local_keys > 0 )
        if( k > gd->key_array[0] )
            j++;


/*  Confirm keys correctly sorted: count incorrectly sorted keys, if any */
    for( i=1; i<gd->total_local_keys; i++ )
        if( gd->key_array[i-1] > gd->key_array[i] )
            j++;


    if( j != 0 )
    {
        printf( "Processor %d:  Full_verify: number of keys out of sort: %d\n",
                gd->my_rank, j );
    }
    else
        gd->passed_verification++;

}



/*****************************************************************/
/*************             R  A  N  K             ****************/
/*****************************************************************/

void rank( global_data* gd, int iteration )
{
    INT_TYPE    i, k;

    INT_TYPE    shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
    INT_TYPE    key;
    INT_TYPE2   bucket_sum_accumulator, j, m;
    INT_TYPE    local_bucket_sum_accumulator;
    INT_TYPE    min_key_val, max_key_val;
    INT_TYPE    *key_buff_ptr;


/*  Iteration alteration of keys */
    if(gd->my_rank == 0 )
    {
      gd->key_array[iteration] = iteration;
      gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
    }


/*  Initialize */
    for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )
    {
        gd->bucket_size[i] = 0;
        gd->bucket_size_totals[i] = 0;
        gd->process_bucket_distrib_ptr1[i] = 0;
        gd->process_bucket_distrib_ptr2[i] = 0;
    }


/*  Determine where the partial verify test keys are, load into  */
/*  top of array bucket_size                                     */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        if( (gd->test_index_array[i]/NUM_KEYS) == gd->my_rank )
            gd->bucket_size[NUM_BUCKETS+i] =
                          gd->key_array[gd->test_index_array[i] % NUM_KEYS];


/*  Determine the number of keys in each bucket */
    for( i=0; i<NUM_KEYS; i++ )
        gd->bucket_size[gd->key_array[i] >> shift]++;


/*  Accumulative bucket sizes are the bucket pointers */
    gd->bucket_ptrs[0] = 0;
    for( i=1; i< NUM_BUCKETS; i++ )
        gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];


/*  Sort into appropriate bucket */
    for( i=0; i<NUM_KEYS; i++ )
    {
        key = gd->key_array[i];
        gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
    }

#ifdef  TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_start(gd, 3 );
#endif

/*  Get the bucket size totals for the entire problem. These
    will be used to determine the redistribution of keys      */
    MPI_Allreduce( gd->bucket_size,
                   gd->bucket_size_totals,
                   NUM_BUCKETS+TEST_ARRAY_SIZE,
                   MP_KEY_TYPE,
                   MPI_SUM,
                   MPI_COMM_WORLD );

#ifdef  TIMING_ENABLED
    timer_stop(gd, 3 );
    timer_start(gd, 2 );
#endif

/*  Determine the redistribution of keys: accumulate the bucket size totals
    until this number surpasses NUM_KEYS (which is the average number of
    keys per processor).  Then all keys in these buckets go to processor 0.
    Continue accumulating again until surpassing 2*NUM_KEYS; all keys in
    these buckets go to processor 1, etc.  This algorithm guarantees that
    all processors have work ranking; no processors are left idle.
    Increasing the number of buckets improves the load balance (approaching
    an even distribution of keys), but also increases the computation done
    per processor, so the optimum bucket count turned out to be 1024 on the
    machines tested.
    Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket
    numbers of the first and last bucket which each processor will have
    after the redistribution is done.                                    */

    bucket_sum_accumulator = 0;
    local_bucket_sum_accumulator = 0;
    gd->send_displ[0] = 0;
    gd->process_bucket_distrib_ptr1[0] = 0;
    for( i=0, j=0; i<NUM_BUCKETS; i++ )
    {
        bucket_sum_accumulator       += gd->bucket_size_totals[i];
        local_bucket_sum_accumulator += gd->bucket_size[i];
        if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )
        {
            gd->send_count[j] = local_bucket_sum_accumulator;
            if( j != 0 )
            {
                gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
                gd->process_bucket_distrib_ptr1[j] =
                                        gd->process_bucket_distrib_ptr2[j-1]+1;
            }
            gd->process_bucket_distrib_ptr2[j++] = i;
            local_bucket_sum_accumulator = 0;
        }
    }
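
/*  Small illustration of the scan above: with comm_size = 4 and global
    bucket totals {30,40,50,20,60} against NUM_KEYS = 50, the running sum
    crosses 50 at bucket 1, 100 at bucket 2 and 150 at bucket 4, so rank 0
    gets buckets 0-1, rank 1 gets bucket 2 and rank 2 gets buckets 3-4,
    leaving rank 3 to be handled by the fix-up loop that follows.          */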

/*  When NUM_PROCS approaches NUM_BUCKETS, it is quite possible that the
    last few processors receive no buckets at all, so their counts must
    be set explicitly here to avoid fallout later on.                    */
    while( j < gd->comm_size )
    {
        gd->send_count[j] = 0;
        gd->process_bucket_distrib_ptr1[j] = 1;
        j++;
    }

#ifdef  TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_start(gd, 3 );
#endif

/*  This is the redistribution section:  first find out how many keys
    each processor will send to every other processor:                 */
    MPI_Alltoall( gd->send_count,
                  1,
                  MPI_INT,
                  gd->recv_count,
                  1,
                  MPI_INT,
                  MPI_COMM_WORLD );

/*  Determine the receive array displacements for the buckets */
    gd->recv_displ[0] = 0;
    for( i=1; i<gd->comm_size; i++ )
        gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];


/*  Now send the keys to respective processors  */
    MPI_Alltoallv( gd->key_buff1,
                   gd->send_count,
                   gd->send_displ,
                   MP_KEY_TYPE,
                   gd->key_buff2,
                   gd->recv_count,
                   gd->recv_displ,
                   MP_KEY_TYPE,
                   MPI_COMM_WORLD );

#ifdef  TIMING_ENABLED
    timer_stop(gd, 3 );
    timer_start(gd, 2 );
#endif

/*  The starting and ending bucket numbers on each processor are
    multiplied by the interval size of the buckets to obtain the
    smallest possible min and greatest possible max value of any
    key on each processor                                          */
    min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
    max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;

/*  Clear the work array */
    for( i=0; i<max_key_val-min_key_val+1; i++ )
        gd->key_buff1[i] = 0;

/*  Determine the total number of keys on all other
    processors holding keys of lesser value         */
    m = 0;
    for( k=0; k<gd->my_rank; k++ )
        for( i= gd->process_bucket_distrib_ptr1[k];
             i<=gd->process_bucket_distrib_ptr2[k];
             i++ )
            m += gd->bucket_size_totals[i]; /*  m has total # of lesser keys */

/*  Determine total number of keys on this processor */
    j = 0;
    for( i= gd->process_bucket_distrib_ptr1[gd->my_rank];
         i<=gd->process_bucket_distrib_ptr2[gd->my_rank];
         i++ )
        j += gd->bucket_size_totals[i];     /* j has total # of local keys   */


/*  Ranking of all keys occurs in this section:                 */
/*  shift it backwards so no subtractions are necessary in loop */
    key_buff_ptr = gd->key_buff1 - min_key_val;

/*  In this section, the keys themselves are used as their
    own indexes to determine how many of each there are: their
    individual population                                       */
    for( i=0; i<j; i++ )
        key_buff_ptr[gd->key_buff2[i]]++;  /* Now they have individual key   */
                                           /* population                     */

/*  To obtain ranks of each key, successively add the individual key
    population, not forgetting the total of lesser keys, m.
    NOTE: Since the total of lesser keys would be subtracted later
    in verification, it is no longer added to the first key population
    here, but is still needed during the partial verify test.  This is to
    ensure that the 32-bit key_buff can still be used for class D.        */
/*    key_buff_ptr[min_key_val] += m;    */
    for( i=min_key_val; i<max_key_val; i++ )
        key_buff_ptr[i+1] += key_buff_ptr[i];

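/*  Toy example of the two loops above: with local keys {5,3,5,7} and
    min_key_val = 3, the population counts for values 3..7 are
    {1,0,2,0,1}; after the running sum they become {1,1,3,3,4}, i.e.
    key_buff_ptr[v] = number of local keys <= v.  Adding m (the count of
    keys on lower-numbered ranks) then turns key_buff_ptr[v-1] into the
    global rank of a key of value v, which is exactly what the partial
    verify below checks.                                                 */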

/* This is the partial verify test section */
/* Observe that test_rank_array vals are   */
/* shifted differently for different cases */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
    {
        k = gd->bucket_size_totals[i+NUM_BUCKETS];    /* Keys were hidden here */
        if( min_key_val <= k  &&  k <= max_key_val )
        {
            /* Add the total of lesser keys, m, here */
            INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
            int failed = 0;

            switch( CLASS )
            {
                case 'S':
                    if( i <= 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'W':
                    if( i < 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+(iteration-2) )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'A':
                    if( i <= 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+(iteration-1) )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank !=  gd->test_rank_array[i]-(iteration-1) )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'B':
                    if( i == 1 || i == 2 || i == 4 )
                    {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'C':
                    if( i <= 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'D':
                    if( i < 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
            }
            if( failed == 1 )
                printf( "Failed partial verification: "
                        "iteration %d, processor %d, test key %d\n",
                         iteration, gd->my_rank, (int)i );
        }
    }


/*  Make copies of rank info for use by full_verify: these variables
    in rank are local; making them global slows down the code, probably
    since they cannot be made register by compiler                        */

    if( iteration == MAX_ITERATIONS )
    {
        gd->key_buff_ptr_global = key_buff_ptr;
        gd->total_local_keys    = j;
        gd->total_lesser_keys   = 0;  /* no longer set to 'm', see note above */
    }

}

/*****************************************************************/
/*************             M  A  I  N             ****************/
/*****************************************************************/

int main( int argc, char **argv )
{
    int             i, iteration, itemp;

    double          timecounter, maxtime;

    TRACE_smpi_set_category ("start");

    global_data* gd = malloc(sizeof(global_data));
/*  Initialize MPI */
    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
    MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );

/*  Initialize the verification arrays if a valid class */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        switch( CLASS )
        {
            case 'S':
                gd->test_index_array[i] = S_test_index_array[i];
                gd->test_rank_array[i]  = S_test_rank_array[i];
                break;
            case 'A':
                gd->test_index_array[i] = A_test_index_array[i];
                gd->test_rank_array[i]  = A_test_rank_array[i];
                break;
            case 'W':
                gd->test_index_array[i] = W_test_index_array[i];
                gd->test_rank_array[i]  = W_test_rank_array[i];
                break;
            case 'B':
                gd->test_index_array[i] = B_test_index_array[i];
                gd->test_rank_array[i]  = B_test_rank_array[i];
                break;
            case 'C':
                gd->test_index_array[i] = C_test_index_array[i];
                gd->test_rank_array[i]  = C_test_rank_array[i];
                break;
            case 'D':
                gd->test_index_array[i] = D_test_index_array[i];
                gd->test_rank_array[i]  = D_test_rank_array[i];
                break;
        }

/*  Printout initial NPB info */
    if( gd->my_rank == 0 )
    {
        printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
        printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS );
        printf( " Iterations:   %d\n", MAX_ITERATIONS );
        printf( " Number of processes:     %d\n",gd->comm_size );
    }

/*  Check that actual and compiled number of processors agree */
    if( gd->comm_size != NUM_PROCS )
    {
        if( gd->my_rank == 0 )
            printf( "\n ERROR: compiled for %d processes\n"
                    " Number of active processes: %d\n"
                    " Exiting program!\n\n", NUM_PROCS, gd->comm_size );
        MPI_Finalize();
        exit( 1 );
    }

/*  Check to see whether total number of processes is within bounds.
    This could in principle be checked in setparams.c, but it is more
    convenient to do it here                                               */
    if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS)
    {
       if( gd->my_rank == 0 )
           printf( "\n ERROR: number of processes %d not within range %d-%d"
                   "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS);
       MPI_Finalize();
       exit( 1 );
    }


/*  Generate random number sequence and subsequent keys on all procs */
    create_seq(gd,  find_my_seed( gd->my_rank,
                              gd->comm_size,
                              4*(long)TOTAL_KEYS*MIN_PROCS,
                              314159265.00,      /* Random number gen seed */
                              1220703125.00 ),   /* Random number gen mult */
                1220703125.00 );                 /* Random number gen mult */

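/*  The sequence length passed to find_my_seed is 4*TOTAL_KEYS*MIN_PROCS
    because create_seq consumes exactly four randlc() values per key, so
    each rank's seed sits 4*NUM_KEYS draws past its neighbor's.           */
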
/*  Do one iteration for free (i.e., untimed) to guarantee initialization of
    all data and code pages and respective tables */
    rank(gd, 1 );

/*  Start verification counter */
    gd->passed_verification = 0;

    if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n   iteration\n" );

/*  Initialize timer  */
    timer_clear(gd, 0 );

/*  Initialize separate communication, computation timing */
#ifdef  TIMING_ENABLED
    for( i=1; i<=3; i++ ) timer_clear(gd, i );
#endif

/*  Start timer  */
    timer_start(gd, 0 );

#ifdef  TIMING_ENABLED
    timer_start(gd, 1 );
    timer_start(gd, 2 );
#endif

/*  This is the main iteration */
    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
    {
        char it_str[100];
        snprintf (it_str, 100, "i%d", iteration);
        TRACE_smpi_set_category (it_str);

        if( gd->my_rank == 0 && CLASS != 'S' ) printf( "        %d\n", iteration );
        rank(gd,  iteration );
    }

    TRACE_smpi_set_category ("finalize");

#ifdef  TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_stop(gd, 1 );
#endif

/*  Stop timer, obtain time for processors */
    timer_stop(gd, 0 );

    timecounter = timer_read(gd, 0 );

/*  End of timing, obtain maximum time of all processors */
    MPI_Reduce( &timecounter,
                &maxtime,
                1,
                MPI_DOUBLE,
                MPI_MAX,
                0,
                MPI_COMM_WORLD );

#ifdef  TIMING_ENABLED
    {
        double    tmin, tsum, tmax;

        if( gd->my_rank == 0 )
        {
            printf( "\ntimer 1/2/3 = total/computation/communication time\n");
            printf( "              min                avg                max\n" );
        }
        for( i=1; i<=3; i++ )
        {
            timecounter = timer_read(gd, i );
            MPI_Reduce( &timecounter,
                        &tmin,
                        1,
                        MPI_DOUBLE,
                        MPI_MIN,
                        0,
                        MPI_COMM_WORLD );
            MPI_Reduce( &timecounter,
                        &tsum,
                        1,
                        MPI_DOUBLE,
                        MPI_SUM,
                        0,
                        MPI_COMM_WORLD );
            MPI_Reduce( &timecounter,
                        &tmax,
                        1,
                        MPI_DOUBLE,
                        MPI_MAX,
                        0,
                        MPI_COMM_WORLD );
            if( gd->my_rank == 0 )
                printf( "timer %d:    %f           %f            %f\n",
                        i, tmin, tsum/((double) gd->comm_size), tmax );
        }
        if( gd->my_rank == 0 )
            printf( "\n" );
    }
#endif


/*  This tests that keys are in sequence: sorting of last ranked key seq
    occurs here, but is an untimed operation                             */
    full_verify(gd);


/*  Obtain verification counter sum */
    itemp = gd->passed_verification;
    MPI_Reduce( &itemp,
                &gd->passed_verification,
                1,
                MPI_INT,
                MPI_SUM,
                0,
                MPI_COMM_WORLD );


/*  The final printout  */
    if( gd->my_rank == 0 )
    {
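        /*  Expected total: each of the MAX_ITERATIONS timed calls to
            rank() passes 5 partial-verification tests (spread over
            whichever ranks hold the test keys), and full_verify()
            contributes one success per rank, hence
            5*MAX_ITERATIONS + comm_size.                              */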
        if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
            gd->passed_verification = 0;
        c_print_results( "IS",
                         CLASS,
                         (int)(TOTAL_KEYS),
                         MIN_PROCS,
                         0,
                         MAX_ITERATIONS,
                         NUM_PROCS,
                         gd->comm_size,
                         maxtime,
                         ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS)
                                                      /maxtime/1000000.,
                         "keys ranked",
                         gd->passed_verification,
                         NPBVERSION,
                         COMPILETIME,
                         MPICC,
                         CLINK,
                         CMPI_LIB,
                         CMPI_INC,
                         CFLAGS,
                         CLINKFLAGS );
    }

    MPI_Finalize();
    free(gd);

    return 0;
         /**************************/
}        /*  E N D  P R O G R A M  */
         /**************************/