/*************************************************************************
 *                                                                       *
 *       N  A  S     P A R A L L E L     B E N C H M A R K S   3.3       *
 *                                                                       *
 *                                I S                                    *
 *                                                                       *
 *************************************************************************
 *                                                                       *
 *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
 *   It is described in NAS Technical Report 95-020.                     *
 *                                                                       *
 *   Permission to use, copy, distribute and modify this software        *
 *   for any purpose with or without fee is hereby granted.  We          *
 *   request, however, that all derived work reference the NAS           *
 *   Parallel Benchmarks 3.3. This software is provided "as is"          *
 *   without express or implied warranty.                                *
 *                                                                       *
 *   Information on NPB 3.3, including the technical report, the         *
 *   original specifications, source code, results and information       *
 *   on how to submit new results, is available at:                      *
 *                                                                       *
 *          http://www.nas.nasa.gov/Software/NPB                         *
 *                                                                       *
 *   Send comments or suggestions to  npb@nas.nasa.gov                   *
 *   Send bug reports to              npb-bugs@nas.nasa.gov              *
 *                                                                       *
 *         NAS Parallel Benchmarks Group                                 *
 *         NASA Ames Research Center                                     *
 *         Moffett Field, CA   94035-1000                                *
 *                                                                       *
 *         E-mail:  npb@nas.nasa.gov                                     *
 *         Fax:     (650) 604-3957                                       *
 *                                                                       *
 *************************************************************************/
#include "npbparams.h"
#include <stdlib.h>
#include <stdio.h>
#include "mpi.h"
#include "instr/instr.h" /* TRACE_ */
/******************/
/* default values */
/******************/
#ifndef CLASS
#define CLASS 'S'
#define NUM_PROCS 1
#endif
#define MIN_PROCS 1

#if   CLASS == 'S'                  /* CLASS S */
#define TOTAL_KEYS_LOG_2  16
#define MAX_KEY_LOG_2     11
#define NUM_BUCKETS_LOG_2 9
#elif CLASS == 'W'                  /* CLASS W */
#define TOTAL_KEYS_LOG_2  20
#define MAX_KEY_LOG_2     16
#define NUM_BUCKETS_LOG_2 10
#elif CLASS == 'A'                  /* CLASS A */
#define TOTAL_KEYS_LOG_2  23
#define MAX_KEY_LOG_2     19
#define NUM_BUCKETS_LOG_2 10
#elif CLASS == 'B'                  /* CLASS B */
#define TOTAL_KEYS_LOG_2  25
#define MAX_KEY_LOG_2     21
#define NUM_BUCKETS_LOG_2 10
#elif CLASS == 'C'                  /* CLASS C */
#define TOTAL_KEYS_LOG_2  27
#define MAX_KEY_LOG_2     23
#define NUM_BUCKETS_LOG_2 10
#elif CLASS == 'D'                  /* CLASS D */
#define TOTAL_KEYS_LOG_2  29
#define MAX_KEY_LOG_2     27
#define NUM_BUCKETS_LOG_2 10
#undef  MIN_PROCS
#define MIN_PROCS 4
#endif
#define TOTAL_KEYS  (1 << TOTAL_KEYS_LOG_2)
#define MAX_KEY     (1 << MAX_KEY_LOG_2)
#define NUM_BUCKETS (1 << NUM_BUCKETS_LOG_2)
#define NUM_KEYS    (TOTAL_KEYS/NUM_PROCS*MIN_PROCS)
/*****************************************************************/
/* On larger numbers of processors, since the keys are (roughly) */
/* gaussian distributed, the first and last processors sort keys */
/* in a large interval, requiring array sizes to be larger. Note */
/* that for large NUM_PROCS, NUM_KEYS is nevertheless small.     */
/* The required array size also depends on the bucket size used. */
/* The following values are validated for the 1024-bucket setup. */
/*****************************************************************/
#if   NUM_PROCS < 256
#define SIZE_OF_BUFFERS 3*NUM_KEYS/2
#elif NUM_PROCS < 512
#define SIZE_OF_BUFFERS 5*NUM_KEYS/2
#elif NUM_PROCS < 1024
#define SIZE_OF_BUFFERS 4*NUM_KEYS
#else
#define SIZE_OF_BUFFERS 13*NUM_KEYS/2
#endif
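/* Worked example (illustrative): for class A (TOTAL_KEYS = 2^23) on
   NUM_PROCS = 16 with MIN_PROCS = 1, NUM_KEYS = 2^23/16 = 524288 keys
   per process and SIZE_OF_BUFFERS = 3*524288/2 = 786432 slots; the
   extra room absorbs the skew of the first and last processes. */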
/*****************************************************************/
/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */
/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE     */
/* MAX_PROCS AT YOUR PERIL                                       */
/*****************************************************************/
#if CLASS == 'S'
#define MAX_PROCS 128
#else
#define MAX_PROCS 1024
#endif
#define MAX_ITERATIONS  10
#define TEST_ARRAY_SIZE 5
/***********************************/
/* Enable separate communication,  */
/* computation timing and printout */
/***********************************/
/* #define TIMING_ENABLED */
/*************************************/
/* Typedef: if necessary, change the */
/* size of int here by changing the  */
/* int type to, say, long            */
/*************************************/
typedef int  INT_TYPE;
typedef long INT_TYPE2;
#define MP_KEY_TYPE MPI_INT
/*******************************************************************/
/* All formerly-global state lives in one struct that is passed    */
/* around explicitly, so several instances can coexist in a single */
/* process (e.g. when run under SMPI).                             */
/*******************************************************************/
typedef struct {

/********************/
/* MPI properties:  */
/********************/
int my_rank,
    comm_size;

/********************/
/* Some global info */
/********************/
INT_TYPE *key_buff_ptr_global,       /* used by full_verify to get */
         total_local_keys,           /* copies of rank info        */
         total_lesser_keys;
int passed_verification;

/************************************/
/* These are the three main arrays. */
/* See SIZE_OF_BUFFERS def above    */
/************************************/
INT_TYPE key_array[SIZE_OF_BUFFERS],
         key_buff1[SIZE_OF_BUFFERS],
         key_buff2[SIZE_OF_BUFFERS],
         bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE],        /* Top 5 elements for */
         bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals    */
         bucket_ptrs[NUM_BUCKETS],
         process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE],
         process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE];
int      send_count[MAX_PROCS], recv_count[MAX_PROCS],
         send_displ[MAX_PROCS], recv_displ[MAX_PROCS];

/**********************/
/* Partial verif info */
/**********************/
INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
          test_rank_array[TEST_ARRAY_SIZE];

/**********************/
/* Timers             */
/**********************/
double start[64], elapsed[64];

} global_data;
INT_TYPE2 S_test_index_array[TEST_ARRAY_SIZE] =
                             {48427,17148,23627,62548,4431},
          S_test_rank_array[TEST_ARRAY_SIZE] =
                             {0,18,346,64917,65463},

          W_test_index_array[TEST_ARRAY_SIZE] =
                             {357773,934767,875723,898999,404505},
          W_test_rank_array[TEST_ARRAY_SIZE] =
                             {1249,11698,1039987,1043896,1048018},

          A_test_index_array[TEST_ARRAY_SIZE] =
                             {2112377,662041,5336171,3642833,4250760},
          A_test_rank_array[TEST_ARRAY_SIZE] =
                             {104,17523,123928,8288932,8388264},

          B_test_index_array[TEST_ARRAY_SIZE] =
                             {41869,812306,5102857,18232239,26860214},
          B_test_rank_array[TEST_ARRAY_SIZE] =
                             {33422937,10244,59149,33135281,99},

          C_test_index_array[TEST_ARRAY_SIZE] =
                             {44172927,72999161,74326391,129606274,21736814},
          C_test_rank_array[TEST_ARRAY_SIZE] =
                             {61147,882988,266290,133997595,133525895},

          D_test_index_array[TEST_ARRAY_SIZE] =
                             {1317351170,995930646,1157283250,1503301535,1453734525},
          D_test_rank_array[TEST_ARRAY_SIZE] =
                             {1,36538729,1978098519,2145192618,2147425337};
/***********************/
/* function prototypes */
/***********************/
double randlc( double *X, double *A );

double find_my_seed( int kn, int np, long nn, double s, double a );

void create_seq( global_data* gd, double seed, double a );

void rank( global_data* gd, int iteration );

void full_verify( global_data* gd );

void c_print_results( char *name, char class, int n1, int n2, int n3,
                      int niter, int nprocs_compiled, int nprocs_total,
                      double t, double mops, char *optype,
                      int passed_verification, char *npbversion,
                      char *compiletime, char *mpicc, char *clink,
                      char *cmpi_lib, char *cmpi_inc, char *cflags,
                      char *clinkflags );

void timer_clear( global_data* gd, int n );
void timer_start( global_data* gd, int n );
void timer_stop( global_data* gd, int n );
double timer_read( global_data* gd, int n );
void timer_clear( global_data* gd, int n )
{
    gd->elapsed[n] = 0.0;
}

void timer_start( global_data* gd, int n )
{
    gd->start[n] = MPI_Wtime();
}

void timer_stop( global_data* gd, int n )
{
    gd->elapsed[n] += MPI_Wtime() - gd->start[n];
}

double timer_read( global_data* gd, int n )
{
    return gd->elapsed[n];
}
/*
 *    FUNCTION RANDLC (X, A)
 *
 *  This routine returns a uniform pseudorandom double precision number in the
 *  range (0, 1) by using the linear congruential generator
 *
 *      x_{k+1} = a x_k  (mod 2^46)
 *
 *  where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
 *  before repeating. The argument A is the same as 'a' in the above formula,
 *  and X is the same as x_0. A and X must be odd double precision integers
 *  in the range (1, 2^46). The returned value RANDLC is normalized to be
 *  between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
 *  the new seed x_1, so that subsequent calls to RANDLC using the same
 *  arguments will generate a continuous sequence.
 *
 *  This routine should produce the same results on any computer with at least
 *  48 mantissa bits in double precision floating point data. On Cray systems,
 *  double precision should be disabled.
 *
 *  David H. Bailey     October 26, 1990
 *
 *      IMPLICIT DOUBLE PRECISION (A-H, O-Z)
 *      SAVE KS, R23, R46, T23, T46
 *
 *  If this is the first call to RANDLC, compute R23 = 2^-23, R46 = 2^-46,
 *  T23 = 2^23, and T46 = 2^46. These are computed in loops, rather than
 *  by merely using the ** operator, in order to ensure that the results are
 *  exact on all systems. This code assumes that 0.5D0 is represented exactly.
 */
/*****************************************************************/
/*************           R  A  N  D  L  C             ************/
/*************                                        ************/
/*************    portable random number generator    ************/
/*****************************************************************/
double randlc( double *X, double *A )
{
    static int    KS = 0;
    static double R23, R46, T23, T46;
    double T1, T2, T3, T4, A1, A2, X1, X2, Z;
    int    i;

    if (KS == 0)          /* first call: compute the constants exactly */
    {
        R23 = R46 = T23 = T46 = 1.0;
        for (i=1; i<=23; i++) { R23 *= 0.50; T23 *= 2.0; }
        for (i=1; i<=46; i++) { R46 *= 0.50; T46 *= 2.0; }
        KS = 1;
    }

/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */
    A1 = (int)(R23 * *A);
    A2 = *A - T23 * A1;

/* Break X into two parts such that X = 2^23 * X1 + X2, compute
   Z = A1 * X2 + A2 * X1 (mod 2^23), and then
   X = 2^23 * Z + A2 * X2 (mod 2^46). */
    X1 = (int)(R23 * *X);
    X2 = *X - T23 * X1;
    T1 = A1 * X2 + A2 * X1;
    T2 = (int)(R23 * T1);
    Z  = T1 - T23 * T2;
    T3 = T23 * Z + A2 * X2;
    T4 = (int)(R46 * T3);
    *X = T3 - T46 * T4;

    return (R46 * *X);
}
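/* Usage sketch (illustrative only, not part of the benchmark): the seed
   and multiplier below are the values used later in this file. */
#if 0
double seed = 314159265.00, mult = 1220703125.00;
double r1 = randlc( &seed, &mult );  /* r1 in (0,1); seed now holds x_1 */
double r2 = randlc( &seed, &mult );  /* next number of the sequence     */
#endif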
/*****************************************************************/
/************   F  I  N  D  _  M  Y  _  S  E  E  D    ************/
/************                                         ************/
/************ returns parallel random number seq seed ************/
/*****************************************************************/
/*
 * Create a random number sequence of total length nn residing
 * on np number of processors. Each processor will therefore have a
 * subsequence of length nn/np. This routine returns the random number
 * that is the first of the subsequence belonging to processor rank kn,
 * and which is used as the seed of proc kn's random number generator.
 */
double find_my_seed( int kn,    /* my processor rank, 0<=kn<=num procs */
                     int np,    /* np = num procs                      */
                     long nn,   /* total num of ran numbers, all procs */
                     double s,  /* Ran num seed, for ex.: 314159265.00 */
                     double a ) /* Ran num gen mult, try 1220703125.00 */
{
    double t1, t2, t3, an;
    long   i, mq, nq, kk, ik;

    nq = nn / np;
    for( mq=0; nq>1; mq++, nq/=2 ) ;   /* mq = log2(nn/np)               */

    t1 = a;
    for( i=1; i<=mq; i++ )             /* repeated squaring:             */
        t2 = randlc( &t1, &t1 );       /* t1 = a^(nn/np) (mod 2^46)      */
    an = t1;

    kk = kn;                           /* walk the bits of kn to advance */
    t1 = s;                            /* the seed by kn*(nn/np) numbers */
    t2 = an;
    for( i=1; i<=100; i++ )
    {
        ik = kk / 2;
        if( 2*ik != kk )
            t3 = randlc( &t1, &t2 );
        if( ik == 0 )
            break;
        t3 = randlc( &t2, &t2 );
        kk = ik;
    }

    return t1;
}
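/* Worked example (illustrative): with np = 4 processors and nn = 2^22
   numbers in total, nq = 2^20 and mq = 20, so an = a^(2^20) (mod 2^46).
   Rank kn then starts from  s_kn = s * an^kn (mod 2^46),  which the
   binary loop above computes from the bits of kn. */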
/*****************************************************************/
/*************      C  R  E  A  T  E  _  S  E  Q      ************/
/*****************************************************************/
void create_seq( global_data* gd, double seed, double a )
{
    double   x;
    INT_TYPE i, k;

    k = MAX_KEY/4;

    for (i=0; i<NUM_KEYS; i++)
    {
        x = randlc(&seed, &a);
        x += randlc(&seed, &a);
        x += randlc(&seed, &a);
        x += randlc(&seed, &a);

        gd->key_array[i] = k*x;
    }
}
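/* Each key is k*x with x the sum of four uniform (0,1) draws and
   k = MAX_KEY/4, so keys fall in [0, MAX_KEY) with an approximately
   gaussian (Bates-like) distribution centered at MAX_KEY/2, which is
   what the buffer-size note near SIZE_OF_BUFFERS refers to. */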
/*****************************************************************/
/*************    F  U  L  L  _  V  E  R  I  F  Y     ************/
/*****************************************************************/
void full_verify( global_data* gd )
{
    MPI_Status  status;
    MPI_Request request;

    INT_TYPE i, j;
    INT_TYPE k, last_local_key;

/* Now, finally, sort the keys: */
    for( i=0; i<gd->total_local_keys; i++ )
        gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]-
                                 gd->total_lesser_keys] = gd->key_buff2[i];
    last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);

/* Send largest key value to next processor */
    if( gd->my_rank > 0 )
        MPI_Irecv( &k, 1, MP_KEY_TYPE, gd->my_rank-1, 1000,
                   MPI_COMM_WORLD, &request );
    if( gd->my_rank < gd->comm_size-1 )
        MPI_Send( &gd->key_array[last_local_key], 1, MP_KEY_TYPE,
                  gd->my_rank+1, 1000, MPI_COMM_WORLD );
    if( gd->my_rank > 0 )
        MPI_Wait( &request, &status );

/* Confirm that neighbor's greatest key value
   is not greater than my least key value */
    j = 0;
    if( gd->my_rank > 0 && gd->total_local_keys > 0 )
        if( k > gd->key_array[0] )
            j++;

/* Confirm keys correctly sorted: count incorrectly sorted keys, if any */
    for( i=1; i<gd->total_local_keys; i++ )
        if( gd->key_array[i-1] > gd->key_array[i] )
            j++;

    if( j != 0 )
        printf( "Processor %d: Full_verify: number of keys out of sort: %d\n",
                gd->my_rank, (int)j );
    else
        gd->passed_verification++;
}
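/* Illustrative: key_buff_ptr_global[v] holds the running count of keys
   with value <= v, so --key_buff_ptr_global[v] yields the slot of one
   key of value v (local here, since total_lesser_keys is 0; see the
   note in rank). E.g. with scanned counts {2,2,5,6} for values 0..3,
   the first key of value 2 lands in slot 4, the next in slot 3, and
   so on downwards for duplicates. */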
/*****************************************************************/
/*************             R  A  N  K             ****************/
/*****************************************************************/
void rank( global_data* gd, int iteration )
{
    INT_TYPE i, k;
    INT_TYPE key;
    INT_TYPE shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;

    INT_TYPE2 bucket_sum_accumulator, j, m;
    INT_TYPE  local_bucket_sum_accumulator;
    INT_TYPE  min_key_val, max_key_val;
    INT_TYPE  *key_buff_ptr;
/* Iteration alteration of keys */
    if( gd->my_rank == 0 )
    {
        gd->key_array[iteration] = iteration;
        gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
    }
/* Initialize */
    for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )
    {
        gd->bucket_size[i] = 0;
        gd->bucket_size_totals[i] = 0;
        gd->process_bucket_distrib_ptr1[i] = 0;
        gd->process_bucket_distrib_ptr2[i] = 0;
    }
/* Determine where the partial verify test keys are, load into */
/* top of array bucket_size */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        if( (gd->test_index_array[i]/NUM_KEYS) == gd->my_rank )
            gd->bucket_size[NUM_BUCKETS+i] =
                          gd->key_array[gd->test_index_array[i] % NUM_KEYS];
/* Determine the number of keys in each bucket */
    for( i=0; i<NUM_KEYS; i++ )
        gd->bucket_size[gd->key_array[i] >> shift]++;
/* Accumulative bucket sizes are the bucket pointers */
    gd->bucket_ptrs[0] = 0;
    for( i=1; i< NUM_BUCKETS; i++ )
        gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];
/* Sort into appropriate bucket */
    for( i=0; i<NUM_KEYS; i++ )
    {
        key = gd->key_array[i];
        gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
    }
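/* Illustrative: the bucket of a key is just its top NUM_BUCKETS_LOG_2
   bits. For class A (MAX_KEY_LOG_2=19, NUM_BUCKETS_LOG_2=10) shift is 9,
   so key 0x7FFFF >> 9 = 1023, the last bucket. */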
#ifdef TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_start(gd, 3 );
#endif
/* Get the bucket size totals for the entire problem. These
   will be used to determine the redistribution of keys */
    MPI_Allreduce( gd->bucket_size,
                   gd->bucket_size_totals,
                   NUM_BUCKETS+TEST_ARRAY_SIZE,
                   MP_KEY_TYPE,
                   MPI_SUM,
                   MPI_COMM_WORLD );
#ifdef TIMING_ENABLED
    timer_stop(gd, 3 );
    timer_start(gd, 2 );
#endif
/* Determine redistribution of keys: accumulate the bucket size totals
   until this number surpasses NUM_KEYS (which is the average number of
   keys per processor). Then all keys in these buckets go to processor 0.
   Continue accumulating until surpassing 2*NUM_KEYS; all keys in those
   buckets go to processor 1, etc. This algorithm guarantees that all
   processors have ranking work; none is left idle. A larger number of
   buckets improves load balance (approaching as even a distribution of
   keys as possible), but also costs more computation per processor, so
   the optimum turns out to be 1024 buckets on the machines tested.
   Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket
   numbers of the first and last bucket which each processor will have
   after the redistribution is done. */
    bucket_sum_accumulator = 0;
    local_bucket_sum_accumulator = 0;
    gd->send_displ[0] = 0;
    gd->process_bucket_distrib_ptr1[0] = 0;
    for( i=0, j=0; i<NUM_BUCKETS; i++ )
    {
        bucket_sum_accumulator       += gd->bucket_size_totals[i];
        local_bucket_sum_accumulator += gd->bucket_size[i];
        if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )
        {
            gd->send_count[j] = local_bucket_sum_accumulator;
            if( j != 0 )
            {
                gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
                gd->process_bucket_distrib_ptr1[j] =
                                        gd->process_bucket_distrib_ptr2[j-1]+1;
            }
            gd->process_bucket_distrib_ptr2[j++] = i;
            local_bucket_sum_accumulator = 0;
        }
    }
/* When NUM_PROCS approaches NUM_BUCKETS, it is quite possible that the
   last few processors don't get any buckets, so set their counts
   properly in this case to avoid any fallout. */
    while( j < gd->comm_size )
    {
        gd->send_count[j] = 0;
        gd->process_bucket_distrib_ptr1[j] = 1;
        gd->process_bucket_distrib_ptr2[j++] = 0;
    }
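/* Worked example (illustrative): 4 processors, NUM_KEYS = 8, global
   bucket totals {5,6,7,5,9}. The running sums 5,11,18,23,32 cross the
   cutoffs 8,16,24 at buckets 1, 2 and 4, so rank 0 gets buckets 0-1,
   rank 1 gets bucket 2, rank 2 gets buckets 3-4, and rank 3 gets none
   (handled by the while loop above). */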
#ifdef TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_start(gd, 3 );
#endif
/* This is the redistribution section: first find out how many keys
   each processor will send to every other processor: */
    MPI_Alltoall( gd->send_count, 1, MPI_INT,
                  gd->recv_count, 1, MPI_INT,
                  MPI_COMM_WORLD );
/* Determine the receive array displacements for the buckets */
    gd->recv_displ[0] = 0;
    for( i=1; i<gd->comm_size; i++ )
        gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];
/* Now send the keys to respective processors */
    MPI_Alltoallv( gd->key_buff1, gd->send_count, gd->send_displ,
                   MP_KEY_TYPE,
                   gd->key_buff2, gd->recv_count, gd->recv_displ,
                   MP_KEY_TYPE,
                   MPI_COMM_WORLD );
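/* Illustrative: the displacements are exclusive prefix sums of the
   counts. With send_count = {3,1,2}, send_displ = {0,3,4}: the keys
   bound for rank r start at offset send_displ[r] of key_buff1, and
   the keys received from rank r land at offset recv_displ[r] of
   key_buff2. */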
#ifdef TIMING_ENABLED
    timer_stop(gd, 3 );
    timer_start(gd, 2 );
#endif
/* The starting and ending bucket numbers on each processor are
   multiplied by the interval size of the buckets to obtain the
   smallest possible min and greatest possible max value of any
   key on each processor */
    min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
    max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;
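/* Illustrative: if this rank owns buckets 3..5 and shift = 9, its keys
   lie in [3*512, 6*512-1] = [1536, 3071]. */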
/* Clear the work array */
    for( i=0; i<max_key_val-min_key_val+1; i++ )
        gd->key_buff1[i] = 0;
/* Determine the total number of keys on all other
   processors holding keys of lesser value */
    m = 0;
    for( k=0; k<gd->my_rank; k++ )
        for( i= gd->process_bucket_distrib_ptr1[k];
             i<=gd->process_bucket_distrib_ptr2[k];
             i++ )
            m += gd->bucket_size_totals[i]; /* m has total # of lesser keys */
/* Determine total number of keys on this processor */
    j = 0;
    for( i= gd->process_bucket_distrib_ptr1[gd->my_rank];
         i<=gd->process_bucket_distrib_ptr2[gd->my_rank];
         i++ )
        j += gd->bucket_size_totals[i]; /* j has total # of local keys */
/* Ranking of all keys occurs in this section: */
/* shift it backwards so no subtractions are necessary in loop */
    key_buff_ptr = gd->key_buff1 - min_key_val;
/* In this section, the keys themselves are used as their
   own indexes to determine how many of each there are: their
   individual population */
    for( i=0; i<j; i++ )
        key_buff_ptr[gd->key_buff2[i]]++; /* Now they have individual key */
                                          /* population                   */
/* To obtain ranks of each key, successively add the individual key
   population, not forgetting the total of lesser keys, m.
   NOTE: Since the total of lesser keys would be subtracted later
   in verification, it is no longer added to the first key population
   here, but it is still needed during the partial verify test. This
   is to ensure that a 32-bit key_buff can still be used for class D. */
/*  key_buff_ptr[min_key_val] += m;  */
    for( i=min_key_val; i<max_key_val; i++ )
        key_buff_ptr[i+1] += key_buff_ptr[i];
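/* Illustrative: if the local counts over [min_key_val..max_key_val]
   are {2,0,3,1}, this scan leaves {2,2,5,6}; the rank of a key of
   value v is then key_buff_ptr[v-1] + m, as used by the partial
   verify below and by full_verify. */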
/* This is the partial verify test section */
/* Observe that test_rank_array vals are   */
/* shifted differently for different cases */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
    {
        k = gd->bucket_size_totals[i+NUM_BUCKETS]; /* Keys were hidden here */
        if( min_key_val <= k && k <= max_key_val )
        {
            /* Add the total of lesser keys, m, here */
            INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
            INT_TYPE2 offset   = 0; /* class-dependent shift of the ranks */

            switch( CLASS )
            {
                case 'S':
                    offset = (i <= 2)? iteration : -iteration;
                    break;
                case 'W':
                    offset = (i < 2)? (iteration-2) : -iteration;
                    break;
                case 'A':
                    offset = (i <= 2)? (iteration-1) : -(iteration-1);
                    break;
                case 'B':
                    offset = (i == 1 || i == 2 || i == 4)?
                                 iteration : -iteration;
                    break;
                case 'C':
                    offset = (i <= 2)? iteration : -iteration;
                    break;
                case 'D':
                    offset = (i < 2)? iteration : -iteration;
                    break;
            }
            if( key_rank != gd->test_rank_array[i]+offset )
                printf( "Failed partial verification: "
                        "iteration %d, processor %d, test key %d\n",
                        iteration, gd->my_rank, (int)i );
            else
                gd->passed_verification++;
        }
    }
/* Make copies of rank info for use by full_verify: these variables
   in rank are local; making them global slows down the code, probably
   since they cannot be made register by compiler */
    if( iteration == MAX_ITERATIONS )
    {
        gd->key_buff_ptr_global = key_buff_ptr;
        gd->total_local_keys    = j;
        gd->total_lesser_keys   = 0; /* no longer set to 'm', see note above */
    }
}
/*****************************************************************/
/*************             M  A  I  N             ****************/
/*****************************************************************/
int main( int argc, char **argv )
{
    int i, iteration, itemp;

    double timecounter, maxtime;

    global_data* gd = malloc(sizeof(global_data));

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
    MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );
/* Initialize the verification arrays if a valid class */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        switch( CLASS )
        {
            case 'S':
                gd->test_index_array[i] = S_test_index_array[i];
                gd->test_rank_array[i]  = S_test_rank_array[i];
                break;
            case 'A':
                gd->test_index_array[i] = A_test_index_array[i];
                gd->test_rank_array[i]  = A_test_rank_array[i];
                break;
            case 'W':
                gd->test_index_array[i] = W_test_index_array[i];
                gd->test_rank_array[i]  = W_test_rank_array[i];
                break;
            case 'B':
                gd->test_index_array[i] = B_test_index_array[i];
                gd->test_rank_array[i]  = B_test_rank_array[i];
                break;
            case 'C':
                gd->test_index_array[i] = C_test_index_array[i];
                gd->test_rank_array[i]  = C_test_rank_array[i];
                break;
            case 'D':
                gd->test_index_array[i] = D_test_index_array[i];
                gd->test_rank_array[i]  = D_test_rank_array[i];
                break;
        };
/* Printout initial NPB info */
    if( gd->my_rank == 0 )
    {
        printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
        printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS );
        printf( " Iterations:   %d\n", MAX_ITERATIONS );
        printf( " Number of processes:     %d\n", gd->comm_size );
    }
/* Check that actual and compiled number of processors agree */
    if( gd->comm_size != NUM_PROCS )
    {
        if( gd->my_rank == 0 )
            printf( "\n ERROR: compiled for %d processes\n"
                    " Number of active processes: %d\n"
                    " Exiting program!\n\n", NUM_PROCS, gd->comm_size );
        MPI_Finalize();
        exit( 1 );
    }
/* Check to see whether total number of processes is within bounds.
   This could in principle be checked in setparams.c, but it is more
   convenient to do it here */
    if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS )
    {
        if( gd->my_rank == 0 )
            printf( "\n ERROR: number of processes %d not within range %d-%d"
                    "\n Exiting program!\n\n",
                    gd->comm_size, MIN_PROCS, MAX_PROCS );
        MPI_Finalize();
        exit( 1 );
    }
/* Generate random number sequence and subsequent keys on all procs */
    create_seq(gd, find_my_seed( gd->my_rank,
                                 gd->comm_size,
                                 4*(long)TOTAL_KEYS*MIN_PROCS,
                                 314159265.00,      /* Random number gen seed */
                                 1220703125.00 ),   /* Random number gen mult */
                   1220703125.00 );                 /* Random number gen mult */
/* Do one iteration for free (i.e., untimed) to guarantee initialization of
   all data and code pages and respective tables */
    rank(gd, 1 );

/* Start verification counter */
    gd->passed_verification = 0;

    if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n   iteration\n" );

/* Initialize timer */
    timer_clear(gd, 0 );

/* Initialize separate communication, computation timing */
#ifdef TIMING_ENABLED
    for( i=1; i<=3; i++ ) timer_clear(gd, i );
#endif

/* Start timer */
    MPI_Barrier( MPI_COMM_WORLD );
    timer_start(gd, 0 );

#ifdef TIMING_ENABLED
    timer_start(gd, 1 );
    timer_start(gd, 2 );
#endif
/* Tag this rank's events for SMPI tracing */
    char smpi_category[100];
    snprintf (smpi_category, 100, "%d", gd->my_rank);
    TRACE_smpi_set_category (smpi_category);
/* This is the main iteration */
    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
    {
        if( gd->my_rank == 0 && CLASS != 'S' ) printf( "        %d\n", iteration );
        rank(gd, iteration );
    }

    TRACE_smpi_set_category (NULL);
#ifdef TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_stop(gd, 1 );
#endif
/* Stop timer, obtain time for processors */
    timer_stop(gd, 0 );

    timecounter = timer_read(gd, 0 );
/* End of timing, obtain maximum time of all processors */
    MPI_Reduce( &timecounter, &maxtime, 1,
                MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD );
#ifdef TIMING_ENABLED
    {
        double tmin, tsum, tmax;

        if( gd->my_rank == 0 )
        {
            printf( "\ntimer 1/2/3 = total/computation/communication time\n");
            printf( "          min                avg                max\n" );
        }
        for( i=1; i<=3; i++ )
        {
            timecounter = timer_read(gd, i );
            MPI_Reduce( &timecounter, &tmin, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD );
            MPI_Reduce( &timecounter, &tsum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD );
            MPI_Reduce( &timecounter, &tmax, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD );
            if( gd->my_rank == 0 )
                printf( "timer %d:    %f           %f            %f\n",
                        i, tmin, tsum/((double) gd->comm_size), tmax );
        }
        if( gd->my_rank == 0 )
            printf( "\n" );
    }
#endif
/* This tests that keys are in sequence: sorting of last ranked key seq
   occurs here, but is an untimed operation */
    full_verify(gd);
/* Obtain verification counter sum */
    itemp = gd->passed_verification;
    MPI_Reduce( &itemp, &gd->passed_verification, 1,
                MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD );
/* The final printout */
    if( gd->my_rank == 0 )
    {
        if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
            gd->passed_verification = 0;
        c_print_results( "IS", CLASS, (int)(TOTAL_KEYS), MIN_PROCS, 0,
                         MAX_ITERATIONS, NUM_PROCS, gd->comm_size, maxtime,
                         ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS)
                             /maxtime/1000000.,
                         "keys ranked", gd->passed_verification,
                         NPBVERSION, COMPILETIME, MPICC, CLINK,
                         CMPI_LIB, CMPI_INC, CFLAGS, CLINKFLAGS );
    }
    MPI_Finalize();
    free(gd);

    return 0;
            /**************************/
}           /*  E N D  P R O G R A M  */
            /**************************/