[simgrid.git] / examples / smpi / NAS / IS / is.c
Added our tweaked version of NAS benchmarks.

/*************************************************************************
 *                                                                       *
 *        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3       *
 *                                                                       *
 *                                  I S                                  *
 *                                                                       *
 *************************************************************************
 *                                                                       *
 *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
 *   It is described in NAS Technical Report 95-020.                     *
 *                                                                       *
 *   Permission to use, copy, distribute and modify this software        *
 *   for any purpose with or without fee is hereby granted.  We          *
 *   request, however, that all derived work reference the NAS           *
 *   Parallel Benchmarks 3.3. This software is provided "as is"          *
 *   without express or implied warranty.                                *
 *                                                                       *
 *   Information on NPB 3.3, including the technical report, the         *
 *   original specifications, source code, results and information       *
 *   on how to submit new results, is available at:                      *
 *                                                                       *
 *          http://www.nas.nasa.gov/Software/NPB                         *
 *                                                                       *
 *   Send comments or suggestions to  npb@nas.nasa.gov                   *
 *   Send bug reports to              npb-bugs@nas.nasa.gov              *
 *                                                                       *
 *         NAS Parallel Benchmarks Group                                 *
 *         NASA Ames Research Center                                     *
 *         Mail Stop: T27A-1                                             *
 *         Moffett Field, CA   94035-1000                                *
 *                                                                       *
 *         E-mail:  npb@nas.nasa.gov                                     *
 *         Fax:     (650) 604-3957                                       *
 *                                                                       *
 *************************************************************************
 *                                                                       *
 *   Author: M. Yarrow                                                   *
 *           H. Jin                                                      *
 *                                                                       *
 *************************************************************************/

#include "mpi.h"
#include "npbparams.h"
#include <stdlib.h>
#include <stdio.h>

/******************/
/* default values */
/******************/
#ifndef CLASS
#define CLASS 'S'
#define NUM_PROCS            1
#endif
#define MIN_PROCS            1


/*************/
/*  CLASS S  */
/*************/
#if CLASS == 'S'
#define  TOTAL_KEYS_LOG_2    16
#define  MAX_KEY_LOG_2       11
#define  NUM_BUCKETS_LOG_2   9
#endif


/*************/
/*  CLASS W  */
/*************/
#if CLASS == 'W'
#define  TOTAL_KEYS_LOG_2    20
#define  MAX_KEY_LOG_2       16
#define  NUM_BUCKETS_LOG_2   10
#endif

/*************/
/*  CLASS A  */
/*************/
#if CLASS == 'A'
#define  TOTAL_KEYS_LOG_2    23
#define  MAX_KEY_LOG_2       19
#define  NUM_BUCKETS_LOG_2   10
#endif


/*************/
/*  CLASS B  */
/*************/
#if CLASS == 'B'
#define  TOTAL_KEYS_LOG_2    25
#define  MAX_KEY_LOG_2       21
#define  NUM_BUCKETS_LOG_2   10
#endif


/*************/
/*  CLASS C  */
/*************/
#if CLASS == 'C'
#define  TOTAL_KEYS_LOG_2    27
#define  MAX_KEY_LOG_2       23
#define  NUM_BUCKETS_LOG_2   10
#endif


/*************/
/*  CLASS D  */
/*************/
#if CLASS == 'D'
#define  TOTAL_KEYS_LOG_2    29
#define  MAX_KEY_LOG_2       27
#define  NUM_BUCKETS_LOG_2   10
#undef   MIN_PROCS
#define  MIN_PROCS           4
#endif


#define  TOTAL_KEYS          (1 << TOTAL_KEYS_LOG_2)
#define  MAX_KEY             (1 << MAX_KEY_LOG_2)
#define  NUM_BUCKETS         (1 << NUM_BUCKETS_LOG_2)
#define  NUM_KEYS            (TOTAL_KEYS/NUM_PROCS*MIN_PROCS)
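
/* For example, compiling CLASS 'A' with NUM_PROCS == MIN_PROCS == 1:    */
/*   TOTAL_KEYS  = 2^23 = 8388608 keys in total,                         */
/*   MAX_KEY     = 2^19 = 524288  (key values lie in [0, MAX_KEY)),      */
/*   NUM_BUCKETS = 2^10 = 1024 buckets,                                  */
/*   NUM_KEYS    = 8388608 keys generated per process.                   */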

/*****************************************************************/
/* On larger numbers of processors, since the keys are (roughly) */
/* Gaussian distributed, the first and last processors sort keys */
/* in a large interval, requiring array sizes to be larger. Note */
/* that for large NUM_PROCS, NUM_KEYS is, however, a small       */
/* number. The required array size also depends on the bucket    */
/* size used. The following values are validated for the         */
/* 1024-bucket setup.                                            */
/*****************************************************************/
#if   NUM_PROCS < 256
#define  SIZE_OF_BUFFERS     3*NUM_KEYS/2
#elif NUM_PROCS < 512
#define  SIZE_OF_BUFFERS     5*NUM_KEYS/2
#elif NUM_PROCS < 1024
#define  SIZE_OF_BUFFERS     4*NUM_KEYS
#else
#define  SIZE_OF_BUFFERS     13*NUM_KEYS/2
#endif
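
/* E.g. below 256 processes the buffers hold 3*NUM_KEYS/2 entries,       */
/* i.e. 50% slack over the NUM_KEYS keys each process generates, which   */
/* absorbs the skew of the first and last processors described above.    */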

/*****************************************************************/
/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */
/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE     */
/* MAX_PROCS AT YOUR PERIL                                       */
/*****************************************************************/
#if CLASS == 'S'
#define  MAX_PROCS           128
#else
#define  MAX_PROCS           1024
#endif

#define  MAX_ITERATIONS      10
#define  TEST_ARRAY_SIZE     5


/***********************************/
/* Enable separate communication,  */
/* computation timing and printout */
/***********************************/
/* #define  TIMING_ENABLED         */


/*************************************/
/* Typedef: if necessary, change the */
/* size of int here by changing the  */
/* int type to, say, long            */
/*************************************/
typedef  int  INT_TYPE;
typedef  long INT_TYPE2;
#define MP_KEY_TYPE MPI_INT
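
/* (If INT_TYPE is widened to long, MP_KEY_TYPE must be changed to the   */
/* matching MPI datatype, e.g. MPI_LONG, so that the MPI calls below     */
/* keep transferring whole keys.)                                        */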


typedef struct {

/********************/
/* MPI properties:  */
/********************/
int      my_rank,
         comm_size;


/********************/
/* Some global info */
/********************/
INT_TYPE *key_buff_ptr_global,         /* used by full_verify to get */
         total_local_keys,             /* copies of rank info        */
         total_lesser_keys;


int      passed_verification;



/************************************/
/* These are the three main arrays. */
/* See SIZE_OF_BUFFERS def above    */
/************************************/
INT_TYPE key_array[SIZE_OF_BUFFERS],
         key_buff1[SIZE_OF_BUFFERS],
         key_buff2[SIZE_OF_BUFFERS],
         bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE],     /* Top 5 elements for */
         bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals */
         bucket_ptrs[NUM_BUCKETS],
         process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE],
         process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE];
int      send_count[MAX_PROCS], recv_count[MAX_PROCS],
         send_displ[MAX_PROCS], recv_displ[MAX_PROCS];


/**********************/
/* Partial verif info */
/**********************/
INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
         test_rank_array[TEST_ARRAY_SIZE];

/**********/
/* Timers */
/**********/
double start[64], elapsed[64];

} global_data;


const INT_TYPE2
         S_test_index_array[TEST_ARRAY_SIZE] =
                             {48427,17148,23627,62548,4431},
         S_test_rank_array[TEST_ARRAY_SIZE] =
                             {0,18,346,64917,65463},

         W_test_index_array[TEST_ARRAY_SIZE] =
                             {357773,934767,875723,898999,404505},
         W_test_rank_array[TEST_ARRAY_SIZE] =
                             {1249,11698,1039987,1043896,1048018},

         A_test_index_array[TEST_ARRAY_SIZE] =
                             {2112377,662041,5336171,3642833,4250760},
         A_test_rank_array[TEST_ARRAY_SIZE] =
                             {104,17523,123928,8288932,8388264},

         B_test_index_array[TEST_ARRAY_SIZE] =
                             {41869,812306,5102857,18232239,26860214},
         B_test_rank_array[TEST_ARRAY_SIZE] =
                             {33422937,10244,59149,33135281,99},

         C_test_index_array[TEST_ARRAY_SIZE] =
                             {44172927,72999161,74326391,129606274,21736814},
         C_test_rank_array[TEST_ARRAY_SIZE] =
                             {61147,882988,266290,133997595,133525895},

         D_test_index_array[TEST_ARRAY_SIZE] =
                             {1317351170,995930646,1157283250,1503301535,1453734525},
         D_test_rank_array[TEST_ARRAY_SIZE] =
                             {1,36538729,1978098519,2145192618,2147425337};


/***********************/
/* function prototypes */
/***********************/
double  randlc( double *X, double *A );

void full_verify( global_data* gd );

void c_print_results( char   *name,
                      char   class,
                      int    n1,
                      int    n2,
                      int    n3,
                      int    niter,
                      int    nprocs_compiled,
                      int    nprocs_total,
                      double t,
                      double mops,
                      char   *optype,
                      int    passed_verification,
                      char   *npbversion,
                      char   *compiletime,
                      char   *mpicc,
                      char   *clink,
                      char   *cmpi_lib,
                      char   *cmpi_inc,
                      char   *cflags,
                      char   *clinkflags );

void    timer_clear(global_data* gd, int n );
void    timer_start(global_data* gd, int n );
void    timer_stop(global_data* gd, int n );
double  timer_read(global_data* gd, int n );

void    timer_clear(global_data* gd, int n ) {
   gd->elapsed[n] = 0.0;
}

void    timer_start(global_data* gd, int n ) {
   gd->start[n] = MPI_Wtime();
}

void    timer_stop(global_data* gd, int n ) {
   gd->elapsed[n] += MPI_Wtime() - gd->start[n];
}

double  timer_read(global_data* gd, int n ) {
   return gd->elapsed[n];
}


/*
 *    FUNCTION RANDLC (X, A)
 *
 *  This routine returns a uniform pseudorandom double precision number in the
 *  range (0, 1) by using the linear congruential generator
 *
 *  x_{k+1} = a x_k  (mod 2^46)
 *
 *  where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
 *  before repeating.  The argument A is the same as 'a' in the above formula,
 *  and X is the same as x_0.  A and X must be odd double precision integers
 *  in the range (1, 2^46).  The returned value RANDLC is normalized to be
 *  between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
 *  the new seed x_1, so that subsequent calls to RANDLC using the same
 *  arguments will generate a continuous sequence.
 *
 *  This routine should produce the same results on any computer with at least
 *  48 mantissa bits in double precision floating point data.  On Cray systems,
 *  double precision should be disabled.
 *
 *  David H. Bailey     October 26, 1990
 *
 *     IMPLICIT DOUBLE PRECISION (A-H, O-Z)
 *     SAVE KS, R23, R46, T23, T46
 *     DATA KS/0/
 *
 *  If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
 *  T23 = 2 ^ 23, and T46 = 2 ^ 46.  These are computed in loops, rather than
 *  by merely using the ** operator, in order to insure that the results are
 *  exact on all systems.  This code assumes that 0.5D0 is represented exactly.
 */
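
/*  Typical usage, with the seed and multiplier values this benchmark
 *  actually uses (see find_my_seed and main below):
 *
 *      double x = 314159265.00;       // seed, advanced in place
 *      double a = 1220703125.00;      // multiplier
 *      double r = randlc( &x, &a );   // r in (0,1); x now holds x_1
 */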


/*****************************************************************/
/*************           R  A  N  D  L  C             ************/
/*************                                        ************/
/*************    portable random number generator    ************/
/*****************************************************************/

double  randlc( double *X, double *A )
{
      static int        KS=0;
      static double     R23, R46, T23, T46;
      double            T1, T2, T3, T4;
      double            A1;
      double            A2;
      double            X1;
      double            X2;
      double            Z;
      int               i, j;

      if (KS == 0)
      {
        R23 = 1.0;
        R46 = 1.0;
        T23 = 1.0;
        T46 = 1.0;

        for (i=1; i<=23; i++)
        {
          R23 = 0.50 * R23;
          T23 = 2.0 * T23;
        }
        for (i=1; i<=46; i++)
        {
          R46 = 0.50 * R46;
          T46 = 2.0 * T46;
        }
        KS = 1;
      }

/*  Break A into two parts such that A = 2^23 * A1 + A2 and set X = N.  */

      T1 = R23 * *A;
      j  = T1;
      A1 = j;
      A2 = *A - T23 * A1;

/*  Break X into two parts such that X = 2^23 * X1 + X2, compute
    Z = A1 * X2 + A2 * X1  (mod 2^23), and then
    X = 2^23 * Z + A2 * X2  (mod 2^46).                            */

      T1 = R23 * *X;
      j  = T1;
      X1 = j;
      X2 = *X - T23 * X1;
      T1 = A1 * X2 + A2 * X1;

      j  = R23 * T1;
      T2 = j;
      Z = T1 - T23 * T2;
      T3 = T23 * Z + A2 * X2;
      j  = R46 * T3;
      T4 = j;
      *X = T3 - T46 * T4;
      return(R46 * *X);
}


/*****************************************************************/
/************   F  I  N  D  _  M  Y  _  S  E  E  D    ************/
/************                                         ************/
/************ returns parallel random number seq seed ************/
/*****************************************************************/

/*
 * Create a random number sequence of total length nn residing
 * on np processors.  Each processor therefore has a subsequence
 * of length nn/np.  This routine returns the first random number
 * of the subsequence belonging to processor rank kn, which is then
 * used to seed that processor's own generator.
 */
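
/*
 * How it works: the first loop computes mq = log2(nn/np) (nn/np is a
 * power of two here); mq squarings of a then give an = a^(nn/np)
 * (mod 2^46); the final loop raises an to the power kn by binary
 * exponentiation.  The returned seed is thus s * a^(kn*nn/np)
 * (mod 2^46): the generator state after skipping the kn*(nn/np)
 * draws that belong to lower-ranked processors.
 */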

double   find_my_seed( int  kn,       /* my processor rank, 0<=kn<=num procs */
                       int  np,       /* np = num procs                      */
                       long nn,       /* total num of ran numbers, all procs */
                       double s,      /* Ran num seed, for ex.: 314159265.00 */
                       double a )     /* Ran num gen mult, try 1220703125.00 */
{

  long   i;

  double t1,t2,t3,an;
  long   mq,nq,kk,ik;



      nq = nn / np;

      for( mq=0; nq>1; mq++,nq/=2 )
          ;

      t1 = a;

      for( i=1; i<=mq; i++ )
        t2 = randlc( &t1, &t1 );

      an = t1;

      kk = kn;
      t1 = s;
      t2 = an;

      for( i=1; i<=100; i++ )
      {
        ik = kk / 2;
        if( 2 * ik !=  kk )
            t3 = randlc( &t1, &t2 );
        if( ik == 0 )
            break;
        t3 = randlc( &t2, &t2 );
        kk = ik;
      }

      return( t1 );

}




/*****************************************************************/
/*************      C  R  E  A  T  E  _  S  E  Q      ************/
/*****************************************************************/

void    create_seq( global_data* gd, double seed, double a )
{
        double x;
        int    i, k;

        k = MAX_KEY/4;

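        /* Each key is the sum of four uniform (0,1) draws scaled by  */
        /* k = MAX_KEY/4, which yields the (roughly) Gaussian key     */
        /* distribution assumed by the SIZE_OF_BUFFERS note above.    */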
        for (i=0; i<NUM_KEYS; i++)
        {
            x = randlc(&seed, &a);
            x += randlc(&seed, &a);
            x += randlc(&seed, &a);
            x += randlc(&seed, &a);

            gd->key_array[i] = k*x;
        }
}




/*****************************************************************/
/*************    F  U  L  L  _  V  E  R  I  F  Y     ************/
/*****************************************************************/


void full_verify( global_data* gd )
{
    MPI_Status  status;
    MPI_Request request;

    INT_TYPE    i, j;
    INT_TYPE    k, last_local_key;


/*  Now, finally, sort the keys:  */
    for( i=0; i<gd->total_local_keys; i++ )
        gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]-
                                 gd->total_lesser_keys] = gd->key_buff2[i];
    last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);

/*  Send largest key value to next processor  */
    if( gd->my_rank > 0 )
        MPI_Irecv( &k,
                   1,
                   MP_KEY_TYPE,
                   gd->my_rank-1,
                   1000,
                   MPI_COMM_WORLD,
                   &request );
    if( gd->my_rank < gd->comm_size-1 )
        MPI_Send( &gd->key_array[last_local_key],
                  1,
                  MP_KEY_TYPE,
                  gd->my_rank+1,
                  1000,
                  MPI_COMM_WORLD );
    if( gd->my_rank > 0 )
        MPI_Wait( &request, &status );
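    /* (Posting the receive nonblocking lets every rank send its boundary
       key immediately instead of completing its own receive first, so the
       exchange does not serialize along the chain of processors.)        */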

/*  Confirm that neighbor's greatest key value
    is not greater than my least key value       */
    j = 0;
    if( gd->my_rank > 0 && gd->total_local_keys > 0 )
        if( k > gd->key_array[0] )
            j++;


/*  Confirm keys correctly sorted: count incorrectly sorted keys, if any */
    for( i=1; i<gd->total_local_keys; i++ )
        if( gd->key_array[i-1] > gd->key_array[i] )
            j++;


    if( j != 0 )
    {
        printf( "Processor %d:  Full_verify: number of keys out of sort: %d\n",
                gd->my_rank, j );
    }
    else
        gd->passed_verification++;


}




/*****************************************************************/
/*************             R  A  N  K             ****************/
/*****************************************************************/


void rank( global_data* gd, int iteration )
{

    INT_TYPE    i, k;

    INT_TYPE    shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
    INT_TYPE    key;
    INT_TYPE2   bucket_sum_accumulator, j, m;
    INT_TYPE    local_bucket_sum_accumulator;
    INT_TYPE    min_key_val, max_key_val;
    INT_TYPE    *key_buff_ptr;




/*  Iteration alteration of keys */
    if( gd->my_rank == 0 )
    {
      gd->key_array[iteration] = iteration;
      gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
    }


/*  Initialize */
    for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )
    {
        gd->bucket_size[i] = 0;
        gd->bucket_size_totals[i] = 0;
        gd->process_bucket_distrib_ptr1[i] = 0;
        gd->process_bucket_distrib_ptr2[i] = 0;
    }


/*  Determine where the partial verify test keys are, load into  */
/*  top of array bucket_size                                     */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        if( (gd->test_index_array[i]/NUM_KEYS) == gd->my_rank )
            gd->bucket_size[NUM_BUCKETS+i] =
                          gd->key_array[gd->test_index_array[i] % NUM_KEYS];


/*  Determine the number of keys in each bucket */
    for( i=0; i<NUM_KEYS; i++ )
        gd->bucket_size[gd->key_array[i] >> shift]++;


/*  Accumulative bucket sizes are the bucket pointers */
    gd->bucket_ptrs[0] = 0;
    for( i=1; i< NUM_BUCKETS; i++ )
        gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];


/*  Sort into appropriate bucket */
    for( i=0; i<NUM_KEYS; i++ )
    {
        key = gd->key_array[i];
        gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
    }
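
/*  (E.g. for CLASS 'S', MAX_KEY_LOG_2 = 11 and NUM_BUCKETS_LOG_2 = 9 give
    shift = 2, so key >> shift keeps the top 9 bits of an 11-bit key and
    selects one of the 2^9 = 512 buckets.)                                 */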

#ifdef  TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_start(gd, 3 );
#endif

/*  Get the bucket size totals for the entire problem. These
    will be used to determine the redistribution of keys      */
    MPI_Allreduce( gd->bucket_size,
                   gd->bucket_size_totals,
                   NUM_BUCKETS+TEST_ARRAY_SIZE,
                   MP_KEY_TYPE,
                   MPI_SUM,
                   MPI_COMM_WORLD );

#ifdef  TIMING_ENABLED
    timer_stop(gd, 3 );
    timer_start(gd, 2 );
#endif

/*  Determine redistribution of keys: accumulate the bucket size totals
    until this number surpasses NUM_KEYS (which is the average number of
    keys per processor).  Then all keys in these buckets go to processor 0.
    Continue accumulating again until surpassing 2*NUM_KEYS; all keys
    in these buckets go to processor 1, etc.  This algorithm guarantees
    that all processors have ranking work; no processors are left idle.
    Fewer buckets would not distribute the keys (and hence the load) as
    evenly, while more buckets mean more computation per processor, so
    the optimum number of buckets turned out to be 1024 for the machines
    tested.
    Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket
    numbers of the first and last bucket which each processor will have
    after the redistribution is done.                                    */

    bucket_sum_accumulator = 0;
    local_bucket_sum_accumulator = 0;
    gd->send_displ[0] = 0;
    gd->process_bucket_distrib_ptr1[0] = 0;
    for( i=0, j=0; i<NUM_BUCKETS; i++ )
    {
        bucket_sum_accumulator       += gd->bucket_size_totals[i];
        local_bucket_sum_accumulator += gd->bucket_size[i];
        if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )
        {
            gd->send_count[j] = local_bucket_sum_accumulator;
            if( j != 0 )
            {
                gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
                gd->process_bucket_distrib_ptr1[j] =
                                        gd->process_bucket_distrib_ptr2[j-1]+1;
            }
            gd->process_bucket_distrib_ptr2[j++] = i;
            local_bucket_sum_accumulator = 0;
        }
    }

/*  When NUM_PROCS approaches NUM_BUCKETS, it is quite possible that the
    last few processors get no buckets at all, so their counts must be
    set explicitly here to avoid any fallout below.                      */
    while( j < gd->comm_size )
    {
        gd->send_count[j] = 0;
        gd->process_bucket_distrib_ptr1[j] = 1;
        j++;
    }
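    /* (process_bucket_distrib_ptr2[j] stays 0 for these idle processors,
       so their bucket ranges ptr1..ptr2 used below are empty, i.e. 1..0.) */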

#ifdef  TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_start(gd, 3 );
#endif

/*  This is the redistribution section:  first find out how many keys
    each processor will send to every other processor:                 */
    MPI_Alltoall( gd->send_count,
                  1,
                  MPI_INT,
                  gd->recv_count,
                  1,
                  MPI_INT,
                  MPI_COMM_WORLD );
    MPI_Wtime();

/*  Determine the receive array displacements for the buckets */
    gd->recv_displ[0] = 0;
    for( i=1; i<gd->comm_size; i++ )
        gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];


    MPI_Wtime();
/*  Now send the keys to respective processors  */
    MPI_Alltoallv( gd->key_buff1,
                   gd->send_count,
                   gd->send_displ,
                   MP_KEY_TYPE,
                   gd->key_buff2,
                   gd->recv_count,
                   gd->recv_displ,
                   MP_KEY_TYPE,
                   MPI_COMM_WORLD );
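
/*  (MPI_Alltoallv semantics: each rank sends send_count[k] keys, taken
    from key_buff1 starting at offset send_displ[k], to rank k; the
    blocks arriving from the other ranks land contiguously in key_buff2
    at the recv_displ offsets computed above.)                           */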

#ifdef  TIMING_ENABLED
    timer_stop(gd, 3 );
    timer_start(gd, 2 );
#endif

/*  The starting and ending bucket numbers on each processor are
    multiplied by the interval size of the buckets to obtain the
    smallest possible min and greatest possible max value of any
    key on each processor                                          */
    min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
    max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;

/*  Clear the work array */
    for( i=0; i<max_key_val-min_key_val+1; i++ )
        gd->key_buff1[i] = 0;

/*  Determine the total number of keys on all other
    processors holding keys of lesser value         */
    m = 0;
    for( k=0; k<gd->my_rank; k++ )
        for( i= gd->process_bucket_distrib_ptr1[k];
             i<=gd->process_bucket_distrib_ptr2[k];
             i++ )
            m += gd->bucket_size_totals[i]; /*  m has total # of lesser keys */

/*  Determine total number of keys on this processor */
    j = 0;
    for( i= gd->process_bucket_distrib_ptr1[gd->my_rank];
         i<=gd->process_bucket_distrib_ptr2[gd->my_rank];
         i++ )
        j += gd->bucket_size_totals[i];     /* j has total # of local keys   */


/*  Ranking of all keys occurs in this section:                 */
/*  shift it backwards so no subtractions are necessary in loop */
    key_buff_ptr = gd->key_buff1 - min_key_val;

/*  In this section, the keys themselves are used as their
    own indexes to determine how many of each there are: their
    individual population                                       */
    for( i=0; i<j; i++ )
        key_buff_ptr[gd->key_buff2[i]]++;  /* Now they have individual key   */
                                           /* population                     */

/*  To obtain ranks of each key, successively add the individual key
    population, not forgetting the total of lesser keys, m.
    NOTE: Since the total of lesser keys would be subtracted later
    in verification, it is no longer added to the first key population
    here, but is still needed during the partial verify test.  This is
    to ensure that the 32-bit key_buff can still be used for class D.    */
/*    key_buff_ptr[min_key_val] += m;    */
    for( i=min_key_val; i<max_key_val; i++ )
        key_buff_ptr[i+1] += key_buff_ptr[i];
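
/*  (After this prefix sum, key_buff_ptr[k] is the number of local keys
    with value <= k, i.e. the local rank of key value k; the global rank
    follows by adding m, the count of lesser keys held elsewhere.)       */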


/* This is the partial verify test section */
/* Observe that test_rank_array vals are   */
/* shifted differently for different cases */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
    {
        k = gd->bucket_size_totals[i+NUM_BUCKETS];    /* Keys were hidden here */
        if( min_key_val <= k  &&  k <= max_key_val )
        {
            /* Add the total of lesser keys, m, here */
            INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
            int failed = 0;

            switch( CLASS )
            {
                case 'S':
                    if( i <= 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'W':
                    if( i < 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+(iteration-2) )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'A':
                    if( i <= 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+(iteration-1) )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-(iteration-1) )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'B':
                    if( i == 1 || i == 2 || i == 4 )
                    {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'C':
                    if( i <= 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'D':
                    if( i < 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
            }
            if( failed == 1 )
                printf( "Failed partial verification: "
                        "iteration %d, processor %d, test key %d\n",
                         iteration, gd->my_rank, (int)i );
        }
    }




/*  Make copies of rank info for use by full_verify: these variables
    in rank are local; making them global slows down the code, probably
    since they cannot be made register by compiler                        */

    if( iteration == MAX_ITERATIONS )
    {
        gd->key_buff_ptr_global = key_buff_ptr;
        gd->total_local_keys    = j;
        gd->total_lesser_keys   = 0;  /* no longer set to 'm', see note above */
    }

}


/*****************************************************************/
/*************             M  A  I  N             ****************/
/*****************************************************************/

int main( int argc, char **argv )
{

    int             i, iteration, itemp;

    double          timecounter, maxtime;

    global_data* gd = malloc(sizeof(global_data));
/*  Initialize MPI */
    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
    MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );

/*  Initialize the verification arrays if a valid class */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        switch( CLASS )
        {
            case 'S':
                gd->test_index_array[i] = S_test_index_array[i];
                gd->test_rank_array[i]  = S_test_rank_array[i];
                break;
            case 'A':
                gd->test_index_array[i] = A_test_index_array[i];
                gd->test_rank_array[i]  = A_test_rank_array[i];
                break;
            case 'W':
                gd->test_index_array[i] = W_test_index_array[i];
                gd->test_rank_array[i]  = W_test_rank_array[i];
                break;
            case 'B':
                gd->test_index_array[i] = B_test_index_array[i];
                gd->test_rank_array[i]  = B_test_rank_array[i];
                break;
            case 'C':
                gd->test_index_array[i] = C_test_index_array[i];
                gd->test_rank_array[i]  = C_test_rank_array[i];
                break;
            case 'D':
                gd->test_index_array[i] = D_test_index_array[i];
                gd->test_rank_array[i]  = D_test_rank_array[i];
                break;
        }



/*  Printout initial NPB info */
    if( gd->my_rank == 0 )
    {
        printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
        printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS );
        printf( " Iterations:   %d\n", MAX_ITERATIONS );
        printf( " Number of processes:     %d\n", gd->comm_size );
    }

/*  Check that actual and compiled number of processors agree */
    if( gd->comm_size != NUM_PROCS )
    {
        if( gd->my_rank == 0 )
            printf( "\n ERROR: compiled for %d processes\n"
                    " Number of active processes: %d\n"
                    " Exiting program!\n\n", NUM_PROCS, gd->comm_size );
        MPI_Finalize();
        exit( 1 );
    }

/*  Check whether the total number of processes is within bounds.
    This could in principle be checked in setparams.c, but it is more
    convenient to do it here                                               */
    if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS )
    {
       if( gd->my_rank == 0 )
           printf( "\n ERROR: number of processes %d not within range %d-%d"
                   "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS );
       MPI_Finalize();
       exit( 1 );
    }


/*  Generate random number sequence and subsequent keys on all procs */
    create_seq(gd,  find_my_seed( gd->my_rank,
                              gd->comm_size,
                              4*(long)TOTAL_KEYS*MIN_PROCS,
                              314159265.00,      /* Random number gen seed */
                              1220703125.00 ),   /* Random number gen mult */
                1220703125.00 );                 /* Random number gen mult */
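
/*  (create_seq consumes four randlc draws per key, hence the sequence
    length 4*TOTAL_KEYS*MIN_PROCS passed to find_my_seed: each rank is
    seeded at the start of its own 4*NUM_KEYS-draw share of the stream.) */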

/*  Do one iteration for free (i.e., untimed) to guarantee initialization of
    all data and code pages and respective tables */
    rank(gd, 1 );

/*  Start verification counter */
    gd->passed_verification = 0;

    if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n   iteration\n" );

/*  Initialize timer  */
    timer_clear(gd, 0 );

/*  Initialize separate communication, computation timing */
#ifdef  TIMING_ENABLED
    for( i=1; i<=3; i++ ) timer_clear(gd, i );
#endif

/*  Start timer  */
    timer_start(gd, 0 );

#ifdef  TIMING_ENABLED
    timer_start(gd, 1 );
    timer_start(gd, 2 );
#endif

/*  This is the main iteration */
    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
    {
        if( gd->my_rank == 0 && CLASS != 'S' ) printf( "        %d\n", iteration );
        rank(gd,  iteration );
    }


#ifdef  TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_stop(gd, 1 );
#endif

/*  Stop timer, obtain time for processors */
    timer_stop(gd, 0 );

    timecounter = timer_read(gd, 0 );

/*  End of timing, obtain maximum time of all processors */
    MPI_Reduce( &timecounter,
                &maxtime,
                1,
                MPI_DOUBLE,
                MPI_MAX,
                0,
                MPI_COMM_WORLD );

#ifdef  TIMING_ENABLED
    {
        double    tmin, tsum, tmax;

        if( gd->my_rank == 0 )
        {
            printf( "\ntimer 1/2/3 = total/computation/communication time\n");
            printf( "              min                avg                max\n" );
        }
        for( i=1; i<=3; i++ )
        {
            timecounter = timer_read(gd, i );
            MPI_Reduce( &timecounter,
                        &tmin,
                        1,
                        MPI_DOUBLE,
                        MPI_MIN,
                        0,
                        MPI_COMM_WORLD );
            MPI_Reduce( &timecounter,
                        &tsum,
                        1,
                        MPI_DOUBLE,
                        MPI_SUM,
                        0,
                        MPI_COMM_WORLD );
            MPI_Reduce( &timecounter,
                        &tmax,
                        1,
                        MPI_DOUBLE,
                        MPI_MAX,
                        0,
                        MPI_COMM_WORLD );
            if( gd->my_rank == 0 )
                printf( "timer %d:    %f           %f            %f\n",
                        i, tmin, tsum/((double) gd->comm_size), tmax );
        }
        if( gd->my_rank == 0 )
            printf( "\n" );
    }
#endif

/*  This tests that keys are in sequence: sorting of last ranked key seq
    occurs here, but is an untimed operation                             */
    full_verify(gd);


/*  Obtain verification counter sum */
    itemp = gd->passed_verification;
    MPI_Reduce( &itemp,
                &gd->passed_verification,
                1,
                MPI_INT,
                MPI_SUM,
                0,
                MPI_COMM_WORLD );



/*  The final printout  */
    if( gd->my_rank == 0 )
    {
        if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
            gd->passed_verification = 0;
        c_print_results( "IS",
                         CLASS,
                         (int)(TOTAL_KEYS),
                         MIN_PROCS,
                         0,
                         MAX_ITERATIONS,
                         NUM_PROCS,
                         gd->comm_size,
                         maxtime,
                         ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS)
                                                      /maxtime/1000000.,
                         "keys ranked",
                         gd->passed_verification,
                         NPBVERSION,
                         COMPILETIME,
                         MPICC,
                         CLINK,
                         CMPI_LIB,
                         CMPI_INC,
                         CFLAGS,
                         CLINKFLAGS );
    }

    MPI_Finalize();
    free(gd);

    return 0;
         /**************************/
}        /*  E N D  P R O G R A M  */
         /**************************/