Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Solve white space conflicts
[simgrid.git] / examples / smpi / NAS / IS / is.c
1 /*************************************************************************
2  *                                                                       * 
3  *        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3       *
4  *                                                                       * 
5  *                                  I S                                  * 
6  *                                                                       * 
7  ************************************************************************* 
8  *                                                                       * 
9  *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
10  *   It is described in NAS Technical Report 95-020.                     * 
11  *                                                                       * 
12  *   Permission to use, copy, distribute and modify this software        * 
13  *   for any purpose with or without fee is hereby granted.  We          * 
14  *   request, however, that all derived work reference the NAS           * 
15  *   Parallel Benchmarks 3.3. This software is provided "as is"          *
16  *   without express or implied warranty.                                * 
17  *                                                                       * 
18  *   Information on NPB 3.3, including the technical report, the         *
19  *   original specifications, source code, results and information       * 
20  *   on how to submit new results, is available at:                      * 
21  *                                                                       * 
22  *          http://www.nas.nasa.gov/Software/NPB                         * 
23  *                                                                       * 
24  *   Send comments or suggestions to  npb@nas.nasa.gov                   * 
25  *   Send bug reports to              npb-bugs@nas.nasa.gov              * 
26  *                                                                       * 
27  *         NAS Parallel Benchmarks Group                                 * 
28  *         NASA Ames Research Center                                     * 
29  *         Mail Stop: T27A-1                                             * 
30  *         Moffett Field, CA   94035-1000                                * 
31  *                                                                       * 
32  *         E-mail:  npb@nas.nasa.gov                                     * 
33  *         Fax:     (650) 604-3957                                       * 
34  *                                                                       * 
35  ************************************************************************* 
36  *                                                                       * 
37  *   Author: M. Yarrow                                                   * 
38  *           H. Jin                                                      * 
39  *                                                                       * 
40  *************************************************************************/
41
42 #include "mpi.h"
43 #include "npbparams.h"
44 #include <stdlib.h>
45 #include <stdio.h>
46
/******************/
/* default values */
/******************/
#ifndef CLASS
#define CLASS 'S'
#define NUM_PROCS            1
#endif
#define MIN_PROCS            1


/*************/
/*  CLASS S  */
/*************/
#if CLASS == 'S'
#define  TOTAL_KEYS_LOG_2    16
#define  MAX_KEY_LOG_2       11
#define  NUM_BUCKETS_LOG_2   9
#endif


/*************/
/*  CLASS W  */
/*************/
#if CLASS == 'W'
#define  TOTAL_KEYS_LOG_2    20
#define  MAX_KEY_LOG_2       16
#define  NUM_BUCKETS_LOG_2   10
#endif

/*************/
/*  CLASS A  */
/*************/
#if CLASS == 'A'
#define  TOTAL_KEYS_LOG_2    23
#define  MAX_KEY_LOG_2       19
#define  NUM_BUCKETS_LOG_2   10
#endif


/*************/
/*  CLASS B  */
/*************/
#if CLASS == 'B'
#define  TOTAL_KEYS_LOG_2    25
#define  MAX_KEY_LOG_2       21
#define  NUM_BUCKETS_LOG_2   10
#endif


/*************/
/*  CLASS C  */
/*************/
#if CLASS == 'C'
#define  TOTAL_KEYS_LOG_2    27
#define  MAX_KEY_LOG_2       23
#define  NUM_BUCKETS_LOG_2   10
#endif


/*************/
/*  CLASS D  */
/*************/
#if CLASS == 'D'
#define  TOTAL_KEYS_LOG_2    29
#define  MAX_KEY_LOG_2       27
#define  NUM_BUCKETS_LOG_2   10
#undef   MIN_PROCS
#define  MIN_PROCS           4
#endif


#define  TOTAL_KEYS          (1 << TOTAL_KEYS_LOG_2)
#define  MAX_KEY             (1 << MAX_KEY_LOG_2)
#define  NUM_BUCKETS         (1 << NUM_BUCKETS_LOG_2)
#define  NUM_KEYS            (TOTAL_KEYS/NUM_PROCS*MIN_PROCS)

/*****************************************************************/
/* On larger number of processors, since the keys are (roughly)  */
/* gaussian distributed, the first and last processor sort keys  */
/* in a large interval, requiring array sizes to be larger. Note */
/* that for large NUM_PROCS, NUM_KEYS is, however, a small number*/
/* The required array size also depends on the bucket size used. */
/* The following values are validated for the 1024-bucket setup. */
/* Each variant is fully parenthesized so the macro expands      */
/* safely inside any surrounding expression (CERT PRE01-C).      */
/*****************************************************************/
#if   NUM_PROCS < 256
#define  SIZE_OF_BUFFERS     ((3*NUM_KEYS)/2)
#elif NUM_PROCS < 512
#define  SIZE_OF_BUFFERS     ((5*NUM_KEYS)/2)
#elif NUM_PROCS < 1024
#define  SIZE_OF_BUFFERS     (4*NUM_KEYS)
#else
#define  SIZE_OF_BUFFERS     ((13*NUM_KEYS)/2)
#endif

/*****************************************************************/
/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */
/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE     */
/* MAX_PROCS AT YOUR PERIL                                       */
/*****************************************************************/
#if CLASS == 'S'
#define  MAX_PROCS           128
#else
#define  MAX_PROCS           1024
#endif

#define  MAX_ITERATIONS      10
#define  TEST_ARRAY_SIZE     5


/***********************************/
/* Enable separate communication,  */
/* computation timing and printout */
/***********************************/
/* #define  TIMING_ENABLED         */


/*************************************/
/* Typedef: if necessary, change the */
/* size of int here by changing the  */
/* int type to, say, long            */
/*************************************/
typedef  int  INT_TYPE;
typedef  long INT_TYPE2;
#define MP_KEY_TYPE MPI_INT
171
172
/* All formerly-global benchmark state, gathered into one struct so that
   several simulated MPI processes can each carry their own copy.  A
   pointer to this struct is threaded through every function below. */
typedef struct {

/********************/
/* MPI properties:  */
/********************/
int      my_rank,        /* this process's rank in MPI_COMM_WORLD     */
         comm_size;      /* total number of processes in the job      */


/********************/
/* Some global info */
/********************/
/* NOTE: the '*' binds only to key_buff_ptr_global; the two counters
   below it are plain INT_TYPE values, not pointers. */
INT_TYPE *key_buff_ptr_global,         /* used by full_verify to get */
         total_local_keys,             /* copies of rank info        */
         total_lesser_keys;


int      passed_verification;   /* running count of verification successes */



/************************************/
/* These are the three main arrays. */
/* See SIZE_OF_BUFFERS def above    */
/************************************/
INT_TYPE key_array[SIZE_OF_BUFFERS],
         key_buff1[SIZE_OF_BUFFERS],
         key_buff2[SIZE_OF_BUFFERS],
         bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE],     /* Top 5 elements for */
         bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals */
         bucket_ptrs[NUM_BUCKETS],
         process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE],
         process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE];
/* Per-peer counts/offsets for the MPI_Alltoallv key redistribution. */
int      send_count[MAX_PROCS], recv_count[MAX_PROCS],
         send_displ[MAX_PROCS], recv_displ[MAX_PROCS];


/**********************/
/* Partial verif info */
/**********************/
INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
         test_rank_array[TEST_ARRAY_SIZE];

/**********/
/* Timers */
/**********/
double start[64], elapsed[64];   /* start stamps / accumulated seconds */

} global_data;
222
223
/* Partial-verification reference data, one pair of tables per class:
   X_test_index_array gives the positions of the five hidden test keys,
   X_test_rank_array the expected global rank of each of those keys
   (before the per-iteration +/- adjustment applied in rank()). */
const INT_TYPE2
         S_test_index_array[TEST_ARRAY_SIZE] = 
                             {48427,17148,23627,62548,4431},
         S_test_rank_array[TEST_ARRAY_SIZE] = 
                             {0,18,346,64917,65463},

         W_test_index_array[TEST_ARRAY_SIZE] = 
                             {357773,934767,875723,898999,404505},
         W_test_rank_array[TEST_ARRAY_SIZE] = 
                             {1249,11698,1039987,1043896,1048018},

         A_test_index_array[TEST_ARRAY_SIZE] = 
                             {2112377,662041,5336171,3642833,4250760},
         A_test_rank_array[TEST_ARRAY_SIZE] = 
                             {104,17523,123928,8288932,8388264},

         B_test_index_array[TEST_ARRAY_SIZE] = 
                             {41869,812306,5102857,18232239,26860214},
         B_test_rank_array[TEST_ARRAY_SIZE] = 
                             {33422937,10244,59149,33135281,99}, 

         C_test_index_array[TEST_ARRAY_SIZE] = 
                             {44172927,72999161,74326391,129606274,21736814},
         C_test_rank_array[TEST_ARRAY_SIZE] = 
                             {61147,882988,266290,133997595,133525895},

         D_test_index_array[TEST_ARRAY_SIZE] = 
                             {1317351170,995930646,1157283250,1503301535,1453734525},
         D_test_rank_array[TEST_ARRAY_SIZE] = 
                             {1,36538729,1978098519,2145192618,2147425337};
254
255
256
/***********************/
/* function prototypes */
/***********************/

/* Portable linear congruential pseudorandom generator (defined below). */
double  randlc( double *X, double *A );

/* Cross-processor final sortedness check (defined below). */
void full_verify( global_data* gd );

/* Benchmark result printout; definition is not in the visible part of
   this file (provided by the NPB common support code — verify). */
void c_print_results( char   *name,
                      char   class,
                      int    n1, 
                      int    n2,
                      int    n3,
                      int    niter,
                      int    nprocs_compiled,
                      int    nprocs_total,
                      double t,
                      double mops,
          char   *optype,
                      int    passed_verification,
                      char   *npbversion,
                      char   *compiletime,
                      char   *mpicc,
                      char   *clink,
                      char   *cmpi_lib,
                      char   *cmpi_inc,
                      char   *cflags,
                      char   *clinkflags );

/* Per-slot wall-clock timers stored in global_data (defined below). */
void    timer_clear(global_data* gd, int n );
void    timer_start(global_data* gd, int n );
void    timer_stop(global_data* gd, int n );
double  timer_read(global_data* gd, int n );
289
290 void    timer_clear(global_data* gd, int n ) {
291    gd->elapsed[n] = 0.0;
292 }
293
294 void    timer_start(global_data* gd, int n ) {
295    gd->start[n] = MPI_Wtime();
296 }
297
298 void    timer_stop(global_data* gd, int n ) {
299    gd->elapsed[n] += MPI_Wtime() - gd->start[n];
300 }
301
302 double  timer_read(global_data* gd, int n ) {
303    return gd->elapsed[n];
304 }
305
306
307 /*
308  *    FUNCTION RANDLC (X, A)
309  *
310  *  This routine returns a uniform pseudorandom double precision number in the
311  *  range (0, 1) by using the linear congruential generator
312  *
313  *  x_{k+1} = a x_k  (mod 2^46)
314  *
315  *  where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
316  *  before repeating.  The argument A is the same as 'a' in the above formula,
317  *  and X is the same as x_0.  A and X must be odd double precision integers
318  *  in the range (1, 2^46).  The returned value RANDLC is normalized to be
319  *  between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
320  *  the new seed x_1, so that subsequent calls to RANDLC using the same
321  *  arguments will generate a continuous sequence.
322  *
323  *  This routine should produce the same results on any computer with at least
324  *  48 mantissa bits in double precision floating point data.  On Cray systems,
325  *  double precision should be disabled.
326  *
327  *  David H. Bailey     October 26, 1990
328  *
329  *     IMPLICIT DOUBLE PRECISION (A-H, O-Z)
330  *     SAVE KS, R23, R46, T23, T46
331  *     DATA KS/0/
332  *
333  *  If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
334  *  T23 = 2 ^ 23, and T46 = 2 ^ 46.  These are computed in loops, rather than
335  *  by merely using the ** operator, in order to insure that the results are
336  *  exact on all systems.  This code assumes that 0.5D0 is represented exactly.
337  */
338
339
/*****************************************************************/
/*************           R  A  N  D  L  C             ************/
/*************                                        ************/
/*************    portable random number generator    ************/
/*****************************************************************/

/*  Returns a uniform pseudorandom double in (0,1) from the linear
 *  congruential recurrence  x_{k+1} = a * x_k (mod 2^46).
 *  *X is the current seed (updated in place to x_{k+1}); *A is the
 *  multiplier (read only).  Both must be odd integers in (1, 2^46).
 *  The 46-bit product is formed exactly in double precision by
 *  splitting each operand into 23-bit halves, so results are
 *  identical on any machine with >= 48 mantissa bits.
 *
 *  NOTE: the closing brace of this function had been lost in a
 *  whitespace-conflict merge; restored here.
 */
double  randlc( double *X, double *A )
{
      static int       KS = 0;
      static double    R23, R46, T23, T46;
      double    T1, T2, T3, T4;
      double    A1, A2;
      double    X1, X2;
      double    Z;
      int       i, j;

      /* One-time setup: R23 = 2^-23, R46 = 2^-46, T23 = 2^23,
         T46 = 2^46, built by repeated halving/doubling so the
         constants are exact on every system. */
      if (KS == 0)
      {
        R23 = 1.0;
        R46 = 1.0;
        T23 = 1.0;
        T46 = 1.0;

        for (i=1; i<=23; i++)
        {
          R23 = 0.50 * R23;
          T23 = 2.0 * T23;
        }
        for (i=1; i<=46; i++)
        {
          R46 = 0.50 * R46;
          T46 = 2.0 * T46;
        }
        KS = 1;
      }

/*  Break A into two parts such that A = 2^23 * A1 + A2.  */

      T1 = R23 * *A;
      j  = T1;
      A1 = j;
      A2 = *A - T23 * A1;

/*  Break X into two parts such that X = 2^23 * X1 + X2, compute
    Z = A1 * X2 + A2 * X1  (mod 2^23), and then
    X = 2^23 * Z + A2 * X2  (mod 2^46).                            */

      T1 = R23 * *X;
      j  = T1;
      X1 = j;
      X2 = *X - T23 * X1;
      T1 = A1 * X2 + A2 * X1;

      j  = R23 * T1;
      T2 = j;
      Z  = T1 - T23 * T2;
      T3 = T23 * Z + A2 * X2;
      j  = R46 * T3;
      T4 = j;
      *X = T3 - T46 * T4;

      /* Normalize the new seed into (0,1). */
      return (R46 * *X);
}
/*****************************************************************/
/************   F  I  N  D  _  M  Y  _  S  E  E  D    ************/
/************                                         ************/
/************ returns parallel random number seq seed ************/
/*****************************************************************/

/*
 * Conceptually there is one random sequence of total length nn shared
 * by np processors, each owning a contiguous chunk of nn/np numbers.
 * This routine jumps the generator ahead and returns the first random
 * number of processor kn's chunk, to be used as that processor's seed.
 */

double   find_my_seed( int  kn,       /* my processor rank, 0<=kn<=num procs */
                       int  np,       /* np = num procs                      */
                       long nn,       /* total num of ran numbers, all procs */
                       double s,      /* Ran num seed, for ex.: 314159265.00 */
                       double a )     /* Ran num gen mult, try 1220703125.00 */
{
  long   iter;
  long   steps;
  long   chunk;
  long   bits, half;
  double an;
  double seed, mult, scratch;

  /* steps = floor(log2(nn/np)): how many squarings are needed below. */
  chunk = nn / np;
  steps = 0;
  while (chunk > 1)
  {
      steps++;
      chunk /= 2;
  }

  /* an = a^(2^steps) mod 2^46, by repeated squaring through randlc. */
  an = a;
  for (iter = 1; iter <= steps; iter++)
      scratch = randlc(&an, &an);

  /* Binary powering over the bits of kn: fold the current multiplier
     into the seed whenever the low bit is set, then square it. */
  seed = s;
  mult = an;
  bits = kn;
  for (iter = 1; iter <= 100; iter++)
  {
      half = bits / 2;
      if (2 * half != bits)
          scratch = randlc(&seed, &mult);
      if (half == 0)
          break;
      scratch = randlc(&mult, &mult);
      bits = half;
  }

  return seed;
}
465
466
467
468
469 /*****************************************************************/
470 /*************      C  R  E  A  T  E  _  S  E  Q      ************/
471 /*****************************************************************/
472
473 void  create_seq( global_data* gd, double seed, double a )
474 {
475   double x;
476   int    i, k;
477
478         k = MAX_KEY/4;
479
480   for (i=0; i<NUM_KEYS; i++)
481   {
482       x = randlc(&seed, &a);
483       x += randlc(&seed, &a);
484           x += randlc(&seed, &a);
485       x += randlc(&seed, &a);  
486
487             gd->key_array[i] = k*x;
488   }
489 }
490
491
492
493
/*****************************************************************/
/*************    F  U  L  L  _  V  E  R  I  F  Y     ************/
/*****************************************************************/


/*  After the last rank() iteration, place every local key in its final
    sorted position, then confirm the global sort: each processor checks
    its own keys are nondecreasing and that its smallest key is not
    smaller than the largest key of the processor to its left.  On
    success gd->passed_verification is incremented; otherwise the number
    of out-of-order keys is printed.  */
void full_verify( global_data* gd )
{
    MPI_Status  status;
    MPI_Request request;
    
    INT_TYPE    i, j;
    INT_TYPE    k, last_local_key;

    
/*  Now, finally, sort the keys:  */
/*  key_buff_ptr_global[v] holds the running rank of key value v; the
    pre-decrement both consumes one slot for this occurrence of v and
    yields its 0-based global position, shifted into local coordinates
    by subtracting total_lesser_keys.  */
    for( i=0; i<gd->total_local_keys; i++ )
        gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]-
                                 gd->total_lesser_keys] = gd->key_buff2[i];
    /* Guard against a processor that ended up with zero keys. */
    last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);

/*  Send largest key value to next processor  */
/*  Nonblocking receive is posted before the send to avoid deadlock in
    this rank-(i) -> rank-(i+1) chain.  */
    if( gd->my_rank > 0 )
        MPI_Irecv( &k,
                   1,
                   MP_KEY_TYPE,
                   gd->my_rank-1,
                   1000,
                   MPI_COMM_WORLD,
                   &request );                   
    if( gd->my_rank < gd->comm_size-1 )
        MPI_Send( &gd->key_array[last_local_key],
                  1,
                  MP_KEY_TYPE,
                  gd->my_rank+1,
                  1000,
                  MPI_COMM_WORLD );
    if( gd->my_rank > 0 )
        MPI_Wait( &request, &status );

/*  Confirm that neighbor's greatest key value 
    is not greater than my least key value       */              
    j = 0;
    if( gd->my_rank > 0 && gd->total_local_keys > 0 )
        if( k > gd->key_array[0] )
            j++;


/*  Confirm keys correctly sorted: count incorrectly sorted keys, if any */
    for( i=1; i<gd->total_local_keys; i++ )
        if( gd->key_array[i-1] > gd->key_array[i] )
            j++;


    if( j != 0 )
    {
        printf( "Processor %d:  Full_verify: number of keys out of sort: %d\n",
                gd->my_rank, j );
    }
    else
        gd->passed_verification++;
           

}
557
558
559
560
/*****************************************************************/
/*************             R  A  N  K             ****************/
/*****************************************************************/


/*  One ranking iteration: count local keys into buckets, sum bucket
    sizes globally (MPI_Allreduce), assign contiguous bucket ranges to
    processors, redistribute the keys (MPI_Alltoall/Alltoallv), and
    compute each key's rank by prefix-summing key populations.  Runs
    the partial verification against the hidden test keys and, on the
    final iteration, saves the state full_verify() needs.  */
void rank( global_data* gd, int iteration )
{

    INT_TYPE    i, k;

    INT_TYPE    shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
    INT_TYPE    key;
    INT_TYPE2   bucket_sum_accumulator, j, m;
    INT_TYPE    local_bucket_sum_accumulator;
    INT_TYPE    min_key_val, max_key_val;
    INT_TYPE    *key_buff_ptr;




/*  Iteration alteration of keys */  
/*  Perturb two keys per iteration so each iteration ranks a slightly
    different sequence (and verification values shift with iteration). */
    if(gd->my_rank == 0 )                    
    {
      gd->key_array[iteration] = iteration;
      gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
    }


/*  Initialize */
    for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )  
    {
        gd->bucket_size[i] = 0;
        gd->bucket_size_totals[i] = 0;
        gd->process_bucket_distrib_ptr1[i] = 0;
        gd->process_bucket_distrib_ptr2[i] = 0;
    }


/*  Determine where the partial verify test keys are, load into  */
/*  top of array bucket_size                                     */
/*  (These extra slots ride along in the Allreduce below, which is how
    every processor learns the test key values.)                     */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        if( (gd->test_index_array[i]/NUM_KEYS) == gd->my_rank )
            gd->bucket_size[NUM_BUCKETS+i] = 
                          gd->key_array[gd->test_index_array[i] % NUM_KEYS];


/*  Determine the number of keys in each bucket */
/*  Bucket index is the key's high-order NUM_BUCKETS_LOG_2 bits. */
    for( i=0; i<NUM_KEYS; i++ )
        gd->bucket_size[gd->key_array[i] >> shift]++;


/*  Accumulative bucket sizes are the bucket pointers */
    gd->bucket_ptrs[0] = 0;
    for( i=1; i< NUM_BUCKETS; i++ )  
        gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];


/*  Sort into appropriate bucket */
    for( i=0; i<NUM_KEYS; i++ )  
    {
        key = gd->key_array[i];
        gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
    }

#ifdef  TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_start(gd, 3 );
#endif

/*  Get the bucket size totals for the entire problem. These 
    will be used to determine the redistribution of keys      */
    MPI_Allreduce( gd->bucket_size, 
                   gd->bucket_size_totals, 
                   NUM_BUCKETS+TEST_ARRAY_SIZE, 
                   MP_KEY_TYPE,
                   MPI_SUM,
                   MPI_COMM_WORLD );

#ifdef  TIMING_ENABLED
    timer_stop(gd, 3 );
    timer_start(gd, 2 );
#endif

/*  Determine Redistibution of keys: accumulate the bucket size totals 
    till this number surpasses NUM_KEYS (which the average number of keys
    per processor).  Then all keys in these buckets go to processor 0.
    Continue accumulating again until supassing 2*NUM_KEYS. All keys
    in these buckets go to processor 1, etc.  This algorithm guarantees
    that all processors have work ranking; no processors are left idle.
    The optimum number of buckets, however, does not result in as high
    a degree of load balancing (as even a distribution of keys as is
    possible) as is obtained from increasing the number of buckets, but
    more buckets results in more computation per processor so that the
    optimum number of buckets turns out to be 1024 for machines tested.
    Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket
    number of first and last bucket which each processor will have after   
    the redistribution is done.                                          */

    bucket_sum_accumulator = 0;
    local_bucket_sum_accumulator = 0;
    gd->send_displ[0] = 0;
    gd->process_bucket_distrib_ptr1[0] = 0;
    for( i=0, j=0; i<NUM_BUCKETS; i++ )  
    {
        bucket_sum_accumulator       += gd->bucket_size_totals[i];
        local_bucket_sum_accumulator += gd->bucket_size[i];
        if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )  
        {
            /* processor j takes all buckets up to and including i */
            gd->send_count[j] = local_bucket_sum_accumulator;
            if( j != 0 )
            {
                gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
                gd->process_bucket_distrib_ptr1[j] = 
                                        gd->process_bucket_distrib_ptr2[j-1]+1;
            }
            gd->process_bucket_distrib_ptr2[j++] = i;
            local_bucket_sum_accumulator = 0;
        }
    }

/*  When NUM_PROCS approaching NUM_BUCKETS, it is highly possible
    that the last few processors don't get any buckets.  So, we
    need to set counts properly in this case to avoid any fallouts.    */
/*  ptr1 is set to 1 while ptr2 stays 0, so ptr1 > ptr2 and the
    per-bucket loops below execute zero times for such processors.  */
    while( j < gd->comm_size )
    {
        gd->send_count[j] = 0;
        gd->process_bucket_distrib_ptr1[j] = 1;
        j++;
    }

#ifdef  TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_start(gd, 3 ); 
#endif

/*  This is the redistribution section:  first find out how many keys
    each processor will send to every other processor:                 */
    MPI_Alltoall( gd->send_count,
                  1,
                  MPI_INT,
                  gd->recv_count,
                  1,
                  MPI_INT,
                  MPI_COMM_WORLD );

/*  Determine the receive array displacements for the buckets */    
    gd->recv_displ[0] = 0;
    for( i=1; i<gd->comm_size; i++ )
        gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];


/*  Now send the keys to respective processors  */    
    MPI_Alltoallv( gd->key_buff1,
                   gd->send_count,
                   gd->send_displ,
                   MP_KEY_TYPE,
                   gd->key_buff2,
                   gd->recv_count,
                   gd->recv_displ,
                   MP_KEY_TYPE,
                   MPI_COMM_WORLD );

#ifdef  TIMING_ENABLED
    timer_stop(gd, 3 ); 
    timer_start(gd, 2 );
#endif

/*  The starting and ending bucket numbers on each processor are
    multiplied by the interval size of the buckets to obtain the 
    smallest possible min and greatest possible max value of any 
    key on each processor                                          */
    min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
    max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;

/*  Clear the work array */
    for( i=0; i<max_key_val-min_key_val+1; i++ )
        gd->key_buff1[i] = 0;

/*  Determine the total number of keys on all other 
    processors holding keys of lesser value         */
    m = 0;
    for( k=0; k<gd->my_rank; k++ )
        for( i= gd->process_bucket_distrib_ptr1[k];
             i<=gd->process_bucket_distrib_ptr2[k];
             i++ )  
            m += gd->bucket_size_totals[i]; /*  m has total # of lesser keys */

/*  Determine total number of keys on this processor */
    j = 0;                                 
    for( i= gd->process_bucket_distrib_ptr1[gd->my_rank];
         i<=gd->process_bucket_distrib_ptr2[gd->my_rank];
         i++ )  
        j += gd->bucket_size_totals[i];     /* j has total # of local keys   */


/*  Ranking of all keys occurs in this section:                 */
/*  shift it backwards so no subtractions are necessary in loop */
    key_buff_ptr = gd->key_buff1 - min_key_val;

/*  In this section, the keys themselves are used as their 
    own indexes to determine how many of each there are: their
    individual population                                       */
    for( i=0; i<j; i++ )
        key_buff_ptr[gd->key_buff2[i]]++;  /* Now they have individual key   */
                                       /* population                     */

/*  To obtain ranks of each key, successively add the individual key
    population, not forgetting the total of lesser keys, m.
    NOTE: Since the total of lesser keys would be subtracted later 
    in verification, it is no longer added to the first key population 
    here, but still needed during the partial verify test.  This is to 
    ensure that 32-bit key_buff can still be used for class D.           */
/*    key_buff_ptr[min_key_val] += m;    */
    for( i=min_key_val; i<max_key_val; i++ )   
        key_buff_ptr[i+1] += key_buff_ptr[i];  


/* This is the partial verify test section */
/* Observe that test_rank_array vals are   */
/* shifted differently for different cases */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
    {                                             
        k = gd->bucket_size_totals[i+NUM_BUCKETS];    /* Keys were hidden here */
        if( min_key_val <= k  &&  k <= max_key_val )
        {
            /* Add the total of lesser keys, m, here */
            INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
            int failed = 0;

            /* NOTE(review): no default case — CLASS is one of the
               compile-time constants 'S','W','A','B','C','D' above. */
            switch( CLASS )
            {
                case 'S':
                    if( i <= 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'W':
                    if( i < 2 )
                    {
                        if( key_rank != gd->test_rank_array[i]+(iteration-2) )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'A':
                    if( i <= 2 )
              {
                        if( key_rank != gd->test_rank_array[i]+(iteration-1) )
                            failed = 1;
                        else
                          gd->passed_verification++;
              }
                    else
                    {
                        if( key_rank !=  gd->test_rank_array[i]-(iteration-1) )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'B':
                    if( i == 1 || i == 2 || i == 4 )
              {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
              }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'C':
                    if( i <= 2 )
              {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
              }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
                case 'D':
                    if( i < 2 )
              {
                        if( key_rank != gd->test_rank_array[i]+iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
              }
                    else
                    {
                        if( key_rank != gd->test_rank_array[i]-iteration )
                            failed = 1;
                        else
                            gd->passed_verification++;
                    }
                    break;
            }
            if( failed == 1 )
                printf( "Failed partial verification: "
                        "iteration %d, processor %d, test key %d\n", 
                         iteration, gd->my_rank, (int)i );
        }
    }




/*  Make copies of rank info for use by full_verify: these variables
    in rank are local; making them global slows down the code, probably
    since they cannot be made register by compiler                        */

    if( iteration == MAX_ITERATIONS ) 
    {
        gd->key_buff_ptr_global = key_buff_ptr;
        gd->total_local_keys    = j;
        gd->total_lesser_keys   = 0;  /* no longer set to 'm', see note above */
    }

}      
911
912
913 /*****************************************************************/
914 /*************             M  A  I  N             ****************/
915 /*****************************************************************/
916
917 int main( int argc, char **argv )
918 {
919
920     int             i, iteration, itemp;
921
922     double          timecounter, maxtime;
923
924     global_data* gd = malloc(sizeof(global_data));
925 /*  Initialize MPI */
926     MPI_Init( &argc, &argv );
927     MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
928     MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );
929
930 /*  Initialize the verification arrays if a valid class */
931     for( i=0; i<TEST_ARRAY_SIZE; i++ )
932         switch( CLASS )
933         {
934             case 'S':
935                 gd->test_index_array[i] = S_test_index_array[i];
936                 gd->test_rank_array[i]  = S_test_rank_array[i];
937                 break;
938             case 'A':
939                 gd->test_index_array[i] = A_test_index_array[i];
940                 gd->test_rank_array[i]  = A_test_rank_array[i];
941                 break;
942             case 'W':
943                 gd->test_index_array[i] = W_test_index_array[i];
944                 gd->test_rank_array[i]  = W_test_rank_array[i];
945                 break;
946             case 'B':
947                 gd->test_index_array[i] = B_test_index_array[i];
948                 gd->test_rank_array[i]  = B_test_rank_array[i];
949                 break;
950             case 'C':
951                 gd->test_index_array[i] = C_test_index_array[i];
952                 gd->test_rank_array[i]  = C_test_rank_array[i];
953                 break;
954             case 'D':
955                 gd->test_index_array[i] = D_test_index_array[i];
956                 gd->test_rank_array[i]  = D_test_rank_array[i];
957                 break;
958         };
959
960         
961
962 /*  Printout initial NPB info */
963     if( gd->my_rank == 0 )
964     {
965         printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
966         printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS );
967         printf( " Iterations:   %d\n", MAX_ITERATIONS );
968         printf( " Number of processes:     %d\n",gd->comm_size );
969     }
970
971 /*  Check that actual and compiled number of processors agree */
972     if( gd->comm_size != NUM_PROCS )
973     {
974         if( gd->my_rank == 0 )
975             printf( "\n ERROR: compiled for %d processes\n"
976                     " Number of active processes: %d\n"
977                     " Exiting program!\n\n", NUM_PROCS, gd->comm_size );
978         MPI_Finalize();
979         exit( 1 );
980     }
981
982 /*  Check to see whether total number of processes is within bounds.
983     This could in principle be checked in setparams.c, but it is more
984     convenient to do it here                                               */
985     if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS)
986     {
987        if( gd->my_rank == 0 )
988            printf( "\n ERROR: number of processes %d not within range %d-%d"
989                    "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS);
990        MPI_Finalize();
991        exit( 1 );
992     }
993
994
995 /*  Generate random number sequence and subsequent keys on all procs */
996     create_seq(gd,  find_my_seed( gd->my_rank, 
997                               gd->comm_size, 
998                               4*(long)TOTAL_KEYS*MIN_PROCS,
999                               314159265.00,      /* Random number gen seed */
1000                               1220703125.00 ),   /* Random number gen mult */
1001                 1220703125.00 );                 /* Random number gen mult */
1002
1003 /*  Do one interation for free (i.e., untimed) to guarantee initialization of  
1004     all data and code pages and respective tables */
1005     rank(gd, 1 );  
1006
1007 /*  Start verification counter */
1008     gd->passed_verification = 0;
1009
1010     if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n   iteration\n" );
1011
1012 /*  Initialize timer  */             
1013     timer_clear(gd, 0 );
1014
1015 /*  Initialize separate communication, computation timing */
1016 #ifdef  TIMING_ENABLED 
1017     for( i=1; i<=3; i++ ) timer_clear(gd, i );
1018 #endif
1019
1020 /*  Start timer  */             
1021     timer_start(gd, 0 );
1022
1023 #ifdef  TIMING_ENABLED
1024     timer_start(gd, 1 );
1025     timer_start(gd, 2 );
1026 #endif
1027
1028 /*  This is the main iteration */
1029     for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
1030     {
1031         if( gd->my_rank == 0 && CLASS != 'S' ) printf( "        %d\n", iteration );
1032         rank(gd,  iteration );
1033     }
1034
1035
1036 #ifdef  TIMING_ENABLED
1037     timer_stop(gd, 2 );
1038     timer_stop(gd, 1 );
1039 #endif
1040
1041 /*  Stop timer, obtain time for processors */
1042     timer_stop(gd, 0 );
1043
1044     timecounter = timer_read(gd, 0 );
1045
1046 /*  End of timing, obtain maximum time of all processors */
1047     MPI_Reduce( &timecounter,
1048                 &maxtime,
1049                 1,
1050                 MPI_DOUBLE,
1051                 MPI_MAX,
1052                 0,
1053                 MPI_COMM_WORLD );
1054
1055 #ifdef  TIMING_ENABLED
1056     {
1057         double    tmin, tsum, tmax;
1058     
1059         if( my_rank == 0 )
1060         {
1061             printf( "\ntimer 1/2/3 = total/computation/communication time\n");
1062             printf( "              min                avg                max\n" );
1063         }
1064         for( i=1; i<=3; i++ )
1065         {
1066             timecounter = timer_read(gd, i );
1067             MPI_Reduce( &timecounter,
1068                         &tmin,
1069                         1,
1070                         MPI_DOUBLE,
1071                         MPI_MIN,
1072                         0,
1073                         MPI_COMM_WORLD );
1074             MPI_Reduce( &timecounter,
1075                         &tsum,
1076                         1,
1077                         MPI_DOUBLE,
1078                         MPI_SUM,
1079                         0,
1080                         MPI_COMM_WORLD );
1081             MPI_Reduce( &timecounter,
1082                         &tmax,
1083                         1,
1084                         MPI_DOUBLE,
1085                         MPI_MAX,
1086                         0,
1087                         MPI_COMM_WORLD );
1088             if( my_rank == 0 )
1089                 printf( "timer %d:    %f           %f            %f\n",
1090                         i, tmin, tsum/((double) comm_size), tmax );
1091         }
1092         if( my_rank == 0 )
1093             printf( "\n" );
1094     }
1095 #endif
1096
1097 /*  This tests that keys are in sequence: sorting of last ranked key seq
1098     occurs here, but is an untimed operation                             */
1099     full_verify(gd);
1100
1101
1102 /*  Obtain verification counter sum */
1103     itemp =gd->passed_verification;
1104     MPI_Reduce( &itemp,
1105                 &gd->passed_verification,
1106                 1,
1107                 MPI_INT,
1108                 MPI_SUM,
1109                 0,
1110                 MPI_COMM_WORLD );
1111
1112
1113
1114 /*  The final printout  */
1115     if( gd->my_rank == 0 )
1116     {
1117         if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
1118             gd->passed_verification = 0;
1119         c_print_results( "IS",
1120                          CLASS,
1121                          (int)(TOTAL_KEYS),
1122                          MIN_PROCS,
1123                          0,
1124                          MAX_ITERATIONS,
1125                          NUM_PROCS,
1126                          gd->comm_size,
1127                          maxtime,
1128                          ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS)
1129                                                       /maxtime/1000000.,
1130                          "keys ranked", 
1131                          gd->passed_verification,
1132                          NPBVERSION,
1133                          COMPILETIME,
1134                          MPICC,
1135                          CLINK,
1136                          CMPI_LIB,
1137                          CMPI_INC,
1138                          CFLAGS,
1139                          CLINKFLAGS );
1140     }
1141                     
1142     MPI_Finalize();
1143     free(gd);
1144
1145     return 0;
1146          /**************************/
1147 }        /*  E N D  P R O G R A M  */
1148          /**************************/