examples/smpi/NAS/is.c

   1 /*************************************************************************
   2  *                                                                       *
   3  *        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3       *
   4  *                                                                       *
   5  *                                  I S                                  *
   6  *                                                                       *
   7  *************************************************************************
   8  *                                                                       *
   9  *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
  10  *   It is described in NAS Technical Report 95-020.                     *
  11  *                                                                       *
  12  *   Permission to use, copy, distribute and modify this software        *
  13  *   for any purpose with or without fee is hereby granted.  We          *
  14  *   request, however, that all derived work reference the NAS           *
  15  *   Parallel Benchmarks 3.3. This software is provided "as is"          *
  16  *   without express or implied warranty.                                *
  17  *                                                                       *
  18  *   Information on NPB 3.3, including the technical report, the         *
  19  *   original specifications, source code, results and information       *
  20  *   on how to submit new results, is available at:                      *
  21  *                                                                       *
  22  *          http://www.nas.nasa.gov/Software/NPB                         *
  23  *                                                                       *
  24  *   Send comments or suggestions to  npb@nas.nasa.gov                   *
  25  *   Send bug reports to              npb-bugs@nas.nasa.gov              *
  26  *                                                                       *
  27  *         NAS Parallel Benchmarks Group                                 *
  28  *         NASA Ames Research Center                                     *
  29  *         Mail Stop: T27A-1                                             *
  30  *         Moffett Field, CA   94035-1000                                *
  31  *                                                                       *
  32  *         E-mail:  npb@nas.nasa.gov                                     *
  33  *         Fax:     (650) 604-3957                                       *
  34  *                                                                       *
  35  *************************************************************************
  36  *                                                                       *
  37  *   Author: M. Yarrow                                                   *
  38  *           H. Jin                                                      *
  39  *                                                                       *
  40  *************************************************************************/
  41
  42 #include "smpi/mpi.h"
  43 #include "nas_common.h"
  44 #include <stdlib.h>
  45 #include <stdio.h>
  46
  47 #include "simgrid/instr.h" //TRACE_
  48
  49 char class;
  50 int nprocs;
  51 int total_keys_log2;
  52 int max_key_log_2;
  53 int num_bucket_log_2;
  54 int min_procs=1;
  55 /* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024.
  56  * INCREASE max_procs AT YOUR PERIL */
  57 int max_procs=1024;
  58
  59 int total_keys;
  60 int max_key;
  61 int num_buckets;
  62 int num_keys;
  63 long size_of_buffers;
  64
  65 #define  MAX_ITERATIONS      10
  66 #define  TEST_ARRAY_SIZE     5
  67
  68 /* Typedef: if necessary, change the size of int here by changing the  int type to, say, long */
  69 typedef  int  INT_TYPE;
  70 typedef  long INT_TYPE2;
  71 #define MP_KEY_TYPE MPI_INT
  72
  73 typedef struct {
  74 /* MPI properties:  */
  75 int      my_rank, comm_size;
  76 /* Some global info */
  77 INT_TYPE *key_buff_ptr_global,         /* used by full_verify to get */
  78          total_local_keys,             /* copies of rank info        */
  79          total_lesser_keys;
  80
  81 int      passed_verification;
  82 /* These are the three main arrays. See SIZE_OF_BUFFERS def above    */
  83 INT_TYPE *key_array, *key_buff1, *key_buff2,
  84          *bucket_size,     /* Top 5 elements for */
  85          *bucket_size_totals, /* part. ver. vals */
  86          *bucket_ptrs, *process_bucket_distrib_ptr1, *process_bucket_distrib_ptr2;
  87 int      send_count[1024], recv_count[1024], send_displ[1024], recv_displ[1024];
  88
  89 /* Partial verif info */
  90 INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
  91          test_rank_array[TEST_ARRAY_SIZE];
  92 } global_data;
  93
  94 const INT_TYPE2
  95          S_test_index_array[TEST_ARRAY_SIZE] = {48427,17148,23627,62548,4431},
  96          S_test_rank_array[TEST_ARRAY_SIZE] =  {0,18,346,64917,65463},
  97          W_test_index_array[TEST_ARRAY_SIZE] = {357773,934767,875723,898999,404505},
  98          W_test_rank_array[TEST_ARRAY_SIZE] =  {1249,11698,1039987,1043896,1048018},
  99
 100          A_test_index_array[TEST_ARRAY_SIZE] = {2112377,662041,5336171,3642833,4250760},
 101          A_test_rank_array[TEST_ARRAY_SIZE] =  {104,17523,123928,8288932,8388264},
 102
 103          B_test_index_array[TEST_ARRAY_SIZE] = {41869,812306,5102857,18232239,26860214},
 104          B_test_rank_array[TEST_ARRAY_SIZE] =  {33422937,10244,59149,33135281,99},
 105
 106          C_test_index_array[TEST_ARRAY_SIZE] = {44172927,72999161,74326391,129606274,21736814},
 107          C_test_rank_array[TEST_ARRAY_SIZE] =  {61147,882988,266290,133997595,133525895},
 108
 109          D_test_index_array[TEST_ARRAY_SIZE] = {1317351170,995930646,1157283250,1503301535,1453734525},
 110          D_test_rank_array[TEST_ARRAY_SIZE] =  {1,36538729,1978098519,2145192618,2147425337};
 111
 112 void full_verify( global_data* gd );
 113
 114 /************ returns parallel random number seq seed ************/
 115 /*
 116  * Create a random number sequence of total length nn residing on np number of processors.  Each processor will
 117  * therefore have a subsequence of length nn/np.  This routine returns that random number which is the first random
 118  * number for the subsequence belonging to processor rank kn, and which is used as seed for proc kn ran # gen.
 119  */
 120 static double  find_my_seed( int  kn,       /* my processor rank, 0<=kn<=num procs */
 121                              int  np,       /* np = num procs                      */
 122                              long nn,       /* total num of ran numbers, all procs */
 123                              double s,      /* Ran num seed, for ex.: 314159265.00 */
 124                              double a )     /* Ran num gen mult, try 1220703125.00 */
 125 {
 126   long   i;
 127   double t1,t2,an;
 128   long   mq,nq,kk,ik;
 129
 130   nq = nn / np;
 131
 132   for( mq=0; nq>1; mq++,nq/=2);
 133
 134   t1 = a;
 135
 136   for( i=1; i<=mq; i++ )
 137     t2 = randlc( &t1, &t1 );
 138
 139   an = t1;
 140
 141   kk = kn;
 142   t1 = s;
 143   t2 = an;
 144
 145   for( i=1; i<=100; i++ ){
 146     ik = kk / 2;
 147     if( 2 * ik !=  kk )
 148       randlc( &t1, &t2 );
 149     if( ik == 0 )
 150       break;
 151     randlc( &t2, &t2 );
 152     kk = ik;
 153   }
 154
 155   return t1;
 156 }
 157
 158 static void create_seq( global_data* gd, double seed, double a )
 159 {
 160   double x;
 161   int    i, k;
 162
 163   k = max_key/4;
 164
 165   for (i=0; i<num_keys; i++){
 166     x = randlc(&seed, &a);
 167     x += randlc(&seed, &a);
 168     x += randlc(&seed, &a);
 169     x += randlc(&seed, &a);
 170
 171     gd->key_array[i] = k*x;
 172   }
 173 }
 174
 175 void full_verify( global_data* gd )
 176 {
 177   MPI_Status  status;
 178   MPI_Request request;
 179
 180   INT_TYPE    i, j;
 181   INT_TYPE    k, last_local_key;
 182
 183 /*  Now, finally, sort the keys:  */
 184   for( i=0; i<gd->total_local_keys; i++ )
 185     gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]- gd->total_lesser_keys] = gd->key_buff2[i];
 186   last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);
 187
 188 /*  Send largest key value to next processor  */
 189   if( gd->my_rank > 0 )
 190     MPI_Irecv( &k, 1, MP_KEY_TYPE, gd->my_rank-1, 1000, MPI_COMM_WORLD, &request );
 191   if( gd->my_rank < gd->comm_size-1 )
 192     MPI_Send( &gd->key_array[last_local_key], 1, MP_KEY_TYPE, gd->my_rank+1, 1000, MPI_COMM_WORLD );
 193   if( gd->my_rank > 0 )
 194     MPI_Wait( &request, &status );
 195
 196 /*  Confirm that neighbor's greatest key value is not greater than my least key value */
 197   j = 0;
 198   if( gd->my_rank > 0 && gd->total_local_keys > 0 )
 199     if( k > gd->key_array[0] )
 200       j++;
 201
 202 /*  Confirm keys correctly sorted: count incorrectly sorted keys, if any */
 203   for( i=1; i<gd->total_local_keys; i++ )
 204     if( gd->key_array[i-1] > gd->key_array[i] )
 205       j++;
 206
 207   if( j != 0 ) {
 208     printf( "Processor %d:  Full_verify: number of keys out of sort: %d\n", gd->my_rank, j );
 209   } else
 210     gd->passed_verification++;
 211 }
 212
 213 static void rank( global_data* gd, int iteration )
 214 {
 215   INT_TYPE    i, k;
 216   INT_TYPE    shift = max_key_log_2 - num_bucket_log_2;
 217   INT_TYPE    key;
 218   INT_TYPE2   bucket_sum_accumulator, j, m;
 219   INT_TYPE    local_bucket_sum_accumulator;
 220   INT_TYPE    min_key_val, max_key_val;
 221   INT_TYPE    *key_buff_ptr;
 222
 223 /*  Iteration alteration of keys */
 224   if(gd->my_rank == 0){
 225     gd->key_array[iteration] = iteration;
 226     gd->key_array[iteration+MAX_ITERATIONS] = max_key - iteration;
 227   }
 228
 229 /*  Initialize */
 230   for( i=0; i<num_buckets+TEST_ARRAY_SIZE; i++ ){
 231     gd->bucket_size[i] = 0;
 232     gd->bucket_size_totals[i] = 0;
 233     gd->process_bucket_distrib_ptr1[i] = 0;
 234     gd->process_bucket_distrib_ptr2[i] = 0;
 235   }
 236
 237 /*  Determine where the partial verify test keys are, load into top of array bucket_size */
 238   for( i=0; i<TEST_ARRAY_SIZE; i++ )
 239     if( (gd->test_index_array[i]/num_keys) == gd->my_rank )
 240       gd->bucket_size[num_buckets+i] = gd->key_array[gd->test_index_array[i] % num_keys];
 241
 242 /*  Determine the number of keys in each bucket */
 243   for( i=0; i<num_keys; i++ )
 244     gd->bucket_size[gd->key_array[i] >> shift]++;
 245
 246 /*  Accumulative bucket sizes are the bucket pointers */
 247   gd->bucket_ptrs[0] = 0;
 248   for( i=1; i< num_buckets; i++ )
 249     gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];
 250
 251 /*  Sort into appropriate bucket */
 252   for( i=0; i<num_keys; i++ ) {
 253     key = gd->key_array[i];
 254     gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
 255   }
 256
 257 /*  Get the bucket size totals for the entire problem. These will be used to determine the redistribution of keys */
 258   MPI_Allreduce(gd->bucket_size, gd->bucket_size_totals, num_buckets+TEST_ARRAY_SIZE, MP_KEY_TYPE, MPI_SUM,
 259                 MPI_COMM_WORLD);
 260
 261 /* Determine Redistibution of keys: accumulate the bucket size totals till this number surpasses num_keys (which the
 262  * average number of keys per processor).  Then all keys in these buckets go to processor 0.
 263    Continue accumulating again until supassing 2*num_keys. All keys in these buckets go to processor 1, etc.  This
 264    algorithm guarantees that all processors have work ranking; no processors are left idle.
 265    The optimum number of buckets, however, does not result in as high a degree of load balancing (as even a distribution
 266    of keys as is possible) as is obtained from increasing the number of buckets, but more buckets results in more
 267    computation per processor so that the optimum number of buckets turns out to be 1024 for machines tested.
 268    Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket number of first and last bucket which each
 269    processor will have after the redistribution is done.
 270 */
 271
 272   bucket_sum_accumulator = 0;
 273   local_bucket_sum_accumulator = 0;
 274   gd->send_displ[0] = 0;
 275   gd->process_bucket_distrib_ptr1[0] = 0;
 276   for( i=0, j=0; i<num_buckets; i++ ) {
 277     bucket_sum_accumulator       += gd->bucket_size_totals[i];
 278     local_bucket_sum_accumulator += gd->bucket_size[i];
 279     if( bucket_sum_accumulator >= (j+1)*num_keys ) {
 280       gd->send_count[j] = local_bucket_sum_accumulator;
 281       if( j != 0 ){
 282          gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
 283          gd->process_bucket_distrib_ptr1[j] = gd->process_bucket_distrib_ptr2[j-1]+1;
 284       }
 285       gd->process_bucket_distrib_ptr2[j++] = i;
 286       local_bucket_sum_accumulator = 0;
 287     }
 288   }
 289
 290 /*  When nprocs approaching num_buckets, it is highly possible that the last few processors don't get any buckets.
 291  *  So, we need to set counts properly in this case to avoid any fallouts.    */
 292   while( j < gd->comm_size ) {
 293     gd->send_count[j] = 0;
 294     gd->process_bucket_distrib_ptr1[j] = 1;
 295     j++;
 296   }
 297
 298 /*  This is the redistribution section:  first find out how many keys
 299     each processor will send to every other processor:                 */
 300   MPI_Alltoall( gd->send_count, 1, MPI_INT, gd->recv_count, 1, MPI_INT, MPI_COMM_WORLD );
 301
 302 /*  Determine the receive array displacements for the buckets */
 303   gd->recv_displ[0] = 0;
 304   for( i=1; i<gd->comm_size; i++ )
 305     gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];
 306
 307   /*  Now send the keys to respective processors  */
 308   MPI_Alltoallv(gd->key_buff1, gd->send_count, gd->send_displ, MP_KEY_TYPE, gd->key_buff2, gd->recv_count,
 309                 gd->recv_displ, MP_KEY_TYPE, MPI_COMM_WORLD );
 310
 311 /* The starting and ending bucket numbers on each processor are multiplied by the interval size of the buckets to
 312  * obtain the smallest possible min and greatest possible max value of any key on each processor
 313  */
 314   min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
 315   max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;
 316
 317 /*  Clear the work array */
 318   for( i=0; i<max_key_val-min_key_val+1; i++ )
 319     gd->key_buff1[i] = 0;
 320
 321 /*  Determine the total number of keys on all other processors holding keys of lesser value         */
 322   m = 0;
 323   for( k=0; k<gd->my_rank; k++ )
 324     for( i= gd->process_bucket_distrib_ptr1[k]; i<=gd->process_bucket_distrib_ptr2[k]; i++ )
 325       m += gd->bucket_size_totals[i]; /*  m has total # of lesser keys */
 326
 327 /*  Determine total number of keys on this processor */
 328   j = 0;
 329   for( i= gd->process_bucket_distrib_ptr1[gd->my_rank]; i<=gd->process_bucket_distrib_ptr2[gd->my_rank]; i++ )
 330     j += gd->bucket_size_totals[i];     /* j has total # of local keys   */
 331
 332 /*  Ranking of all keys occurs in this section:                 */
 333 /*  shift it backwards so no subtractions are necessary in loop */
 334   key_buff_ptr = gd->key_buff1 - min_key_val;
 335
 336 /*  In this section, the keys themselves are used as their own indexes to determine how many of each there are: their
 337     individual population                                       */
 338   for( i=0; i<j; i++ )
 339     key_buff_ptr[gd->key_buff2[i]]++;  /* Now they have individual key  population                     */
 340
 341 /*  To obtain ranks of each key, successively add the individual key population, not forgetting the total of lesser
 342  *  keys, m.
 343     NOTE: Since the total of lesser keys would be subtracted later in verification, it is no longer added to the first
 344     key population here, but still needed during the partial verify test.  This is to ensure that 32-bit key_buff can
 345     still be used for class D.           */
 346 /*    key_buff_ptr[min_key_val] += m;    */
 347   for( i=min_key_val; i<max_key_val; i++ )
 348     key_buff_ptr[i+1] += key_buff_ptr[i];
 349
 350 /* This is the partial verify test section */
 351 /* Observe that test_rank_array vals are shifted differently for different cases */
 352   for( i=0; i<TEST_ARRAY_SIZE; i++ ){
 353     k = gd->bucket_size_totals[i+num_buckets];    /* Keys were hidden here */
 354     if( min_key_val <= k  &&  k <= max_key_val ){
 355       /* Add the total of lesser keys, m, here */
 356       INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
 357       int failed = 0;
 358
 359       switch( class ){
 360         case 'S':
 361           if( i <= 2 ) {
 362             if( key_rank != gd->test_rank_array[i]+iteration )
 363               failed = 1;
 364             else
 365               gd->passed_verification++;
 366           } else {
 367             if( key_rank != gd->test_rank_array[i]-iteration )
 368               failed = 1;
 369             else
 370               gd->passed_verification++;
 371           }
 372           break;
 373         case 'W':
 374           if( i < 2 ){
 375             if( key_rank != gd->test_rank_array[i]+(iteration-2) )
 376               failed = 1;
 377             else
 378               gd->passed_verification++;
 379           } else {
 380               if( key_rank != gd->test_rank_array[i]-iteration )
 381                 failed = 1;
 382               else
 383                 gd->passed_verification++;
 384           }
 385           break;
 386         case 'A':
 387           if( i <= 2 ){
 388             if( key_rank != gd->test_rank_array[i]+(iteration-1) )
 389               failed = 1;
 390             else
 391               gd->passed_verification++;
 392           } else {
 393               if( key_rank !=  gd->test_rank_array[i]-(iteration-1) )
 394                 failed = 1;
 395               else
 396                 gd->passed_verification++;
 397           }
 398           break;
 399         case 'B':
 400           if( i == 1 || i == 2 || i == 4 ) {
 401             if( key_rank != gd->test_rank_array[i]+iteration )
 402               failed = 1;
 403             else
 404               gd->passed_verification++;
 405           } else {
 406               if( key_rank != gd->test_rank_array[i]-iteration )
 407                 failed = 1;
 408               else
 409                 gd->passed_verification++;
 410           }
 411           break;
 412         case 'C':
 413           if( i <= 2 ){
 414             if( key_rank != gd->test_rank_array[i]+iteration )
 415               failed = 1;
 416             else
 417               gd->passed_verification++;
 418           } else {
 419               if( key_rank != gd->test_rank_array[i]-iteration )
 420                 failed = 1;
 421               else
 422                 gd->passed_verification++;
 423           }
 424           break;
 425         case 'D':
 426           if( i < 2 ) {
 427             if( key_rank != gd->test_rank_array[i]+iteration )
 428               failed = 1;
 429             else
 430               gd->passed_verification++;
 431            } else {
 432               if( key_rank != gd->test_rank_array[i]-iteration )
 433                 failed = 1;
 434               else
 435                 gd->passed_verification++;
 436            }
 437          break;
 438       }
 439       if( failed == 1 )
 440         printf( "Failed partial verification: iteration %d, processor %d, test key %d\n",
 441                iteration, gd->my_rank, (int)i );
 442     }
 443   }
 444
 445 /*  Make copies of rank info for use by full_verify: these variables in rank are local; making them global slows down
 446  *  the code, probably since they cannot be made register by compiler                        */
 447
 448   if( iteration == MAX_ITERATIONS ) {
 449     gd->key_buff_ptr_global = key_buff_ptr;
 450     gd->total_local_keys    = j;
 451     gd->total_lesser_keys   = 0;  /* no longer set to 'm', see note above */
 452   }
 453 }
 454
 455 int main( int argc, char **argv )
 456 {
 457   int             i, iteration, itemp;
 458   double          timecounter, maxtime;
 459
 460   global_data* gd = malloc(sizeof(global_data));
 461 /*  Initialize MPI */
 462   MPI_Init( &argc, &argv );
 463   MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
 464   MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );
 465
 466   get_info(argc, argv, &nprocs, &class);
 467   check_info(IS, nprocs, class);
 468 /*  Initialize the verification arrays if a valid class */
 469   for( i=0; i<TEST_ARRAY_SIZE; i++ )
 470
 471     switch( class ) {
 472       case 'S':
 473          total_keys_log2 = 16;
 474          max_key_log_2 = 11;
 475          num_bucket_log_2 = 9;
 476          max_procs = 128;
 477          gd->test_index_array[i] = S_test_index_array[i];
 478          gd->test_rank_array[i]  = S_test_rank_array[i];
 479          break;
 480       case 'A':
 481          total_keys_log2 = 23;
 482          max_key_log_2 = 19;
 483          num_bucket_log_2 = 10;
 484          gd->test_index_array[i] = A_test_index_array[i];
 485          gd->test_rank_array[i]  = A_test_rank_array[i];
 486          break;
 487       case 'W':
 488           total_keys_log2 = 20;
 489           max_key_log_2 = 16;
 490           num_bucket_log_2 = 10;
 491          gd->test_index_array[i] = W_test_index_array[i];
 492          gd->test_rank_array[i]  = W_test_rank_array[i];
 493          break;
 494       case 'B':
 495           total_keys_log2 = 25;
 496           max_key_log_2 = 21;
 497           num_bucket_log_2 = 10;
 498          gd->test_index_array[i] = B_test_index_array[i];
 499          gd->test_rank_array[i]  = B_test_rank_array[i];
 500          break;
 501       case 'C':
 502           total_keys_log2 = 27;
 503           max_key_log_2 = 23;
 504           num_bucket_log_2 = 10;
 505          gd->test_index_array[i] = C_test_index_array[i];
 506          gd->test_rank_array[i]  = C_test_rank_array[i];
 507          break;
 508       case 'D':
 509           total_keys_log2 = 29;
 510           max_key_log_2 = 27;
 511           num_bucket_log_2 = 10;
 512          min_procs = 4;
 513          gd->test_index_array[i] = D_test_index_array[i];
 514          gd->test_rank_array[i]  = D_test_rank_array[i];
 515          break;
 516     };
 517
 518   total_keys  = (1 << total_keys_log2);
 519   max_key     = (1 << max_key_log_2);
 520   num_buckets = (1 << num_bucket_log_2);
 521   num_keys    = (total_keys/nprocs*min_procs);
 522
 523   /* On larger number of processors, since the keys are (roughly)  gaussian distributed, the first and last processor
 524    * sort keys in a large interval, requiring array sizes to be larger. Note that for large NUM_PROCS, num_keys is,
 525    * however, a small number The required array size also depends on the bucket size used. The following values are
 526    * validated for the 1024-bucket setup. */
 527   if (nprocs < 256)
 528     size_of_buffers = 3*num_keys/2;
 529   else if (nprocs < 512)
 530     size_of_buffers = 5*num_keys/2;
 531   else if (nprocs < 1024)
 532     size_of_buffers = 4*num_keys/2;
 533   else
 534     size_of_buffers = 13*num_keys/2;
 535
 536   gd->key_array = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
 537   gd->key_buff1 = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
 538   gd->key_buff2 = (INT_TYPE*)malloc(size_of_buffers*sizeof(INT_TYPE));
 539   gd->bucket_size = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));     /* Top 5 elements for */
 540   gd->bucket_size_totals = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE)); /* part. ver. vals */
 541   gd->bucket_ptrs = (INT_TYPE*)malloc(num_buckets*sizeof(INT_TYPE));
 542   gd->process_bucket_distrib_ptr1 = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));
 543   gd->process_bucket_distrib_ptr2 = (INT_TYPE*)malloc((num_buckets+TEST_ARRAY_SIZE)*sizeof(INT_TYPE));
 544 //  int      send_count[max_procs], recv_count[max_procs],
 545 //           send_displ[max_procs], recv_displ[max_procs];
 546
 547 /*  Printout initial NPB info */
 548   if( gd->my_rank == 0 ){
 549      printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
 550      printf( " Size:  %ld  (class %c)\n", (long)total_keys*min_procs, class);
 551      printf( " Iterations:   %d\n", MAX_ITERATIONS );
 552      printf( " Number of processes:     %d\n",gd->comm_size );
 553   }
 554
 555 /*  Check that actual and compiled number of processors agree */
 556   if( gd->comm_size != nprocs) {
 557     if( gd->my_rank == 0 )
 558        printf( "\n ERROR: compiled for %d processes\n"
 559                " Number of active processes: %d\n"
 560                " Exiting program!\n\n", nprocs, gd->comm_size );
 561     MPI_Finalize();
 562     exit( 1 );
 563   }
 564
 565 /*  Check to see whether total number of processes is within bounds.
 566     This could in principle be checked in setparams.c, but it is more convenient to do it here */
 567   if( gd->comm_size < min_procs || gd->comm_size > max_procs){
 568     if( gd->my_rank == 0 )
 569       printf( "\n ERROR: number of processes %d not within range %d-%d"
 570               "\n Exiting program!\n\n", gd->comm_size, min_procs, max_procs);
 571     MPI_Finalize();
 572     exit( 1 );
 573   }
 574
 575 /*  Generate random number sequence and subsequent keys on all procs */
 576   create_seq(gd,  find_my_seed( gd->my_rank, gd->comm_size, 4*(long)total_keys*min_procs,
 577              314159265.00,      /* Random number gen seed */
 578              1220703125.00 ),   /* Random number gen mult */
 579              1220703125.00 );   /* Random number gen mult */
 580
 581 /*  Do one interation for free (i.e., untimed) to guarantee initialization of
 582     all data and code pages and respective tables */
 583   rank(gd, 1 );
 584
 585 /*  Start verification counter */
 586   gd->passed_verification = 0;
 587
 588   if( gd->my_rank == 0 && class != 'S' ) printf( "\n   iteration\n" );
 589
 590 /*  Initialize timer  */
 591   timer_clear(0);
 592
 593 /*  Start timer */
 594   timer_start(0);
 595
 596   char smpi_category[100];
 597   snprintf (smpi_category, 100, "%d", gd->my_rank);
 598   TRACE_smpi_set_category (smpi_category);
 599
 600 /*  This is the main iteration */
 601   for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) {
 602     if( gd->my_rank == 0 && class != 'S' ) printf( "        %d\n", iteration );
 603     rank(gd,  iteration );
 604   }
 605   TRACE_smpi_set_category (NULL);
 606
 607 /*  Stop timer, obtain time for processors */
 608   timer_stop(0);
 609
 610   timecounter = timer_read(0);
 611
 612 /*  End of timing, obtain maximum time of all processors */
 613   MPI_Reduce( &timecounter, &maxtime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD );
 614
 615 /*  This tests that keys are in sequence: sorting of last ranked key seq occurs here, but is an untimed operation */
 616   full_verify(gd);
 617
 618 /*  Obtain verification counter sum */
 619   itemp =gd->passed_verification;
 620   MPI_Reduce( &itemp, &gd->passed_verification, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD );
 621
 622 /*  The final printout  */
 623   if( gd->my_rank == 0 ) {
 624     if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
 625       gd->passed_verification = 0;
 626     c_print_results("IS", class, (int)(total_keys), min_procs, 0, MAX_ITERATIONS, nprocs, gd->comm_size, maxtime,
 627                     ((double) (MAX_ITERATIONS)*total_keys*min_procs)/maxtime/1000000., "keys ranked",
 628                     gd->passed_verification);
 629   }
 630
 631   MPI_Finalize();
 632   free(gd);
 633
 634   return 0;
 635 }