1 /*************************************************************************
3 * N A S P A R A L L E L B E N C H M A R K S 3.3 *
7 *************************************************************************
9 * This benchmark is part of the NAS Parallel Benchmark 3.3 suite. *
10 * It is described in NAS Technical Report 95-020. *
12 * Permission to use, copy, distribute and modify this software *
13 * for any purpose with or without fee is hereby granted. We *
14 * request, however, that all derived work reference the NAS *
15 * Parallel Benchmarks 3.3. This software is provided "as is" *
16 * without express or implied warranty. *
18 * Information on NPB 3.3, including the technical report, the *
19 * original specifications, source code, results and information *
20 * on how to submit new results, is available at: *
22 * http://www.nas.nasa.gov/Software/NPB *
24 * Send comments or suggestions to npb@nas.nasa.gov *
25 * Send bug reports to npb-bugs@nas.nasa.gov *
27 * NAS Parallel Benchmarks Group *
28 * NASA Ames Research Center *
30 * Moffett Field, CA 94035-1000 *
32 * E-mail: npb@nas.nasa.gov *
33 * Fax: (650) 604-3957 *
35 *************************************************************************
40 *************************************************************************/
43 #include "npbparams.h"
61 #define TOTAL_KEYS_LOG_2 16
62 #define MAX_KEY_LOG_2 11
63 #define NUM_BUCKETS_LOG_2 9
71 #define TOTAL_KEYS_LOG_2 20
72 #define MAX_KEY_LOG_2 16
73 #define NUM_BUCKETS_LOG_2 10
80 #define TOTAL_KEYS_LOG_2 23
81 #define MAX_KEY_LOG_2 19
82 #define NUM_BUCKETS_LOG_2 10
90 #define TOTAL_KEYS_LOG_2 25
91 #define MAX_KEY_LOG_2 21
92 #define NUM_BUCKETS_LOG_2 10
100 #define TOTAL_KEYS_LOG_2 27
101 #define MAX_KEY_LOG_2 23
102 #define NUM_BUCKETS_LOG_2 10
110 #define TOTAL_KEYS_LOG_2 29
111 #define MAX_KEY_LOG_2 27
112 #define NUM_BUCKETS_LOG_2 10
118 #define TOTAL_KEYS (1 << TOTAL_KEYS_LOG_2)
119 #define MAX_KEY (1 << MAX_KEY_LOG_2)
120 #define NUM_BUCKETS (1 << NUM_BUCKETS_LOG_2)
121 #define NUM_KEYS (TOTAL_KEYS/NUM_PROCS*MIN_PROCS)
123 /*****************************************************************/
124 /* On larger number of processors, since the keys are (roughly) */
125 /* Gaussian distributed, the first and last processor sort keys */
126 /* in a large interval, requiring array sizes to be larger. Note */
127 /* that for large NUM_PROCS, NUM_KEYS is, however, a small number*/
128 /* The required array size also depends on the bucket size used. */
129 /* The following values are validated for the 1024-bucket setup. */
130 /*****************************************************************/
132 #define SIZE_OF_BUFFERS 3*NUM_KEYS/2
133 #elif NUM_PROCS < 512
134 #define SIZE_OF_BUFFERS 5*NUM_KEYS/2
135 #elif NUM_PROCS < 1024
136 #define SIZE_OF_BUFFERS 4*NUM_KEYS
138 #define SIZE_OF_BUFFERS 13*NUM_KEYS/2
141 /*****************************************************************/
142 /* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */
143 /* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE */
144 /* MAX_PROCS AT YOUR PERIL */
145 /*****************************************************************/
147 #define MAX_PROCS 128
149 #define MAX_PROCS 1024
152 #define MAX_ITERATIONS 10
153 #define TEST_ARRAY_SIZE 5
156 /***********************************/
157 /* Enable separate communication, */
158 /* computation timing and printout */
159 /***********************************/
160 /* #define TIMING_ENABLED */
163 /*************************************/
164 /* Typedef: if necessary, change the */
165 /* size of int here by changing the */
166 /* int type to, say, long */
167 /*************************************/
168 typedef int INT_TYPE;
169 typedef long INT_TYPE2;
170 #define MP_KEY_TYPE MPI_INT
175 /********************/
176 /* MPI properties: */
177 /********************/
182 /********************/
183 /* Some global info */
184 /********************/
185 INT_TYPE *key_buff_ptr_global, /* used by full_verify to get */
186 total_local_keys, /* copies of rank info */
190 int passed_verification;
194 /************************************/
195 /* These are the three main arrays. */
196 /* See SIZE_OF_BUFFERS def above */
197 /************************************/
198 INT_TYPE key_array[SIZE_OF_BUFFERS],
199 key_buff1[SIZE_OF_BUFFERS],
200 key_buff2[SIZE_OF_BUFFERS],
201 bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE], /* Top 5 elements for */
202 bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals */
203 bucket_ptrs[NUM_BUCKETS],
204 process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE],
205 process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE];
206 int send_count[MAX_PROCS], recv_count[MAX_PROCS],
207 send_displ[MAX_PROCS], recv_displ[MAX_PROCS];
210 /**********************/
211 /* Partial verif info */
212 /**********************/
213 INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
214 test_rank_array[TEST_ARRAY_SIZE];
219 double start[64], elapsed[64];
225 S_test_index_array[TEST_ARRAY_SIZE] =
226 {48427,17148,23627,62548,4431},
227 S_test_rank_array[TEST_ARRAY_SIZE] =
228 {0,18,346,64917,65463},
230 W_test_index_array[TEST_ARRAY_SIZE] =
231 {357773,934767,875723,898999,404505},
232 W_test_rank_array[TEST_ARRAY_SIZE] =
233 {1249,11698,1039987,1043896,1048018},
235 A_test_index_array[TEST_ARRAY_SIZE] =
236 {2112377,662041,5336171,3642833,4250760},
237 A_test_rank_array[TEST_ARRAY_SIZE] =
238 {104,17523,123928,8288932,8388264},
240 B_test_index_array[TEST_ARRAY_SIZE] =
241 {41869,812306,5102857,18232239,26860214},
242 B_test_rank_array[TEST_ARRAY_SIZE] =
243 {33422937,10244,59149,33135281,99},
245 C_test_index_array[TEST_ARRAY_SIZE] =
246 {44172927,72999161,74326391,129606274,21736814},
247 C_test_rank_array[TEST_ARRAY_SIZE] =
248 {61147,882988,266290,133997595,133525895},
250 D_test_index_array[TEST_ARRAY_SIZE] =
251 {1317351170,995930646,1157283250,1503301535,1453734525},
252 D_test_rank_array[TEST_ARRAY_SIZE] =
253 {1,36538729,1978098519,2145192618,2147425337};
257 /***********************/
258 /* function prototypes */
259 /***********************/
260 double randlc( double *X, double *A );
262 void full_verify( global_data* gd );
264 void c_print_results( char *name,
275 int passed_verification,
285 void timer_clear(global_data* gd, int n );
286 void timer_start(global_data* gd, int n );
287 void timer_stop(global_data* gd, int n );
288 double timer_read(global_data* gd, int n );
/* Zero the accumulated elapsed time for timer slot n. */
290 void timer_clear(global_data* gd, int n ) {
291 gd->elapsed[n] = 0.0;
/* Record the current wall-clock time (MPI_Wtime) as the start of timer slot n. */
294 void timer_start(global_data* gd, int n ) {
295 gd->start[n] = MPI_Wtime();
/* Accumulate the wall-clock time since the matching timer_start into slot n. */
298 void timer_stop(global_data* gd, int n ) {
299 gd->elapsed[n] += MPI_Wtime() - gd->start[n];
/* Return the total elapsed time (seconds) accumulated in timer slot n. */
302 double timer_read(global_data* gd, int n ) {
303 return gd->elapsed[n];
308 * FUNCTION RANDLC (X, A)
310 * This routine returns a uniform pseudorandom double precision number in the
311 * range (0, 1) by using the linear congruential generator
313 * x_{k+1} = a x_k (mod 2^46)
315 * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
316 * before repeating. The argument A is the same as 'a' in the above formula,
317 * and X is the same as x_0. A and X must be odd double precision integers
318 * in the range (1, 2^46). The returned value RANDLC is normalized to be
319 * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
320 * the new seed x_1, so that subsequent calls to RANDLC using the same
321 * arguments will generate a continuous sequence.
323 * This routine should produce the same results on any computer with at least
324 * 48 mantissa bits in double precision floating point data. On Cray systems,
325 * double precision should be disabled.
327 * David H. Bailey October 26, 1990
329 * IMPLICIT DOUBLE PRECISION (A-H, O-Z)
330 * SAVE KS, R23, R46, T23, T46
333 * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
334 * T23 = 2 ^ 23, and T46 = 2 ^ 46. These are computed in loops, rather than
335 * by merely using the ** operator, in order to ensure that the results are
336 * exact on all systems. This code assumes that 0.5D0 is represented exactly.
340 /*****************************************************************/
341 /************* R A N D L C ************/
342 /************* ************/
343 /************* portable random number generator ************/
344 /*****************************************************************/
/* Portable linear congruential generator: x_{k+1} = a*x_k mod 2^46,
   returning 2^-46 * x_{k+1} in (0,1) and updating *X to the new seed
   (see the block comment above).
   NOTE(review): this listing omits interior lines of the routine
   (original line numbers jump); code below is kept verbatim. */
346 double randlc( double *X, double *A )
/* R23=2^-23, R46=2^-46, T23=2^23, T46=2^46 — computed once on first call
   (static) via the loops below so the values are exact on all systems. */
349 static double R23, R46, T23, T46;
350 double T1, T2, T3, T4;
365 for (i=1; i<=23; i++)
370 for (i=1; i<=46; i++)
378 /* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */
385 /* Break X into two parts such that X = 2^23 * X1 + X2, compute
386 Z = A1 * X2 + A2 * X1 (mod 2^23), and then
387 X = 2^23 * Z + A2 * X2 (mod 2^46). */
393 T1 = A1 * X2 + A2 * X1;
398 T3 = T23 * Z + A2 * X2;
407 /*****************************************************************/
408 /************ F I N D _ M Y _ S E E D ************/
409 /************ ************/
410 /************ returns parallel random number seq seed ************/
411 /*****************************************************************/
414 * Create a random number sequence of total length nn residing
415 * on np number of processors. Each processor will therefore have a
416 * subsequence of length nn/np. This routine returns that random
417 * number which is the first random number for the subsequence belonging
418 * to processor rank kn, and which is used as seed for proc kn ran # gen.
/* Return the seed of the first random number of processor kn's
   subsequence (length nn/np) of the global sequence of nn numbers,
   by advancing the generator with repeated squaring (see block
   comment above).
   NOTE(review): interior lines of this routine are elided in this
   listing; visible code kept verbatim. */
421 double find_my_seed( int kn, /* my processor rank, 0<=kn<=num procs */
422 int np, /* np = num procs */
423 long nn, /* total num of ran numbers, all procs */
424 double s, /* Ran num seed, for ex.: 314159265.00 */
425 double a ) /* Ran num gen mult, try 1220703125.00 */
/* mq = number of doublings needed; presumably nq starts at this rank's
   offset into the sequence — TODO confirm against full source. */
437 for( mq=0; nq>1; mq++,nq/=2 )
/* Square the multiplier mq times: t1 <- t1^2 mod 2^46 each step. */
442 for( i=1; i<=mq; i++ )
443 t2 = randlc( &t1, &t1 );
451 for( i=1; i<=100; i++ )
455 t3 = randlc( &t1, &t2 );
458 t3 = randlc( &t2, &t2 );
469 /*****************************************************************/
470 /************* C R E A T E _ S E Q ************/
471 /*****************************************************************/
/* Fill gd->key_array with NUM_KEYS pseudorandom keys: each key is the
   sum of four consecutive randlc draws scaled by k.
   NOTE(review): the declarations of x, i and k are elided in this
   listing — presumably k = MAX_KEY/4 as in the reference IS code;
   TODO confirm. */
473 void create_seq( global_data* gd, double seed, double a )
480 for (i=0; i<NUM_KEYS; i++)
482 x = randlc(&seed, &a);
483 x += randlc(&seed, &a);
484 x += randlc(&seed, &a);
485 x += randlc(&seed, &a);
487 gd->key_array[i] = k*x;
494 /*****************************************************************/
495 /************* F U L L _ V E R I F Y ************/
496 /*****************************************************************/
/* Untimed final verification: place each key at its ranked position
   (using key_buff_ptr_global computed by rank()), exchange boundary
   keys with neighboring ranks, and count any out-of-order keys.
   Increments gd->passed_verification on success.
   NOTE(review): interior lines (the matching MPI_Irecv, braces,
   error counter) are elided in this listing; visible code verbatim. */
499 void full_verify( global_data* gd )
505 INT_TYPE k, last_local_key;
508 /* Now, finally, sort the keys: */
509 for( i=0; i<gd->total_local_keys; i++ )
510 gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]-
511 gd->total_lesser_keys] = gd->key_buff2[i];
512 last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);
514 /* Send largest key value to next processor */
515 if( gd->my_rank > 0 )
523 if( gd->my_rank < gd->comm_size-1 )
524 MPI_Send( &gd->key_array[last_local_key],
530 if( gd->my_rank > 0 )
531 MPI_Wait( &request, &status );
533 /* Confirm that neighbor's greatest key value
534 is not greater than my least key value */
536 if( gd->my_rank > 0 && gd->total_local_keys > 0 )
537 if( k > gd->key_array[0] )
541 /* Confirm keys correctly sorted: count incorrectly sorted keys, if any */
542 for( i=1; i<gd->total_local_keys; i++ )
543 if( gd->key_array[i-1] > gd->key_array[i] )
549 printf( "Processor %d: Full_verify: number of keys out of sort: %d\n",
553 gd->passed_verification++;
561 /*****************************************************************/
562 /************* R A N K ****************/
563 /*****************************************************************/
/* One timed ranking iteration: bucket the local keys, globally sum
   bucket sizes, redistribute keys across processors bucket-by-bucket
   (MPI_Alltoall/MPI_Alltoallv), then rank the received keys and run
   the partial verification against the class-specific test arrays.
   NOTE(review): this listing omits interior lines (original line
   numbers jump — e.g. #else/#endif branches, MPI argument lists);
   visible code is kept verbatim. */
566 void rank( global_data* gd, int iteration )
571 INT_TYPE shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
573 INT_TYPE2 bucket_sum_accumulator, j, m;
574 INT_TYPE local_bucket_sum_accumulator;
575 INT_TYPE min_key_val, max_key_val;
576 INT_TYPE *key_buff_ptr;
581 /* Iteration alteration of keys */
582 if(gd->my_rank == 0 )
584 gd->key_array[iteration] = iteration;
585 gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
/* Reset all bucket counters, including the TEST_ARRAY_SIZE extra
   slots used to smuggle the partial-verify keys through Allreduce. */
590 for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )
592 gd->bucket_size[i] = 0;
593 gd->bucket_size_totals[i] = 0;
594 gd->process_bucket_distrib_ptr1[i] = 0;
595 gd->process_bucket_distrib_ptr2[i] = 0;
599 /* Determine where the partial verify test keys are, load into */
600 /* top of array bucket_size */
601 for( i=0; i<TEST_ARRAY_SIZE; i++ )
602 if( (gd->test_index_array[i]/NUM_KEYS) == gd->my_rank )
603 gd->bucket_size[NUM_BUCKETS+i] =
604 gd->key_array[gd->test_index_array[i] % NUM_KEYS];
607 /* Determine the number of keys in each bucket */
608 for( i=0; i<NUM_KEYS; i++ )
609 gd->bucket_size[gd->key_array[i] >> shift]++;
612 /* Accumulative bucket sizes are the bucket pointers */
613 gd->bucket_ptrs[0] = 0;
614 for( i=1; i< NUM_BUCKETS; i++ )
615 gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];
618 /* Sort into appropriate bucket */
619 for( i=0; i<NUM_KEYS; i++ )
621 key = gd->key_array[i];
622 gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
625 #ifdef TIMING_ENABLED
630 /* Get the bucket size totals for the entire problem. These
631 will be used to determine the redistribution of keys */
632 MPI_Allreduce( gd->bucket_size,
633 gd->bucket_size_totals,
634 NUM_BUCKETS+TEST_ARRAY_SIZE,
639 #ifdef TIMING_ENABLED
644 /* Determine Redistribution of keys: accumulate the bucket size totals
645 till this number surpasses NUM_KEYS (which is the average number of keys
646 per processor). Then all keys in these buckets go to processor 0.
647 Continue accumulating again until surpassing 2*NUM_KEYS. All keys
648 in these buckets go to processor 1, etc. This algorithm guarantees
649 that all processors have work ranking; no processors are left idle.
650 The optimum number of buckets, however, does not result in as high
651 a degree of load balancing (as even a distribution of keys as is
652 possible) as is obtained from increasing the number of buckets, but
653 more buckets results in more computation per processor so that the
654 optimum number of buckets turns out to be 1024 for machines tested.
655 Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket
656 number of first and last bucket which each processor will have after
657 the redistribution is done. */
659 bucket_sum_accumulator = 0;
660 local_bucket_sum_accumulator = 0;
661 gd->send_displ[0] = 0;
662 gd->process_bucket_distrib_ptr1[0] = 0;
663 for( i=0, j=0; i<NUM_BUCKETS; i++ )
665 bucket_sum_accumulator += gd->bucket_size_totals[i];
666 local_bucket_sum_accumulator += gd->bucket_size[i];
667 if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )
669 gd->send_count[j] = local_bucket_sum_accumulator;
672 gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
673 gd->process_bucket_distrib_ptr1[j] =
674 gd->process_bucket_distrib_ptr2[j-1]+1;
676 gd->process_bucket_distrib_ptr2[j++] = i;
677 local_bucket_sum_accumulator = 0;
681 /* When NUM_PROCS approaching NUM_BUCKETS, it is highly possible
682 that the last few processors don't get any buckets. So, we
683 need to set counts properly in this case to avoid any fallouts. */
684 while( j < gd->comm_size )
686 gd->send_count[j] = 0;
687 gd->process_bucket_distrib_ptr1[j] = 1;
691 #ifdef TIMING_ENABLED
696 /* This is the redistribution section: first find out how many keys
697 each processor will send to every other processor: */
698 MPI_Alltoall( gd->send_count,
707 /* Determine the receive array displacements for the buckets */
708 gd->recv_displ[0] = 0;
709 for( i=1; i<gd->comm_size; i++ )
710 gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];
714 /* Now send the keys to respective processors */
715 MPI_Alltoallv( gd->key_buff1,
725 #ifdef TIMING_ENABLED
730 /* The starting and ending bucket numbers on each processor are
731 multiplied by the interval size of the buckets to obtain the
732 smallest possible min and greatest possible max value of any
733 key on each processor */
734 min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
735 max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;
737 /* Clear the work array */
738 for( i=0; i<max_key_val-min_key_val+1; i++ )
739 gd->key_buff1[i] = 0;
741 /* Determine the total number of keys on all other
742 processors holding keys of lesser value */
744 for( k=0; k<gd->my_rank; k++ )
745 for( i= gd->process_bucket_distrib_ptr1[k];
746 i<=gd->process_bucket_distrib_ptr2[k];
748 m += gd->bucket_size_totals[i]; /* m has total # of lesser keys */
750 /* Determine total number of keys on this processor */
752 for( i= gd->process_bucket_distrib_ptr1[gd->my_rank];
753 i<=gd->process_bucket_distrib_ptr2[gd->my_rank];
755 j += gd->bucket_size_totals[i]; /* j has total # of local keys */
758 /* Ranking of all keys occurs in this section: */
759 /* shift it backwards so no subtractions are necessary in loop */
760 key_buff_ptr = gd->key_buff1 - min_key_val;
762 /* In this section, the keys themselves are used as their
763 own indexes to determine how many of each there are: their
764 individual population */
766 key_buff_ptr[gd->key_buff2[i]]++; /* Now they have individual key */
769 /* To obtain ranks of each key, successively add the individual key
770 population, not forgetting the total of lesser keys, m.
771 NOTE: Since the total of lesser keys would be subtracted later
772 in verification, it is no longer added to the first key population
773 here, but still needed during the partial verify test. This is to
774 ensure that 32-bit key_buff can still be used for class D. */
775 /* key_buff_ptr[min_key_val] += m; */
776 for( i=min_key_val; i<max_key_val; i++ )
777 key_buff_ptr[i+1] += key_buff_ptr[i];
780 /* This is the partial verify test section */
781 /* Observe that test_rank_array vals are */
782 /* shifted differently for different cases */
783 for( i=0; i<TEST_ARRAY_SIZE; i++ )
785 k = gd->bucket_size_totals[i+NUM_BUCKETS]; /* Keys were hidden here */
786 if( min_key_val <= k && k <= max_key_val )
788 /* Add the total of lesser keys, m, here */
789 INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
/* NOTE(review): the per-class switch/case lines surrounding the
   checks below are elided in this listing; each visible check
   compares the computed rank against the class's expected rank,
   shifted by the iteration number. */
797 if( key_rank != gd->test_rank_array[i]+iteration )
800 gd->passed_verification++;
804 if( key_rank != gd->test_rank_array[i]-iteration )
807 gd->passed_verification++;
813 if( key_rank != gd->test_rank_array[i]+(iteration-2) )
816 gd->passed_verification++;
820 if( key_rank != gd->test_rank_array[i]-iteration )
823 gd->passed_verification++;
829 if( key_rank != gd->test_rank_array[i]+(iteration-1) )
832 gd->passed_verification++;
836 if( key_rank != gd->test_rank_array[i]-(iteration-1) )
839 gd->passed_verification++;
843 if( i == 1 || i == 2 || i == 4 )
845 if( key_rank != gd->test_rank_array[i]+iteration )
848 gd->passed_verification++;
852 if( key_rank != gd->test_rank_array[i]-iteration )
855 gd->passed_verification++;
861 if( key_rank != gd->test_rank_array[i]+iteration )
864 gd->passed_verification++;
868 if( key_rank != gd->test_rank_array[i]-iteration )
871 gd->passed_verification++;
877 if( key_rank != gd->test_rank_array[i]+iteration )
880 gd->passed_verification++;
884 if( key_rank != gd->test_rank_array[i]-iteration )
887 gd->passed_verification++;
892 printf( "Failed partial verification: "
893 "iteration %d, processor %d, test key %d\n",
894 iteration, gd->my_rank, (int)i );
901 /* Make copies of rank info for use by full_verify: these variables
902 in rank are local; making them global slows down the code, probably
903 since they cannot be made register by compiler */
905 if( iteration == MAX_ITERATIONS )
907 gd->key_buff_ptr_global = key_buff_ptr;
908 gd->total_local_keys = j;
909 gd->total_lesser_keys = 0; /* no longer set to 'm', see note above */
915 /*****************************************************************/
916 /************* M A I N ****************/
917 /*****************************************************************/
/* Benchmark driver: initialize MPI, select the class-specific
   verification arrays, generate the key sequence, run one untimed
   warm-up plus MAX_ITERATIONS timed rank() iterations, run
   full_verify, reduce timing/verification results, and print the
   standard NPB report.
   NOTE(review): interior lines (class #if branches, MPI_Reduce
   argument lists, exit paths) are elided in this listing; visible
   code is kept verbatim. */
919 int main( int argc, char **argv )
922 int i, iteration, itemp;
924 double timecounter, maxtime;
/* NOTE(review): malloc result is not checked here in the visible code. */
926 global_data* gd = malloc(sizeof(global_data));
928 MPI_Init( &argc, &argv );
929 MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
930 MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );
932 /* Initialize the verification arrays if a valid class */
933 for( i=0; i<TEST_ARRAY_SIZE; i++ )
937 gd->test_index_array[i] = S_test_index_array[i];
938 gd->test_rank_array[i] = S_test_rank_array[i];
941 gd->test_index_array[i] = A_test_index_array[i];
942 gd->test_rank_array[i] = A_test_rank_array[i];
945 gd->test_index_array[i] = W_test_index_array[i];
946 gd->test_rank_array[i] = W_test_rank_array[i];
949 gd->test_index_array[i] = B_test_index_array[i];
950 gd->test_rank_array[i] = B_test_rank_array[i];
953 gd->test_index_array[i] = C_test_index_array[i];
954 gd->test_rank_array[i] = C_test_rank_array[i];
957 gd->test_index_array[i] = D_test_index_array[i];
958 gd->test_rank_array[i] = D_test_rank_array[i];
964 /* Printout initial NPB info */
965 if( gd->my_rank == 0 )
967 printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
968 printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS );
969 printf( " Iterations: %d\n", MAX_ITERATIONS );
970 printf( " Number of processes: %d\n",gd->comm_size );
973 /* Check that actual and compiled number of processors agree */
974 if( gd->comm_size != NUM_PROCS )
976 if( gd->my_rank == 0 )
977 printf( "\n ERROR: compiled for %d processes\n"
978 " Number of active processes: %d\n"
979 " Exiting program!\n\n", NUM_PROCS, gd->comm_size );
984 /* Check to see whether total number of processes is within bounds.
985 This could in principle be checked in setparams.c, but it is more
986 convenient to do it here */
987 if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS)
989 if( gd->my_rank == 0 )
990 printf( "\n ERROR: number of processes %d not within range %d-%d"
991 "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS);
997 /* Generate random number sequence and subsequent keys on all procs */
998 create_seq(gd, find_my_seed( gd->my_rank,
1000 4*(long)TOTAL_KEYS*MIN_PROCS,
1001 314159265.00, /* Random number gen seed */
1002 1220703125.00 ), /* Random number gen mult */
1003 1220703125.00 ); /* Random number gen mult */
1005 /* Do one iteration for free (i.e., untimed) to guarantee initialization of
1006 all data and code pages and respective tables */
1009 /* Start verification counter */
1010 gd->passed_verification = 0;
1012 if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n iteration\n" );
1014 /* Initialize timer */
1015 timer_clear(gd, 0 );
1017 /* Initialize separate communication, computation timing */
1018 #ifdef TIMING_ENABLED
1019 for( i=1; i<=3; i++ ) timer_clear(gd, i );
1023 timer_start(gd, 0 );
1025 #ifdef TIMING_ENABLED
1026 timer_start(gd, 1 );
1027 timer_start(gd, 2 );
1030 /* This is the main iteration */
1031 for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
1033 if( gd->my_rank == 0 && CLASS != 'S' ) printf( " %d\n", iteration );
1034 rank(gd, iteration );
1038 #ifdef TIMING_ENABLED
1043 /* Stop timer, obtain time for processors */
1046 timecounter = timer_read(gd, 0 );
1048 /* End of timing, obtain maximum time of all processors */
1049 MPI_Reduce( &timecounter,
1057 #ifdef TIMING_ENABLED
1059 double tmin, tsum, tmax;
1063 printf( "\ntimer 1/2/3 = total/computation/communication time\n");
1064 printf( " min avg max\n" );
1066 for( i=1; i<=3; i++ )
1068 timecounter = timer_read(gd, i );
1069 MPI_Reduce( &timecounter,
1076 MPI_Reduce( &timecounter,
1083 MPI_Reduce( &timecounter,
1091 printf( "timer %d: %f %f %f\n",
1092 i, tmin, tsum/((double) comm_size), tmax );
1099 /* This tests that keys are in sequence: sorting of last ranked key seq
1100 occurs here, but is an untimed operation */
1104 /* Obtain verification counter sum */
1105 itemp =gd->passed_verification;
1107 &gd->passed_verification,
1116 /* The final printout */
1117 if( gd->my_rank == 0 )
/* Expected count: 5 partial-verify tests per timed iteration plus one
   full_verify success per process. */
1119 if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
1120 gd->passed_verification = 0;
1121 c_print_results( "IS",
1130 ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS)
1133 gd->passed_verification,
1148 /**************************/
1149 } /* E N D P R O G R A M */
1150 /**************************/