/*************************************************************************
 *                                                                       *
 *      N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         *
 *                                                                       *
 *************************************************************************
 *                                                                       *
 *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.    *
 *   It is described in NAS Technical Report 95-020.                    *
 *                                                                       *
 *   Permission to use, copy, distribute and modify this software       *
 *   for any purpose with or without fee is hereby granted.  We         *
 *   request, however, that all derived work reference the NAS          *
 *   Parallel Benchmarks 3.3. This software is provided "as is"         *
 *   without express or implied warranty.                               *
 *                                                                       *
 *   Information on NPB 3.3, including the technical report, the        *
 *   original specifications, source code, results and information      *
 *   on how to submit new results, is available at:                     *
 *                                                                       *
 *          http://www.nas.nasa.gov/Software/NPB                        *
 *                                                                       *
 *   Send comments or suggestions to  npb@nas.nasa.gov                  *
 *   Send bug reports to              npb-bugs@nas.nasa.gov             *
 *                                                                       *
 *         NAS Parallel Benchmarks Group                                *
 *         NASA Ames Research Center                                    *
 *         Moffett Field, CA  94035-1000                                *
 *                                                                       *
 *         E-mail:  npb@nas.nasa.gov                                    *
 *         Fax:     (650) 604-3957                                      *
 *                                                                       *
 *************************************************************************/
#include "mpi.h"
#include "npbparams.h"
#include <stdlib.h>
#include <stdio.h>
/*  CLASS S  */
#if CLASS == 'S'
#define  TOTAL_KEYS_LOG_2    16
#define  MAX_KEY_LOG_2       11
#define  NUM_BUCKETS_LOG_2    9
#define  MIN_PROCS            1

/*  CLASS W  */
#elif CLASS == 'W'
#define  TOTAL_KEYS_LOG_2    20
#define  MAX_KEY_LOG_2       16
#define  NUM_BUCKETS_LOG_2   10
#define  MIN_PROCS            1

/*  CLASS A  */
#elif CLASS == 'A'
#define  TOTAL_KEYS_LOG_2    23
#define  MAX_KEY_LOG_2       19
#define  NUM_BUCKETS_LOG_2   10
#define  MIN_PROCS            1

/*  CLASS B  */
#elif CLASS == 'B'
#define  TOTAL_KEYS_LOG_2    25
#define  MAX_KEY_LOG_2       21
#define  NUM_BUCKETS_LOG_2   10
#define  MIN_PROCS            1

/*  CLASS C  */
#elif CLASS == 'C'
#define  TOTAL_KEYS_LOG_2    27
#define  MAX_KEY_LOG_2       23
#define  NUM_BUCKETS_LOG_2   10
#define  MIN_PROCS            1

/*  CLASS D  */
#elif CLASS == 'D'
#define  TOTAL_KEYS_LOG_2    29
#define  MAX_KEY_LOG_2       27
#define  NUM_BUCKETS_LOG_2   10
#define  MIN_PROCS            4
#endif
#define  TOTAL_KEYS          (1 << TOTAL_KEYS_LOG_2)
#define  MAX_KEY             (1 << MAX_KEY_LOG_2)
#define  NUM_BUCKETS         (1 << NUM_BUCKETS_LOG_2)
#define  NUM_KEYS            (TOTAL_KEYS/NUM_PROCS*MIN_PROCS)
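
/* Note (illustrative): the full problem holds TOTAL_KEYS*MIN_PROCS keys
   (this is the "Size" printed in main), so each of the NUM_PROCS
   processes generates NUM_KEYS = TOTAL_KEYS/NUM_PROCS*MIN_PROCS of them.
   E.g. for class A (2^23 keys, MIN_PROCS = 1) on 8 processes, each
   process owns 2^20 = 1048576 keys. */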
/******************************************************************/
/* On larger numbers of processors, since the keys are (roughly)  */
/* Gaussian distributed, the first and last processors sort keys  */
/* in a large interval, requiring the array sizes to be larger.   */
/* Note that for large NUM_PROCS, NUM_KEYS is nevertheless small. */
/* The required array size also depends on the bucket size used.  */
/* The following values are validated for the 1024-bucket setup.  */
/******************************************************************/
#if NUM_PROCS < 256
#define  SIZE_OF_BUFFERS     3*NUM_KEYS/2
#elif NUM_PROCS < 512
#define  SIZE_OF_BUFFERS     5*NUM_KEYS/2
#elif NUM_PROCS < 1024
#define  SIZE_OF_BUFFERS     4*NUM_KEYS
#else
#define  SIZE_OF_BUFFERS     13*NUM_KEYS/2
#endif
/*****************************************************************/
/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */
/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE     */
/* MAX_PROCS AT YOUR PERIL                                       */
/*****************************************************************/
#if CLASS == 'S'
#define  MAX_PROCS            128
#else
#define  MAX_PROCS           1024
#endif
#define  MAX_ITERATIONS      10
#define  TEST_ARRAY_SIZE      5
/***********************************/
/* Enable separate communication,  */
/* computation timing and printout */
/***********************************/
/* #define  TIMING_ENABLED         */
/*************************************/
/* Typedef: if necessary, change the */
/* size of int here by changing the  */
/* int type to, say, long            */
/*************************************/
typedef  int  INT_TYPE;
typedef  long INT_TYPE2;
#define  MP_KEY_TYPE MPI_INT
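
/* Note (illustrative): INT_TYPE2 is kept distinct from INT_TYPE because
   global key ranks can approach 2^31 for class D (see the
   D_test_rank_array values below), while the per-bucket counts handled
   through INT_TYPE stay well within 32 bits; see also the note ahead of
   the ranking loop in rank(). */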
/********************/
/* MPI properties:  */
/********************/
typedef struct {
    int my_rank,
        comm_size;
/********************/
/* Some global info */
/********************/
    INT_TYPE *key_buff_ptr_global,        /* used by full_verify to get */
             total_local_keys,            /* copies of rank info        */
             total_lesser_keys;
    int      passed_verification;
/************************************/
/* These are the three main arrays. */
/* See SIZE_OF_BUFFERS def above    */
/************************************/
    INT_TYPE key_array[SIZE_OF_BUFFERS],
             key_buff1[SIZE_OF_BUFFERS],
             key_buff2[SIZE_OF_BUFFERS],
             bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE],        /* Top 5 elements for */
             bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. verif. vals  */
             bucket_ptrs[NUM_BUCKETS],
             process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE],
             process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE];
    int      send_count[MAX_PROCS], recv_count[MAX_PROCS],
             send_displ[MAX_PROCS], recv_displ[MAX_PROCS];
/**********************/
/* Partial verif info */
/**********************/
    INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
              test_rank_array[TEST_ARRAY_SIZE];
    double start[64], elapsed[64];
} global_data;
INT_TYPE2 S_test_index_array[TEST_ARRAY_SIZE] =
                             {48427,17148,23627,62548,4431},
          S_test_rank_array[TEST_ARRAY_SIZE] =
                             {0,18,346,64917,65463},

          W_test_index_array[TEST_ARRAY_SIZE] =
                             {357773,934767,875723,898999,404505},
          W_test_rank_array[TEST_ARRAY_SIZE] =
                             {1249,11698,1039987,1043896,1048018},

          A_test_index_array[TEST_ARRAY_SIZE] =
                             {2112377,662041,5336171,3642833,4250760},
          A_test_rank_array[TEST_ARRAY_SIZE] =
                             {104,17523,123928,8288932,8388264},

          B_test_index_array[TEST_ARRAY_SIZE] =
                             {41869,812306,5102857,18232239,26860214},
          B_test_rank_array[TEST_ARRAY_SIZE] =
                             {33422937,10244,59149,33135281,99},

          C_test_index_array[TEST_ARRAY_SIZE] =
                             {44172927,72999161,74326391,129606274,21736814},
          C_test_rank_array[TEST_ARRAY_SIZE] =
                             {61147,882988,266290,133997595,133525895},

          D_test_index_array[TEST_ARRAY_SIZE] =
                             {1317351170,995930646,1157283250,1503301535,1453734525},
          D_test_rank_array[TEST_ARRAY_SIZE] =
                             {1,36538729,1978098519,2145192618,2147425337};
/***********************/
/* function prototypes */
/***********************/
double randlc( double *X, double *A );

void full_verify( global_data* gd );

void c_print_results( char *name, char class, int n1, int n2, int n3,
                      int niter, int nprocs_compiled, int nprocs_total,
                      double t, double mops, char *optype,
                      int passed_verification, char *npbversion,
                      char *compiletime, char *mpicc, char *clink,
                      char *cmpi_lib, char *cmpi_inc, char *cflags,
                      char *clinkflags );
void   timer_clear(global_data* gd, int n );
void   timer_start(global_data* gd, int n );
void   timer_stop(global_data* gd, int n );
double timer_read(global_data* gd, int n );
void timer_clear(global_data* gd, int n ) {
    gd->elapsed[n] = 0.0;
}

void timer_start(global_data* gd, int n ) {
    gd->start[n] = MPI_Wtime();
}

void timer_stop(global_data* gd, int n ) {
    gd->elapsed[n] += MPI_Wtime() - gd->start[n];
}

double timer_read(global_data* gd, int n ) {
    return gd->elapsed[n];
}
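
/* Usage sketch (illustrative): elapsed[n] accumulates across start/stop
   pairs, so a timer must be cleared before its first use:

       timer_clear(gd, 0);
       timer_start(gd, 0);
       ...  timed work  ...
       timer_stop(gd, 0);
       printf("%f seconds\n", timer_read(gd, 0));
*/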
/*
 *    FUNCTION RANDLC (X, A)
 *
 *  This routine returns a uniform pseudorandom double precision number in the
 *  range (0, 1) by using the linear congruential generator
 *
 *  x_{k+1} = a x_k  (mod 2^46)
 *
 *  where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
 *  before repeating. The argument A is the same as 'a' in the above formula,
 *  and X is the same as x_0.  A and X must be odd double precision integers
 *  in the range (1, 2^46). The returned value RANDLC is normalized to be
 *  between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
 *  the new seed x_1, so that subsequent calls to RANDLC using the same
 *  arguments will generate a continuous sequence.
 *
 *  This routine should produce the same results on any computer with at least
 *  48 mantissa bits in double precision floating point data.  On Cray systems,
 *  double precision should be disabled.
 *
 *  David H. Bailey     October 26, 1990
 *
 *     IMPLICIT DOUBLE PRECISION (A-H, O-Z)
 *     SAVE KS, R23, R46, T23, T46
 *
 *  If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
 *  T23 = 2 ^ 23, and T46 = 2 ^ 46.  These are computed in loops, rather than
 *  by merely using the ** operator, in order to ensure that the results are
 *  exact on all systems.  This code assumes that 0.5D0 is represented exactly.
 */
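
/* Worked equation (follows from the splitting done below): with
   A = 2^23*A1 + A2 and X = 2^23*X1 + X2,

       A*X = 2^46*A1*X1 + 2^23*(A1*X2 + A2*X1) + A2*X2,

   and the first term vanishes mod 2^46, so only
   Z = A1*X2 + A2*X1 (mod 2^23) and A2*X2 contribute. Every partial
   product stays below 2^46 and so fits exactly in the 48+ mantissa
   bits assumed above. */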
/*****************************************************************/
/*************           R  A  N  D  L  C             ************/
/*************                                        ************/
/*************    portable random number generator    ************/
/*****************************************************************/
double randlc( double *X, double *A )
{
    static int    KS = 0;
    static double R23, R46, T23, T46;
    double        T1, T2, T3, T4, A1, A2, X1, X2, Z;
    int           i, j;

    if (KS == 0)    /* first call: form the exact powers of two in loops */
    {
        R23 = R46 = T23 = T46 = 1.0;
        for (i=1; i<=23; i++) { R23 = 0.50 * R23;  T23 = 2.0 * T23; }
        for (i=1; i<=46; i++) { R46 = 0.50 * R46;  T46 = 2.0 * T46; }
        KS = 1;
    }

/*  Break A into two parts such that A = 2^23 * A1 + A2. */
    T1 = R23 * *A;   j = T1;   A1 = j;   A2 = *A - T23 * A1;

/*  Break X into two parts such that X = 2^23 * X1 + X2, compute
    Z = A1 * X2 + A2 * X1  (mod 2^23), and then
    X = 2^23 * Z + A2 * X2  (mod 2^46).                           */
    T1 = R23 * *X;   j = T1;   X1 = j;   X2 = *X - T23 * X1;
    T1 = A1 * X2 + A2 * X1;
    j  = R23 * T1;   T2 = j;   Z = T1 - T23 * T2;
    T3 = T23 * Z + A2 * X2;
    j  = R46 * T3;   T4 = j;
    *X = T3 - T46 * T4;
    return (R46 * *X);
}
/*****************************************************************/
/************   F  I  N  D  _  M  Y  _  S  E  E  D    ************/
/************                                         ************/
/************ returns parallel random number seq seed ************/
/*****************************************************************/
/*
 * Create a random number sequence of total length nn residing on np
 * processors.  Each processor therefore owns a subsequence of length
 * nn/np.  This routine returns the first random number of the
 * subsequence belonging to processor rank kn, which serves as the seed
 * for that processor's random number generator.
 */
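
/* Note (illustrative): processor kn's first random number is number
   (nn/np)*kn of the global sequence, and since x_{k+1} = a*x_k (mod 2^46),
   skipping ahead multiplies the seed by a^((nn/np)*kn) (mod 2^46).  The
   body below first squares a repeatedly to form an = a^(2^mq), with
   2^mq >= nn/np (exact here, nn/np being a power of two), then applies
   square-and-multiply over the bits of kn, so finding the seed costs
   O(log nn) randlc calls instead of (nn/np)*kn of them. */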
double find_my_seed( int  kn,        /* my processor rank, 0<=kn<=num procs */
                     int  np,        /* np = num procs                      */
                     long nn,        /* total num of ran numbers, all procs */
                     double s,       /* Ran num seed, for ex.: 314159265.00 */
                     double a )      /* Ran num gen mult, try 1220703125.00 */
{
    long   i, mq, nq, kk, ik;
    double t1, t2, t3, an;

    nq = nn / np;
    for( mq=0; nq>1; mq++,nq/=2 )
        ;                               /* 2^mq >= nn/np */

    t1 = a;
    for( i=1; i<=mq; i++ )
        t2 = randlc( &t1, &t1 );        /* square: t1 = a^(2^mq)  */
    an = t1;

    kk = kn;  t1 = s;  t2 = an;
    for( i=1; i<=100; i++ )             /* square-and-multiply over kk */
    {
        ik = kk / 2;
        if( 2 * ik != kk )
            t3 = randlc( &t1, &t2 );    /* odd bit: advance the seed   */
        if( ik == 0 )
            break;
        t3 = randlc( &t2, &t2 );        /* square the jump multiplier  */
        kk = ik;
    }

    return( t1 );
}
/*****************************************************************/
/*************      C  R  E  A  T  E  _  S  E  Q      ************/
/*****************************************************************/
void create_seq( global_data* gd, double seed, double a )
{
    double   x;
    INT_TYPE i, k;

    k = MAX_KEY/4;

    for (i=0; i<NUM_KEYS; i++)
    {
        x  = randlc(&seed, &a);
        x += randlc(&seed, &a);
        x += randlc(&seed, &a);
        x += randlc(&seed, &a);

        gd->key_array[i] = k*x;
    }
}
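
/* Note (illustrative): each key is the sum of four uniform (0,1) draws
   scaled by k = MAX_KEY/4, so the keys follow an Irwin-Hall
   (approximately Gaussian) distribution centered at MAX_KEY/2.  This is
   why the buffer-size comment near the top calls the key distribution
   (roughly) Gaussian and why the end processors cover wider key
   intervals. */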
/*****************************************************************/
/*************    F  U  L  L  _  V  E  R  I  F  Y     ************/
/*****************************************************************/
void full_verify( global_data* gd )
{
    MPI_Status  status;
    MPI_Request request;

    INT_TYPE    i, j;
    INT_TYPE    k, last_local_key;

/*  Now, finally, sort the keys:  */
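/*  Note (illustrative): key_buff_ptr_global[v] arrives from rank() as
    the cumulative count of keys <= v, i.e. one past the last slot for
    value v.  Pre-decrementing it yields v's next free slot (counting
    sort's scatter phase), and subtracting total_lesser_keys would turn
    the global position into a local index; in this version that field
    is always 0 and the global offset m is handled in partial verify. */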
    for( i=0; i<gd->total_local_keys; i++ )
        gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]-
                                 gd->total_lesser_keys] = gd->key_buff2[i];
    last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);
/*  Send largest key value to next processor  */
    if( gd->my_rank > 0 )
        MPI_Irecv( &k, 1, MP_KEY_TYPE, gd->my_rank-1, 1000,
                   MPI_COMM_WORLD, &request );
    if( gd->my_rank < gd->comm_size-1 )
        MPI_Send( &gd->key_array[last_local_key], 1, MP_KEY_TYPE,
                  gd->my_rank+1, 1000, MPI_COMM_WORLD );
    if( gd->my_rank > 0 )
        MPI_Wait( &request, &status );
/*  Confirm that the neighbor's greatest key value
    is not greater than my least key value          */
    j = 0;
    if( gd->my_rank > 0 && gd->total_local_keys > 0 )
        if( k > gd->key_array[0] )
            j++;

/*  Confirm keys correctly sorted: count incorrectly sorted keys, if any */
    for( i=1; i<gd->total_local_keys; i++ )
        if( gd->key_array[i-1] > gd->key_array[i] )
            j++;

    if( j != 0 )
        printf( "Processor %d:  Full_verify: number of keys out of sort: %d\n",
                gd->my_rank, j );
    else
        gd->passed_verification++;
}
/*****************************************************************/
/*************             R  A  N  K             ****************/
/*****************************************************************/
void rank( global_data* gd, int iteration )
{
    INT_TYPE  i, k;
    INT_TYPE  key;
    INT_TYPE  shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
    INT_TYPE2 bucket_sum_accumulator, j, m;
    INT_TYPE  local_bucket_sum_accumulator;
    INT_TYPE  min_key_val, max_key_val;
    INT_TYPE  *key_buff_ptr;
/*  Iteration alteration of keys */
    if( gd->my_rank == 0 )
    {
        gd->key_array[iteration] = iteration;
        gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
    }
/*  Initialize the bucket work arrays */
    for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )
    {
        gd->bucket_size[i] = 0;
        gd->bucket_size_totals[i] = 0;
        gd->process_bucket_distrib_ptr1[i] = 0;
        gd->process_bucket_distrib_ptr2[i] = 0;
    }
/*  Determine where the partial verify test keys are, load into  */
/*  top of array bucket_size                                     */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        if( (gd->test_index_array[i]/NUM_KEYS) == gd->my_rank )
            gd->bucket_size[NUM_BUCKETS+i] =
                          gd->key_array[gd->test_index_array[i] % NUM_KEYS];
/*  Determine the number of keys in each bucket */
    for( i=0; i<NUM_KEYS; i++ )
        gd->bucket_size[gd->key_array[i] >> shift]++;
/*  Cumulative bucket sizes are the bucket pointers */
    gd->bucket_ptrs[0] = 0;
    for( i=1; i< NUM_BUCKETS; i++ )
        gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];
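
/*  Worked example (illustrative): if bucket_size = {3, 1, 4, ...}, the
    exclusive prefix sum gives bucket_ptrs = {0, 3, 4, 8, ...}; bucket
    i's keys will occupy
    key_buff1[ bucket_ptrs[i] .. bucket_ptrs[i]+bucket_size[i]-1 ].     */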
/*  Sort into appropriate bucket */
    for( i=0; i<NUM_KEYS; i++ )
    {
        key = gd->key_array[i];
        gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
    }
#ifdef TIMING_ENABLED
    timer_stop(gd, 2 );     /* computation   */
    timer_start(gd, 3 );    /* communication */
#endif
/*  Get the bucket size totals for the entire problem.  These
    will be used to determine the redistribution of keys       */
    MPI_Allreduce( gd->bucket_size,
                   gd->bucket_size_totals,
                   NUM_BUCKETS+TEST_ARRAY_SIZE,
                   MP_KEY_TYPE,
                   MPI_SUM,
                   MPI_COMM_WORLD );
#ifdef TIMING_ENABLED
    timer_stop(gd, 3 );     /* communication */
    timer_start(gd, 2 );    /* computation   */
#endif
/*  Determine the redistribution of keys: accumulate the bucket size
    totals until this number surpasses NUM_KEYS (which is the average
    number of keys per processor).  Then all keys in these buckets go to
    processor 0.  Continue accumulating until the total surpasses
    2*NUM_KEYS; all keys in these buckets go to processor 1, and so on.
    This algorithm guarantees that every processor has work ranking
    keys; no processor is left idle.  Increasing the number of buckets
    improves the load balance (the key distribution over processors
    becomes more even), but it also adds computation per processor, so
    the optimum turned out to be 1024 buckets on the machines tested.
    Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the numbers
    of the first and last bucket each processor owns after the
    redistribution is done.                                             */
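
/*  Worked example (illustrative): with 2 processors, NUM_KEYS = 10 and
    bucket_size_totals = {5, 3, 4, 8}, the running total first reaches
    (0+1)*10 at bucket 2 (5+3+4 = 12 >= 10), so processor 0 receives
    buckets 0..2; it reaches (1+1)*10 at bucket 3 (20 >= 20), so
    processor 1 receives bucket 3.                                      */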
    bucket_sum_accumulator = 0;
    local_bucket_sum_accumulator = 0;
    gd->send_displ[0] = 0;
    gd->process_bucket_distrib_ptr1[0] = 0;
    for( i=0, j=0; i<NUM_BUCKETS; i++ )
    {
        bucket_sum_accumulator       += gd->bucket_size_totals[i];
        local_bucket_sum_accumulator += gd->bucket_size[i];
        if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )
        {
            gd->send_count[j] = local_bucket_sum_accumulator;
            if( j != 0 )
            {
                gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
                gd->process_bucket_distrib_ptr1[j] =
                                    gd->process_bucket_distrib_ptr2[j-1]+1;
            }
            gd->process_bucket_distrib_ptr2[j++] = i;
            local_bucket_sum_accumulator = 0;
        }
    }
/*  When NUM_PROCS approaches NUM_BUCKETS, it is quite possible that the
    last few processors receive no buckets at all, so their counts must
    be set explicitly to avoid trouble downstream.                       */
    while( j < gd->comm_size )
    {
        gd->send_count[j] = 0;
        gd->process_bucket_distrib_ptr1[j] = 1;
        gd->process_bucket_distrib_ptr2[j++] = 0;
    }
#ifdef TIMING_ENABLED
    timer_stop(gd, 2 );     /* computation   */
    timer_start(gd, 3 );    /* communication */
#endif
/*  This is the redistribution section:  first find out how many keys
    each processor will send to every other processor:                 */
    MPI_Alltoall( gd->send_count,
                  1,
                  MPI_INT,
                  gd->recv_count,
                  1,
                  MPI_INT,
                  MPI_COMM_WORLD );
/*  Determine the receive array displacements for the buckets */
    gd->recv_displ[0] = 0;
    for( i=1; i<gd->comm_size; i++ )
        gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];
/*  Now send the keys to respective processors */
    MPI_Alltoallv( gd->key_buff1,
                   gd->send_count,
                   gd->send_displ,
                   MP_KEY_TYPE,
                   gd->key_buff2,
                   gd->recv_count,
                   gd->recv_displ,
                   MP_KEY_TYPE,
                   MPI_COMM_WORLD );
#ifdef TIMING_ENABLED
    timer_stop(gd, 3 );     /* communication */
    timer_start(gd, 2 );    /* computation   */
#endif
/*  The starting and ending bucket numbers on each processor are
    multiplied by the interval size of the buckets to obtain the
    smallest possible min and greatest possible max value of any
    key on each processor                                          */
    min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
    max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;
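
/*  Worked example (illustrative): bucket b holds keys in
    [ b << shift, ((b+1) << shift) - 1 ].  For class A
    (MAX_KEY_LOG_2 = 19, NUM_BUCKETS_LOG_2 = 10, so shift = 9), a
    processor owning buckets 4..7 gets min_key_val = 4*512 = 2048 and
    max_key_val = 8*512 - 1 = 4095.                                    */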
/*  Clear the work array */
    for( i=0; i<max_key_val-min_key_val+1; i++ )
        gd->key_buff1[i] = 0;
/*  Determine the total number of keys on all other
    processors holding keys of lesser value           */
    m = 0;
    for( k=0; k<gd->my_rank; k++ )
        for( i= gd->process_bucket_distrib_ptr1[k];
             i<=gd->process_bucket_distrib_ptr2[k];
             i++ )
            m += gd->bucket_size_totals[i];  /*  m has total # of lesser keys */
/*  Determine total number of keys on this processor */
    j = 0;
    for( i= gd->process_bucket_distrib_ptr1[gd->my_rank];
         i<=gd->process_bucket_distrib_ptr2[gd->my_rank];
         i++ )
        j += gd->bucket_size_totals[i];      /*  j has total # of local keys  */
/*  Ranking of all keys occurs in this section:                  */
/*  shift it backwards so no subtractions are necessary in loop  */
    key_buff_ptr = gd->key_buff1 - min_key_val;

/*  In this section, the keys themselves are used as their
    own indexes to determine how many of each there are: their
    individual population                                        */
    for( i=0; i<j; i++ )
        key_buff_ptr[gd->key_buff2[i]]++;  /* Now they have individual key */
                                           /* population                   */
/*  To obtain ranks of each key, successively add each individual key
    population, not forgetting m, the total of lesser keys.
    NOTE: since the total of lesser keys is subtracted later in
    verification, it is no longer added to the first key population
    here, but it is still needed in the partial verify test.  This
    keeps the 32-bit key_buff usable for class D.                     */
/*    key_buff_ptr[min_key_val] += m;    */
    for( i=min_key_val; i<max_key_val; i++ )
        key_buff_ptr[i+1] += key_buff_ptr[i];
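
/*  Worked example (illustrative): if the local key range is {0,1,2}
    with populations {2,0,3}, the running sum leaves
    key_buff_ptr = {2,2,5}: entry v is then the 1-based rank of the last
    occurrence of key v among the local keys (the m keys of lesser value
    held on lower-ranked processors are accounted for separately; see
    the partial verify below).                                          */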
/*  This is the partial verify test section */
/*  Observe that test_rank_array vals are   */
/*  shifted differently for different cases */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
    {
        k = gd->bucket_size_totals[i+NUM_BUCKETS];  /* Keys were hidden here */
        if( min_key_val <= k && k <= max_key_val )
        {
            /* Add the total of lesser keys, m, here */
            INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
            int ok = 0;                 /* does the rank match expectation? */

            switch( CLASS )
            {
            case 'S':
                if( i <= 2 )
                    ok = ( key_rank == gd->test_rank_array[i]+iteration );
                else
                    ok = ( key_rank == gd->test_rank_array[i]-iteration );
                break;
            case 'W':
                if( i < 2 )
                    ok = ( key_rank == gd->test_rank_array[i]+(iteration-2) );
                else
                    ok = ( key_rank == gd->test_rank_array[i]-iteration );
                break;
            case 'A':
                if( i <= 2 )
                    ok = ( key_rank == gd->test_rank_array[i]+(iteration-1) );
                else
                    ok = ( key_rank == gd->test_rank_array[i]-(iteration-1) );
                break;
            case 'B':
                if( i == 1 || i == 2 || i == 4 )
                    ok = ( key_rank == gd->test_rank_array[i]+iteration );
                else
                    ok = ( key_rank == gd->test_rank_array[i]-iteration );
                break;
            case 'C':
                if( i <= 2 )
                    ok = ( key_rank == gd->test_rank_array[i]+iteration );
                else
                    ok = ( key_rank == gd->test_rank_array[i]-iteration );
                break;
            case 'D':
                if( i < 2 )
                    ok = ( key_rank == gd->test_rank_array[i]+iteration );
                else
                    ok = ( key_rank == gd->test_rank_array[i]-iteration );
                break;
            }

            if( ok )
                gd->passed_verification++;
            else
                printf( "Failed partial verification: "
                        "iteration %d, processor %d, test key %d\n",
                        iteration, gd->my_rank, (int)i );
        }
    }
/*  Make copies of rank info for use by full_verify: these variables
    in rank are local; making them global slows down the code, probably
    because they can no longer be kept in registers by the compiler     */
    if( iteration == MAX_ITERATIONS )
    {
        gd->key_buff_ptr_global = key_buff_ptr;
        gd->total_local_keys    = j;
        gd->total_lesser_keys   = 0;  /* no longer set to 'm', see note above */
    }
}
/*****************************************************************/
/*************             M  A  I  N             ****************/
/*****************************************************************/
int main( int argc, char **argv )
{
    int i, iteration, itemp;
    double timecounter, maxtime;

    global_data* gd = malloc(sizeof(global_data));

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
    MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );
/*  Initialize the verification arrays if a valid class */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        switch( CLASS )
        {
            case 'S':
                gd->test_index_array[i] = S_test_index_array[i];
                gd->test_rank_array[i]  = S_test_rank_array[i];
                break;
            case 'A':
                gd->test_index_array[i] = A_test_index_array[i];
                gd->test_rank_array[i]  = A_test_rank_array[i];
                break;
            case 'W':
                gd->test_index_array[i] = W_test_index_array[i];
                gd->test_rank_array[i]  = W_test_rank_array[i];
                break;
            case 'B':
                gd->test_index_array[i] = B_test_index_array[i];
                gd->test_rank_array[i]  = B_test_rank_array[i];
                break;
            case 'C':
                gd->test_index_array[i] = C_test_index_array[i];
                gd->test_rank_array[i]  = C_test_rank_array[i];
                break;
            case 'D':
                gd->test_index_array[i] = D_test_index_array[i];
                gd->test_rank_array[i]  = D_test_rank_array[i];
                break;
        }
/*  Printout initial NPB info */
    if( gd->my_rank == 0 )
    {
        printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
        printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS );
        printf( " Iterations:   %d\n", MAX_ITERATIONS );
        printf( " Number of processes:     %d\n", gd->comm_size );
    }
/*  Check that actual and compiled number of processors agree */
    if( gd->comm_size != NUM_PROCS )
    {
        if( gd->my_rank == 0 )
            printf( "\n ERROR: compiled for %d processes\n"
                    "        Number of active processes: %d\n"
                    "        Exiting program!\n\n", NUM_PROCS, gd->comm_size );
        MPI_Finalize();
        exit( 1 );
    }
/*  Check to see whether total number of processes is within bounds.
    This could in principle be checked in setparams.c, but it is more
    convenient to do it here.                                          */
    if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS )
    {
        if( gd->my_rank == 0 )
            printf( "\n ERROR: number of processes %d not within range %d-%d"
                    "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS );
        MPI_Finalize();
        exit( 1 );
    }
/*  Generate random number sequence and subsequent keys on all procs */
    create_seq(gd, find_my_seed( gd->my_rank,
                                 gd->comm_size,
                                 4*(long)TOTAL_KEYS*MIN_PROCS,
                                 314159265.00,      /* Random number gen seed */
                                 1220703125.00 ),   /* Random number gen mult */
               1220703125.00 );                     /* Random number gen mult */
/*  Do one iteration for free (i.e., untimed) to guarantee initialization of
    all data and code pages and respective tables                            */
    rank(gd, 1 );
/*  Start verification counter */
    gd->passed_verification = 0;

    if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n   iteration\n" );

/*  Initialize timer */
    timer_clear(gd, 0 );
/*  Initialize separate communication, computation timing */
#ifdef TIMING_ENABLED
    for( i=1; i<=3; i++ ) timer_clear(gd, i );
#endif
/*  Start timer */
    timer_start(gd, 0 );
#ifdef TIMING_ENABLED
    timer_start(gd, 1 );
    timer_start(gd, 2 );
#endif
/*  This is the main iteration */
    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
    {
        if( gd->my_rank == 0 && CLASS != 'S' ) printf( "        %d\n", iteration );
        rank(gd, iteration );
    }
#ifdef TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_stop(gd, 1 );
#endif
/*  Stop timer, obtain time for processors */
    timer_stop(gd, 0 );
    timecounter = timer_read(gd, 0 );
/*  End of timing, obtain maximum time of all processors */
    MPI_Reduce( &timecounter,
                &maxtime,
                1,
                MPI_DOUBLE,
                MPI_MAX,
                0,
                MPI_COMM_WORLD );
#ifdef TIMING_ENABLED
    {
        double tmin, tsum, tmax;

        if( gd->my_rank == 0 )
        {
            printf( "\ntimer 1/2/3 = total/computation/communication time\n");
            printf( "              min                avg                max\n" );
        }
        for( i=1; i<=3; i++ )
        {
            timecounter = timer_read(gd, i );
            MPI_Reduce( &timecounter, &tmin, 1, MPI_DOUBLE,
                        MPI_MIN, 0, MPI_COMM_WORLD );
            MPI_Reduce( &timecounter, &tsum, 1, MPI_DOUBLE,
                        MPI_SUM, 0, MPI_COMM_WORLD );
            MPI_Reduce( &timecounter, &tmax, 1, MPI_DOUBLE,
                        MPI_MAX, 0, MPI_COMM_WORLD );
            if( gd->my_rank == 0 )
                printf( "timer %d:    %f           %f            %f\n",
                        i, tmin, tsum/((double) gd->comm_size), tmax );
        }
    }
#endif
/*  This tests that keys are in sequence: sorting of last ranked key seq
    occurs here, but is an untimed operation                             */
    full_verify( gd );
/*  Obtain verification counter sum */
    itemp = gd->passed_verification;
    MPI_Reduce( &itemp,
                &gd->passed_verification,
                1,
                MPI_INT,
                MPI_SUM,
                0,
                MPI_COMM_WORLD );
/*  The final printout */
    if( gd->my_rank == 0 )
    {
        if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
            gd->passed_verification = 0;
        c_print_results( "IS", CLASS, (int)(TOTAL_KEYS*MIN_PROCS), 0, 0,
                         MAX_ITERATIONS, NUM_PROCS, gd->comm_size, maxtime,
                         ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS)
                                                       /maxtime/1000000.,
                         "keys ranked", gd->passed_verification,
                         NPBVERSION, COMPILETIME, MPICC, CLINK,
                         CMPI_LIB, CMPI_INC, CFLAGS, CLINKFLAGS );
    }
    MPI_Finalize();
    free( gd );

    return 0;
         /**************************/
}        /*  E N D  P R O G R A M  */
         /**************************/