/*************************************************************************
 *                                                                       *
 *       N  A  S     P A R A L L E L     B E N C H M A R K S   3.3       *
 *                                                                       *
 *                                I S                                    *
 *                                                                       *
 *************************************************************************
 *                                                                       *
 *   This benchmark is part of the NAS Parallel Benchmark 3.3 suite.     *
 *   It is described in NAS Technical Report 95-020.                     *
 *                                                                       *
 *   Permission to use, copy, distribute and modify this software        *
 *   for any purpose with or without fee is hereby granted.  We          *
 *   request, however, that all derived work reference the NAS           *
 *   Parallel Benchmarks 3.3. This software is provided "as is"          *
 *   without express or implied warranty.                                *
 *                                                                       *
 *   Information on NPB 3.3, including the technical report, the         *
 *   original specifications, source code, results and information       *
 *   on how to submit new results, is available at:                      *
 *                                                                       *
 *          http://www.nas.nasa.gov/Software/NPB                         *
 *                                                                       *
 *   Send comments or suggestions to  npb@nas.nasa.gov                   *
 *   Send bug reports to              npb-bugs@nas.nasa.gov              *
 *                                                                       *
 *         NAS Parallel Benchmarks Group                                 *
 *         NASA Ames Research Center                                     *
 *         Moffett Field, CA   94035-1000                                *
 *                                                                       *
 *         E-mail:  npb@nas.nasa.gov                                     *
 *         Fax:     (650) 604-3957                                       *
 *                                                                       *
 *************************************************************************/
#include "npbparams.h"
#include <stdlib.h>
#include <stdio.h>
#include "mpi.h"
#include "instr/instr.h" /* TRACE_ */
/******************/
/* default values */
/******************/
#ifndef CLASS
#define CLASS 'S'
#define NUM_PROCS 1
#endif
#define MIN_PROCS 1

#if   CLASS == 'S'                  /* CLASS S */
#define TOTAL_KEYS_LOG_2  16
#define MAX_KEY_LOG_2     11
#define NUM_BUCKETS_LOG_2 9
#elif CLASS == 'W'                  /* CLASS W */
#define TOTAL_KEYS_LOG_2  20
#define MAX_KEY_LOG_2     16
#define NUM_BUCKETS_LOG_2 10
#elif CLASS == 'A'                  /* CLASS A */
#define TOTAL_KEYS_LOG_2  23
#define MAX_KEY_LOG_2     19
#define NUM_BUCKETS_LOG_2 10
#elif CLASS == 'B'                  /* CLASS B */
#define TOTAL_KEYS_LOG_2  25
#define MAX_KEY_LOG_2     21
#define NUM_BUCKETS_LOG_2 10
#elif CLASS == 'C'                  /* CLASS C */
#define TOTAL_KEYS_LOG_2  27
#define MAX_KEY_LOG_2     23
#define NUM_BUCKETS_LOG_2 10
#elif CLASS == 'D'                  /* CLASS D */
#define TOTAL_KEYS_LOG_2  29
#define MAX_KEY_LOG_2     27
#define NUM_BUCKETS_LOG_2 10
#undef  MIN_PROCS
#define MIN_PROCS 4
#endif
#define TOTAL_KEYS  (1 << TOTAL_KEYS_LOG_2)
#define MAX_KEY     (1 << MAX_KEY_LOG_2)
#define NUM_BUCKETS (1 << NUM_BUCKETS_LOG_2)
#define NUM_KEYS    (TOTAL_KEYS/NUM_PROCS*MIN_PROCS)
/*****************************************************************/
/* On larger numbers of processors, since the keys are (roughly) */
/* gaussian distributed, the first and last processors sort keys */
/* in a large interval, requiring array sizes to be larger. Note */
/* that for large NUM_PROCS, NUM_KEYS is nevertheless small.     */
/* The required array size also depends on the bucket size used. */
/* The following values are validated for the 1024-bucket setup. */
/*****************************************************************/
#if   NUM_PROCS < 256
#define SIZE_OF_BUFFERS 3*NUM_KEYS/2
#elif NUM_PROCS < 512
#define SIZE_OF_BUFFERS 5*NUM_KEYS/2
#elif NUM_PROCS < 1024
#define SIZE_OF_BUFFERS 4*NUM_KEYS
#else
#define SIZE_OF_BUFFERS 13*NUM_KEYS/2
#endif
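/* Worked example (illustrative): for class A (TOTAL_KEYS = 2^23) on
   NUM_PROCS = 16 with MIN_PROCS = 1, NUM_KEYS = 2^23/16 = 524288 keys
   per process and SIZE_OF_BUFFERS = 3*524288/2 = 786432 slots; the
   extra room absorbs the skew of the first and last processes. */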
/*****************************************************************/
/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */
/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE     */
/* MAX_PROCS AT YOUR PERIL                                       */
/*****************************************************************/
#if CLASS == 'S'
#define MAX_PROCS 128
#else
#define MAX_PROCS 1024
#endif
#define MAX_ITERATIONS  10
#define TEST_ARRAY_SIZE 5
/***********************************/
/* Enable separate communication,  */
/* computation timing and printout */
/***********************************/
/* #define TIMING_ENABLED */
/*************************************/
/* Typedef: if necessary, change the */
/* size of int here by changing the  */
/* int type to, say, long            */
/*************************************/
typedef int  INT_TYPE;
typedef long INT_TYPE2;
#define MP_KEY_TYPE MPI_INT
/*******************************************************************/
/* All formerly-global state lives in one struct that is passed    */
/* around explicitly, so several instances can coexist in a single */
/* process (e.g. when run under SMPI).                             */
/*******************************************************************/
typedef struct {

/********************/
/* MPI properties:  */
/********************/
int my_rank,
    comm_size;

/********************/
/* Some global info */
/********************/
INT_TYPE *key_buff_ptr_global,       /* used by full_verify to get */
         total_local_keys,           /* copies of rank info        */
         total_lesser_keys;
int passed_verification;

/************************************/
/* These are the three main arrays. */
/* See SIZE_OF_BUFFERS def above    */
/************************************/
INT_TYPE key_array[SIZE_OF_BUFFERS],
         key_buff1[SIZE_OF_BUFFERS],
         key_buff2[SIZE_OF_BUFFERS],
         bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE],        /* Top 5 elements for */
         bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals    */
         bucket_ptrs[NUM_BUCKETS],
         process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE],
         process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE];
int      send_count[MAX_PROCS], recv_count[MAX_PROCS],
         send_displ[MAX_PROCS], recv_displ[MAX_PROCS];

/**********************/
/* Partial verif info */
/**********************/
INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
          test_rank_array[TEST_ARRAY_SIZE];

/**********************/
/* Timers             */
/**********************/
double start[64], elapsed[64];

} global_data;
INT_TYPE2 S_test_index_array[TEST_ARRAY_SIZE] =
                             {48427,17148,23627,62548,4431},
          S_test_rank_array[TEST_ARRAY_SIZE] =
                             {0,18,346,64917,65463},

          W_test_index_array[TEST_ARRAY_SIZE] =
                             {357773,934767,875723,898999,404505},
          W_test_rank_array[TEST_ARRAY_SIZE] =
                             {1249,11698,1039987,1043896,1048018},

          A_test_index_array[TEST_ARRAY_SIZE] =
                             {2112377,662041,5336171,3642833,4250760},
          A_test_rank_array[TEST_ARRAY_SIZE] =
                             {104,17523,123928,8288932,8388264},

          B_test_index_array[TEST_ARRAY_SIZE] =
                             {41869,812306,5102857,18232239,26860214},
          B_test_rank_array[TEST_ARRAY_SIZE] =
                             {33422937,10244,59149,33135281,99},

          C_test_index_array[TEST_ARRAY_SIZE] =
                             {44172927,72999161,74326391,129606274,21736814},
          C_test_rank_array[TEST_ARRAY_SIZE] =
                             {61147,882988,266290,133997595,133525895},

          D_test_index_array[TEST_ARRAY_SIZE] =
                             {1317351170,995930646,1157283250,1503301535,1453734525},
          D_test_rank_array[TEST_ARRAY_SIZE] =
                             {1,36538729,1978098519,2145192618,2147425337};
/***********************/
/* function prototypes */
/***********************/
double randlc( double *X, double *A );

double find_my_seed( int kn, int np, long nn, double s, double a );

void create_seq( global_data* gd, double seed, double a );

void rank( global_data* gd, int iteration );

void full_verify( global_data* gd );

void c_print_results( char *name, char class, int n1, int n2, int n3,
                      int niter, int nprocs_compiled, int nprocs_total,
                      double t, double mops, char *optype,
                      int passed_verification, char *npbversion,
                      char *compiletime, char *mpicc, char *clink,
                      char *cmpi_lib, char *cmpi_inc, char *cflags,
                      char *clinkflags );

void timer_clear( global_data* gd, int n );
void timer_start( global_data* gd, int n );
void timer_stop( global_data* gd, int n );
double timer_read( global_data* gd, int n );
void timer_clear( global_data* gd, int n )
{
    gd->elapsed[n] = 0.0;
}

void timer_start( global_data* gd, int n )
{
    gd->start[n] = MPI_Wtime();
}

void timer_stop( global_data* gd, int n )
{
    gd->elapsed[n] += MPI_Wtime() - gd->start[n];
}

double timer_read( global_data* gd, int n )
{
    return gd->elapsed[n];
}
/*
 *    FUNCTION RANDLC (X, A)
 *
 *  This routine returns a uniform pseudorandom double precision number in the
 *  range (0, 1) by using the linear congruential generator
 *
 *      x_{k+1} = a x_k  (mod 2^46)
 *
 *  where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
 *  before repeating. The argument A is the same as 'a' in the above formula,
 *  and X is the same as x_0. A and X must be odd double precision integers
 *  in the range (1, 2^46). The returned value RANDLC is normalized to be
 *  between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
 *  the new seed x_1, so that subsequent calls to RANDLC using the same
 *  arguments will generate a continuous sequence.
 *
 *  This routine should produce the same results on any computer with at least
 *  48 mantissa bits in double precision floating point data. On Cray systems,
 *  double precision should be disabled.
 *
 *  David H. Bailey     October 26, 1990
 *
 *      IMPLICIT DOUBLE PRECISION (A-H, O-Z)
 *      SAVE KS, R23, R46, T23, T46
 *
 *  If this is the first call to RANDLC, compute R23 = 2^-23, R46 = 2^-46,
 *  T23 = 2^23, and T46 = 2^46. These are computed in loops, rather than
 *  by merely using the ** operator, in order to ensure that the results are
 *  exact on all systems. This code assumes that 0.5D0 is represented exactly.
 */
/*****************************************************************/
/*************           R  A  N  D  L  C             ************/
/*************                                        ************/
/*************    portable random number generator    ************/
/*****************************************************************/
double randlc( double *X, double *A )
{
    static int    KS = 0;
    static double R23, R46, T23, T46;
    double T1, T2, T3, T4, A1, A2, X1, X2, Z;
    int    i;

    if (KS == 0)          /* first call: compute the constants exactly */
    {
        R23 = R46 = T23 = T46 = 1.0;
        for (i=1; i<=23; i++) { R23 *= 0.50; T23 *= 2.0; }
        for (i=1; i<=46; i++) { R46 *= 0.50; T46 *= 2.0; }
        KS = 1;
    }

/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */
    A1 = (int)(R23 * *A);
    A2 = *A - T23 * A1;

/* Break X into two parts such that X = 2^23 * X1 + X2, compute
   Z = A1 * X2 + A2 * X1 (mod 2^23), and then
   X = 2^23 * Z + A2 * X2 (mod 2^46). */
    X1 = (int)(R23 * *X);
    X2 = *X - T23 * X1;
    T1 = A1 * X2 + A2 * X1;
    T2 = (int)(R23 * T1);
    Z  = T1 - T23 * T2;
    T3 = T23 * Z + A2 * X2;
    T4 = (int)(R46 * T3);
    *X = T3 - T46 * T4;

    return (R46 * *X);
}
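/* Usage sketch (illustrative only, not part of the benchmark): the seed
   and multiplier below are the values used later in this file. */
#if 0
double seed = 314159265.00, mult = 1220703125.00;
double r1 = randlc( &seed, &mult );  /* r1 in (0,1); seed now holds x_1 */
double r2 = randlc( &seed, &mult );  /* next number of the sequence     */
#endif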
/*****************************************************************/
/************   F  I  N  D  _  M  Y  _  S  E  E  D    ************/
/************                                         ************/
/************ returns parallel random number seq seed ************/
/*****************************************************************/
/*
 * Create a random number sequence of total length nn residing
 * on np number of processors. Each processor will therefore have a
 * subsequence of length nn/np. This routine returns the random number
 * that is the first of the subsequence belonging to processor rank kn,
 * and which is used as the seed of proc kn's random number generator.
 */
double find_my_seed( int kn,    /* my processor rank, 0<=kn<=num procs */
                     int np,    /* np = num procs                      */
                     long nn,   /* total num of ran numbers, all procs */
                     double s,  /* Ran num seed, for ex.: 314159265.00 */
                     double a ) /* Ran num gen mult, try 1220703125.00 */
{
    double t1, t2, t3, an;
    long   i, mq, nq, kk, ik;

    nq = nn / np;
    for( mq=0; nq>1; mq++, nq/=2 ) ;   /* mq = log2(nn/np)               */

    t1 = a;
    for( i=1; i<=mq; i++ )             /* repeated squaring:             */
        t2 = randlc( &t1, &t1 );       /* t1 = a^(nn/np) (mod 2^46)      */
    an = t1;

    kk = kn;                           /* walk the bits of kn to advance */
    t1 = s;                            /* the seed by kn*(nn/np) numbers */
    t2 = an;
    for( i=1; i<=100; i++ )
    {
        ik = kk / 2;
        if( 2*ik != kk )
            t3 = randlc( &t1, &t2 );
        if( ik == 0 )
            break;
        t3 = randlc( &t2, &t2 );
        kk = ik;
    }

    return t1;
}
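/* Worked example (illustrative): with np = 4 processors and nn = 2^22
   numbers in total, nq = 2^20 and mq = 20, so an = a^(2^20) (mod 2^46).
   Rank kn then starts from  s_kn = s * an^kn (mod 2^46),  which the
   binary loop above computes from the bits of kn. */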
/*****************************************************************/
/*************      C  R  E  A  T  E  _  S  E  Q      ************/
/*****************************************************************/
void create_seq( global_data* gd, double seed, double a )
{
    double   x;
    INT_TYPE i, k;

    k = MAX_KEY/4;

    for (i=0; i<NUM_KEYS; i++)
    {
        x = randlc(&seed, &a);
        x += randlc(&seed, &a);
        x += randlc(&seed, &a);
        x += randlc(&seed, &a);

        gd->key_array[i] = k*x;
    }
}
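/* Each key is k*x with x the sum of four uniform (0,1) draws and
   k = MAX_KEY/4, so keys fall in [0, MAX_KEY) with an approximately
   gaussian (Bates-like) distribution centered at MAX_KEY/2, which is
   what the buffer-size note near SIZE_OF_BUFFERS refers to. */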
/*****************************************************************/
/*************    F  U  L  L  _  V  E  R  I  F  Y     ************/
/*****************************************************************/
void full_verify( global_data* gd )
{
    MPI_Status  status;
    MPI_Request request;

    INT_TYPE i, j;
    INT_TYPE k, last_local_key;

/* Now, finally, sort the keys: */
    for( i=0; i<gd->total_local_keys; i++ )
        gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]-
                                 gd->total_lesser_keys] = gd->key_buff2[i];
    last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);

/* Send largest key value to next processor */
    if( gd->my_rank > 0 )
        MPI_Irecv( &k, 1, MP_KEY_TYPE, gd->my_rank-1, 1000,
                   MPI_COMM_WORLD, &request );
    if( gd->my_rank < gd->comm_size-1 )
        MPI_Send( &gd->key_array[last_local_key], 1, MP_KEY_TYPE,
                  gd->my_rank+1, 1000, MPI_COMM_WORLD );
    if( gd->my_rank > 0 )
        MPI_Wait( &request, &status );

/* Confirm that neighbor's greatest key value
   is not greater than my least key value */
    j = 0;
    if( gd->my_rank > 0 && gd->total_local_keys > 0 )
        if( k > gd->key_array[0] )
            j++;

/* Confirm keys correctly sorted: count incorrectly sorted keys, if any */
    for( i=1; i<gd->total_local_keys; i++ )
        if( gd->key_array[i-1] > gd->key_array[i] )
            j++;

    if( j != 0 )
        printf( "Processor %d: Full_verify: number of keys out of sort: %d\n",
                gd->my_rank, (int)j );
    else
        gd->passed_verification++;
}
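/* Illustrative: key_buff_ptr_global[v] holds the running count of keys
   with value <= v, so --key_buff_ptr_global[v] yields the slot of one
   key of value v (local here, since total_lesser_keys is 0; see the
   note in rank). E.g. with scanned counts {2,2,5,6} for values 0..3,
   the first key of value 2 lands in slot 4, the next in slot 3, and
   so on downwards for duplicates. */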
/*****************************************************************/
/*************             R  A  N  K             ****************/
/*****************************************************************/
void rank( global_data* gd, int iteration )
{
    INT_TYPE i, k;
    INT_TYPE key;
    INT_TYPE shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;

    INT_TYPE2 bucket_sum_accumulator, j, m;
    INT_TYPE  local_bucket_sum_accumulator;
    INT_TYPE  min_key_val, max_key_val;
    INT_TYPE  *key_buff_ptr;
/* Iteration alteration of keys */
    if( gd->my_rank == 0 )
    {
        gd->key_array[iteration] = iteration;
        gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
    }
/* Initialize */
    for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )
    {
        gd->bucket_size[i] = 0;
        gd->bucket_size_totals[i] = 0;
        gd->process_bucket_distrib_ptr1[i] = 0;
        gd->process_bucket_distrib_ptr2[i] = 0;
    }
/* Determine where the partial verify test keys are, load into */
/* top of array bucket_size */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        if( (gd->test_index_array[i]/NUM_KEYS) == gd->my_rank )
            gd->bucket_size[NUM_BUCKETS+i] =
                          gd->key_array[gd->test_index_array[i] % NUM_KEYS];
/* Determine the number of keys in each bucket */
    for( i=0; i<NUM_KEYS; i++ )
        gd->bucket_size[gd->key_array[i] >> shift]++;
/* Accumulative bucket sizes are the bucket pointers */
    gd->bucket_ptrs[0] = 0;
    for( i=1; i< NUM_BUCKETS; i++ )
        gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];
/* Sort into appropriate bucket */
    for( i=0; i<NUM_KEYS; i++ )
    {
        key = gd->key_array[i];
        gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
    }
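/* Illustrative: the bucket of a key is just its top NUM_BUCKETS_LOG_2
   bits. For class A (MAX_KEY_LOG_2=19, NUM_BUCKETS_LOG_2=10) shift is 9,
   so key 0x7FFFF >> 9 = 1023, the last bucket. */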
#ifdef TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_start(gd, 3 );
#endif
/* Get the bucket size totals for the entire problem. These
   will be used to determine the redistribution of keys */
    MPI_Allreduce( gd->bucket_size,
                   gd->bucket_size_totals,
                   NUM_BUCKETS+TEST_ARRAY_SIZE,
                   MP_KEY_TYPE,
                   MPI_SUM,
                   MPI_COMM_WORLD );
#ifdef TIMING_ENABLED
    timer_stop(gd, 3 );
    timer_start(gd, 2 );
#endif
/* Determine redistribution of keys: accumulate the bucket size totals
   until this number surpasses NUM_KEYS (which is the average number of
   keys per processor). Then all keys in these buckets go to processor 0.
   Continue accumulating until surpassing 2*NUM_KEYS; all keys in those
   buckets go to processor 1, etc. This algorithm guarantees that all
   processors have ranking work; none is left idle. A larger number of
   buckets improves load balance (approaching as even a distribution of
   keys as possible), but also costs more computation per processor, so
   the optimum turns out to be 1024 buckets on the machines tested.
   Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket
   numbers of the first and last bucket which each processor will have
   after the redistribution is done. */
    bucket_sum_accumulator = 0;
    local_bucket_sum_accumulator = 0;
    gd->send_displ[0] = 0;
    gd->process_bucket_distrib_ptr1[0] = 0;
    for( i=0, j=0; i<NUM_BUCKETS; i++ )
    {
        bucket_sum_accumulator       += gd->bucket_size_totals[i];
        local_bucket_sum_accumulator += gd->bucket_size[i];
        if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )
        {
            gd->send_count[j] = local_bucket_sum_accumulator;
            if( j != 0 )
            {
                gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
                gd->process_bucket_distrib_ptr1[j] =
                                        gd->process_bucket_distrib_ptr2[j-1]+1;
            }
            gd->process_bucket_distrib_ptr2[j++] = i;
            local_bucket_sum_accumulator = 0;
        }
    }
/* When NUM_PROCS approaches NUM_BUCKETS, it is quite possible that the
   last few processors don't get any buckets, so set their counts
   properly in this case to avoid any fallout. */
    while( j < gd->comm_size )
    {
        gd->send_count[j] = 0;
        gd->process_bucket_distrib_ptr1[j] = 1;
        gd->process_bucket_distrib_ptr2[j++] = 0;
    }
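/* Worked example (illustrative): 4 processors, NUM_KEYS = 8, global
   bucket totals {5,6,7,5,9}. The running sums 5,11,18,23,32 cross the
   cutoffs 8,16,24 at buckets 1, 2 and 4, so rank 0 gets buckets 0-1,
   rank 1 gets bucket 2, rank 2 gets buckets 3-4, and rank 3 gets none
   (handled by the while loop above). */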
#ifdef TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_start(gd, 3 );
#endif
/* This is the redistribution section: first find out how many keys
   each processor will send to every other processor: */
    MPI_Alltoall( gd->send_count, 1, MPI_INT,
                  gd->recv_count, 1, MPI_INT,
                  MPI_COMM_WORLD );
/* Determine the receive array displacements for the buckets */
    gd->recv_displ[0] = 0;
    for( i=1; i<gd->comm_size; i++ )
        gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];
/* Now send the keys to respective processors */
    MPI_Alltoallv( gd->key_buff1, gd->send_count, gd->send_displ,
                   MP_KEY_TYPE,
                   gd->key_buff2, gd->recv_count, gd->recv_displ,
                   MP_KEY_TYPE,
                   MPI_COMM_WORLD );
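/* Illustrative: the displacements are exclusive prefix sums of the
   counts. With send_count = {3,1,2}, send_displ = {0,3,4}: the keys
   bound for rank r start at offset send_displ[r] of key_buff1, and
   the keys received from rank r land at offset recv_displ[r] of
   key_buff2. */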
#ifdef TIMING_ENABLED
    timer_stop(gd, 3 );
    timer_start(gd, 2 );
#endif
/* The starting and ending bucket numbers on each processor are
   multiplied by the interval size of the buckets to obtain the
   smallest possible min and greatest possible max value of any
   key on each processor */
    min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
    max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;
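/* Illustrative: if this rank owns buckets 3..5 and shift = 9, its keys
   lie in [3*512, 6*512-1] = [1536, 3071]. */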
/* Clear the work array */
    for( i=0; i<max_key_val-min_key_val+1; i++ )
        gd->key_buff1[i] = 0;
/* Determine the total number of keys on all other
   processors holding keys of lesser value */
    m = 0;
    for( k=0; k<gd->my_rank; k++ )
        for( i= gd->process_bucket_distrib_ptr1[k];
             i<=gd->process_bucket_distrib_ptr2[k];
             i++ )
            m += gd->bucket_size_totals[i]; /* m has total # of lesser keys */
/* Determine total number of keys on this processor */
    j = 0;
    for( i= gd->process_bucket_distrib_ptr1[gd->my_rank];
         i<=gd->process_bucket_distrib_ptr2[gd->my_rank];
         i++ )
        j += gd->bucket_size_totals[i]; /* j has total # of local keys */
/* Ranking of all keys occurs in this section: */
/* shift it backwards so no subtractions are necessary in loop */
    key_buff_ptr = gd->key_buff1 - min_key_val;
/* In this section, the keys themselves are used as their
   own indexes to determine how many of each there are: their
   individual population */
    for( i=0; i<j; i++ )
        key_buff_ptr[gd->key_buff2[i]]++; /* Now they have individual key */
                                          /* population                   */
/* To obtain ranks of each key, successively add the individual key
   population, not forgetting the total of lesser keys, m.
   NOTE: Since the total of lesser keys would be subtracted later
   in verification, it is no longer added to the first key population
   here, but it is still needed during the partial verify test. This
   is to ensure that a 32-bit key_buff can still be used for class D. */
/*  key_buff_ptr[min_key_val] += m;  */
    for( i=min_key_val; i<max_key_val; i++ )
        key_buff_ptr[i+1] += key_buff_ptr[i];
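/* Illustrative: if the local counts over [min_key_val..max_key_val]
   are {2,0,3,1}, this scan leaves {2,2,5,6}; the rank of a key of
   value v is then key_buff_ptr[v-1] + m, as used by the partial
   verify below and by full_verify. */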
/* This is the partial verify test section */
/* Observe that test_rank_array vals are   */
/* shifted differently for different cases */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
    {
        k = gd->bucket_size_totals[i+NUM_BUCKETS]; /* Keys were hidden here */
        if( min_key_val <= k && k <= max_key_val )
        {
            /* Add the total of lesser keys, m, here */
            INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
            INT_TYPE2 offset   = 0; /* class-dependent shift of the ranks */

            switch( CLASS )
            {
                case 'S':
                    offset = (i <= 2)? iteration : -iteration;
                    break;
                case 'W':
                    offset = (i < 2)? (iteration-2) : -iteration;
                    break;
                case 'A':
                    offset = (i <= 2)? (iteration-1) : -(iteration-1);
                    break;
                case 'B':
                    offset = (i == 1 || i == 2 || i == 4)?
                                 iteration : -iteration;
                    break;
                case 'C':
                    offset = (i <= 2)? iteration : -iteration;
                    break;
                case 'D':
                    offset = (i < 2)? iteration : -iteration;
                    break;
            }
            if( key_rank != gd->test_rank_array[i]+offset )
                printf( "Failed partial verification: "
                        "iteration %d, processor %d, test key %d\n",
                        iteration, gd->my_rank, (int)i );
            else
                gd->passed_verification++;
        }
    }
/* Make copies of rank info for use by full_verify: these variables
   in rank are local; making them global slows down the code, probably
   since they cannot be made register by compiler */
    if( iteration == MAX_ITERATIONS )
    {
        gd->key_buff_ptr_global = key_buff_ptr;
        gd->total_local_keys    = j;
        gd->total_lesser_keys   = 0; /* no longer set to 'm', see note above */
    }
}
/*****************************************************************/
/*************             M  A  I  N             ****************/
/*****************************************************************/
int main( int argc, char **argv )
{
    int i, iteration, itemp;

    double timecounter, maxtime;

    global_data* gd = malloc(sizeof(global_data));

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
    MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );
/* Initialize the verification arrays if a valid class */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        switch( CLASS )
        {
            case 'S':
                gd->test_index_array[i] = S_test_index_array[i];
                gd->test_rank_array[i]  = S_test_rank_array[i];
                break;
            case 'A':
                gd->test_index_array[i] = A_test_index_array[i];
                gd->test_rank_array[i]  = A_test_rank_array[i];
                break;
            case 'W':
                gd->test_index_array[i] = W_test_index_array[i];
                gd->test_rank_array[i]  = W_test_rank_array[i];
                break;
            case 'B':
                gd->test_index_array[i] = B_test_index_array[i];
                gd->test_rank_array[i]  = B_test_rank_array[i];
                break;
            case 'C':
                gd->test_index_array[i] = C_test_index_array[i];
                gd->test_rank_array[i]  = C_test_rank_array[i];
                break;
            case 'D':
                gd->test_index_array[i] = D_test_index_array[i];
                gd->test_rank_array[i]  = D_test_rank_array[i];
                break;
        };
/* Printout initial NPB info */
    if( gd->my_rank == 0 )
    {
        printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
        printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS );
        printf( " Iterations:   %d\n", MAX_ITERATIONS );
        printf( " Number of processes:     %d\n", gd->comm_size );
    }
/* Check that actual and compiled number of processors agree */
    if( gd->comm_size != NUM_PROCS )
    {
        if( gd->my_rank == 0 )
            printf( "\n ERROR: compiled for %d processes\n"
                    " Number of active processes: %d\n"
                    " Exiting program!\n\n", NUM_PROCS, gd->comm_size );
        MPI_Finalize();
        exit( 1 );
    }
/* Check to see whether total number of processes is within bounds.
   This could in principle be checked in setparams.c, but it is more
   convenient to do it here */
    if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS )
    {
        if( gd->my_rank == 0 )
            printf( "\n ERROR: number of processes %d not within range %d-%d"
                    "\n Exiting program!\n\n",
                    gd->comm_size, MIN_PROCS, MAX_PROCS );
        MPI_Finalize();
        exit( 1 );
    }
/* Generate random number sequence and subsequent keys on all procs */
    create_seq(gd, find_my_seed( gd->my_rank,
                                 gd->comm_size,
                                 4*(long)TOTAL_KEYS*MIN_PROCS,
                                 314159265.00,      /* Random number gen seed */
                                 1220703125.00 ),   /* Random number gen mult */
                   1220703125.00 );                 /* Random number gen mult */
/* Do one iteration for free (i.e., untimed) to guarantee initialization of
   all data and code pages and respective tables */
    rank(gd, 1 );

/* Start verification counter */
    gd->passed_verification = 0;

    if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n   iteration\n" );

/* Initialize timer */
    timer_clear(gd, 0 );

/* Initialize separate communication, computation timing */
#ifdef TIMING_ENABLED
    for( i=1; i<=3; i++ ) timer_clear(gd, i );
#endif

/* Start timer */
    MPI_Barrier( MPI_COMM_WORLD );
    timer_start(gd, 0 );

#ifdef TIMING_ENABLED
    timer_start(gd, 1 );
    timer_start(gd, 2 );
#endif
/* Tag this rank's events for SMPI tracing */
    char smpi_category[100];
    snprintf (smpi_category, 100, "%d", gd->my_rank);
    TRACE_smpi_set_category (smpi_category);
/* This is the main iteration */
    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
    {
        if( gd->my_rank == 0 && CLASS != 'S' ) printf( "        %d\n", iteration );
        rank(gd, iteration );
    }

    TRACE_smpi_set_category (NULL);
#ifdef TIMING_ENABLED
    timer_stop(gd, 2 );
    timer_stop(gd, 1 );
#endif
/* Stop timer, obtain time for processors */
    timer_stop(gd, 0 );

    timecounter = timer_read(gd, 0 );
/* End of timing, obtain maximum time of all processors */
    MPI_Reduce( &timecounter, &maxtime, 1,
                MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD );
#ifdef TIMING_ENABLED
    {
        double tmin, tsum, tmax;

        if( gd->my_rank == 0 )
        {
            printf( "\ntimer 1/2/3 = total/computation/communication time\n");
            printf( "          min                avg                max\n" );
        }
        for( i=1; i<=3; i++ )
        {
            timecounter = timer_read(gd, i );
            MPI_Reduce( &timecounter, &tmin, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD );
            MPI_Reduce( &timecounter, &tsum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD );
            MPI_Reduce( &timecounter, &tmax, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD );
            if( gd->my_rank == 0 )
                printf( "timer %d:    %f           %f            %f\n",
                        i, tmin, tsum/((double) gd->comm_size), tmax );
        }
        if( gd->my_rank == 0 )
            printf( "\n" );
    }
#endif
/* This tests that keys are in sequence: sorting of last ranked key seq
   occurs here, but is an untimed operation */
    full_verify(gd);
/* Obtain verification counter sum */
    itemp = gd->passed_verification;
    MPI_Reduce( &itemp, &gd->passed_verification, 1,
                MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD );
/* The final printout */
    if( gd->my_rank == 0 )
    {
        if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
            gd->passed_verification = 0;
        c_print_results( "IS", CLASS, (int)(TOTAL_KEYS), MIN_PROCS, 0,
                         MAX_ITERATIONS, NUM_PROCS, gd->comm_size, maxtime,
                         ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS)
                             /maxtime/1000000.,
                         "keys ranked", gd->passed_verification,
                         NPBVERSION, COMPILETIME, MPICC, CLINK,
                         CMPI_LIB, CMPI_INC, CFLAGS, CLINKFLAGS );
    }
    MPI_Finalize();
    free(gd);

    return 0;
            /**************************/
}           /*  E N D  P R O G R A M  */
            /**************************/