-/*************************************************************************
- * *
- * N A S P A R A L L E L B E N C H M A R K S 3.3 *
- * *
- * I S *
- * *
- *************************************************************************
- * *
- * This benchmark is part of the NAS Parallel Benchmark 3.3 suite. *
- * It is described in NAS Technical Report 95-020. *
- * *
- * Permission to use, copy, distribute and modify this software *
- * for any purpose with or without fee is hereby granted. We *
- * request, however, that all derived work reference the NAS *
- * Parallel Benchmarks 3.3. This software is provided "as is" *
- * without express or implied warranty. *
- * *
- * Information on NPB 3.3, including the technical report, the *
- * original specifications, source code, results and information *
- * on how to submit new results, is available at: *
- * *
- * http://www.nas.nasa.gov/Software/NPB *
- * *
- * Send comments or suggestions to npb@nas.nasa.gov *
- * Send bug reports to npb-bugs@nas.nasa.gov *
- * *
- * NAS Parallel Benchmarks Group *
- * NASA Ames Research Center *
- * Mail Stop: T27A-1 *
- * Moffett Field, CA 94035-1000 *
- * *
- * E-mail: npb@nas.nasa.gov *
- * Fax: (650) 604-3957 *
- * *
- *************************************************************************
- * *
- * Author: M. Yarrow *
- * H. Jin *
- * *
- *************************************************************************/
-
-#include "mpi.h"
-#include "npbparams.h"
-#include <stdlib.h>
-#include <stdio.h>
-
-#include "simgrid/instr.h" //TRACE_
-
-/******************/
-/* default values */
-/******************/
-#ifndef CLASS
-#define CLASS 'S'
-#define NUM_PROCS 1
-#endif
-#define MIN_PROCS 1
-
-
-/*************/
-/* CLASS S */
-/*************/
-#if CLASS == 'S'
-#define TOTAL_KEYS_LOG_2 16
-#define MAX_KEY_LOG_2 11
-#define NUM_BUCKETS_LOG_2 9
-#endif
-
-
-/*************/
-/* CLASS W */
-/*************/
-#if CLASS == 'W'
-#define TOTAL_KEYS_LOG_2 20
-#define MAX_KEY_LOG_2 16
-#define NUM_BUCKETS_LOG_2 10
-#endif
-
-/*************/
-/* CLASS A */
-/*************/
-#if CLASS == 'A'
-#define TOTAL_KEYS_LOG_2 23
-#define MAX_KEY_LOG_2 19
-#define NUM_BUCKETS_LOG_2 10
-#endif
-
-
-/*************/
-/* CLASS B */
-/*************/
-#if CLASS == 'B'
-#define TOTAL_KEYS_LOG_2 25
-#define MAX_KEY_LOG_2 21
-#define NUM_BUCKETS_LOG_2 10
-#endif
-
-
-/*************/
-/* CLASS C */
-/*************/
-#if CLASS == 'C'
-#define TOTAL_KEYS_LOG_2 27
-#define MAX_KEY_LOG_2 23
-#define NUM_BUCKETS_LOG_2 10
-#endif
-
-
-/*************/
-/* CLASS D */
-/*************/
-#if CLASS == 'D'
-#define TOTAL_KEYS_LOG_2 29
-#define MAX_KEY_LOG_2 27
-#define NUM_BUCKETS_LOG_2 10
-#undef MIN_PROCS
-#define MIN_PROCS 4
-#endif
-
-
-#define TOTAL_KEYS (1 << TOTAL_KEYS_LOG_2)
-#define MAX_KEY (1 << MAX_KEY_LOG_2)
-#define NUM_BUCKETS (1 << NUM_BUCKETS_LOG_2)
-#define NUM_KEYS (TOTAL_KEYS/NUM_PROCS*MIN_PROCS)
-
-/*****************************************************************/
-/* On larger number of processors, since the keys are (roughly) */
-/* gaussian distributed, the first and last processor sort keys */
-/* in a large interval, requiring array sizes to be larger. Note */
-/* that for large NUM_PROCS, NUM_KEYS is, however, a small number*/
-/* The required array size also depends on the bucket size used. */
-/* The following values are validated for the 1024-bucket setup. */
-/*****************************************************************/
-#if NUM_PROCS < 256
-#define SIZE_OF_BUFFERS 3*NUM_KEYS/2
-#elif NUM_PROCS < 512
-#define SIZE_OF_BUFFERS 5*NUM_KEYS/2
-#elif NUM_PROCS < 1024
-#define SIZE_OF_BUFFERS 4*NUM_KEYS
-#else
-#define SIZE_OF_BUFFERS 13*NUM_KEYS/2
-#endif
-
-/*****************************************************************/
-/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */
-/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE */
-/* MAX_PROCS AT YOUR PERIL */
-/*****************************************************************/
-#if CLASS == 'S'
-#define MAX_PROCS 128
-#else
-#define MAX_PROCS 1024
-#endif
-
-#define MAX_ITERATIONS 10
-#define TEST_ARRAY_SIZE 5
-
-
-/***********************************/
-/* Enable separate communication, */
-/* computation timing and printout */
-/***********************************/
-/* #define TIMING_ENABLED */
-
-
-/*************************************/
-/* Typedef: if necessary, change the */
-/* size of int here by changing the */
-/* int type to, say, long */
-/*************************************/
-typedef int INT_TYPE;
-typedef long INT_TYPE2;
-#define MP_KEY_TYPE MPI_INT
-
-
-typedef struct {
-
-/********************/
-/* MPI properties: */
-/********************/
-int my_rank,
- comm_size;
-
-
-/********************/
-/* Some global info */
-/********************/
-INT_TYPE *key_buff_ptr_global, /* used by full_verify to get */
- total_local_keys, /* copies of rank info */
- total_lesser_keys;
-
-
-int passed_verification;
-
-
-
-/************************************/
-/* These are the three main arrays. */
-/* See SIZE_OF_BUFFERS def above */
-/************************************/
-INT_TYPE key_array[SIZE_OF_BUFFERS],
- key_buff1[SIZE_OF_BUFFERS],
- key_buff2[SIZE_OF_BUFFERS],
- bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE], /* Top 5 elements for */
- bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals */
- bucket_ptrs[NUM_BUCKETS],
- process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE],
- process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE];
-int send_count[MAX_PROCS], recv_count[MAX_PROCS],
- send_displ[MAX_PROCS], recv_displ[MAX_PROCS];
-
-
-/**********************/
-/* Partial verif info */
-/**********************/
-INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
- test_rank_array[TEST_ARRAY_SIZE];
-
-/**********/
-/* Timers */
-/**********/
-double start[64], elapsed[64];
-
-} global_data;
-
-
-const INT_TYPE2
- S_test_index_array[TEST_ARRAY_SIZE] =
- {48427,17148,23627,62548,4431},
- S_test_rank_array[TEST_ARRAY_SIZE] =
- {0,18,346,64917,65463},
-
- W_test_index_array[TEST_ARRAY_SIZE] =
- {357773,934767,875723,898999,404505},
- W_test_rank_array[TEST_ARRAY_SIZE] =
- {1249,11698,1039987,1043896,1048018},
-
- A_test_index_array[TEST_ARRAY_SIZE] =
- {2112377,662041,5336171,3642833,4250760},
- A_test_rank_array[TEST_ARRAY_SIZE] =
- {104,17523,123928,8288932,8388264},
-
- B_test_index_array[TEST_ARRAY_SIZE] =
- {41869,812306,5102857,18232239,26860214},
- B_test_rank_array[TEST_ARRAY_SIZE] =
- {33422937,10244,59149,33135281,99},
-
- C_test_index_array[TEST_ARRAY_SIZE] =
- {44172927,72999161,74326391,129606274,21736814},
- C_test_rank_array[TEST_ARRAY_SIZE] =
- {61147,882988,266290,133997595,133525895},
-
- D_test_index_array[TEST_ARRAY_SIZE] =
- {1317351170,995930646,1157283250,1503301535,1453734525},
- D_test_rank_array[TEST_ARRAY_SIZE] =
- {1,36538729,1978098519,2145192618,2147425337};
-
-
-
-/***********************/
-/* function prototypes */
-/***********************/
-double randlc( double *X, double *A );
-
-void full_verify( global_data* gd );
-
-void c_print_results( char *name,
- char class,
- int n1,
- int n2,
- int n3,
- int niter,
- int nprocs_compiled,
- int nprocs_total,
- double t,
- double mops,
- char *optype,
- int passed_verification,
- char *npbversion,
- char *compiletime,
- char *mpicc,
- char *clink,
- char *cmpi_lib,
- char *cmpi_inc,
- char *cflags,
- char *clinkflags );
-
-void timer_clear(global_data* gd, int n );
-void timer_start(global_data* gd, int n );
-void timer_stop(global_data* gd, int n );
-double timer_read(global_data* gd, int n );
-
-void timer_clear(global_data* gd, int n ) {
- gd->elapsed[n] = 0.0;
-}
-
-void timer_start(global_data* gd, int n ) {
- gd->start[n] = MPI_Wtime();
-}
-
-void timer_stop(global_data* gd, int n ) {
- gd->elapsed[n] += MPI_Wtime() - gd->start[n];
-}
-
-double timer_read(global_data* gd, int n ) {
- return gd->elapsed[n];
-}
-
-
-/*
- * FUNCTION RANDLC (X, A)
- *
- * This routine returns a uniform pseudorandom double precision number in the
- * range (0, 1) by using the linear congruential generator
- *
- * x_{k+1} = a x_k (mod 2^46)
- *
- * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
- * before repeating. The argument A is the same as 'a' in the above formula,
- * and X is the same as x_0. A and X must be odd double precision integers
- * in the range (1, 2^46). The returned value RANDLC is normalized to be
- * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
- * the new seed x_1, so that subsequent calls to RANDLC using the same
- * arguments will generate a continuous sequence.
- *
- * This routine should produce the same results on any computer with at least
- * 48 mantissa bits in double precision floating point data. On Cray systems,
- * double precision should be disabled.
- *
- * David H. Bailey October 26, 1990
- *
- * IMPLICIT DOUBLE PRECISION (A-H, O-Z)
- * SAVE KS, R23, R46, T23, T46
- * DATA KS/0/
- *
- * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
- * T23 = 2 ^ 23, and T46 = 2 ^ 46. These are computed in loops, rather than
- * by merely using the ** operator, in order to insure that the results are
- * exact on all systems. This code assumes that 0.5D0 is represented exactly.
- */
-
-
-/*****************************************************************/
-/************* R A N D L C ************/
-/************* ************/
-/************* portable random number generator ************/
-/*****************************************************************/
-
-double randlc( double *X, double *A )
-{
- static int KS=0;
- static double R23, R46, T23, T46;
- double T1, T2, T3, T4;
- double A1;
- double A2;
- double X1;
- double X2;
- double Z;
- int i, j;
-
- if (KS == 0)
- {
- R23 = 1.0;
- R46 = 1.0;
- T23 = 1.0;
- T46 = 1.0;
-
- for (i=1; i<=23; i++)
- {
- R23 = 0.50 * R23;
- T23 = 2.0 * T23;
- }
- for (i=1; i<=46; i++)
- {
- R46 = 0.50 * R46;
- T46 = 2.0 * T46;
- }
- KS = 1;
- }
-
-/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */
-
- T1 = R23 * *A;
- j = T1;
- A1 = j;
- A2 = *A - T23 * A1;
-
-/* Break X into two parts such that X = 2^23 * X1 + X2, compute
- Z = A1 * X2 + A2 * X1 (mod 2^23), and then
- X = 2^23 * Z + A2 * X2 (mod 2^46). */
-
- T1 = R23 * *X;
- j = T1;
- X1 = j;
- X2 = *X - T23 * X1;
- T1 = A1 * X2 + A2 * X1;
-
- j = R23 * T1;
- T2 = j;
- Z = T1 - T23 * T2;
- T3 = T23 * Z + A2 * X2;
- j = R46 * T3;
- T4 = j;
- *X = T3 - T46 * T4;
- return(R46 * *X);
-}
-
-
-
-/*****************************************************************/
-/************ F I N D _ M Y _ S E E D ************/
-/************ ************/
-/************ returns parallel random number seq seed ************/
-/*****************************************************************/
-
-/*
- * Create a random number sequence of total length nn residing
- * on np number of processors. Each processor will therefore have a
- * subsequence of length nn/np. This routine returns that random
- * number which is the first random number for the subsequence belonging
- * to processor rank kn, and which is used as seed for proc kn ran # gen.
- */
-
-double find_my_seed( int kn, /* my processor rank, 0<=kn<=num procs */
- int np, /* np = num procs */
- long nn, /* total num of ran numbers, all procs */
- double s, /* Ran num seed, for ex.: 314159265.00 */
- double a ) /* Ran num gen mult, try 1220703125.00 */
-{
-
- long i;
-
- double t1,t2,t3,an;
- long mq,nq,kk,ik;
-
-
-
- nq = nn / np;
-
- for( mq=0; nq>1; mq++,nq/=2 )
- ;
-
- t1 = a;
-
- for( i=1; i<=mq; i++ )
- t2 = randlc( &t1, &t1 );
-
- an = t1;
-
- kk = kn;
- t1 = s;
- t2 = an;
-
- for( i=1; i<=100; i++ )
- {
- ik = kk / 2;
- if( 2 * ik != kk )
- t3 = randlc( &t1, &t2 );
- if( ik == 0 )
- break;
- t3 = randlc( &t2, &t2 );
- kk = ik;
- }
-
- return( t1 );
-
-}
-
-
-
-
-/*****************************************************************/
-/************* C R E A T E _ S E Q ************/
-/*****************************************************************/
-
-void create_seq( global_data* gd, double seed, double a )
-{
- double x;
- int i, k;
-
- k = MAX_KEY/4;
-
- for (i=0; i<NUM_KEYS; i++)
- {
- x = randlc(&seed, &a);
- x += randlc(&seed, &a);
- x += randlc(&seed, &a);
- x += randlc(&seed, &a);
-
- gd->key_array[i] = k*x;
- }
-}
-
-
-
-
-/*****************************************************************/
-/************* F U L L _ V E R I F Y ************/
-/*****************************************************************/
-
-
-void full_verify( global_data* gd )
-{
- MPI_Status status;
- MPI_Request request;
-
- INT_TYPE i, j;
- INT_TYPE k, last_local_key;
-
-
-/* Now, finally, sort the keys: */
- for( i=0; i<gd->total_local_keys; i++ )
- gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]-
- gd->total_lesser_keys] = gd->key_buff2[i];
- last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);
-
-/* Send largest key value to next processor */
- if( gd->my_rank > 0 )
- MPI_Irecv( &k,
- 1,
- MP_KEY_TYPE,
- gd->my_rank-1,
- 1000,
- MPI_COMM_WORLD,
- &request );
- if( gd->my_rank < gd->comm_size-1 )
- MPI_Send( &gd->key_array[last_local_key],
- 1,
- MP_KEY_TYPE,
- gd->my_rank+1,
- 1000,
- MPI_COMM_WORLD );
- if( gd->my_rank > 0 )
- MPI_Wait( &request, &status );
-
-/* Confirm that neighbor's greatest key value
- is not greater than my least key value */
- j = 0;
- if( gd->my_rank > 0 && gd->total_local_keys > 0 )
- if( k > gd->key_array[0] )
- j++;
-
-
-/* Confirm keys correctly sorted: count incorrectly sorted keys, if any */
- for( i=1; i<gd->total_local_keys; i++ )
- if( gd->key_array[i-1] > gd->key_array[i] )
- j++;
-
-
- if( j != 0 )
- {
- printf( "Processor %d: Full_verify: number of keys out of sort: %d\n",
- gd->my_rank, j );
- }
- else
- gd->passed_verification++;
-
-
-}
-
-
-
-
-/*****************************************************************/
-/************* R A N K ****************/
-/*****************************************************************/
-
-
-void rank( global_data* gd, int iteration )
-{
-
- INT_TYPE i, k;
-
- INT_TYPE shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
- INT_TYPE key;
- INT_TYPE2 bucket_sum_accumulator, j, m;
- INT_TYPE local_bucket_sum_accumulator;
- INT_TYPE min_key_val, max_key_val;
- INT_TYPE *key_buff_ptr;
-
-
-
-
-/* Iteration alteration of keys */
- if(gd->my_rank == 0 )
- {
- gd->key_array[iteration] = iteration;
- gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
- }
-
-
-/* Initialize */
- for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )
- {
- gd->bucket_size[i] = 0;
- gd->bucket_size_totals[i] = 0;
- gd->process_bucket_distrib_ptr1[i] = 0;
- gd->process_bucket_distrib_ptr2[i] = 0;
- }
-
-
-/* Determine where the partial verify test keys are, load into */
-/* top of array bucket_size */
- for( i=0; i<TEST_ARRAY_SIZE; i++ )
- if( (gd->test_index_array[i]/NUM_KEYS) == gd->my_rank )
- gd->bucket_size[NUM_BUCKETS+i] =
- gd->key_array[gd->test_index_array[i] % NUM_KEYS];
-
-
-/* Determine the number of keys in each bucket */
- for( i=0; i<NUM_KEYS; i++ )
- gd->bucket_size[gd->key_array[i] >> shift]++;
-
-
-/* Accumulative bucket sizes are the bucket pointers */
- gd->bucket_ptrs[0] = 0;
- for( i=1; i< NUM_BUCKETS; i++ )
- gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];
-
-
-/* Sort into appropriate bucket */
- for( i=0; i<NUM_KEYS; i++ )
- {
- key = gd->key_array[i];
- gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
- }
-
-#ifdef TIMING_ENABLED
- timer_stop(gd, 2 );
- timer_start(gd, 3 );
-#endif
-
-/* Get the bucket size totals for the entire problem. These
- will be used to determine the redistribution of keys */
- MPI_Allreduce( gd->bucket_size,
- gd->bucket_size_totals,
- NUM_BUCKETS+TEST_ARRAY_SIZE,
- MP_KEY_TYPE,
- MPI_SUM,
- MPI_COMM_WORLD );
-
-#ifdef TIMING_ENABLED
- timer_stop(gd, 3 );
- timer_start(gd, 2 );
-#endif
-
-/* Determine Redistibution of keys: accumulate the bucket size totals
- till this number surpasses NUM_KEYS (which the average number of keys
- per processor). Then all keys in these buckets go to processor 0.
- Continue accumulating again until supassing 2*NUM_KEYS. All keys
- in these buckets go to processor 1, etc. This algorithm guarantees
- that all processors have work ranking; no processors are left idle.
- The optimum number of buckets, however, does not result in as high
- a degree of load balancing (as even a distribution of keys as is
- possible) as is obtained from increasing the number of buckets, but
- more buckets results in more computation per processor so that the
- optimum number of buckets turns out to be 1024 for machines tested.
- Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket
- number of first and last bucket which each processor will have after
- the redistribution is done. */
-
- bucket_sum_accumulator = 0;
- local_bucket_sum_accumulator = 0;
- gd->send_displ[0] = 0;
- gd->process_bucket_distrib_ptr1[0] = 0;
- for( i=0, j=0; i<NUM_BUCKETS; i++ )
- {
- bucket_sum_accumulator += gd->bucket_size_totals[i];
- local_bucket_sum_accumulator += gd->bucket_size[i];
- if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )
- {
- gd->send_count[j] = local_bucket_sum_accumulator;
- if( j != 0 )
- {
- gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
- gd->process_bucket_distrib_ptr1[j] =
- gd->process_bucket_distrib_ptr2[j-1]+1;
- }
- gd->process_bucket_distrib_ptr2[j++] = i;
- local_bucket_sum_accumulator = 0;
- }
- }
-
-/* When NUM_PROCS approaching NUM_BUCKETS, it is highly possible
- that the last few processors don't get any buckets. So, we
- need to set counts properly in this case to avoid any fallouts. */
- while( j < gd->comm_size )
- {
- gd->send_count[j] = 0;
- gd->process_bucket_distrib_ptr1[j] = 1;
- j++;
- }
-
-#ifdef TIMING_ENABLED
- timer_stop(gd, 2 );
- timer_start(gd, 3 );
-#endif
-
-/* This is the redistribution section: first find out how many keys
- each processor will send to every other processor: */
- MPI_Alltoall( gd->send_count,
- 1,
- MPI_INT,
- gd->recv_count,
- 1,
- MPI_INT,
- MPI_COMM_WORLD );
-
-/* Determine the receive array displacements for the buckets */
- gd->recv_displ[0] = 0;
- for( i=1; i<gd->comm_size; i++ )
- gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];
-
-
-/* Now send the keys to respective processors */
- MPI_Alltoallv( gd->key_buff1,
- gd->send_count,
- gd->send_displ,
- MP_KEY_TYPE,
- gd->key_buff2,
- gd->recv_count,
- gd->recv_displ,
- MP_KEY_TYPE,
- MPI_COMM_WORLD );
-
-#ifdef TIMING_ENABLED
- timer_stop(gd, 3 );
- timer_start(gd, 2 );
-#endif
-
-/* The starting and ending bucket numbers on each processor are
- multiplied by the interval size of the buckets to obtain the
- smallest possible min and greatest possible max value of any
- key on each processor */
- min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
- max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;
-
-/* Clear the work array */
- for( i=0; i<max_key_val-min_key_val+1; i++ )
- gd->key_buff1[i] = 0;
-
-/* Determine the total number of keys on all other
- processors holding keys of lesser value */
- m = 0;
- for( k=0; k<gd->my_rank; k++ )
- for( i= gd->process_bucket_distrib_ptr1[k];
- i<=gd->process_bucket_distrib_ptr2[k];
- i++ )
- m += gd->bucket_size_totals[i]; /* m has total # of lesser keys */
-
-/* Determine total number of keys on this processor */
- j = 0;
- for( i= gd->process_bucket_distrib_ptr1[gd->my_rank];
- i<=gd->process_bucket_distrib_ptr2[gd->my_rank];
- i++ )
- j += gd->bucket_size_totals[i]; /* j has total # of local keys */
-
-
-/* Ranking of all keys occurs in this section: */
-/* shift it backwards so no subtractions are necessary in loop */
- key_buff_ptr = gd->key_buff1 - min_key_val;
-
-/* In this section, the keys themselves are used as their
- own indexes to determine how many of each there are: their
- individual population */
- for( i=0; i<j; i++ )
- key_buff_ptr[gd->key_buff2[i]]++; /* Now they have individual key */
- /* population */
-
-/* To obtain ranks of each key, successively add the individual key
- population, not forgetting the total of lesser keys, m.
- NOTE: Since the total of lesser keys would be subtracted later
- in verification, it is no longer added to the first key population
- here, but still needed during the partial verify test. This is to
- ensure that 32-bit key_buff can still be used for class D. */
-/* key_buff_ptr[min_key_val] += m; */
- for( i=min_key_val; i<max_key_val; i++ )
- key_buff_ptr[i+1] += key_buff_ptr[i];
-
-
-/* This is the partial verify test section */
-/* Observe that test_rank_array vals are */
-/* shifted differently for different cases */
- for( i=0; i<TEST_ARRAY_SIZE; i++ )
- {
- k = gd->bucket_size_totals[i+NUM_BUCKETS]; /* Keys were hidden here */
- if( min_key_val <= k && k <= max_key_val )
- {
- /* Add the total of lesser keys, m, here */
- INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
- int failed = 0;
-
- switch( CLASS )
- {
- case 'S':
- if( i <= 2 )
- {
- if( key_rank != gd->test_rank_array[i]+iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- else
- {
- if( key_rank != gd->test_rank_array[i]-iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- break;
- case 'W':
- if( i < 2 )
- {
- if( key_rank != gd->test_rank_array[i]+(iteration-2) )
- failed = 1;
- else
- gd->passed_verification++;
- }
- else
- {
- if( key_rank != gd->test_rank_array[i]-iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- break;
- case 'A':
- if( i <= 2 )
- {
- if( key_rank != gd->test_rank_array[i]+(iteration-1) )
- failed = 1;
- else
- gd->passed_verification++;
- }
- else
- {
- if( key_rank != gd->test_rank_array[i]-(iteration-1) )
- failed = 1;
- else
- gd->passed_verification++;
- }
- break;
- case 'B':
- if( i == 1 || i == 2 || i == 4 )
- {
- if( key_rank != gd->test_rank_array[i]+iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- else
- {
- if( key_rank != gd->test_rank_array[i]-iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- break;
- case 'C':
- if( i <= 2 )
- {
- if( key_rank != gd->test_rank_array[i]+iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- else
- {
- if( key_rank != gd->test_rank_array[i]-iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- break;
- case 'D':
- if( i < 2 )
- {
- if( key_rank != gd->test_rank_array[i]+iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- else
- {
- if( key_rank != gd->test_rank_array[i]-iteration )
- failed = 1;
- else
- gd->passed_verification++;
- }
- break;
- }
- if( failed == 1 )
- printf( "Failed partial verification: "
- "iteration %d, processor %d, test key %d\n",
- iteration, gd->my_rank, (int)i );
- }
- }
-
-
-
-
-/* Make copies of rank info for use by full_verify: these variables
- in rank are local; making them global slows down the code, probably
- since they cannot be made register by compiler */
-
- if( iteration == MAX_ITERATIONS )
- {
- gd->key_buff_ptr_global = key_buff_ptr;
- gd->total_local_keys = j;
- gd->total_lesser_keys = 0; /* no longer set to 'm', see note above */
- }
-
-}
-
-
-/*****************************************************************/
-/************* M A I N ****************/
-/*****************************************************************/
-
-int main( int argc, char **argv )
-{
-
- int i, iteration, itemp;
-
- double timecounter, maxtime;
-
- global_data* gd = malloc(sizeof(global_data));
-/* Initialize MPI */
- MPI_Init( &argc, &argv );
- MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
- MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );
-
-/* Initialize the verification arrays if a valid class */
- for( i=0; i<TEST_ARRAY_SIZE; i++ )
- switch( CLASS )
- {
- case 'S':
- gd->test_index_array[i] = S_test_index_array[i];
- gd->test_rank_array[i] = S_test_rank_array[i];
- break;
- case 'A':
- gd->test_index_array[i] = A_test_index_array[i];
- gd->test_rank_array[i] = A_test_rank_array[i];
- break;
- case 'W':
- gd->test_index_array[i] = W_test_index_array[i];
- gd->test_rank_array[i] = W_test_rank_array[i];
- break;
- case 'B':
- gd->test_index_array[i] = B_test_index_array[i];
- gd->test_rank_array[i] = B_test_rank_array[i];
- break;
- case 'C':
- gd->test_index_array[i] = C_test_index_array[i];
- gd->test_rank_array[i] = C_test_rank_array[i];
- break;
- case 'D':
- gd->test_index_array[i] = D_test_index_array[i];
- gd->test_rank_array[i] = D_test_rank_array[i];
- break;
- };
-
-
-
-/* Printout initial NPB info */
- if( gd->my_rank == 0 )
- {
- printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
- printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS );
- printf( " Iterations: %d\n", MAX_ITERATIONS );
- printf( " Number of processes: %d\n",gd->comm_size );
- }
-
-/* Check that actual and compiled number of processors agree */
- if( gd->comm_size != NUM_PROCS )
- {
- if( gd->my_rank == 0 )
- printf( "\n ERROR: compiled for %d processes\n"
- " Number of active processes: %d\n"
- " Exiting program!\n\n", NUM_PROCS, gd->comm_size );
- MPI_Finalize();
- exit( 1 );
- }
-
-/* Check to see whether total number of processes is within bounds.
- This could in principle be checked in setparams.c, but it is more
- convenient to do it here */
- if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS)
- {
- if( gd->my_rank == 0 )
- printf( "\n ERROR: number of processes %d not within range %d-%d"
- "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS);
- MPI_Finalize();
- exit( 1 );
- }
-
-
-/* Generate random number sequence and subsequent keys on all procs */
- create_seq(gd, find_my_seed( gd->my_rank,
- gd->comm_size,
- 4*(long)TOTAL_KEYS*MIN_PROCS,
- 314159265.00, /* Random number gen seed */
- 1220703125.00 ), /* Random number gen mult */
- 1220703125.00 ); /* Random number gen mult */
-
-/* Do one interation for free (i.e., untimed) to guarantee initialization of
- all data and code pages and respective tables */
- rank(gd, 1 );
-
-/* Start verification counter */
- gd->passed_verification = 0;
-
- if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n iteration\n" );
-
-/* Initialize timer */
- timer_clear(gd, 0 );
-
-/* Initialize separate communication, computation timing */
-#ifdef TIMING_ENABLED
- for( i=1; i<=3; i++ ) timer_clear(gd, i );
-#endif
-
-/* Start timer */
- timer_start(gd, 0 );
-
-#ifdef TIMING_ENABLED
- timer_start(gd, 1 );
- timer_start(gd, 2 );
-#endif
-
- char smpi_category[100];
- snprintf (smpi_category, 100, "%d", gd->my_rank);
- TRACE_smpi_set_category (smpi_category);
-
-/* This is the main iteration */
- for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
- {
- if( gd->my_rank == 0 && CLASS != 'S' ) printf( " %d\n", iteration );
- rank(gd, iteration );
- }
- TRACE_smpi_set_category (NULL);
-
-#ifdef TIMING_ENABLED
- timer_stop(gd, 2 );
- timer_stop(gd, 1 );
-#endif
-
-/* Stop timer, obtain time for processors */
- timer_stop(gd, 0 );
-
- timecounter = timer_read(gd, 0 );
-
-/* End of timing, obtain maximum time of all processors */
- MPI_Reduce( &timecounter,
- &maxtime,
- 1,
- MPI_DOUBLE,
- MPI_MAX,
- 0,
- MPI_COMM_WORLD );
-
-#ifdef TIMING_ENABLED
- {
- double tmin, tsum, tmax;
-
- if( my_rank == 0 )
- {
- printf( "\ntimer 1/2/3 = total/computation/communication time\n");
- printf( " min avg max\n" );
- }
- for( i=1; i<=3; i++ )
- {
- timecounter = timer_read(gd, i );
- MPI_Reduce( &timecounter,
- &tmin,
- 1,
- MPI_DOUBLE,
- MPI_MIN,
- 0,
- MPI_COMM_WORLD );
- MPI_Reduce( &timecounter,
- &tsum,
- 1,
- MPI_DOUBLE,
- MPI_SUM,
- 0,
- MPI_COMM_WORLD );
- MPI_Reduce( &timecounter,
- &tmax,
- 1,
- MPI_DOUBLE,
- MPI_MAX,
- 0,
- MPI_COMM_WORLD );
- if( my_rank == 0 )
- printf( "timer %d: %f %f %f\n",
- i, tmin, tsum/((double) comm_size), tmax );
- }
- if( my_rank == 0 )
- printf( "\n" );
- }
-#endif
-
-/* This tests that keys are in sequence: sorting of last ranked key seq
- occurs here, but is an untimed operation */
- full_verify(gd);
-
-
-/* Obtain verification counter sum */
- itemp =gd->passed_verification;
- MPI_Reduce( &itemp,
- &gd->passed_verification,
- 1,
- MPI_INT,
- MPI_SUM,
- 0,
- MPI_COMM_WORLD );
-
-
-
-/* The final printout */
- if( gd->my_rank == 0 )
- {
- if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
- gd->passed_verification = 0;
- c_print_results( "IS",
- CLASS,
- (int)(TOTAL_KEYS),
- MIN_PROCS,
- 0,
- MAX_ITERATIONS,
- NUM_PROCS,
- gd->comm_size,
- maxtime,
- ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS)
- /maxtime/1000000.,
- "keys ranked",
- gd->passed_verification,
- NPBVERSION,
- COMPILETIME,
- MPICC,
- CLINK,
- CMPI_LIB,
- CMPI_INC,
- CFLAGS,
- CLINKFLAGS );
- }
-
- MPI_Finalize();
- free(gd);
-
- return 0;
- /**************************/
-} /* E N D P R O G R A M */
- /**************************/