X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/blobdiff_plain/1264d2c123d6828e4f628cfbf50447f3522ec5c4..62ab7fd2280a35af7c36f602852bc53fc0a56093:/examples/smpi/NAS/IS/is.c diff --git a/examples/smpi/NAS/IS/is.c b/examples/smpi/NAS/IS/is.c deleted file mode 100644 index 57b370bb39..0000000000 --- a/examples/smpi/NAS/IS/is.c +++ /dev/null @@ -1,1154 +0,0 @@ -/************************************************************************* - * * - * N A S P A R A L L E L B E N C H M A R K S 3.3 * - * * - * I S * - * * - ************************************************************************* - * * - * This benchmark is part of the NAS Parallel Benchmark 3.3 suite. * - * It is described in NAS Technical Report 95-020. * - * * - * Permission to use, copy, distribute and modify this software * - * for any purpose with or without fee is hereby granted. We * - * request, however, that all derived work reference the NAS * - * Parallel Benchmarks 3.3. This software is provided "as is" * - * without express or implied warranty. * - * * - * Information on NPB 3.3, including the technical report, the * - * original specifications, source code, results and information * - * on how to submit new results, is available at: * - * * - * http://www.nas.nasa.gov/Software/NPB * - * * - * Send comments or suggestions to npb@nas.nasa.gov * - * Send bug reports to npb-bugs@nas.nasa.gov * - * * - * NAS Parallel Benchmarks Group * - * NASA Ames Research Center * - * Mail Stop: T27A-1 * - * Moffett Field, CA 94035-1000 * - * * - * E-mail: npb@nas.nasa.gov * - * Fax: (650) 604-3957 * - * * - ************************************************************************* - * * - * Author: M. Yarrow * - * H. Jin * - * * - *************************************************************************/ - -#include "mpi.h" -#include "npbparams.h" -#include -#include - -#include "simgrid/instr.h" //TRACE_ - -/******************/ -/* default values */ -/******************/ -#ifndef CLASS -#define CLASS 'S' -#define NUM_PROCS 1 -#endif -#define MIN_PROCS 1 - - -/*************/ -/* CLASS S */ -/*************/ -#if CLASS == 'S' -#define TOTAL_KEYS_LOG_2 16 -#define MAX_KEY_LOG_2 11 -#define NUM_BUCKETS_LOG_2 9 -#endif - - -/*************/ -/* CLASS W */ -/*************/ -#if CLASS == 'W' -#define TOTAL_KEYS_LOG_2 20 -#define MAX_KEY_LOG_2 16 -#define NUM_BUCKETS_LOG_2 10 -#endif - -/*************/ -/* CLASS A */ -/*************/ -#if CLASS == 'A' -#define TOTAL_KEYS_LOG_2 23 -#define MAX_KEY_LOG_2 19 -#define NUM_BUCKETS_LOG_2 10 -#endif - - -/*************/ -/* CLASS B */ -/*************/ -#if CLASS == 'B' -#define TOTAL_KEYS_LOG_2 25 -#define MAX_KEY_LOG_2 21 -#define NUM_BUCKETS_LOG_2 10 -#endif - - -/*************/ -/* CLASS C */ -/*************/ -#if CLASS == 'C' -#define TOTAL_KEYS_LOG_2 27 -#define MAX_KEY_LOG_2 23 -#define NUM_BUCKETS_LOG_2 10 -#endif - - -/*************/ -/* CLASS D */ -/*************/ -#if CLASS == 'D' -#define TOTAL_KEYS_LOG_2 29 -#define MAX_KEY_LOG_2 27 -#define NUM_BUCKETS_LOG_2 10 -#undef MIN_PROCS -#define MIN_PROCS 4 -#endif - - -#define TOTAL_KEYS (1 << TOTAL_KEYS_LOG_2) -#define MAX_KEY (1 << MAX_KEY_LOG_2) -#define NUM_BUCKETS (1 << NUM_BUCKETS_LOG_2) -#define NUM_KEYS (TOTAL_KEYS/NUM_PROCS*MIN_PROCS) - -/*****************************************************************/ -/* On larger number of processors, since the keys are (roughly) */ -/* gaussian distributed, the first and last processor sort keys */ -/* in a large interval, requiring array sizes to be larger. Note */ -/* that for large NUM_PROCS, NUM_KEYS is, however, a small number*/ -/* The required array size also depends on the bucket size used. */ -/* The following values are validated for the 1024-bucket setup. */ -/*****************************************************************/ -#if NUM_PROCS < 256 -#define SIZE_OF_BUFFERS 3*NUM_KEYS/2 -#elif NUM_PROCS < 512 -#define SIZE_OF_BUFFERS 5*NUM_KEYS/2 -#elif NUM_PROCS < 1024 -#define SIZE_OF_BUFFERS 4*NUM_KEYS -#else -#define SIZE_OF_BUFFERS 13*NUM_KEYS/2 -#endif - -/*****************************************************************/ -/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */ -/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE */ -/* MAX_PROCS AT YOUR PERIL */ -/*****************************************************************/ -#if CLASS == 'S' -#define MAX_PROCS 128 -#else -#define MAX_PROCS 1024 -#endif - -#define MAX_ITERATIONS 10 -#define TEST_ARRAY_SIZE 5 - - -/***********************************/ -/* Enable separate communication, */ -/* computation timing and printout */ -/***********************************/ -/* #define TIMING_ENABLED */ - - -/*************************************/ -/* Typedef: if necessary, change the */ -/* size of int here by changing the */ -/* int type to, say, long */ -/*************************************/ -typedef int INT_TYPE; -typedef long INT_TYPE2; -#define MP_KEY_TYPE MPI_INT - - -typedef struct { - -/********************/ -/* MPI properties: */ -/********************/ -int my_rank, - comm_size; - - -/********************/ -/* Some global info */ -/********************/ -INT_TYPE *key_buff_ptr_global, /* used by full_verify to get */ - total_local_keys, /* copies of rank info */ - total_lesser_keys; - - -int passed_verification; - - - -/************************************/ -/* These are the three main arrays. */ -/* See SIZE_OF_BUFFERS def above */ -/************************************/ -INT_TYPE key_array[SIZE_OF_BUFFERS], - key_buff1[SIZE_OF_BUFFERS], - key_buff2[SIZE_OF_BUFFERS], - bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE], /* Top 5 elements for */ - bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals */ - bucket_ptrs[NUM_BUCKETS], - process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE], - process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE]; -int send_count[MAX_PROCS], recv_count[MAX_PROCS], - send_displ[MAX_PROCS], recv_displ[MAX_PROCS]; - - -/**********************/ -/* Partial verif info */ -/**********************/ -INT_TYPE2 test_index_array[TEST_ARRAY_SIZE], - test_rank_array[TEST_ARRAY_SIZE]; - -/**********/ -/* Timers */ -/**********/ -double start[64], elapsed[64]; - -} global_data; - - -const INT_TYPE2 - S_test_index_array[TEST_ARRAY_SIZE] = - {48427,17148,23627,62548,4431}, - S_test_rank_array[TEST_ARRAY_SIZE] = - {0,18,346,64917,65463}, - - W_test_index_array[TEST_ARRAY_SIZE] = - {357773,934767,875723,898999,404505}, - W_test_rank_array[TEST_ARRAY_SIZE] = - {1249,11698,1039987,1043896,1048018}, - - A_test_index_array[TEST_ARRAY_SIZE] = - {2112377,662041,5336171,3642833,4250760}, - A_test_rank_array[TEST_ARRAY_SIZE] = - {104,17523,123928,8288932,8388264}, - - B_test_index_array[TEST_ARRAY_SIZE] = - {41869,812306,5102857,18232239,26860214}, - B_test_rank_array[TEST_ARRAY_SIZE] = - {33422937,10244,59149,33135281,99}, - - C_test_index_array[TEST_ARRAY_SIZE] = - {44172927,72999161,74326391,129606274,21736814}, - C_test_rank_array[TEST_ARRAY_SIZE] = - {61147,882988,266290,133997595,133525895}, - - D_test_index_array[TEST_ARRAY_SIZE] = - {1317351170,995930646,1157283250,1503301535,1453734525}, - D_test_rank_array[TEST_ARRAY_SIZE] = - {1,36538729,1978098519,2145192618,2147425337}; - - - -/***********************/ -/* function prototypes */ -/***********************/ -double randlc( double *X, double *A ); - -void full_verify( global_data* gd ); - -void c_print_results( char *name, - char class, - int n1, - int n2, - int n3, - int niter, - int nprocs_compiled, - int nprocs_total, - double t, - double mops, - char *optype, - int passed_verification, - char *npbversion, - char *compiletime, - char *mpicc, - char *clink, - char *cmpi_lib, - char *cmpi_inc, - char *cflags, - char *clinkflags ); - -void timer_clear(global_data* gd, int n ); -void timer_start(global_data* gd, int n ); -void timer_stop(global_data* gd, int n ); -double timer_read(global_data* gd, int n ); - -void timer_clear(global_data* gd, int n ) { - gd->elapsed[n] = 0.0; -} - -void timer_start(global_data* gd, int n ) { - gd->start[n] = MPI_Wtime(); -} - -void timer_stop(global_data* gd, int n ) { - gd->elapsed[n] += MPI_Wtime() - gd->start[n]; -} - -double timer_read(global_data* gd, int n ) { - return gd->elapsed[n]; -} - - -/* - * FUNCTION RANDLC (X, A) - * - * This routine returns a uniform pseudorandom double precision number in the - * range (0, 1) by using the linear congruential generator - * - * x_{k+1} = a x_k (mod 2^46) - * - * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers - * before repeating. The argument A is the same as 'a' in the above formula, - * and X is the same as x_0. A and X must be odd double precision integers - * in the range (1, 2^46). The returned value RANDLC is normalized to be - * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain - * the new seed x_1, so that subsequent calls to RANDLC using the same - * arguments will generate a continuous sequence. - * - * This routine should produce the same results on any computer with at least - * 48 mantissa bits in double precision floating point data. On Cray systems, - * double precision should be disabled. - * - * David H. Bailey October 26, 1990 - * - * IMPLICIT DOUBLE PRECISION (A-H, O-Z) - * SAVE KS, R23, R46, T23, T46 - * DATA KS/0/ - * - * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46, - * T23 = 2 ^ 23, and T46 = 2 ^ 46. These are computed in loops, rather than - * by merely using the ** operator, in order to insure that the results are - * exact on all systems. This code assumes that 0.5D0 is represented exactly. - */ - - -/*****************************************************************/ -/************* R A N D L C ************/ -/************* ************/ -/************* portable random number generator ************/ -/*****************************************************************/ - -double randlc( double *X, double *A ) -{ - static int KS=0; - static double R23, R46, T23, T46; - double T1, T2, T3, T4; - double A1; - double A2; - double X1; - double X2; - double Z; - int i, j; - - if (KS == 0) - { - R23 = 1.0; - R46 = 1.0; - T23 = 1.0; - T46 = 1.0; - - for (i=1; i<=23; i++) - { - R23 = 0.50 * R23; - T23 = 2.0 * T23; - } - for (i=1; i<=46; i++) - { - R46 = 0.50 * R46; - T46 = 2.0 * T46; - } - KS = 1; - } - -/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */ - - T1 = R23 * *A; - j = T1; - A1 = j; - A2 = *A - T23 * A1; - -/* Break X into two parts such that X = 2^23 * X1 + X2, compute - Z = A1 * X2 + A2 * X1 (mod 2^23), and then - X = 2^23 * Z + A2 * X2 (mod 2^46). */ - - T1 = R23 * *X; - j = T1; - X1 = j; - X2 = *X - T23 * X1; - T1 = A1 * X2 + A2 * X1; - - j = R23 * T1; - T2 = j; - Z = T1 - T23 * T2; - T3 = T23 * Z + A2 * X2; - j = R46 * T3; - T4 = j; - *X = T3 - T46 * T4; - return(R46 * *X); -} - - - -/*****************************************************************/ -/************ F I N D _ M Y _ S E E D ************/ -/************ ************/ -/************ returns parallel random number seq seed ************/ -/*****************************************************************/ - -/* - * Create a random number sequence of total length nn residing - * on np number of processors. Each processor will therefore have a - * subsequence of length nn/np. This routine returns that random - * number which is the first random number for the subsequence belonging - * to processor rank kn, and which is used as seed for proc kn ran # gen. - */ - -double find_my_seed( int kn, /* my processor rank, 0<=kn<=num procs */ - int np, /* np = num procs */ - long nn, /* total num of ran numbers, all procs */ - double s, /* Ran num seed, for ex.: 314159265.00 */ - double a ) /* Ran num gen mult, try 1220703125.00 */ -{ - - long i; - - double t1,t2,t3,an; - long mq,nq,kk,ik; - - - - nq = nn / np; - - for( mq=0; nq>1; mq++,nq/=2 ) - ; - - t1 = a; - - for( i=1; i<=mq; i++ ) - t2 = randlc( &t1, &t1 ); - - an = t1; - - kk = kn; - t1 = s; - t2 = an; - - for( i=1; i<=100; i++ ) - { - ik = kk / 2; - if( 2 * ik != kk ) - t3 = randlc( &t1, &t2 ); - if( ik == 0 ) - break; - t3 = randlc( &t2, &t2 ); - kk = ik; - } - - return( t1 ); - -} - - - - -/*****************************************************************/ -/************* C R E A T E _ S E Q ************/ -/*****************************************************************/ - -void create_seq( global_data* gd, double seed, double a ) -{ - double x; - int i, k; - - k = MAX_KEY/4; - - for (i=0; ikey_array[i] = k*x; - } -} - - - - -/*****************************************************************/ -/************* F U L L _ V E R I F Y ************/ -/*****************************************************************/ - - -void full_verify( global_data* gd ) -{ - MPI_Status status; - MPI_Request request; - - INT_TYPE i, j; - INT_TYPE k, last_local_key; - - -/* Now, finally, sort the keys: */ - for( i=0; itotal_local_keys; i++ ) - gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]- - gd->total_lesser_keys] = gd->key_buff2[i]; - last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1); - -/* Send largest key value to next processor */ - if( gd->my_rank > 0 ) - MPI_Irecv( &k, - 1, - MP_KEY_TYPE, - gd->my_rank-1, - 1000, - MPI_COMM_WORLD, - &request ); - if( gd->my_rank < gd->comm_size-1 ) - MPI_Send( &gd->key_array[last_local_key], - 1, - MP_KEY_TYPE, - gd->my_rank+1, - 1000, - MPI_COMM_WORLD ); - if( gd->my_rank > 0 ) - MPI_Wait( &request, &status ); - -/* Confirm that neighbor's greatest key value - is not greater than my least key value */ - j = 0; - if( gd->my_rank > 0 && gd->total_local_keys > 0 ) - if( k > gd->key_array[0] ) - j++; - - -/* Confirm keys correctly sorted: count incorrectly sorted keys, if any */ - for( i=1; itotal_local_keys; i++ ) - if( gd->key_array[i-1] > gd->key_array[i] ) - j++; - - - if( j != 0 ) - { - printf( "Processor %d: Full_verify: number of keys out of sort: %d\n", - gd->my_rank, j ); - } - else - gd->passed_verification++; - - -} - - - - -/*****************************************************************/ -/************* R A N K ****************/ -/*****************************************************************/ - - -void rank( global_data* gd, int iteration ) -{ - - INT_TYPE i, k; - - INT_TYPE shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2; - INT_TYPE key; - INT_TYPE2 bucket_sum_accumulator, j, m; - INT_TYPE local_bucket_sum_accumulator; - INT_TYPE min_key_val, max_key_val; - INT_TYPE *key_buff_ptr; - - - - -/* Iteration alteration of keys */ - if(gd->my_rank == 0 ) - { - gd->key_array[iteration] = iteration; - gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration; - } - - -/* Initialize */ - for( i=0; ibucket_size[i] = 0; - gd->bucket_size_totals[i] = 0; - gd->process_bucket_distrib_ptr1[i] = 0; - gd->process_bucket_distrib_ptr2[i] = 0; - } - - -/* Determine where the partial verify test keys are, load into */ -/* top of array bucket_size */ - for( i=0; itest_index_array[i]/NUM_KEYS) == gd->my_rank ) - gd->bucket_size[NUM_BUCKETS+i] = - gd->key_array[gd->test_index_array[i] % NUM_KEYS]; - - -/* Determine the number of keys in each bucket */ - for( i=0; ibucket_size[gd->key_array[i] >> shift]++; - - -/* Accumulative bucket sizes are the bucket pointers */ - gd->bucket_ptrs[0] = 0; - for( i=1; i< NUM_BUCKETS; i++ ) - gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1]; - - -/* Sort into appropriate bucket */ - for( i=0; ikey_array[i]; - gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key; - } - -#ifdef TIMING_ENABLED - timer_stop(gd, 2 ); - timer_start(gd, 3 ); -#endif - -/* Get the bucket size totals for the entire problem. These - will be used to determine the redistribution of keys */ - MPI_Allreduce( gd->bucket_size, - gd->bucket_size_totals, - NUM_BUCKETS+TEST_ARRAY_SIZE, - MP_KEY_TYPE, - MPI_SUM, - MPI_COMM_WORLD ); - -#ifdef TIMING_ENABLED - timer_stop(gd, 3 ); - timer_start(gd, 2 ); -#endif - -/* Determine Redistibution of keys: accumulate the bucket size totals - till this number surpasses NUM_KEYS (which the average number of keys - per processor). Then all keys in these buckets go to processor 0. - Continue accumulating again until supassing 2*NUM_KEYS. All keys - in these buckets go to processor 1, etc. This algorithm guarantees - that all processors have work ranking; no processors are left idle. - The optimum number of buckets, however, does not result in as high - a degree of load balancing (as even a distribution of keys as is - possible) as is obtained from increasing the number of buckets, but - more buckets results in more computation per processor so that the - optimum number of buckets turns out to be 1024 for machines tested. - Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket - number of first and last bucket which each processor will have after - the redistribution is done. */ - - bucket_sum_accumulator = 0; - local_bucket_sum_accumulator = 0; - gd->send_displ[0] = 0; - gd->process_bucket_distrib_ptr1[0] = 0; - for( i=0, j=0; ibucket_size_totals[i]; - local_bucket_sum_accumulator += gd->bucket_size[i]; - if( bucket_sum_accumulator >= (j+1)*NUM_KEYS ) - { - gd->send_count[j] = local_bucket_sum_accumulator; - if( j != 0 ) - { - gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1]; - gd->process_bucket_distrib_ptr1[j] = - gd->process_bucket_distrib_ptr2[j-1]+1; - } - gd->process_bucket_distrib_ptr2[j++] = i; - local_bucket_sum_accumulator = 0; - } - } - -/* When NUM_PROCS approaching NUM_BUCKETS, it is highly possible - that the last few processors don't get any buckets. So, we - need to set counts properly in this case to avoid any fallouts. */ - while( j < gd->comm_size ) - { - gd->send_count[j] = 0; - gd->process_bucket_distrib_ptr1[j] = 1; - j++; - } - -#ifdef TIMING_ENABLED - timer_stop(gd, 2 ); - timer_start(gd, 3 ); -#endif - -/* This is the redistribution section: first find out how many keys - each processor will send to every other processor: */ - MPI_Alltoall( gd->send_count, - 1, - MPI_INT, - gd->recv_count, - 1, - MPI_INT, - MPI_COMM_WORLD ); - -/* Determine the receive array displacements for the buckets */ - gd->recv_displ[0] = 0; - for( i=1; icomm_size; i++ ) - gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1]; - - -/* Now send the keys to respective processors */ - MPI_Alltoallv( gd->key_buff1, - gd->send_count, - gd->send_displ, - MP_KEY_TYPE, - gd->key_buff2, - gd->recv_count, - gd->recv_displ, - MP_KEY_TYPE, - MPI_COMM_WORLD ); - -#ifdef TIMING_ENABLED - timer_stop(gd, 3 ); - timer_start(gd, 2 ); -#endif - -/* The starting and ending bucket numbers on each processor are - multiplied by the interval size of the buckets to obtain the - smallest possible min and greatest possible max value of any - key on each processor */ - min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift; - max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1; - -/* Clear the work array */ - for( i=0; ikey_buff1[i] = 0; - -/* Determine the total number of keys on all other - processors holding keys of lesser value */ - m = 0; - for( k=0; kmy_rank; k++ ) - for( i= gd->process_bucket_distrib_ptr1[k]; - i<=gd->process_bucket_distrib_ptr2[k]; - i++ ) - m += gd->bucket_size_totals[i]; /* m has total # of lesser keys */ - -/* Determine total number of keys on this processor */ - j = 0; - for( i= gd->process_bucket_distrib_ptr1[gd->my_rank]; - i<=gd->process_bucket_distrib_ptr2[gd->my_rank]; - i++ ) - j += gd->bucket_size_totals[i]; /* j has total # of local keys */ - - -/* Ranking of all keys occurs in this section: */ -/* shift it backwards so no subtractions are necessary in loop */ - key_buff_ptr = gd->key_buff1 - min_key_val; - -/* In this section, the keys themselves are used as their - own indexes to determine how many of each there are: their - individual population */ - for( i=0; ikey_buff2[i]]++; /* Now they have individual key */ - /* population */ - -/* To obtain ranks of each key, successively add the individual key - population, not forgetting the total of lesser keys, m. - NOTE: Since the total of lesser keys would be subtracted later - in verification, it is no longer added to the first key population - here, but still needed during the partial verify test. This is to - ensure that 32-bit key_buff can still be used for class D. */ -/* key_buff_ptr[min_key_val] += m; */ - for( i=min_key_val; ibucket_size_totals[i+NUM_BUCKETS]; /* Keys were hidden here */ - if( min_key_val <= k && k <= max_key_val ) - { - /* Add the total of lesser keys, m, here */ - INT_TYPE2 key_rank = key_buff_ptr[k-1] + m; - int failed = 0; - - switch( CLASS ) - { - case 'S': - if( i <= 2 ) - { - if( key_rank != gd->test_rank_array[i]+iteration ) - failed = 1; - else - gd->passed_verification++; - } - else - { - if( key_rank != gd->test_rank_array[i]-iteration ) - failed = 1; - else - gd->passed_verification++; - } - break; - case 'W': - if( i < 2 ) - { - if( key_rank != gd->test_rank_array[i]+(iteration-2) ) - failed = 1; - else - gd->passed_verification++; - } - else - { - if( key_rank != gd->test_rank_array[i]-iteration ) - failed = 1; - else - gd->passed_verification++; - } - break; - case 'A': - if( i <= 2 ) - { - if( key_rank != gd->test_rank_array[i]+(iteration-1) ) - failed = 1; - else - gd->passed_verification++; - } - else - { - if( key_rank != gd->test_rank_array[i]-(iteration-1) ) - failed = 1; - else - gd->passed_verification++; - } - break; - case 'B': - if( i == 1 || i == 2 || i == 4 ) - { - if( key_rank != gd->test_rank_array[i]+iteration ) - failed = 1; - else - gd->passed_verification++; - } - else - { - if( key_rank != gd->test_rank_array[i]-iteration ) - failed = 1; - else - gd->passed_verification++; - } - break; - case 'C': - if( i <= 2 ) - { - if( key_rank != gd->test_rank_array[i]+iteration ) - failed = 1; - else - gd->passed_verification++; - } - else - { - if( key_rank != gd->test_rank_array[i]-iteration ) - failed = 1; - else - gd->passed_verification++; - } - break; - case 'D': - if( i < 2 ) - { - if( key_rank != gd->test_rank_array[i]+iteration ) - failed = 1; - else - gd->passed_verification++; - } - else - { - if( key_rank != gd->test_rank_array[i]-iteration ) - failed = 1; - else - gd->passed_verification++; - } - break; - } - if( failed == 1 ) - printf( "Failed partial verification: " - "iteration %d, processor %d, test key %d\n", - iteration, gd->my_rank, (int)i ); - } - } - - - - -/* Make copies of rank info for use by full_verify: these variables - in rank are local; making them global slows down the code, probably - since they cannot be made register by compiler */ - - if( iteration == MAX_ITERATIONS ) - { - gd->key_buff_ptr_global = key_buff_ptr; - gd->total_local_keys = j; - gd->total_lesser_keys = 0; /* no longer set to 'm', see note above */ - } - -} - - -/*****************************************************************/ -/************* M A I N ****************/ -/*****************************************************************/ - -int main( int argc, char **argv ) -{ - - int i, iteration, itemp; - - double timecounter, maxtime; - - global_data* gd = malloc(sizeof(global_data)); -/* Initialize MPI */ - MPI_Init( &argc, &argv ); - MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank ); - MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size ); - -/* Initialize the verification arrays if a valid class */ - for( i=0; itest_index_array[i] = S_test_index_array[i]; - gd->test_rank_array[i] = S_test_rank_array[i]; - break; - case 'A': - gd->test_index_array[i] = A_test_index_array[i]; - gd->test_rank_array[i] = A_test_rank_array[i]; - break; - case 'W': - gd->test_index_array[i] = W_test_index_array[i]; - gd->test_rank_array[i] = W_test_rank_array[i]; - break; - case 'B': - gd->test_index_array[i] = B_test_index_array[i]; - gd->test_rank_array[i] = B_test_rank_array[i]; - break; - case 'C': - gd->test_index_array[i] = C_test_index_array[i]; - gd->test_rank_array[i] = C_test_rank_array[i]; - break; - case 'D': - gd->test_index_array[i] = D_test_index_array[i]; - gd->test_rank_array[i] = D_test_rank_array[i]; - break; - }; - - - -/* Printout initial NPB info */ - if( gd->my_rank == 0 ) - { - printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" ); - printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS ); - printf( " Iterations: %d\n", MAX_ITERATIONS ); - printf( " Number of processes: %d\n",gd->comm_size ); - } - -/* Check that actual and compiled number of processors agree */ - if( gd->comm_size != NUM_PROCS ) - { - if( gd->my_rank == 0 ) - printf( "\n ERROR: compiled for %d processes\n" - " Number of active processes: %d\n" - " Exiting program!\n\n", NUM_PROCS, gd->comm_size ); - MPI_Finalize(); - exit( 1 ); - } - -/* Check to see whether total number of processes is within bounds. - This could in principle be checked in setparams.c, but it is more - convenient to do it here */ - if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS) - { - if( gd->my_rank == 0 ) - printf( "\n ERROR: number of processes %d not within range %d-%d" - "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS); - MPI_Finalize(); - exit( 1 ); - } - - -/* Generate random number sequence and subsequent keys on all procs */ - create_seq(gd, find_my_seed( gd->my_rank, - gd->comm_size, - 4*(long)TOTAL_KEYS*MIN_PROCS, - 314159265.00, /* Random number gen seed */ - 1220703125.00 ), /* Random number gen mult */ - 1220703125.00 ); /* Random number gen mult */ - -/* Do one interation for free (i.e., untimed) to guarantee initialization of - all data and code pages and respective tables */ - rank(gd, 1 ); - -/* Start verification counter */ - gd->passed_verification = 0; - - if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n iteration\n" ); - -/* Initialize timer */ - timer_clear(gd, 0 ); - -/* Initialize separate communication, computation timing */ -#ifdef TIMING_ENABLED - for( i=1; i<=3; i++ ) timer_clear(gd, i ); -#endif - -/* Start timer */ - timer_start(gd, 0 ); - -#ifdef TIMING_ENABLED - timer_start(gd, 1 ); - timer_start(gd, 2 ); -#endif - - char smpi_category[100]; - snprintf (smpi_category, 100, "%d", gd->my_rank); - TRACE_smpi_set_category (smpi_category); - -/* This is the main iteration */ - for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) - { - if( gd->my_rank == 0 && CLASS != 'S' ) printf( " %d\n", iteration ); - rank(gd, iteration ); - } - TRACE_smpi_set_category (NULL); - -#ifdef TIMING_ENABLED - timer_stop(gd, 2 ); - timer_stop(gd, 1 ); -#endif - -/* Stop timer, obtain time for processors */ - timer_stop(gd, 0 ); - - timecounter = timer_read(gd, 0 ); - -/* End of timing, obtain maximum time of all processors */ - MPI_Reduce( &timecounter, - &maxtime, - 1, - MPI_DOUBLE, - MPI_MAX, - 0, - MPI_COMM_WORLD ); - -#ifdef TIMING_ENABLED - { - double tmin, tsum, tmax; - - if( my_rank == 0 ) - { - printf( "\ntimer 1/2/3 = total/computation/communication time\n"); - printf( " min avg max\n" ); - } - for( i=1; i<=3; i++ ) - { - timecounter = timer_read(gd, i ); - MPI_Reduce( &timecounter, - &tmin, - 1, - MPI_DOUBLE, - MPI_MIN, - 0, - MPI_COMM_WORLD ); - MPI_Reduce( &timecounter, - &tsum, - 1, - MPI_DOUBLE, - MPI_SUM, - 0, - MPI_COMM_WORLD ); - MPI_Reduce( &timecounter, - &tmax, - 1, - MPI_DOUBLE, - MPI_MAX, - 0, - MPI_COMM_WORLD ); - if( my_rank == 0 ) - printf( "timer %d: %f %f %f\n", - i, tmin, tsum/((double) comm_size), tmax ); - } - if( my_rank == 0 ) - printf( "\n" ); - } -#endif - -/* This tests that keys are in sequence: sorting of last ranked key seq - occurs here, but is an untimed operation */ - full_verify(gd); - - -/* Obtain verification counter sum */ - itemp =gd->passed_verification; - MPI_Reduce( &itemp, - &gd->passed_verification, - 1, - MPI_INT, - MPI_SUM, - 0, - MPI_COMM_WORLD ); - - - -/* The final printout */ - if( gd->my_rank == 0 ) - { - if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size ) - gd->passed_verification = 0; - c_print_results( "IS", - CLASS, - (int)(TOTAL_KEYS), - MIN_PROCS, - 0, - MAX_ITERATIONS, - NUM_PROCS, - gd->comm_size, - maxtime, - ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS) - /maxtime/1000000., - "keys ranked", - gd->passed_verification, - NPBVERSION, - COMPILETIME, - MPICC, - CLINK, - CMPI_LIB, - CMPI_INC, - CFLAGS, - CLINKFLAGS ); - } - - MPI_Finalize(); - free(gd); - - return 0; - /**************************/ -} /* E N D P R O G R A M */ - /**************************/