1 !-------------------------------------------------------------------------!
3 ! N A S P A R A L L E L B E N C H M A R K S 3.3 !
7 !-------------------------------------------------------------------------!
9 ! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
10 ! It is described in NAS Technical Reports 95-020 and 02-007 !
12 ! Permission to use, copy, distribute and modify this software !
13 ! for any purpose with or without fee is hereby granted. We !
14 ! request, however, that all derived work reference the NAS !
15 ! Parallel Benchmarks 3.3. This software is provided "as is" !
16 ! without express or implied warranty. !
18 ! Information on NPB 3.3, including the technical report, the !
19 ! original specifications, source code, results and information !
20 ! on how to submit new results, is available at: !
22 ! http://www.nas.nasa.gov/Software/NPB/ !
24 ! Send comments or suggestions to npb@nas.nasa.gov !
26 ! NAS Parallel Benchmarks Group !
27 ! NASA Ames Research Center !
29 ! Moffett Field, CA 94035-1000 !
31 ! E-mail: npb@nas.nasa.gov !
32 ! Fax: (650) 604-3957 !
34 !-------------------------------------------------------------------------!
37 c---------------------------------------------------------------------
41 c R. F. Van der Wijngaart
44 c---------------------------------------------------------------------
47 c---------------------------------------------------------------------
48 c---------------------------------------------------------------------
50 c---------------------------------------------------------------------
51 c---------------------------------------------------------------------
c
c MPI bookkeeping: status/request for the nonblocking reduce
c exchanges, ierr for MPI error codes.
57 integer status(MPI_STATUS_SIZE), request, ierr
61 c---------------------------------------------------------------------
62 c num_procs must be a power of 2, and num_procs=num_proc_cols*num_proc_rows.
63 c num_proc_cols and num_proc_rows are to be found in npbparams.h.
64 c When num_procs is not square, then num_proc_cols must be = 2*num_proc_rows.
65 c---------------------------------------------------------------------
67 parameter( num_procs = num_proc_cols * num_proc_rows )
71 c---------------------------------------------------------------------
72 c Class specific parameters:
73 c It appears here for reference only.
74 c These are their values, however, this info is imported in the npbparams.h
75 c include file, which is written by the sys/setparams.c program.
76 c---------------------------------------------------------------------
c
c Commented-out (CC) per-class reference values of na; the active
c value comes from npbparams.h.
81 CC parameter( na=1400,
89 CC parameter( na=7000,
97 CC parameter( na=14000,
105 CC parameter( na=75000,
113 CC parameter( na=150000,
121 CC parameter( na=1500000,
129 CC parameter( na=9000000,
c
c nz: per-process upper bound on the number of stored nonzeros.
138 parameter( nz = na*(nonzer+1)/num_procs*(nonzer+1)+nonzer
139 > + na*(nonzer+2+num_procs/256)/num_proc_cols )
143 common / partit_size / naa, nzz,
145 > proc_col, proc_row,
156 > proc_col, proc_row,
c
c Main integer storage: sparse-matrix structure (colidx/rowstr) and
c scratch index arrays (iv, arow, acol) used by makea/sparse.
167 common / main_int_mem / colidx, rowstr,
169 integer colidx(nz), rowstr(na+1),
170 > iv(2*na+1), arow(nz), acol(nz)
c
c Main floating-point storage: matrix values plus the CG vectors,
c each sized for one row-block (na/num_proc_rows+2) of the partition.
173 common / main_flt_mem / v, aelt, a,
180 double precision v(na+1), aelt(nz), a(nz),
181 > x(na/num_proc_rows+2),
182 > z(na/num_proc_rows+2),
183 > p(na/num_proc_rows+2),
184 > q(na/num_proc_rows+2),
185 > r(na/num_proc_rows+2),
186 > w(na/num_proc_rows+2)
c
c Random-number generator state shared with randlc.
189 common /urando/ amult, tran
190 double precision amult, tran
c
c Reduce-phase communication schedule, one entry per column level.
195 integer reduce_exch_proc(num_proc_cols)
196 integer reduce_send_starts(num_proc_cols)
197 integer reduce_send_lengths(num_proc_cols)
198 integer reduce_recv_starts(num_proc_cols)
199 integer reduce_recv_lengths(num_proc_cols)
c
c Scalars for the power-method iteration, timing and verification.
203 double precision zeta, randlc
205 double precision rnorm
206 double precision norm_temp1(2), norm_temp2(2)
208 double precision t, tmax, mflops
210 double precision timer_read
213 double precision zeta_verify_value, epsilon, err
216 c---------------------------------------------------------------------
217 c Set up mpi initialization and number of proc testing
218 c---------------------------------------------------------------------
c
c Select the reference zeta for this (na, nonzer, niter, shift)
c combination so the run can be verified; one branch per problem
c class (S, W, A, B, C, D, E).
222 if( na .eq. 1400 .and.
223 & nonzer .eq. 7 .and.
224 & niter .eq. 15 .and.
225 & shift .eq. 10.d0 ) then
227 zeta_verify_value = 8.5971775078648d0
228 else if( na .eq. 7000 .and.
229 & nonzer .eq. 8 .and.
230 & niter .eq. 15 .and.
231 & shift .eq. 12.d0 ) then
233 zeta_verify_value = 10.362595087124d0
234 else if( na .eq. 14000 .and.
235 & nonzer .eq. 11 .and.
236 & niter .eq. 15 .and.
237 & shift .eq. 20.d0 ) then
239 zeta_verify_value = 17.130235054029d0
240 else if( na .eq. 75000 .and.
241 & nonzer .eq. 13 .and.
242 & niter .eq. 75 .and.
243 & shift .eq. 60.d0 ) then
245 zeta_verify_value = 22.712745482631d0
246 else if( na .eq. 150000 .and.
247 & nonzer .eq. 15 .and.
248 & niter .eq. 75 .and.
249 & shift .eq. 110.d0 ) then
251 zeta_verify_value = 28.973605592845d0
252 else if( na .eq. 1500000 .and.
253 & nonzer .eq. 21 .and.
254 & niter .eq. 100 .and.
255 & shift .eq. 500.d0 ) then
257 zeta_verify_value = 52.514532105794d0
258 else if( na .eq. 9000000 .and.
259 & nonzer .eq. 26 .and.
260 & niter .eq. 100 .and.
261 & shift .eq. 1.5d3 ) then
263 zeta_verify_value = 77.522164599383d0
c
c Root process prints the benchmark banner and run parameters.
268 if( me .eq. root )then
271 write( *,1002 ) niter
272 write( *,1003 ) nprocs
273 write( *,1004 ) nonzer
274 write( *,1005 ) shift
275 1000 format(//,' NAS Parallel Benchmarks 3.3 -- CG Benchmark', /)
276 1001 format(' Size: ', i10 )
277 1002 format(' Iterations: ', i5 )
278 1003 format(' Number of active processes: ', i5 )
279 1004 format(' Number of nonzeroes per row: ', i8)
280 1005 format(' Eigenvalue shift: ', e8.3)
c
c Pick the MPI datatype for double precision transfers
c (convertdouble selects an alternative representation).
283 if (.not. convertdouble) then
284 dp_type = MPI_DOUBLE_PRECISION
294 c---------------------------------------------------------------------
295 c Set up processor info, such as whether sq num of procs, etc
296 c---------------------------------------------------------------------
297 call setup_proc_info( num_procs,
302 c---------------------------------------------------------------------
303 c Set up partition's submatrix info: firstcol, lastcol, firstrow, lastrow
304 c---------------------------------------------------------------------
305 call setup_submatrix_info( l2npcols,
307 > reduce_send_starts,
308 > reduce_send_lengths,
309 > reduce_recv_starts,
310 > reduce_recv_lengths )
314 c---------------------------------------------------------------------
315 c Initialize random number generator
316 c---------------------------------------------------------------------
318 amult = 1220703125.0D0
319 zeta = randlc( tran, amult )
321 c---------------------------------------------------------------------
322 c Set up partition's sparse random matrix for given class size
323 c---------------------------------------------------------------------
324 call makea(naa, nzz, a, colidx, rowstr, nonzer,
325 > firstrow, lastrow, firstcol, lastcol,
326 > rcond, arow, acol, aelt, v, iv, shift)
330 c---------------------------------------------------------------------
331 c Note: as a result of the above call to makea:
332 c values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1
333 c values of colidx which are col indexes go from firstcol --> lastcol
335 c Shift the col index vals from actual (firstcol --> lastcol )
336 c to local, i.e., (1 --> lastcol-firstcol+1)
337 c---------------------------------------------------------------------
338 do j=1,lastrow-firstrow+1
339 do k=rowstr(j),rowstr(j+1)-1
340 colidx(k) = colidx(k) - firstcol + 1
344 c---------------------------------------------------------------------
345 c set starting vector to (1, 1, .... 1)
346 c---------------------------------------------------------------------
347 do i = 1, na/num_proc_rows+1
353 c---------------------------------------------------------------------
355 c Do one iteration untimed to init all code and data page tables
356 c----> (then reinit, start timing, to niter its)
357 c---------------------------------------------------------------------
360 c---------------------------------------------------------------------
361 c The call to the conjugate gradient routine:
362 c---------------------------------------------------------------------
363 call conj_grad ( colidx,
375 > reduce_send_starts,
376 > reduce_send_lengths,
377 > reduce_recv_starts,
378 > reduce_recv_lengths )
380 c---------------------------------------------------------------------
381 c zeta = shift + 1/(x.z)
383 c Also, find norm of z
385 c---------------------------------------------------------------------
c
c Local partial sums of x.z and z.z over this process's columns...
386 norm_temp1(1) = 0.0d0
387 norm_temp1(2) = 0.0d0
388 do j=1, lastcol-firstcol+1
389 norm_temp1(1) = norm_temp1(1) + x(j)*z(j)
390 norm_temp1(2) = norm_temp1(2) + z(j)*z(j)
c
c ...then combined across processes via the pairwise
c irecv/send/wait exchange schedule (an allreduce by hand).
394 call mpi_irecv( norm_temp2,
397 > reduce_exch_proc(i),
402 call mpi_send( norm_temp1,
405 > reduce_exch_proc(i),
409 call mpi_wait( request, status, ierr )
411 norm_temp1(1) = norm_temp1(1) + norm_temp2(1)
412 norm_temp1(2) = norm_temp1(2) + norm_temp2(2)
415 norm_temp1(2) = 1.0d0 / sqrt( norm_temp1(2) )
418 c---------------------------------------------------------------------
419 c Normalize z to obtain x
420 c---------------------------------------------------------------------
421 do j=1, lastcol-firstcol+1
422 x(j) = norm_temp1(2)*z(j)
426 enddo ! end of do one iteration untimed
429 c---------------------------------------------------------------------
430 c set starting vector to (1, 1, .... 1)
431 c---------------------------------------------------------------------
433 c NOTE: a questionable limit on size: should this be na/num_proc_cols+1 ?
435 do i = 1, na/num_proc_rows+1
441 c---------------------------------------------------------------------
442 c Synchronize and start timing
443 c---------------------------------------------------------------------
444 call mpi_barrier( mpi_comm_world,
447 call timer_clear( 1 )
448 call timer_start( 1 )
450 c---------------------------------------------------------------------
452 c Main Iteration for inverse power method
454 c---------------------------------------------------------------------
457 c---------------------------------------------------------------------
458 c The call to the conjugate gradient routine:
459 c---------------------------------------------------------------------
460 call conj_grad ( colidx,
472 > reduce_send_starts,
473 > reduce_send_lengths,
474 > reduce_recv_starts,
475 > reduce_recv_lengths )
478 c---------------------------------------------------------------------
479 c zeta = shift + 1/(x.z)
481 c Also, find norm of z
483 c---------------------------------------------------------------------
c
c Same manual allreduce of (x.z, z.z) as in the untimed iteration.
484 norm_temp1(1) = 0.0d0
485 norm_temp1(2) = 0.0d0
486 do j=1, lastcol-firstcol+1
487 norm_temp1(1) = norm_temp1(1) + x(j)*z(j)
488 norm_temp1(2) = norm_temp1(2) + z(j)*z(j)
492 call mpi_irecv( norm_temp2,
495 > reduce_exch_proc(i),
500 call mpi_send( norm_temp1,
503 > reduce_exch_proc(i),
507 call mpi_wait( request, status, ierr )
509 norm_temp1(1) = norm_temp1(1) + norm_temp2(1)
510 norm_temp1(2) = norm_temp1(2) + norm_temp2(2)
513 norm_temp1(2) = 1.0d0 / sqrt( norm_temp1(2) )
c
c Root reports the residual norm and current zeta each iteration.
516 if( me .eq. root )then
517 zeta = shift + 1.0d0 / norm_temp1(1)
518 if( it .eq. 1 ) write( *,9000 )
519 write( *,9001 ) it, rnorm, zeta
521 9000 format( /,' iteration ||r|| zeta' )
522 9001 format( 4x, i5, 7x, e20.14, f20.13 )
524 c---------------------------------------------------------------------
525 c Normalize z to obtain x
526 c---------------------------------------------------------------------
527 do j=1, lastcol-firstcol+1
528 x(j) = norm_temp1(2)*z(j)
532 enddo ! end of main iter inv pow meth
536 c---------------------------------------------------------------------
537 c End of timed section
538 c---------------------------------------------------------------------
c
c Root compares the computed zeta against the class reference value
c (relative error vs. epsilon) and reports the verdict.
551 if( me .eq. root )then
553 100 format(' Benchmark completed ')
556 if (class .ne. 'U') then
558 err = abs( zeta - zeta_verify_value )/zeta_verify_value
559 if( err .le. epsilon ) then
564 200 format(' VERIFICATION SUCCESSFUL ')
565 201 format(' Zeta is ', E20.13)
566 202 format(' Error is ', E20.13)
571 write(*, 302) zeta_verify_value
572 300 format(' VERIFICATION FAILED')
573 301 format(' Zeta ', E20.13)
574 302 format(' The correct zeta is ', E20.13)
581 400 format(' Problem size unknown')
582 401 format(' NO VERIFICATION PERFORMED')
c
c MFLOPS from the NPB operation-count model for CG, guarded
c against a zero elapsed time.
586 if( tmax .ne. 0. ) then
587 mflops = float( 2*niter*na )
588 & * ( 3.+float( nonzer*(nonzer+1) )
589 & + 25.*(5.+float( nonzer*(nonzer+1) ))
590 & + 3. ) / tmax / 1000000.0
595 call print_results('CG', class, na, 0, 0,
596 > niter, nnodes_compiled, nprocs, tmax,
597 > mflops, ' floating point',
598 > verified, npbversion, compiletime,
599 > cs1, cs2, cs3, cs4, cs5, cs6, cs7)
605 call mpi_finalize(ierr)
615 c---------------------------------------------------------------------
616 c---------------------------------------------------------------------
617 subroutine initialize_mpi
618 c---------------------------------------------------------------------
619 c---------------------------------------------------------------------
c
c Start MPI and record this process's rank (me) and the total
c number of processes (nprocs).
628 call mpi_init( ierr )
629 call mpi_comm_rank( mpi_comm_world, me, ierr )
630 call mpi_comm_size( mpi_comm_world, nprocs, ierr )
639 c---------------------------------------------------------------------
640 c---------------------------------------------------------------------
641 subroutine setup_proc_info( num_procs,
644 c---------------------------------------------------------------------
645 c---------------------------------------------------------------------
651 common / partit_size / naa, nzz,
653 > proc_col, proc_row,
664 > proc_col, proc_row,
674 integer num_procs, num_proc_cols, num_proc_rows
678 c---------------------------------------------------------------------
679 c num_procs must be a power of 2, and num_procs=num_proc_cols*num_proc_rows
680 c When num_procs is not square, then num_proc_cols = 2*num_proc_rows
681 c---------------------------------------------------------------------
682 c First, number of procs must be power of two.
683 c---------------------------------------------------------------------
c
c Abort if the runtime process count does not match the compiled-in
c num_procs.
684 if( nprocs .ne. num_procs )then
685 if( me .eq. root ) write( *,9000 ) nprocs, num_procs
686 9000 format( /,'Error: ',/,'num of procs allocated (',
688 > /,'is not equal to',/,
689 > 'compiled number of procs (',
691 call mpi_finalize(ierr)
c
c Power-of-two checks: i is halved repeatedly; if an odd value > 1
c appears, the quantity was not a power of two -> report and abort.
698 if( i .ne. 1 .and. i/2*2 .ne. i )then
699 if ( me .eq. root ) then
700 write( *,* ) 'Error: num_proc_cols is ',
702 > ' which is not a power of two'
704 call mpi_finalize(ierr)
714 if( i .ne. 1 .and. i/2*2 .ne. i )then
715 if ( me .eq. root ) then
716 write( *,* ) 'Error: num_proc_rows is ',
718 > ' which is not a power of two'
720 call mpi_finalize(ierr)
731 if( i .ne. 1 .and. i/2*2 .ne. i )then
732 write( *,* ) 'Error: nprocs is ',
734 > ' which is not a power of two'
735 call mpi_finalize(ierr)
740 log2nprocs = log2nprocs + 1
744 CC write( *,* ) 'nprocs, log2nprocs: ',nprocs,log2nprocs
c
c Publish the grid dimensions to the common block.
747 npcols = num_proc_cols
748 nprows = num_proc_rows
757 c---------------------------------------------------------------------
758 c---------------------------------------------------------------------
759 subroutine setup_submatrix_info( l2npcols,
761 > reduce_send_starts,
762 > reduce_send_lengths,
763 > reduce_recv_starts,
764 > reduce_recv_lengths )
766 c---------------------------------------------------------------------
767 c---------------------------------------------------------------------
773 integer col_size, row_size
775 common / partit_size / naa, nzz,
777 > proc_col, proc_row,
788 > proc_col, proc_row,
798 integer reduce_exch_proc(*)
799 integer reduce_send_starts(*)
800 integer reduce_send_lengths(*)
801 integer reduce_recv_starts(*)
802 integer reduce_recv_lengths(*)
c
c Map rank me onto the 2-D (proc_row, proc_col) processor grid.
809 proc_row = me / npcols
810 proc_col = me - proc_row*npcols
814 c---------------------------------------------------------------------
815 c If naa evenly divisible by npcols, then it is evenly divisible
816 c by nprows as well, and the partition is uniform.
817 c---------------------------------------------------------------------
819 if( naa/npcols*npcols .eq. naa )then
820 col_size = naa/npcols
821 firstcol = proc_col*col_size + 1
822 lastcol = firstcol - 1 + col_size
823 row_size = naa/nprows
824 firstrow = proc_row*row_size + 1
825 lastrow = firstrow - 1 + row_size
826 c---------------------------------------------------------------------
827 c If naa not evenly divisible by npcols, then first subdivide for nprows
828 c and then, if npcols not equal to nprows (i.e., not a sq number of procs),
829 c get col subdivisions by dividing by 2 each row subdivision.
830 c---------------------------------------------------------------------
832 if( proc_row .lt. naa - naa/nprows*nprows)then
833 row_size = naa/nprows+ 1
834 firstrow = proc_row*row_size + 1
835 lastrow = firstrow - 1 + row_size
837 row_size = naa/nprows
838 firstrow = (naa - naa/nprows*nprows)*(row_size+1)
839 > + (proc_row-(naa-naa/nprows*nprows))
841 lastrow = firstrow - 1 + row_size
843 if( npcols .eq. nprows )then
844 if( proc_col .lt. naa - naa/npcols*npcols )then
845 col_size = naa/npcols+ 1
846 firstcol = proc_col*col_size + 1
847 lastcol = firstcol - 1 + col_size
849 col_size = naa/npcols
850 firstcol = (naa - naa/npcols*npcols)*(col_size+1)
851 > + (proc_col-(naa-naa/npcols*npcols))
853 lastcol = firstcol - 1 + col_size
c
c Non-square grid: column ranges are computed for npcols/2 and then
c split in half between even/odd ranks.
856 if( (proc_col/2) .lt.
857 > naa - naa/(npcols/2)*(npcols/2) )then
858 col_size = naa/(npcols/2) + 1
859 firstcol = (proc_col/2)*col_size + 1
860 lastcol = firstcol - 1 + col_size
862 col_size = naa/(npcols/2)
863 firstcol = (naa - naa/(npcols/2)*(npcols/2))
865 > + ((proc_col/2)-(naa-naa/(npcols/2)*(npcols/2)))
867 lastcol = firstcol - 1 + col_size
869 CC write( *,* ) col_size,firstcol,lastcol
870 if( mod( me,2 ) .eq. 0 )then
871 lastcol = firstcol - 1 + (col_size-1)/2 + 1
873 firstcol = firstcol + (col_size-1)/2 + 1
874 lastcol = firstcol - 1 + col_size/2
875 CC write( *,* ) firstcol,lastcol
c
c Send extents for the reduce phase: whole row block on a square
c grid, half of it (split by rank parity) otherwise.
882 if( npcols .eq. nprows )then
884 send_len = lastrow - firstrow + 1
886 if( mod( me,2 ) .eq. 0 )then
888 send_len = (1 + lastrow-firstrow+1)/2
890 send_start = (1 + lastrow-firstrow+1)/2 + 1
891 send_len = (lastrow-firstrow+1)/2
898 c---------------------------------------------------------------------
899 c Transpose exchange processor
900 c---------------------------------------------------------------------
902 if( npcols .eq. nprows )then
903 exch_proc = mod( me,nprows )*nprows + me/nprows
905 exch_proc = 2*(mod( me/2,nprows )*nprows + me/2/nprows)
c
c l2npcols accumulates log2(npcols), the number of reduce levels.
914 l2npcols = l2npcols + 1
919 c---------------------------------------------------------------------
920 c Set up the reduce phase schedules...
921 c---------------------------------------------------------------------
926 j = mod( proc_col+div_factor/2, div_factor )
927 > + proc_col / div_factor * div_factor
928 reduce_exch_proc(i) = proc_row*npcols + j
930 div_factor = div_factor / 2
935 do i = l2npcols, 1, -1
937 if( nprows .eq. npcols )then
938 reduce_send_starts(i) = send_start
939 reduce_send_lengths(i) = send_len
940 reduce_recv_lengths(i) = lastrow - firstrow + 1
942 reduce_recv_lengths(i) = send_len
943 if( i .eq. l2npcols )then
944 reduce_send_lengths(i) = lastrow-firstrow+1 - send_len
945 if( me/2*2 .eq. me )then
946 reduce_send_starts(i) = send_start + send_len
948 reduce_send_starts(i) = 1
951 reduce_send_lengths(i) = send_len
952 reduce_send_starts(i) = send_start
955 reduce_recv_starts(i) = send_start
c
c Length of the transpose exchange: this process's column count.
960 exch_recv_length = lastcol - firstcol + 1
969 c---------------------------------------------------------------------
970 c---------------------------------------------------------------------
971 subroutine conj_grad ( colidx,
983 > reduce_send_starts,
984 > reduce_send_lengths,
985 > reduce_recv_starts,
986 > reduce_recv_lengths )
987 c---------------------------------------------------------------------
988 c---------------------------------------------------------------------
990 c---------------------------------------------------------------------
991 c Floating point arrays here are named as in NPB1 spec discussion of
993 c---------------------------------------------------------------------
999 integer status(MPI_STATUS_SIZE ), request
1002 common / partit_size / naa, nzz,
1004 > proc_col, proc_row,
1015 > proc_col, proc_row,
1027 double precision x(*),
1030 integer colidx(nzz), rowstr(naa+1)
1032 double precision p(*),
1035 > w(*) ! used as work temporary
1038 integer reduce_exch_proc(l2npcols)
1039 integer reduce_send_starts(l2npcols)
1040 integer reduce_send_lengths(l2npcols)
1041 integer reduce_recv_starts(l2npcols)
1042 integer reduce_recv_lengths(l2npcols)
1044 integer i, j, k, ierr
1045 integer cgit, cgitmax
1047 double precision d, sum, rho, rho0, alpha, beta, rnorm
1050 double precision timer_read
1055 c---------------------------------------------------------------------
1056 c Initialize the CG algorithm:
1057 c---------------------------------------------------------------------
1067 c---------------------------------------------------------------------
1069 c Now, obtain the norm of r: First, sum squares of r elements locally...
1070 c---------------------------------------------------------------------
1072 do j=1, lastcol-firstcol+1
1073 sum = sum + r(j)*r(j)
1076 c---------------------------------------------------------------------
1077 c Exchange and sum with procs identified in reduce_exch_proc
1078 c (This is equivalent to mpi_allreduce.)
1079 c Sum the partial sums of rho, leaving rho on all processors
1080 c---------------------------------------------------------------------
1082 call mpi_irecv( rho,
1085 > reduce_exch_proc(i),
1093 > reduce_exch_proc(i),
1097 call mpi_wait( request, status, ierr )
1105 c---------------------------------------------------------------------
1107 c The conj grad iteration loop
1109 c---------------------------------------------------------------------
1110 do cgit = 1, cgitmax
1113 c---------------------------------------------------------------------
1115 c The partition submatrix-vector multiply: use workspace w
1116 c---------------------------------------------------------------------
1117 do j=1,lastrow-firstrow+1
1119 do k=rowstr(j),rowstr(j+1)-1
1120 sum = sum + a(k)*p(colidx(k))
1125 c---------------------------------------------------------------------
1126 c Sum the partition submatrix-vec A.p's across rows
1127 c Exchange and sum piece of w with procs identified in reduce_exch_proc
1128 c---------------------------------------------------------------------
1129 do i = l2npcols, 1, -1
1130 call mpi_irecv( q(reduce_recv_starts(i)),
1131 > reduce_recv_lengths(i),
1133 > reduce_exch_proc(i),
1138 call mpi_send( w(reduce_send_starts(i)),
1139 > reduce_send_lengths(i),
1141 > reduce_exch_proc(i),
1145 call mpi_wait( request, status, ierr )
1146 do j=send_start,send_start + reduce_recv_lengths(i) - 1
1152 c---------------------------------------------------------------------
1153 c Exchange piece of q with transpose processor:
1154 c---------------------------------------------------------------------
1155 if( l2npcols .ne. 0 )then
1165 call mpi_send( w(send_start),
1172 call mpi_wait( request, status, ierr )
1174 do j=1,exch_recv_length
1180 c---------------------------------------------------------------------
1181 c Clear w for reuse...
1182 c---------------------------------------------------------------------
1183 do j=1, max( lastrow-firstrow+1, lastcol-firstcol+1 )
1188 c---------------------------------------------------------------------
1190 c---------------------------------------------------------------------
c
c Local partial dot product p.q; combined by the reduce below.
1192 do j=1, lastcol-firstcol+1
1193 sum = sum + p(j)*q(j)
1196 c---------------------------------------------------------------------
1197 c Obtain d with a sum-reduce
1198 c---------------------------------------------------------------------
1203 > reduce_exch_proc(i),
1211 > reduce_exch_proc(i),
1216 call mpi_wait( request, status, ierr )
1223 c---------------------------------------------------------------------
1224 c Obtain alpha = rho / (p.q)
1225 c---------------------------------------------------------------------
1228 c---------------------------------------------------------------------
1229 c Save a temporary of rho
1230 c---------------------------------------------------------------------
1233 c---------------------------------------------------------------------
1234 c Obtain z = z + alpha*p
1235 c and r = r - alpha*q
1236 c---------------------------------------------------------------------
1237 do j=1, lastcol-firstcol+1
1238 z(j) = z(j) + alpha*p(j)
1239 r(j) = r(j) - alpha*q(j)
1242 c---------------------------------------------------------------------
1244 c Now, obtain the norm of r: First, sum squares of r elements locally...
1245 c---------------------------------------------------------------------
1247 do j=1, lastcol-firstcol+1
1248 sum = sum + r(j)*r(j)
1251 c---------------------------------------------------------------------
1252 c Obtain rho with a sum-reduce
1253 c---------------------------------------------------------------------
1255 call mpi_irecv( rho,
1258 > reduce_exch_proc(i),
1266 > reduce_exch_proc(i),
1270 call mpi_wait( request, status, ierr )
1276 c---------------------------------------------------------------------
1278 c---------------------------------------------------------------------
1281 c---------------------------------------------------------------------
1283 c---------------------------------------------------------------------
c
c p = r + beta*p : new search direction.
1284 do j=1, lastcol-firstcol+1
1285 p(j) = r(j) + beta*p(j)
1290 enddo ! end of do cgit=1,cgitmax
1294 c---------------------------------------------------------------------
1295 c Compute residual norm explicitly: ||r|| = ||x - A.z||
1297 c The partition submatrix-vector multiply
1298 c---------------------------------------------------------------------
1299 do j=1,lastrow-firstrow+1
1301 do k=rowstr(j),rowstr(j+1)-1
1302 sum = sum + a(k)*z(colidx(k))
1309 c---------------------------------------------------------------------
1310 c Sum the partition submatrix-vec A.z's across rows
1311 c---------------------------------------------------------------------
1312 do i = l2npcols, 1, -1
1313 call mpi_irecv( r(reduce_recv_starts(i)),
1314 > reduce_recv_lengths(i),
1316 > reduce_exch_proc(i),
1321 call mpi_send( w(reduce_send_starts(i)),
1322 > reduce_send_lengths(i),
1324 > reduce_exch_proc(i),
1328 call mpi_wait( request, status, ierr )
1330 do j=send_start,send_start + reduce_recv_lengths(i) - 1
1336 c---------------------------------------------------------------------
1337 c Exchange piece of q with transpose processor:
1338 c---------------------------------------------------------------------
1339 if( l2npcols .ne. 0 )then
1349 call mpi_send( w(send_start),
1356 call mpi_wait( request, status, ierr )
1358 do j=1,exch_recv_length
1364 c---------------------------------------------------------------------
1365 c At this point, r contains A.z
1366 c---------------------------------------------------------------------
1368 do j=1, lastcol-firstcol+1
1373 c---------------------------------------------------------------------
1374 c Obtain d with a sum-reduce
1375 c---------------------------------------------------------------------
1380 > reduce_exch_proc(i),
1388 > reduce_exch_proc(i),
1392 call mpi_wait( request, status, ierr )
c
c Only root computes the final residual norm from the reduced d.
1399 if( me .eq. root ) rnorm = sqrt( d )
1404 end ! end of routine conj_grad
1408 c---------------------------------------------------------------------
1409 c---------------------------------------------------------------------
1410 subroutine makea( n, nz, a, colidx, rowstr, nonzer,
1411 > firstrow, lastrow, firstcol, lastcol,
1412 > rcond, arow, acol, aelt, v, iv, shift )
1413 c---------------------------------------------------------------------
1414 c---------------------------------------------------------------------
1417 integer firstrow, lastrow, firstcol, lastcol
1418 integer colidx(nz), rowstr(n+1)
1419 integer iv(2*n+1), arow(nz), acol(nz)
1420 double precision v(n+1), aelt(nz)
1421 double precision rcond, a(nz), shift
1423 c---------------------------------------------------------------------
1424 c generate the test problem for benchmark 6
1425 c makea generates a sparse matrix with a
1426 c prescribed sparsity distribution
1428 c parameter type usage
1432 c n i number of cols/rows of matrix
1433 c nz i nonzeros as declared array size
1434 c rcond r*8 condition number
1435 c shift r*8 main diagonal shift
1439 c a r*8 array for nonzeros
1440 c colidx i col indices
1441 c rowstr i row pointers
1447 c---------------------------------------------------------------------
1449 integer i, nnza, iouter, ivelt, ivelt1, irow, nzv, NONZER
1451 c---------------------------------------------------------------------
1452 c nonzer is approximately (int(sqrt(nnza /n)));
1453 c---------------------------------------------------------------------
1455 double precision size, ratio, scale
1456 external sparse, sprnvc, vecset
c
c Geometric scaling so eigenvalues span the requested condition
c number rcond over n outer iterations.
1459 ratio = rcond ** (1.0D0 / dfloat(n))
1462 c---------------------------------------------------------------------
1463 c Initialize iv(n+1 .. 2n) to zero.
1464 c Used by sprnvc to mark nonzero positions
1465 c---------------------------------------------------------------------
c
c Generate a sparse random vector per outer iteration and keep the
c triples (arow, acol, aelt) that fall in this partition's block.
1472 call sprnvc( n, nzv, v, colidx, iv(1), iv(n+1) )
1473 call vecset( n, v, colidx, nzv, iouter, .5D0 )
1475 jcol = colidx(ivelt)
1476 if (jcol.ge.firstcol .and. jcol.le.lastcol) then
1477 scale = size * v(ivelt)
1479 irow = colidx(ivelt1)
1480 if (irow.ge.firstrow .and. irow.le.lastrow) then
1482 if (nnza .gt. nz) goto 9999
1485 aelt(nnza) = v(ivelt1) * scale
1494 c---------------------------------------------------------------------
1495 c ... add the identity * rcond to the generated matrix to bound
1496 c the smallest eigenvalue from below by rcond
1497 c---------------------------------------------------------------------
1498 do i = firstrow, lastrow
1499 if (i.ge.firstcol .and. i.le.lastcol) then
1502 if (nnza .gt. nz) goto 9999
1505 aelt(nnza) = rcond - shift
1510 c---------------------------------------------------------------------
1511 c ... make the sparse matrix from list of elements with duplicates
1512 c (v and iv are used as workspace)
1513 c---------------------------------------------------------------------
1514 call sparse( a, colidx, rowstr, n, arow, acol, aelt,
1515 > firstrow, lastrow,
1516 > v, iv(1), iv(n+1), nnza )
c
c 9999: overflow of the nz bound detected above -> diagnostic exit.
1520 write(*,*) 'Space for matrix elements exceeded in makea'
1521 write(*,*) 'nnza, nzmax = ',nnza, nz
1522 write(*,*) ' iouter = ',iouter
1526 c-------end of makea------------------------------
1528 c---------------------------------------------------------------------
1529 c---------------------------------------------------------------------
1530 subroutine sparse( a, colidx, rowstr, n, arow, acol, aelt,
1531 > firstrow, lastrow,
1532 > x, mark, nzloc, nnza )
1533 c---------------------------------------------------------------------
c
c  Assemble a CSR (compressed sparse row) matrix (a, colidx, rowstr)
c  from the nnza [row, col, element] triples in (arow, acol, aelt),
c  merging duplicate entries that fall on the same (row, col) position.
c
c  a, colidx, rowstr : output CSR matrix (values, column indices,
c                      row-start pointers)
c  n                 : order of the matrix / length of scratch arrays
c  arow, acol, aelt  : nnza input triples, possibly with duplicates
c  firstrow, lastrow : global indices of the rows handled here
c  x, mark, nzloc    : scratch workspace (dense row accumulator,
c                      occupancy flags, positions touched in a row)
c
c  NOTE(review): several statements of this routine are elided from
c  this view (loop headers, mark declaration, endifs); comments below
c  describe only what the visible lines establish.
c
1534 c---------------------------------------------------------------------
1536 implicit logical (a-z)
1537 integer colidx(*), rowstr(*)
1538 integer firstrow, lastrow
1539 integer n, arow(*), acol(*), nnza
1540 double precision a(*), aelt(*)
1542 c---------------------------------------------------------------------
1543 c rows range from firstrow to lastrow
1544 c the rowstr pointers are defined for nrows = lastrow-firstrow+1 values
1545 c---------------------------------------------------------------------
1546 integer nzloc(n), nrows
1547 double precision x(n)
1550 c---------------------------------------------------
1551 c generate a sparse matrix from a list of
1552 c [col, row, element] triples
1553 c---------------------------------------------------
1555 integer i, j, jajp1, nza, k, nzrow
1558 c---------------------------------------------------------------------
1559 c how many rows of result
1560 c---------------------------------------------------------------------
1561 nrows = lastrow - firstrow + 1
1563 c---------------------------------------------------------------------
1564 c ...count the number of triples in each row
1565 c---------------------------------------------------------------------
c     histogram pass: the count for local row (arow(nza)-firstrow+1)
c     is kept one slot higher (the extra "+ 1") so that the prefix
c     sum below lands row starts directly in rowstr
1573 j = (arow(nza) - firstrow + 1) + 1
1574 rowstr(j) = rowstr(j) + 1
c     running prefix sum: per-row counts become row-start offsets
1579 rowstr(j) = rowstr(j) + rowstr(j-1)
1583 c---------------------------------------------------------------------
1584 c ... rowstr(j) now is the location of the first nonzero
1586 c---------------------------------------------------------------------
1589 c---------------------------------------------------------------------
1590 c ... do a bucket sort of the triples on the row index
1591 c---------------------------------------------------------------------
1593 j = arow(nza) - firstrow + 1
c     k holds the next free slot of row j (its assignment from
c     rowstr(j) is elided from this view); bumping rowstr(j) advances
c     the slot, so after this loop rowstr(j) points past row j
1596 colidx(k) = acol(nza)
1597 rowstr(j) = rowstr(j) + 1
1601 c---------------------------------------------------------------------
1602 c ... rowstr(j) now points to the first element of row j+1
1603 c---------------------------------------------------------------------
c     shift the pointers back by one row to restore row-start values
1605 rowstr(j+1) = rowstr(j)
1610 c---------------------------------------------------------------------
1611 c ... generate the actual output rows by adding elements
1612 c---------------------------------------------------------------------
1623 c---------------------------------------------------------------------
1624 c ...loop over the jth row of a
1625 c---------------------------------------------------------------------
1626 do k = jajp1 , rowstr(j+1)-1
c     x appears to act as a dense accumulator indexed by column i;
c     a column is recorded (presumably in nzloc) only the first time
c     it becomes nonzero - mark(i) guards against recording it twice.
c     The accumulation statement itself is elided; confirm against
c     the full source.
1629 if ( (.not. mark(i)) .and. (x(i) .ne. 0.D0)) then
1636 c---------------------------------------------------------------------
1637 c ... extract the nonzeros of this row
1638 c---------------------------------------------------------------------
c     only nonzero accumulated values are copied to the output row
1644 if (xi .ne. 0.D0) then
c     final row pointer: total nonzeros written, rebased on rowstr(1)
1651 rowstr(j+1) = nza + rowstr(1)
1653 CC write (*, 11000) nza
1655 11000 format ( //,'final nonzero count in sparse ',
1656 1 /,'number of nonzeros = ', i16 )
1658 c-------end of sparse-----------------------------
1661 c---------------------------------------------------------------------
1662 c---------------------------------------------------------------------
1663 subroutine sprnvc( n, nz, v, iv, nzloc, mark )
1664 c---------------------------------------------------------------------
1665 c---------------------------------------------------------------------
c
c  Build a sparse random n-vector with nz nonzeros: v receives the
c  values, iv the corresponding indices, both drawn from the randlc
c  pseudo-random stream (state tran, multiplier amult, shared through
c  common /urando/).  mark(i) /= 0 flags that position i is already
c  taken; nzloc is scratch - presumably it records the marked
c  positions so mark can be reset cheaply on exit (see the comment at
c  1680-1682), but the reset loop is elided from this view.
c
c  NOTE(review): the loop headers at labels 50 and 100 and the exit
c  code at label 110 are elided here.
c
1667 implicit logical (a-z)
1668 double precision v(*)
1669 integer n, nz, iv(*), nzloc(n), nn1
1671 common /urando/ amult, tran
1672 double precision amult, tran
1675 c---------------------------------------------------------------------
1676 c generate a sparse n-vector (v, iv)
1677 c having nzv nonzeros
1679 c mark(i) is set to 1 if position i is nonzero.
1680 c mark is all zero on entry and is reset to all zero before exit
1681 c this corrects a performance bug found by John G. Lewis, caused by
1682 c reinitialization of mark on every one of the n calls to sprnvc
1683 c---------------------------------------------------------------------
1685 integer nzrow, nzv, ii, i, icnvrt
1687 external randlc, icnvrt
1688 double precision randlc, vecelt, vecloc
c     keep doubling nn1 (loop head at label 50 is elided) until it is
c     the smallest power of two not less than n
1696 if (nn1 .lt. n) goto 50
1698 c---------------------------------------------------------------------
1699 c nn1 is the smallest power of two not less than n
1700 c---------------------------------------------------------------------
c     main generation loop (label 100, elided): draw candidates until
c     nzv reaches the requested nz distinct positions
1703 if (nzv .ge. nz) goto 110
1704 vecelt = randlc( tran, amult )
1706 c---------------------------------------------------------------------
1707 c generate an integer between 1 and n in a portable manner
1708 c---------------------------------------------------------------------
1709 vecloc = randlc(tran, amult)
1710 i = icnvrt(vecloc, nn1) + 1
c     rejection sampling: draws in (n, nn1] are discarded so the
c     accepted index is uniform on 1..n
1711 if (i .gt. n) goto 100
1713 c---------------------------------------------------------------------
1714 c was this integer generated already?
1715 c---------------------------------------------------------------------
c     duplicates are skipped; only a fresh position is stored
1716 if (mark(i) .eq. 0) then
1732 c-------end of sprnvc-----------------------------
1735 c---------------------------------------------------------------------
1736 c---------------------------------------------------------------------
1737 function icnvrt(x, ipwr2)
1738 c---------------------------------------------------------------------
1739 c---------------------------------------------------------------------
1741 implicit logical (a-z)
1743 integer ipwr2, icnvrt
c     NOTE(review): the declaration of x is elided from this view;
c     the comment below implies it is double precision - confirm.
1745 c---------------------------------------------------------------------
1746 c scale a double precision number x in (0,1) by a power of 2 and chop it
1747 c---------------------------------------------------------------------
c     for x in (0,1) the result lies in 0 .. ipwr2-1; int() truncates
c     toward zero (the return/end statements are elided here)
1748 icnvrt = int(ipwr2 * x)
1752 c-------end of icnvrt-----------------------------
1755 c---------------------------------------------------------------------
1756 c---------------------------------------------------------------------
1757 subroutine vecset(n, v, iv, nzv, i, val)
1758 c---------------------------------------------------------------------
1759 c---------------------------------------------------------------------
1761 implicit logical (a-z)
1762 integer n, iv(*), nzv, i, k
1763 double precision v(*), val
1765 c---------------------------------------------------------------------
1766 c set ith element of sparse vector (v, iv) with
1767 c nzv nonzeros to val
1768 c---------------------------------------------------------------------
c     scan the existing nzv entries for index i; when found the value
c     is overwritten in place.  NOTE(review): the scan loop header,
c     the overwrite statement, and the append path taken when i is
c     not present are elided from this view - confirm that a missing
c     index appends (v(nzv+1), iv(nzv+1)) and increments nzv.
1774 if (iv(k) .eq. i) then
1786 c-------end of vecset-----------------------------