1 !-------------------------------------------------------------------------!
3 ! N A S P A R A L L E L B E N C H M A R K S 3.3 !
7 !-------------------------------------------------------------------------!
9 ! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
10 ! It is described in NAS Technical Reports 95-020 and 02-007 !
12 ! Permission to use, copy, distribute and modify this software !
13 ! for any purpose with or without fee is hereby granted. We !
14 ! request, however, that all derived work reference the NAS !
15 ! Parallel Benchmarks 3.3. This software is provided "as is" !
16 ! without express or implied warranty. !
18 ! Information on NPB 3.3, including the technical report, the !
19 ! original specifications, source code, results and information !
20 ! on how to submit new results, is available at: !
22 ! http://www.nas.nasa.gov/Software/NPB/ !
24 ! Send comments or suggestions to npb@nas.nasa.gov !
26 ! NAS Parallel Benchmarks Group !
27 ! NASA Ames Research Center !
29 ! Moffett Field, CA 94035-1000 !
31 ! E-mail: npb@nas.nasa.gov !
32 ! Fax: (650) 604-3957 !
34 !-------------------------------------------------------------------------!
37 c---------------------------------------------------------------------
41 c R. F. Van der Wijngaart
44 c---------------------------------------------------------------------
47 c---------------------------------------------------------------------
48 c---------------------------------------------------------------------
50 c---------------------------------------------------------------------
51 c---------------------------------------------------------------------
c
c MPI bookkeeping: status/request for the nonblocking reduce
c exchanges, ierr for MPI error codes.
57 integer status(MPI_STATUS_SIZE), request, ierr
61 c---------------------------------------------------------------------
62 c num_procs must be a power of 2, and num_procs=num_proc_cols*num_proc_rows.
63 c num_proc_cols and num_proc_rows are to be found in npbparams.h.
64 c When num_procs is not square, then num_proc_cols must be = 2*num_proc_rows.
65 c---------------------------------------------------------------------
67 parameter( num_procs = num_proc_cols * num_proc_rows )
71 c---------------------------------------------------------------------
72 c Class specific parameters:
73 c It appears here for reference only.
74 c These are their values, however, this info is imported in the npbparams.h
75 c include file, which is written by the sys/setparams.c program.
76 c---------------------------------------------------------------------
c
c Commented-out (CC) per-class reference values of na; the active
c value comes from npbparams.h.
81 CC parameter( na=1400,
89 CC parameter( na=7000,
97 CC parameter( na=14000,
105 CC parameter( na=75000,
113 CC parameter( na=150000,
121 CC parameter( na=1500000,
129 CC parameter( na=9000000,
c
c nz: per-process upper bound on the number of stored nonzeros.
138 parameter( nz = na*(nonzer+1)/num_procs*(nonzer+1)+nonzer
139 > + na*(nonzer+2+num_procs/256)/num_proc_cols )
143 common / partit_size / naa, nzz,
145 > proc_col, proc_row,
156 > proc_col, proc_row,
c
c Main integer storage: sparse-matrix structure (colidx/rowstr) and
c scratch index arrays (iv, arow, acol) used by makea/sparse.
167 common / main_int_mem / colidx, rowstr,
169 integer colidx(nz), rowstr(na+1),
170 > iv(2*na+1), arow(nz), acol(nz)
c
c Main floating-point storage: matrix values plus the CG vectors,
c each sized for one row-block (na/num_proc_rows+2) of the partition.
173 common / main_flt_mem / v, aelt, a,
180 double precision v(na+1), aelt(nz), a(nz),
181 > x(na/num_proc_rows+2),
182 > z(na/num_proc_rows+2),
183 > p(na/num_proc_rows+2),
184 > q(na/num_proc_rows+2),
185 > r(na/num_proc_rows+2),
186 > w(na/num_proc_rows+2)
c
c Random-number generator state shared with randlc.
189 common /urando/ amult, tran
190 double precision amult, tran
c
c Reduce-phase communication schedule, one entry per column level.
195 integer reduce_exch_proc(num_proc_cols)
196 integer reduce_send_starts(num_proc_cols)
197 integer reduce_send_lengths(num_proc_cols)
198 integer reduce_recv_starts(num_proc_cols)
199 integer reduce_recv_lengths(num_proc_cols)
c
c Scalars for the power-method iteration, timing and verification.
203 double precision zeta, randlc
205 double precision rnorm
206 double precision norm_temp1(2), norm_temp2(2)
208 double precision t, tmax, mflops
210 double precision timer_read
213 double precision zeta_verify_value, epsilon, err
216 c---------------------------------------------------------------------
217 c Set up mpi initialization and number of proc testing
218 c---------------------------------------------------------------------
c
c Select the reference zeta for this (na, nonzer, niter, shift)
c combination so the run can be verified; one branch per problem
c class (S, W, A, B, C, D, E).
222 if( na .eq. 1400 .and.
223 & nonzer .eq. 7 .and.
224 & niter .eq. 15 .and.
225 & shift .eq. 10.d0 ) then
227 zeta_verify_value = 8.5971775078648d0
228 else if( na .eq. 7000 .and.
229 & nonzer .eq. 8 .and.
230 & niter .eq. 15 .and.
231 & shift .eq. 12.d0 ) then
233 zeta_verify_value = 10.362595087124d0
234 else if( na .eq. 14000 .and.
235 & nonzer .eq. 11 .and.
236 & niter .eq. 15 .and.
237 & shift .eq. 20.d0 ) then
239 zeta_verify_value = 17.130235054029d0
240 else if( na .eq. 75000 .and.
241 & nonzer .eq. 13 .and.
242 & niter .eq. 75 .and.
243 & shift .eq. 60.d0 ) then
245 zeta_verify_value = 22.712745482631d0
246 else if( na .eq. 150000 .and.
247 & nonzer .eq. 15 .and.
248 & niter .eq. 75 .and.
249 & shift .eq. 110.d0 ) then
251 zeta_verify_value = 28.973605592845d0
252 else if( na .eq. 1500000 .and.
253 & nonzer .eq. 21 .and.
254 & niter .eq. 100 .and.
255 & shift .eq. 500.d0 ) then
257 zeta_verify_value = 52.514532105794d0
258 else if( na .eq. 9000000 .and.
259 & nonzer .eq. 26 .and.
260 & niter .eq. 100 .and.
261 & shift .eq. 1.5d3 ) then
263 zeta_verify_value = 77.522164599383d0
c
c Root process prints the benchmark banner and run parameters.
268 if( me .eq. root )then
271 write( *,1002 ) niter
272 write( *,1003 ) nprocs
273 write( *,1004 ) nonzer
274 write( *,1005 ) shift
275 1000 format(//,' NAS Parallel Benchmarks 3.3 -- CG Benchmark', /)
276 1001 format(' Size: ', i10 )
277 1002 format(' Iterations: ', i5 )
278 1003 format(' Number of active processes: ', i5 )
279 1004 format(' Number of nonzeroes per row: ', i8)
280 1005 format(' Eigenvalue shift: ', e8.3)
c
c Pick the MPI datatype for double precision transfers
c (convertdouble selects an alternative representation).
283 if (.not. convertdouble) then
284 dp_type = MPI_DOUBLE_PRECISION
294 c---------------------------------------------------------------------
295 c Set up processor info, such as whether sq num of procs, etc
296 c---------------------------------------------------------------------
297 call setup_proc_info( num_procs,
302 c---------------------------------------------------------------------
303 c Set up partition's submatrix info: firstcol, lastcol, firstrow, lastrow
304 c---------------------------------------------------------------------
305 call setup_submatrix_info( l2npcols,
307 > reduce_send_starts,
308 > reduce_send_lengths,
309 > reduce_recv_starts,
310 > reduce_recv_lengths )
314 c---------------------------------------------------------------------
315 c Initialize random number generator
316 c---------------------------------------------------------------------
318 amult = 1220703125.0D0
319 zeta = randlc( tran, amult )
321 c---------------------------------------------------------------------
322 c Set up partition's sparse random matrix for given class size
323 c---------------------------------------------------------------------
324 call makea(naa, nzz, a, colidx, rowstr, nonzer,
325 > firstrow, lastrow, firstcol, lastcol,
326 > rcond, arow, acol, aelt, v, iv, shift)
330 c---------------------------------------------------------------------
331 c Note: as a result of the above call to makea:
332 c values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1
333 c values of colidx which are col indexes go from firstcol --> lastcol
335 c Shift the col index vals from actual (firstcol --> lastcol )
336 c to local, i.e., (1 --> lastcol-firstcol+1)
337 c---------------------------------------------------------------------
338 do j=1,lastrow-firstrow+1
339 do k=rowstr(j),rowstr(j+1)-1
340 colidx(k) = colidx(k) - firstcol + 1
344 c---------------------------------------------------------------------
345 c set starting vector to (1, 1, .... 1)
346 c---------------------------------------------------------------------
347 do i = 1, na/num_proc_rows+1
353 c---------------------------------------------------------------------
355 c Do one iteration untimed to init all code and data page tables
356 c----> (then reinit, start timing, to niter its)
357 c---------------------------------------------------------------------
360 c---------------------------------------------------------------------
361 c The call to the conjugate gradient routine:
362 c---------------------------------------------------------------------
363 call conj_grad ( colidx,
375 > reduce_send_starts,
376 > reduce_send_lengths,
377 > reduce_recv_starts,
378 > reduce_recv_lengths )
380 c---------------------------------------------------------------------
381 c zeta = shift + 1/(x.z)
383 c Also, find norm of z
385 c---------------------------------------------------------------------
c
c Local partial sums of x.z and z.z over this process's columns...
386 norm_temp1(1) = 0.0d0
387 norm_temp1(2) = 0.0d0
388 do j=1, lastcol-firstcol+1
389 norm_temp1(1) = norm_temp1(1) + x(j)*z(j)
390 norm_temp1(2) = norm_temp1(2) + z(j)*z(j)
c
c ...then combined across processes via the pairwise
c irecv/send/wait exchange schedule (an allreduce by hand).
394 call mpi_irecv( norm_temp2,
397 > reduce_exch_proc(i),
402 call mpi_send( norm_temp1,
405 > reduce_exch_proc(i),
409 call mpi_wait( request, status, ierr )
411 norm_temp1(1) = norm_temp1(1) + norm_temp2(1)
412 norm_temp1(2) = norm_temp1(2) + norm_temp2(2)
415 norm_temp1(2) = 1.0d0 / sqrt( norm_temp1(2) )
418 c---------------------------------------------------------------------
419 c Normalize z to obtain x
420 c---------------------------------------------------------------------
421 do j=1, lastcol-firstcol+1
422 x(j) = norm_temp1(2)*z(j)
426 enddo ! end of do one iteration untimed
429 c---------------------------------------------------------------------
430 c set starting vector to (1, 1, .... 1)
431 c---------------------------------------------------------------------
433 c NOTE: a questionable limit on size: should this be na/num_proc_cols+1 ?
435 do i = 1, na/num_proc_rows+1
441 c---------------------------------------------------------------------
442 c Synchronize and start timing
443 c---------------------------------------------------------------------
444 call mpi_barrier( mpi_comm_world,
447 call timer_clear( 1 )
448 call timer_start( 1 )
450 c---------------------------------------------------------------------
452 c Main Iteration for inverse power method
454 c---------------------------------------------------------------------
457 c---------------------------------------------------------------------
458 c The call to the conjugate gradient routine:
459 c---------------------------------------------------------------------
460 call conj_grad ( colidx,
472 > reduce_send_starts,
473 > reduce_send_lengths,
474 > reduce_recv_starts,
475 > reduce_recv_lengths )
478 c---------------------------------------------------------------------
479 c zeta = shift + 1/(x.z)
481 c Also, find norm of z
483 c---------------------------------------------------------------------
c
c Same manual allreduce of (x.z, z.z) as in the untimed iteration.
484 norm_temp1(1) = 0.0d0
485 norm_temp1(2) = 0.0d0
486 do j=1, lastcol-firstcol+1
487 norm_temp1(1) = norm_temp1(1) + x(j)*z(j)
488 norm_temp1(2) = norm_temp1(2) + z(j)*z(j)
492 call mpi_irecv( norm_temp2,
495 > reduce_exch_proc(i),
500 call mpi_send( norm_temp1,
503 > reduce_exch_proc(i),
507 call mpi_wait( request, status, ierr )
509 norm_temp1(1) = norm_temp1(1) + norm_temp2(1)
510 norm_temp1(2) = norm_temp1(2) + norm_temp2(2)
513 norm_temp1(2) = 1.0d0 / sqrt( norm_temp1(2) )
c
c Root reports the residual norm and current zeta each iteration.
516 if( me .eq. root )then
517 zeta = shift + 1.0d0 / norm_temp1(1)
518 if( it .eq. 1 ) write( *,9000 )
519 write( *,9001 ) it, rnorm, zeta
521 9000 format( /,' iteration ||r|| zeta' )
522 9001 format( 4x, i5, 7x, e20.14, f20.13 )
524 c---------------------------------------------------------------------
525 c Normalize z to obtain x
526 c---------------------------------------------------------------------
527 do j=1, lastcol-firstcol+1
528 x(j) = norm_temp1(2)*z(j)
532 enddo ! end of main iter inv pow meth
536 c---------------------------------------------------------------------
537 c End of timed section
538 c---------------------------------------------------------------------
c
c Root compares the computed zeta against the class reference value
c (relative error vs. epsilon) and reports the verdict.
551 if( me .eq. root )then
553 100 format(' Benchmark completed ')
556 if (class .ne. 'U') then
558 err = abs( zeta - zeta_verify_value )/zeta_verify_value
559 if( err .le. epsilon ) then
564 200 format(' VERIFICATION SUCCESSFUL ')
565 201 format(' Zeta is ', E20.13)
566 202 format(' Error is ', E20.13)
571 write(*, 302) zeta_verify_value
572 300 format(' VERIFICATION FAILED')
573 301 format(' Zeta ', E20.13)
574 302 format(' The correct zeta is ', E20.13)
581 400 format(' Problem size unknown')
582 401 format(' NO VERIFICATION PERFORMED')
c
c MFLOPS from the NPB operation-count model for CG, guarded
c against a zero elapsed time.
586 if( tmax .ne. 0. ) then
587 mflops = float( 2*niter*na )
588 & * ( 3.+float( nonzer*(nonzer+1) )
589 & + 25.*(5.+float( nonzer*(nonzer+1) ))
590 & + 3. ) / tmax / 1000000.0
595 call print_results('CG', class, na, 0, 0,
596 > niter, nnodes_compiled, nprocs, tmax,
597 > mflops, ' floating point',
598 > verified, npbversion, compiletime,
599 > cs1, cs2, cs3, cs4, cs5, cs6, cs7)
605 call mpi_finalize(ierr)
615 c---------------------------------------------------------------------
616 c---------------------------------------------------------------------
617 subroutine initialize_mpi
618 c---------------------------------------------------------------------
619 c---------------------------------------------------------------------
c
c Start MPI and record this process's rank (me) and the total
c number of processes (nprocs).
628 call mpi_init( ierr )
629 call mpi_comm_rank( mpi_comm_world, me, ierr )
630 call mpi_comm_size( mpi_comm_world, nprocs, ierr )
639 c---------------------------------------------------------------------
640 c---------------------------------------------------------------------
641 subroutine setup_proc_info( num_procs,
644 c---------------------------------------------------------------------
645 c---------------------------------------------------------------------
651 common / partit_size / naa, nzz,
653 > proc_col, proc_row,
664 > proc_col, proc_row,
674 integer num_procs, num_proc_cols, num_proc_rows
678 c---------------------------------------------------------------------
679 c num_procs must be a power of 2, and num_procs=num_proc_cols*num_proc_rows
680 c When num_procs is not square, then num_proc_cols = 2*num_proc_rows
681 c---------------------------------------------------------------------
682 c First, number of procs must be power of two.
683 c---------------------------------------------------------------------
c
c Abort if the runtime process count does not match the compiled-in
c num_procs.
684 if( nprocs .ne. num_procs )then
685 if( me .eq. root ) write( *,9000 ) nprocs, num_procs
686 9000 format( /,'Error: ',/,'num of procs allocated (',
688 > /,'is not equal to',/,
689 > 'compiled number of procs (',
691 call mpi_finalize(ierr)
c
c Power-of-two checks: i is halved repeatedly; if an odd value > 1
c appears, the quantity was not a power of two -> report and abort.
698 if( i .ne. 1 .and. i/2*2 .ne. i )then
699 if ( me .eq. root ) then
700 write( *,* ) 'Error: num_proc_cols is ',
702 > ' which is not a power of two'
704 call mpi_finalize(ierr)
714 if( i .ne. 1 .and. i/2*2 .ne. i )then
715 if ( me .eq. root ) then
716 write( *,* ) 'Error: num_proc_rows is ',
718 > ' which is not a power of two'
720 call mpi_finalize(ierr)
731 if( i .ne. 1 .and. i/2*2 .ne. i )then
732 write( *,* ) 'Error: nprocs is ',
734 > ' which is not a power of two'
735 call mpi_finalize(ierr)
740 log2nprocs = log2nprocs + 1
744 CC write( *,* ) 'nprocs, log2nprocs: ',nprocs,log2nprocs
c
c Publish the grid dimensions to the common block.
747 npcols = num_proc_cols
748 nprows = num_proc_rows
757 c---------------------------------------------------------------------
758 c---------------------------------------------------------------------
759 subroutine setup_submatrix_info( l2npcols,
761 > reduce_send_starts,
762 > reduce_send_lengths,
763 > reduce_recv_starts,
764 > reduce_recv_lengths )
766 c---------------------------------------------------------------------
767 c---------------------------------------------------------------------
773 integer col_size, row_size
775 common / partit_size / naa, nzz,
777 > proc_col, proc_row,
788 > proc_col, proc_row,
798 integer reduce_exch_proc(*)
799 integer reduce_send_starts(*)
800 integer reduce_send_lengths(*)
801 integer reduce_recv_starts(*)
802 integer reduce_recv_lengths(*)
c
c Map rank me onto the 2-D (proc_row, proc_col) processor grid.
809 proc_row = me / npcols
810 proc_col = me - proc_row*npcols
814 c---------------------------------------------------------------------
815 c If naa evenly divisible by npcols, then it is evenly divisible
816 c by nprows as well, and the partition is uniform.
817 c---------------------------------------------------------------------
819 if( naa/npcols*npcols .eq. naa )then
820 col_size = naa/npcols
821 firstcol = proc_col*col_size + 1
822 lastcol = firstcol - 1 + col_size
823 row_size = naa/nprows
824 firstrow = proc_row*row_size + 1
825 lastrow = firstrow - 1 + row_size
826 c---------------------------------------------------------------------
827 c If naa not evenly divisible by npcols, then first subdivide for nprows
828 c and then, if npcols not equal to nprows (i.e., not a sq number of procs),
829 c get col subdivisions by dividing by 2 each row subdivision.
830 c---------------------------------------------------------------------
832 if( proc_row .lt. naa - naa/nprows*nprows)then
833 row_size = naa/nprows+ 1
834 firstrow = proc_row*row_size + 1
835 lastrow = firstrow - 1 + row_size
837 row_size = naa/nprows
838 firstrow = (naa - naa/nprows*nprows)*(row_size+1)
839 > + (proc_row-(naa-naa/nprows*nprows))
841 lastrow = firstrow - 1 + row_size
843 if( npcols .eq. nprows )then
844 if( proc_col .lt. naa - naa/npcols*npcols )then
845 col_size = naa/npcols+ 1
846 firstcol = proc_col*col_size + 1
847 lastcol = firstcol - 1 + col_size
849 col_size = naa/npcols
850 firstcol = (naa - naa/npcols*npcols)*(col_size+1)
851 > + (proc_col-(naa-naa/npcols*npcols))
853 lastcol = firstcol - 1 + col_size
c
c Non-square grid: column ranges are computed for npcols/2 and then
c split in half between even/odd ranks.
856 if( (proc_col/2) .lt.
857 > naa - naa/(npcols/2)*(npcols/2) )then
858 col_size = naa/(npcols/2) + 1
859 firstcol = (proc_col/2)*col_size + 1
860 lastcol = firstcol - 1 + col_size
862 col_size = naa/(npcols/2)
863 firstcol = (naa - naa/(npcols/2)*(npcols/2))
865 > + ((proc_col/2)-(naa-naa/(npcols/2)*(npcols/2)))
867 lastcol = firstcol - 1 + col_size
869 CC write( *,* ) col_size,firstcol,lastcol
870 if( mod( me,2 ) .eq. 0 )then
871 lastcol = firstcol - 1 + (col_size-1)/2 + 1
873 firstcol = firstcol + (col_size-1)/2 + 1
874 lastcol = firstcol - 1 + col_size/2
875 CC write( *,* ) firstcol,lastcol
c
c Send extents for the reduce phase: whole row block on a square
c grid, half of it (split by rank parity) otherwise.
882 if( npcols .eq. nprows )then
884 send_len = lastrow - firstrow + 1
886 if( mod( me,2 ) .eq. 0 )then
888 send_len = (1 + lastrow-firstrow+1)/2
890 send_start = (1 + lastrow-firstrow+1)/2 + 1
891 send_len = (lastrow-firstrow+1)/2
898 c---------------------------------------------------------------------
899 c Transpose exchange processor
900 c---------------------------------------------------------------------
902 if( npcols .eq. nprows )then
903 exch_proc = mod( me,nprows )*nprows + me/nprows
905 exch_proc = 2*(mod( me/2,nprows )*nprows + me/2/nprows)
c
c l2npcols accumulates log2(npcols), the number of reduce levels.
914 l2npcols = l2npcols + 1
919 c---------------------------------------------------------------------
920 c Set up the reduce phase schedules...
921 c---------------------------------------------------------------------
926 j = mod( proc_col+div_factor/2, div_factor )
927 > + proc_col / div_factor * div_factor
928 reduce_exch_proc(i) = proc_row*npcols + j
930 div_factor = div_factor / 2
935 do i = l2npcols, 1, -1
937 if( nprows .eq. npcols )then
938 reduce_send_starts(i) = send_start
939 reduce_send_lengths(i) = send_len
940 reduce_recv_lengths(i) = lastrow - firstrow + 1
942 reduce_recv_lengths(i) = send_len
943 if( i .eq. l2npcols )then
944 reduce_send_lengths(i) = lastrow-firstrow+1 - send_len
945 if( me/2*2 .eq. me )then
946 reduce_send_starts(i) = send_start + send_len
948 reduce_send_starts(i) = 1
951 reduce_send_lengths(i) = send_len
952 reduce_send_starts(i) = send_start
955 reduce_recv_starts(i) = send_start
c
c Length of the transpose exchange: this process's column count.
960 exch_recv_length = lastcol - firstcol + 1
969 c---------------------------------------------------------------------
970 c---------------------------------------------------------------------
971 subroutine conj_grad ( colidx,
983 > reduce_send_starts,
984 > reduce_send_lengths,
985 > reduce_recv_starts,
986 > reduce_recv_lengths )
987 c---------------------------------------------------------------------
988 c---------------------------------------------------------------------
990 c---------------------------------------------------------------------
991 c Floating point arrays here are named as in NPB1 spec discussion of
993 c---------------------------------------------------------------------
999 integer status(MPI_STATUS_SIZE ), request
1002 common / partit_size / naa, nzz,
1004 > proc_col, proc_row,
1015 > proc_col, proc_row,
1027 double precision x(*),
1030 integer colidx(nzz), rowstr(naa+1)
1032 double precision p(*),
1035 > w(*) ! used as work temporary
1038 integer reduce_exch_proc(l2npcols)
1039 integer reduce_send_starts(l2npcols)
1040 integer reduce_send_lengths(l2npcols)
1041 integer reduce_recv_starts(l2npcols)
1042 integer reduce_recv_lengths(l2npcols)
1044 integer i, j, k, ierr
1045 integer cgit, cgitmax
1047 double precision d, sum, rho, rho0, alpha, beta, rnorm
1050 double precision timer_read
1055 c---------------------------------------------------------------------
1056 c Initialize the CG algorithm:
1057 c---------------------------------------------------------------------
1067 c---------------------------------------------------------------------
1069 c Now, obtain the norm of r: First, sum squares of r elements locally...
1070 c---------------------------------------------------------------------
1072 do j=1, lastcol-firstcol+1
1073 sum = sum + r(j)*r(j)
1076 c---------------------------------------------------------------------
1077 c Exchange and sum with procs identified in reduce_exch_proc
1078 c (This is equivalent to mpi_allreduce.)
1079 c Sum the partial sums of rho, leaving rho on all processors
1080 c---------------------------------------------------------------------
1082 call mpi_irecv( rho,
1085 > reduce_exch_proc(i),
1093 > reduce_exch_proc(i),
1097 call mpi_wait( request, status, ierr )
1105 c---------------------------------------------------------------------
1107 c The conj grad iteration loop
1109 c---------------------------------------------------------------------
1110 do cgit = 1, cgitmax
1113 c---------------------------------------------------------------------
1115 c The partition submatrix-vector multiply: use workspace w
1116 c---------------------------------------------------------------------
1117 do j=1,lastrow-firstrow+1
1119 do k=rowstr(j),rowstr(j+1)-1
1120 sum = sum + a(k)*p(colidx(k))
1125 c---------------------------------------------------------------------
1126 c Sum the partition submatrix-vec A.p's across rows
1127 c Exchange and sum piece of w with procs identified in reduce_exch_proc
1128 c---------------------------------------------------------------------
1129 do i = l2npcols, 1, -1
1130 call mpi_irecv( q(reduce_recv_starts(i)),
1131 > reduce_recv_lengths(i),
1133 > reduce_exch_proc(i),
1138 call mpi_send( w(reduce_send_starts(i)),
1139 > reduce_send_lengths(i),
1141 > reduce_exch_proc(i),
1145 call mpi_wait( request, status, ierr )
1146 do j=send_start,send_start + reduce_recv_lengths(i) - 1
1152 c---------------------------------------------------------------------
1153 c Exchange piece of q with transpose processor:
1154 c---------------------------------------------------------------------
1155 if( l2npcols .ne. 0 )then
1165 call mpi_send( w(send_start),
1172 call mpi_wait( request, status, ierr )
1174 do j=1,exch_recv_length
1180 c---------------------------------------------------------------------
1181 c Clear w for reuse...
1182 c---------------------------------------------------------------------
1183 do j=1, max( lastrow-firstrow+1, lastcol-firstcol+1 )
1188 c---------------------------------------------------------------------
1190 c---------------------------------------------------------------------
c
c Local partial dot product p.q; combined by the reduce below.
1192 do j=1, lastcol-firstcol+1
1193 sum = sum + p(j)*q(j)
1196 c---------------------------------------------------------------------
1197 c Obtain d with a sum-reduce
1198 c---------------------------------------------------------------------
1203 > reduce_exch_proc(i),
1211 > reduce_exch_proc(i),
1216 call mpi_wait( request, status, ierr )
1223 c---------------------------------------------------------------------
1224 c Obtain alpha = rho / (p.q)
1225 c---------------------------------------------------------------------
1228 c---------------------------------------------------------------------
1229 c Save a temporary of rho
1230 c---------------------------------------------------------------------
1233 c---------------------------------------------------------------------
1234 c Obtain z = z + alpha*p
1235 c and r = r - alpha*q
1236 c---------------------------------------------------------------------
1237 do j=1, lastcol-firstcol+1
1238 z(j) = z(j) + alpha*p(j)
1239 r(j) = r(j) - alpha*q(j)
1242 c---------------------------------------------------------------------
1244 c Now, obtain the norm of r: First, sum squares of r elements locally...
1245 c---------------------------------------------------------------------
1247 do j=1, lastcol-firstcol+1
1248 sum = sum + r(j)*r(j)
1251 c---------------------------------------------------------------------
1252 c Obtain rho with a sum-reduce
1253 c---------------------------------------------------------------------
1255 call mpi_irecv( rho,
1258 > reduce_exch_proc(i),
1266 > reduce_exch_proc(i),
1270 call mpi_wait( request, status, ierr )
1276 c---------------------------------------------------------------------
1278 c---------------------------------------------------------------------
1281 c---------------------------------------------------------------------
1283 c---------------------------------------------------------------------
c
c p = r + beta*p : new search direction.
1284 do j=1, lastcol-firstcol+1
1285 p(j) = r(j) + beta*p(j)
1290 enddo ! end of do cgit=1,cgitmax
1294 c---------------------------------------------------------------------
1295 c Compute residual norm explicitly: ||r|| = ||x - A.z||
1297 c The partition submatrix-vector multiply
1298 c---------------------------------------------------------------------
1299 do j=1,lastrow-firstrow+1
1301 do k=rowstr(j),rowstr(j+1)-1
1302 sum = sum + a(k)*z(colidx(k))
1309 c---------------------------------------------------------------------
1310 c Sum the partition submatrix-vec A.z's across rows
1311 c---------------------------------------------------------------------
1312 do i = l2npcols, 1, -1
1313 call mpi_irecv( r(reduce_recv_starts(i)),
1314 > reduce_recv_lengths(i),
1316 > reduce_exch_proc(i),
1321 call mpi_send( w(reduce_send_starts(i)),
1322 > reduce_send_lengths(i),
1324 > reduce_exch_proc(i),
1328 call mpi_wait( request, status, ierr )
1330 do j=send_start,send_start + reduce_recv_lengths(i) - 1
1336 c---------------------------------------------------------------------
1337 c Exchange piece of q with transpose processor:
1338 c---------------------------------------------------------------------
1339 if( l2npcols .ne. 0 )then
1349 call mpi_send( w(send_start),
1356 call mpi_wait( request, status, ierr )
1358 do j=1,exch_recv_length
1364 c---------------------------------------------------------------------
1365 c At this point, r contains A.z
1366 c---------------------------------------------------------------------
1368 do j=1, lastcol-firstcol+1
1373 c---------------------------------------------------------------------
1374 c Obtain d with a sum-reduce
1375 c---------------------------------------------------------------------
1380 > reduce_exch_proc(i),
1388 > reduce_exch_proc(i),
1392 call mpi_wait( request, status, ierr )
c
c Only root computes the final residual norm from the reduced d.
1399 if( me .eq. root ) rnorm = sqrt( d )
1404 end ! end of routine conj_grad
1408 c---------------------------------------------------------------------
1409 c---------------------------------------------------------------------
1410 subroutine makea( n, nz, a, colidx, rowstr, nonzer,
1411 > firstrow, lastrow, firstcol, lastcol,
1412 > rcond, arow, acol, aelt, v, iv, shift )
1413 c---------------------------------------------------------------------
1414 c---------------------------------------------------------------------
1417 integer firstrow, lastrow, firstcol, lastcol
1418 integer colidx(nz), rowstr(n+1)
1419 integer iv(2*n+1), arow(nz), acol(nz)
1420 double precision v(n+1), aelt(nz)
1421 double precision rcond, a(nz), shift
1423 c---------------------------------------------------------------------
1424 c generate the test problem for benchmark 6
1425 c makea generates a sparse matrix with a
1426 c prescribed sparsity distribution
1428 c parameter type usage
1432 c n i number of cols/rows of matrix
1433 c nz i nonzeros as declared array size
1434 c rcond r*8 condition number
1435 c shift r*8 main diagonal shift
1439 c a r*8 array for nonzeros
1440 c colidx i col indices
1441 c rowstr i row pointers
1447 c---------------------------------------------------------------------
1449 integer i, nnza, iouter, ivelt, ivelt1, irow, nzv, NONZER
1451 c---------------------------------------------------------------------
1452 c nonzer is approximately (int(sqrt(nnza /n)));
1453 c---------------------------------------------------------------------
1455 double precision size, ratio, scale
1456 external sparse, sprnvc, vecset
c
c Geometric scaling so eigenvalues span the requested condition
c number rcond over n outer iterations.
1459 ratio = rcond ** (1.0D0 / dfloat(n))
1462 c---------------------------------------------------------------------
1463 c Initialize iv(n+1 .. 2n) to zero.
1464 c Used by sprnvc to mark nonzero positions
1465 c---------------------------------------------------------------------
c
c Generate a sparse random vector per outer iteration and keep the
c triples (arow, acol, aelt) that fall in this partition's block.
1472 call sprnvc( n, nzv, v, colidx, iv(1), iv(n+1) )
1473 call vecset( n, v, colidx, nzv, iouter, .5D0 )
1475 jcol = colidx(ivelt)
1476 if (jcol.ge.firstcol .and. jcol.le.lastcol) then
1477 scale = size * v(ivelt)
1479 irow = colidx(ivelt1)
1480 if (irow.ge.firstrow .and. irow.le.lastrow) then
1482 if (nnza .gt. nz) goto 9999
1485 aelt(nnza) = v(ivelt1) * scale
1494 c---------------------------------------------------------------------
1495 c ... add the identity * rcond to the generated matrix to bound
1496 c the smallest eigenvalue from below by rcond
1497 c---------------------------------------------------------------------
1498 do i = firstrow, lastrow
1499 if (i.ge.firstcol .and. i.le.lastcol) then
1502 if (nnza .gt. nz) goto 9999
1505 aelt(nnza) = rcond - shift
1510 c---------------------------------------------------------------------
1511 c ... make the sparse matrix from list of elements with duplicates
1512 c (v and iv are used as workspace)
1513 c---------------------------------------------------------------------
1514 call sparse( a, colidx, rowstr, n, arow, acol, aelt,
1515 > firstrow, lastrow,
1516 > v, iv(1), iv(n+1), nnza )
c
c 9999: overflow of the nz bound detected above -> diagnostic exit.
1520 write(*,*) 'Space for matrix elements exceeded in makea'
1521 write(*,*) 'nnza, nzmax = ',nnza, nz
1522 write(*,*) ' iouter = ',iouter
1526 c-------end of makea------------------------------
1528 c---------------------------------------------------------------------
1529 c---------------------------------------------------------------------
1530 subroutine sparse( a, colidx, rowstr, n, arow, acol, aelt,
1531 > firstrow, lastrow,
1532 > x, mark, nzloc, nnza )
1533 c---------------------------------------------------------------------
c
c  Assemble a CSR (compressed sparse row) matrix (a, colidx, rowstr)
c  from the nnza [row, col, element] triples in (arow, acol, aelt),
c  merging duplicate entries that fall on the same (row, col) position.
c
c  a, colidx, rowstr : output CSR matrix (values, column indices,
c                      row-start pointers)
c  n                 : order of the matrix / length of scratch arrays
c  arow, acol, aelt  : nnza input triples, possibly with duplicates
c  firstrow, lastrow : global indices of the rows handled here
c  x, mark, nzloc    : scratch workspace (dense row accumulator,
c                      occupancy flags, positions touched in a row)
c
c  NOTE(review): several statements of this routine are elided from
c  this view (loop headers, mark declaration, endifs); comments below
c  describe only what the visible lines establish.
c
1534 c---------------------------------------------------------------------
1536 implicit logical (a-z)
1537 integer colidx(*), rowstr(*)
1538 integer firstrow, lastrow
1539 integer n, arow(*), acol(*), nnza
1540 double precision a(*), aelt(*)
1542 c---------------------------------------------------------------------
1543 c rows range from firstrow to lastrow
1544 c the rowstr pointers are defined for nrows = lastrow-firstrow+1 values
1545 c---------------------------------------------------------------------
1546 integer nzloc(n), nrows
1547 double precision x(n)
1550 c---------------------------------------------------
1551 c generate a sparse matrix from a list of
1552 c [col, row, element] triples
1553 c---------------------------------------------------
1555 integer i, j, jajp1, nza, k, nzrow
1558 c---------------------------------------------------------------------
1559 c how many rows of result
1560 c---------------------------------------------------------------------
1561 nrows = lastrow - firstrow + 1
1563 c---------------------------------------------------------------------
1564 c ...count the number of triples in each row
1565 c---------------------------------------------------------------------
c     histogram pass: the count for local row (arow(nza)-firstrow+1)
c     is kept one slot higher (the extra "+ 1") so that the prefix
c     sum below lands row starts directly in rowstr
1573 j = (arow(nza) - firstrow + 1) + 1
1574 rowstr(j) = rowstr(j) + 1
c     running prefix sum: per-row counts become row-start offsets
1579 rowstr(j) = rowstr(j) + rowstr(j-1)
1583 c---------------------------------------------------------------------
1584 c ... rowstr(j) now is the location of the first nonzero
1586 c---------------------------------------------------------------------
1589 c---------------------------------------------------------------------
1590 c ... do a bucket sort of the triples on the row index
1591 c---------------------------------------------------------------------
1593 j = arow(nza) - firstrow + 1
c     k holds the next free slot of row j (its assignment from
c     rowstr(j) is elided from this view); bumping rowstr(j) advances
c     the slot, so after this loop rowstr(j) points past row j
1596 colidx(k) = acol(nza)
1597 rowstr(j) = rowstr(j) + 1
1601 c---------------------------------------------------------------------
1602 c ... rowstr(j) now points to the first element of row j+1
1603 c---------------------------------------------------------------------
c     shift the pointers back by one row to restore row-start values
1605 rowstr(j+1) = rowstr(j)
1610 c---------------------------------------------------------------------
1611 c ... generate the actual output rows by adding elements
1612 c---------------------------------------------------------------------
1623 c---------------------------------------------------------------------
1624 c ...loop over the jth row of a
1625 c---------------------------------------------------------------------
1626 do k = jajp1 , rowstr(j+1)-1
c     x appears to act as a dense accumulator indexed by column i;
c     a column is recorded (presumably in nzloc) only the first time
c     it becomes nonzero - mark(i) guards against recording it twice.
c     The accumulation statement itself is elided; confirm against
c     the full source.
1629 if ( (.not. mark(i)) .and. (x(i) .ne. 0.D0)) then
1636 c---------------------------------------------------------------------
1637 c ... extract the nonzeros of this row
1638 c---------------------------------------------------------------------
c     only nonzero accumulated values are copied to the output row
1644 if (xi .ne. 0.D0) then
c     final row pointer: total nonzeros written, rebased on rowstr(1)
1651 rowstr(j+1) = nza + rowstr(1)
1653 CC write (*, 11000) nza
1655 11000 format ( //,'final nonzero count in sparse ',
1656 1 /,'number of nonzeros = ', i16 )
1658 c-------end of sparse-----------------------------
1661 c---------------------------------------------------------------------
1662 c---------------------------------------------------------------------
1663 subroutine sprnvc( n, nz, v, iv, nzloc, mark )
1664 c---------------------------------------------------------------------
1665 c---------------------------------------------------------------------
c
c  Build a sparse random n-vector with nz nonzeros: v receives the
c  values, iv the corresponding indices, both drawn from the randlc
c  pseudo-random stream (state tran, multiplier amult, shared through
c  common /urando/).  mark(i) /= 0 flags that position i is already
c  taken; nzloc is scratch - presumably it records the marked
c  positions so mark can be reset cheaply on exit (see the comment at
c  1680-1682), but the reset loop is elided from this view.
c
c  NOTE(review): the loop headers at labels 50 and 100 and the exit
c  code at label 110 are elided here.
c
1667 implicit logical (a-z)
1668 double precision v(*)
1669 integer n, nz, iv(*), nzloc(n), nn1
1671 common /urando/ amult, tran
1672 double precision amult, tran
1675 c---------------------------------------------------------------------
1676 c generate a sparse n-vector (v, iv)
1677 c having nzv nonzeros
1679 c mark(i) is set to 1 if position i is nonzero.
1680 c mark is all zero on entry and is reset to all zero before exit
1681 c this corrects a performance bug found by John G. Lewis, caused by
1682 c reinitialization of mark on every one of the n calls to sprnvc
1683 c---------------------------------------------------------------------
1685 integer nzrow, nzv, ii, i, icnvrt
1687 external randlc, icnvrt
1688 double precision randlc, vecelt, vecloc
c     keep doubling nn1 (loop head at label 50 is elided) until it is
c     the smallest power of two not less than n
1696 if (nn1 .lt. n) goto 50
1698 c---------------------------------------------------------------------
1699 c nn1 is the smallest power of two not less than n
1700 c---------------------------------------------------------------------
c     main generation loop (label 100, elided): draw candidates until
c     nzv reaches the requested nz distinct positions
1703 if (nzv .ge. nz) goto 110
1704 vecelt = randlc( tran, amult )
1706 c---------------------------------------------------------------------
1707 c generate an integer between 1 and n in a portable manner
1708 c---------------------------------------------------------------------
1709 vecloc = randlc(tran, amult)
1710 i = icnvrt(vecloc, nn1) + 1
c     rejection sampling: draws in (n, nn1] are discarded so the
c     accepted index is uniform on 1..n
1711 if (i .gt. n) goto 100
1713 c---------------------------------------------------------------------
1714 c was this integer generated already?
1715 c---------------------------------------------------------------------
c     duplicates are skipped; only a fresh position is stored
1716 if (mark(i) .eq. 0) then
1732 c-------end of sprnvc-----------------------------
1735 c---------------------------------------------------------------------
1736 c---------------------------------------------------------------------
1737 function icnvrt(x, ipwr2)
1738 c---------------------------------------------------------------------
1739 c---------------------------------------------------------------------
1741 implicit logical (a-z)
1743 integer ipwr2, icnvrt
c     NOTE(review): the declaration of x is elided from this view;
c     the comment below implies it is double precision - confirm.
1745 c---------------------------------------------------------------------
1746 c scale a double precision number x in (0,1) by a power of 2 and chop it
1747 c---------------------------------------------------------------------
c     for x in (0,1) the result lies in 0 .. ipwr2-1; int() truncates
c     toward zero (the return/end statements are elided here)
1748 icnvrt = int(ipwr2 * x)
1752 c-------end of icnvrt-----------------------------
1755 c---------------------------------------------------------------------
1756 c---------------------------------------------------------------------
1757 subroutine vecset(n, v, iv, nzv, i, val)
1758 c---------------------------------------------------------------------
1759 c---------------------------------------------------------------------
1761 implicit logical (a-z)
1762 integer n, iv(*), nzv, i, k
1763 double precision v(*), val
1765 c---------------------------------------------------------------------
1766 c set ith element of sparse vector (v, iv) with
1767 c nzv nonzeros to val
1768 c---------------------------------------------------------------------
c     scan the existing nzv entries for index i; when found the value
c     is overwritten in place.  NOTE(review): the scan loop header,
c     the overwrite statement, and the append path taken when i is
c     not present are elided from this view - confirm that a missing
c     index appends (v(nzv+1), iv(nzv+1)) and increments nzv.
1774 if (iv(k) .eq. i) then
1786 c-------end of vecset-----------------------------