!-------------------------------------------------------------------------!
!                                                                         !
!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
!                                                                         !
!                                   C G                                   !
!                                                                         !
!-------------------------------------------------------------------------!
!                                                                         !
!    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
!    It is described in NAS Technical Reports 95-020 and 02-007           !
!                                                                         !
!    Permission to use, copy, distribute and modify this software         !
!    for any purpose with or without fee is hereby granted.  We           !
!    request, however, that all derived work reference the NAS            !
!    Parallel Benchmarks 3.3. This software is provided "as is"           !
!    without express or implied warranty.                                 !
!                                                                         !
!    Information on NPB 3.3, including the technical report, the          !
!    original specifications, source code, results and information        !
!    on how to submit new results, is available at:                       !
!                                                                         !
!           http://www.nas.nasa.gov/Software/NPB/                         !
!                                                                         !
!    Send comments or suggestions to  npb@nas.nasa.gov                    !
!                                                                         !
!          NAS Parallel Benchmarks Group                                  !
!          NASA Ames Research Center                                      !
!          Mail Stop: T27A-1                                              !
!          Moffett Field, CA   94035-1000                                 !
!                                                                         !
!          E-mail:  npb@nas.nasa.gov                                      !
!          Fax:     (650) 604-3957                                        !
!                                                                         !
!-------------------------------------------------------------------------!


c---------------------------------------------------------------------
c
c Authors: M. Yarrow
c          C. Kuszmaul
c          R. F. Van der Wijngaart
c          H. Jin
c
c---------------------------------------------------------------------


c---------------------------------------------------------------------
c---------------------------------------------------------------------
      program cg
c---------------------------------------------------------------------
c---------------------------------------------------------------------


      implicit none

      include 'mpinpb.h'
      integer status(MPI_STATUS_SIZE), request, ierr

      include 'npbparams.h'

c---------------------------------------------------------------------
c  num_procs must be a power of 2, and num_procs=num_proc_cols*num_proc_rows.
c  num_proc_cols and num_proc_cols are to be found in npbparams.h.
c  When num_procs is not square, then num_proc_cols must be = 2*num_proc_rows.
c---------------------------------------------------------------------
      integer    num_procs 
      parameter( num_procs = num_proc_cols * num_proc_rows )


c---------------------------------------------------------------------
c  Class specific parameters: 
c  It appears here for reference only.
c  These are their values, however, this info is imported in the npbparams.h
c  include file, which is written by the sys/setparams.c program.
c---------------------------------------------------------------------

C----------
C  Class S:
C----------
CC       parameter( na=1400, 
CC      >           nonzer=7, 
CC      >           shift=10., 
CC      >           niter=15,
CC      >           rcond=1.0d-1 )
C----------
C  Class W:
C----------
CC       parameter( na=7000,
CC      >           nonzer=8, 
CC      >           shift=12., 
CC      >           niter=15,
CC      >           rcond=1.0d-1 )
C----------
C  Class A:
C----------
CC       parameter( na=14000,
CC      >           nonzer=11, 
CC      >           shift=20., 
CC      >           niter=15,
CC      >           rcond=1.0d-1 )
C----------
C  Class B:
C----------
CC       parameter( na=75000, 
CC      >           nonzer=13, 
CC      >           shift=60., 
CC      >           niter=75,
CC      >           rcond=1.0d-1 )
C----------
C  Class C:
C----------
CC       parameter( na=150000, 
CC      >           nonzer=15, 
CC      >           shift=110., 
CC      >           niter=75,
CC      >           rcond=1.0d-1 )
C----------
C  Class D:
C----------
CC       parameter( na=1500000, 
CC      >           nonzer=21, 
CC      >           shift=500., 
CC      >           niter=100,
CC      >           rcond=1.0d-1 )
C----------
C  Class E:
C----------
CC       parameter( na=9000000, 
CC      >           nonzer=26, 
CC      >           shift=1500., 
CC      >           niter=100,
CC      >           rcond=1.0d-1 )


      integer    nz
      parameter( nz = na*(nonzer+1)/num_procs*(nonzer+1)+nonzer
     >              + na*(nonzer+2+num_procs/256)/num_proc_cols )


      common / partit_size  /  naa, nzz, 
     >                         npcols, nprows,
     >                         proc_col, proc_row,
     >                         firstrow, 
     >                         lastrow, 
     >                         firstcol, 
     >                         lastcol,
     >                         exch_proc,
     >                         exch_recv_length,
     >                         send_start,
     >                         send_len
      integer                  naa, nzz, 
     >                         npcols, nprows,
     >                         proc_col, proc_row,
     >                         firstrow, 
     >                         lastrow, 
     >                         firstcol, 
     >                         lastcol,
     >                         exch_proc,
     >                         exch_recv_length,
     >                         send_start,
     >                         send_len


      common / main_int_mem /  colidx,     rowstr,
     >                         iv,         arow,     acol
      integer                  colidx(nz), rowstr(na+1),
     >                         iv(2*na+1), arow(nz), acol(nz)


      common / main_flt_mem /  v,       aelt,     a,
     >                         x,
     >                         z,
     >                         p,
     >                         q,
     >                         r,
     >                         w
      double precision         v(na+1), aelt(nz), a(nz),
     >                         x(na/num_proc_rows+2),
     >                         z(na/num_proc_rows+2),
     >                         p(na/num_proc_rows+2),
     >                         q(na/num_proc_rows+2),
     >                         r(na/num_proc_rows+2),
     >                         w(na/num_proc_rows+2)


      common /urando/          amult, tran
      double precision         amult, tran


      integer            l2npcols
      integer            reduce_exch_proc(num_proc_cols)
      integer            reduce_send_starts(num_proc_cols)
      integer            reduce_send_lengths(num_proc_cols)
      integer            reduce_recv_starts(num_proc_cols)
      integer            reduce_recv_lengths(num_proc_cols)

      integer            i, j, k, it

      double precision   zeta, randlc
      external           randlc
      double precision   rnorm
      double precision   norm_temp1(2), norm_temp2(2)

      double precision   t, tmax, mflops
      external           timer_read
      double precision   timer_read
      character          class
      logical            verified
      double precision   zeta_verify_value, epsilon, err


c---------------------------------------------------------------------
c  Set up mpi initialization and number of proc testing
c---------------------------------------------------------------------
      call initialize_mpi


      if( na .eq. 1400 .and. 
     &    nonzer .eq. 7 .and. 
     &    niter .eq. 15 .and.
     &    shift .eq. 10.d0 ) then
         class = 'S'
         zeta_verify_value = 8.5971775078648d0
      else if( na .eq. 7000 .and. 
     &         nonzer .eq. 8 .and. 
     &         niter .eq. 15 .and.
     &         shift .eq. 12.d0 ) then
         class = 'W'
         zeta_verify_value = 10.362595087124d0
      else if( na .eq. 14000 .and. 
     &         nonzer .eq. 11 .and. 
     &         niter .eq. 15 .and.
     &         shift .eq. 20.d0 ) then
         class = 'A'
         zeta_verify_value = 17.130235054029d0
      else if( na .eq. 75000 .and. 
     &         nonzer .eq. 13 .and. 
     &         niter .eq. 75 .and.
     &         shift .eq. 60.d0 ) then
         class = 'B'
         zeta_verify_value = 22.712745482631d0
      else if( na .eq. 150000 .and. 
     &         nonzer .eq. 15 .and. 
     &         niter .eq. 75 .and.
     &         shift .eq. 110.d0 ) then
         class = 'C'
         zeta_verify_value = 28.973605592845d0
      else if( na .eq. 1500000 .and. 
     &         nonzer .eq. 21 .and. 
     &         niter .eq. 100 .and.
     &         shift .eq. 500.d0 ) then
         class = 'D'
         zeta_verify_value = 52.514532105794d0
      else if( na .eq. 9000000 .and. 
     &         nonzer .eq. 26 .and. 
     &         niter .eq. 100 .and.
     &         shift .eq. 1.5d3 ) then
         class = 'E'
         zeta_verify_value = 77.522164599383d0
      else
         class = 'U'
      endif

      if( me .eq. root )then
         write( *,1000 ) 
         write( *,1001 ) na
         write( *,1002 ) niter
         write( *,1003 ) nprocs
         write( *,1004 ) nonzer
         write( *,1005 ) shift
 1000 format(//,' NAS Parallel Benchmarks 3.3 -- CG Benchmark', /)
 1001 format(' Size: ', i10 )
 1002 format(' Iterations: ', i5 )
 1003 format(' Number of active processes: ', i5 )
 1004 format(' Number of nonzeroes per row: ', i8)
 1005 format(' Eigenvalue shift: ', e8.3)
      endif

      if (.not. convertdouble) then
         dp_type = MPI_DOUBLE_PRECISION
      else
         dp_type = MPI_REAL
      endif


      naa = na
      nzz = nz


c---------------------------------------------------------------------
c  Set up processor info, such as whether sq num of procs, etc
c---------------------------------------------------------------------
      call setup_proc_info( num_procs, 
     >                      num_proc_rows, 
     >                      num_proc_cols )


c---------------------------------------------------------------------
c  Set up partition's submatrix info: firstcol, lastcol, firstrow, lastrow
c---------------------------------------------------------------------
      call setup_submatrix_info( l2npcols,
     >                           reduce_exch_proc,
     >                           reduce_send_starts,
     >                           reduce_send_lengths,
     >                           reduce_recv_starts,
     >                           reduce_recv_lengths )


c---------------------------------------------------------------------
c  Inialize random number generator
c---------------------------------------------------------------------
      tran    = 314159265.0D0
      amult   = 1220703125.0D0
      zeta    = randlc( tran, amult )

c---------------------------------------------------------------------
c  Set up partition's sparse random matrix for given class size
c---------------------------------------------------------------------
      call makea(naa, nzz, a, colidx, rowstr, nonzer,
     >           firstrow, lastrow, firstcol, lastcol, 
     >           rcond, arow, acol, aelt, v, iv, shift)


c---------------------------------------------------------------------
c  Note: as a result of the above call to makea:
c        values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1
c        values of colidx which are col indexes go from firstcol --> lastcol
c        So:
c        Shift the col index vals from actual (firstcol --> lastcol ) 
c        to local, i.e., (1 --> lastcol-firstcol+1)
c---------------------------------------------------------------------
      do j=1,lastrow-firstrow+1
         do k=rowstr(j),rowstr(j+1)-1
            colidx(k) = colidx(k) - firstcol + 1
         enddo
      enddo

c---------------------------------------------------------------------
c  set starting vector to (1, 1, .... 1)
c---------------------------------------------------------------------
      do i = 1, na/num_proc_rows+1
         x(i) = 1.0D0
      enddo

      zeta  = 0.0d0

c---------------------------------------------------------------------
c---->
c  Do one iteration untimed to init all code and data page tables
c---->                    (then reinit, start timing, to niter its)
c---------------------------------------------------------------------
      do it = 1, 1

c---------------------------------------------------------------------
c  The call to the conjugate gradient routine:
c---------------------------------------------------------------------
         call conj_grad ( colidx,
     >                    rowstr,
     >                    x,
     >                    z,
     >                    a,
     >                    p,
     >                    q,
     >                    r,
     >                    w,
     >                    rnorm, 
     >                    l2npcols,
     >                    reduce_exch_proc,
     >                    reduce_send_starts,
     >                    reduce_send_lengths,
     >                    reduce_recv_starts,
     >                    reduce_recv_lengths )

c---------------------------------------------------------------------
c  zeta = shift + 1/(x.z)
c  So, first: (x.z)
c  Also, find norm of z
c  So, first: (z.z)
c---------------------------------------------------------------------
         norm_temp1(1) = 0.0d0
         norm_temp1(2) = 0.0d0
         do j=1, lastcol-firstcol+1
            norm_temp1(1) = norm_temp1(1) + x(j)*z(j)
            norm_temp1(2) = norm_temp1(2) + z(j)*z(j)
         enddo

         do i = 1, l2npcols
            call mpi_irecv( norm_temp2,
     >                      2, 
     >                      dp_type,
     >                      reduce_exch_proc(i),
     >                      i,
     >                      mpi_comm_world,
     >                      request,
     >                      ierr )
            call mpi_send(  norm_temp1,
     >                      2, 
     >                      dp_type,
     >                      reduce_exch_proc(i),
     >                      i,
     >                      mpi_comm_world,
     >                      ierr )
            call mpi_wait( request, status, ierr )

            norm_temp1(1) = norm_temp1(1) + norm_temp2(1)
            norm_temp1(2) = norm_temp1(2) + norm_temp2(2)
         enddo

         norm_temp1(2) = 1.0d0 / sqrt( norm_temp1(2) )


c---------------------------------------------------------------------
c  Normalize z to obtain x
c---------------------------------------------------------------------
         do j=1, lastcol-firstcol+1      
            x(j) = norm_temp1(2)*z(j)    
         enddo                           


      enddo                              ! end of do one iteration untimed


c---------------------------------------------------------------------
c  set starting vector to (1, 1, .... 1)
c---------------------------------------------------------------------
c
c  NOTE: a questionable limit on size:  should this be na/num_proc_cols+1 ?
c
      do i = 1, na/num_proc_rows+1
         x(i) = 1.0D0
      enddo

      zeta  = 0.0d0

c---------------------------------------------------------------------
c  Synchronize and start timing
c---------------------------------------------------------------------
      call mpi_barrier( mpi_comm_world,
     >                  ierr )

      call timer_clear( 1 )
      call timer_start( 1 )

c---------------------------------------------------------------------
c---->
c  Main Iteration for inverse power method
c---->
c---------------------------------------------------------------------
      do it = 1, niter

c---------------------------------------------------------------------
c  The call to the conjugate gradient routine:
c---------------------------------------------------------------------
         call conj_grad ( colidx,
     >                    rowstr,
     >                    x,
     >                    z,
     >                    a,
     >                    p,
     >                    q,
     >                    r,
     >                    w,
     >                    rnorm, 
     >                    l2npcols,
     >                    reduce_exch_proc,
     >                    reduce_send_starts,
     >                    reduce_send_lengths,
     >                    reduce_recv_starts,
     >                    reduce_recv_lengths )


c---------------------------------------------------------------------
c  zeta = shift + 1/(x.z)
c  So, first: (x.z)
c  Also, find norm of z
c  So, first: (z.z)
c---------------------------------------------------------------------
         norm_temp1(1) = 0.0d0
         norm_temp1(2) = 0.0d0
         do j=1, lastcol-firstcol+1
            norm_temp1(1) = norm_temp1(1) + x(j)*z(j)
            norm_temp1(2) = norm_temp1(2) + z(j)*z(j)
         enddo

         do i = 1, l2npcols
            call mpi_irecv( norm_temp2,
     >                      2, 
     >                      dp_type,
     >                      reduce_exch_proc(i),
     >                      i,
     >                      mpi_comm_world,
     >                      request,
     >                      ierr )
            call mpi_send(  norm_temp1,
     >                      2, 
     >                      dp_type,
     >                      reduce_exch_proc(i),
     >                      i,
     >                      mpi_comm_world,
     >                      ierr )
            call mpi_wait( request, status, ierr )

            norm_temp1(1) = norm_temp1(1) + norm_temp2(1)
            norm_temp1(2) = norm_temp1(2) + norm_temp2(2)
         enddo

         norm_temp1(2) = 1.0d0 / sqrt( norm_temp1(2) )


         if( me .eq. root )then
            zeta = shift + 1.0d0 / norm_temp1(1)
            if( it .eq. 1 ) write( *,9000 )
            write( *,9001 ) it, rnorm, zeta
         endif
 9000 format( /,'   iteration           ||r||                 zeta' )
 9001 format( 4x, i5, 7x, e20.14, f20.13 )

c---------------------------------------------------------------------
c  Normalize z to obtain x
c---------------------------------------------------------------------
         do j=1, lastcol-firstcol+1      
            x(j) = norm_temp1(2)*z(j)    
         enddo                           


      enddo                              ! end of main iter inv pow meth

      call timer_stop( 1 )

c---------------------------------------------------------------------
c  End of timed section
c---------------------------------------------------------------------

      t = timer_read( 1 )

      call mpi_reduce( t,
     >                 tmax,
     >                 1, 
     >                 dp_type,
     >                 MPI_MAX,
     >                 root,
     >                 mpi_comm_world,
     >                 ierr )

      if( me .eq. root )then
         write(*,100)
 100     format(' Benchmark completed ')

         epsilon = 1.d-10
         if (class .ne. 'U') then

            err = abs( zeta - zeta_verify_value )/zeta_verify_value
            if( err .le. epsilon ) then
               verified = .TRUE.
               write(*, 200)
               write(*, 201) zeta
               write(*, 202) err
 200           format(' VERIFICATION SUCCESSFUL ')
 201           format(' Zeta is    ', E20.13)
 202           format(' Error is   ', E20.13)
            else
               verified = .FALSE.
               write(*, 300) 
               write(*, 301) zeta
               write(*, 302) zeta_verify_value
 300           format(' VERIFICATION FAILED')
 301           format(' Zeta                ', E20.13)
 302           format(' The correct zeta is ', E20.13)
            endif
         else
            verified = .FALSE.
            write (*, 400)
            write (*, 401)
            write (*, 201) zeta
 400        format(' Problem size unknown')
 401        format(' NO VERIFICATION PERFORMED')
         endif


         if( tmax .ne. 0. ) then
            mflops = float( 2*niter*na )
     &                  * ( 3.+float( nonzer*(nonzer+1) )
     &                    + 25.*(5.+float( nonzer*(nonzer+1) ))
     &                    + 3. ) / tmax / 1000000.0
         else
            mflops = 0.0
         endif

         call print_results('CG', class, na, 0, 0,
     >                      niter, nnodes_compiled, nprocs, tmax,
     >                      mflops, '          floating point', 
     >                      verified, npbversion, compiletime,
     >                      cs1, cs2, cs3, cs4, cs5, cs6, cs7)


      endif


      call mpi_finalize(ierr)


      end                              ! end main


c---------------------------------------------------------------------
c---------------------------------------------------------------------
      subroutine initialize_mpi
c---------------------------------------------------------------------
c---------------------------------------------------------------------

      implicit none

      include 'mpinpb.h'

      integer   ierr


      call mpi_init( ierr )
      call mpi_comm_rank( mpi_comm_world, me, ierr )
      call mpi_comm_size( mpi_comm_world, nprocs, ierr )
      root = 0


      return
      end


c---------------------------------------------------------------------
c---------------------------------------------------------------------
      subroutine setup_proc_info( num_procs, 
     >                            num_proc_rows, 
     >                            num_proc_cols )
c---------------------------------------------------------------------
c---------------------------------------------------------------------

      implicit none

      include 'mpinpb.h'

      common / partit_size  /  naa, nzz, 
     >                         npcols, nprows,
     >                         proc_col, proc_row,
     >                         firstrow, 
     >                         lastrow, 
     >                         firstcol, 
     >                         lastcol,
     >                         exch_proc,
     >                         exch_recv_length,
     >                         send_start,
     >                         send_len
      integer                  naa, nzz, 
     >                         npcols, nprows,
     >                         proc_col, proc_row,
     >                         firstrow, 
     >                         lastrow, 
     >                         firstcol, 
     >                         lastcol,
     >                         exch_proc,
     >                         exch_recv_length,
     >                         send_start,
     >                         send_len

      integer   num_procs, num_proc_cols, num_proc_rows
      integer   i, ierr
      integer   log2nprocs

c---------------------------------------------------------------------
c  num_procs must be a power of 2, and num_procs=num_proc_cols*num_proc_rows
c  When num_procs is not square, then num_proc_cols = 2*num_proc_rows
c---------------------------------------------------------------------
c  First, number of procs must be power of two. 
c---------------------------------------------------------------------
      if( nprocs .ne. num_procs )then
          if( me .eq. root ) write( *,9000 ) nprocs, num_procs
 9000     format(      /,'Error: ',/,'num of procs allocated   (', 
     >                 i4, ' )',
     >                 /,'is not equal to',/,
     >                 'compiled number of procs (',
     >                 i4, ' )',/   )
          call mpi_finalize(ierr)
          stop
      endif


      i = num_proc_cols
 100  continue
          if( i .ne. 1 .and. i/2*2 .ne. i )then
              if ( me .eq. root ) then  
                 write( *,* ) 'Error: num_proc_cols is ',
     >                         num_proc_cols,
     >                        ' which is not a power of two'
              endif
              call mpi_finalize(ierr)
              stop
          endif
          i = i / 2
          if( i .ne. 0 )then
              goto 100
          endif
      
      i = num_proc_rows
 200  continue
          if( i .ne. 1 .and. i/2*2 .ne. i )then
              if ( me .eq. root ) then 
                 write( *,* ) 'Error: num_proc_rows is ',
     >                         num_proc_rows,
     >                        ' which is not a power of two'
              endif
              call mpi_finalize(ierr)
              stop
          endif
          i = i / 2
          if( i .ne. 0 )then
              goto 200
          endif
      
      log2nprocs = 0
      i = nprocs
 300  continue
          if( i .ne. 1 .and. i/2*2 .ne. i )then
              write( *,* ) 'Error: nprocs is ',
     >                      nprocs,
     >                      ' which is not a power of two'
              call mpi_finalize(ierr)
              stop
          endif
          i = i / 2
          if( i .ne. 0 )then
              log2nprocs = log2nprocs + 1
              goto 300
          endif

CC       write( *,* ) 'nprocs, log2nprocs: ',nprocs,log2nprocs

      
      npcols = num_proc_cols
      nprows = num_proc_rows


      return
      end


c---------------------------------------------------------------------
c---------------------------------------------------------------------
      subroutine setup_submatrix_info( l2npcols,
     >                                 reduce_exch_proc,
     >                                 reduce_send_starts,
     >                                 reduce_send_lengths,
     >                                 reduce_recv_starts,
     >                                 reduce_recv_lengths )
     >                                 
c---------------------------------------------------------------------
c---------------------------------------------------------------------

      implicit none

      include 'mpinpb.h'

      integer      col_size, row_size

      common / partit_size  /  naa, nzz, 
     >                         npcols, nprows,
     >                         proc_col, proc_row,
     >                         firstrow, 
     >                         lastrow, 
     >                         firstcol, 
     >                         lastcol,
     >                         exch_proc,
     >                         exch_recv_length,
     >                         send_start,
     >                         send_len
      integer                  naa, nzz, 
     >                         npcols, nprows,
     >                         proc_col, proc_row,
     >                         firstrow, 
     >                         lastrow, 
     >                         firstcol, 
     >                         lastcol,
     >                         exch_proc,
     >                         exch_recv_length,
     >                         send_start,
     >                         send_len

      integer   reduce_exch_proc(*)
      integer   reduce_send_starts(*)
      integer   reduce_send_lengths(*)
      integer   reduce_recv_starts(*)
      integer   reduce_recv_lengths(*)

      integer   i, j
      integer   div_factor
      integer   l2npcols


      proc_row = me / npcols
      proc_col = me - proc_row*npcols


c---------------------------------------------------------------------
c  If naa evenly divisible by npcols, then it is evenly divisible 
c  by nprows 
c---------------------------------------------------------------------

      if( naa/npcols*npcols .eq. naa )then
          col_size = naa/npcols
          firstcol = proc_col*col_size + 1
          lastcol  = firstcol - 1 + col_size
          row_size = naa/nprows
          firstrow = proc_row*row_size + 1
          lastrow  = firstrow - 1 + row_size
c---------------------------------------------------------------------
c  If naa not evenly divisible by npcols, then first subdivide for nprows
c  and then, if npcols not equal to nprows (i.e., not a sq number of procs), 
c  get col subdivisions by dividing by 2 each row subdivision.
c---------------------------------------------------------------------
      else
          if( proc_row .lt. naa - naa/nprows*nprows)then
              row_size = naa/nprows+ 1
              firstrow = proc_row*row_size + 1
              lastrow  = firstrow - 1 + row_size
          else
              row_size = naa/nprows
              firstrow = (naa - naa/nprows*nprows)*(row_size+1)
     >                 + (proc_row-(naa-naa/nprows*nprows))
     >                     *row_size + 1
              lastrow  = firstrow - 1 + row_size
          endif
          if( npcols .eq. nprows )then
              if( proc_col .lt. naa - naa/npcols*npcols )then
                  col_size = naa/npcols+ 1
                  firstcol = proc_col*col_size + 1
                  lastcol  = firstcol - 1 + col_size
              else
                  col_size = naa/npcols
                  firstcol = (naa - naa/npcols*npcols)*(col_size+1)
     >                     + (proc_col-(naa-naa/npcols*npcols))
     >                         *col_size + 1
                  lastcol  = firstcol - 1 + col_size
              endif
          else
              if( (proc_col/2) .lt. 
     >                           naa - naa/(npcols/2)*(npcols/2) )then
                  col_size = naa/(npcols/2) + 1
                  firstcol = (proc_col/2)*col_size + 1
                  lastcol  = firstcol - 1 + col_size
              else
                  col_size = naa/(npcols/2)
                  firstcol = (naa - naa/(npcols/2)*(npcols/2))
     >                                                 *(col_size+1)
     >               + ((proc_col/2)-(naa-naa/(npcols/2)*(npcols/2)))
     >                         *col_size + 1
                  lastcol  = firstcol - 1 + col_size
              endif
CC               write( *,* ) col_size,firstcol,lastcol
              if( mod( me,2 ) .eq. 0 )then
                  lastcol  = firstcol - 1 + (col_size-1)/2 + 1
              else
                  firstcol = firstcol + (col_size-1)/2 + 1
                  lastcol  = firstcol - 1 + col_size/2
CC                   write( *,* ) firstcol,lastcol
              endif
          endif
      endif


      if( npcols .eq. nprows )then
          send_start = 1
          send_len   = lastrow - firstrow + 1
      else
          if( mod( me,2 ) .eq. 0 )then
              send_start = 1
              send_len   = (1 + lastrow-firstrow+1)/2
          else
              send_start = (1 + lastrow-firstrow+1)/2 + 1
              send_len   = (lastrow-firstrow+1)/2
          endif
      endif
          

c---------------------------------------------------------------------
c  Transpose exchange processor
c---------------------------------------------------------------------

      if( npcols .eq. nprows )then
          exch_proc = mod( me,nprows )*nprows + me/nprows
      else
          exch_proc = 2*(mod( me/2,nprows )*nprows + me/2/nprows)
     >                 + mod( me,2 )
      endif


      i = npcols / 2
      l2npcols = 0
      do while( i .gt. 0 )
         l2npcols = l2npcols + 1
         i = i / 2
      enddo


c---------------------------------------------------------------------
c  Set up the reduce phase schedules...
c---------------------------------------------------------------------

      div_factor = npcols
      do i = 1, l2npcols

         j = mod( proc_col+div_factor/2, div_factor )
     >     + proc_col / div_factor * div_factor
         reduce_exch_proc(i) = proc_row*npcols + j

         div_factor = div_factor / 2

      enddo


      do i = l2npcols, 1, -1

            if( nprows .eq. npcols )then
               reduce_send_starts(i)  = send_start
               reduce_send_lengths(i) = send_len
               reduce_recv_lengths(i) = lastrow - firstrow + 1
            else
               reduce_recv_lengths(i) = send_len
               if( i .eq. l2npcols )then
                  reduce_send_lengths(i) = lastrow-firstrow+1 - send_len
                  if( me/2*2 .eq. me )then
                     reduce_send_starts(i) = send_start + send_len
                  else
                     reduce_send_starts(i) = 1
                  endif
               else
                  reduce_send_lengths(i) = send_len
                  reduce_send_starts(i)  = send_start
               endif
            endif
            reduce_recv_starts(i) = send_start

      enddo


      exch_recv_length = lastcol - firstcol + 1


      return
      end


c---------------------------------------------------------------------
c---------------------------------------------------------------------
      subroutine conj_grad ( colidx,
     >                       rowstr,
     >                       x,
     >                       z,
     >                       a,
     >                       p,
     >                       q,
     >                       r,
     >                       w,
     >                       rnorm, 
     >                       l2npcols,
     >                       reduce_exch_proc,
     >                       reduce_send_starts,
     >                       reduce_send_lengths,
     >                       reduce_recv_starts,
     >                       reduce_recv_lengths )
c---------------------------------------------------------------------
c---------------------------------------------------------------------

c---------------------------------------------------------------------
c  Floaging point arrays here are named as in NPB1 spec discussion of 
c  CG algorithm
c---------------------------------------------------------------------
 
      implicit none

      include 'mpinpb.h'

      integer status(MPI_STATUS_SIZE ), request


      common / partit_size  /  naa, nzz, 
     >                         npcols, nprows,
     >                         proc_col, proc_row,
     >                         firstrow, 
     >                         lastrow, 
     >                         firstcol, 
     >                         lastcol,
     >                         exch_proc,
     >                         exch_recv_length,
     >                         send_start,
     >                         send_len
      integer                  naa, nzz, 
     >                         npcols, nprows,
     >                         proc_col, proc_row,
     >                         firstrow, 
     >                         lastrow, 
     >                         firstcol, 
     >                         lastcol,
     >                         exch_proc,
     >                         exch_recv_length,
     >                         send_start,
     >                         send_len


      double precision   x(*),
     >                   z(*),
     >                   a(nzz)
      integer            colidx(nzz), rowstr(naa+1)

      double precision   p(*),
     >                   q(*),
     >                   r(*),               
     >                   w(*)                ! used as work temporary

      integer   l2npcols
      integer   reduce_exch_proc(l2npcols)
      integer   reduce_send_starts(l2npcols)
      integer   reduce_send_lengths(l2npcols)
      integer   reduce_recv_starts(l2npcols)
      integer   reduce_recv_lengths(l2npcols)

      integer   i, j, k, ierr
      integer   cgit, cgitmax

      double precision   d, sum, rho, rho0, alpha, beta, rnorm

      external         timer_read
      double precision timer_read

      data      cgitmax / 25 /


c---------------------------------------------------------------------
c  Initialize the CG algorithm:
c---------------------------------------------------------------------
      do j=1,naa/nprows+1
         q(j) = 0.0d0
         z(j) = 0.0d0
         r(j) = x(j)
         p(j) = r(j)
         w(j) = 0.0d0                 
      enddo


c---------------------------------------------------------------------
c  rho = r.r
c  Now, obtain the norm of r: First, sum squares of r elements locally...
c---------------------------------------------------------------------
      sum = 0.0d0
      do j=1, lastcol-firstcol+1
         sum = sum + r(j)*r(j)
      enddo

c---------------------------------------------------------------------
c  Exchange and sum with procs identified in reduce_exch_proc
c  (This is equivalent to mpi_allreduce.)
c  Sum the partial sums of rho, leaving rho on all processors
c---------------------------------------------------------------------
      do i = 1, l2npcols
         call mpi_irecv( rho,
     >                   1,
     >                   dp_type,
     >                   reduce_exch_proc(i),
     >                   i,
     >                   mpi_comm_world,
     >                   request,
     >                   ierr )
         call mpi_send(  sum,
     >                   1,
     >                   dp_type,
     >                   reduce_exch_proc(i),
     >                   i,
     >                   mpi_comm_world,
     >                   ierr )
         call mpi_wait( request, status, ierr )

         sum = sum + rho
      enddo
      rho = sum


c---------------------------------------------------------------------
c---->
c  The conj grad iteration loop
c---->
c---------------------------------------------------------------------
      do cgit = 1, cgitmax


c---------------------------------------------------------------------
c  q = A.p
c  The partition submatrix-vector multiply: use workspace w
c---------------------------------------------------------------------
         do j=1,lastrow-firstrow+1
            sum = 0.d0
            do k=rowstr(j),rowstr(j+1)-1
               sum = sum + a(k)*p(colidx(k))
            enddo
            w(j) = sum
         enddo

c---------------------------------------------------------------------
c  Sum the partition submatrix-vec A.p's across rows
c  Exchange and sum piece of w with procs identified in reduce_exch_proc
c---------------------------------------------------------------------
         do i = l2npcols, 1, -1
            call mpi_irecv( q(reduce_recv_starts(i)),
     >                      reduce_recv_lengths(i),
     >                      dp_type,
     >                      reduce_exch_proc(i),
     >                      i,
     >                      mpi_comm_world,
     >                      request,
     >                      ierr )
            call mpi_send(  w(reduce_send_starts(i)),
     >                      reduce_send_lengths(i),
     >                      dp_type,
     >                      reduce_exch_proc(i),
     >                      i,
     >                      mpi_comm_world,
     >                      ierr )
            call mpi_wait( request, status, ierr )
            do j=send_start,send_start + reduce_recv_lengths(i) - 1
               w(j) = w(j) + q(j)
            enddo
         enddo
      

c---------------------------------------------------------------------
c  Exchange piece of q with transpose processor:
c---------------------------------------------------------------------
         if( l2npcols .ne. 0 )then
            call mpi_irecv( q,               
     >                      exch_recv_length,
     >                      dp_type,
     >                      exch_proc,
     >                      1,
     >                      mpi_comm_world,
     >                      request,
     >                      ierr )

            call mpi_send(  w(send_start),   
     >                      send_len,
     >                      dp_type,
     >                      exch_proc,
     >                      1,
     >                      mpi_comm_world,
     >                      ierr )
            call mpi_wait( request, status, ierr )
         else
            do j=1,exch_recv_length
               q(j) = w(j)
            enddo
         endif


c---------------------------------------------------------------------
c  Clear w for reuse...
c---------------------------------------------------------------------
         do j=1, max( lastrow-firstrow+1, lastcol-firstcol+1 )
            w(j) = 0.0d0
         enddo
         

c---------------------------------------------------------------------
c  Obtain p.q
c---------------------------------------------------------------------
         sum = 0.0d0
         do j=1, lastcol-firstcol+1
            sum = sum + p(j)*q(j)
         enddo

c---------------------------------------------------------------------
c  Obtain d with a sum-reduce
c---------------------------------------------------------------------
         do i = 1, l2npcols
            call mpi_irecv( d,
     >                      1,
     >                      dp_type,
     >                      reduce_exch_proc(i),
     >                      i,
     >                      mpi_comm_world,
     >                      request,
     >                      ierr )
            call mpi_send(  sum,
     >                      1,
     >                      dp_type,
     >                      reduce_exch_proc(i),
     >                      i,
     >                      mpi_comm_world,
     >                      ierr )

            call mpi_wait( request, status, ierr )

            sum = sum + d
         enddo
         d = sum


c---------------------------------------------------------------------
c  Obtain alpha = rho / (p.q)
c---------------------------------------------------------------------
         alpha = rho / d

c---------------------------------------------------------------------
c  Save a temporary of rho
c---------------------------------------------------------------------
         rho0 = rho

c---------------------------------------------------------------------
c  Obtain z = z + alpha*p
c  and    r = r - alpha*q
c---------------------------------------------------------------------
         do j=1, lastcol-firstcol+1
            z(j) = z(j) + alpha*p(j)
            r(j) = r(j) - alpha*q(j)
         enddo
            
c---------------------------------------------------------------------
c  rho = r.r
c  Now, obtain the norm of r: First, sum squares of r elements locally...
c---------------------------------------------------------------------
         sum = 0.0d0
         do j=1, lastcol-firstcol+1
            sum = sum + r(j)*r(j)
         enddo

c---------------------------------------------------------------------
c  Obtain rho with a sum-reduce
c---------------------------------------------------------------------
         do i = 1, l2npcols
            call mpi_irecv( rho,
     >                      1,
     >                      dp_type,
     >                      reduce_exch_proc(i),
     >                      i,
     >                      mpi_comm_world,
     >                      request,
     >                      ierr )
            call mpi_send(  sum,
     >                      1,
     >                      dp_type,
     >                      reduce_exch_proc(i),
     >                      i,
     >                      mpi_comm_world,
     >                      ierr )
            call mpi_wait( request, status, ierr )

            sum = sum + rho
         enddo
         rho = sum

c---------------------------------------------------------------------
c  Obtain beta:
c---------------------------------------------------------------------
         beta = rho / rho0

c---------------------------------------------------------------------
c  p = r + beta*p
c---------------------------------------------------------------------
         do j=1, lastcol-firstcol+1
            p(j) = r(j) + beta*p(j)
         enddo


      enddo                             ! end of do cgit=1,cgitmax


c---------------------------------------------------------------------
c  Compute residual norm explicitly:  ||r|| = ||x - A.z||
c  First, form A.z
c  The partition submatrix-vector multiply
c---------------------------------------------------------------------
      do j=1,lastrow-firstrow+1
         sum = 0.d0
         do k=rowstr(j),rowstr(j+1)-1
            sum = sum + a(k)*z(colidx(k))
         enddo
         w(j) = sum
      enddo


c---------------------------------------------------------------------
c  Sum the partition submatrix-vec A.z's across rows
c---------------------------------------------------------------------
      do i = l2npcols, 1, -1
         call mpi_irecv( r(reduce_recv_starts(i)),
     >                   reduce_recv_lengths(i),
     >                   dp_type,
     >                   reduce_exch_proc(i),
     >                   i,
     >                   mpi_comm_world,
     >                   request,
     >                   ierr )
         call mpi_send(  w(reduce_send_starts(i)),
     >                   reduce_send_lengths(i),
     >                   dp_type,
     >                   reduce_exch_proc(i),
     >                   i,
     >                   mpi_comm_world,
     >                   ierr )
         call mpi_wait( request, status, ierr )

         do j=send_start,send_start + reduce_recv_lengths(i) - 1
            w(j) = w(j) + r(j)
         enddo
      enddo
      

c---------------------------------------------------------------------
c  Exchange piece of q with transpose processor:
c---------------------------------------------------------------------
      if( l2npcols .ne. 0 )then
         call mpi_irecv( r,               
     >                   exch_recv_length,
     >                   dp_type,
     >                   exch_proc,
     >                   1,
     >                   mpi_comm_world,
     >                   request,
     >                   ierr )
   
         call mpi_send(  w(send_start),   
     >                   send_len,
     >                   dp_type,
     >                   exch_proc,
     >                   1,
     >                   mpi_comm_world,
     >                   ierr )
         call mpi_wait( request, status, ierr )
      else
         do j=1,exch_recv_length
            r(j) = w(j)
         enddo
      endif


c---------------------------------------------------------------------
c  At this point, r contains A.z
c---------------------------------------------------------------------
         sum = 0.0d0
         do j=1, lastcol-firstcol+1
            d   = x(j) - r(j)         
            sum = sum + d*d
         enddo
         
c---------------------------------------------------------------------
c  Obtain d with a sum-reduce
c---------------------------------------------------------------------
      do i = 1, l2npcols
         call mpi_irecv( d,
     >                   1,
     >                   dp_type,
     >                   reduce_exch_proc(i),
     >                   i,
     >                   mpi_comm_world,
     >                   request,
     >                   ierr )
         call mpi_send(  sum,
     >                   1,
     >                   dp_type,
     >                   reduce_exch_proc(i),
     >                   i,
     >                   mpi_comm_world,
     >                   ierr )
         call mpi_wait( request, status, ierr )

         sum = sum + d
      enddo
      d = sum


      if( me .eq. root ) rnorm = sqrt( d )


      return
      end                               ! end of routine conj_grad


c---------------------------------------------------------------------
c---------------------------------------------------------------------
      subroutine makea( n, nz, a, colidx, rowstr, nonzer,
     >                  firstrow, lastrow, firstcol, lastcol,
     >                  rcond, arow, acol, aelt, v, iv, shift )
c---------------------------------------------------------------------
c---------------------------------------------------------------------

      integer             n, nz
      integer             firstrow, lastrow, firstcol, lastcol
      integer             colidx(nz), rowstr(n+1)
      integer             iv(2*n+1), arow(nz), acol(nz)
      double precision    v(n+1), aelt(nz)
      double precision    rcond, a(nz), shift

c---------------------------------------------------------------------
c       generate the test problem for benchmark 6
c       makea generates a sparse matrix with a
c       prescribed sparsity distribution
c
c       parameter    type        usage
c
c       input
c
c       n            i           number of cols/rows of matrix
c       nz           i           nonzeros as declared array size
c       rcond        r*8         condition number
c       shift        r*8         main diagonal shift
c
c       output
c
c       a            r*8         array for nonzeros
c       colidx       i           col indices
c       rowstr       i           row pointers
c
c       workspace
c
c       iv, arow, acol i
c       v, aelt        r*8
c---------------------------------------------------------------------

      integer i, nnza, iouter, ivelt, ivelt1, irow, nzv, NONZER

c---------------------------------------------------------------------
c      nonzer is approximately  (int(sqrt(nnza /n)));
c---------------------------------------------------------------------

      double precision  size, ratio, scale
      external          sparse, sprnvc, vecset

      size = 1.0D0
      ratio = rcond ** (1.0D0 / dfloat(n))
      nnza = 0

c---------------------------------------------------------------------
c  Initialize iv(n+1 .. 2n) to zero.
c  Used by sprnvc to mark nonzero positions
c---------------------------------------------------------------------

      do i = 1, n
           iv(n+i) = 0
      enddo
      do iouter = 1, n
         nzv = nonzer
         call sprnvc( n, nzv, v, colidx, iv(1), iv(n+1) )
         call vecset( n, v, colidx, nzv, iouter, .5D0 )
         do ivelt = 1, nzv
              jcol = colidx(ivelt)
              if (jcol.ge.firstcol .and. jcol.le.lastcol) then
                 scale = size * v(ivelt)
                 do ivelt1 = 1, nzv
                    irow = colidx(ivelt1)
                    if (irow.ge.firstrow .and. irow.le.lastrow) then
                       nnza = nnza + 1
                       if (nnza .gt. nz) goto 9999
                       acol(nnza) = jcol
                       arow(nnza) = irow
                       aelt(nnza) = v(ivelt1) * scale
                    endif
                 enddo
              endif
         enddo
         size = size * ratio
      enddo


c---------------------------------------------------------------------
c       ... add the identity * rcond to the generated matrix to bound
c           the smallest eigenvalue from below by rcond
c---------------------------------------------------------------------
        do i = firstrow, lastrow
           if (i.ge.firstcol .and. i.le.lastcol) then
              iouter = n + i
              nnza = nnza + 1
              if (nnza .gt. nz) goto 9999
              acol(nnza) = i
              arow(nnza) = i
              aelt(nnza) = rcond - shift
           endif
        enddo


c---------------------------------------------------------------------
c       ... make the sparse matrix from list of elements with duplicates
c           (v and iv are used as  workspace)
c---------------------------------------------------------------------
      call sparse( a, colidx, rowstr, n, arow, acol, aelt,
     >             firstrow, lastrow,
     >             v, iv(1), iv(n+1), nnza )
      return

 9999 continue
      write(*,*) 'Space for matrix elements exceeded in makea'
      write(*,*) 'nnza, nzmax = ',nnza, nz
      write(*,*) ' iouter = ',iouter

      stop
      end
c-------end   of makea------------------------------

c---------------------------------------------------------------------
c---------------------------------------------------------------------
      subroutine sparse( a, colidx, rowstr, n, arow, acol, aelt,
     >                   firstrow, lastrow,
     >                   x, mark, nzloc, nnza )
c---------------------------------------------------------------------
c---------------------------------------------------------------------

      implicit           logical (a-z)
      integer            colidx(*), rowstr(*)
      integer            firstrow, lastrow
      integer            n, arow(*), acol(*), nnza
      double precision   a(*), aelt(*)

c---------------------------------------------------------------------
c       rows range from firstrow to lastrow
c       the rowstr pointers are defined for nrows = lastrow-firstrow+1 values
c---------------------------------------------------------------------
      integer            nzloc(n), nrows
      double precision   x(n)
      logical            mark(n)

c---------------------------------------------------
c       generate a sparse matrix from a list of
c       [col, row, element] tri
c---------------------------------------------------

      integer            i, j, jajp1, nza, k, nzrow
      double precision   xi

c---------------------------------------------------------------------
c    how many rows of result
c---------------------------------------------------------------------
      nrows = lastrow - firstrow + 1

c---------------------------------------------------------------------
c     ...count the number of triples in each row
c---------------------------------------------------------------------
      do j = 1, n
         rowstr(j) = 0
         mark(j) = .false.
      enddo
      rowstr(n+1) = 0

      do nza = 1, nnza
         j = (arow(nza) - firstrow + 1) + 1
         rowstr(j) = rowstr(j) + 1
      enddo

      rowstr(1) = 1
      do j = 2, nrows+1
         rowstr(j) = rowstr(j) + rowstr(j-1)
      enddo


c---------------------------------------------------------------------
c     ... rowstr(j) now is the location of the first nonzero
c           of row j of a
c---------------------------------------------------------------------


c---------------------------------------------------------------------
c     ... do a bucket sort of the triples on the row index
c---------------------------------------------------------------------
      do nza = 1, nnza
         j = arow(nza) - firstrow + 1
         k = rowstr(j)
         a(k) = aelt(nza)
         colidx(k) = acol(nza)
         rowstr(j) = rowstr(j) + 1
      enddo


c---------------------------------------------------------------------
c       ... rowstr(j) now points to the first element of row j+1
c---------------------------------------------------------------------
      do j = nrows, 1, -1
          rowstr(j+1) = rowstr(j)
      enddo
      rowstr(1) = 1


c---------------------------------------------------------------------
c       ... generate the actual output rows by adding elements
c---------------------------------------------------------------------
      nza = 0
      do i = 1, n
          x(i)    = 0.0
          mark(i) = .false.
      enddo

      jajp1 = rowstr(1)
      do j = 1, nrows
         nzrow = 0

c---------------------------------------------------------------------
c          ...loop over the jth row of a
c---------------------------------------------------------------------
         do k = jajp1 , rowstr(j+1)-1
            i = colidx(k)
            x(i) = x(i) + a(k)
            if ( (.not. mark(i)) .and. (x(i) .ne. 0.D0)) then
             mark(i) = .true.
             nzrow = nzrow + 1
             nzloc(nzrow) = i
            endif
         enddo

c---------------------------------------------------------------------
c          ... extract the nonzeros of this row
c---------------------------------------------------------------------
         do k = 1, nzrow
            i = nzloc(k)
            mark(i) = .false.
            xi = x(i)
            x(i) = 0.D0
            if (xi .ne. 0.D0) then
             nza = nza + 1
             a(nza) = xi
             colidx(nza) = i
            endif
         enddo
         jajp1 = rowstr(j+1)
         rowstr(j+1) = nza + rowstr(1)
      enddo
CC       write (*, 11000) nza
      return
11000   format ( //,'final nonzero count in sparse ',
     1            /,'number of nonzeros       = ', i16 )
      end
c-------end   of sparse-----------------------------


c---------------------------------------------------------------------
c---------------------------------------------------------------------
      subroutine sprnvc( n, nz, v, iv, nzloc, mark )
c---------------------------------------------------------------------
c---------------------------------------------------------------------

      implicit           logical (a-z)
      double precision   v(*)
      integer            n, nz, iv(*), nzloc(n), nn1
      integer mark(n)
      common /urando/    amult, tran
      double precision   amult, tran


c---------------------------------------------------------------------
c       generate a sparse n-vector (v, iv)
c       having nzv nonzeros
c
c       mark(i) is set to 1 if position i is nonzero.
c       mark is all zero on entry and is reset to all zero before exit
c       this corrects a performance bug found by John G. Lewis, caused by
c       reinitialization of mark on every one of the n calls to sprnvc
c---------------------------------------------------------------------

        integer            nzrow, nzv, ii, i, icnvrt

        external           randlc, icnvrt
        double precision   randlc, vecelt, vecloc


        nzv = 0
        nzrow = 0
        nn1 = 1
 50     continue
          nn1 = 2 * nn1
          if (nn1 .lt. n) goto 50

c---------------------------------------------------------------------
c    nn1 is the smallest power of two not less than n
c---------------------------------------------------------------------

100     continue
        if (nzv .ge. nz) goto 110
         vecelt = randlc( tran, amult )

c---------------------------------------------------------------------
c   generate an integer between 1 and n in a portable manner
c---------------------------------------------------------------------
         vecloc = randlc(tran, amult)
         i = icnvrt(vecloc, nn1) + 1
         if (i .gt. n) goto 100

c---------------------------------------------------------------------
c  was this integer generated already?
c---------------------------------------------------------------------
         if (mark(i) .eq. 0) then
            mark(i) = 1
            nzrow = nzrow + 1
            nzloc(nzrow) = i
            nzv = nzv + 1
            v(nzv) = vecelt
            iv(nzv) = i
         endif
         goto 100
110      continue
      do ii = 1, nzrow
         i = nzloc(ii)
         mark(i) = 0
      enddo
      return
      end
c-------end   of sprnvc-----------------------------


c---------------------------------------------------------------------
c---------------------------------------------------------------------
      function icnvrt(x, ipwr2)
c---------------------------------------------------------------------
c---------------------------------------------------------------------

      implicit           logical (a-z)
      double precision   x
      integer            ipwr2, icnvrt

c---------------------------------------------------------------------
c    scale a double precision number x in (0,1) by a power of 2 and chop it
c---------------------------------------------------------------------
      icnvrt = int(ipwr2 * x)

      return
      end
c-------end   of icnvrt-----------------------------


c---------------------------------------------------------------------
c---------------------------------------------------------------------
      subroutine vecset(n, v, iv, nzv, i, val)
c---------------------------------------------------------------------
c---------------------------------------------------------------------

      implicit           logical (a-z)
      integer            n, iv(*), nzv, i, k
      double precision   v(*), val

c---------------------------------------------------------------------
c       set ith element of sparse vector (v, iv) with
c       nzv nonzeros to val
c---------------------------------------------------------------------

      logical set

      set = .false.
      do k = 1, nzv
         if (iv(k) .eq. i) then
            v(k) = val
            set  = .true.
         endif
      enddo
      if (.not. set) then
         nzv     = nzv + 1
         v(nzv)  = val
         iv(nzv) = i
      endif
      return
      end
c-------end   of vecset-----------------------------