Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Added our tweaked version of NAS benchmarks.
[simgrid.git] / examples / smpi / NAS / BT / z_solve_vec.f
1 c---------------------------------------------------------------------
2 c---------------------------------------------------------------------
3
4       subroutine z_solve
5
6 c---------------------------------------------------------------------
7 c---------------------------------------------------------------------
8
9 c---------------------------------------------------------------------
10 c     Performs line solves in Z direction by first factoring
11 c     the block-tridiagonal matrix into an upper triangular matrix, 
12 c     and then performing back substitution to solve for the unknow
13 c     vectors of each line.  
14 c     
15 c     Make sure we treat elements zero to cell_size in the direction
16 c     of the sweep.
17 c---------------------------------------------------------------------
18
19       include 'header.h'
20       include 'mpinpb.h'
21
22       integer c, kstart, stage,
23      >     first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
24      >     isize,jsize,ksize,send_id
25
26       kstart = 0
27
28 c---------------------------------------------------------------------
29 c     in our terminology stage is the number of the cell in the y-direct
30 c     i.e. stage = 1 means the start of the line stage=ncells means end
31 c---------------------------------------------------------------------
32       do stage = 1,ncells
33          c = slice(3,stage)
34          isize = cell_size(1,c) - 1
35          jsize = cell_size(2,c) - 1
36          ksize = cell_size(3,c) - 1
37 c---------------------------------------------------------------------
38 c     set last-cell flag
39 c---------------------------------------------------------------------
40          if (stage .eq. ncells) then
41             last = 1
42          else
43             last = 0
44          endif
45
46          if (stage .eq. 1) then
47 c---------------------------------------------------------------------
48 c     This is the first cell, so solve without receiving data
49 c---------------------------------------------------------------------
50             first = 1
51 c            call lhsz(c)
52             call z_solve_cell(first,last,c)
53          else
54 c---------------------------------------------------------------------
55 c     Not the first cell of this line, so receive info from
56 c     processor working on preceeding cell
57 c---------------------------------------------------------------------
58             first = 0
59             call z_receive_solve_info(recv_id,c)
60 c---------------------------------------------------------------------
61 c     overlap computations and communications
62 c---------------------------------------------------------------------
63 c            call lhsz(c)
64 c---------------------------------------------------------------------
65 c     wait for completion
66 c---------------------------------------------------------------------
67             call mpi_wait(send_id,r_status,error)
68             call mpi_wait(recv_id,r_status,error)
69 c---------------------------------------------------------------------
70 c     install C'(kstart+1) and rhs'(kstart+1) to be used in this cell
71 c---------------------------------------------------------------------
72             call z_unpack_solve_info(c)
73             call z_solve_cell(first,last,c)
74          endif
75
76          if (last .eq. 0) call z_send_solve_info(send_id,c)
77       enddo
78
79 c---------------------------------------------------------------------
80 c     now perform backsubstitution in reverse direction
81 c---------------------------------------------------------------------
82       do stage = ncells, 1, -1
83          c = slice(3,stage)
84          first = 0
85          last = 0
86          if (stage .eq. 1) first = 1
87          if (stage .eq. ncells) then
88             last = 1
89 c---------------------------------------------------------------------
90 c     last cell, so perform back substitute without waiting
91 c---------------------------------------------------------------------
92             call z_backsubstitute(first, last,c)
93          else
94             call z_receive_backsub_info(recv_id,c)
95             call mpi_wait(send_id,r_status,error)
96             call mpi_wait(recv_id,r_status,error)
97             call z_unpack_backsub_info(c)
98             call z_backsubstitute(first,last,c)
99          endif
100          if (first .eq. 0) call z_send_backsub_info(send_id,c)
101       enddo
102
103
104       return
105       end
106       
107 c---------------------------------------------------------------------
108 c---------------------------------------------------------------------
109       
110       subroutine z_unpack_solve_info(c)
111 c---------------------------------------------------------------------
112 c---------------------------------------------------------------------
113
114 c---------------------------------------------------------------------
115 c     unpack C'(-1) and rhs'(-1) for
116 c     all i and j
117 c---------------------------------------------------------------------
118
119       include 'header.h'
120
121       integer i,j,m,n,ptr,c,kstart 
122
123       kstart = 0
124       ptr = 0
125       do j=0,JMAX-1
126          do i=0,IMAX-1
127             do m=1,BLOCK_SIZE
128                do n=1,BLOCK_SIZE
129                   lhsc(m,n,i,j,kstart-1,c) = out_buffer(ptr+n)
130                enddo
131                ptr = ptr+BLOCK_SIZE
132             enddo
133             do n=1,BLOCK_SIZE
134                rhs(n,i,j,kstart-1,c) = out_buffer(ptr+n)
135             enddo
136             ptr = ptr+BLOCK_SIZE
137          enddo
138       enddo
139
140       return
141       end
142
143 c---------------------------------------------------------------------
144 c---------------------------------------------------------------------
145       
146       subroutine z_send_solve_info(send_id,c)
147
148 c---------------------------------------------------------------------
149 c---------------------------------------------------------------------
150
151 c---------------------------------------------------------------------
152 c     pack up and send C'(kend) and rhs'(kend) for
153 c     all i and j
154 c---------------------------------------------------------------------
155
156       include 'header.h'
157       include 'mpinpb.h'
158
159       integer i,j,m,n,ksize,ptr,c,ip,jp
160       integer error,send_id,buffer_size
161
162       ksize = cell_size(3,c)-1
163       ip = cell_coord(1,c) - 1
164       jp = cell_coord(2,c) - 1
165       buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
166      >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
167
168 c---------------------------------------------------------------------
169 c     pack up buffer
170 c---------------------------------------------------------------------
171       ptr = 0
172       do j=0,JMAX-1
173          do i=0,IMAX-1
174             do m=1,BLOCK_SIZE
175                do n=1,BLOCK_SIZE
176                   in_buffer(ptr+n) = lhsc(m,n,i,j,ksize,c)
177                enddo
178                ptr = ptr+BLOCK_SIZE
179             enddo
180             do n=1,BLOCK_SIZE
181                in_buffer(ptr+n) = rhs(n,i,j,ksize,c)
182             enddo
183             ptr = ptr+BLOCK_SIZE
184          enddo
185       enddo
186
187 c---------------------------------------------------------------------
188 c     send buffer 
189 c---------------------------------------------------------------------
190       call mpi_isend(in_buffer, buffer_size,
191      >     dp_type, successor(3),
192      >     BOTTOM+ip+jp*NCELLS, comm_solve,
193      >     send_id,error)
194
195       return
196       end
197
198 c---------------------------------------------------------------------
199 c---------------------------------------------------------------------
200
201       subroutine z_send_backsub_info(send_id,c)
202
203 c---------------------------------------------------------------------
204 c---------------------------------------------------------------------
205
206 c---------------------------------------------------------------------
207 c     pack up and send U(jstart) for all i and j
208 c---------------------------------------------------------------------
209
210       include 'header.h'
211       include 'mpinpb.h'
212
213       integer i,j,n,ptr,c,kstart,ip,jp
214       integer error,send_id,buffer_size
215
216 c---------------------------------------------------------------------
217 c     Send element 0 to previous processor
218 c---------------------------------------------------------------------
219       kstart = 0
220       ip = cell_coord(1,c)-1
221       jp = cell_coord(2,c)-1
222       buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
223       ptr = 0
224       do j=0,JMAX-1
225          do i=0,IMAX-1
226             do n=1,BLOCK_SIZE
227                in_buffer(ptr+n) = rhs(n,i,j,kstart,c)
228             enddo
229             ptr = ptr+BLOCK_SIZE
230          enddo
231       enddo
232
233       call mpi_isend(in_buffer, buffer_size,
234      >     dp_type, predecessor(3), 
235      >     TOP+ip+jp*NCELLS, comm_solve, 
236      >     send_id,error)
237
238       return
239       end
240
241 c---------------------------------------------------------------------
242 c---------------------------------------------------------------------
243
244       subroutine z_unpack_backsub_info(c)
245
246 c---------------------------------------------------------------------
247 c---------------------------------------------------------------------
248
249 c---------------------------------------------------------------------
250 c     unpack U(ksize) for all i and j
251 c---------------------------------------------------------------------
252
253       include 'header.h'
254
255       integer i,j,n,ptr,c
256
257       ptr = 0
258       do j=0,JMAX-1
259          do i=0,IMAX-1
260             do n=1,BLOCK_SIZE
261                backsub_info(n,i,j,c) = out_buffer(ptr+n)
262             enddo
263             ptr = ptr+BLOCK_SIZE
264          enddo
265       enddo
266
267       return
268       end
269
270
271 c---------------------------------------------------------------------
272 c---------------------------------------------------------------------
273
274       subroutine z_receive_backsub_info(recv_id,c)
275
276 c---------------------------------------------------------------------
277 c---------------------------------------------------------------------
278
279 c---------------------------------------------------------------------
280 c     post mpi receives
281 c---------------------------------------------------------------------
282
283       include 'header.h'
284       include 'mpinpb.h'
285
286       integer error,recv_id,ip,jp,c,buffer_size
287       ip = cell_coord(1,c) - 1
288       jp = cell_coord(2,c) - 1
289       buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
290       call mpi_irecv(out_buffer, buffer_size,
291      >     dp_type, successor(3), 
292      >     TOP+ip+jp*NCELLS, comm_solve, 
293      >     recv_id, error)
294
295       return
296       end
297
298 c---------------------------------------------------------------------
299 c---------------------------------------------------------------------
300
301       subroutine z_receive_solve_info(recv_id,c)
302
303 c---------------------------------------------------------------------
304 c---------------------------------------------------------------------
305
306 c---------------------------------------------------------------------
307 c     post mpi receives 
308 c---------------------------------------------------------------------
309
310       include 'header.h'
311       include 'mpinpb.h'
312
313       integer ip,jp,recv_id,error,c,buffer_size
314       ip = cell_coord(1,c) - 1
315       jp = cell_coord(2,c) - 1
316       buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
317      >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
318       call mpi_irecv(out_buffer, buffer_size,
319      >     dp_type, predecessor(3), 
320      >     BOTTOM+ip+jp*NCELLS, comm_solve,
321      >     recv_id, error)
322
323       return
324       end
325       
326 c---------------------------------------------------------------------
327 c---------------------------------------------------------------------
328
329       subroutine z_backsubstitute(first, last, c)
330
331 c---------------------------------------------------------------------
332 c---------------------------------------------------------------------
333
334 c---------------------------------------------------------------------
335 c     back solve: if last cell, then generate U(ksize)=rhs(ksize)
336 c     else assume U(ksize) is loaded in un pack backsub_info
337 c     so just use it
338 c     after call u(kstart) will be sent to next cell
339 c---------------------------------------------------------------------
340
341       include 'header.h'
342
343       integer first, last, c, i, k
344       integer m,n,j,jsize,isize,ksize,kstart
345       
346       kstart = 0
347       isize = cell_size(1,c)-end(1,c)-1      
348       jsize = cell_size(2,c)-end(2,c)-1
349       ksize = cell_size(3,c)-1
350       if (last .eq. 0) then
351          do j=start(2,c),jsize
352             do i=start(1,c),isize
353 c---------------------------------------------------------------------
354 c     U(jsize) uses info from previous cell if not last cell
355 c---------------------------------------------------------------------
356                do m=1,BLOCK_SIZE
357                   do n=1,BLOCK_SIZE
358                      rhs(m,i,j,ksize,c) = rhs(m,i,j,ksize,c) 
359      >                    - lhsc(m,n,i,j,ksize,c)*
360      >                    backsub_info(n,i,j,c)
361                   enddo
362                enddo
363             enddo
364          enddo
365       endif
366       do k=ksize-1,kstart,-1
367          do j=start(2,c),jsize
368             do i=start(1,c),isize
369                do m=1,BLOCK_SIZE
370                   do n=1,BLOCK_SIZE
371                      rhs(m,i,j,k,c) = rhs(m,i,j,k,c) 
372      >                    - lhsc(m,n,i,j,k,c)*rhs(n,i,j,k+1,c)
373                   enddo
374                enddo
375             enddo
376          enddo
377       enddo
378
379       return
380       end
381
382 c---------------------------------------------------------------------
383 c---------------------------------------------------------------------
384
385       subroutine z_solve_cell(first,last,c)
386
387 c---------------------------------------------------------------------
388 c---------------------------------------------------------------------
389
390 c---------------------------------------------------------------------
391 c     performs guaussian elimination on this cell.
392 c     
393 c     assumes that unpacking routines for non-first cells 
394 c     preload C' and rhs' from previous cell.
395 c     
396 c     assumed send happens outside this routine, but that
397 c     c'(KMAX) and rhs'(KMAX) will be sent to next cell.
398 c---------------------------------------------------------------------
399
400       include 'header.h'
401       include 'work_lhs_vec.h'
402
403       integer first,last,c
404       integer i,j,k,m,n,isize,ksize,jsize,kstart
405
406       kstart = 0
407       isize = cell_size(1,c)-end(1,c)-1
408       jsize = cell_size(2,c)-end(2,c)-1
409       ksize = cell_size(3,c)-1
410
411 c---------------------------------------------------------------------
412 c     zero the left hand side for starters
413 c     set diagonal values to 1. This is overkill, but convenient
414 c---------------------------------------------------------------------
415       do i = 0, isize
416          do m = 1, 5
417             do n = 1, 5
418                lhsa(m,n,i,0) = 0.0d0
419                lhsb(m,n,i,0) = 0.0d0
420                lhsa(m,n,i,ksize) = 0.0d0
421                lhsb(m,n,i,ksize) = 0.0d0
422             enddo
423             lhsb(m,m,i,0) = 1.0d0
424             lhsb(m,m,i,ksize) = 1.0d0
425          enddo
426       enddo
427
428       do j=start(2,c),jsize 
429
430 c---------------------------------------------------------------------
431 c     This function computes the left hand side for the three z-factors 
432 c---------------------------------------------------------------------
433
434 c---------------------------------------------------------------------
435 c     Compute the indices for storing the block-diagonal matrix;
436 c     determine c (labeled f) and s jacobians for cell c
437 c---------------------------------------------------------------------
438
439          do k = start(3,c)-1, cell_size(3,c)-end(3,c)
440             do i=start(1,c),isize
441
442                tmp1 = 1.0d0 / u(1,i,j,k,c)
443                tmp2 = tmp1 * tmp1
444                tmp3 = tmp1 * tmp2
445
446                fjac(1,1,i,k) = 0.0d+00
447                fjac(1,2,i,k) = 0.0d+00
448                fjac(1,3,i,k) = 0.0d+00
449                fjac(1,4,i,k) = 1.0d+00
450                fjac(1,5,i,k) = 0.0d+00
451
452                fjac(2,1,i,k) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) 
453      >              * tmp2 
454                fjac(2,2,i,k) = u(4,i,j,k,c) * tmp1
455                fjac(2,3,i,k) = 0.0d+00
456                fjac(2,4,i,k) = u(2,i,j,k,c) * tmp1
457                fjac(2,5,i,k) = 0.0d+00
458
459                fjac(3,1,i,k) = - ( u(3,i,j,k,c)*u(4,i,j,k,c) )
460      >              * tmp2 
461                fjac(3,2,i,k) = 0.0d+00
462                fjac(3,3,i,k) = u(4,i,j,k,c) * tmp1
463                fjac(3,4,i,k) = u(3,i,j,k,c) * tmp1
464                fjac(3,5,i,k) = 0.0d+00
465
466                fjac(4,1,i,k) = - (u(4,i,j,k,c)*u(4,i,j,k,c) * tmp2 ) 
467      >              + c2 * qs(i,j,k,c)
468                fjac(4,2,i,k) = - c2 *  u(2,i,j,k,c) * tmp1 
469                fjac(4,3,i,k) = - c2 *  u(3,i,j,k,c) * tmp1
470                fjac(4,4,i,k) = ( 2.0d+00 - c2 )
471      >              *  u(4,i,j,k,c) * tmp1 
472                fjac(4,5,i,k) = c2
473
474                fjac(5,1,i,k) = ( c2 * 2.0d0 * qs(i,j,k,c)
475      >              - c1 * ( u(5,i,j,k,c) * tmp1 ) )
476      >              * ( u(4,i,j,k,c) * tmp1 )
477                fjac(5,2,i,k) = - c2 * ( u(2,i,j,k,c)*u(4,i,j,k,c) )
478      >              * tmp2 
479                fjac(5,3,i,k) = - c2 * ( u(3,i,j,k,c)*u(4,i,j,k,c) )
480      >              * tmp2
481                fjac(5,4,i,k) = c1 * ( u(5,i,j,k,c) * tmp1 )
482      >              - c2 * ( qs(i,j,k,c)
483      >              + u(4,i,j,k,c)*u(4,i,j,k,c) * tmp2 )
484                fjac(5,5,i,k) = c1 * u(4,i,j,k,c) * tmp1
485
486                njac(1,1,i,k) = 0.0d+00
487                njac(1,2,i,k) = 0.0d+00
488                njac(1,3,i,k) = 0.0d+00
489                njac(1,4,i,k) = 0.0d+00
490                njac(1,5,i,k) = 0.0d+00
491
492                njac(2,1,i,k) = - c3c4 * tmp2 * u(2,i,j,k,c)
493                njac(2,2,i,k) =   c3c4 * tmp1
494                njac(2,3,i,k) =   0.0d+00
495                njac(2,4,i,k) =   0.0d+00
496                njac(2,5,i,k) =   0.0d+00
497
498                njac(3,1,i,k) = - c3c4 * tmp2 * u(3,i,j,k,c)
499                njac(3,2,i,k) =   0.0d+00
500                njac(3,3,i,k) =   c3c4 * tmp1
501                njac(3,4,i,k) =   0.0d+00
502                njac(3,5,i,k) =   0.0d+00
503
504                njac(4,1,i,k) = - con43 * c3c4 * tmp2 * u(4,i,j,k,c)
505                njac(4,2,i,k) =   0.0d+00
506                njac(4,3,i,k) =   0.0d+00
507                njac(4,4,i,k) =   con43 * c3 * c4 * tmp1
508                njac(4,5,i,k) =   0.0d+00
509
510                njac(5,1,i,k) = - (  c3c4
511      >              - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
512      >              - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
513      >              - ( con43 * c3c4
514      >              - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
515      >              - c1345 * tmp2 * u(5,i,j,k,c)
516
517                njac(5,2,i,k) = (  c3c4 - c1345 ) * tmp2 * u(2,i,j,k,c)
518                njac(5,3,i,k) = (  c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c)
519                njac(5,4,i,k) = ( con43 * c3c4
520      >              - c1345 ) * tmp2 * u(4,i,j,k,c)
521                njac(5,5,i,k) = ( c1345 )* tmp1
522
523
524             enddo
525          enddo
526
527 c---------------------------------------------------------------------
528 c     now joacobians set, so form left hand side in z direction
529 c---------------------------------------------------------------------
530          do k = start(3,c), ksize-end(3,c)
531             do i=start(1,c),isize
532
533                tmp1 = dt * tz1
534                tmp2 = dt * tz2
535
536                lhsa(1,1,i,k) = - tmp2 * fjac(1,1,i,k-1)
537      >              - tmp1 * njac(1,1,i,k-1)
538      >              - tmp1 * dz1 
539                lhsa(1,2,i,k) = - tmp2 * fjac(1,2,i,k-1)
540      >              - tmp1 * njac(1,2,i,k-1)
541                lhsa(1,3,i,k) = - tmp2 * fjac(1,3,i,k-1)
542      >              - tmp1 * njac(1,3,i,k-1)
543                lhsa(1,4,i,k) = - tmp2 * fjac(1,4,i,k-1)
544      >              - tmp1 * njac(1,4,i,k-1)
545                lhsa(1,5,i,k) = - tmp2 * fjac(1,5,i,k-1)
546      >              - tmp1 * njac(1,5,i,k-1)
547
548                lhsa(2,1,i,k) = - tmp2 * fjac(2,1,i,k-1)
549      >              - tmp1 * njac(2,1,i,k-1)
550                lhsa(2,2,i,k) = - tmp2 * fjac(2,2,i,k-1)
551      >              - tmp1 * njac(2,2,i,k-1)
552      >              - tmp1 * dz2
553                lhsa(2,3,i,k) = - tmp2 * fjac(2,3,i,k-1)
554      >              - tmp1 * njac(2,3,i,k-1)
555                lhsa(2,4,i,k) = - tmp2 * fjac(2,4,i,k-1)
556      >              - tmp1 * njac(2,4,i,k-1)
557                lhsa(2,5,i,k) = - tmp2 * fjac(2,5,i,k-1)
558      >              - tmp1 * njac(2,5,i,k-1)
559
560                lhsa(3,1,i,k) = - tmp2 * fjac(3,1,i,k-1)
561      >              - tmp1 * njac(3,1,i,k-1)
562                lhsa(3,2,i,k) = - tmp2 * fjac(3,2,i,k-1)
563      >              - tmp1 * njac(3,2,i,k-1)
564                lhsa(3,3,i,k) = - tmp2 * fjac(3,3,i,k-1)
565      >              - tmp1 * njac(3,3,i,k-1)
566      >              - tmp1 * dz3 
567                lhsa(3,4,i,k) = - tmp2 * fjac(3,4,i,k-1)
568      >              - tmp1 * njac(3,4,i,k-1)
569                lhsa(3,5,i,k) = - tmp2 * fjac(3,5,i,k-1)
570      >              - tmp1 * njac(3,5,i,k-1)
571
572                lhsa(4,1,i,k) = - tmp2 * fjac(4,1,i,k-1)
573      >              - tmp1 * njac(4,1,i,k-1)
574                lhsa(4,2,i,k) = - tmp2 * fjac(4,2,i,k-1)
575      >              - tmp1 * njac(4,2,i,k-1)
576                lhsa(4,3,i,k) = - tmp2 * fjac(4,3,i,k-1)
577      >              - tmp1 * njac(4,3,i,k-1)
578                lhsa(4,4,i,k) = - tmp2 * fjac(4,4,i,k-1)
579      >              - tmp1 * njac(4,4,i,k-1)
580      >              - tmp1 * dz4
581                lhsa(4,5,i,k) = - tmp2 * fjac(4,5,i,k-1)
582      >              - tmp1 * njac(4,5,i,k-1)
583
584                lhsa(5,1,i,k) = - tmp2 * fjac(5,1,i,k-1)
585      >              - tmp1 * njac(5,1,i,k-1)
586                lhsa(5,2,i,k) = - tmp2 * fjac(5,2,i,k-1)
587      >              - tmp1 * njac(5,2,i,k-1)
588                lhsa(5,3,i,k) = - tmp2 * fjac(5,3,i,k-1)
589      >              - tmp1 * njac(5,3,i,k-1)
590                lhsa(5,4,i,k) = - tmp2 * fjac(5,4,i,k-1)
591      >              - tmp1 * njac(5,4,i,k-1)
592                lhsa(5,5,i,k) = - tmp2 * fjac(5,5,i,k-1)
593      >              - tmp1 * njac(5,5,i,k-1)
594      >              - tmp1 * dz5
595
596                lhsb(1,1,i,k) = 1.0d+00
597      >              + tmp1 * 2.0d+00 * njac(1,1,i,k)
598      >              + tmp1 * 2.0d+00 * dz1
599                lhsb(1,2,i,k) = tmp1 * 2.0d+00 * njac(1,2,i,k)
600                lhsb(1,3,i,k) = tmp1 * 2.0d+00 * njac(1,3,i,k)
601                lhsb(1,4,i,k) = tmp1 * 2.0d+00 * njac(1,4,i,k)
602                lhsb(1,5,i,k) = tmp1 * 2.0d+00 * njac(1,5,i,k)
603
604                lhsb(2,1,i,k) = tmp1 * 2.0d+00 * njac(2,1,i,k)
605                lhsb(2,2,i,k) = 1.0d+00
606      >              + tmp1 * 2.0d+00 * njac(2,2,i,k)
607      >              + tmp1 * 2.0d+00 * dz2
608                lhsb(2,3,i,k) = tmp1 * 2.0d+00 * njac(2,3,i,k)
609                lhsb(2,4,i,k) = tmp1 * 2.0d+00 * njac(2,4,i,k)
610                lhsb(2,5,i,k) = tmp1 * 2.0d+00 * njac(2,5,i,k)
611
612                lhsb(3,1,i,k) = tmp1 * 2.0d+00 * njac(3,1,i,k)
613                lhsb(3,2,i,k) = tmp1 * 2.0d+00 * njac(3,2,i,k)
614                lhsb(3,3,i,k) = 1.0d+00
615      >              + tmp1 * 2.0d+00 * njac(3,3,i,k)
616      >              + tmp1 * 2.0d+00 * dz3
617                lhsb(3,4,i,k) = tmp1 * 2.0d+00 * njac(3,4,i,k)
618                lhsb(3,5,i,k) = tmp1 * 2.0d+00 * njac(3,5,i,k)
619
620                lhsb(4,1,i,k) = tmp1 * 2.0d+00 * njac(4,1,i,k)
621                lhsb(4,2,i,k) = tmp1 * 2.0d+00 * njac(4,2,i,k)
622                lhsb(4,3,i,k) = tmp1 * 2.0d+00 * njac(4,3,i,k)
623                lhsb(4,4,i,k) = 1.0d+00
624      >              + tmp1 * 2.0d+00 * njac(4,4,i,k)
625      >              + tmp1 * 2.0d+00 * dz4
626                lhsb(4,5,i,k) = tmp1 * 2.0d+00 * njac(4,5,i,k)
627
628                lhsb(5,1,i,k) = tmp1 * 2.0d+00 * njac(5,1,i,k)
629                lhsb(5,2,i,k) = tmp1 * 2.0d+00 * njac(5,2,i,k)
630                lhsb(5,3,i,k) = tmp1 * 2.0d+00 * njac(5,3,i,k)
631                lhsb(5,4,i,k) = tmp1 * 2.0d+00 * njac(5,4,i,k)
632                lhsb(5,5,i,k) = 1.0d+00
633      >              + tmp1 * 2.0d+00 * njac(5,5,i,k) 
634      >              + tmp1 * 2.0d+00 * dz5
635
636                lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,i,k+1)
637      >              - tmp1 * njac(1,1,i,k+1)
638      >              - tmp1 * dz1
639                lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,i,k+1)
640      >              - tmp1 * njac(1,2,i,k+1)
641                lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,i,k+1)
642      >              - tmp1 * njac(1,3,i,k+1)
643                lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,i,k+1)
644      >              - tmp1 * njac(1,4,i,k+1)
645                lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,i,k+1)
646      >              - tmp1 * njac(1,5,i,k+1)
647
648                lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,i,k+1)
649      >              - tmp1 * njac(2,1,i,k+1)
650                lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,i,k+1)
651      >              - tmp1 * njac(2,2,i,k+1)
652      >              - tmp1 * dz2
653                lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,i,k+1)
654      >              - tmp1 * njac(2,3,i,k+1)
655                lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,i,k+1)
656      >              - tmp1 * njac(2,4,i,k+1)
657                lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,i,k+1)
658      >              - tmp1 * njac(2,5,i,k+1)
659
660                lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,i,k+1)
661      >              - tmp1 * njac(3,1,i,k+1)
662                lhsc(3,2,i,j,k,c) =  tmp2 * fjac(3,2,i,k+1)
663      >              - tmp1 * njac(3,2,i,k+1)
664                lhsc(3,3,i,j,k,c) =  tmp2 * fjac(3,3,i,k+1)
665      >              - tmp1 * njac(3,3,i,k+1)
666      >              - tmp1 * dz3
667                lhsc(3,4,i,j,k,c) =  tmp2 * fjac(3,4,i,k+1)
668      >              - tmp1 * njac(3,4,i,k+1)
669                lhsc(3,5,i,j,k,c) =  tmp2 * fjac(3,5,i,k+1)
670      >              - tmp1 * njac(3,5,i,k+1)
671
672                lhsc(4,1,i,j,k,c) =  tmp2 * fjac(4,1,i,k+1)
673      >              - tmp1 * njac(4,1,i,k+1)
674                lhsc(4,2,i,j,k,c) =  tmp2 * fjac(4,2,i,k+1)
675      >              - tmp1 * njac(4,2,i,k+1)
676                lhsc(4,3,i,j,k,c) =  tmp2 * fjac(4,3,i,k+1)
677      >              - tmp1 * njac(4,3,i,k+1)
678                lhsc(4,4,i,j,k,c) =  tmp2 * fjac(4,4,i,k+1)
679      >              - tmp1 * njac(4,4,i,k+1)
680      >              - tmp1 * dz4
681                lhsc(4,5,i,j,k,c) =  tmp2 * fjac(4,5,i,k+1)
682      >              - tmp1 * njac(4,5,i,k+1)
683
684                lhsc(5,1,i,j,k,c) =  tmp2 * fjac(5,1,i,k+1)
685      >              - tmp1 * njac(5,1,i,k+1)
686                lhsc(5,2,i,j,k,c) =  tmp2 * fjac(5,2,i,k+1)
687      >              - tmp1 * njac(5,2,i,k+1)
688                lhsc(5,3,i,j,k,c) =  tmp2 * fjac(5,3,i,k+1)
689      >              - tmp1 * njac(5,3,i,k+1)
690                lhsc(5,4,i,j,k,c) =  tmp2 * fjac(5,4,i,k+1)
691      >              - tmp1 * njac(5,4,i,k+1)
692                lhsc(5,5,i,j,k,c) =  tmp2 * fjac(5,5,i,k+1)
693      >              - tmp1 * njac(5,5,i,k+1)
694      >              - tmp1 * dz5
695
696             enddo
697          enddo
698
699
700 c---------------------------------------------------------------------
701 c     outer most do loops - sweeping in i direction
702 c---------------------------------------------------------------------
703          if (first .eq. 1) then 
704
705 c---------------------------------------------------------------------
706 c     multiply c(i,j,kstart) by b_inverse and copy back to c
707 c     multiply rhs(kstart) by b_inverse(kstart) and copy to rhs
708 c---------------------------------------------------------------------
709 !dir$ ivdep
710             do i=start(1,c),isize
711                call binvcrhs( lhsb(1,1,i,kstart),
712      >                        lhsc(1,1,i,j,kstart,c),
713      >                        rhs(1,i,j,kstart,c) )
714             enddo
715
716          endif
717
718 c---------------------------------------------------------------------
719 c     begin inner most do loop
720 c     do all the elements of the cell unless last 
721 c---------------------------------------------------------------------
722          do k=kstart+first,ksize-last
723 !dir$ ivdep
724             do i=start(1,c),isize
725
726 c---------------------------------------------------------------------
727 c     subtract A*lhs_vector(k-1) from lhs_vector(k)
728 c     
729 c     rhs(k) = rhs(k) - A*rhs(k-1)
730 c---------------------------------------------------------------------
731                call matvec_sub(lhsa(1,1,i,k),
732      >                         rhs(1,i,j,k-1,c),rhs(1,i,j,k,c))
733
734 c---------------------------------------------------------------------
735 c     B(k) = B(k) - C(k-1)*A(k)
736 c     call matmul_sub(aa,i,j,k,c,cc,i,j,k-1,c,bb,i,j,k,c)
737 c---------------------------------------------------------------------
738                call matmul_sub(lhsa(1,1,i,k),
739      >                         lhsc(1,1,i,j,k-1,c),
740      >                         lhsb(1,1,i,k))
741
742 c---------------------------------------------------------------------
743 c     multiply c(i,j,k) by b_inverse and copy back to c
744 c     multiply rhs(i,j,1) by b_inverse(i,j,1) and copy to rhs
745 c---------------------------------------------------------------------
746                call binvcrhs( lhsb(1,1,i,k),
747      >                        lhsc(1,1,i,j,k,c),
748      >                        rhs(1,i,j,k,c) )
749
750             enddo
751          enddo
752
753 c---------------------------------------------------------------------
754 c     Now finish up special cases for last cell
755 c---------------------------------------------------------------------
756          if (last .eq. 1) then
757
758 !dir$ ivdep
759             do i=start(1,c),isize
760 c---------------------------------------------------------------------
761 c     rhs(ksize) = rhs(ksize) - A*rhs(ksize-1)
762 c---------------------------------------------------------------------
763                call matvec_sub(lhsa(1,1,i,ksize),
764      >                         rhs(1,i,j,ksize-1,c),rhs(1,i,j,ksize,c))
765
766 c---------------------------------------------------------------------
767 c     B(ksize) = B(ksize) - C(ksize-1)*A(ksize)
768 c     call matmul_sub(aa,i,j,ksize,c,
769 c     $              cc,i,j,ksize-1,c,bb,i,j,ksize,c)
770 c---------------------------------------------------------------------
771                call matmul_sub(lhsa(1,1,i,ksize),
772      >                         lhsc(1,1,i,j,ksize-1,c),
773      >                         lhsb(1,1,i,ksize))
774
775 c---------------------------------------------------------------------
776 c     multiply rhs(ksize) by b_inverse(ksize) and copy to rhs
777 c---------------------------------------------------------------------
778                call binvrhs( lhsb(1,1,i,ksize),
779      >                       rhs(1,i,j,ksize,c) )
780             enddo
781
782          endif
783       enddo
784
785
786       return
787       end
788       
789
790
791
792
793