Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
ad0dc7e7277ecae89f0612dfbd33dabdf32ff6a4
[simgrid.git] / examples / smpi / NAS / SP / z_solve.f
1
2 c---------------------------------------------------------------------
3 c---------------------------------------------------------------------
4
5        subroutine z_solve
6
7 c---------------------------------------------------------------------
8 c---------------------------------------------------------------------
9
10 c---------------------------------------------------------------------
11 c this function performs the solution of the approximate factorization
12 c step in the z-direction for all five matrix components
13 c simultaneously. The Thomas algorithm is employed to solve the
14 c systems for the z-lines. Boundary conditions are non-periodic
15 c---------------------------------------------------------------------
16
17        include 'header.h'
18        include 'mpinpb.h'
19
20        integer i, j, k, stage, ip, jp, n, isize, jsize, kend, k1, k2,
21      >         buffer_size, c, m, p, kstart, error,
22      >         requests(2), statuses(MPI_STATUS_SIZE, 2)
23        double precision  r1, r2, d, e, s(5), sm1, sm2,
24      >                   fac1, fac2
25
26 c---------------------------------------------------------------------
27 c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
28 c on this node in the direction of increasing i for the forward sweep,
29 c and after that reversing the direction for the backsubstitution  
30 c---------------------------------------------------------------------
31
32 c---------------------------------------------------------------------
33 c                          FORWARD ELIMINATION  
34 c---------------------------------------------------------------------
35        do    stage = 1, ncells
36           c         = slice(3,stage)
37
38           kstart = 0
39           kend   = cell_size(3,c)-1
40
41           isize     = cell_size(1,c)
42           jsize     = cell_size(2,c)
43           ip        = cell_coord(1,c)-1
44           jp        = cell_coord(2,c)-1
45
46           buffer_size = (isize-start(1,c)-end(1,c)) * 
47      >                  (jsize-start(2,c)-end(2,c))
48
49           if (stage .ne. 1) then
50
51
52 c---------------------------------------------------------------------
53 c            if this is not the first processor in this row of cells, 
54 c            receive data from predecessor containing the right hand
55 c            sides and the upper diagonal elements of the previous two rows
56 c---------------------------------------------------------------------
57
58              call mpi_irecv(in_buffer, 22*buffer_size, 
59      >                      dp_type, predecessor(3), 
60      >                      DEFAULT_TAG, comm_solve, 
61      >                      requests(1), error)
62
63
64 c---------------------------------------------------------------------
65 c            communication has already been started. 
66 c            compute the left hand side while waiting for the msg
67 c---------------------------------------------------------------------
68              call lhsz(c)
69
70 c---------------------------------------------------------------------
71 c            wait for pending communication to complete
72 c---------------------------------------------------------------------
73              call mpi_waitall(2, requests, statuses, error)
74              
75 c---------------------------------------------------------------------
76 c            unpack the buffer                                 
77 c---------------------------------------------------------------------
78              k  = kstart
79              k1 = kstart + 1
80              n = 0
81
82 c---------------------------------------------------------------------
83 c            create a running pointer
84 c---------------------------------------------------------------------
85              p = 0
86              do    j = start(2,c), jsize-end(2,c)-1
87                 do    i = start(1,c), isize-end(1,c)-1
88                    lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
89      >                       in_buffer(p+1) * lhs(i,j,k,n+1,c)
90                    lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
91      >                       in_buffer(p+2) * lhs(i,j,k,n+1,c)
92                    do    m = 1, 3
93                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
94      >                       in_buffer(p+2+m) * lhs(i,j,k,n+1,c)
95                    end do
96                    d            = in_buffer(p+6)
97                    e            = in_buffer(p+7)
98                    do    m = 1, 3
99                       s(m) = in_buffer(p+7+m)
100                    end do
101                    r1 = lhs(i,j,k,n+2,c)
102                    lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
103                    lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
104                    do    m = 1, 3
105                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
106                    end do
107                    r2 = lhs(i,j,k1,n+1,c)
108                    lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2
109                    lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2
110                    do    m = 1, 3
111                       rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - s(m) * r2
112                    end do
113                    p = p + 10
114                 end do
115              end do
116
117              do    m = 4, 5
118                 n = (m-3)*5
119                 do    j = start(2,c), jsize-end(2,c)-1
120                    do    i = start(1,c), isize-end(1,c)-1
121                       lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
122      >                          in_buffer(p+1) * lhs(i,j,k,n+1,c)
123                       lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
124      >                          in_buffer(p+2) * lhs(i,j,k,n+1,c)
125                       rhs(i,j,k,m,c)   = rhs(i,j,k,m,c) -
126      >                          in_buffer(p+3) * lhs(i,j,k,n+1,c)
127                       d                = in_buffer(p+4)
128                       e                = in_buffer(p+5)
129                       s(m)             = in_buffer(p+6)
130                       r1 = lhs(i,j,k,n+2,c)
131                       lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
132                       lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
133                       rhs(i,j,k,m,c)   = rhs(i,j,k,m,c) - s(m) * r1
134                       r2 = lhs(i,j,k1,n+1,c)
135                       lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2
136                       lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2
137                       rhs(i,j,k1,m,c)   = rhs(i,j,k1,m,c) - s(m) * r2
138                       p = p + 6
139                    end do
140                 end do
141              end do
142
143           else            
144
145 c---------------------------------------------------------------------
146 c            if this IS the first cell, we still compute the lhs
147 c---------------------------------------------------------------------
148              call lhsz(c)
149           endif
150
151 c---------------------------------------------------------------------
152 c         perform the Thomas algorithm; first, FORWARD ELIMINATION     
153 c---------------------------------------------------------------------
154           n = 0
155
156           do    k = kstart, kend-2
157              do    j = start(2,c), jsize-end(2,c)-1
158                 do    i = start(1,c), isize-end(1,c)-1
159                    k1 = k  + 1
160                    k2 = k  + 2
161                    fac1               = 1.d0/lhs(i,j,k,n+3,c)
162                    lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
163                    lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
164                    do    m = 1, 3
165                       rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
166                    end do
167                    lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
168      >                         lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
169                    lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
170      >                         lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
171                    do    m = 1, 3
172                       rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
173      >                         lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
174                    end do
175                    lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) -
176      >                         lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c)
177                    lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) -
178      >                         lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c)
179                    do    m = 1, 3
180                       rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) -
181      >                         lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c)
182                    end do
183                 end do
184              end do
185           end do
186
187 c---------------------------------------------------------------------
188 c         The last two rows in this grid block are a bit different, 
189 c         since they do not have two more rows available for the
190 c         elimination of off-diagonal entries
191 c---------------------------------------------------------------------
192           k  = kend - 1
193           k1 = kend
194           do    j = start(2,c), jsize-end(2,c)-1
195              do    i = start(1,c), isize-end(1,c)-1
196                 fac1               = 1.d0/lhs(i,j,k,n+3,c)
197                 lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
198                 lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
199                 do    m = 1, 3
200                    rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
201                 end do
202                 lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
203      >                      lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
204                 lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
205      >                      lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
206                 do    m = 1, 3
207                    rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
208      >                      lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
209                 end do
210 c---------------------------------------------------------------------
211 c               scale the last row immediately (some of this is
212 c               overkill in case this is the last cell)
213 c---------------------------------------------------------------------
214                 fac2               = 1.d0/lhs(i,j,k1,n+3,c)
215                 lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c)
216                 lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c)  
217                 do    m = 1, 3
218                    rhs(i,j,k1,m,c) = fac2*rhs(i,j,k1,m,c)
219                 end do
220              end do
221           end do
222
223 c---------------------------------------------------------------------
224 c         do the u+c and the u-c factors               
225 c---------------------------------------------------------------------
226           do   m = 4, 5
227              n = (m-3)*5
228              do    k = kstart, kend-2
229                 do    j = start(2,c), jsize-end(2,c)-1
230                    do    i = start(1,c), isize-end(1,c)-1
231                    k1 = k  + 1
232                    k2 = k  + 2
233                    fac1               = 1.d0/lhs(i,j,k,n+3,c)
234                    lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
235                    lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
236                    rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
237                    lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
238      >                         lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
239                    lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
240      >                         lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
241                    rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
242      >                         lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
243                    lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) -
244      >                         lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c)
245                    lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) -
246      >                         lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c)
247                    rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) -
248      >                         lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c)
249                 end do
250              end do
251           end do
252
253 c---------------------------------------------------------------------
254 c            And again the last two rows separately
255 c---------------------------------------------------------------------
256              k  = kend - 1
257              k1 = kend
258              do    j = start(2,c), jsize-end(2,c)-1
259                 do    i = start(1,c), isize-end(1,c)-1
260                 fac1               = 1.d0/lhs(i,j,k,n+3,c)
261                 lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
262                 lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
263                 rhs(i,j,k,m,c)     = fac1*rhs(i,j,k,m,c)
264                 lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
265      >                      lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
266                 lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
267      >                      lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
268                 rhs(i,j,k1,m,c)   = rhs(i,j,k1,m,c) -
269      >                      lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
270 c---------------------------------------------------------------------
271 c               Scale the last row immediately (some of this is overkill
272 c               if this is the last cell)
273 c---------------------------------------------------------------------
274                 fac2               = 1.d0/lhs(i,j,k1,n+3,c)
275                 lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c)
276                 lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c)
277                 rhs(i,j,k1,m,c)   = fac2*rhs(i,j,k1,m,c)
278
279              end do
280           end do
281        end do
282
283 c---------------------------------------------------------------------
284 c         send information to the next processor, except when this
285 c         is the last grid block,
286 c---------------------------------------------------------------------
287
288           if (stage .ne. ncells) then
289
290 c---------------------------------------------------------------------
291 c            create a running pointer for the send buffer  
292 c---------------------------------------------------------------------
293              p = 0
294              n = 0
295              do    j = start(2,c), jsize-end(2,c)-1
296                 do    i = start(1,c), isize-end(1,c)-1
297                    do    k = kend-1, kend
298                       out_buffer(p+1) = lhs(i,j,k,n+4,c)
299                       out_buffer(p+2) = lhs(i,j,k,n+5,c)
300                       do    m = 1, 3
301                          out_buffer(p+2+m) = rhs(i,j,k,m,c)
302                       end do
303                       p = p+5
304                    end do
305                 end do
306              end do
307
308              do    m = 4, 5
309                 n = (m-3)*5
310                 do    j = start(2,c), jsize-end(2,c)-1
311                    do    i = start(1,c), isize-end(1,c)-1
312                       do    k = kend-1, kend
313                          out_buffer(p+1) = lhs(i,j,k,n+4,c)
314                          out_buffer(p+2) = lhs(i,j,k,n+5,c)
315                          out_buffer(p+3) = rhs(i,j,k,m,c)
316                          p = p + 3
317                       end do
318                    end do
319                 end do
320              end do
321
322
323              call mpi_isend(out_buffer, 22*buffer_size, 
324      >                     dp_type, successor(3), 
325      >                     DEFAULT_TAG, comm_solve, 
326      >                     requests(2), error)
327
328           endif
329        end do
330
331 c---------------------------------------------------------------------
332 c      now go in the reverse direction                      
333 c---------------------------------------------------------------------
334
335 c---------------------------------------------------------------------
336 c                         BACKSUBSTITUTION 
337 c---------------------------------------------------------------------
338        do    stage = ncells, 1, -1
339           c = slice(3,stage)
340
341           kstart = 0
342           kend   = cell_size(3,c)-1
343
344           isize     = cell_size(1,c)
345           jsize     = cell_size(2,c)
346           ip        = cell_coord(1,c)-1
347           jp        = cell_coord(2,c)-1
348
349           buffer_size = (isize-start(1,c)-end(1,c)) * 
350      >                  (jsize-start(2,c)-end(2,c))
351
352           if (stage .ne. ncells) then
353
354 c---------------------------------------------------------------------
355 c            if this is not the starting cell in this row of cells, 
356 c            wait for a message to be received, containing the 
357 c            solution of the previous two stations     
358 c---------------------------------------------------------------------
359
360              call mpi_irecv(in_buffer, 10*buffer_size, 
361      >                      dp_type, successor(3), 
362      >                      DEFAULT_TAG, comm_solve, 
363      >                      requests(1), error)
364
365
366 c---------------------------------------------------------------------
367 c            communication has already been started
368 c            while waiting, do the  block-diagonal inversion for the 
369 c            cell that was just finished                
370 c---------------------------------------------------------------------
371
372              call tzetar(slice(3,stage+1))
373
374 c---------------------------------------------------------------------
375 c            wait for pending communication to complete
376 c---------------------------------------------------------------------
377              call mpi_waitall(2, requests, statuses, error)
378
379 c---------------------------------------------------------------------
380 c            unpack the buffer for the first three factors         
381 c---------------------------------------------------------------------
382              n = 0
383              p = 0
384              k  = kend
385              k1 = k - 1
386              do    m = 1, 3
387                 do   j = start(2,c), jsize-end(2,c)-1
388                    do   i = start(1,c), isize-end(1,c)-1
389                       sm1 = in_buffer(p+1)
390                       sm2 = in_buffer(p+2)
391                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
392      >                        lhs(i,j,k,n+4,c)*sm1 -
393      >                        lhs(i,j,k,n+5,c)*sm2
394                       rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
395      >                        lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) - 
396      >                        lhs(i,j,k1,n+5,c) * sm1
397                       p = p + 2
398                    end do
399                 end do
400              end do
401
402 c---------------------------------------------------------------------
403 c            now unpack the buffer for the remaining two factors
404 c---------------------------------------------------------------------
405              do    m = 4, 5
406                 n = (m-3)*5
407                 do   j = start(2,c), jsize-end(2,c)-1
408                    do   i = start(1,c), isize-end(1,c)-1
409                       sm1 = in_buffer(p+1)
410                       sm2 = in_buffer(p+2)
411                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
412      >                        lhs(i,j,k,n+4,c)*sm1 -
413      >                        lhs(i,j,k,n+5,c)*sm2
414                       rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
415      >                        lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) - 
416      >                        lhs(i,j,k1,n+5,c) * sm1
417                       p = p + 2
418                    end do
419                 end do
420              end do
421
422           else
423
424 c---------------------------------------------------------------------
425 c            now we know this is the first grid block on the back sweep,
426 c            so we don't need a message to start the substitution. 
427 c---------------------------------------------------------------------
428
429              k  = kend - 1
430              k1 = kend
431              n = 0
432              do   m = 1, 3
433                 do   j = start(2,c), jsize-end(2,c)-1
434                    do   i = start(1,c), isize-end(1,c)-1
435                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
436      >                             lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c)
437                    end do
438                 end do
439              end do
440
441              do    m = 4, 5
442                 n = (m-3)*5
443                 do   j = start(2,c), jsize-end(2,c)-1
444                    do   i = start(1,c), isize-end(1,c)-1
445                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
446      >                             lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c)
447                    end do
448                 end do
449              end do
450           endif
451
452 c---------------------------------------------------------------------
453 c         Whether or not this is the last processor, we always have
454 c         to complete the back-substitution 
455 c---------------------------------------------------------------------
456
457 c---------------------------------------------------------------------
458 c         The first three factors
459 c---------------------------------------------------------------------
460           n = 0
461           do   m = 1, 3
462              do   k = kend-2, kstart, -1
463                 do   j = start(2,c), jsize-end(2,c)-1
464                    do    i = start(1,c), isize-end(1,c)-1
465                       k1 = k  + 1
466                       k2 = k  + 2
467                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
468      >                          lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) -
469      >                          lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c)
470                    end do
471                 end do
472              end do
473           end do
474
475 c---------------------------------------------------------------------
476 c         And the remaining two
477 c---------------------------------------------------------------------
478           do    m = 4, 5
479              n = (m-3)*5
480              do   k = kend-2, kstart, -1
481                 do   j = start(2,c), jsize-end(2,c)-1
482                    do    i = start(1,c), isize-end(1,c)-1
483                       k1 = k  + 1
484                       k2 = k  + 2
485                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
486      >                          lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) -
487      >                          lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c)
488                    end do
489                 end do
490              end do
491           end do
492
493 c---------------------------------------------------------------------
494 c         send on information to the previous processor, if needed
495 c---------------------------------------------------------------------
496           if (stage .ne.  1) then
497              k  = kstart
498              k1 = kstart + 1
499              p = 0
500              do    m = 1, 5
501                 do    j = start(2,c), jsize-end(2,c)-1
502                    do    i = start(1,c), isize-end(1,c)-1
503                       out_buffer(p+1) = rhs(i,j,k,m,c)
504                       out_buffer(p+2) = rhs(i,j,k1,m,c)
505                       p = p + 2
506                    end do
507                 end do
508              end do
509
510              call mpi_isend(out_buffer, 10*buffer_size, 
511      >                     dp_type, predecessor(3), 
512      >                     DEFAULT_TAG, comm_solve, 
513      >                     requests(2), error)
514
515           endif
516
517 c---------------------------------------------------------------------
518 c         If this was the last stage, do the block-diagonal inversion
519 c---------------------------------------------------------------------
520           if (stage .eq. 1) call tzetar(c)
521
522        end do
523
524        return
525        end
526     
527
528
529
530
531
532