Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
cd40756ec39f5038871335b63c424404b3ee6f68
[simgrid.git] / examples / smpi / NAS / SP / x_solve.f
1
2 c---------------------------------------------------------------------
3 c---------------------------------------------------------------------
4
5        subroutine x_solve
6
7 c---------------------------------------------------------------------
8 c---------------------------------------------------------------------
9
10 c---------------------------------------------------------------------
11 c this function performs the solution of the approximate factorization
12 c step in the x-direction for all five matrix components
13 c simultaneously. The Thomas algorithm is employed to solve the
14 c systems for the x-lines. Boundary conditions are non-periodic
15 c---------------------------------------------------------------------
16
17        include 'header.h'
18        include 'mpinpb.h'
19
20
21        integer i, j, k, jp, kp, n, iend, jsize, ksize, i1, i2,
22      >         buffer_size, c, m, p, istart, stage, error,
23      >         requests(2), statuses(MPI_STATUS_SIZE, 2)
24        double precision  r1, r2, d, e, s(5), sm1, sm2,
25      >                   fac1, fac2
26
27
28
29 c---------------------------------------------------------------------
30 c      OK, now we know that there are multiple processors
31 c---------------------------------------------------------------------
32
33 c---------------------------------------------------------------------
34 c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
35 c on this node in the direction of increasing i for the forward sweep,
36 c and after that reversing the direction for the backsubstitution.
37 c---------------------------------------------------------------------
38
39 c---------------------------------------------------------------------
40 c                          FORWARD ELIMINATION  
41 c---------------------------------------------------------------------
42        do    stage = 1, ncells
43           c         = slice(1,stage)
44
45           istart = 0
46           iend   = cell_size(1,c)-1
47
48           jsize     = cell_size(2,c)
49           ksize     = cell_size(3,c)
50           jp        = cell_coord(2,c)-1
51           kp        = cell_coord(3,c)-1
52
53           buffer_size = (jsize-start(2,c)-end(2,c)) * 
54      >                  (ksize-start(3,c)-end(3,c))
55
56           if ( stage .ne. 1) then
57
58 c---------------------------------------------------------------------
59 c            if this is not the first processor in this row of cells, 
60 c            receive data from predecessor containing the right hand
61 c            sides and the upper diagonal elements of the previous two rows
62 c---------------------------------------------------------------------
63              call mpi_irecv(in_buffer, 22*buffer_size, 
64      >                      dp_type, predecessor(1), 
65      >                      DEFAULT_TAG,  comm_solve, 
66      >                      requests(1), error)
67
68
69 c---------------------------------------------------------------------
70 c            communication has already been started. 
71 c            compute the left hand side while waiting for the msg
72 c---------------------------------------------------------------------
73              call lhsx(c)
74
75 c---------------------------------------------------------------------
76 c            wait for pending communication to complete
77 c            This waits on the current receive and on the send
78 c            from the previous stage. They always come in pairs. 
79 c---------------------------------------------------------------------
80
81              call mpi_waitall(2, requests, statuses, error)
82
83 c---------------------------------------------------------------------
84 c            unpack the buffer                                 
85 c---------------------------------------------------------------------
86              i  = istart
87              i1 = istart + 1
88              n = 0
89
90 c---------------------------------------------------------------------
91 c            create a running pointer
92 c---------------------------------------------------------------------
93              p = 0
94              do    k = start(3,c), ksize-end(3,c)-1
95                 do    j = start(2,c), jsize-end(2,c)-1
96                    lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
97      >                       in_buffer(p+1) * lhs(i,j,k,n+1,c)
98                    lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
99      >                       in_buffer(p+2) * lhs(i,j,k,n+1,c)
100                    do    m = 1, 3
101                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
102      >                       in_buffer(p+2+m) * lhs(i,j,k,n+1,c)
103                    end do
104                    d            = in_buffer(p+6)
105                    e            = in_buffer(p+7)
106                    do    m = 1, 3
107                       s(m) = in_buffer(p+7+m)
108                    end do
109                    r1 = lhs(i,j,k,n+2,c)
110                    lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
111                    lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
112                    do    m = 1, 3
113                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
114                    end do
115                    r2 = lhs(i1,j,k,n+1,c)
116                    lhs(i1,j,k,n+2,c) = lhs(i1,j,k,n+2,c) - d * r2
117                    lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - e * r2
118                    do    m = 1, 3
119                       rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - s(m) * r2
120                    end do
121                    p = p + 10
122                 end do
123              end do
124
125              do    m = 4, 5
126                 n = (m-3)*5
127                 do    k = start(3,c), ksize-end(3,c)-1
128                    do    j = start(2,c), jsize-end(2,c)-1
129                       lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
130      >                          in_buffer(p+1) * lhs(i,j,k,n+1,c)
131                       lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
132      >                          in_buffer(p+2) * lhs(i,j,k,n+1,c)
133                       rhs(i,j,k,m,c)   = rhs(i,j,k,m,c) -
134      >                          in_buffer(p+3) * lhs(i,j,k,n+1,c)
135                       d                = in_buffer(p+4)
136                       e                = in_buffer(p+5)
137                       s(m)             = in_buffer(p+6)
138                       r1 = lhs(i,j,k,n+2,c)
139                       lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
140                       lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
141                       rhs(i,j,k,m,c)   = rhs(i,j,k,m,c) - s(m) * r1
142                       r2 = lhs(i1,j,k,n+1,c)
143                       lhs(i1,j,k,n+2,c) = lhs(i1,j,k,n+2,c) - d * r2
144                       lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - e * r2
145                       rhs(i1,j,k,m,c)   = rhs(i1,j,k,m,c) - s(m) * r2
146                       p = p + 6
147                    end do
148                 end do
149              end do
150
151           else            
152
153 c---------------------------------------------------------------------
154 c            if this IS the first cell, we still compute the lhs
155 c---------------------------------------------------------------------
156              call lhsx(c)
157           endif
158
159 c---------------------------------------------------------------------
160 c         perform the Thomas algorithm; first, FORWARD ELIMINATION     
161 c---------------------------------------------------------------------
162           n = 0
163
164           do    k = start(3,c), ksize-end(3,c)-1
165              do    j = start(2,c), jsize-end(2,c)-1
166                 do    i = istart, iend-2
167                    i1 = i  + 1
168                    i2 = i  + 2
169                    fac1               = 1.d0/lhs(i,j,k,n+3,c)
170                    lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
171                    lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
172                    do    m = 1, 3
173                       rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
174                    end do
175                    lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
176      >                         lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
177                    lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
178      >                         lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
179                    do    m = 1, 3
180                       rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
181      >                         lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
182                    end do
183                    lhs(i2,j,k,n+2,c) = lhs(i2,j,k,n+2,c) -
184      >                         lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+4,c)
185                    lhs(i2,j,k,n+3,c) = lhs(i2,j,k,n+3,c) -
186      >                         lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+5,c)
187                    do    m = 1, 3
188                       rhs(i2,j,k,m,c) = rhs(i2,j,k,m,c) -
189      >                         lhs(i2,j,k,n+1,c)*rhs(i,j,k,m,c)
190                    end do
191                 end do
192              end do
193           end do
194
195 c---------------------------------------------------------------------
196 c         The last two rows in this grid block are a bit different, 
197 c         since they do not have two more rows available for the
198 c         elimination of off-diagonal entries
199 c---------------------------------------------------------------------
200
201           i  = iend - 1
202           i1 = iend
203           do    k = start(3,c), ksize-end(3,c)-1
204              do    j = start(2,c), jsize-end(2,c)-1
205                 fac1               = 1.d0/lhs(i,j,k,n+3,c)
206                 lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
207                 lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
208                 do    m = 1, 3
209                    rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
210                 end do
211                 lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
212      >                      lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
213                 lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
214      >                      lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
215                 do    m = 1, 3
216                    rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
217      >                      lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
218                 end do
219 c---------------------------------------------------------------------
220 c               scale the last row immediately (some of this is
221 c               overkill in case this is the last cell)
222 c---------------------------------------------------------------------
223                 fac2               = 1.d0/lhs(i1,j,k,n+3,c)
224                 lhs(i1,j,k,n+4,c) = fac2*lhs(i1,j,k,n+4,c)
225                 lhs(i1,j,k,n+5,c) = fac2*lhs(i1,j,k,n+5,c)  
226                 do    m = 1, 3
227                    rhs(i1,j,k,m,c) = fac2*rhs(i1,j,k,m,c)
228                 end do
229              end do
230           end do
231
232 c---------------------------------------------------------------------
233 c         do the u+c and the u-c factors                 
234 c---------------------------------------------------------------------
235
236           do    m = 4, 5
237              n = (m-3)*5
238              do    k = start(3,c), ksize-end(3,c)-1
239                 do    j = start(2,c), jsize-end(2,c)-1
240                    do    i = istart, iend-2
241                    i1 = i  + 1
242                    i2 = i  + 2
243                    fac1               = 1.d0/lhs(i,j,k,n+3,c)
244                    lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
245                    lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
246                    rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
247                    lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
248      >                         lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
249                    lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
250      >                         lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
251                    rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
252      >                         lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
253                    lhs(i2,j,k,n+2,c) = lhs(i2,j,k,n+2,c) -
254      >                         lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+4,c)
255                    lhs(i2,j,k,n+3,c) = lhs(i2,j,k,n+3,c) -
256      >                         lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+5,c)
257                    rhs(i2,j,k,m,c) = rhs(i2,j,k,m,c) -
258      >                         lhs(i2,j,k,n+1,c)*rhs(i,j,k,m,c)
259                 end do
260              end do
261           end do
262
263 c---------------------------------------------------------------------
264 c            And again the last two rows separately
265 c---------------------------------------------------------------------
266              i  = iend - 1
267              i1 = iend
268              do    k = start(3,c), ksize-end(3,c)-1
269                 do    j = start(2,c), jsize-end(2,c)-1
270                 fac1               = 1.d0/lhs(i,j,k,n+3,c)
271                 lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
272                 lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
273                 rhs(i,j,k,m,c)     = fac1*rhs(i,j,k,m,c)
274                 lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
275      >                      lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
276                 lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
277      >                      lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
278                 rhs(i1,j,k,m,c)   = rhs(i1,j,k,m,c) -
279      >                      lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
280 c---------------------------------------------------------------------
281 c               Scale the last row immediately (some of this is overkill
282 c               if this is the last cell)
283 c---------------------------------------------------------------------
284                 fac2               = 1.d0/lhs(i1,j,k,n+3,c)
285                 lhs(i1,j,k,n+4,c) = fac2*lhs(i1,j,k,n+4,c)
286                 lhs(i1,j,k,n+5,c) = fac2*lhs(i1,j,k,n+5,c)
287                 rhs(i1,j,k,m,c)   = fac2*rhs(i1,j,k,m,c)
288
289              end do
290           end do
291        end do
292
293 c---------------------------------------------------------------------
294 c         send information to the next processor, except when this
295 c         is the last grid block
296 c---------------------------------------------------------------------
297           if (stage .ne. ncells) then
298
299 c---------------------------------------------------------------------
300 c            create a running pointer for the send buffer  
301 c---------------------------------------------------------------------
302              p = 0
303              n = 0
304              do    k = start(3,c), ksize-end(3,c)-1
305                 do    j = start(2,c), jsize-end(2,c)-1
306                    do    i = iend-1, iend
307                       out_buffer(p+1) = lhs(i,j,k,n+4,c)
308                       out_buffer(p+2) = lhs(i,j,k,n+5,c)
309                       do    m = 1, 3
310                          out_buffer(p+2+m) = rhs(i,j,k,m,c)
311                       end do
312                       p = p+5
313                    end do
314                 end do
315              end do
316
317              do    m = 4, 5
318                 n = (m-3)*5
319                 do    k = start(3,c), ksize-end(3,c)-1
320                    do    j = start(2,c), jsize-end(2,c)-1
321                       do    i = iend-1, iend
322                          out_buffer(p+1) = lhs(i,j,k,n+4,c)
323                          out_buffer(p+2) = lhs(i,j,k,n+5,c)
324                          out_buffer(p+3) = rhs(i,j,k,m,c)
325                          p = p + 3
326                       end do
327                    end do
328                 end do
329              end do
330
331 c---------------------------------------------------------------------
332 c send data to next phase
333 c can't receive data yet because buffer size will be wrong 
334 c---------------------------------------------------------------------
335              call mpi_isend(out_buffer, 22*buffer_size, 
336      >                     dp_type, successor(1), 
337      >                     DEFAULT_TAG, comm_solve, 
338      >                     requests(2), error)
339
340           endif
341        end do
342
343 c---------------------------------------------------------------------
344 c      now go in the reverse direction                      
345 c---------------------------------------------------------------------
346
347 c---------------------------------------------------------------------
348 c                         BACKSUBSTITUTION 
349 c---------------------------------------------------------------------
350        do    stage = ncells, 1, -1
351           c = slice(1,stage)
352
353           istart = 0
354           iend   = cell_size(1,c)-1
355
356           jsize = cell_size(2,c)
357           ksize = cell_size(3,c)
358           jp    = cell_coord(2,c)-1
359           kp    = cell_coord(3,c)-1
360
361           buffer_size = (jsize-start(2,c)-end(2,c)) * 
362      >                  (ksize-start(3,c)-end(3,c))
363
364           if (stage .ne. ncells) then
365
366 c---------------------------------------------------------------------
367 c            if this is not the starting cell in this row of cells, 
368 c            wait for a message to be received, containing the 
369 c            solution of the previous two stations     
370 c---------------------------------------------------------------------
371              call mpi_irecv(in_buffer, 10*buffer_size, 
372      >                      dp_type, successor(1), 
373      >                      DEFAULT_TAG, comm_solve, 
374      >                      requests(1), error)
375
376
377 c---------------------------------------------------------------------
378 c            communication has already been started
379 c            while waiting, do the block-diagonal inversion for the 
380 c            cell that was just finished                
381 c---------------------------------------------------------------------
382
383              call ninvr(slice(1,stage+1))
384
385 c---------------------------------------------------------------------
386 c            wait for pending communication to complete
387 c---------------------------------------------------------------------
388              call mpi_waitall(2, requests, statuses, error)
389
390 c---------------------------------------------------------------------
391 c            unpack the buffer for the first three factors         
392 c---------------------------------------------------------------------
393              n = 0
394              p = 0
395              i  = iend
396              i1 = i - 1
397              do    m = 1, 3
398                 do   k = start(3,c), ksize-end(3,c)-1
399                    do   j = start(2,c), jsize-end(2,c)-1
400                       sm1 = in_buffer(p+1)
401                       sm2 = in_buffer(p+2)
402                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
403      >                        lhs(i,j,k,n+4,c)*sm1 -
404      >                        lhs(i,j,k,n+5,c)*sm2
405                       rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
406      >                        lhs(i1,j,k,n+4,c) * rhs(i,j,k,m,c) - 
407      >                        lhs(i1,j,k,n+5,c) * sm1
408                       p = p + 2
409                    end do
410                 end do
411              end do
412
413 c---------------------------------------------------------------------
414 c            now unpack the buffer for the remaining two factors
415 c---------------------------------------------------------------------
416              do    m = 4, 5
417                 n = (m-3)*5
418                 do   k = start(3,c), ksize-end(3,c)-1
419                    do   j = start(2,c), jsize-end(2,c)-1
420                       sm1 = in_buffer(p+1)
421                       sm2 = in_buffer(p+2)
422                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
423      >                        lhs(i,j,k,n+4,c)*sm1 -
424      >                        lhs(i,j,k,n+5,c)*sm2
425                       rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
426      >                        lhs(i1,j,k,n+4,c) * rhs(i,j,k,m,c) - 
427      >                        lhs(i1,j,k,n+5,c) * sm1
428                       p = p + 2
429                    end do
430                 end do
431              end do
432
433           else
434
435 c---------------------------------------------------------------------
436 c            now we know this is the first grid block on the back sweep,
437 c            so we don't need a message to start the substitution. 
438 c---------------------------------------------------------------------
439              i  = iend-1
440              i1 = iend
441              n = 0
442              do   m = 1, 3
443                 do   k = start(3,c), ksize-end(3,c)-1
444                    do   j = start(2,c), jsize-end(2,c)-1
445                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
446      >                             lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c)
447                    end do
448                 end do
449              end do
450
451              do    m = 4, 5
452                 n = (m-3)*5
453                 do   k = start(3,c), ksize-end(3,c)-1
454                    do   j = start(2,c), jsize-end(2,c)-1
455                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
456      >                             lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c)
457                    end do
458                 end do
459              end do
460           endif
461
462 c---------------------------------------------------------------------
463 c         Whether or not this is the last processor, we always have
464 c         to complete the back-substitution 
465 c---------------------------------------------------------------------
466
467 c---------------------------------------------------------------------
468 c         The first three factors
469 c---------------------------------------------------------------------
470           n = 0
471           do   m = 1, 3
472              do   k = start(3,c), ksize-end(3,c)-1
473                 do   j = start(2,c), jsize-end(2,c)-1
474                    do    i = iend-2, istart, -1
475                       i1 = i  + 1
476                       i2 = i  + 2
477                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
478      >                          lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) -
479      >                          lhs(i,j,k,n+5,c)*rhs(i2,j,k,m,c)
480                    end do
481                 end do
482              end do
483           end do
484
485 c---------------------------------------------------------------------
486 c         And the remaining two
487 c---------------------------------------------------------------------
488           do    m = 4, 5
489              n = (m-3)*5
490              do   k = start(3,c), ksize-end(3,c)-1
491                 do   j = start(2,c), jsize-end(2,c)-1
492                    do    i = iend-2, istart, -1
493                       i1 = i  + 1
494                       i2 = i  + 2
495                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - 
496      >                          lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) -
497      >                          lhs(i,j,k,n+5,c)*rhs(i2,j,k,m,c)
498                    end do
499                 end do
500              end do
501           end do
502
503 c---------------------------------------------------------------------
504 c         send on information to the previous processor, if needed
505 c---------------------------------------------------------------------
506           if (stage .ne.  1) then
507              i  = istart
508              i1 = istart+1
509              p = 0
510              do    m = 1, 5
511                 do    k = start(3,c), ksize-end(3,c)-1
512                    do    j = start(2,c), jsize-end(2,c)-1
513                       out_buffer(p+1) = rhs(i,j,k,m,c)
514                        out_buffer(p+2) = rhs(i1,j,k,m,c)
515                       p = p + 2
516                    end do
517                 end do
518              end do
519
520 c---------------------------------------------------------------------
521 c            pack and send the buffer
522 c---------------------------------------------------------------------
523              call mpi_isend(out_buffer, 10*buffer_size, 
524      >                     dp_type, predecessor(1), 
525      >                     DEFAULT_TAG, comm_solve, 
526      >                     requests(2), error)
527
528           endif
529
530 c---------------------------------------------------------------------
531 c         If this was the last stage, do the block-diagonal inversion          
532 c---------------------------------------------------------------------
533           if (stage .eq. 1) call ninvr(c)
534
535        end do
536
537        return
538        end
539     
540
541
542
543
544
545