2 c---------------------------------------------------------------------
3 c---------------------------------------------------------------------
7 c---------------------------------------------------------------------
8 c---------------------------------------------------------------------
10 c---------------------------------------------------------------------
11 c this function performs the solution of the approximate factorization
12 c step in the x-direction for all five matrix components
13 c simultaneously. The Thomas algorithm is employed to solve the
14 c systems for the x-lines. Boundary conditions are non-periodic
15 c---------------------------------------------------------------------
21 integer i, j, k, jp, kp, n, iend, jsize, ksize, i1, i2,
22 > buffer_size, c, m, p, istart, stage, error,
23 > requests(2), statuses(MPI_STATUS_SIZE, 2)
24 double precision r1, r2, d, e, s(5), sm1, sm2,
29 c---------------------------------------------------------------------
30 c OK, now we know that there are multiple processors
31 c---------------------------------------------------------------------
33 c---------------------------------------------------------------------
34 c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
35 c on this node in the direction of increasing i for the forward sweep,
36 c and after that reversing the direction for the backsubstitution.
37 c---------------------------------------------------------------------
39 c---------------------------------------------------------------------
41 c---------------------------------------------------------------------
46 iend = cell_size(1,c)-1
48 jsize = cell_size(2,c)
49 ksize = cell_size(3,c)
50 jp = cell_coord(2,c)-1
51 kp = cell_coord(3,c)-1
53 buffer_size = (jsize-start(2,c)-end(2,c)) *
54 > (ksize-start(3,c)-end(3,c))
56 if ( stage .ne. 1) then
58 c---------------------------------------------------------------------
59 c if this is not the first processor in this row of cells,
60 c receive data from predecessor containing the right hand
61 c sides and the upper diagonal elements of the previous two rows
62 c---------------------------------------------------------------------
63 call mpi_irecv(in_buffer, 22*buffer_size,
64 > dp_type, predecessor(1),
65 > DEFAULT_TAG, comm_solve,
69 c---------------------------------------------------------------------
70 c communication has already been started.
71 c compute the left hand side while waiting for the msg
72 c---------------------------------------------------------------------
75 c---------------------------------------------------------------------
76 c wait for pending communication to complete
77 c This waits on the current receive and on the send
78 c from the previous stage. They always come in pairs.
79 c---------------------------------------------------------------------
81 call mpi_waitall(2, requests, statuses, error)
83 c---------------------------------------------------------------------
85 c---------------------------------------------------------------------
90 c---------------------------------------------------------------------
91 c create a running pointer
92 c---------------------------------------------------------------------
94 do k = start(3,c), ksize-end(3,c)-1
95 do j = start(2,c), jsize-end(2,c)-1
96 lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
97 > in_buffer(p+1) * lhs(i,j,k,n+1,c)
98 lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
99 > in_buffer(p+2) * lhs(i,j,k,n+1,c)
101 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
102 > in_buffer(p+2+m) * lhs(i,j,k,n+1,c)
107 s(m) = in_buffer(p+7+m)
109 r1 = lhs(i,j,k,n+2,c)
110 lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
111 lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
113 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
115 r2 = lhs(i1,j,k,n+1,c)
116 lhs(i1,j,k,n+2,c) = lhs(i1,j,k,n+2,c) - d * r2
117 lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - e * r2
119 rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - s(m) * r2
127 do k = start(3,c), ksize-end(3,c)-1
128 do j = start(2,c), jsize-end(2,c)-1
129 lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
130 > in_buffer(p+1) * lhs(i,j,k,n+1,c)
131 lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
132 > in_buffer(p+2) * lhs(i,j,k,n+1,c)
133 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
134 > in_buffer(p+3) * lhs(i,j,k,n+1,c)
137 s(m) = in_buffer(p+6)
138 r1 = lhs(i,j,k,n+2,c)
139 lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
140 lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
141 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
142 r2 = lhs(i1,j,k,n+1,c)
143 lhs(i1,j,k,n+2,c) = lhs(i1,j,k,n+2,c) - d * r2
144 lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - e * r2
145 rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - s(m) * r2
153 c---------------------------------------------------------------------
154 c if this IS the first cell, we still compute the lhs
155 c---------------------------------------------------------------------
159 c---------------------------------------------------------------------
160 c perform the Thomas algorithm; first, FORWARD ELIMINATION
161 c---------------------------------------------------------------------
164 do k = start(3,c), ksize-end(3,c)-1
165 do j = start(2,c), jsize-end(2,c)-1
166 do i = istart, iend-2
169 fac1 = 1.d0/lhs(i,j,k,n+3,c)
170 lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
171 lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
173 rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
175 lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
176 > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
177 lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
178 > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
180 rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
181 > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
183 lhs(i2,j,k,n+2,c) = lhs(i2,j,k,n+2,c) -
184 > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+4,c)
185 lhs(i2,j,k,n+3,c) = lhs(i2,j,k,n+3,c) -
186 > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+5,c)
188 rhs(i2,j,k,m,c) = rhs(i2,j,k,m,c) -
189 > lhs(i2,j,k,n+1,c)*rhs(i,j,k,m,c)
195 c---------------------------------------------------------------------
196 c The last two rows in this grid block are a bit different,
197 c since they do not have two more rows available for the
198 c elimination of off-diagonal entries
199 c---------------------------------------------------------------------
203 do k = start(3,c), ksize-end(3,c)-1
204 do j = start(2,c), jsize-end(2,c)-1
205 fac1 = 1.d0/lhs(i,j,k,n+3,c)
206 lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
207 lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
209 rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
211 lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
212 > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
213 lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
214 > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
216 rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
217 > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
219 c---------------------------------------------------------------------
220 c scale the last row immediately (some of this is
221 c overkill in case this is the last cell)
222 c---------------------------------------------------------------------
223 fac2 = 1.d0/lhs(i1,j,k,n+3,c)
224 lhs(i1,j,k,n+4,c) = fac2*lhs(i1,j,k,n+4,c)
225 lhs(i1,j,k,n+5,c) = fac2*lhs(i1,j,k,n+5,c)
227 rhs(i1,j,k,m,c) = fac2*rhs(i1,j,k,m,c)
232 c---------------------------------------------------------------------
233 c do the u+c and the u-c factors
234 c---------------------------------------------------------------------
238 do k = start(3,c), ksize-end(3,c)-1
239 do j = start(2,c), jsize-end(2,c)-1
240 do i = istart, iend-2
243 fac1 = 1.d0/lhs(i,j,k,n+3,c)
244 lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
245 lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
246 rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
247 lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
248 > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
249 lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
250 > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
251 rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
252 > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
253 lhs(i2,j,k,n+2,c) = lhs(i2,j,k,n+2,c) -
254 > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+4,c)
255 lhs(i2,j,k,n+3,c) = lhs(i2,j,k,n+3,c) -
256 > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+5,c)
257 rhs(i2,j,k,m,c) = rhs(i2,j,k,m,c) -
258 > lhs(i2,j,k,n+1,c)*rhs(i,j,k,m,c)
263 c---------------------------------------------------------------------
264 c And again the last two rows separately
265 c---------------------------------------------------------------------
268 do k = start(3,c), ksize-end(3,c)-1
269 do j = start(2,c), jsize-end(2,c)-1
270 fac1 = 1.d0/lhs(i,j,k,n+3,c)
271 lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
272 lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
273 rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
274 lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
275 > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
276 lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
277 > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
278 rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
279 > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
280 c---------------------------------------------------------------------
281 c Scale the last row immediately (some of this is overkill
282 c if this is the last cell)
283 c---------------------------------------------------------------------
284 fac2 = 1.d0/lhs(i1,j,k,n+3,c)
285 lhs(i1,j,k,n+4,c) = fac2*lhs(i1,j,k,n+4,c)
286 lhs(i1,j,k,n+5,c) = fac2*lhs(i1,j,k,n+5,c)
287 rhs(i1,j,k,m,c) = fac2*rhs(i1,j,k,m,c)
293 c---------------------------------------------------------------------
294 c send information to the next processor, except when this
295 c is the last grid block
296 c---------------------------------------------------------------------
297 if (stage .ne. ncells) then
299 c---------------------------------------------------------------------
300 c create a running pointer for the send buffer
301 c---------------------------------------------------------------------
304 do k = start(3,c), ksize-end(3,c)-1
305 do j = start(2,c), jsize-end(2,c)-1
307 out_buffer(p+1) = lhs(i,j,k,n+4,c)
308 out_buffer(p+2) = lhs(i,j,k,n+5,c)
310 out_buffer(p+2+m) = rhs(i,j,k,m,c)
319 do k = start(3,c), ksize-end(3,c)-1
320 do j = start(2,c), jsize-end(2,c)-1
322 out_buffer(p+1) = lhs(i,j,k,n+4,c)
323 out_buffer(p+2) = lhs(i,j,k,n+5,c)
324 out_buffer(p+3) = rhs(i,j,k,m,c)
331 c---------------------------------------------------------------------
332 c send data to next phase
333 c can't receive data yet because buffer size will be wrong
334 c---------------------------------------------------------------------
335 call mpi_isend(out_buffer, 22*buffer_size,
336 > dp_type, successor(1),
337 > DEFAULT_TAG, comm_solve,
338 > requests(2), error)
343 c---------------------------------------------------------------------
344 c now go in the reverse direction
345 c---------------------------------------------------------------------
347 c---------------------------------------------------------------------
349 c---------------------------------------------------------------------
350 do stage = ncells, 1, -1
354 iend = cell_size(1,c)-1
356 jsize = cell_size(2,c)
357 ksize = cell_size(3,c)
358 jp = cell_coord(2,c)-1
359 kp = cell_coord(3,c)-1
361 buffer_size = (jsize-start(2,c)-end(2,c)) *
362 > (ksize-start(3,c)-end(3,c))
364 if (stage .ne. ncells) then
366 c---------------------------------------------------------------------
367 c if this is not the starting cell in this row of cells,
368 c wait for a message to be received, containing the
369 c solution of the previous two stations
370 c---------------------------------------------------------------------
371 call mpi_irecv(in_buffer, 10*buffer_size,
372 > dp_type, successor(1),
373 > DEFAULT_TAG, comm_solve,
374 > requests(1), error)
377 c---------------------------------------------------------------------
378 c communication has already been started
379 c while waiting, do the block-diagonal inversion for the
380 c cell that was just finished
381 c---------------------------------------------------------------------
383 call ninvr(slice(1,stage+1))
385 c---------------------------------------------------------------------
386 c wait for pending communication to complete
387 c---------------------------------------------------------------------
388 call mpi_waitall(2, requests, statuses, error)
390 c---------------------------------------------------------------------
391 c unpack the buffer for the first three factors
392 c---------------------------------------------------------------------
398 do k = start(3,c), ksize-end(3,c)-1
399 do j = start(2,c), jsize-end(2,c)-1
402 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
403 > lhs(i,j,k,n+4,c)*sm1 -
404 > lhs(i,j,k,n+5,c)*sm2
405 rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
406 > lhs(i1,j,k,n+4,c) * rhs(i,j,k,m,c) -
407 > lhs(i1,j,k,n+5,c) * sm1
413 c---------------------------------------------------------------------
414 c now unpack the buffer for the remaining two factors
415 c---------------------------------------------------------------------
418 do k = start(3,c), ksize-end(3,c)-1
419 do j = start(2,c), jsize-end(2,c)-1
422 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
423 > lhs(i,j,k,n+4,c)*sm1 -
424 > lhs(i,j,k,n+5,c)*sm2
425 rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
426 > lhs(i1,j,k,n+4,c) * rhs(i,j,k,m,c) -
427 > lhs(i1,j,k,n+5,c) * sm1
435 c---------------------------------------------------------------------
436 c now we know this is the first grid block on the back sweep,
437 c so we don't need a message to start the substitution.
438 c---------------------------------------------------------------------
443 do k = start(3,c), ksize-end(3,c)-1
444 do j = start(2,c), jsize-end(2,c)-1
445 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
446 > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c)
453 do k = start(3,c), ksize-end(3,c)-1
454 do j = start(2,c), jsize-end(2,c)-1
455 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
456 > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c)
462 c---------------------------------------------------------------------
463 c Whether or not this is the last processor, we always have
464 c to complete the back-substitution
465 c---------------------------------------------------------------------
467 c---------------------------------------------------------------------
468 c The first three factors
469 c---------------------------------------------------------------------
472 do k = start(3,c), ksize-end(3,c)-1
473 do j = start(2,c), jsize-end(2,c)-1
474 do i = iend-2, istart, -1
477 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
478 > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) -
479 > lhs(i,j,k,n+5,c)*rhs(i2,j,k,m,c)
485 c---------------------------------------------------------------------
486 c And the remaining two
487 c---------------------------------------------------------------------
490 do k = start(3,c), ksize-end(3,c)-1
491 do j = start(2,c), jsize-end(2,c)-1
492 do i = iend-2, istart, -1
495 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
496 > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) -
497 > lhs(i,j,k,n+5,c)*rhs(i2,j,k,m,c)
503 c---------------------------------------------------------------------
504 c send on information to the previous processor, if needed
505 c---------------------------------------------------------------------
506 if (stage .ne. 1) then
511 do k = start(3,c), ksize-end(3,c)-1
512 do j = start(2,c), jsize-end(2,c)-1
513 out_buffer(p+1) = rhs(i,j,k,m,c)
514 out_buffer(p+2) = rhs(i1,j,k,m,c)
520 c---------------------------------------------------------------------
521 c pack and send the buffer
522 c---------------------------------------------------------------------
523 call mpi_isend(out_buffer, 10*buffer_size,
524 > dp_type, predecessor(1),
525 > DEFAULT_TAG, comm_solve,
526 > requests(2), error)
530 c---------------------------------------------------------------------
531 c If this was the last stage, do the block-diagonal inversion
532 c---------------------------------------------------------------------
533 if (stage .eq. 1) call ninvr(c)