2 c---------------------------------------------------------------------
3 c---------------------------------------------------------------------
7 c---------------------------------------------------------------------
8 c---------------------------------------------------------------------
10 c---------------------------------------------------------------------
11 c this function performs the solution of the approximate factorization
12 c step in the z-direction for all five matrix components
13 c simultaneously. The Thomas algorithm is employed to solve the
14 c systems for the z-lines. Boundary conditions are non-periodic
15 c---------------------------------------------------------------------
20 integer i, j, k, stage, ip, jp, n, isize, jsize, kend, k1, k2,
21 > buffer_size, c, m, p, kstart, error,
22 > requests(2), statuses(MPI_STATUS_SIZE, 2)
23 double precision r1, r2, d, e, s(5), sm1, sm2,
26 c---------------------------------------------------------------------
27 c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
28 c on this node in the direction of increasing i for the forward sweep,
29 c and after that reversing the direction for the backsubstitution
30 c---------------------------------------------------------------------
32 c---------------------------------------------------------------------
34 c---------------------------------------------------------------------
39 kend = cell_size(3,c)-1
41 isize = cell_size(1,c)
42 jsize = cell_size(2,c)
43 ip = cell_coord(1,c)-1
44 jp = cell_coord(2,c)-1
46 buffer_size = (isize-start(1,c)-end(1,c)) *
47 > (jsize-start(2,c)-end(2,c))
49 if (stage .ne. 1) then
52 c---------------------------------------------------------------------
53 c if this is not the first processor in this row of cells,
54 c receive data from predecessor containing the right hand
55 c sides and the upper diagonal elements of the previous two rows
56 c---------------------------------------------------------------------
58 call mpi_irecv(in_buffer, 22*buffer_size,
59 > dp_type, predecessor(3),
60 > DEFAULT_TAG, comm_solve,
64 c---------------------------------------------------------------------
65 c communication has already been started.
66 c compute the left hand side while waiting for the msg
67 c---------------------------------------------------------------------
70 c---------------------------------------------------------------------
71 c wait for pending communication to complete
72 c---------------------------------------------------------------------
73 call mpi_waitall(2, requests, statuses, error)
75 c---------------------------------------------------------------------
77 c---------------------------------------------------------------------
82 c---------------------------------------------------------------------
83 c create a running pointer
84 c---------------------------------------------------------------------
86 do j = start(2,c), jsize-end(2,c)-1
87 do i = start(1,c), isize-end(1,c)-1
88 lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
89 > in_buffer(p+1) * lhs(i,j,k,n+1,c)
90 lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
91 > in_buffer(p+2) * lhs(i,j,k,n+1,c)
93 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
94 > in_buffer(p+2+m) * lhs(i,j,k,n+1,c)
99 s(m) = in_buffer(p+7+m)
101 r1 = lhs(i,j,k,n+2,c)
102 lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
103 lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
105 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
107 r2 = lhs(i,j,k1,n+1,c)
108 lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2
109 lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2
111 rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - s(m) * r2
119 do j = start(2,c), jsize-end(2,c)-1
120 do i = start(1,c), isize-end(1,c)-1
121 lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
122 > in_buffer(p+1) * lhs(i,j,k,n+1,c)
123 lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
124 > in_buffer(p+2) * lhs(i,j,k,n+1,c)
125 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
126 > in_buffer(p+3) * lhs(i,j,k,n+1,c)
129 s(m) = in_buffer(p+6)
130 r1 = lhs(i,j,k,n+2,c)
131 lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
132 lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
133 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
134 r2 = lhs(i,j,k1,n+1,c)
135 lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2
136 lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2
137 rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - s(m) * r2
145 c---------------------------------------------------------------------
146 c if this IS the first cell, we still compute the lhs
147 c---------------------------------------------------------------------
151 c---------------------------------------------------------------------
152 c perform the Thomas algorithm; first, FORWARD ELIMINATION
153 c---------------------------------------------------------------------
156 do k = kstart, kend-2
157 do j = start(2,c), jsize-end(2,c)-1
158 do i = start(1,c), isize-end(1,c)-1
161 fac1 = 1.d0/lhs(i,j,k,n+3,c)
162 lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
163 lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
165 rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
167 lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
168 > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
169 lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
170 > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
172 rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
173 > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
175 lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) -
176 > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c)
177 lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) -
178 > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c)
180 rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) -
181 > lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c)
187 c---------------------------------------------------------------------
188 c The last two rows in this grid block are a bit different,
189 c since they do not have two more rows available for the
190 c elimination of off-diagonal entries
191 c---------------------------------------------------------------------
194 do j = start(2,c), jsize-end(2,c)-1
195 do i = start(1,c), isize-end(1,c)-1
196 fac1 = 1.d0/lhs(i,j,k,n+3,c)
197 lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
198 lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
200 rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
202 lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
203 > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
204 lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
205 > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
207 rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
208 > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
210 c---------------------------------------------------------------------
211 c scale the last row immediately (some of this is
212 c overkill in case this is the last cell)
213 c---------------------------------------------------------------------
214 fac2 = 1.d0/lhs(i,j,k1,n+3,c)
215 lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c)
216 lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c)
218 rhs(i,j,k1,m,c) = fac2*rhs(i,j,k1,m,c)
223 c---------------------------------------------------------------------
224 c do the u+c and the u-c factors
225 c---------------------------------------------------------------------
228 do k = kstart, kend-2
229 do j = start(2,c), jsize-end(2,c)-1
230 do i = start(1,c), isize-end(1,c)-1
233 fac1 = 1.d0/lhs(i,j,k,n+3,c)
234 lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
235 lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
236 rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
237 lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
238 > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
239 lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
240 > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
241 rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
242 > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
243 lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) -
244 > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c)
245 lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) -
246 > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c)
247 rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) -
248 > lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c)
253 c---------------------------------------------------------------------
254 c And again the last two rows separately
255 c---------------------------------------------------------------------
258 do j = start(2,c), jsize-end(2,c)-1
259 do i = start(1,c), isize-end(1,c)-1
260 fac1 = 1.d0/lhs(i,j,k,n+3,c)
261 lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
262 lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
263 rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
264 lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
265 > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
266 lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
267 > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
268 rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
269 > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
270 c---------------------------------------------------------------------
271 c Scale the last row immediately (some of this is overkill
272 c if this is the last cell)
273 c---------------------------------------------------------------------
274 fac2 = 1.d0/lhs(i,j,k1,n+3,c)
275 lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c)
276 lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c)
277 rhs(i,j,k1,m,c) = fac2*rhs(i,j,k1,m,c)
283 c---------------------------------------------------------------------
284 c send information to the next processor, except when this
285 c is the last grid block,
286 c---------------------------------------------------------------------
288 if (stage .ne. ncells) then
290 c---------------------------------------------------------------------
291 c create a running pointer for the send buffer
292 c---------------------------------------------------------------------
295 do j = start(2,c), jsize-end(2,c)-1
296 do i = start(1,c), isize-end(1,c)-1
298 out_buffer(p+1) = lhs(i,j,k,n+4,c)
299 out_buffer(p+2) = lhs(i,j,k,n+5,c)
301 out_buffer(p+2+m) = rhs(i,j,k,m,c)
310 do j = start(2,c), jsize-end(2,c)-1
311 do i = start(1,c), isize-end(1,c)-1
313 out_buffer(p+1) = lhs(i,j,k,n+4,c)
314 out_buffer(p+2) = lhs(i,j,k,n+5,c)
315 out_buffer(p+3) = rhs(i,j,k,m,c)
323 call mpi_isend(out_buffer, 22*buffer_size,
324 > dp_type, successor(3),
325 > DEFAULT_TAG, comm_solve,
326 > requests(2), error)
331 c---------------------------------------------------------------------
332 c now go in the reverse direction
333 c---------------------------------------------------------------------
335 c---------------------------------------------------------------------
337 c---------------------------------------------------------------------
338 do stage = ncells, 1, -1
342 kend = cell_size(3,c)-1
344 isize = cell_size(1,c)
345 jsize = cell_size(2,c)
346 ip = cell_coord(1,c)-1
347 jp = cell_coord(2,c)-1
349 buffer_size = (isize-start(1,c)-end(1,c)) *
350 > (jsize-start(2,c)-end(2,c))
352 if (stage .ne. ncells) then
354 c---------------------------------------------------------------------
355 c if this is not the starting cell in this row of cells,
356 c wait for a message to be received, containing the
357 c solution of the previous two stations
358 c---------------------------------------------------------------------
360 call mpi_irecv(in_buffer, 10*buffer_size,
361 > dp_type, successor(3),
362 > DEFAULT_TAG, comm_solve,
363 > requests(1), error)
366 c---------------------------------------------------------------------
367 c communication has already been started
368 c while waiting, do the block-diagonal inversion for the
369 c cell that was just finished
370 c---------------------------------------------------------------------
372 call tzetar(slice(3,stage+1))
374 c---------------------------------------------------------------------
375 c wait for pending communication to complete
376 c---------------------------------------------------------------------
377 call mpi_waitall(2, requests, statuses, error)
379 c---------------------------------------------------------------------
380 c unpack the buffer for the first three factors
381 c---------------------------------------------------------------------
387 do j = start(2,c), jsize-end(2,c)-1
388 do i = start(1,c), isize-end(1,c)-1
391 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
392 > lhs(i,j,k,n+4,c)*sm1 -
393 > lhs(i,j,k,n+5,c)*sm2
394 rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
395 > lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) -
396 > lhs(i,j,k1,n+5,c) * sm1
402 c---------------------------------------------------------------------
403 c now unpack the buffer for the remaining two factors
404 c---------------------------------------------------------------------
407 do j = start(2,c), jsize-end(2,c)-1
408 do i = start(1,c), isize-end(1,c)-1
411 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
412 > lhs(i,j,k,n+4,c)*sm1 -
413 > lhs(i,j,k,n+5,c)*sm2
414 rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
415 > lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) -
416 > lhs(i,j,k1,n+5,c) * sm1
424 c---------------------------------------------------------------------
425 c now we know this is the first grid block on the back sweep,
426 c so we don't need a message to start the substitution.
427 c---------------------------------------------------------------------
433 do j = start(2,c), jsize-end(2,c)-1
434 do i = start(1,c), isize-end(1,c)-1
435 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
436 > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c)
443 do j = start(2,c), jsize-end(2,c)-1
444 do i = start(1,c), isize-end(1,c)-1
445 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
446 > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c)
452 c---------------------------------------------------------------------
453 c Whether or not this is the last processor, we always have
454 c to complete the back-substitution
455 c---------------------------------------------------------------------
457 c---------------------------------------------------------------------
458 c The first three factors
459 c---------------------------------------------------------------------
462 do k = kend-2, kstart, -1
463 do j = start(2,c), jsize-end(2,c)-1
464 do i = start(1,c), isize-end(1,c)-1
467 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
468 > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) -
469 > lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c)
475 c---------------------------------------------------------------------
476 c And the remaining two
477 c---------------------------------------------------------------------
480 do k = kend-2, kstart, -1
481 do j = start(2,c), jsize-end(2,c)-1
482 do i = start(1,c), isize-end(1,c)-1
485 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
486 > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) -
487 > lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c)
493 c---------------------------------------------------------------------
494 c send on information to the previous processor, if needed
495 c---------------------------------------------------------------------
496 if (stage .ne. 1) then
501 do j = start(2,c), jsize-end(2,c)-1
502 do i = start(1,c), isize-end(1,c)-1
503 out_buffer(p+1) = rhs(i,j,k,m,c)
504 out_buffer(p+2) = rhs(i,j,k1,m,c)
510 call mpi_isend(out_buffer, 10*buffer_size,
511 > dp_type, predecessor(3),
512 > DEFAULT_TAG, comm_solve,
513 > requests(2), error)
517 c---------------------------------------------------------------------
518 c If this was the last stage, do the block-diagonal inversion
519 c---------------------------------------------------------------------
520 if (stage .eq. 1) call tzetar(c)