Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2024.05.001.rc1
solve_tridi_col_template.F90
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
!   Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!   Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
!   Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!   Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!   and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

#include "../general/sanity.F90"
#include "../general/error_checking.inc"

subroutine solve_tridi_col_&
&PRECISION_AND_SUFFIX &
  ( obj, na, nev, nqoff, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, useGPU, wantDebug, success, max_threads )

  ! Solves the symmetric, tridiagonal eigenvalue problem on one processor column
  ! with the divide and conquer method.
  ! Works best if the number of processor rows is a power of 2!
  use precision
  use elpa_mpi
  use elpa_utilities
  implicit none
  class(elpa_abstract_impl_t), intent(inout) :: obj

  integer(kind=ik)                       :: na, nev, nqoff, ldq, nblk, matrixCols, mpi_comm_rows
  real(kind=REAL_DATATYPE)               :: d(na), e(na)
#ifdef USE_ASSUMED_SIZE
  real(kind=REAL_DATATYPE)               :: q(ldq,*)
#else
  real(kind=REAL_DATATYPE)               :: q(ldq,matrixCols)
#endif

  integer(kind=ik), parameter            :: min_submatrix_size = 16 ! Minimum size of the submatrices to be used

  real(kind=REAL_DATATYPE), allocatable  :: qmat1(:,:), qmat2(:,:)
  integer(kind=ik)                       :: i, n, np
  integer(kind=ik)                       :: ndiv, noff, nmid, nlen, max_size
  integer(kind=ik)                       :: my_prow, np_rows
  integer(kind=MPI_KIND)                 :: mpierr, my_prowMPI, np_rowsMPI

  integer(kind=ik), allocatable          :: limits(:), l_col(:), p_col_i(:), p_col_o(:)
  logical, intent(in)                    :: useGPU, wantDebug
  logical, intent(out)                   :: success
  integer(kind=ik)                       :: istat
  character(200)                         :: errorMessage

  integer(kind=ik), intent(in)           :: max_threads

  integer(kind=MPI_KIND)                 :: bcast_request1, bcast_request2
  logical                                :: useNonBlockingCollectivesRows
  integer(kind=c_int)                    :: non_blocking_collectives, error
  success = .true.

  call obj%timer%start("solve_tridi_col" // PRECISION_SUFFIX)

  call obj%get("nbc_row_solve_tridi", non_blocking_collectives, error)
  if (error .ne. ELPA_OK) then
    write(error_unit,*) "Problem getting option for non blocking collectives for rows in solve_tridi. Aborting..."
    success = .false.
    return
  endif

  if (non_blocking_collectives .eq. 1) then
    useNonBlockingCollectivesRows = .true.
  else
    useNonBlockingCollectivesRows = .false.
  endif

  call obj%timer%start("mpi_communication")
  call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND), my_prowMPI, mpierr)
  call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr)

  my_prow = int(my_prowMPI,kind=c_int)
  np_rows = int(np_rowsMPI,kind=c_int)
  call obj%timer%stop("mpi_communication")
  success = .true.
  ! Calculate the number of subdivisions needed.

  n = na
  ndiv = 1
  do while(2*ndiv<=np_rows .and. n>2*min_submatrix_size)
    n = ((n+3)/4)*2 ! the bigger one of the two halves, we want EVEN boundaries
    ndiv = ndiv*2
  enddo
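  ! Example: for na=1000 and np_rows=4 the loop gives ndiv=4 subproblems
  ! (n: 1000 -> 500 -> 250), i.e. at most one subproblem per processor row,
  ! as long as the subproblems stay larger than 2*min_submatrix_size.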

  ! If there is only 1 processor row and not all eigenvectors are needed
  ! and the matrix size is big enough, then use 2 subdivisions
  ! so that merge_systems is called once and only the needed
  ! eigenvectors are calculated for the final problem.

  if (np_rows==1 .and. nev<na .and. na>2*min_submatrix_size) ndiv = 2

  allocate(limits(0:ndiv), stat=istat, errmsg=errorMessage)
  check_allocate("solve_tridi_col: limits", istat, errorMessage)

  limits(0) = 0
  limits(ndiv) = na

  n = ndiv
  do while(n>1)
    n = n/2 ! n is always a power of 2
    do i=0,ndiv-1,2*n
      ! We want to have even boundaries (for cache line alignments)
      limits(i+n) = limits(i) + ((limits(i+2*n)-limits(i)+3)/4)*2
    enddo
  enddo
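  ! Example: na=1000 and ndiv=4 yield limits = (0, 250, 500, 750, 1000),
  ! i.e. subproblem i covers global rows limits(i-1)+1 .. limits(i).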

  ! Calculate the maximum size of a subproblem

  max_size = 0
  do i=1,ndiv
    max_size = max(max_size,limits(i)-limits(i-1))
  enddo

  ! Subdivide matrix by subtracting rank 1 modifications

  do i=1,ndiv-1
    n = limits(i)
    d(n) = d(n)-abs(e(n))
    d(n+1) = d(n+1)-abs(e(n))
  enddo
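  ! At each interior boundary n the tridiagonal matrix satisfies
  !   T = [ T1  0 ; 0  T2 ] + |e(n)| * v * v**T,   v(n) = 1, v(n+1) = sign(e(n)),
  ! so subtracting |e(n)| from d(n) and d(n+1) decouples the two blocks T1, T2;
  ! merge_systems adds the rank 1 correction back when the halves are merged.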

  if (np_rows==1) then

    ! For 1 processor row there may be 1 or 2 subdivisions
    do n=0,ndiv-1
      noff = limits(n)        ! Start of subproblem
      nlen = limits(n+1)-noff ! Size of subproblem

      call solve_tridi_single_problem_&
      &PRECISION_AND_SUFFIX &
                              (obj, nlen,d(noff+1),e(noff+1), &
                               q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success)

      if (.not.(success)) return
    enddo

  else

    ! Solve sub problems in parallel with solve_tridi_single
    ! There is at most 1 subproblem per processor

    allocate(qmat1(max_size,max_size), stat=istat, errmsg=errorMessage)
    check_allocate("solve_tridi_col: qmat1", istat, errorMessage)

    allocate(qmat2(max_size,max_size), stat=istat, errmsg=errorMessage)
    check_allocate("solve_tridi_col: qmat2", istat, errorMessage)

    qmat1 = 0 ! Make sure that all elements are defined

    if (my_prow < ndiv) then

      noff = limits(my_prow)        ! Start of subproblem
      nlen = limits(my_prow+1)-noff ! Size of subproblem
      call solve_tridi_single_problem_&
      &PRECISION_AND_SUFFIX &
                              (obj, nlen,d(noff+1),e(noff+1),qmat1, &
                               ubound(qmat1,dim=1), wantDebug, success)

      if (.not.(success)) return
    endif

    ! Fill eigenvectors in qmat1 into global matrix q
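    ! Each process row np broadcasts the eigenvalues and eigenvectors of its
    ! subproblem along the process column; every rank then copies the rows it
    ! owns in the block-cyclic layout (block size nblk over np_rows) into q
    ! via distribute_global_column.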

    do np = 0, ndiv-1

      noff = limits(np)
      nlen = limits(np+1)-noff
#ifdef WITH_MPI
      if (useNonBlockingCollectivesRows) then
        call obj%timer%start("mpi_nbc_communication")
        call mpi_ibcast(d(noff+1), int(nlen,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), &
                        int(mpi_comm_rows,kind=MPI_KIND), bcast_request1, mpierr)
        call mpi_wait(bcast_request1, MPI_STATUS_IGNORE, mpierr)

        qmat2 = qmat1
        call mpi_ibcast(qmat2, int(max_size*max_size,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), &
                        int(mpi_comm_rows,kind=MPI_KIND), bcast_request2, mpierr)
        call mpi_wait(bcast_request2, MPI_STATUS_IGNORE, mpierr)
        call obj%timer%stop("mpi_nbc_communication")
      else
        call obj%timer%start("mpi_communication")
        call mpi_bcast(d(noff+1), int(nlen,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), &
                       int(mpi_comm_rows,kind=MPI_KIND), mpierr)

        qmat2 = qmat1
        call mpi_bcast(qmat2, int(max_size*max_size,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), &
                       int(mpi_comm_rows,kind=MPI_KIND), mpierr)
        call obj%timer%stop("mpi_communication")
      endif
#else /* WITH_MPI */
!     qmat2 = qmat1 ! is this correct
#endif /* WITH_MPI */
      do i=1,nlen

#ifdef WITH_MPI
        call distribute_global_column_&
        &PRECISION &
                 (obj, qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#else /* WITH_MPI */
        call distribute_global_column_&
        &PRECISION &
                 (obj, qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#endif /* WITH_MPI */
      enddo

    enddo

    deallocate(qmat1, qmat2, stat=istat, errmsg=errorMessage)
    check_deallocate("solve_tridi_col: qmat1, qmat2", istat, errorMessage)

  endif

  ! Allocate and set index arrays l_col and p_col

  allocate(l_col(na), p_col_i(na), p_col_o(na), stat=istat, errmsg=errorMessage)
  check_allocate("solve_tridi_col: l_col, p_col_i, p_col_o", istat, errorMessage)

  do i=1,na
    l_col(i) = i
    p_col_i(i) = 0
    p_col_o(i) = 0
  enddo

  ! Merge subproblems
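  ! The merge tree is traversed bottom up: each sweep combines neighbouring
  ! blocks of n subproblems, and n doubles per sweep. E.g. with ndiv=4 and
  ! limits = (0, 250, 500, 750, 1000), the first sweep merges (1:250)+(251:500)
  ! and (501:750)+(751:1000), the second sweep merges the two halves into the
  ! full problem; only in this last merge are eigenvectors beyond nev dropped.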

  n = 1
  do while(n<ndiv) ! if ndiv==1, the problem was solved by a single call to solve_tridi_single

    do i=0,ndiv-1,2*n

      noff = limits(i)
      nmid = limits(i+n) - noff
      nlen = limits(i+2*n) - noff

      if (nlen == na) then
        ! Last merge, set p_col_o=-1 for unneeded (output) eigenvectors
        p_col_o(nev+1:na) = -1
      endif
      call merge_systems_&
      &PRECISION &
                         (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, &
                          matrixCols, int(mpi_comm_rows,kind=ik), int(mpi_comm_self,kind=ik), &
                          l_col(noff+1), p_col_i(noff+1), &
                          l_col(noff+1), p_col_o(noff+1), 0, 1, useGPU, wantDebug, success, max_threads)
      if (.not.(success)) return

    enddo

    n = 2*n

  enddo

  deallocate(limits, l_col, p_col_i, p_col_o, stat=istat, errmsg=errorMessage)
  check_deallocate("solve_tridi_col: limits, l_col, p_col_i, p_col_o", istat, errorMessage)

  call obj%timer%stop("solve_tridi_col" // PRECISION_SUFFIX)

end subroutine solve_tridi_col_&
&PRECISION_AND_SUFFIX

subroutine solve_tridi_single_problem_&
&PRECISION_AND_SUFFIX &
  (obj, nlen, d, e, q, ldq, wantDebug, success)

  ! Solves the symmetric, tridiagonal eigenvalue problem on a single processor.
  ! Takes precautions if DSTEDC fails or if the eigenvalues are not ordered correctly.
  use precision
  use elpa_blas_interfaces
  use elpa_utilities
  implicit none
  class(elpa_abstract_impl_t), intent(inout) :: obj
  integer(kind=ik)                       :: nlen, ldq
  real(kind=REAL_DATATYPE)               :: d(nlen), e(nlen), q(ldq,nlen)

  real(kind=REAL_DATATYPE), allocatable  :: work(:), qtmp(:), ds(:), es(:)
  real(kind=REAL_DATATYPE)               :: dtmp

  integer(kind=ik)                       :: i, j, lwork, liwork, info
  integer(kind=BLAS_KIND)                :: infoBLAS
  integer(kind=ik), allocatable          :: iwork(:)

  logical, intent(in)                    :: wantDebug
  logical, intent(out)                   :: success
  integer(kind=ik)                       :: istat
  character(200)                         :: errorMessage

  call obj%timer%start("solve_tridi_single" // PRECISION_SUFFIX)

  success = .true.
  allocate(ds(nlen), es(nlen), stat=istat, errmsg=errorMessage)
  check_allocate("solve_tridi_single: ds, es", istat, errorMessage)

  ! Save d and e in case dstedc fails

  ds(:) = d(:)
  es(:) = e(:)

  ! First try dstedc; this is normally faster, but it may fail sometimes (why???)

  lwork = 1 + 4*nlen + nlen**2
  liwork = 3 + 5*nlen
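  ! These are the minimum workspace sizes LAPACK ?STEDC requires for COMPZ='I'
  ! (lwork >= 1 + 4*N + N**2, liwork >= 3 + 5*N); they are also ample for the
  ! ?STEQR fallback below, which only needs max(1, 2*N-2) real workspace.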
  allocate(work(lwork), iwork(liwork), stat=istat, errmsg=errorMessage)
  check_allocate("solve_tridi_single: work, iwork", istat, errorMessage)
  call obj%timer%start("lapack")
  call PRECISION_STEDC('I', int(nlen,kind=BLAS_KIND), d, e, q, int(ldq,kind=BLAS_KIND), &
                       work, int(lwork,kind=BLAS_KIND), int(iwork,kind=BLAS_KIND), int(liwork,kind=BLAS_KIND), &
                       infoBLAS)
  info = int(infoBLAS,kind=ik)
  call obj%timer%stop("lapack")

  if (info /= 0) then

    ! DSTEDC failed, try DSTEQR. The workspace is enough for DSTEQR.

    write(error_unit,'(a,i8,a)') 'Warning: Lapack routine DSTEDC failed, info= ',info,', Trying DSTEQR!'

    d(:) = ds(:)
    e(:) = es(:)
    call obj%timer%start("lapack")
    call PRECISION_STEQR('I', int(nlen,kind=BLAS_KIND), d, e, q, int(ldq,kind=BLAS_KIND), work, infoBLAS)
    info = int(infoBLAS,kind=ik)
    call obj%timer%stop("lapack")

    ! If DSTEQR also fails, we don't know what to do further ...

    if (info /= 0) then
      if (wantDebug) then
        write(error_unit,'(a,i8,a)') 'ELPA1_solve_tridi_single: ERROR: Lapack routine DSTEQR failed, info= ',info,', Aborting!'
      endif
      success = .false.
      return
    endif
  end if

  deallocate(work, iwork, ds, es, stat=istat, errmsg=errorMessage)
  check_deallocate("solve_tridi_single: work, iwork, ds, es", istat, errorMessage)

  ! Check whether the eigenvalues are monotonically increasing.
  ! This does not always seem to be the case (in the IBM implementation of dstedc ???)

  do i=1,nlen-1
    if (d(i+1)<d(i)) then
#ifdef DOUBLE_PRECISION_REAL
      if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk8) then
#else
      if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk4) then
#endif
        write(error_unit,'(a,i8,2g25.16)') '***WARNING: Monotony error dste**:',i+1,d(i),d(i+1)
      else
        write(error_unit,'(a,i8,2g25.16)') 'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1)
        write(error_unit,'(a)') 'The eigenvalues from a lapack call are not sorted to machine precision.'
        write(error_unit,'(a)') 'To this extent, this is completely harmless.'
        write(error_unit,'(a)') 'Still, we keep this info message just in case.'
      end if
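      ! Restore the ordering: shift larger eigenvalues (and their vectors) up
      ! by one position and re-insert d(i+1), q(:,i+1) at the correct place
      ! (a single insertion-sort step).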
      allocate(qtmp(nlen), stat=istat, errmsg=errorMessage)
      check_allocate("solve_tridi_single: qtmp", istat, errorMessage)

      dtmp = d(i+1)
      qtmp(1:nlen) = q(1:nlen,i+1)
      do j=i,1,-1
        if (dtmp<d(j)) then
          d(j+1)        = d(j)
          q(1:nlen,j+1) = q(1:nlen,j)
        else
          exit ! Loop
        endif
      enddo
      d(j+1)        = dtmp
      q(1:nlen,j+1) = qtmp(1:nlen)
      deallocate(qtmp, stat=istat, errmsg=errorMessage)
      check_deallocate("solve_tridi_single: qtmp", istat, errorMessage)

    endif
  enddo
  call obj%timer%stop("solve_tridi_single" // PRECISION_SUFFIX)

end subroutine solve_tridi_single_problem_&
&PRECISION_AND_SUFFIX
