Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2024.03.001
solve_tridi_template.F90
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
!   Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!   Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
!   Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!   Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!   and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation, which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

#include "../general/sanity.F90"
#include "../general/error_checking.inc"

subroutine solve_tridi_&
&precision_and_suffix &
  ( obj, na, nev, d, e, q, ldq, nblk, matrixcols, mpi_comm_all, mpi_comm_rows, &
    mpi_comm_cols, usegpu, wantdebug, success, max_threads )

  use precision
  use elpa_abstract_impl
  use merge_recursive
  use elpa_mpi
  use elpa_utilities
  implicit none
#include "../../src/general/precision_kinds.F90"
  class(elpa_abstract_impl_t), intent(inout) :: obj
  integer(kind=ik), intent(in) :: na, nev, ldq, nblk, matrixCols, &
                                  mpi_comm_all, mpi_comm_rows, mpi_comm_cols
  real(kind=real_datatype), intent(inout) :: d(na), e(na)
#ifdef USE_ASSUMED_SIZE
  real(kind=real_datatype), intent(inout) :: q(ldq,*)
#else
  real(kind=real_datatype), intent(inout) :: q(ldq,matrixcols)
#endif
  logical, intent(in) :: useGPU, wantDebug
  logical, intent(out) :: success

  integer(kind=ik) :: i, j, n, np, nc, nev1, l_cols, l_rows
  integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols
  integer(kind=MPI_KIND) :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
  integer(kind=ik), allocatable :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:)

  integer(kind=ik) :: istat
  character(200) :: errorMessage
  character(20) :: gpuString
  integer(kind=ik), intent(in) :: max_threads

  if (usegpu) then
    gpustring = "_gpu"
  else
    gpustring = ""
  endif

  call obj%timer%start("solve_tridi" // precision_suffix // gpustring)

  call obj%timer%start("mpi_communication")
  call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind), my_prowmpi, mpierr)
  call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind), np_rowsmpi, mpierr)
  call mpi_comm_rank(int(mpi_comm_cols,kind=mpi_kind), my_pcolmpi, mpierr)
  call mpi_comm_size(int(mpi_comm_cols,kind=mpi_kind), np_colsmpi, mpierr)

  my_prow = int(my_prowmpi,kind=c_int)
  np_rows = int(np_rowsmpi,kind=c_int)
  my_pcol = int(my_pcolmpi,kind=c_int)
  np_cols = int(np_colsmpi,kind=c_int)

  call obj%timer%stop("mpi_communication")

  success = .true.

  l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
  l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q

  ! Set Q to 0
  q(1:l_rows, 1:l_cols) = 0.0_rk

  ! Get the limits of the subdivisions; each subdivision has as many cols
  ! as fit on the respective processor column

  allocate(limits(0:np_cols), stat=istat, errmsg=errormessage)
  check_allocate("solve_tridi: limits", istat, errormessage)

  limits(0) = 0
  do np=0,np_cols-1
    nc = local_index(na, np, np_cols, nblk, -1) ! number of columns on proc column np

    ! Check for the case that a column has zero width.
    ! This is not supported!
    ! ScaLAPACK supports it but delivers no results for these columns,
    ! which is rather annoying.
    if (nc==0) then
      call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
      if (wantdebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width'
      success = .false.
      return
    endif
    limits(np+1) = limits(np) + nc
  enddo
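
  ! Note: limits(np) counts the global columns owned by process columns 0..np-1,
  ! so global columns limits(np)+1 .. limits(np+1) belong to process column np,
  ! and limits(np_cols) == na.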

  ! Subdivide matrix by subtracting rank 1 modifications

  do i=1,np_cols-1
    n = limits(i)
    d(n) = d(n)-abs(e(n))
    d(n+1) = d(n+1)-abs(e(n))
  enddo
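
  ! After this subtraction the tridiagonal matrix decouples into independent
  ! diagonal blocks; the rank-1 coupling removed here is reintroduced when the
  ! sub-solutions are merged in merge_systems.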

  ! Solve sub problems on processor columns

  nc = limits(my_pcol) ! column after which my problem starts

  if (np_cols>1) then
    nev1 = l_cols ! all eigenvectors are needed
  else
    nev1 = min(nev,l_cols)
  endif
  call solve_tridi_col_&
  &precision_and_suffix &
    (obj, l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, &
     matrixcols, mpi_comm_rows, usegpu, wantdebug, success, max_threads)
  if (.not.(success)) then
    call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
    return
  endif

  ! If there is only 1 processor column, we are done

  if (np_cols==1) then
    deallocate(limits, stat=istat, errmsg=errormessage)
    check_deallocate("solve_tridi: limits", istat, errormessage)

    call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
    return
  endif

  ! Set index arrays for Q columns

  ! Dense distribution scheme:

  allocate(l_col(na), stat=istat, errmsg=errormessage)
  check_allocate("solve_tridi: l_col", istat, errormessage)

  allocate(p_col(na), stat=istat, errmsg=errormessage)
  check_allocate("solve_tridi: p_col", istat, errormessage)

  n = 0
  do np=0,np_cols-1
    nc = local_index(na, np, np_cols, nblk, -1)
    do i=1,nc
      n = n+1
      l_col(n) = i
      p_col(n) = np
    enddo
  enddo

  ! Block cyclic distribution scheme, only nev columns are set:

  allocate(l_col_bc(na), stat=istat, errmsg=errormessage)
  check_allocate("solve_tridi: l_col_bc", istat, errormessage)

  allocate(p_col_bc(na), stat=istat, errmsg=errormessage)
  check_allocate("solve_tridi: p_col_bc", istat, errormessage)

  p_col_bc(:) = -1
  l_col_bc(:) = -1

  do i = 0, na-1, nblk*np_cols
    do j = 0, np_cols-1
      do n = 1, nblk
        if (i+j*nblk+n <= min(nev,na)) then
          p_col_bc(i+j*nblk+n) = j
          l_col_bc(i+j*nblk+n) = i/np_cols + n
        endif
      enddo
    enddo
  enddo
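
  ! For the 1D block-cyclic layout with block size nblk, global column
  ! g = i + j*nblk + n (i a multiple of nblk*np_cols, 1 <= n <= nblk) is owned
  ! by process column j and stored there as local column i/np_cols + n;
  ! columns beyond nev keep p_col_bc = l_col_bc = -1 and are not computed.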

  ! Recursively merge sub problems
  call merge_recursive_&
  &precision &
    (obj, 0, np_cols, ldq, matrixcols, nblk, &
     l_col, p_col, l_col_bc, p_col_bc, limits, &
     np_cols, na, q, d, e, &
     mpi_comm_all, mpi_comm_rows, mpi_comm_cols, &
     usegpu, wantdebug, success, max_threads)

  if (.not.(success)) then
    call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
    return
  endif

  deallocate(limits,l_col,p_col,l_col_bc,p_col_bc, stat=istat, errmsg=errormessage)
  check_deallocate("solve_tridi: limits, l_col, p_col, l_col_bc, p_col_bc", istat, errormessage)

  call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
  return

#if 0
  contains
  recursive subroutine merge_recursive_&
  &precision_and_suffix &
    (obj, np_off, nprocs, usegpu, wantdebug, success)
    use precision
    use merge_systems
    implicit none

    ! noff is always a multiple of nblk_ev
    ! nlen-noff is always > nblk_ev

    class(elpa_abstract_impl_t), intent(inout) :: obj
    integer(kind=ik) :: np_off, nprocs
    integer(kind=ik) :: np1, np2, noff, nlen, nmid, n
    logical, intent(in) :: useGPU, wantDebug
    logical, intent(out) :: success

    success = .true.

    if (nprocs<=1) then
      ! Safety check only
      if (wantdebug) write(error_unit,*) "ELPA1_merge_recursive: INTERNAL error merge_recursive: nprocs=",nprocs
      success = .false.
      return
    endif
    ! Split problem into 2 subproblems of size np1 / np2

    np1 = nprocs/2
    np2 = nprocs-np1

    if (np1 > 1) call merge_recursive_&
    &precision_and_suffix &
      (obj, np_off, np1, usegpu, wantdebug, success)
    if (.not.(success)) return
    if (np2 > 1) call merge_recursive_&
    &precision_and_suffix &
      (obj, np_off+np1, np2, usegpu, wantdebug, success)
    if (.not.(success)) return

    noff = limits(np_off)
    nmid = limits(np_off+np1) - noff
    nlen = limits(np_off+nprocs) - noff

#ifdef WITH_MPI
    call obj%timer%start("mpi_communication")
    if (my_pcol==np_off) then
      do n=np_off+np1,np_off+nprocs-1
        call mpi_send(d(noff+1), int(nmid,kind=mpi_kind), mpi_real_precision, int(n,kind=mpi_kind), 1_mpi_kind, &
                      int(mpi_comm_cols,kind=mpi_kind), mpierr)
      enddo
    endif
    call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */

    if (my_pcol>=np_off+np1 .and. my_pcol<np_off+nprocs) then
#ifdef WITH_MPI
      call obj%timer%start("mpi_communication")
      call mpi_recv(d(noff+1), int(nmid,kind=mpi_kind), mpi_real_precision, int(np_off,kind=mpi_kind), 1_mpi_kind, &
                    int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
      call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
      ! d(noff+1:noff+1+nmid-1) = d(noff+1:noff+1+nmid-1)
#endif /* WITH_MPI */
    endif

    if (my_pcol==np_off+np1) then
      do n=np_off,np_off+np1-1
#ifdef WITH_MPI
        call obj%timer%start("mpi_communication")
        call mpi_send(d(noff+nmid+1), int(nlen-nmid,kind=mpi_kind), mpi_real_precision, int(n,kind=mpi_kind), &
                      1_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpierr)
        call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */

      enddo
    endif
    if (my_pcol>=np_off .and. my_pcol<np_off+np1) then
#ifdef WITH_MPI
      call obj%timer%start("mpi_communication")
      call mpi_recv(d(noff+nmid+1), int(nlen-nmid,kind=mpi_kind), mpi_real_precision, int(np_off+np1,kind=mpi_kind), &
                    1_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
      call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
      ! d(noff+nmid+1:noff+nmid+1+nlen-nmid-1) = d(noff+nmid+1:noff+nmid+1+nlen-nmid-1)
#endif /* WITH_MPI */
    endif
    if (nprocs == np_cols) then

      ! Last merge, result distribution must be block cyclic, noff==0,
      ! p_col_bc is set so that only nev eigenvalues are calculated
      call merge_systems_&
      &precision &
        (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
         nblk, matrixcols, int(mpi_comm_rows,kind=ik), int(mpi_comm_cols,kind=ik), &
         l_col, p_col, &
         l_col_bc, p_col_bc, np_off, nprocs, usegpu, wantdebug, success, max_threads )
      if (.not.(success)) return
    else
      ! Not last merge, leave dense column distribution
      call merge_systems_&
      &precision &
        (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
         nblk, matrixcols, int(mpi_comm_rows,kind=ik), int(mpi_comm_cols,kind=ik), &
         l_col(noff+1), p_col(noff+1), &
         l_col(noff+1), p_col(noff+1), np_off, nprocs, usegpu, wantdebug, success, max_threads )
      if (.not.(success)) return
    endif
  end subroutine merge_recursive_&
  &precision_and_suffix
#endif
end subroutine solve_tridi_&
&precision_and_suffix

subroutine solve_tridi_col_&
&precision_and_suffix &
  ( obj, na, nev, nqoff, d, e, q, ldq, nblk, matrixcols, mpi_comm_rows, usegpu, wantdebug, success, max_threads )

  ! Solves the symmetric, tridiagonal eigenvalue problem on one processor column
  ! with the divide and conquer method.
  ! Works best if the number of processor rows is a power of 2!
  use precision
  use elpa_abstract_impl
  use elpa_mpi
  use merge_systems
  use elpa_utilities
  use distribute_global_column
  implicit none
  class(elpa_abstract_impl_t), intent(inout) :: obj

  integer(kind=ik) :: na, nev, nqoff, ldq, nblk, matrixCols, mpi_comm_rows
  real(kind=real_datatype) :: d(na), e(na)
#ifdef USE_ASSUMED_SIZE
  real(kind=real_datatype) :: q(ldq,*)
#else
  real(kind=real_datatype) :: q(ldq,matrixcols)
#endif

  integer(kind=ik), parameter :: min_submatrix_size = 16 ! Minimum size of the submatrices to be used

  real(kind=real_datatype), allocatable :: qmat1(:,:), qmat2(:,:)
  integer(kind=ik) :: i, n, np
  integer(kind=ik) :: ndiv, noff, nmid, nlen, max_size
  integer(kind=ik) :: my_prow, np_rows
  integer(kind=MPI_KIND) :: mpierr, my_prowMPI, np_rowsMPI

  integer(kind=ik), allocatable :: limits(:), l_col(:), p_col_i(:), p_col_o(:)
  logical, intent(in) :: useGPU, wantDebug
  logical, intent(out) :: success
  integer(kind=ik) :: istat
  character(200) :: errorMessage

  integer(kind=ik), intent(in) :: max_threads

  integer(kind=MPI_KIND) :: bcast_request1, bcast_request2
  logical :: useNonBlockingCollectivesRows
  integer(kind=c_int) :: non_blocking_collectives, error

  success = .true.

  call obj%timer%start("solve_tridi_col" // precision_suffix)

  call obj%get("nbc_row_solve_tridi", non_blocking_collectives, error)
  if (error .ne. elpa_ok) then
    write(error_unit,*) "Problem getting option for non blocking collectives for rows in solve_tridi. Aborting..."
    success = .false.
    return
  endif

  if (non_blocking_collectives .eq. 1) then
    usenonblockingcollectivesrows = .true.
  else
    usenonblockingcollectivesrows = .false.
  endif

  call obj%timer%start("mpi_communication")
  call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind), my_prowmpi, mpierr)
  call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind), np_rowsmpi, mpierr)

  my_prow = int(my_prowmpi,kind=c_int)
  np_rows = int(np_rowsmpi,kind=c_int)
  call obj%timer%stop("mpi_communication")
  success = .true.

  ! Calculate the number of subdivisions needed.

  n = na
  ndiv = 1
  do while(2*ndiv<=np_rows .and. n>2*min_submatrix_size)
    n = ((n+3)/4)*2 ! the bigger one of the two halves, we want EVEN boundaries
    ndiv = ndiv*2
  enddo
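
  ! ndiv is a power of two and at most np_rows, chosen so that the largest
  ! subproblem still has more than roughly 2*min_submatrix_size rows.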

  ! If there is only 1 processor row and not all eigenvectors are needed
  ! and the matrix size is big enough, then use 2 subdivisions
  ! so that merge_systems is called once and only the needed
  ! eigenvectors are calculated for the final problem.

  if (np_rows==1 .and. nev<na .and. na>2*min_submatrix_size) ndiv = 2

  allocate(limits(0:ndiv), stat=istat, errmsg=errormessage)
  check_allocate("solve_tridi_col: limits", istat, errormessage)

  limits(0) = 0
  limits(ndiv) = na

  n = ndiv
  do while(n>1)
    n = n/2 ! n is always a power of 2
    do i=0,ndiv-1,2*n
      ! We want to have even boundaries (for cache line alignments)
      limits(i+n) = limits(i) + ((limits(i+2*n)-limits(i)+3)/4)*2
    enddo
  enddo
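
  ! limits(0:ndiv) now partitions 1..na into ndiv chunks with even interior
  ! boundaries; subproblem i covers global rows limits(i-1)+1 .. limits(i).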

  ! Calculate the maximum size of a subproblem

  max_size = 0
  do i=1,ndiv
    max_size = max(max_size,limits(i)-limits(i-1))
  enddo

  ! Subdivide matrix by subtracting rank 1 modifications

  do i=1,ndiv-1
    n = limits(i)
    d(n) = d(n)-abs(e(n))
    d(n+1) = d(n+1)-abs(e(n))
  enddo

  if (np_rows==1) then

    ! For 1 processor row there may be 1 or 2 subdivisions
    do n=0,ndiv-1
      noff = limits(n)        ! Start of subproblem
      nlen = limits(n+1)-noff ! Size of subproblem

      call solve_tridi_single_problem_&
      &precision_and_suffix &
        (obj, nlen, d(noff+1), e(noff+1), &
         q(nqoff+noff+1,noff+1), ubound(q,dim=1), wantdebug, success)

      if (.not.(success)) return
    enddo

  else

    ! Solve sub problems in parallel with solve_tridi_single
    ! There is at maximum 1 subproblem per processor

    allocate(qmat1(max_size,max_size), stat=istat, errmsg=errormessage)
    check_allocate("solve_tridi_col: qmat1", istat, errormessage)

    allocate(qmat2(max_size,max_size), stat=istat, errmsg=errormessage)
    check_allocate("solve_tridi_col: qmat2", istat, errormessage)

    qmat1 = 0 ! Make sure that all elements are defined

    if (my_prow < ndiv) then

      noff = limits(my_prow)        ! Start of subproblem
      nlen = limits(my_prow+1)-noff ! Size of subproblem
      call solve_tridi_single_problem_&
      &precision_and_suffix &
        (obj, nlen, d(noff+1), e(noff+1), qmat1, &
         ubound(qmat1,dim=1), wantdebug, success)

      if (.not.(success)) return
    endif

    ! Fill eigenvectors in qmat1 into global matrix q

    do np = 0, ndiv-1

      noff = limits(np)
      nlen = limits(np+1)-noff
#ifdef WITH_MPI
      if (usenonblockingcollectivesrows) then
        call obj%timer%start("mpi_nbc_communication")
        call mpi_ibcast(d(noff+1), int(nlen,kind=mpi_kind), mpi_real_precision, int(np,kind=mpi_kind), &
                        int(mpi_comm_rows,kind=mpi_kind), bcast_request1, mpierr)
        call mpi_wait(bcast_request1, mpi_status_ignore, mpierr)

        qmat2 = qmat1
        call mpi_ibcast(qmat2, int(max_size*max_size,kind=mpi_kind), mpi_real_precision, int(np,kind=mpi_kind), &
                        int(mpi_comm_rows,kind=mpi_kind), bcast_request2, mpierr)
        call mpi_wait(bcast_request2, mpi_status_ignore, mpierr)
        call obj%timer%stop("mpi_nbc_communication")
      else
        call obj%timer%start("mpi_communication")
        call mpi_bcast(d(noff+1), int(nlen,kind=mpi_kind), mpi_real_precision, int(np,kind=mpi_kind), &
                       int(mpi_comm_rows,kind=mpi_kind), mpierr)

        qmat2 = qmat1
        call mpi_bcast(qmat2, int(max_size*max_size,kind=mpi_kind), mpi_real_precision, int(np,kind=mpi_kind), &
                       int(mpi_comm_rows,kind=mpi_kind), mpierr)
        call obj%timer%stop("mpi_communication")
      endif
#else /* WITH_MPI */
      ! qmat2 = qmat1 ! is this correct
#endif /* WITH_MPI */
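      ! Scatter the dense eigenvector columns of this subproblem into the
      ! block-cyclic row distribution of q; with MPI the broadcast copy qmat2
      ! is used, without MPI qmat1 already holds the local result.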
      do i=1,nlen

#ifdef WITH_MPI
        call distribute_global_column_&
        &precision &
          (obj, qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#else /* WITH_MPI */
        call distribute_global_column_&
        &precision &
          (obj, qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#endif /* WITH_MPI */
      enddo

    enddo

    deallocate(qmat1, qmat2, stat=istat, errmsg=errormessage)
    check_deallocate("solve_tridi_col: qmat1, qmat2", istat, errormessage)

  endif

  ! Allocate and set index arrays l_col and p_col

  allocate(l_col(na), p_col_i(na), p_col_o(na), stat=istat, errmsg=errormessage)
  check_allocate("solve_tridi_col: l_col, p_col_i, p_col_o", istat, errormessage)

  do i=1,na
    l_col(i) = i
    p_col_i(i) = 0
    p_col_o(i) = 0
  enddo

  ! Merge subproblems

  n = 1
  do while(n<ndiv) ! if ndiv==1, the problem was solved by single call to solve_tridi_single

    do i=0,ndiv-1,2*n

      noff = limits(i)
      nmid = limits(i+n) - noff
      nlen = limits(i+2*n) - noff

      if (nlen == na) then
        ! Last merge, set p_col_o=-1 for unneeded (output) eigenvectors
        p_col_o(nev+1:na) = -1
      endif
      call merge_systems_&
      &precision &
        (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, &
         matrixcols, int(mpi_comm_rows,kind=ik), int(mpi_comm_self,kind=ik), &
         l_col(noff+1), p_col_i(noff+1), &
         l_col(noff+1), p_col_o(noff+1), 0, 1, usegpu, wantdebug, success, max_threads)
      if (.not.(success)) return

    enddo

    n = 2*n

  enddo

  deallocate(limits, l_col, p_col_i, p_col_o, stat=istat, errmsg=errormessage)
  check_deallocate("solve_tridi_col: limits, l_col, p_col_i, p_col_o", istat, errormessage)

  call obj%timer%stop("solve_tridi_col" // precision_suffix)

end subroutine solve_tridi_col_&
&precision_and_suffix

subroutine solve_tridi_single_problem_&
&precision_and_suffix &
  (obj, nlen, d, e, q, ldq, wantdebug, success)

  ! Solves the symmetric, tridiagonal eigenvalue problem on a single processor.
  ! Takes precautions if DSTEDC fails or if the eigenvalues are not ordered correctly.
  use precision
  use elpa_abstract_impl
  use elpa_blas_interfaces
  use elpa_utilities
  implicit none
  class(elpa_abstract_impl_t), intent(inout) :: obj
  integer(kind=ik) :: nlen, ldq
  real(kind=real_datatype) :: d(nlen), e(nlen), q(ldq,nlen)

  real(kind=real_datatype), allocatable :: work(:), qtmp(:), ds(:), es(:)
  real(kind=real_datatype) :: dtmp

  integer(kind=ik) :: i, j, lwork, liwork, info
  integer(kind=BLAS_KIND) :: infoBLAS
  integer(kind=ik), allocatable :: iwork(:)

  logical, intent(in) :: wantDebug
  logical, intent(out) :: success
  integer(kind=ik) :: istat
  character(200) :: errorMessage

  call obj%timer%start("solve_tridi_single" // precision_suffix)

  success = .true.
  allocate(ds(nlen), es(nlen), stat=istat, errmsg=errormessage)
  check_allocate("solve_tridi_single: ds, es", istat, errormessage)

  ! Save d and e for the case that dstedc fails

  ds(:) = d(:)
  es(:) = e(:)

  ! First try dstedc; this is normally faster, but it may fail sometimes (why???)

  lwork = 1 + 4*nlen + nlen**2
  liwork = 3 + 5*nlen
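  ! These are the documented minimum workspace sizes for LAPACK xSTEDC with
  ! COMPZ='I': LWORK >= 1 + 4*N + N**2 and LIWORK >= 3 + 5*N.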
  allocate(work(lwork), iwork(liwork), stat=istat, errmsg=errormessage)
  check_allocate("solve_tridi_single: work, iwork", istat, errormessage)
  call obj%timer%start("lapack")
  call precision_stedc('I', int(nlen,kind=blas_kind), d, e, q, int(ldq,kind=blas_kind), &
                       work, int(lwork,kind=blas_kind), int(iwork,kind=blas_kind), int(liwork,kind=blas_kind), &
                       infoblas)
  info = int(infoblas,kind=ik)
  call obj%timer%stop("lapack")

  if (info /= 0) then

    ! DSTEDC failed, try DSTEQR. The workspace is enough for DSTEQR.

    write(error_unit,'(a,i8,a)') 'Warning: Lapack routine DSTEDC failed, info= ',info,', Trying DSTEQR!'

    d(:) = ds(:)
    e(:) = es(:)
    call obj%timer%start("lapack")
    call precision_steqr('I', int(nlen,kind=blas_kind), d, e, q, int(ldq,kind=blas_kind), work, infoblas)
    info = int(infoblas,kind=ik)
    call obj%timer%stop("lapack")

    ! If DSTEQR fails also, we don't know what to do further ...

    if (info /= 0) then
      if (wantdebug) then
        write(error_unit,'(a,i8,a)') 'ELPA1_solve_tridi_single: ERROR: Lapack routine DSTEQR failed, info= ',info,', Aborting!'
      endif
      success = .false.
      return
    endif
  end if

  deallocate(work,iwork,ds,es, stat=istat, errmsg=errormessage)
  check_deallocate("solve_tridi_single: work, iwork, ds, es", istat, errormessage)

  ! Check if eigenvalues are monotonically increasing
  ! This seems to be not always the case (in the IBM implementation of dstedc ???)

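  ! If an out-of-order eigenvalue is found below, it and its eigenvector are
  ! moved to the correct position with a single insertion-sort pass.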
  do i=1,nlen-1
    if (d(i+1)<d(i)) then
#ifdef DOUBLE_PRECISION_REAL
      if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk8) then
#else
      if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk4) then
#endif
        write(error_unit,'(a,i8,2g25.16)') '***WARNING: Monotony error dste**:',i+1,d(i),d(i+1)
      else
        write(error_unit,'(a,i8,2g25.16)') 'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1)
        write(error_unit,'(a)') 'The eigenvalues from a lapack call are not sorted to machine precision.'
        write(error_unit,'(a)') 'To this extent, this is completely harmless.'
        write(error_unit,'(a)') 'Still, we keep this info message just in case.'
      end if
      allocate(qtmp(nlen), stat=istat, errmsg=errormessage)
      check_allocate("solve_tridi_single: qtmp", istat, errormessage)

      dtmp = d(i+1)
      qtmp(1:nlen) = q(1:nlen,i+1)
      do j=i,1,-1
        if (dtmp<d(j)) then
          d(j+1) = d(j)
          q(1:nlen,j+1) = q(1:nlen,j)
        else
          exit ! Loop
        endif
      enddo
      d(j+1) = dtmp
      q(1:nlen,j+1) = qtmp(1:nlen)
      deallocate(qtmp, stat=istat, errmsg=errormessage)
      check_deallocate("solve_tridi_single: qtmp", istat, errormessage)

    endif
  enddo
  call obj%timer%stop("solve_tridi_single" // precision_suffix)

end subroutine solve_tridi_single_problem_&
&precision_and_suffix