58subroutine solve_tridi_&
59&precision_and_suffix &
60 ( obj, na, nev, d, e, q, ldq, nblk, matrixcols, mpi_comm_all, mpi_comm_rows, &
61 mpi_comm_cols, usegpu, wantdebug, success, max_threads )
72#include "../../src/general/precision_kinds.F90"
74 integer(kind=ik),
intent(in) :: na, nev, ldq, nblk, matrixCols, &
75 mpi_comm_all, mpi_comm_rows, mpi_comm_cols
76 real(kind=real_datatype),
intent(inout) :: d(na), e(na)
77#ifdef USE_ASSUMED_SIZE
78 real(kind=real_datatype),
intent(inout) :: q(ldq,*)
80 real(kind=real_datatype),
intent(inout) :: q(ldq,matrixcols)
82 logical,
intent(in) :: useGPU, wantDebug
83 logical,
intent(out) :: success
85 integer(kind=ik) :: i, j, n, np, nc, nev1, l_cols, l_rows
86 integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols
87 integer(kind=MPI_KIND) :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
88 integer(kind=ik),
allocatable :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:)
90 integer(kind=ik) :: istat
91 character(200) :: errorMessage
92 character(20) :: gpuString
93 integer(kind=ik),
intent(in) :: max_threads
101 call obj%timer%start(
"solve_tridi" // precision_suffix // gpustring)
103 call obj%timer%start(
"mpi_communication")
104 call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind) ,my_prowmpi, mpierr)
105 call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind) ,np_rowsmpi, mpierr)
106 call mpi_comm_rank(int(mpi_comm_cols,kind=mpi_kind) ,my_pcolmpi, mpierr)
107 call mpi_comm_size(int(mpi_comm_cols,kind=mpi_kind) ,np_colsmpi, mpierr)
109 my_prow = int(my_prowmpi,kind=c_int)
110 np_rows = int(np_rowsmpi,kind=c_int)
111 my_pcol = int(my_pcolmpi,kind=c_int)
112 np_cols = int(np_colsmpi,kind=c_int)
114 call obj%timer%stop(
"mpi_communication")
118 l_rows = local_index(na, my_prow, np_rows, nblk, -1)
119 l_cols = local_index(na, my_pcol, np_cols, nblk, -1)
122 q(1:l_rows, 1:l_cols) = 0.0_rk
127 allocate(limits(0:np_cols), stat=istat, errmsg=errormessage)
128 check_allocate(
"solve_tridi: limits", istat, errormessage)
132 nc = local_index(na, np, np_cols, nblk, -1)
139 call obj%timer%stop(
"solve_tridi" // precision_suffix)
140 if (wantdebug)
write(error_unit,*)
'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width'
144 limits(np+1) = limits(np) + nc
151 d(n) = d(n)-abs(e(n))
152 d(n+1) = d(n+1)-abs(e(n))
162 nev1 = min(nev,l_cols)
164 call solve_tridi_col_&
165 &precision_and_suffix &
166 (obj, l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, &
167 matrixcols, mpi_comm_rows, usegpu, wantdebug, success, max_threads)
168 if (.not.(success))
then
169 call obj%timer%stop(
"solve_tridi" // precision_suffix // gpustring)
175 deallocate(limits, stat=istat, errmsg=errormessage)
176 check_deallocate(
"solve_tridi: limits", istat, errormessage)
178 call obj%timer%stop(
"solve_tridi" // precision_suffix // gpustring)
186 allocate(l_col(na), stat=istat, errmsg=errormessage)
187 check_allocate(
"solve_tridi: l_col", istat, errormessage)
189 allocate(p_col(na), stat=istat, errmsg=errormessage)
190 check_allocate(
"solve_tridi: p_col", istat, errormessage)
194 nc = local_index(na, np, np_cols, nblk, -1)
204 allocate(l_col_bc(na), stat=istat, errmsg=errormessage)
205 check_allocate(
"solve_tridi: l_col_bc", istat, errormessage)
207 allocate(p_col_bc(na), stat=istat, errmsg=errormessage)
208 check_allocate(
"solve_tridi: p_col_bc", istat, errormessage)
213 do i = 0, na-1, nblk*np_cols
216 if (i+j*nblk+n <= min(nev,na))
then
217 p_col_bc(i+j*nblk+n) = j
218 l_col_bc(i+j*nblk+n) = i/np_cols + n
225 call merge_recursive_&
227 (obj, 0, np_cols, ldq, matrixcols, nblk, &
228 l_col, p_col, l_col_bc, p_col_bc, limits, &
229 np_cols, na, q, d, e, &
230 mpi_comm_all, mpi_comm_rows, mpi_comm_cols,&
231 usegpu, wantdebug, success, max_threads)
233 if (.not.(success))
then
234 call obj%timer%stop(
"solve_tridi" // precision_suffix // gpustring)
238 deallocate(limits,l_col,p_col,l_col_bc,p_col_bc, stat=istat, errmsg=errormessage)
239 check_deallocate(
"solve_tridi: limits, l_col, p_col, l_col_bc, p_col_bc", istat, errormessage)
241 call obj%timer%stop(
"solve_tridi" // precision_suffix // gpustring)
246 recursive subroutine merge_recursive_&
247 &precision_and_suffix &
248 (obj, np_off, nprocs, usegpu, wantdebug, success)
258 integer(kind=ik) :: np_off, nprocs
259 integer(kind=ik) :: np1, np2, noff, nlen, nmid, n
260 logical,
intent(in) :: useGPU, wantDebug
261 logical,
intent(out) :: success
267 if (wantdebug)
write(error_unit,*)
"ELPA1_merge_recursive: INTERNAL error merge_recursive: nprocs=",nprocs
276 if (np1 > 1)
call merge_recursive_&
277 &precision_and_suffix &
278 (obj, np_off, np1, usegpu, wantdebug, success)
279 if (.not.(success))
return
280 if (np2 > 1)
call merge_recursive_&
281 &precision_and_suffix &
282 (obj, np_off+np1, np2, usegpu, wantdebug, success)
283 if (.not.(success))
return
285 noff = limits(np_off)
286 nmid = limits(np_off+np1) - noff
287 nlen = limits(np_off+nprocs) - noff
290 call obj%timer%start(
"mpi_communication")
291 if (my_pcol==np_off)
then
292 do n=np_off+np1,np_off+nprocs-1
293 call mpi_send(d(noff+1), int(nmid,kind=mpi_kind), mpi_real_precision, int(n,kind=mpi_kind), 1_mpi_kind, &
294 int(mpi_comm_cols,kind=mpi_kind), mpierr)
297 call obj%timer%stop(
"mpi_communication")
300 if (my_pcol>=np_off+np1 .and. my_pcol<np_off+nprocs)
then
302 call obj%timer%start(
"mpi_communication")
303 call mpi_recv(d(noff+1), int(nmid,kind=mpi_kind), mpi_real_precision, int(np_off,kind=mpi_kind), 1_mpi_kind, &
304 int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
305 call obj%timer%stop(
"mpi_communication")
311 if (my_pcol==np_off+np1)
then
312 do n=np_off,np_off+np1-1
314 call obj%timer%start(
"mpi_communication")
315 call mpi_send(d(noff+nmid+1), int(nlen-nmid,kind=mpi_kind), mpi_real_precision, int(n,kind=mpi_kind), &
316 1_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpierr)
317 call obj%timer%stop(
"mpi_communication")
322 if (my_pcol>=np_off .and. my_pcol<np_off+np1)
then
324 call obj%timer%start(
"mpi_communication")
325 call mpi_recv(d(noff+nmid+1), int(nlen-nmid,kind=mpi_kind), mpi_real_precision, int(np_off+np1,kind=mpi_kind), &
326 1_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
327 call obj%timer%stop(
"mpi_communication")
332 if (nprocs == np_cols)
then
338 (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
339 nblk, matrixcols, int(mpi_comm_rows,kind=ik), int(mpi_comm_cols,kind=ik), &
341 l_col_bc, p_col_bc, np_off, nprocs, usegpu, wantdebug, success, max_threads )
342 if (.not.(success))
return
347 (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
348 nblk, matrixcols, int(mpi_comm_rows,kind=ik), int(mpi_comm_cols,kind=ik), &
349 l_col(noff+1), p_col(noff+1), &
350 l_col(noff+1), p_col(noff+1), np_off, nprocs, usegpu, wantdebug, success, max_threads )
351 if (.not.(success))
return
353 end subroutine merge_recursive_&
354 &precision_and_suffix
359 subroutine solve_tridi_col_&
360 &precision_and_suffix &
361 ( obj, na, nev, nqoff, d, e, q, ldq, nblk, matrixcols, mpi_comm_rows, usegpu, wantdebug, success, max_threads )
375 integer(kind=ik) :: na, nev, nqoff, ldq, nblk, matrixCols, mpi_comm_rows
376 real(kind=real_datatype) :: d(na), e(na)
377#ifdef USE_ASSUMED_SIZE
378 real(kind=real_datatype) :: q(ldq,*)
380 real(kind=real_datatype) :: q(ldq,matrixcols)
383 integer(kind=ik),
parameter :: min_submatrix_size = 16
385 real(kind=real_datatype),
allocatable :: qmat1(:,:), qmat2(:,:)
386 integer(kind=ik) :: i, n, np
387 integer(kind=ik) :: ndiv, noff, nmid, nlen, max_size
388 integer(kind=ik) :: my_prow, np_rows
389 integer(kind=MPI_KIND) :: mpierr, my_prowMPI, np_rowsMPI
391 integer(kind=ik),
allocatable :: limits(:), l_col(:), p_col_i(:), p_col_o(:)
392 logical,
intent(in) :: useGPU, wantDebug
393 logical,
intent(out) :: success
394 integer(kind=ik) :: istat
395 character(200) :: errorMessage
397 integer(kind=ik),
intent(in) :: max_threads
399 integer(kind=MPI_KIND) :: bcast_request1, bcast_request2
400 logical :: useNonBlockingCollectivesRows
401 integer(kind=c_int) :: non_blocking_collectives, error
405 call obj%timer%start(
"solve_tridi_col" // precision_suffix)
407 call obj%get(
"nbc_row_solve_tridi", non_blocking_collectives, error)
408 if (error .ne. elpa_ok)
then
409 write(error_unit,*)
"Problem setting option for non blocking collectives for rows in solve_tridi. Aborting..."
414 if (non_blocking_collectives .eq. 1)
then
415 usenonblockingcollectivesrows = .true.
417 usenonblockingcollectivesrows = .false.
420 call obj%timer%start(
"mpi_communication")
421 call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind), my_prowmpi, mpierr)
422 call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind), np_rowsmpi, mpierr)
424 my_prow = int(my_prowmpi,kind=c_int)
425 np_rows = int(np_rowsmpi,kind=c_int)
426 call obj%timer%stop(
"mpi_communication")
432 do while(2*ndiv<=np_rows .and. n>2*min_submatrix_size)
442 if (np_rows==1 .and. nev<na .and. na>2*min_submatrix_size) ndiv = 2
444 allocate(limits(0:ndiv), stat=istat, errmsg=errormessage)
445 check_deallocate(
"solve_tridi_col: limits", istat, errormessage)
455 limits(i+n) = limits(i) + ((limits(i+2*n)-limits(i)+3)/4)*2
463 max_size = max(max_size,limits(i)-limits(i-1))
470 d(n) = d(n)-abs(e(n))
471 d(n+1) = d(n+1)-abs(e(n))
479 nlen = limits(n+1)-noff
481 call solve_tridi_single_problem_&
482 &precision_and_suffix &
483 (obj, nlen,d(noff+1),e(noff+1), &
484 q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantdebug, success)
486 if (.not.(success))
return
494 allocate(qmat1(max_size,max_size), stat=istat, errmsg=errormessage)
495 check_deallocate(
"solve_tridi_col: qmat1", istat, errormessage)
497 allocate(qmat2(max_size,max_size), stat=istat, errmsg=errormessage)
498 check_deallocate(
"solve_tridi_col: qmat2", istat, errormessage)
502 if (my_prow < ndiv)
then
504 noff = limits(my_prow)
505 nlen = limits(my_prow+1)-noff
506 call solve_tridi_single_problem_&
507 &precision_and_suffix &
508 (obj, nlen,d(noff+1),e(noff+1),qmat1, &
509 ubound(qmat1,dim=1), wantdebug, success)
511 if (.not.(success))
return
519 nlen = limits(np+1)-noff
521 if (usenonblockingcollectivesrows)
then
522 call obj%timer%start(
"mpi_nbc_communication")
523 call mpi_ibcast(d(noff+1), int(nlen,kind=mpi_kind), mpi_real_precision, int(np,kind=mpi_kind), &
524 int(mpi_comm_rows,kind=mpi_kind), bcast_request1, mpierr)
525 call mpi_wait(bcast_request1, mpi_status_ignore, mpierr)
528 call mpi_ibcast(qmat2, int(max_size*max_size,kind=mpi_kind), mpi_real_precision, int(np,kind=mpi_kind), &
529 int(mpi_comm_rows,kind=mpi_kind), bcast_request2, mpierr)
530 call mpi_wait(bcast_request2, mpi_status_ignore, mpierr)
531 call obj%timer%stop(
"mpi_nbc_communication")
533 call obj%timer%start(
"mpi_communication")
534 call mpi_bcast(d(noff+1), int(nlen,kind=mpi_kind), mpi_real_precision, int(np,kind=mpi_kind), &
535 int(mpi_comm_rows,kind=mpi_kind), mpierr)
538 call mpi_bcast(qmat2, int(max_size*max_size,kind=mpi_kind), mpi_real_precision, int(np,kind=mpi_kind), &
539 int(mpi_comm_rows,kind=mpi_kind), mpierr)
540 call obj%timer%stop(
"mpi_communication")
548 call distribute_global_column_&
550 (obj, qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
552 call distribute_global_column_&
554 (obj, qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
560 deallocate(qmat1, qmat2, stat=istat, errmsg=errormessage)
561 check_deallocate(
"solve_tridi_col: qmat1, qmat2", istat, errormessage)
567 allocate(l_col(na), p_col_i(na), p_col_o(na), stat=istat, errmsg=errormessage)
568 check_deallocate(
"solve_tridi_col: l_col, p_col_i, p_col_o", istat, errormessage)
584 nmid = limits(i+n) - noff
585 nlen = limits(i+2*n) - noff
589 p_col_o(nev+1:na) = -1
593 (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, &
594 matrixcols, int(mpi_comm_rows,kind=ik), int(mpi_comm_self,kind=ik), &
595 l_col(noff+1), p_col_i(noff+1), &
596 l_col(noff+1), p_col_o(noff+1), 0, 1, usegpu, wantdebug, success, max_threads)
597 if (.not.(success))
return
605 deallocate(limits, l_col, p_col_i, p_col_o, stat=istat, errmsg=errormessage)
606 check_deallocate(
"solve_tridi_col: limits, l_col, p_col_i, p_col_o", istat, errormessage)
608 call obj%timer%stop(
"solve_tridi_col" // precision_suffix)
613 subroutine solve_tridi_single_problem_&
614 &precision_and_suffix &
615 (obj, nlen, d, e, q, ldq, wantdebug, success)
621 use elpa_blas_interfaces
625 integer(kind=ik) :: nlen, ldq
626 real(kind=real_datatype) :: d(nlen), e(nlen), q(ldq,nlen)
628 real(kind=real_datatype),
allocatable :: work(:), qtmp(:), ds(:), es(:)
629 real(kind=real_datatype) :: dtmp
631 integer(kind=ik) :: i, j, lwork, liwork, info
632 integer(kind=BLAS_KIND) :: infoBLAS
633 integer(kind=ik),
allocatable :: iwork(:)
635 logical,
intent(in) :: wantDebug
636 logical,
intent(out) :: success
637 integer(kind=ik) :: istat
638 character(200) :: errorMessage
640 call obj%timer%start(
"solve_tridi_single" // precision_suffix)
643 allocate(ds(nlen), es(nlen), stat=istat, errmsg=errormessage)
644 check_allocate(
"solve_tridi_single: ds, es", istat, errormessage)
653 lwork = 1 + 4*nlen + nlen**2
655 allocate(work(lwork), iwork(liwork), stat=istat, errmsg=errormessage)
656 check_allocate(
"solve_tridi_single: work, iwork", istat, errormessage)
657 call obj%timer%start(
"lapack")
658 call precision_stedc(
'I', int(nlen,kind=blas_kind), d, e, q, int(ldq,kind=blas_kind), &
659 work, int(lwork,kind=blas_kind), int(iwork,kind=blas_kind), int(liwork,kind=blas_kind), &
661 info = int(infoblas,kind=ik)
662 call obj%timer%stop(
"lapack")
668 write(error_unit,
'(a,i8,a)')
'Warning: Lapack routine DSTEDC failed, info= ',info,
', Trying DSTEQR!'
672 call obj%timer%start(
"lapack")
673 call precision_steqr(
'I', int(nlen,kind=blas_kind), d, e, q, int(ldq,kind=blas_kind), work, infoblas )
674 info = int(infoblas,kind=ik)
675 call obj%timer%stop(
"lapack")
681 write(error_unit,
'(a,i8,a)')
'ELPA1_solve_tridi_single: ERROR: Lapack routine DSTEQR failed, info= ',info,
', Aborting!'
688 deallocate(work,iwork,ds,es, stat=istat, errmsg=errormessage)
689 check_deallocate(
"solve_tridi_single: work, iwork, ds, es", istat, errormessage)
695 if (d(i+1)<d(i))
then
696#ifdef DOUBLE_PRECISION_REAL
697 if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk8)
then
699 if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk4)
then
701 write(error_unit,
'(a,i8,2g25.16)')
'***WARNING: Monotony error dste**:',i+1,d(i),d(i+1)
703 write(error_unit,
'(a,i8,2g25.16)')
'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1)
704 write(error_unit,
'(a)')
'The eigenvalues from a lapack call are not sorted to machine precision.'
705 write(error_unit,
'(a)')
'In this extent, this is completely harmless.'
706 write(error_unit,
'(a)')
'Still, we keep this info message just in case.'
708 allocate(qtmp(nlen), stat=istat, errmsg=errormessage)
709 check_allocate(
"solve_tridi_single: qtmp", istat, errormessage)
712 qtmp(1:nlen) = q(1:nlen,i+1)
716 q(1:nlen,j+1) = q(1:nlen,j)
722 q(1:nlen,j+1) = qtmp(1:nlen)
723 deallocate(qtmp, stat=istat, errmsg=errormessage)
724 check_deallocate(
"solve_tridi_single: qtmp", istat, errormessage)
728 call obj%timer%stop(
"solve_tridi_single" // precision_suffix)