58 subroutine solve_tridi_col_&
59 &precision_and_suffix &
60 ( obj, na, nev, nqoff, d, e, q, ldq, nblk, matrixcols, mpi_comm_rows, usegpu, wantdebug, success, max_threads )
74 integer(kind=ik) :: na, nev, nqoff, ldq, nblk, matrixCols, mpi_comm_rows
75 real(kind=real_datatype) :: d(na), e(na)
76#ifdef USE_ASSUMED_SIZE
77 real(kind=real_datatype) :: q(ldq,*)
79 real(kind=real_datatype) :: q(ldq,matrixcols)
82 integer(kind=ik),
parameter :: min_submatrix_size = 16
84 real(kind=real_datatype),
allocatable :: qmat1(:,:), qmat2(:,:)
85 integer(kind=ik) :: i, n, np
86 integer(kind=ik) :: ndiv, noff, nmid, nlen, max_size
87 integer(kind=ik) :: my_prow, np_rows
88 integer(kind=MPI_KIND) :: mpierr, my_prowMPI, np_rowsMPI
90 integer(kind=ik),
allocatable :: limits(:), l_col(:), p_col_i(:), p_col_o(:)
91 logical,
intent(in) :: useGPU, wantDebug
92 logical,
intent(out) :: success
93 integer(kind=ik) :: istat
94 character(200) :: errorMessage
96 integer(kind=ik),
intent(in) :: max_threads
98 integer(kind=MPI_KIND) :: bcast_request1, bcast_request2
99 logical :: useNonBlockingCollectivesRows
100 integer(kind=c_int) :: non_blocking_collectives, error
104 call obj%timer%start(
"solve_tridi_col" // precision_suffix)
106 call obj%get(
"nbc_row_solve_tridi", non_blocking_collectives, error)
107 if (error .ne. elpa_ok)
then
108 write(error_unit,*)
"Problem setting option for non blocking collectives for rows in solve_tridi. Aborting..."
113 if (non_blocking_collectives .eq. 1)
then
114 usenonblockingcollectivesrows = .true.
116 usenonblockingcollectivesrows = .false.
119 call obj%timer%start(
"mpi_communication")
120 call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind), my_prowmpi, mpierr)
121 call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind), np_rowsmpi, mpierr)
123 my_prow = int(my_prowmpi,kind=c_int)
124 np_rows = int(np_rowsmpi,kind=c_int)
125 call obj%timer%stop(
"mpi_communication")
131 do while(2*ndiv<=np_rows .and. n>2*min_submatrix_size)
141 if (np_rows==1 .and. nev<na .and. na>2*min_submatrix_size) ndiv = 2
143 allocate(limits(0:ndiv), stat=istat, errmsg=errormessage)
144 check_deallocate(
"solve_tridi_col: limits", istat, errormessage)
154 limits(i+n) = limits(i) + ((limits(i+2*n)-limits(i)+3)/4)*2
162 max_size = max(max_size,limits(i)-limits(i-1))
169 d(n) = d(n)-abs(e(n))
170 d(n+1) = d(n+1)-abs(e(n))
178 nlen = limits(n+1)-noff
180 call solve_tridi_single_problem_&
181 &precision_and_suffix &
182 (obj, nlen,d(noff+1),e(noff+1), &
183 q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantdebug, success)
185 if (.not.(success))
return
193 allocate(qmat1(max_size,max_size), stat=istat, errmsg=errormessage)
194 check_deallocate(
"solve_tridi_col: qmat1", istat, errormessage)
196 allocate(qmat2(max_size,max_size), stat=istat, errmsg=errormessage)
197 check_deallocate(
"solve_tridi_col: qmat2", istat, errormessage)
201 if (my_prow < ndiv)
then
203 noff = limits(my_prow)
204 nlen = limits(my_prow+1)-noff
205 call solve_tridi_single_problem_&
206 &precision_and_suffix &
207 (obj, nlen,d(noff+1),e(noff+1),qmat1, &
208 ubound(qmat1,dim=1), wantdebug, success)
210 if (.not.(success))
return
218 nlen = limits(np+1)-noff
220 if (usenonblockingcollectivesrows)
then
221 call obj%timer%start(
"mpi_nbc_communication")
222 call mpi_ibcast(d(noff+1), int(nlen,kind=mpi_kind), mpi_real_precision, int(np,kind=mpi_kind), &
223 int(mpi_comm_rows,kind=mpi_kind), bcast_request1, mpierr)
224 call mpi_wait(bcast_request1, mpi_status_ignore, mpierr)
227 call mpi_ibcast(qmat2, int(max_size*max_size,kind=mpi_kind), mpi_real_precision, int(np,kind=mpi_kind), &
228 int(mpi_comm_rows,kind=mpi_kind), bcast_request2, mpierr)
229 call mpi_wait(bcast_request2, mpi_status_ignore, mpierr)
230 call obj%timer%stop(
"mpi_nbc_communication")
232 call obj%timer%start(
"mpi_communication")
233 call mpi_bcast(d(noff+1), int(nlen,kind=mpi_kind), mpi_real_precision, int(np,kind=mpi_kind), &
234 int(mpi_comm_rows,kind=mpi_kind), mpierr)
237 call mpi_bcast(qmat2, int(max_size*max_size,kind=mpi_kind), mpi_real_precision, int(np,kind=mpi_kind), &
238 int(mpi_comm_rows,kind=mpi_kind), mpierr)
239 call obj%timer%stop(
"mpi_communication")
247 call distribute_global_column_&
249 (obj, qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
251 call distribute_global_column_&
253 (obj, qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
259 deallocate(qmat1, qmat2, stat=istat, errmsg=errormessage)
260 check_deallocate(
"solve_tridi_col: qmat1, qmat2", istat, errormessage)
266 allocate(l_col(na), p_col_i(na), p_col_o(na), stat=istat, errmsg=errormessage)
267 check_deallocate(
"solve_tridi_col: l_col, p_col_i, p_col_o", istat, errormessage)
283 nmid = limits(i+n) - noff
284 nlen = limits(i+2*n) - noff
288 p_col_o(nev+1:na) = -1
292 (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, &
293 matrixcols, int(mpi_comm_rows,kind=ik), int(mpi_comm_self,kind=ik), &
294 l_col(noff+1), p_col_i(noff+1), &
295 l_col(noff+1), p_col_o(noff+1), 0, 1, usegpu, wantdebug, success, max_threads)
296 if (.not.(success))
return
304 deallocate(limits, l_col, p_col_i, p_col_o, stat=istat, errmsg=errormessage)
305 check_deallocate(
"solve_tridi_col: limits, l_col, p_col_i, p_col_o", istat, errormessage)
307 call obj%timer%stop(
"solve_tridi_col" // precision_suffix)
312 subroutine solve_tridi_single_problem_&
313 &precision_and_suffix &
314 (obj, nlen, d, e, q, ldq, wantdebug, success)
320 use elpa_blas_interfaces
324 integer(kind=ik) :: nlen, ldq
325 real(kind=real_datatype) :: d(nlen), e(nlen), q(ldq,nlen)
327 real(kind=real_datatype),
allocatable :: work(:), qtmp(:), ds(:), es(:)
328 real(kind=real_datatype) :: dtmp
330 integer(kind=ik) :: i, j, lwork, liwork, info
331 integer(kind=BLAS_KIND) :: infoBLAS
332 integer(kind=ik),
allocatable :: iwork(:)
334 logical,
intent(in) :: wantDebug
335 logical,
intent(out) :: success
336 integer(kind=ik) :: istat
337 character(200) :: errorMessage
339 call obj%timer%start(
"solve_tridi_single" // precision_suffix)
342 allocate(ds(nlen), es(nlen), stat=istat, errmsg=errormessage)
343 check_allocate(
"solve_tridi_single: ds, es", istat, errormessage)
352 lwork = 1 + 4*nlen + nlen**2
354 allocate(work(lwork), iwork(liwork), stat=istat, errmsg=errormessage)
355 check_allocate(
"solve_tridi_single: work, iwork", istat, errormessage)
356 call obj%timer%start(
"lapack")
357 call precision_stedc(
'I', int(nlen,kind=blas_kind), d, e, q, int(ldq,kind=blas_kind), &
358 work, int(lwork,kind=blas_kind), int(iwork,kind=blas_kind), int(liwork,kind=blas_kind), &
360 info = int(infoblas,kind=ik)
361 call obj%timer%stop(
"lapack")
367 write(error_unit,
'(a,i8,a)')
'Warning: Lapack routine DSTEDC failed, info= ',info,
', Trying DSTEQR!'
371 call obj%timer%start(
"lapack")
372 call precision_steqr(
'I', int(nlen,kind=blas_kind), d, e, q, int(ldq,kind=blas_kind), work, infoblas )
373 info = int(infoblas,kind=ik)
374 call obj%timer%stop(
"lapack")
380 write(error_unit,
'(a,i8,a)')
'ELPA1_solve_tridi_single: ERROR: Lapack routine DSTEQR failed, info= ',info,
', Aborting!'
387 deallocate(work,iwork,ds,es, stat=istat, errmsg=errormessage)
388 check_deallocate(
"solve_tridi_single: work, iwork, ds, es", istat, errormessage)
394 if (d(i+1)<d(i))
then
395#ifdef DOUBLE_PRECISION_REAL
396 if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk8)
then
398 if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk4)
then
400 write(error_unit,
'(a,i8,2g25.16)')
'***WARNING: Monotony error dste**:',i+1,d(i),d(i+1)
402 write(error_unit,
'(a,i8,2g25.16)')
'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1)
403 write(error_unit,
'(a)')
'The eigenvalues from a lapack call are not sorted to machine precision.'
404 write(error_unit,
'(a)')
'In this extent, this is completely harmless.'
405 write(error_unit,
'(a)')
'Still, we keep this info message just in case.'
407 allocate(qtmp(nlen), stat=istat, errmsg=errormessage)
408 check_allocate(
"solve_tridi_single: qtmp", istat, errormessage)
411 qtmp(1:nlen) = q(1:nlen,i+1)
415 q(1:nlen,j+1) = q(1:nlen,j)
421 q(1:nlen,j+1) = qtmp(1:nlen)
422 deallocate(qtmp, stat=istat, errmsg=errormessage)
423 check_deallocate(
"solve_tridi_single: qtmp", istat, errormessage)
427 call obj%timer%stop(
"solve_tridi_single" // precision_suffix)