ELPA-2024.03.001/html/solve__tridi__template_8F90_source.html

#if 0

!    This file is part of ELPA.

!

!    The ELPA library was originally created by the ELPA consortium,

!    consisting of the following organizations:

!

!    - Max Planck Computing and Data Facility (MPCDF), formerly known as

!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),

!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte

!      Informatik,

!    - Technische Universität München, Lehrstuhl für Informatik mit

!      Schwerpunkt Wissenschaftliches Rechnen ,

!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,

!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,

!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,

!      and

!    - IBM Deutschland GmbH

!

!    This particular source code file contains additions, changes and

!    enhancements authored by Intel Corporation which is not part of

!    the ELPA consortium.

!

!    More information can be found here:

!    http://elpa.mpcdf.mpg.de/

!

!    ELPA is free software: you can redistribute it and/or modify

!    it under the terms of the version 3 of the license of the

!    GNU Lesser General Public License as published by the Free

!    Software Foundation.

!

!    ELPA is distributed in the hope that it will be useful,

!    but WITHOUT ANY WARRANTY; without even the implied warranty of

!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

!    GNU Lesser General Public License for more details.

!

!    You should have received a copy of the GNU Lesser General Public License

!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>

!

!    ELPA reflects a substantial effort on the part of the original

!    ELPA consortium, and we ask you to respect the spirit of the

!    license that we chose: i.e., please contribute any changes you

!    may have back to the original ELPA library distribution, and keep

!    any derivatives of ELPA under the same license that we chose for

!    the original distribution, the GNU Lesser General Public License.

!

!

! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines

!

! Copyright of the original code rests with the authors inside the ELPA

! consortium. The copyright of any additional modifications shall rest

! with their original authors, but shall adhere to the licensing terms

! distributed along with the original code in the file "COPYING".

#endif


#include "../general/sanity.F90"

#include "../general/error_checking.inc"


! Solves the symmetric tridiagonal eigenvalue problem given by the diagonal d
! and the off-diagonal e on the full process grid.
!
! The matrix is cut into np_cols sub problems (one per process column) by
! subtracting rank 1 modifications, every process column solves its own sub
! problem with solve_tridi_col_, and the partial results are combined by
! merge_recursive_.
!
! Arguments:
!   obj            ELPA object; used in this routine for the timers and
!                  forwarded to the called routines
!   na             size of the problem (length of d and e)
!   nev            number of eigenvectors requested; with more than one process
!                  column all local eigenvectors of the sub problems are
!                  computed and only the final block cyclic index arrays are
!                  restricted to min(nev,na) columns
!   d, e           diagonal / off-diagonal, modified in place
!   q              local part of the eigenvector matrix; its local l_rows x
!                  l_cols part is zeroed before the sub problems are solved
!   ldq            leading dimension of q
!   nblk           block size handed to local_index (presumably the block size
!                  of the block cyclic distribution - confirm against callers)
!   matrixcols     number of local columns of q
!   mpi_comm_all, mpi_comm_rows, mpi_comm_cols
!                  communicators; rows/cols are queried for rank and size here
!   usegpu         forwarded; in this routine it only selects the timer label
!   wantdebug      if .true., the zero width error is reported on error_unit
!   success        .false. if a process column owns no columns or if one of
!                  the called routines reports a failure
!   max_threads    forwarded to the called routines
subroutine solve_tridi_&
&precision_and_suffix &
    ( obj, na, nev, d, e, q, ldq, nblk, matrixcols, mpi_comm_all, mpi_comm_rows, &
                                           mpi_comm_cols, usegpu, wantdebug, success, max_threads )

      use precision
      use elpa_abstract_impl
      use merge_recursive
      use merge_systems
      use elpa_mpi
      use elpa_utilities
      use distribute_global_column
      implicit none
#include "../../src/general/precision_kinds.F90"
      class(elpa_abstract_impl_t), intent(inout) :: obj
      integer(kind=ik), intent(in)               :: na, nev, ldq, nblk, matrixCols, &
                                                    mpi_comm_all, mpi_comm_rows, mpi_comm_cols
      real(kind=real_datatype), intent(inout)    :: d(na), e(na)
#ifdef USE_ASSUMED_SIZE
      real(kind=real_datatype), intent(inout)    :: q(ldq,*)
#else
      real(kind=real_datatype), intent(inout)    :: q(ldq,matrixcols)
#endif
      logical, intent(in)                        :: useGPU, wantDebug
      logical, intent(out)                       :: success
      integer(kind=ik), intent(in)               :: max_threads

      integer(kind=ik)                           :: i, j, n, np, nc, nev1, l_cols, l_rows
      integer(kind=ik)                           :: my_prow, my_pcol, np_rows, np_cols
      integer(kind=MPI_KIND)                     :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
      integer(kind=ik), allocatable              :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:)

      integer(kind=ik)                           :: istat
      character(200)                             :: errorMessage
      character(20)                              :: gpuString

      ! The timer label carries a "_gpu" suffix when the GPU path is requested.
      ! Every stop of this timer below must use exactly the same label.
      if(usegpu) then
        gpustring = "_gpu"
      else
        gpustring = ""
      endif

      call obj%timer%start("solve_tridi" // precision_suffix // gpustring)

      call obj%timer%start("mpi_communication")
      call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind) ,my_prowmpi, mpierr)
      call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind) ,np_rowsmpi, mpierr)
      call mpi_comm_rank(int(mpi_comm_cols,kind=mpi_kind) ,my_pcolmpi, mpierr)
      call mpi_comm_size(int(mpi_comm_cols,kind=mpi_kind) ,np_colsmpi, mpierr)

      my_prow = int(my_prowmpi,kind=c_int)
      np_rows = int(np_rowsmpi,kind=c_int)
      my_pcol = int(my_pcolmpi,kind=c_int)
      np_cols = int(np_colsmpi,kind=c_int)

      call obj%timer%stop("mpi_communication")

      success = .true.

      l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
      l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q

      ! Set Q to 0
      q(1:l_rows, 1:l_cols) = 0.0_rk

      ! Get the limits of the subdivisions, each subdivision has as many cols
      ! as fit on the respective processor column

      allocate(limits(0:np_cols), stat=istat, errmsg=errormessage)
      check_allocate("solve_tridi: limits", istat, errormessage)

      limits(0) = 0
      do np=0,np_cols-1
        nc = local_index(na, np, np_cols, nblk, -1) ! number of columns on proc column np

        ! Check for the case that a column has zero width.
        ! This is not supported!
        ! Scalapack supports it but delivers no results for these columns,
        ! which is rather annoying
        if (nc==0) then
          ! Stop the timer under the same label it was started with
          ! (including the GPU suffix).
          call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
          if (wantdebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width'
          success = .false.
          return
        endif
        limits(np+1) = limits(np) + nc
      enddo

      ! Subdivide matrix by subtracting rank 1 modifications

      do i=1,np_cols-1
        n = limits(i)
        d(n) = d(n)-abs(e(n))
        d(n+1) = d(n+1)-abs(e(n))
      enddo

      ! Solve sub problems on processor columns

      nc = limits(my_pcol) ! column after which my problem starts

      if (np_cols>1) then
        nev1 = l_cols ! all eigenvectors are needed
      else
        nev1 = min(nev,l_cols)
      endif
      call solve_tridi_col_&
           &precision_and_suffix &
             (obj, l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk,  &
                        matrixcols, mpi_comm_rows, usegpu, wantdebug, success, max_threads)
      if (.not.(success)) then
        call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
        return
      endif

      ! If there is only 1 processor column, we are done

      if (np_cols==1) then
        deallocate(limits, stat=istat, errmsg=errormessage)
        check_deallocate("solve_tridi: limits", istat, errormessage)

        call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
        return
      endif

      ! Set index arrays for Q columns

      ! Dense distribution scheme:
      ! global column n is local column l_col(n) on process column p_col(n)

      allocate(l_col(na), stat=istat, errmsg=errormessage)
      check_allocate("solve_tridi: l_col", istat, errormessage)

      allocate(p_col(na), stat=istat, errmsg=errormessage)
      check_allocate("solve_tridi: p_col", istat, errormessage)

      n = 0
      do np=0,np_cols-1
        nc = local_index(na, np, np_cols, nblk, -1)
        do i=1,nc
          n = n+1
          l_col(n) = i
          p_col(n) = np
        enddo
      enddo

      ! Block cyclic distribution scheme, only nev columns are set:
      ! all other entries keep the marker value -1

      allocate(l_col_bc(na), stat=istat, errmsg=errormessage)
      check_allocate("solve_tridi: l_col_bc", istat, errormessage)

      allocate(p_col_bc(na), stat=istat, errmsg=errormessage)
      check_allocate("solve_tridi: p_col_bc", istat, errormessage)

      p_col_bc(:) = -1
      l_col_bc(:) = -1

      do i = 0, na-1, nblk*np_cols
        do j = 0, np_cols-1
          do n = 1, nblk
            if (i+j*nblk+n <= min(nev,na)) then
              p_col_bc(i+j*nblk+n) = j
              l_col_bc(i+j*nblk+n) = i/np_cols + n
            endif
          enddo
        enddo
      enddo

      ! Recursively merge sub problems
      call merge_recursive_&
           &precision &
           (obj, 0, np_cols, ldq, matrixcols, nblk, &
           l_col, p_col, l_col_bc, p_col_bc, limits, &
           np_cols, na, q, d, e, &
           mpi_comm_all, mpi_comm_rows, mpi_comm_cols,&
           usegpu, wantdebug, success, max_threads)

      if (.not.(success)) then
        call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
        return
      endif

      deallocate(limits,l_col,p_col,l_col_bc,p_col_bc, stat=istat, errmsg=errormessage)
      check_deallocate("solve_tridi: limits, l_col, p_col, l_col_bc, p_col_bc", istat, errormessage)

      call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
      return

      ! The section below is disabled by the preprocessor. It is an internal
      ! version of merge_recursive_; the call above presumably resolves to the
      ! routine provided by module merge_recursive instead.
#if 0
      contains
        recursive subroutine merge_recursive_&
                  &precision_and_suffix &
           (obj, np_off, nprocs, usegpu, wantdebug, success)
           use precision
           use elpa_abstract_impl
           use merge_systems
           implicit none

           ! noff is always a multiple of nblk_ev
           ! nlen-noff is always > nblk_ev

           class(elpa_abstract_impl_t), intent(inout) :: obj
           integer(kind=ik)     :: np_off, nprocs
           integer(kind=ik)     :: np1, np2, noff, nlen, nmid, n
           logical, intent(in)  :: useGPU, wantDebug
           logical, intent(out) :: success

           success = .true.

           if (nprocs<=1) then
             ! Safety check only
             if (wantdebug) write(error_unit,*) "ELPA1_merge_recursive: INTERNAL error merge_recursive: nprocs=",nprocs
             success = .false.
             return
           endif
           ! Split problem into 2 subproblems of size np1 / np2

           np1 = nprocs/2
           np2 = nprocs-np1

           if (np1 > 1) call merge_recursive_&
                        &precision_and_suffix &
           (obj, np_off, np1, usegpu, wantdebug, success)
           if (.not.(success)) return
           if (np2 > 1) call merge_recursive_&
                        &precision_and_suffix &
           (obj, np_off+np1, np2, usegpu, wantdebug, success)
           if (.not.(success)) return

           noff = limits(np_off)
           nmid = limits(np_off+np1) - noff
           nlen = limits(np_off+nprocs) - noff

#ifdef WITH_MPI
           call obj%timer%start("mpi_communication")
           if (my_pcol==np_off) then
             do n=np_off+np1,np_off+nprocs-1
               call mpi_send(d(noff+1), int(nmid,kind=mpi_kind), mpi_real_precision, int(n,kind=mpi_kind), 1_mpi_kind, &
                             int(mpi_comm_cols,kind=mpi_kind), mpierr)
             enddo
           endif
           call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */

           if (my_pcol>=np_off+np1 .and. my_pcol<np_off+nprocs) then
#ifdef WITH_MPI
             call obj%timer%start("mpi_communication")
             call mpi_recv(d(noff+1), int(nmid,kind=mpi_kind), mpi_real_precision, int(np_off,kind=mpi_kind), 1_mpi_kind, &
                           int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
             call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
!             d(noff+1:noff+1+nmid-1) = d(noff+1:noff+1+nmid-1)
#endif /* WITH_MPI */
           endif

           if (my_pcol==np_off+np1) then
             do n=np_off,np_off+np1-1
#ifdef WITH_MPI
               call obj%timer%start("mpi_communication")
               call mpi_send(d(noff+nmid+1), int(nlen-nmid,kind=mpi_kind), mpi_real_precision, int(n,kind=mpi_kind), &
                             1_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpierr)
               call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */

             enddo
           endif
           if (my_pcol>=np_off .and. my_pcol<np_off+np1) then
#ifdef WITH_MPI
             call obj%timer%start("mpi_communication")
             call mpi_recv(d(noff+nmid+1), int(nlen-nmid,kind=mpi_kind), mpi_real_precision, int(np_off+np1,kind=mpi_kind), &
                           1_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
             call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
!             d(noff+nmid+1:noff+nmid+1+nlen-nmid-1) = d(noff+nmid+1:noff+nmid+1+nlen-nmid-1)
#endif /* WITH_MPI */
           endif
           if (nprocs == np_cols) then

             ! Last merge, result distribution must be block cyclic, noff==0,
             ! p_col_bc is set so that only nev eigenvalues are calculated
             call merge_systems_&
                  &precision &
                                 (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
                                 nblk, matrixcols, int(mpi_comm_rows,kind=ik), int(mpi_comm_cols,kind=ik), &
                                 l_col, p_col, &
                                 l_col_bc, p_col_bc, np_off, nprocs, usegpu, wantdebug, success, max_threads )
             if (.not.(success)) return
           else
             ! Not last merge, leave dense column distribution
             call merge_systems_&
                  &precision &
                                (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
                                 nblk, matrixcols, int(mpi_comm_rows,kind=ik), int(mpi_comm_cols,kind=ik), &
                                 l_col(noff+1), p_col(noff+1), &
                                 l_col(noff+1), p_col(noff+1), np_off, nprocs, usegpu, wantdebug, success, max_threads )
             if (.not.(success)) return
           endif
       end subroutine merge_recursive_&
           &precision_and_suffix
#endif

    end subroutine solve_tridi_&
        &precision_and_suffix


    ! Solves the symmetric, tridiagonal eigenvalue problem on one processor column
    ! with the divide and conquer method.
    ! Works best if the number of processor rows is a power of 2!
    !
    ! The problem of size na is cut into ndiv sub problems (ndiv is a power of 2
    ! and, apart from the special case for a single process row, not larger than
    ! np_rows). The sub problems are solved with solve_tridi_single_problem_ and
    ! then merged pairwise with merge_systems_.
    !
    ! Arguments:
    !   obj            ELPA object; used for the timers and to read the option
    !                  "nbc_row_solve_tridi"
    !   na             size of the problem (length of d and e)
    !   nev            number of eigenvectors needed; output columns nev+1..na
    !                  are marked as not needed in the last merge
    !   nqoff          offset added to the row position in q
    !   d, e           diagonal / off-diagonal, modified in place
    !   q, ldq         local part of the eigenvector matrix and its leading dimension
    !   nblk           forwarded to distribute_global_column_ and merge_systems_
    !   matrixcols     number of local columns of q
    !   mpi_comm_rows  communicator of the process rows
    !   usegpu         forwarded to merge_systems_
    !   wantdebug      forwarded to the called routines
    !   success        .false. if reading the option or one of the called
    !                  routines fails
    !   max_threads    forwarded to merge_systems_
    subroutine solve_tridi_col_&
    &precision_and_suffix &
      ( obj, na, nev, nqoff, d, e, q, ldq, nblk, matrixcols, mpi_comm_rows, usegpu, wantdebug, success, max_threads )

      use precision
      use elpa_abstract_impl
      use elpa_mpi
      use merge_systems
      use elpa_utilities
      use distribute_global_column
      implicit none
      class(elpa_abstract_impl_t), intent(inout) :: obj

      integer(kind=ik)              :: na, nev, nqoff, ldq, nblk, matrixCols, mpi_comm_rows
      real(kind=real_datatype)      :: d(na), e(na)
#ifdef USE_ASSUMED_SIZE
      real(kind=real_datatype)      :: q(ldq,*)
#else
      real(kind=real_datatype)      :: q(ldq,matrixcols)
#endif

      integer(kind=ik), parameter   :: min_submatrix_size = 16 ! Minimum size of the submatrices to be used

      real(kind=real_datatype), allocatable    :: qmat1(:,:), qmat2(:,:)
      integer(kind=ik)              :: i, n, np
      integer(kind=ik)              :: ndiv, noff, nmid, nlen, max_size
      integer(kind=ik)              :: my_prow, np_rows
      integer(kind=MPI_KIND)        :: mpierr, my_prowMPI, np_rowsMPI

      integer(kind=ik), allocatable :: limits(:), l_col(:), p_col_i(:), p_col_o(:)
      logical, intent(in)           :: useGPU, wantDebug
      logical, intent(out)          :: success
      integer(kind=ik)              :: istat
      character(200)                :: errorMessage

      integer(kind=ik), intent(in)  :: max_threads

      integer(kind=MPI_KIND)        :: bcast_request1, bcast_request2
      logical                       :: useNonBlockingCollectivesRows
      integer(kind=c_int)           :: non_blocking_collectives, error

      success = .true.

      call obj%timer%start("solve_tridi_col" // precision_suffix)

      call obj%get("nbc_row_solve_tridi", non_blocking_collectives, error)
      if (error .ne. elpa_ok) then
        write(error_unit,*) "Problem setting option for non blocking collectives for rows in solve_tridi. Aborting..."
        success = .false.
        ! Do not leave the timer of this routine running on the error path
        call obj%timer%stop("solve_tridi_col" // precision_suffix)
        return
      endif

      usenonblockingcollectivesrows = (non_blocking_collectives .eq. 1)

      call obj%timer%start("mpi_communication")
      call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind), my_prowmpi, mpierr)
      call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind), np_rowsmpi, mpierr)

      my_prow = int(my_prowmpi,kind=c_int)
      np_rows = int(np_rowsmpi,kind=c_int)
      call obj%timer%stop("mpi_communication")

      ! Calculate the number of subdivisions needed.

      n = na
      ndiv = 1
      do while(2*ndiv<=np_rows .and. n>2*min_submatrix_size)
        n = ((n+3)/4)*2 ! the bigger one of the two halves, we want EVEN boundaries
        ndiv = ndiv*2
      enddo

      ! If there is only 1 processor row and not all eigenvectors are needed
      ! and the matrix size is big enough, then use 2 subdivisions
      ! so that merge_systems is called once and only the needed
      ! eigenvectors are calculated for the final problem.

      if (np_rows==1 .and. nev<na .and. na>2*min_submatrix_size) ndiv = 2

      allocate(limits(0:ndiv), stat=istat, errmsg=errormessage)
      check_allocate("solve_tridi_col: limits", istat, errormessage)

      limits(0) = 0
      limits(ndiv) = na

      ! Fill in the inner boundaries by repeatedly halving the intervals
      n = ndiv
      do while(n>1)
        n = n/2 ! n is always a power of 2
        do i=0,ndiv-1,2*n
          ! We want to have even boundaries (for cache line alignments)
          limits(i+n) = limits(i) + ((limits(i+2*n)-limits(i)+3)/4)*2
        enddo
      enddo

      ! Calculate the maximum size of a subproblem

      max_size = 0
      do i=1,ndiv
        max_size = max(max_size,limits(i)-limits(i-1))
      enddo

      ! Subdivide matrix by subtracting rank 1 modifications

      do i=1,ndiv-1
        n = limits(i)
        d(n) = d(n)-abs(e(n))
        d(n+1) = d(n+1)-abs(e(n))
      enddo

      if (np_rows==1)    then

        ! For 1 processor row there may be 1 or 2 subdivisions
        do n=0,ndiv-1
          noff = limits(n)        ! Start of subproblem
          nlen = limits(n+1)-noff ! Size of subproblem

          call solve_tridi_single_problem_&
          &precision_and_suffix &
                                  (obj, nlen,d(noff+1),e(noff+1), &
                                    q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantdebug, success)

          if (.not.(success)) then
            call obj%timer%stop("solve_tridi_col" // precision_suffix)
            return
          endif
        enddo

      else

        ! Solve sub problems in parallel with solve_tridi_single
        ! There is at maximum 1 subproblem per processor

        allocate(qmat1(max_size,max_size), stat=istat, errmsg=errormessage)
        check_allocate("solve_tridi_col: qmat1", istat, errormessage)

        allocate(qmat2(max_size,max_size), stat=istat, errmsg=errormessage)
        check_allocate("solve_tridi_col: qmat2", istat, errormessage)

        qmat1 = 0 ! Make sure that all elements are defined

        if (my_prow < ndiv) then

          noff = limits(my_prow)        ! Start of subproblem
          nlen = limits(my_prow+1)-noff ! Size of subproblem
          call solve_tridi_single_problem_&
          &precision_and_suffix &
                                    (obj, nlen,d(noff+1),e(noff+1),qmat1, &
                                    ubound(qmat1,dim=1), wantdebug, success)

          if (.not.(success)) then
            call obj%timer%stop("solve_tridi_col" // precision_suffix)
            return
          endif
        endif

        ! Fill eigenvectors in qmat1 into global matrix q
        ! (process row np is the broadcast root for sub problem np)

        do np = 0, ndiv-1

          noff = limits(np)
          nlen = limits(np+1)-noff
#ifdef WITH_MPI
          if (usenonblockingcollectivesrows) then
            call obj%timer%start("mpi_nbc_communication")
            call mpi_ibcast(d(noff+1), int(nlen,kind=mpi_kind), mpi_real_precision, int(np,kind=mpi_kind), &
                         int(mpi_comm_rows,kind=mpi_kind), bcast_request1, mpierr)
            call mpi_wait(bcast_request1, mpi_status_ignore, mpierr)

            qmat2 = qmat1
            call mpi_ibcast(qmat2, int(max_size*max_size,kind=mpi_kind), mpi_real_precision, int(np,kind=mpi_kind), &
                         int(mpi_comm_rows,kind=mpi_kind), bcast_request2, mpierr)
            call mpi_wait(bcast_request2, mpi_status_ignore, mpierr)
            call obj%timer%stop("mpi_nbc_communication")
          else
            call obj%timer%start("mpi_communication")
            call mpi_bcast(d(noff+1), int(nlen,kind=mpi_kind), mpi_real_precision, int(np,kind=mpi_kind), &
                         int(mpi_comm_rows,kind=mpi_kind), mpierr)

            qmat2 = qmat1
            call mpi_bcast(qmat2, int(max_size*max_size,kind=mpi_kind), mpi_real_precision, int(np,kind=mpi_kind), &
                         int(mpi_comm_rows,kind=mpi_kind), mpierr)
            call obj%timer%stop("mpi_communication")
          endif
#else /* WITH_MPI */
!          qmat2 = qmat1 ! is this correct
#endif /* WITH_MPI */
          do i=1,nlen

#ifdef WITH_MPI
            call distribute_global_column_&
            &precision &
                     (obj, qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#else /* WITH_MPI */
            call distribute_global_column_&
            &precision &
                     (obj, qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#endif /* WITH_MPI */
          enddo

        enddo

        deallocate(qmat1, qmat2, stat=istat, errmsg=errormessage)
        check_deallocate("solve_tridi_col: qmat1, qmat2", istat, errormessage)

      endif

      ! Allocate and set index arrays l_col and p_col

      allocate(l_col(na), p_col_i(na),  p_col_o(na), stat=istat, errmsg=errormessage)
      check_allocate("solve_tridi_col: l_col, p_col_i, p_col_o", istat, errormessage)

      do i=1,na
        l_col(i) = i
        p_col_i(i) = 0
        p_col_o(i) = 0
      enddo

      ! Merge subproblems

      n = 1
      do while(n<ndiv) ! if ndiv==1, the problem was solved by single call to solve_tridi_single

        do i=0,ndiv-1,2*n

          noff = limits(i)
          nmid = limits(i+n) - noff
          nlen = limits(i+2*n) - noff

          if (nlen == na) then
            ! Last merge, set p_col_o=-1 for unneeded (output) eigenvectors
            p_col_o(nev+1:na) = -1
          endif
          call merge_systems_&
          &precision &
                              (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, &
                               matrixcols, int(mpi_comm_rows,kind=ik), int(mpi_comm_self,kind=ik), &
                               l_col(noff+1), p_col_i(noff+1), &
                               l_col(noff+1), p_col_o(noff+1), 0, 1, usegpu, wantdebug, success, max_threads)
          if (.not.(success)) then
            call obj%timer%stop("solve_tridi_col" // precision_suffix)
            return
          endif

        enddo

        n = 2*n

      enddo

      deallocate(limits, l_col, p_col_i, p_col_o, stat=istat, errmsg=errormessage)
      check_deallocate("solve_tridi_col: limits, l_col, p_col_i, p_col_o", istat, errormessage)

      call obj%timer%stop("solve_tridi_col" // precision_suffix)

    end subroutine solve_tridi_col_&
    &precision_and_suffix


    ! Solves the symmetric, tridiagonal eigenvalue problem on a single processor.
    ! Takes precautions if DSTEDC fails or if the eigenvalues are not ordered correctly.
    !
    ! Arguments:
    !   obj        ELPA object; used here for the timers only
    !   nlen       size of the problem
    !   d          on entry the diagonal, on successful exit the eigenvalues
    !              after the reordering pass at the end of this routine
    !   e          off-diagonal; handed to the LAPACK routines and overwritten
    !   q          receives the eigenvectors (the LAPACK routines are called
    !              with compz = 'I'); columns are permuted together with d
    !   ldq        leading dimension of q
    !   wantdebug  if .true., a failure of the fallback solver is reported
    !   success    .false. if both LAPACK routines return info /= 0
    subroutine solve_tridi_single_problem_&
    &precision_and_suffix &
    (obj, nlen, d, e, q, ldq, wantdebug, success)

     use precision
     use elpa_abstract_impl
     use elpa_blas_interfaces
     use elpa_utilities
     implicit none
     class(elpa_abstract_impl_t), intent(inout) :: obj
     integer(kind=ik), intent(in)             :: nlen, ldq
     real(kind=real_datatype), intent(inout)  :: d(nlen), e(nlen), q(ldq,nlen)

     real(kind=real_datatype), allocatable    :: work(:), qtmp(:), ds(:), es(:)
     real(kind=real_datatype)                 :: dtmp

     integer(kind=ik)              :: i, j, lwork, liwork, info
     integer(kind=BLAS_KIND)       :: infoBLAS
     integer(kind=ik), allocatable :: iwork(:)

     logical, intent(in)           :: wantDebug
     logical, intent(out)          :: success
     integer(kind=ik)              :: istat
     character(200)                :: errorMessage

     call obj%timer%start("solve_tridi_single" // precision_suffix)

     success = .true.
     allocate(ds(nlen), es(nlen), stat=istat, errmsg=errormessage)
     check_allocate("solve_tridi_single: ds, es", istat, errormessage)

     ! Save d and e for the case that dstedc fails

     ds(:) = d(:)
     es(:) = e(:)

     ! First try dstedc, this is normally faster but it may fail sometimes (why???)

     lwork = 1 + 4*nlen + nlen**2
     liwork =  3 + 5*nlen
     allocate(work(lwork), iwork(liwork), stat=istat, errmsg=errormessage)
     check_allocate("solve_tridi_single: work, iwork", istat, errormessage)
     call obj%timer%start("lapack")
     call precision_stedc('I', int(nlen,kind=blas_kind), d, e, q, int(ldq,kind=blas_kind),    &
                          work, int(lwork,kind=blas_kind), int(iwork,kind=blas_kind), int(liwork,kind=blas_kind), &
                          infoblas)
     info = int(infoblas,kind=ik)
     call obj%timer%stop("lapack")

     if (info /= 0) then

       ! DSTEDC failed, try DSTEQR. The workspace is enough for DSTEQR.

       write(error_unit,'(a,i8,a)') 'Warning: Lapack routine DSTEDC failed, info= ',info,', Trying DSTEQR!'

       ! Restore the input that the failed call may have overwritten
       d(:) = ds(:)
       e(:) = es(:)
       call obj%timer%start("lapack")
       call precision_steqr('I', int(nlen,kind=blas_kind), d, e, q, int(ldq,kind=blas_kind), work, infoblas )
       info = int(infoblas,kind=ik)
       call obj%timer%stop("lapack")

       ! If DSTEQR fails as well, there is nothing further to try

       if (info /= 0) then
         if (wantdebug) then
           write(error_unit,'(a,i8,a)') 'ELPA1_solve_tridi_single: ERROR: Lapack routine DSTEQR failed, info= ',info,', Aborting!'
         endif
         success = .false.
         ! Do not leave the timer of this routine running on the error path;
         ! the allocatable work arrays are released automatically on return.
         call obj%timer%stop("solve_tridi_single" // precision_suffix)
         return
       endif
     end if

     deallocate(work,iwork,ds,es, stat=istat, errmsg=errormessage)
     check_deallocate("solve_tridi_single: work, iwork, ds, es", istat, errormessage)

     ! Check if eigenvalues are monotonically increasing
     ! This seems to be not always the case  (in the IBM implementation of dstedc ???)

     do i=1,nlen-1
       if (d(i+1)<d(i)) then
         ! Relative difference test written as a product, so that
         ! d(i+1) = -d(i) does not cause a division by zero.
#ifdef DOUBLE_PRECISION_REAL
         if (abs(d(i+1) - d(i)) > 1e-14_rk8 * abs(d(i+1) + d(i))) then
#else
         if (abs(d(i+1) - d(i)) > 1e-14_rk4 * abs(d(i+1) + d(i))) then
#endif
           write(error_unit,'(a,i8,2g25.16)') '***WARNING: Monotony error dste**:',i+1,d(i),d(i+1)
         else
           write(error_unit,'(a,i8,2g25.16)') 'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1)
           write(error_unit,'(a)') 'The eigenvalues from a lapack call are not sorted to machine precision.'
           write(error_unit,'(a)') 'In this extent, this is completely harmless.'
           write(error_unit,'(a)') 'Still, we keep this info message just in case.'
         end if
         allocate(qtmp(nlen), stat=istat, errmsg=errormessage)
         check_allocate("solve_tridi_single: qtmp", istat, errormessage)

         ! Insertion step: move eigenvalue i+1 and its eigenvector down to
         ! the position where it belongs, shifting the larger ones up by one.
         dtmp = d(i+1)
         qtmp(1:nlen) = q(1:nlen,i+1)
         do j=i,1,-1
           if (dtmp<d(j)) then
             d(j+1)        = d(j)
             q(1:nlen,j+1) = q(1:nlen,j)
           else
             exit ! Loop
           endif
         enddo
         ! j is the last position that was not shifted (0 if all were shifted)
         d(j+1)        = dtmp
         q(1:nlen,j+1) = qtmp(1:nlen)
         deallocate(qtmp, stat=istat, errmsg=errormessage)
         check_deallocate("solve_tridi_single: qtmp", istat, errormessage)

       endif
     enddo
     call obj%timer%stop("solve_tridi_single" // precision_suffix)

    end subroutine solve_tridi_single_problem_&
    &precision_and_suffix


distribute_global_column
Definition mod_distribute_global_column.F90:55

elpa_abstract_impl
Fortran module to provide an abstract definition of the implementation. Do not use directly....
Definition elpa_abstract_impl.F90:50

merge_recursive
Definition mod_merge_recursive.F90:3

merge_systems
Definition mod_merge_systems.F90:3

elpa_abstract_impl::elpa_abstract_impl_t
Definition elpa_abstract_impl.F90:73