ELPA-2024.05.001/html/merge__systems__template_8F90_source.html

#if 0

!    This file is part of ELPA.

!

!    The ELPA library was originally created by the ELPA consortium,

!    consisting of the following organizations:

!

!    - Max Planck Computing and Data Facility (MPCDF), formerly known as

!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),

!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte

!      Informatik,

!    - Technische Universität München, Lehrstuhl für Informatik mit

!      Schwerpunkt Wissenschaftliches Rechnen,

!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,

!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,

!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,

!      and

!    - IBM Deutschland GmbH

!

!    This particular source code file contains additions, changes and

!    enhancements authored by Intel Corporation which is not part of

!    the ELPA consortium.

!

!    More information can be found here:

!    http://elpa.mpcdf.mpg.de/

!

!    ELPA is free software: you can redistribute it and/or modify

!    it under the terms of the version 3 of the license of the

!    GNU Lesser General Public License as published by the Free

!    Software Foundation.

!

!    ELPA is distributed in the hope that it will be useful,

!    but WITHOUT ANY WARRANTY; without even the implied warranty of

!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

!    GNU Lesser General Public License for more details.

!

!    You should have received a copy of the GNU Lesser General Public License

!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>

!

!    ELPA reflects a substantial effort on the part of the original

!    ELPA consortium, and we ask you to respect the spirit of the

!    license that we chose: i.e., please contribute any changes you

!    may have back to the original ELPA library distribution, and keep

!    any derivatives of ELPA under the same license that we chose for

!    the original distribution, the GNU Lesser General Public License.

!

!

! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines

!

! Copyright of the original code rests with the authors inside the ELPA

! consortium. The copyright of any additional modifications shall rest

! with their original authors, but shall adhere to the licensing terms

! distributed along with the original code in the file "COPYING".

#endif


#include "../general/sanity.F90"

#include "../general/error_checking.inc"


    subroutine merge_systems_&

    &precision &

                         (obj, na, nm, d, e, q, ldq, nqoff, nblk, matrixcols, mpi_comm_rows, mpi_comm_cols, &

                          l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, usegpu, wantdebug, success, max_threads)

      use elpa_gpu

      use, intrinsic :: iso_c_binding

      use precision

      use elpa_abstract_impl

      use elpa_blas_interfaces

      use global_product

      use global_gather

      use resort_ev

      use transform_columns

      use check_monotony

      use add_tmp

      use v_add_s

      use elpa_utilities

      use elpa_mpi

      use solve_secular_equation

#ifdef WITH_OPENMP_TRADITIONAL

      use omp_lib

#endif

      implicit none

#include "../general/precision_kinds.F90"

      class(elpa_abstract_impl_t), intent(inout)  :: obj

      integer(kind=ik), intent(in)                :: na, nm, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, &

                                                     mpi_comm_cols, npc_0, npc_n

      integer(kind=ik), intent(in)                :: l_col(na), p_col(na), l_col_out(na), p_col_out(na)

      real(kind=real_datatype), intent(inout)     :: d(na), e

#ifdef USE_ASSUMED_SIZE

      real(kind=real_datatype), intent(inout)     :: q(ldq,*)

#else

      real(kind=real_datatype), intent(inout)     :: q(ldq,matrixcols)

#endif

      logical, intent(in)                         :: useGPU, wantDebug


      logical, intent(out)                        :: success


      ! TODO: play with max_strip. If it was larger, matrices being multiplied

      ! might be larger as well!

      integer(kind=ik), parameter                 :: max_strip=128


      real(kind=real_datatype)                    :: beta, sig, s, c, t, tau, rho, eps, tol, &

                                                     qtrans(2,2), dmax, zmax, d1new, d2new

      real(kind=real_datatype)                    :: z(na), d1(na), d2(na), z1(na), delta(na),  &

                                                     dbase(na), ddiff(na), ev_scale(na), tmp(na)

      real(kind=real_datatype)                    :: d1u(na), zu(na), d1l(na), zl(na)

      real(kind=real_datatype), allocatable       :: qtmp1(:,:), qtmp2(:,:), ev(:,:)

#ifdef WITH_OPENMP_TRADITIONAL

      real(kind=real_datatype), allocatable       :: z_p(:,:)

#endif


      integer(kind=ik)                            :: i, j, k, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &

                                                     l_rqm, ns, info

      integer(kind=BLAS_KIND)                     :: infoBLAS

      integer(kind=ik)                            :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &

                                                     l_cols_qreorg, np, l_idx, nqcols1, nqcols2

      integer(kind=ik)                            :: my_proc, n_procs, my_prow, my_pcol, np_rows, &

                                                     np_cols

      integer(kind=MPI_KIND)                      :: mpierr

      integer(kind=MPI_KIND)                      :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI

      integer(kind=ik)                            :: np_next, np_prev, np_rem

      integer(kind=ik)                            :: idx(na), idx1(na), idx2(na)

      integer(kind=BLAS_KIND)                     :: idxBLAS(NA)

      integer(kind=ik)                            :: coltyp(na), idxq1(na), idxq2(na)


      integer(kind=ik)                            :: istat

      character(200)                              :: errorMessage

      integer(kind=ik)                            :: gemm_dim_k, gemm_dim_l, gemm_dim_m


      integer(kind=c_intptr_t)                    :: num

      integer(kind=C_intptr_T)                    :: qtmp1_dev, qtmp2_dev, ev_dev

      logical                                     :: successGPU

      integer(kind=c_intptr_t), parameter         :: size_of_datatype = size_of_&

                                                                      &precision&

                                                                      &_real

      integer(kind=c_intptr_t)                    :: gpuHandle

      integer(kind=ik), intent(in)                :: max_threads

      integer(kind=c_intptr_t)                    :: my_stream

#ifdef WITH_OPENMP_TRADITIONAL

      integer(kind=ik)                            :: my_thread


      allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errormessage)

      check_allocate("merge_systems: z_p",istat, errormessage)

#endif


      call obj%timer%start("merge_systems" // precision_suffix)

      success = .true.


      call obj%timer%start("mpi_communication")

      call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind) ,my_prowmpi, mpierr)

      call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind) ,np_rowsmpi, mpierr)

      call mpi_comm_rank(int(mpi_comm_cols,kind=mpi_kind) ,my_pcolmpi, mpierr)

      call mpi_comm_size(int(mpi_comm_cols,kind=mpi_kind) ,np_colsmpi, mpierr)


      my_prow = int(my_prowmpi,kind=c_int)

      np_rows = int(np_rowsmpi,kind=c_int)

      my_pcol = int(my_pcolmpi,kind=c_int)

      np_cols = int(np_colsmpi,kind=c_int)


      call obj%timer%stop("mpi_communication")


      ! If my processor column isn't in the requested set, do nothing


      if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then

        call obj%timer%stop("merge_systems" // precision_suffix)

        return

      endif

      ! Determine number of "next" and "prev" column for ring sends


      if (my_pcol == npc_0+npc_n-1) then

        np_next = npc_0

      else

        np_next = my_pcol + 1

      endif


      if (my_pcol == npc_0) then

        np_prev = npc_0+npc_n-1

      else

        np_prev = my_pcol - 1

      endif

      call check_monotony_&

      &precision&

      &(obj, nm,d,'Input1',wantdebug, success)

      if (.not.(success)) then

        call obj%timer%stop("merge_systems" // precision_suffix)

        return

      endif

      call check_monotony_&

      &precision&

      &(obj,na-nm,d(nm+1),'Input2',wantdebug, success)

      if (.not.(success)) then

        call obj%timer%stop("merge_systems" // precision_suffix)

        return

      endif

      ! Get global number of processors and my processor number.

      ! Please note that my_proc does not need to match any real processor number,

      ! it is just used for load balancing some loops.


      n_procs = np_rows*npc_n

      my_proc = my_prow*npc_n + (my_pcol-npc_0) ! Row major


      ! Local limits of the rows of Q


      l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1) ! First row of Q

      l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1) ! Last row <= nm

      l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1) ! Last row of Q


      l_rnm  = l_rqm-l_rqs+1 ! Number of local rows <= nm

      l_rows = l_rqe-l_rqs+1 ! Total number of local rows


      ! My number of local columns


      l_cols = count(p_col(1:na)==my_pcol)


      ! Get max number of local columns


      max_local_cols = 0

      do np = npc_0, npc_0+npc_n-1

        max_local_cols = max(max_local_cols,count(p_col(1:na)==np))

      enddo


      ! Calculations start here


      beta = abs(e)

      sig  = sign(1.0_rk,e)


      ! Calculate rank-1 modifier z


      z(:) = 0


      if (mod((nqoff+nm-1)/nblk,np_rows)==my_prow) then

        ! nm is local on my row

        do i = 1, na

          if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i))

         enddo

      endif


      if (mod((nqoff+nm)/nblk,np_rows)==my_prow) then

        ! nm+1 is local on my row

        do i = 1, na

          if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i))

        enddo

      endif


      call global_gather_&

      &precision&

      &(obj, z, na, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)

      if (.not.(success)) then

        write(error_unit,*) "Error in global_gather. ABorting"

        success = .false.

        return

      endif

      ! Normalize z so that norm(z) = 1.  Since z is the concatenation of

      ! two normalized vectors, norm2(z) = sqrt(2).

      z = z/sqrt(2.0_rk)

      rho = 2.0_rk*beta

      ! Calculate index for merging both systems by ascending eigenvalues

      call obj%timer%start("lapack")

      call precision_lamrg( int(nm,kind=blas_kind), int(na-nm,kind=blas_kind), d, &

                            1_blas_kind, 1_blas_kind, idxblas )

      idx(:) = int(idxblas(:),kind=ik)

      call obj%timer%stop("lapack")


      ! Calculate the allowable deflation tolerance


      zmax = maxval(abs(z))

      dmax = maxval(abs(d))

      eps = precision_lamch( 'E' ) ! return epsilon

      tol = 8.0_rk*eps*max(dmax,zmax)


      ! If the rank-1 modifier is small enough, no more needs to be done

      ! except to reorganize D and Q


      IF ( rho*zmax <= tol ) THEN


        ! Rearrange eigenvalues


        tmp = d

        do i=1,na

          d(i) = tmp(idx(i))

        enddo


        ! Rearrange eigenvectors

        call resort_ev_&

        &precision &

                       (obj, idx, na, na, p_col_out, q, ldq, matrixcols, l_rows, l_rqe, &

                        l_rqs, mpi_comm_cols, p_col, l_col, l_col_out)


        call obj%timer%stop("merge_systems" // precision_suffix)


        return

      ENDIF


      ! Merge and deflate system


      na1 = 0

      na2 = 0


      ! COLTYP:

      ! 1 : non-zero in the upper half only;

      ! 2 : dense;

      ! 3 : non-zero in the lower half only;

      ! 4 : deflated.


      coltyp(1:nm) = 1

      coltyp(nm+1:na) = 3


      do i=1,na


        if (rho*abs(z(idx(i))) <= tol) then


          ! Deflate due to small z component.


          na2 = na2+1

          d2(na2)   = d(idx(i))

          idx2(na2) = idx(i)

          coltyp(idx(i)) = 4


        else if (na1>0) then


          ! Check if eigenvalues are close enough to allow deflation.


          s = z(idx(i))

          c = z1(na1)


          ! Find sqrt(a**2+b**2) without overflow or

          ! destructive underflow.

          tau = precision_lapy2( c, s )

          t = d1(na1) - d(idx(i))

          c = c / tau

          s = -s / tau

          IF ( abs( t*c*s ) <= tol ) THEN


            ! Deflation is possible.


            na2 = na2+1


            z1(na1) = tau


            d2new = d(idx(i))*c**2 + d1(na1)*s**2

            d1new = d(idx(i))*s**2 + d1(na1)*c**2


            ! D(idx(i)) >= D1(na1) and C**2 + S**2 == 1.0

            ! This means that after the above transformation it must be

            !    D1(na1) <= d1new <= D(idx(i))

            !    D1(na1) <= d2new <= D(idx(i))

            !

            ! D1(na1) may get bigger but it is still smaller than the next D(idx(i+1))

            ! so there is no problem with sorting here.

            ! d2new <= D(idx(i)) which means that it might be smaller than D2(na2-1)

            ! which makes a check (and possibly a resort) necessary.

            !

            ! The above relations may not hold exactly due to numeric differences

            ! so they have to be enforced in order not to get troubles with sorting.


            if (d1new<d1(na1)  ) d1new = d1(na1)

            if (d1new>d(idx(i))) d1new = d(idx(i))


            if (d2new<d1(na1)  ) d2new = d1(na1)

            if (d2new>d(idx(i))) d2new = d(idx(i))


            d1(na1) = d1new


            do j=na2-1,1,-1

              if (d2new<d2(j)) then

                d2(j+1)   = d2(j)

                idx2(j+1) = idx2(j)

              else

                exit ! Loop

              endif

            enddo


            d2(j+1)   = d2new

            idx2(j+1) = idx(i)


            qtrans(1,1) = c; qtrans(1,2) =-s

            qtrans(2,1) = s; qtrans(2,2) = c

            call transform_columns_&

            &precision &

                        (obj, idx(i), idx1(na1), na, tmp, l_rqs, l_rqe, &

                         q, ldq, matrixcols, l_rows, mpi_comm_cols, &

                          p_col, l_col, qtrans)

            if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2

            if (coltyp(idx(i))==3 .and. coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2


            coltyp(idx(i)) = 4


          else

            na1 = na1+1

            d1(na1) = d(idx(i))

            z1(na1) = z(idx(i))

            idx1(na1) = idx(i)

          endif

        else

          na1 = na1+1

          d1(na1) = d(idx(i))

          z1(na1) = z(idx(i))

          idx1(na1) = idx(i)

        endif


      enddo

      call check_monotony_&

      &precision&

      &(obj, na1,d1,'Sorted1', wantdebug, success)

      if (.not.(success)) then

        call obj%timer%stop("merge_systems" // precision_suffix)

        return

      endif

      call check_monotony_&

      &precision&

      &(obj, na2,d2,'Sorted2', wantdebug, success)

      if (.not.(success)) then

        call obj%timer%stop("merge_systems" // precision_suffix)

        return

      endif


      if (na1==1 .or. na1==2) then

        ! if(my_proc==0) print *,'--- Remark solve_tridi: na1==',na1,' proc==',myid


        if (na1==1) then

          d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation

        else ! na1==2

          call obj%timer%start("lapack")

          call precision_laed5(1_blas_kind, d1, z1, qtrans(1,1), rho, d(1))

          call precision_laed5(2_blas_kind, d1, z1, qtrans(1,2), rho, d(2))

          call obj%timer%stop("lapack")

          call transform_columns_&

          &precision&

          &(obj, idx1(1), idx1(2), na, tmp, l_rqs, l_rqe, q, &

            ldq, matrixcols, l_rows, mpi_comm_cols, &

             p_col, l_col, qtrans)


        endif


        ! Add the deflated eigenvalues

        d(na1+1:na) = d2(1:na2)


        ! Calculate arrangement of all eigenvalues  in output

        call obj%timer%start("lapack")

        call precision_lamrg( int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &

                              1_blas_kind, 1_blas_kind, idxblas )

        idx(:) = int(idxblas(:),kind=ik)

        call obj%timer%stop("lapack")

        ! Rearrange eigenvalues


        tmp = d

        do i=1,na

          d(i) = tmp(idx(i))

        enddo


        ! Rearrange eigenvectors


        do i=1,na

          if (idx(i)<=na1) then

            idxq1(i) = idx1(idx(i))

          else

            idxq1(i) = idx2(idx(i)-na1)

          endif

        enddo

        call resort_ev_&

        &precision&

        &(obj, idxq1, na, na, p_col_out, q, ldq, matrixcols, l_rows, l_rqe, &

          l_rqs, mpi_comm_cols, p_col, l_col, l_col_out)


      else if (na1>2) then


        ! Solve secular equation


        z(1:na1) = 1

#ifdef WITH_OPENMP_TRADITIONAL

        z_p(1:na1,:) = 1

#endif

        dbase(1:na1) = 0

        ddiff(1:na1) = 0


        info = 0

        infoblas = int(info,kind=blas_kind)

!#ifdef WITH_OPENMP_TRADITIONAL

!

!        call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)

!!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,infoBLAS,j)

!        my_thread = omp_get_thread_num()

!!$OMP DO

!#endif

        DO i = my_proc+1, na1, n_procs ! work distributed over all processors

          call obj%timer%start("lapack")

          call precision_laed4(int(na1,kind=blas_kind), int(i,kind=blas_kind), d1, z1, delta, &

                               rho, s, infoblas) ! s is not used!

          info = int(infoblas,kind=ik)

          call obj%timer%stop("lapack")

          if (info/=0) then

            ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2)

            ! use the more stable bisection algorithm in solve_secular_equation

            ! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection'

            call solve_secular_equation_&

            &precision&

            &(obj, na1, i, d1, z1, delta, rho, s)

          endif


          ! Compute updated z


!#ifdef WITH_OPENMP_TRADITIONAL

!          do j=1,na1

!            if (i/=j)  z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) )

!          enddo

!          z_p(i,my_thread) = z_p(i,my_thread)*delta(i)

!#else

          do j=1,na1

            if (i/=j)  z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) )

          enddo

          z(i) = z(i)*delta(i)

!#endif

          ! store dbase/ddiff


          if (i<na1) then

            if (abs(delta(i+1)) < abs(delta(i))) then

              dbase(i) = d1(i+1)

              ddiff(i) = delta(i+1)

            else

              dbase(i) = d1(i)

              ddiff(i) = delta(i)

            endif

          else

            dbase(i) = d1(i)

            ddiff(i) = delta(i)

          endif

        enddo

!#ifdef WITH_OPENMP_TRADITIONAL

!!$OMP END PARALLEL

!

!        call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)

!

!        do i = 0, max_threads-1

!          z(1:na1) = z(1:na1)*z_p(1:na1,i)

!        enddo

!#endif


        call global_product_&

        &precision&

        (obj, z, na1, mpi_comm_rows, mpi_comm_cols, npc_0, npc_n, success)

        if (.not.(success)) then

          write(error_unit,*) "Error in global_product. Aborting..."

          return

        endif

        z(1:na1) = sign( sqrt( abs( z(1:na1) ) ), z1(1:na1) )


        call global_gather_&

        &precision&

        &(obj, dbase, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)

        if (.not.(success)) then

          write(error_unit,*) "Error in global_gather. Aborting..."

          return

        endif

        call global_gather_&

        &precision&

        &(obj, ddiff, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)

        if (.not.(success)) then

          write(error_unit,*) "Error in global_gather. Aborting..."

          return

        endif

        d(1:na1) = dbase(1:na1) - ddiff(1:na1)


        ! Calculate scale factors for eigenvectors

        ev_scale(:) = 0.0_rk


#ifdef WITH_OPENMP_TRADITIONAL


        call obj%timer%start("OpenMP parallel" // precision_suffix)


!$omp PARALLEL DO &

!$omp default(none) &

!$omp private(i) &

!$omp SHARED(na1, my_proc, n_procs,  &

!$OMP d1, dbase, ddiff, z, ev_scale, obj)


#endif

        DO i = my_proc+1, na1, n_procs ! work distributed over all processors


          ! tmp(1:na1) = z(1:na1) / delta(1:na1,i)  ! original code

          ! tmp(1:na1) = z(1:na1) / (d1(1:na1)-d(i))! bad results


          ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i)

          ! in exactly this order, but we want to prevent compiler optimization

!         ev_scale_val = ev_scale(i)

          call add_tmp_&

          &precision&

          &(obj, d1, dbase, ddiff, z, ev_scale(i), na1, i)

!         ev_scale(i) = ev_scale_val

        enddo

#ifdef WITH_OPENMP_TRADITIONAL

!$OMP END PARALLEL DO


        call obj%timer%stop("OpenMP parallel" // precision_suffix)


#endif


        call global_gather_&

        &precision&

        &(obj, ev_scale, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)

        if (.not.(success)) then

          write(error_unit,*) "Error in global_gather. Aborting..."

          return

        endif

        ! Add the deflated eigenvalues

        d(na1+1:na) = d2(1:na2)


        call obj%timer%start("lapack")

        ! Calculate arrangement of all eigenvalues  in output

        call precision_lamrg(int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &

                             1_blas_kind, 1_blas_kind, idxblas )

        idx(:) = int(idxblas(:),kind=ik)

        call obj%timer%stop("lapack")

        ! Rearrange eigenvalues

        tmp = d

        do i=1,na

          d(i) = tmp(idx(i))

        enddo

        call check_monotony_&

        &precision&

        &(obj, na,d,'Output', wantdebug, success)


        if (.not.(success)) then

          call obj%timer%stop("merge_systems" // precision_suffix)

          return

        endif

        ! Eigenvector calculations


        ! Calculate the number of columns in the new local matrix Q

        ! which are updated from non-deflated/deflated eigenvectors.

        ! idxq1/2 stores the global column numbers.


        nqcols1 = 0 ! number of non-deflated eigenvectors

        nqcols2 = 0 ! number of deflated eigenvectors

        DO i = 1, na

          if (p_col_out(i)==my_pcol) then

            if (idx(i)<=na1) then

              nqcols1 = nqcols1+1

              idxq1(nqcols1) = i

            else

              nqcols2 = nqcols2+1

              idxq2(nqcols2) = i

            endif

          endif

        enddo


        gemm_dim_k = max(1,l_rows)

        gemm_dim_l = max_local_cols

        gemm_dim_m = min(max_strip,max(1,nqcols1))


        allocate(qtmp1(gemm_dim_k, gemm_dim_l), stat=istat, errmsg=errormessage)

        check_allocate("merge_systems: qtmp1",istat, errormessage)


        allocate(ev(gemm_dim_l,gemm_dim_m), stat=istat, errmsg=errormessage)

        check_allocate("merge_systems: ev",istat, errormessage)


        allocate(qtmp2(gemm_dim_k, gemm_dim_m), stat=istat, errmsg=errormessage)

        check_allocate("merge_systems: qtmp2",istat, errormessage)


        qtmp1 = 0 ! May contain empty (unset) parts

        qtmp2 = 0 ! Not really needed


        if (usegpu) then

          num = (gemm_dim_k * gemm_dim_l) * size_of_datatype

#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)

          if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then

            successgpu = gpu_host_register(int(loc(qtmp1),kind=c_intptr_t),num,&

                        gpuhostregisterdefault)

            check_host_register_gpu("merge_systems: qtmp1", successgpu)

          endif

#endif


          successgpu = gpu_malloc(qtmp1_dev, num)

          check_alloc_gpu("merge_systems: qtmp1_dev", successgpu)


          num = (gemm_dim_l * gemm_dim_m) * size_of_datatype

#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)

          if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then

            successgpu = gpu_host_register(int(loc(ev),kind=c_intptr_t),num,&

                        gpuhostregisterdefault)

            check_host_register_gpu("merge_systems: ev", successgpu)

          endif

#endif

          successgpu = gpu_malloc(ev_dev, num)

          check_alloc_gpu("merge_systems: ev_dev", successgpu)


          num = (gemm_dim_k * gemm_dim_m) * size_of_datatype

#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)

          if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then

            successgpu = gpu_host_register(int(loc(qtmp2),kind=c_intptr_t),num,&

                        gpuhostregisterdefault)

            check_host_register_gpu("merge_systems: qtmp2", successgpu)

          endif

#endif

          successgpu = gpu_malloc(qtmp2_dev, num)

          check_alloc_gpu("merge_systems: qtmp2_dev", successgpu)

        endif !useGPU


        ! Gather nonzero upper/lower components of old matrix Q

        ! which are needed for multiplication with new eigenvectors


        nnzu = 0

        nnzl = 0

        do i = 1, na1

          l_idx = l_col(idx1(i))

          if (p_col(idx1(i))==my_pcol) then

            if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then

              nnzu = nnzu+1

              qtmp1(1:l_rnm,nnzu) = q(l_rqs:l_rqm,l_idx)

            endif

            if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then

              nnzl = nnzl+1

              qtmp1(l_rnm+1:l_rows,nnzl) = q(l_rqm+1:l_rqe,l_idx)

            endif

          endif

        enddo


        ! Gather deflated eigenvalues behind nonzero components


        ndef = max(nnzu,nnzl)

        do i = 1, na2

          l_idx = l_col(idx2(i))

          if (p_col(idx2(i))==my_pcol) then

            ndef = ndef+1

            qtmp1(1:l_rows,ndef) = q(l_rqs:l_rqe,l_idx)

          endif

        enddo


        l_cols_qreorg = ndef ! Number of columns in reorganized matrix


        ! Set (output) Q to 0, it will sum up new Q


        DO i = 1, na

          if(p_col_out(i)==my_pcol) q(l_rqs:l_rqe,l_col_out(i)) = 0

        enddo


        np_rem = my_pcol


        do np = 1, npc_n

          ! Do a ring send of qtmp1


          if (np > 1) then


            if (np_rem == npc_0) then

              np_rem = npc_0+npc_n-1

            else

              np_rem = np_rem-1

            endif

#ifdef WITH_MPI

            call obj%timer%start("mpi_communication")

            call mpi_sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=mpi_kind), mpi_real_precision,     &

                                        int(np_next,kind=mpi_kind), 1111_mpi_kind, int(np_prev,kind=mpi_kind), &

                                        1111_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)

            call obj%timer%stop("mpi_communication")

#endif /* WITH_MPI */

          endif


          if (usegpu) then

            ! copy back after sendrecv

#ifdef WITH_GPU_STREAMS

            my_stream = obj%gpu_setup%my_stream

            successgpu = gpu_stream_synchronize(my_stream)

            check_stream_synchronize_gpu("tridiag qtmp1_dev", successgpu)


            successgpu = gpu_memcpy_async(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &

                 gemm_dim_k * gemm_dim_l  * size_of_datatype, gpumemcpyhosttodevice, my_stream)

            check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)


            my_stream = obj%gpu_setup%my_stream

            successgpu = gpu_stream_synchronize(my_stream)

            check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)

            ! synchronize streamsPerThread; maybe not necessary

            successgpu = gpu_stream_synchronize()

            check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)


#else

            successgpu = gpu_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &

                 gemm_dim_k * gemm_dim_l  * size_of_datatype, gpumemcpyhosttodevice)

            check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)

#endif

          endif


          ! Gather the parts in d1 and z which are fitting to qtmp1.

          ! This also delivers nnzu/nnzl for proc np_rem


          nnzu = 0

          nnzl = 0

          do i=1,na1

            if (p_col(idx1(i)) == np_rem) then

              if (coltyp(idx1(i)) == 1 .or. coltyp(idx1(i)) == 2) then

                nnzu = nnzu+1

                d1u(nnzu) = d1(i)

                zu(nnzu) = z(i)

              endif

              if (coltyp(idx1(i)) == 3 .or. coltyp(idx1(i)) == 2) then

                nnzl = nnzl+1

                d1l(nnzl) = d1(i)

                zl(nnzl) = z(i)

              endif

            endif

          enddo


          ! Set the deflated eigenvectors in Q (coming from proc np_rem)


          ndef = max(nnzu,nnzl) ! Remote counter in input matrix

          do i = 1, na

            j = idx(i)

            if (j>na1) then

              if (p_col(idx2(j-na1)) == np_rem) then

                ndef = ndef+1

                if (p_col_out(i) == my_pcol) then

                  q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef)

                endif

              endif

            endif

          enddo


          do ns = 0, nqcols1-1, max_strip ! strimining loop


            ncnt = min(max_strip,nqcols1-ns) ! number of columns in this strip


            ! Get partial result from (output) Q

!$omp PARALLEL DO &

!$omp default(none) &

!$omp private(i, j, k) &

!$omp SHARED(ns, q, l_rqs, l_rqe, l_col_out, idxq1, qtmp2, l_rows, ncnt)

            do i = 1, ncnt

              j = idxq1(i+ns)

              k = l_col_out(j)

              qtmp2(1:l_rows,i) = q(l_rqs:l_rqe, k)

            enddo

!$OMP END PARALLEL DO

            ! Compute eigenvectors of the rank-1 modified matrix.

            ! Parts for multiplying with upper half of Q:

!$omp PARALLEL DO &

!$omp default(none) &

!$omp private(i, j, k, tmp) &

!$omp shared(ncnt, nnzu, idx, idxq1, ns, d1u, dbase, ddiff, zu, ev_scale, ev)

            do i = 1, ncnt

              do k = 1, nnzu

              j = idx(idxq1(i+ns))

              ! Calculate the j-th eigenvector of the deflated system

              ! See above why we are doing it this way!

                tmp(k) = d1u(k) - dbase(j)

                tmp(k) = tmp(k) + ddiff(j)

                ev(k,i) = zu(k) / tmp(k) * ev_scale(j)

            enddo

            enddo

!$OMP END PARALLEL DO


            if (usegpu) then

              !TODO: it should be enough to copy l_rows x ncnt

              ! copy to device

#ifdef WITH_GPU_STREAMS

              successgpu = gpu_stream_synchronize(my_stream)

              check_stream_synchronize_gpu("tridiag qtmp2_dev", successgpu)


              successgpu = gpu_memcpy_async(qtmp2_dev, int(loc(qtmp2(1,1)),kind=c_intptr_t), &

                                 gemm_dim_k * gemm_dim_m * size_of_datatype, gpumemcpyhosttodevice, my_stream)

              check_memcpy_gpu("merge_systems: qtmp2_dev", successgpu)


              !TODO the previous loop could possibly be done on the device, so that

              !less data has to be copied

              successgpu = gpu_memcpy_async(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &

                                 gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpyhosttodevice, my_stream)

              check_memcpy_gpu("merge_systems: ev_dev", successgpu)


              successgpu = gpu_stream_synchronize(my_stream)

              check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)

              ! synchronize streamsPerThread; maybe not necessary

              successgpu = gpu_stream_synchronize()

              check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)

#else

              successgpu = gpu_memcpy(qtmp2_dev, int(loc(qtmp2(1,1)),kind=c_intptr_t), &

                                 gemm_dim_k * gemm_dim_m * size_of_datatype, gpumemcpyhosttodevice)

              check_memcpy_gpu("merge_systems: qtmp2_dev", successgpu)


              !TODO the previous loop could possibly be done on the device, so that

              !less data has to be copied

              successgpu = gpu_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &

                                 gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpyhosttodevice)

              check_memcpy_gpu("merge_systems: ev_dev", successgpu)

#endif

            endif


            ! Multiply old Q with eigenvectors (upper half)


            if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) then

              if (usegpu) then

                call obj%timer%start("gpublas")

                gpuhandle = obj%gpu_setup%gpublasHandleArray(0)

                call gpublas_precision_gemm('N', 'N', l_rnm, ncnt, nnzu,   &

                                    1.0_rk, qtmp1_dev, ubound(qtmp1,dim=1),    &

                                    ev_dev, ubound(ev,dim=1), &

                                    1.0_rk, qtmp2_dev, ubound(qtmp2,dim=1), gpuhandle)

                call obj%timer%stop("gpublas")

              else ! useGPU

                call obj%timer%start("blas")

                call obj%timer%start("gemm")

                call precision_gemm('N', 'N', int(l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &

                                    int(nnzu,kind=blas_kind),   &

                                    1.0_rk, qtmp1, int(ubound(qtmp1,dim=1),kind=blas_kind),    &

                                    ev, int(ubound(ev,dim=1),kind=blas_kind), &

                                    1.0_rk, qtmp2(1,1), int(ubound(qtmp2,dim=1),kind=blas_kind))

                call obj%timer%stop("gemm")

                call obj%timer%stop("blas")

              endif ! useGPU

            endif


            ! Compute eigenvectors of the rank-1 modified matrix.

            ! Parts for multiplying with lower half of Q:


!$omp PARALLEL DO &

!$omp private(i, j, k, tmp)

            do i = 1, ncnt

              do k = 1, nnzl

              j = idx(idxq1(i+ns))

              ! Calculate the j-th eigenvector of the deflated system

              ! See above why we are doing it this way!

                tmp(k) = d1l(k) - dbase(j)

                tmp(k) = tmp(k) + ddiff(j)

                ev(k,i) = zl(k) / tmp(k) * ev_scale(j)

            enddo

            enddo

!$OMP END PARALLEL DO


            if (usegpu) then

              !TODO the previous loop could possibly be done on the device, so that

              !less data has to be copied

#ifdef WITH_GPU_STREAMS

              successgpu = gpu_stream_synchronize(my_stream)

              check_stream_synchronize_gpu("tridiag qtmp1_dev", successgpu)


              successgpu = gpu_memcpy_async(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &

                                 gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpyhosttodevice, &

                                 my_stream)

              check_memcpy_gpu("merge_systems: ev_dev", successgpu)


              successgpu = gpu_stream_synchronize(my_stream)

              check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)

              ! synchronize streamsPerThread; maybe not necessary

              successgpu = gpu_stream_synchronize()

              check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)

#else

              successgpu = gpu_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &

                                 gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpyhosttodevice)

              check_memcpy_gpu("merge_systems: ev_dev", successgpu)

#endif

            endif


            ! Multiply old Q with eigenvectors (lower half)


            if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) then

              if (usegpu) then

                call obj%timer%start("gpublas")

                gpuhandle = obj%gpu_setup%gpublasHandleArray(0)

                call gpublas_precision_gemm('N', 'N', l_rows-l_rnm, ncnt, nnzl,   &

                                    1.0_rk, qtmp1_dev + l_rnm * size_of_datatype, ubound(qtmp1,dim=1),    &

                                    ev_dev, ubound(ev,dim=1), &

                                    1.0_rk, qtmp2_dev + l_rnm * size_of_datatype, ubound(qtmp2,dim=1), gpuhandle)

                call obj%timer%stop("gpublas")

              else ! useGPU

                call obj%timer%start("blas")

                call obj%timer%start("gemm")

                call precision_gemm('N', 'N', int(l_rows-l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind),  &

                                     int(nnzl,kind=blas_kind),   &

                                     1.0_rk, qtmp1(l_rnm+1,1), int(ubound(qtmp1,dim=1),kind=blas_kind),    &

                                     ev,  int(ubound(ev,dim=1),kind=blas_kind),   &

                                     1.0_rk, qtmp2(l_rnm+1,1), int(ubound(qtmp2,dim=1),kind=blas_kind))

                call obj%timer%stop("gemm")

                call obj%timer%stop("blas")

              endif ! useGPU

            endif


            if (usegpu) then

              !TODO either copy only half of the matrix here, and get rid of the

              !previous copy or copy whole array here


              ! COPY BACK

#ifdef WITH_GPU_STREAMS

              successgpu = gpu_stream_synchronize(my_stream)

              check_stream_synchronize_gpu("tridiag qtmp2_dev", successgpu)


              successgpu = gpu_memcpy_async(int(loc(qtmp2(1,1)),kind=c_intptr_t), qtmp2_dev, &

                                 gemm_dim_k * gemm_dim_m * size_of_datatype, gpumemcpydevicetohost, my_stream)

              check_memcpy_gpu("merge_systems: qtmp2_dev", successgpu)


              successgpu = gpu_stream_synchronize(my_stream)

              check_stream_synchronize_gpu("merge_systems: qtmp2_dev", successgpu)

              ! synchronize streamsPerThread; maybe not necessary

              successgpu = gpu_stream_synchronize()

              check_stream_synchronize_gpu("merge_systems: qtmp2_dev", successgpu)

#else

              successgpu = gpu_memcpy(int(loc(qtmp2(1,1)),kind=c_intptr_t), qtmp2_dev, &

                                 gemm_dim_k * gemm_dim_m * size_of_datatype, gpumemcpydevicetohost)

              check_memcpy_gpu("merge_systems: qtmp2_dev", successgpu)

#endif

            endif


             ! Put partial result into (output) Q


!$omp PARALLEL DO &

!$omp default(none) &

!$omp private(i) &

!$omp SHARED(q, ns, l_rqs, l_rqe, l_col_out, idxq1, qtmp2, l_rows, ncnt)

            do i = 1, ncnt

              q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i)

            enddo

!$OMP END PARALLEL DO


          enddo   !ns = 0, nqcols1-1, max_strip ! strip-mining loop

        enddo    !do np = 1, npc_n


        if (usegpu) then


#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)

          if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then

            successgpu = gpu_host_unregister(int(loc(qtmp1),kind=c_intptr_t))

            check_host_unregister_gpu("merge_systems: qtmp1", successgpu)

          endif

#endif

          successgpu = gpu_free(qtmp1_dev)

          check_dealloc_gpu("merge_systems: qtmp1_dev", successgpu)


#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)

          if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then

            successgpu = gpu_host_unregister(int(loc(qtmp2),kind=c_intptr_t))

            check_host_unregister_gpu("merge_systems: qtmp2", successgpu)

          endif

#endif

          successgpu = gpu_free(qtmp2_dev)

          check_dealloc_gpu("merge_systems: qtmp2_dev", successgpu)


#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)

          if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then

            successgpu = gpu_host_unregister(int(loc(ev),kind=c_intptr_t))

            check_host_unregister_gpu("merge_systems: ev", successgpu)

          endif

#endif

          successgpu = gpu_free(ev_dev)

          check_dealloc_gpu("merge_systems: ev_dev", successgpu)

        endif ! useGPU


        deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errormessage)

        check_deallocate("merge_systems: ev, qtmp1, qtmp2",istat, errormessage)

      endif !very outer test (na1==1 .or. na1==2)

#ifdef WITH_OPENMP_TRADITIONAL

      deallocate(z_p, stat=istat, errmsg=errormessage)

      check_deallocate("merge_systems: z_p",istat, errormessage)

#endif


      call obj%timer%stop("merge_systems" // precision_suffix)


      return


    end subroutine merge_systems_&

    &precision

add_tmp
Definition mod_add_tmp.F90:3

check_monotony
Definition mod_check_monotony.F90:3

elpa_abstract_impl
Fortran module to provide an abstract definition of the implementation. Do not use directly....
Definition elpa_abstract_impl.F90:50

global_gather
Definition mod_global_gather.F90:3

global_product
Definition mod_global_product.F90:3

resort_ev
Definition mod_resort_ev.F90:3

solve_secular_equation
Definition mod_solve_secular_equation.F90:55

transform_columns
Definition mod_transform_columns.F90:3

v_add_s
Definition mod_v_add_s.F90:55

elpa_abstract_impl::elpa_abstract_impl_t
Definition elpa_abstract_impl.F90:73