subroutine merge_systems_cpu_&
  (obj, na, nm, d, e, q, matrixrows, nqoff, nblk, matrixcols, mpi_comm_rows, mpi_comm_cols, &
   l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, usegpu, wantdebug, success, max_threads)
  use, intrinsic :: iso_c_binding
  use elpa_blas_interfaces

#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
#ifdef WITH_OPENMP_TRADITIONAL
#include "../general/precision_kinds.F90"
  integer(kind=ik), intent(in) :: na, nm, matrixRows, nqoff, nblk, matrixCols, mpi_comm_rows, &
                                  mpi_comm_cols, npc_0, npc_n
  integer(kind=ik), intent(in) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na)
  real(kind=real_datatype), intent(inout) :: d(na), e
#ifdef USE_ASSUMED_SIZE
  real(kind=real_datatype), intent(inout) :: q(matrixrows,*)
  real(kind=real_datatype), intent(inout) :: q(matrixrows,matrixcols)
  logical, intent(in) :: useGPU, wantDebug
  logical, intent(out) :: success

  integer(kind=ik), parameter :: max_strip=128

  real(kind=real_datatype) :: beta, sig, s, c, t, tau, rho, eps, tol, &
                              qtrans(2,2), dmax, zmax, d1new, d2new
  real(kind=real_datatype) :: z(na), d1(na), d2(na), z1(na), delta(na), &
                              dbase(na), ddiff(na), ev_scale(na), tmp(na)
  real(kind=real_datatype) :: d1u(na), zu(na), d1l(na), zl(na)
  real(kind=real_datatype), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
#ifdef WITH_OPENMP_TRADITIONAL
  real(kind=real_datatype), allocatable :: z_p(:,:)
  integer(kind=ik) :: i, j, k, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &
  integer(kind=BLAS_KIND) :: infoBLAS
  integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
                      l_cols_qreorg, np, l_idx, nqcols1
  integer(kind=ik) :: nnzu_save, nnzl_save
  integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
  integer(kind=MPI_KIND) :: mpierr
  integer(kind=MPI_KIND) :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI
  integer(kind=ik) :: np_next, np_prev, np_rem
  integer(kind=ik) :: idx(na), idx1(na), idx2(na)
  integer(kind=BLAS_KIND) :: idxBLAS(NA)
  integer(kind=ik) :: coltyp(na), idxq1(na)
  integer(kind=ik) :: istat
  character(200) :: errorMessage
  integer(kind=ik) :: gemm_dim_k, gemm_dim_l, gemm_dim_m

  integer(kind=c_intptr_t) :: num
  integer(kind=C_intptr_T) :: qtmp1_dev, qtmp1_tmp_dev, qtmp2_dev, ev_dev, q_dev
  integer(kind=c_intptr_t) :: d1u_dev, dbase_dev, ddiff_dev, zu_dev, ev_scale_dev
  integer(kind=c_intptr_t) :: d1l_dev, zl_dev, z_dev, d1_dev
  integer(kind=c_intptr_t) :: idx1_dev, p_col_dev, coltyp_dev, p_col_out_dev, ndef_c_dev
  integer(kind=c_intptr_t) :: idxq1_dev, l_col_out_dev, idx_dev, idx2_dev, l_col_dev
  integer(kind=c_intptr_t) :: nnzul_dev
  integer(kind=c_intptr_t) :: nnzu_val_dev, nnzl_val_dev
  logical :: successGPU
  integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
  integer(kind=c_intptr_t) :: gpuHandle
  integer(kind=ik), intent(in) :: max_threads
  integer(kind=c_intptr_t) :: my_stream
  integer(kind=ik) :: l_col_out_tmp
  integer(kind=ik), allocatable :: nnzu_val(:,:), nnzl_val(:,:)
  integer(kind=ik) :: nnzul(2)
  integer(kind=ik) :: nnzu_start, nnzl_start
  integer(kind=ik), allocatable :: ndef_c(:)
  integer(kind=ik) :: ii, jj, indx, ind_ex, ind_ex2, p_col_tmp, index2, counter1, counter2
#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
  integer(kind=c_intptr_t) :: ccl_comm_rows, ccl_comm_cols
#ifdef WITH_OPENMP_TRADITIONAL
  integer(kind=ik) :: my_thread
  allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errorMessage)
  check_allocate("merge_systems: z_p", istat, errorMessage)

  call obj%timer%start("merge_systems" // precision_suffix)
  call obj%timer%start("mpi_communication")
  call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind), my_prowmpi, mpierr)
  call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind), np_rowsmpi, mpierr)
  call mpi_comm_rank(int(mpi_comm_cols,kind=mpi_kind), my_pcolmpi, mpierr)
  call mpi_comm_size(int(mpi_comm_cols,kind=mpi_kind), np_colsmpi, mpierr)

  my_prow = int(my_prowmpi,kind=c_int)
  np_rows = int(np_rowsmpi,kind=c_int)
  my_pcol = int(my_pcolmpi,kind=c_int)
  np_cols = int(np_colsmpi,kind=c_int)

  call obj%timer%stop("mpi_communication")

  useccl = obj%gpu_setup%useCCL
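  ! Process columns outside the participating range npc_0 .. npc_0+npc_n-1 take no part in this merge.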
  if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then
    call obj%timer%stop("merge_systems" // precision_suffix)

  if (my_pcol == npc_0+npc_n-1) then
    np_next = my_pcol + 1

  if (my_pcol == npc_0) then
    np_prev = npc_0+npc_n-1
    np_prev = my_pcol - 1

  call check_monotony_&
       &(obj, nm, d, 'Input1', wantdebug, success)
  if (.not.(success)) then
    call obj%timer%stop("merge_systems" // precision_suffix)

  call check_monotony_&
       &(obj, na-nm, d(nm+1), 'Input2', wantdebug, success)
  if (.not.(success)) then
    call obj%timer%stop("merge_systems" // precision_suffix)
  n_procs = np_rows*npc_n
  my_proc = my_prow*npc_n + (my_pcol-npc_0)
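  ! Local row index range of the merged block: l_rqs..l_rqm covers the upper sub-block
  ! (global rows nqoff+1..nqoff+nm), l_rqm+1..l_rqe the lower one.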
  l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1)
  l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1)
  l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1)

  l_rnm  = l_rqm-l_rqs+1
  l_rows = l_rqe-l_rqs+1

  l_cols = count(p_col(1:na)==my_pcol)

  do np = npc_0, npc_0+npc_n-1
    max_local_cols = max(max_local_cols, count(p_col(1:na)==np))
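  ! Assemble the rank-1 modification vector z: the last local row of the upper block
  ! contributes directly, the first row of the lower block enters scaled by sig.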
  if (mod((nqoff+nm-1)/nblk,np_rows)==my_prow) then
      if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i))

  if (mod((nqoff+nm)/nblk,np_rows)==my_prow) then
      if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i))

  &(obj, z, na, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
  if (.not.(success)) then
    write(error_unit,*) "Error in global_gather. Aborting..."
  call obj%timer%start("lapack_lamrg")
  call precision_lamrg( int(nm,kind=blas_kind), int(na-nm,kind=blas_kind), d, &
                        1_blas_kind, 1_blas_kind, idxblas )
  idx(:) = int(idxblas(:),kind=ik)
  call obj%timer%stop("lapack_lamrg")
  zmax = maxval(abs(z))
  dmax = maxval(abs(d))
  eps  = precision_lamch( 'E' )
  tol  = 8.0_rk*eps*max(dmax,zmax)
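  ! If the rank-1 perturbation rho*z*z**T is negligible relative to the eigenvalue spread,
  ! nothing has to be recomputed; the merged spectrum is just the sorted union of the two halves.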
  IF ( rho*zmax <= tol ) THEN
       (obj, idx, na, na, p_col_out, q, matrixrows, matrixcols, l_rows, l_rqe, &
        l_rqs, mpi_comm_cols, p_col, l_col, l_col_out)
    call obj%timer%stop("merge_systems" // precision_suffix)
  if (rho*abs(z(idx(i))) <= tol) then
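    ! Deflation: this eigenvalue couples only negligibly to the rank-1 update and is kept as is.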
  tau = precision_lapy2( c, s )
  t   = d1(na1) - d(idx(i))

  IF ( abs( t*c*s ) <= tol ) THEN
    d2new = d(idx(i))*c**2 + d1(na1)*s**2
    d1new = d(idx(i))*s**2 + d1(na1)*c**2

    if (d1new<d1(na1) )  d1new = d1(na1)
    if (d1new>d(idx(i))) d1new = d(idx(i))

    if (d2new<d1(na1) )  d2new = d1(na1)
    if (d2new>d(idx(i))) d2new = d(idx(i))

    if (d2new<d2(j)) then

    qtrans(1,1) = c; qtrans(1,2) = -s
    qtrans(2,1) = s; qtrans(2,2) =  c
    call transform_columns_&
         (obj, idx(i), idx1(na1), na, tmp, l_rqs, l_rqe, &
          q, matrixrows, matrixcols, l_rows, mpi_comm_cols, &
          p_col, l_col, qtrans)
    if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2
    if (coltyp(idx(i))==3 .and. coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2
  call check_monotony_&
       &(obj, na1, d1, 'Sorted1', wantdebug, success)
  if (.not.(success)) then
    call obj%timer%stop("merge_systems" // precision_suffix)

  call check_monotony_&
       &(obj, na2, d2, 'Sorted2', wantdebug, success)
  if (.not.(success)) then
    call obj%timer%stop("merge_systems" // precision_suffix)
  if (na1==1 .or. na1==2) then
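    ! Deflated problem of size 1 or 2: the secular equation is solved directly with LAPACK xLAED5.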
    d(1) = d1(1) + rho*z1(1)**2

    call obj%timer%start("lapack_laed5_x2")
    call precision_laed5(1_blas_kind, d1, z1, qtrans(1,1), rho, d(1))
    call precision_laed5(2_blas_kind, d1, z1, qtrans(1,2), rho, d(2))
    call obj%timer%stop("lapack_laed5_x2")

    call transform_columns_&
         &(obj, idx1(1), idx1(2), na, tmp, l_rqs, l_rqe, q, &
          matrixrows, matrixcols, l_rows, mpi_comm_cols, &
          p_col, l_col, qtrans)

    d(na1+1:na) = d2(1:na2)

    call obj%timer%start("lapack_lamrg")
    call precision_lamrg( int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &
                          1_blas_kind, 1_blas_kind, idxblas )
    idx(:) = int(idxblas(:),kind=ik)
    call obj%timer%stop("lapack_lamrg")

    if (idx(i)<=na1) then
      idxq1(i) = idx1(idx(i))
      idxq1(i) = idx2(idx(i)-na1)

    &(obj, idxq1, na, na, p_col_out, q, matrixrows, matrixcols, l_rows, l_rqe, &
     l_rqs, mpi_comm_cols, p_col, l_col, l_col_out)
#ifdef WITH_OPENMP_TRADITIONAL
  infoblas = int(info,kind=blas_kind)
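  ! Each process solves the secular equation only for the eigenvalues it owns
  ! (round-robin distribution over the n_procs participating processes).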
  DO i = my_proc+1, na1, n_procs
    call obj%timer%start("lapack_laed4")
    call precision_laed4(int(na1,kind=blas_kind), int(i,kind=blas_kind), d1, z1, delta, &
    info = int(infoblas,kind=ik)
    call obj%timer%stop("lapack_laed4")

    call solve_secular_equation_&
         &(obj, na1, i, d1, z1, delta, rho, s)

    if (i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) )

    if (abs(delta(i+1)) < abs(delta(i))) then
      ddiff(i) = delta(i+1)
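  ! The loop above accumulated only this process' factors of the product defining z;
  ! combine them across all processes before taking the signed square root.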
  call global_product_&
       (obj, z, na1, mpi_comm_rows, mpi_comm_cols, npc_0, npc_n, success)
  if (.not.(success)) then
    write(error_unit,*) "Error in global_product. Aborting..."

  z(1:na1) = sign( sqrt( abs( z(1:na1) ) ), z1(1:na1) )

  &(obj, dbase, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
  if (.not.(success)) then
    write(error_unit,*) "Error in global_gather. Aborting..."

  &(obj, ddiff, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
  if (.not.(success)) then
    write(error_unit,*) "Error in global_gather. Aborting..."

  d(1:na1) = dbase(1:na1) - ddiff(1:na1)
#ifdef WITH_OPENMP_TRADITIONAL
  call obj%timer%start("OpenMP parallel" // precision_suffix)

  DO i = my_proc+1, na1, n_procs
    &(obj, d1, dbase, ddiff, z, ev_scale(i), na1, i)

#ifdef WITH_OPENMP_TRADITIONAL
  call obj%timer%stop("OpenMP parallel" // precision_suffix)

  &(obj, ev_scale, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
  if (.not.(success)) then
    write(error_unit,*) "Error in global_gather. Aborting..."
  d(na1+1:na) = d2(1:na2)

  call obj%timer%start("lapack_lamrg")
  call precision_lamrg(int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &
                       1_blas_kind, 1_blas_kind, idxblas )
  idx(:) = int(idxblas(:),kind=ik)
  call obj%timer%stop("lapack_lamrg")
  call check_monotony_&
       &(obj, na, d, 'Output', wantdebug, success)
  if (.not.(success)) then
    call obj%timer%stop("merge_systems" // precision_suffix)
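  ! GPU path: allocate device buffers for the index arrays and for the vectors
  ! entering the eigenvector back-transformation.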
  num = 2 * size_of_int
  successgpu = gpu_malloc(nnzul_dev, num)
  check_alloc_gpu("merge_systems: ", successgpu)

  num = na * size_of_int
  successgpu = gpu_malloc(idxq1_dev, num)
  check_alloc_gpu("merge_systems: ", successgpu)

  num = na * size_of_int
  successgpu = gpu_malloc(idx_dev, num)
  check_alloc_gpu("merge_systems: idx_dev", successgpu)
  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(idx_dev, int(loc(idx(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ", successgpu)
  successgpu = gpu_memcpy(idx_dev, int(loc(idx(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: idx_dev", successgpu)

  if (p_col_out(i)==my_pcol) then
    if (idx(i)<=na1) then
  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(idxq1_dev, int(loc(idxq1(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ", successgpu)
  successgpu = gpu_memcpy(idxq1_dev, int(loc(idxq1(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: idxq1_dev", successgpu)

  allocate(ndef_c(na), stat=istat, errmsg=errormessage)
  check_allocate("merge_systems: ndef_c", istat, errormessage)
  gemm_dim_k = max(1,l_rows)
  gemm_dim_l = max_local_cols
  gemm_dim_m = min(max_strip,max(1,nqcols1))
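  ! Host workspaces for the back-transformation: qtmp1 collects the non-deflated local
  ! eigenvector columns, ev holds eigenvectors of the deflated secular problem, and qtmp2
  ! accumulates the GEMM results for up to max_strip output columns at a time.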
  allocate(qtmp1(gemm_dim_k, gemm_dim_l), stat=istat, errmsg=errormessage)
  check_allocate("merge_systems: qtmp1", istat, errormessage)

  allocate(ev(gemm_dim_l, gemm_dim_m), stat=istat, errmsg=errormessage)
  check_allocate("merge_systems: ev", istat, errormessage)

  allocate(qtmp2(gemm_dim_k, gemm_dim_m), stat=istat, errmsg=errormessage)
  check_allocate("merge_systems: qtmp2", istat, errormessage)
  num = na * size_of_int
  successgpu = gpu_malloc(ndef_c_dev, num)
  check_alloc_gpu("merge_systems: ndef_c_dev", successgpu)

  num = na * size_of_int
  successgpu = gpu_malloc(idx1_dev, num)
  check_alloc_gpu("merge_systems: idx1_dev", successgpu)

  num = na * size_of_int
  successgpu = gpu_malloc(p_col_dev, num)
  check_alloc_gpu("merge_systems: p_col_dev", successgpu)

  num = na * size_of_int
  successgpu = gpu_malloc(p_col_out_dev, num)
  check_alloc_gpu("merge_systems: p_col_out_dev", successgpu)

  num = na * size_of_int
  successgpu = gpu_malloc(coltyp_dev, num)
  check_alloc_gpu("merge_systems: coltyp_dev", successgpu)

  num = na * size_of_int
  successgpu = gpu_malloc(idx2_dev, num)
  check_alloc_gpu("merge_systems: idx2_dev", successgpu)

  num = na * size_of_int
  successgpu = gpu_malloc(l_col_out_dev, num)
  check_alloc_gpu("merge_systems: l_col_out_dev", successgpu)

  num = (na) * size_of_datatype
  successgpu = gpu_malloc(z_dev, num)
  check_alloc_gpu("merge_systems: z_dev", successgpu)

  num = (na) * size_of_datatype
  successgpu = gpu_malloc(d1_dev, num)
  check_alloc_gpu("merge_systems: d1_dev", successgpu)

  num = (na) * size_of_datatype
  successgpu = gpu_malloc(d1u_dev, num)
  check_alloc_gpu("merge_systems: d1u_dev", successgpu)

  num = (na) * size_of_datatype
  successgpu = gpu_malloc(dbase_dev, num)
  check_alloc_gpu("merge_systems: dbase_dev", successgpu)

  num = (na) * size_of_datatype
  successgpu = gpu_malloc(ddiff_dev, num)
  check_alloc_gpu("merge_systems: ddiff_dev", successgpu)

  num = (na) * size_of_datatype
  successgpu = gpu_malloc(zu_dev, num)
  check_alloc_gpu("merge_systems: zu_dev", successgpu)

  num = (na) * size_of_datatype
  successgpu = gpu_malloc(ev_scale_dev, num)
  check_alloc_gpu("merge_systems: ev_scale_dev", successgpu)

  num = (na) * size_of_datatype
  successgpu = gpu_malloc(d1l_dev, num)
  check_alloc_gpu("merge_systems: d1l_dev", successgpu)

  num = (na) * size_of_datatype
  successgpu = gpu_malloc(zl_dev, num)
  check_alloc_gpu("merge_systems: zl_dev", successgpu)

  num = (matrixrows*matrixcols) * size_of_datatype
  successgpu = gpu_malloc(q_dev, num)
  check_alloc_gpu("merge_systems: q_dev", successgpu)
  num = (gemm_dim_k * gemm_dim_l) * size_of_datatype
#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
  if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
    successgpu = gpu_host_register(int(loc(qtmp1),kind=c_intptr_t), num, &
                                   gpuhostregisterdefault)
    check_host_register_gpu("merge_systems: qtmp1", successgpu)

  successgpu = gpu_malloc(qtmp1_dev, num)
  check_alloc_gpu("merge_systems: qtmp1_dev", successgpu)

  successgpu = gpu_malloc(qtmp1_tmp_dev, num)
  check_alloc_gpu("merge_systems: qtmp1_tmp_dev", successgpu)

  num = (gemm_dim_l * gemm_dim_m) * size_of_datatype
#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
  if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
    successgpu = gpu_host_register(int(loc(ev),kind=c_intptr_t), num, &
                                   gpuhostregisterdefault)
    check_host_register_gpu("merge_systems: ev", successgpu)

  successgpu = gpu_malloc(ev_dev, num)
  check_alloc_gpu("merge_systems: ev_dev", successgpu)

  num = (gemm_dim_k * gemm_dim_m) * size_of_datatype
#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
  if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
    successgpu = gpu_host_register(int(loc(qtmp2),kind=c_intptr_t), num, &
                                   gpuhostregisterdefault)
    check_host_register_gpu("merge_systems: qtmp2", successgpu)

  successgpu = gpu_malloc(qtmp2_dev, num)
  check_alloc_gpu("merge_systems: qtmp2_dev", successgpu)
  l_idx = l_col(idx1(i))
  if (p_col(idx1(i))==my_pcol) then
    if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then
      qtmp1(1:l_rnm,nnzu) = q(l_rqs:l_rqm,l_idx)

    if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then
      qtmp1(l_rnm+1:l_rows,nnzl) = q(l_rqm+1:l_rqe,l_idx)
  num = gemm_dim_k * gemm_dim_l * size_of_datatype
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
  successgpu = gpu_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)

  num = matrixrows*matrixcols*size_of_datatype
#ifdef WITH_GPU_STREAMS
  successgpu = gpu_memcpy_async(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: q_dev", successgpu)
  successgpu = gpu_memcpy(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: q_dev", successgpu)
  num = na * size_of_int
  successgpu = gpu_malloc(l_col_dev, num)
  check_alloc_gpu("merge_systems: l_col_dev", successgpu)

  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ", successgpu)
  successgpu = gpu_memcpy(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: l_col_dev", successgpu)
  ndef = max(nnzu,nnzl)

  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(idx2_dev, int(loc(idx2(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ", successgpu)
  successgpu = gpu_memcpy(idx2_dev, int(loc(idx2(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: idx2_dev", successgpu)

  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ", successgpu)
  successgpu = gpu_memcpy(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: p_col_dev", successgpu)
  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(ndef_c_dev, int(loc(ndef_c(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ndef_c_dev 4", successgpu)
  successgpu = gpu_memcpy(ndef_c_dev, int(loc(ndef_c(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: ndef_c_dev", successgpu)

  call gpu_copy_q_slice_to_qtmp1_precision (qtmp1_dev, q_dev, ndef_c_dev, l_col_dev, idx2_dev, p_col_dev, na2, na, &
                                            my_pcol, l_rows, l_rqs, l_rqe, matrixrows, gemm_dim_k, my_stream)

  l_idx = l_col(idx2(i))
  if (p_col(idx2(i))==my_pcol) then
    qtmp1(1:l_rows,ndef) = q(l_rqs:l_rqe,l_idx)
  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(p_col_out_dev, int(loc(p_col_out(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ", successgpu)
  successgpu = gpu_memcpy(p_col_out_dev, int(loc(p_col_out(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: p_col_out_dev", successgpu)

  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(l_col_out_dev, int(loc(l_col_out(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: l_col_out_dev", successgpu)
  successgpu = gpu_memcpy(l_col_out_dev, int(loc(l_col_out(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: l_col_out_dev", successgpu)

  call gpu_zero_q_precision (q_dev, p_col_out_dev, l_col_out_dev, na, my_pcol, l_rqs, l_rqe, matrixrows, my_stream)

  if (p_col_out(i)==my_pcol) q(l_rqs:l_rqe,l_col_out(i)) = 0
  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ", successgpu)
  successgpu = gpu_memcpy(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: idx1_dev", successgpu)

  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ", successgpu)
  successgpu = gpu_memcpy(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: p_col_dev", successgpu)

  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ", successgpu)
  successgpu = gpu_memcpy(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: coltyp_dev", successgpu)
  num = na*size_of_datatype
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(ev_scale_dev, int(loc(ev_scale(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ev_scale_dev", successgpu)
  successgpu = gpu_memcpy(ev_scale_dev, int(loc(ev_scale(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: ev_scale_dev", successgpu)

  num = na*size_of_datatype
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(z_dev, int(loc(z(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: z_dev", successgpu)
  successgpu = gpu_memcpy(z_dev, int(loc(z(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: z_dev", successgpu)

  num = na*size_of_datatype
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(d1_dev, int(loc(d1(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  successgpu = gpu_memcpy(d1_dev, int(loc(d1(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: d1_dev", successgpu)

  num = gemm_dim_l * gemm_dim_m * size_of_datatype
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ev_dev", successgpu)
  successgpu = gpu_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: ev_dev", successgpu)

  num = na*size_of_datatype
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: dbase_dev", successgpu)
  successgpu = gpu_memcpy(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: dbase_dev", successgpu)

  num = na*size_of_datatype
#ifdef WITH_GPU_STREAMS
  successgpu = gpu_stream_synchronize(my_stream)
  successgpu = gpu_memcpy_async(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)
  successgpu = gpu_memcpy(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)
  allocate(nnzu_val(na1,npc_n))
  allocate(nnzl_val(na1,npc_n))
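  ! nnzu_val/nnzl_val record, per process column, how many non-deflated upper/lower
  ! eigenvector components each index contributes; the qtmp1 panel is then passed around
  ! the process-column ring so that every process sees every column block.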
  num = na1 * npc_n * size_of_int
  successgpu = gpu_malloc(nnzu_val_dev, num)
  check_alloc_gpu("merge_systems: nnzu_val_dev", successgpu)

  num = na1 * npc_n * size_of_int
  successgpu = gpu_malloc(nnzl_val_dev, num)
  check_alloc_gpu("merge_systems: nnzl_val_dev", successgpu)

  if (np_rem == npc_0) then
    np_rem = npc_0+npc_n-1

  call gpu_compute_nnzl_nnzu_val_part1 (p_col_dev, idx1_dev, coltyp_dev, nnzu_val_dev, nnzl_val_dev, &
                                        na, na1, np_rem, npc_n, nnzu_save, nnzl_save, np, my_stream)
  call gpu_compute_nnzl_nnzu_val_part2 (nnzu_val_dev, nnzl_val_dev, na, na1, nnzu_start, nnzl_start, npc_n, my_stream)

  if (np_rem == npc_0) then
    np_rem = npc_0+npc_n-1

  if (p_col(idx1(i)) == np_rem) then
    if (coltyp(idx1(i)) == 1 .or. coltyp(idx1(i)) == 2) then
      nnzu_val(i,np) = nnzu

    if (coltyp(idx1(i)) == 3 .or. coltyp(idx1(i)) == 2) then
      nnzl_val(i,np) = nnzl

  if (np_rem == npc_0) then
    np_rem = npc_0+npc_n-1
#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
  my_stream = obj%gpu_setup%my_stream
  call gpu_copy_qtmp1_to_qtmp1_tmp_precision (qtmp1_dev, qtmp1_tmp_dev, gemm_dim_k, gemm_dim_l, my_stream)

  call obj%timer%start("ccl_send_recv")
  ccl_comm_cols = obj%gpu_setup%ccl_comm_cols
  successgpu = ccl_group_start()
  if (.not.successgpu) then
    print *, "Error in setting up nccl_group_start!"

  successgpu = ccl_send(qtmp1_tmp_dev, int(l_rows*max_local_cols,kind=c_size_t), &
#ifdef DOUBLE_PRECISION
#ifdef SINGLE_PRECISION
                        np_next, ccl_comm_cols, my_stream)
  if (.not.successgpu) then
    print *, "Error in nccl_send"

  successgpu = ccl_recv(qtmp1_dev, int(l_rows*max_local_cols,kind=c_size_t), &
#ifdef DOUBLE_PRECISION
#ifdef SINGLE_PRECISION
                        np_prev, ccl_comm_cols, my_stream)

  if (.not.successgpu) then
    print *, "Error in ccl_send/ccl_recv"

  successgpu = ccl_group_end()
  if (.not.successgpu) then
    print *, "Error in setting up ccl_group_end!"

  successgpu = gpu_stream_synchronize(my_stream)
  check_stream_synchronize_gpu("trans_ev", successgpu)
  call obj%timer%stop("ccl_send_recv")
#endif /* defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL) */
  call obj%timer%start("mpi_communication")
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_stream_synchronize(my_stream)
  check_stream_synchronize_gpu("merge_systems qtmp1_dev", successgpu)

  successgpu = gpu_memcpy_async(int(loc(qtmp1(1,1)),kind=c_intptr_t), qtmp1_dev, &
                                gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpydevicetohost, my_stream)
  check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)

  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_stream_synchronize(my_stream)
  check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)

  successgpu = gpu_stream_synchronize()
  check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)

  successgpu = gpu_memcpy(int(loc(qtmp1(1,1)),kind=c_intptr_t), qtmp1_dev, &
                          gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpydevicetohost)
  check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)

  call mpi_sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=mpi_kind), mpi_real_precision, &
                            int(np_next,kind=mpi_kind), 1111_mpi_kind, int(np_prev,kind=mpi_kind), &
                            1111_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)

#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_stream_synchronize(my_stream)
  check_stream_synchronize_gpu("merge_systems qtmp1_dev", successgpu)

  successgpu = gpu_memcpy_async(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
                                gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)

  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_stream_synchronize(my_stream)
  check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)

  successgpu = gpu_stream_synchronize()
  check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)

  successgpu = gpu_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
                          gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)

  call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
#endif /* defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL) */

  call obj%timer%start("mpi_communication")
  call mpi_sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=mpi_kind), mpi_real_precision, &
                            int(np_next,kind=mpi_kind), 1111_mpi_kind, int(np_prev,kind=mpi_kind), &
                            1111_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
  call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
  my_stream = obj%gpu_setup%my_stream
  call gpu_fill_tmp_arrays_precision (idx1_dev, p_col_dev, coltyp_dev, nnzu_val_dev, nnzl_val_dev, nnzul_dev, &
                                      d1u_dev, d1_dev, zu_dev, z_dev, d1l_dev, zl_dev, na, np, na1, np_rem, my_stream)

  num = 2 * size_of_int
#ifdef WITH_GPU_STREAMS
  successgpu = gpu_memcpy_async(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, &
                                num, gpumemcpydevicetohost, my_stream)
  check_memcpy_gpu("merge_systems: nnzul_dev", successgpu)
  successgpu = gpu_memcpy(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, &
                          num, gpumemcpydevicetohost)
  check_memcpy_gpu("merge_systems: nnzl_val", successgpu)
  if (p_col(idx1(i)) == np_rem) then
    if (coltyp(idx1(i)) == 1 .or. coltyp(idx1(i)) == 2) then

    if (coltyp(idx1(i)) == 3 .or. coltyp(idx1(i)) == 2) then

  ndef = max(nnzu,nnzl)

  call gpu_update_ndef_c(ndef_c_dev, idx_dev, p_col_dev, idx2_dev, na, na1, np_rem, ndef, my_stream)

  ndef = max(nnzu,nnzl)

  call gpu_copy_qtmp1_slice_to_q_precision (q_dev, qtmp1_dev, l_col_out_dev, p_col_out_dev, ndef_c_dev, p_col_dev, &
                                            idx2_dev, idx_dev, l_rqs, l_rqe, l_rows, matrixrows, &
                                            gemm_dim_k, my_pcol, na1, np_rem, na, my_stream)

  ndef = max(nnzu,nnzl)

  if (p_col(idx2(j-na1)) == np_rem) then
    if (p_col_out(i) == my_pcol) then
      q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef)
  do ns = 0, nqcols1-1, max_strip
    ncnt = min(max_strip, nqcols1-ns)

    call gpu_copy_q_slice_to_qtmp2_precision (q_dev, qtmp2_dev, idxq1_dev, l_col_out_dev, l_rows, l_rqs, l_rqe, &
                                              matrixrows, matrixcols, gemm_dim_k, gemm_dim_m, ns, &
                                              ncnt, ind_ex, ind_ex2, na, my_stream)
    qtmp2(1:l_rows,i) = q(l_rqs:l_rqe, k)
    if (nnzu .ge. 1) then
      call gpu_fill_ev_precision (ev_dev, d1u_dev, dbase_dev, ddiff_dev, zu_dev, ev_scale_dev, idxq1_dev, idx_dev, &
                                  na, gemm_dim_l, gemm_dim_m, nnzu, ns, ncnt, my_stream)

    j = idx(idxq1(i+ns))
    tmp(k)  = d1u(k) - dbase(j)
    tmp(k)  = tmp(k) + ddiff(j)
    ev(k,i) = zu(k) / tmp(k) * ev_scale(j)
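    ! Multiply the upper halves (local rows 1..l_rnm) of the collected eigenvector columns
    ! with the eigenvectors of the secular problem and accumulate the result in qtmp2.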
    if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) then
      call obj%timer%start("gpublas_gemm")
      gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
      call gpublas_precision_gemm('N', 'N', l_rnm, ncnt, nnzu, &
                                  1.0_rk, qtmp1_dev, ubound(qtmp1,dim=1), &
                                          ev_dev, ubound(ev,dim=1), &
                                  1.0_rk, qtmp2_dev, ubound(qtmp2,dim=1), gpuhandle)
      if (wantdebug) successgpu = gpu_devicesynchronize()
      call obj%timer%stop("gpublas_gemm")

      call obj%timer%start("blas_gemm")
      call precision_gemm('N', 'N', int(l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &
                          int(nnzu,kind=blas_kind), &
                          1.0_rk, qtmp1, int(ubound(qtmp1,dim=1),kind=blas_kind), &
                                  ev, int(ubound(ev,dim=1),kind=blas_kind), &
                          1.0_rk, qtmp2(1,1), int(ubound(qtmp2,dim=1),kind=blas_kind))
      call obj%timer%stop("blas_gemm")
    if (nnzl .ge. 1) then
      call gpu_fill_ev_precision (ev_dev, d1l_dev, dbase_dev, ddiff_dev, zl_dev, ev_scale_dev, idxq1_dev, idx_dev, &
                                  na, gemm_dim_l, gemm_dim_m, nnzl, ns, ncnt, my_stream)

    j = idx(idxq1(i+ns))
    tmp(k)  = d1l(k) - dbase(j)
    tmp(k)  = tmp(k) + ddiff(j)
    ev(k,i) = zl(k) / tmp(k) * ev_scale(j)
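    ! Same multiplication for the lower halves (local rows l_rnm+1..l_rows), using the
    ! d1l/zl data of the lower sub-block.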
    if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) then
      call obj%timer%start("gpublas_gemm")
      gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
      call gpublas_precision_gemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, &
                                  1.0_rk, qtmp1_dev + l_rnm * size_of_datatype, ubound(qtmp1,dim=1), &
                                          ev_dev, ubound(ev,dim=1), &
                                  1.0_rk, qtmp2_dev + l_rnm * size_of_datatype, ubound(qtmp2,dim=1), gpuhandle)
      if (wantdebug) successgpu = gpu_devicesynchronize()
      call obj%timer%stop("gpublas_gemm")

      call obj%timer%start("blas_gemm")
      call precision_gemm('N', 'N', int(l_rows-l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &
                          int(nnzl,kind=blas_kind), &
                          1.0_rk, qtmp1(l_rnm+1,1), int(ubound(qtmp1,dim=1),kind=blas_kind), &
                                  ev, int(ubound(ev,dim=1),kind=blas_kind), &
                          1.0_rk, qtmp2(l_rnm+1,1), int(ubound(qtmp2,dim=1),kind=blas_kind))
      call obj%timer%stop("blas_gemm")
    call gpu_copy_qtmp2_slice_to_q_precision(q_dev, qtmp2_dev, idxq1_dev, l_col_out_dev, l_rqs, l_rqe, l_rows, ncnt, &
                                             gemm_dim_k, matrixrows, ns, my_stream)
    q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i)

  deallocate(nnzu_val, nnzl_val)
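  ! Copy the updated eigenvector matrix q back to the host and release all device buffers,
  ! pinned-host registrations and host workspaces.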
  num = matrixrows*matrixcols*size_of_datatype
#ifdef WITH_GPU_STREAMS
  successgpu = gpu_memcpy_async(int(loc(q(1,1)),kind=c_intptr_t), q_dev, &
                                num, gpumemcpydevicetohost, my_stream)
  check_memcpy_gpu("merge_systems: q_dev", successgpu)

  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_stream_synchronize(my_stream)
  check_stream_synchronize_gpu("merge_systems: q_dev", successgpu)
  successgpu = gpu_memcpy(int(loc(q(1,1)),kind=c_intptr_t), q_dev, &
                          num, gpumemcpydevicetohost)
  check_memcpy_gpu("merge_systems: q_dev", successgpu)

#ifdef WITH_GPU_STREAMS
  successgpu = gpu_memcpy_async(int(loc(ev(1,1)),kind=c_intptr_t), ev_dev, &
                                gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpydevicetohost, my_stream)
  check_memcpy_gpu("merge_systems: ev_dev", successgpu)

  successgpu = gpu_stream_synchronize(my_stream)
  check_stream_synchronize_gpu("merge_systems: ev_dev", successgpu)
  successgpu = gpu_memcpy(int(loc(ev(1,1)),kind=c_intptr_t), ev_dev, &
                          gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpydevicetohost)
  check_memcpy_gpu("merge_systems: ev_dev", successgpu)

  deallocate(ndef_c, stat=istat, errmsg=errormessage)
  check_deallocate("merge_systems: ndef_c", istat, errormessage)
  successgpu = gpu_free(nnzul_dev)
  check_dealloc_gpu("merge_systems: nnzul_dev", successgpu)

  successgpu = gpu_free(l_col_dev)
  check_dealloc_gpu("merge_systems: l_col_dev", successgpu)

  successgpu = gpu_free(ndef_c_dev)
  check_dealloc_gpu("merge_systems: ndef_c_dev", successgpu)

  successgpu = gpu_free(nnzu_val_dev)
  check_dealloc_gpu("merge_systems: nnzu_val_dev", successgpu)

  successgpu = gpu_free(nnzl_val_dev)
  check_dealloc_gpu("merge_systems: nnzl_val_dev", successgpu)

  successgpu = gpu_free(idx1_dev)
  check_dealloc_gpu("merge_systems: idx1_dev", successgpu)

  successgpu = gpu_free(idx2_dev)
  check_dealloc_gpu("merge_systems: idx2_dev", successgpu)

  successgpu = gpu_free(p_col_dev)
  check_dealloc_gpu("merge_systems: p_col_dev", successgpu)

  successgpu = gpu_free(p_col_out_dev)
  check_dealloc_gpu("merge_systems: p_col_out_dev", successgpu)

  successgpu = gpu_free(coltyp_dev)
  check_dealloc_gpu("merge_systems: coltyp_dev", successgpu)

  successgpu = gpu_free(idx_dev)
  check_dealloc_gpu("merge_systems: idx_dev", successgpu)

  successgpu = gpu_free(l_col_out_dev)
  check_dealloc_gpu("merge_systems: l_col_out_dev", successgpu)

  successgpu = gpu_free(idxq1_dev)
  check_dealloc_gpu("merge_systems: ", successgpu)

  successgpu = gpu_free(q_dev)
  check_dealloc_gpu("merge_systems: q_dev", successgpu)

  successgpu = gpu_free(d1_dev)
  check_dealloc_gpu("merge_systems: d1_dev", successgpu)

  successgpu = gpu_free(z_dev)
  check_dealloc_gpu("merge_systems: z_dev", successgpu)

  successgpu = gpu_free(d1u_dev)
  check_dealloc_gpu("merge_systems: d1u_dev", successgpu)

  successgpu = gpu_free(dbase_dev)
  check_dealloc_gpu("merge_systems: dbase_dev", successgpu)

  successgpu = gpu_free(ddiff_dev)
  check_dealloc_gpu("merge_systems: ddiff_dev", successgpu)

  successgpu = gpu_free(zu_dev)
  check_dealloc_gpu("merge_systems: zu_dev", successgpu)

  successgpu = gpu_free(ev_scale_dev)
  check_dealloc_gpu("merge_systems: ev_scale_dev", successgpu)

  successgpu = gpu_free(d1l_dev)
  check_dealloc_gpu("merge_systems: d1l_dev", successgpu)

  successgpu = gpu_free(zl_dev)
  check_dealloc_gpu("merge_systems: zl_dev", successgpu)
#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
  if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
    successgpu = gpu_host_unregister(int(loc(qtmp1),kind=c_intptr_t))
    check_host_unregister_gpu("merge_systems: qtmp1", successgpu)

  successgpu = gpu_free(qtmp1_dev)
  check_dealloc_gpu("merge_systems: qtmp1_dev", successgpu)

  successgpu = gpu_free(qtmp1_tmp_dev)
  check_dealloc_gpu("merge_systems: qtmp1_tmp_dev", successgpu)

#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
  if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
    successgpu = gpu_host_unregister(int(loc(qtmp2),kind=c_intptr_t))
    check_host_unregister_gpu("merge_systems: qtmp2", successgpu)

  successgpu = gpu_free(qtmp2_dev)
  check_dealloc_gpu("merge_systems: qtmp2_dev", successgpu)

#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
  if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
    successgpu = gpu_host_unregister(int(loc(ev),kind=c_intptr_t))
    check_host_unregister_gpu("merge_systems: ev", successgpu)

  successgpu = gpu_free(ev_dev)
  check_dealloc_gpu("merge_systems: ev_dev", successgpu)

  deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errormessage)
  check_deallocate("merge_systems: ev, qtmp1, qtmp2", istat, errormessage)

#ifdef WITH_OPENMP_TRADITIONAL
  deallocate(z_p, stat=istat, errmsg=errormessage)
  check_deallocate("merge_systems: z_p", istat, errormessage)

  call obj%timer%stop("merge_systems" // precision_suffix)