ELPA-2025.06.001.rc1/html/solve__tridi_2merge__systems__template_8F90_source.html

#if 0

!    This file is part of ELPA.

!

!    The ELPA library was originally created by the ELPA consortium,

!    consisting of the following organizations:

!

!    - Max Planck Computing and Data Facility (MPCDF), formerly known as

!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),

!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte

!      Informatik,

!    - Technische Universität München, Lehrstuhl für Informatik mit

!      Schwerpunkt Wissenschaftliches Rechnen ,

!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,

!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,

!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,

!      and

!    - IBM Deutschland GmbH

!

!    This particular source code file contains additions, changes and

!    enhancements authored by Intel Corporation which is not part of

!    the ELPA consortium.

!

!    More information can be found here:

!    http://elpa.mpcdf.mpg.de/

!

!    ELPA is free software: you can redistribute it and/or modify

!    it under the terms of the version 3 of the license of the

!    GNU Lesser General Public License as published by the Free

!    Software Foundation.

!

!    ELPA is distributed in the hope that it will be useful,

!    but WITHOUT ANY WARRANTY; without even the implied warranty of

!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

!    GNU Lesser General Public License for more details.

!

!    You should have received a copy of the GNU Lesser General Public License

!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>

!

!    ELPA reflects a substantial effort on the part of the original

!    ELPA consortium, and we ask you to respect the spirit of the

!    license that we chose: i.e., please contribute any changes you

!    may have back to the original ELPA library distribution, and keep

!    any derivatives of ELPA under the same license that we chose for

!    the original distribution, the GNU Lesser General Public License.

!

!

! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines

!

! Copyright of the original code rests with the authors inside the ELPA

! consortium. The copyright of any additional modifications shall rest

! with their original authors, but shall adhere to the licensing terms

! distributed along with the original code in the file "COPYING".

#endif


#include "../general/sanity.F90"

#include "../general/error_checking.inc"


#ifdef SOLVE_TRIDI_GPU_BUILD

    subroutine merge_systems_gpu_&

    &precision &

                         (obj, na, nm, d, e, q_dev, &

                          matrixrows, nqoff, nblk, matrixcols, mpi_comm_rows, mpi_comm_cols_self, &

                          l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, usegpu, wantdebug, success, max_threads)

#else


    subroutine merge_systems_cpu_&

    &precision &

                         (obj, na, nm, d, e, q, &

                          matrixrows, nqoff, nblk, matrixcols, mpi_comm_rows, mpi_comm_cols_self, &

                          l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, usegpu, wantdebug, success, max_threads)

#endif


      use elpa_gpu

      use, intrinsic :: iso_c_binding

      use precision

      use elpa_abstract_impl

      use elpa_blas_interfaces

      use global_product

      use global_gather

      use resort_ev

      use transform_columns

      use check_monotony

      use add_tmp

      use v_add_s

      use elpa_utilities

      use elpa_mpi

      use solve_secular_equation

      use elpa_ccl_gpu

      use merge_systems_gpu_new

#if defined(WITH_NVIDIA_GPU_VERSION) && defined(WITH_NVTX)

      use cuda_functions ! for NVTX labels

#elif defined(WITH_AMD_GPU_VERSION) && defined(WITH_ROCTX)

      use hip_functions  ! for ROCTX labels

#endif


#ifdef WITH_OPENMP_TRADITIONAL

      use omp_lib

#endif

      implicit none

#include "../general/precision_kinds.F90"

      class(elpa_abstract_impl_t), intent(inout)  :: obj

      integer(kind=ik), intent(in)                :: na ! this is rather na_local, not global na

      integer(kind=ik), intent(in)                :: nm, matrixRows, nqoff, nblk, matrixCols, mpi_comm_rows, &

                                                     mpi_comm_cols_self, npc_0, npc_n

      integer(kind=ik), intent(in)                :: l_col(na), p_col(na), l_col_out(na), p_col_out(na)


#ifdef SOLVE_TRIDI_GPU_BUILD

      integer(kind=c_intptr_t)                    :: d_dev, e_dev ! shifted; for e_dev only one element is used

      integer(kind=c_intptr_t)                    :: q_dev

#else

      integer(kind=c_intptr_t)                    :: d_dev, e_dev ! dummy variables

      integer(kind=c_intptr_t)                    :: q_dev

#endif


      real(kind=real_datatype), intent(inout)     :: d(na), e

#if defined(USE_ASSUMED_SIZE) && !defined(SOLVE_TRIDI_GPU_BUILD)

      real(kind=real_datatype)                    :: q(matrixrows,*)

#else

      real(kind=real_datatype)                    :: q(matrixrows,matrixcols)

#endif

      logical, intent(in)                         :: useGPU, wantDebug

      integer(kind=c_int)                         :: SM_count


      logical, intent(out)                        :: success


      ! TODO: play with max_strip. If it was larger, matrices being multiplied

      ! might be larger as well!

      ! Peter: two out of three GEMM dimensions are already "big" and filling the GPU computation rate.

      ! Increasing max_strip doesn't improve the performance.

      integer(kind=ik)                            :: max_strip


      real(kind=real_datatype)                    :: beta, sig, s, c, t, tau, rho, eps, tol, &

                                                     qtrans(2,2), dmax, zmax, d1new, d2new

      real(kind=real_datatype)                    :: z(na), d1(na), d2(na), z1(na), delta(na),  &

                                                     dbase(na), ddiff(na), ev_scale(na), tmp(na)

      real(kind=real_datatype)                    :: d1u(na), zu(na), d1l(na), zl(na)

      real(kind=real_datatype), allocatable       :: qtmp1(:,:), qtmp2(:,:), ev(:,:)

#ifdef WITH_OPENMP_TRADITIONAL

      real(kind=real_datatype), allocatable       :: z_p(:,:)

      integer(kind=ik)                            :: my_thread

#endif


      integer(kind=ik)                            :: i, j, k, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &

                                                     l_rqm, ns, lc1, lc2, info

      integer(kind=BLAS_KIND)                     :: infoBLAS

      integer(kind=ik)                            :: sig_int

      integer(kind=ik)                            :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &

                                                     l_cols_qreorg, np, l_idx, nqcols1 !, nqcols2

      integer(kind=ik)                            :: nnzu_save, nnzl_save

      integer(kind=ik)                            :: my_proc, n_procs, my_prow, my_pcol, np_rows, &

                                                     np_cols

      integer(kind=MPI_KIND)                      :: mpierr

      integer(kind=MPI_KIND)                      :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI

      integer(kind=ik)                            :: np_next, np_prev, np_rem

      integer(kind=ik)                            :: idx(na), idx1(na), idx2(na)

      integer(kind=BLAS_KIND)                     :: idxBLAS(NA)

      integer(kind=ik)                            :: coltyp(na), idxq1(na) !, idxq2(na)


      integer(kind=ik)                            :: istat, debug

      character(200)                              :: errorMessage

      integer(kind=ik)                            :: gemm_dim_k, gemm_dim_l, gemm_dim_m


      integer(kind=c_intptr_t)                    :: num

      integer(kind=C_intptr_t)                    :: qtmp1_dev, qtmp1_tmp_dev, qtmp2_dev, ev_dev

      integer(kind=c_intptr_t)                    :: z1_dev, delta_dev, rho_dev

      integer(kind=c_intptr_t)                    :: d1u_dev, dbase_dev, ddiff_dev, zu_dev, ev_scale_dev

      integer(kind=c_intptr_t)                    :: d1l_dev, zl_dev, z_dev, d1_dev, ztmp_extended_dev

      integer(kind=c_intptr_t)                    :: idx1_dev, p_col_dev, coltyp_dev, p_col_out_dev, ndef_c_dev

      integer(kind=c_intptr_t)                    :: idxq1_dev, l_col_out_dev, idx_dev, idx2_dev, l_col_dev

      integer(kind=c_intptr_t)                    :: nnzul_dev

      integer(kind=c_intptr_t)                    :: tmp_dev, zero_dev, one_dev, qtrans_dev ! for transform_columns_gpu


      integer(kind=c_intptr_t)                    :: nnzu_val_dev, nnzl_val_dev

      logical                                     :: successGPU

      integer(kind=c_intptr_t), parameter         :: size_of_datatype = size_of_&

                                                                      &precision&

                                                                      &_real

      integer(kind=c_intptr_t)                    :: gpuHandle

      integer(kind=ik), intent(in)                :: max_threads

      integer(kind=c_intptr_t)                    :: my_stream

      integer(kind=ik)                            :: l_col_out_tmp

      integer(kind=ik), allocatable               :: nnzu_val(:,:), nnzl_val(:,:)

      integer(kind=ik)                            :: nnzul(2)


      integer(kind=ik)                            :: nnzu_start, nnzl_start


      integer(kind=ik), allocatable               :: ndef_c(:)


      integer(kind=ik) :: ii,jj, indx, ind_ex, ind_ex2, p_col_tmp, index2, counter1, counter2


      logical                                     :: useCCL

      integer(kind=c_intptr_t)                    :: ccl_comm_rows, ccl_comm_cols

      integer(kind=c_int)                         :: cclDataType


      call obj%timer%start("merge_systems" // precision_suffix)

      success = .true.


      call obj%timer%start("mpi_communication")

      call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind) ,my_prowmpi, mpierr)

      call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind) ,np_rowsmpi, mpierr)

      call mpi_comm_rank(int(mpi_comm_cols_self,kind=mpi_kind) ,my_pcolmpi, mpierr)

      call mpi_comm_size(int(mpi_comm_cols_self,kind=mpi_kind) ,np_colsmpi, mpierr)


      my_prow = int(my_prowmpi,kind=c_int)

      np_rows = int(np_rowsmpi,kind=c_int)

      my_pcol = int(my_pcolmpi,kind=c_int)

      np_cols = int(np_colsmpi,kind=c_int)


      call obj%timer%stop("mpi_communication")


      if (wantdebug) then

        debug = 1

      else

        debug = 0

      endif


      if (usegpu) then

        max_strip=128

      else

        max_strip=128

      endif

      if (wantdebug) print *, "max_strip = ", max_strip


      useccl = obj%gpu_setup%useCCL


      if (usegpu) then

#ifdef WITH_GPU_STREAMS

        my_stream = obj%gpu_setup%my_stream

#endif

        sm_count = obj%gpu_setup%gpuSMcount


        if (useccl) then

          my_stream = obj%gpu_setup%my_stream

          ccl_comm_rows = obj%gpu_setup%ccl_comm_rows

          ccl_comm_cols = obj%gpu_setup%ccl_comm_cols

#if defined(DOUBLE_PRECISION)

          ccldatatype = ccldouble

#endif

#if defined(SINGLE_PRECISION)

          ccldatatype = cclfloat

#endif

        endif ! useCCL

      endif ! useGPU


! #ifdef WITH_OPENMP_TRADITIONAL

!       allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errorMessage)

!       check_allocate("merge_systems: z_p",istat, errorMessage)

! #endif


      ! If my processor column isn't in the requested set, do nothing

      if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then

        call obj%timer%stop("merge_systems" // precision_suffix)

        return

      endif


      ! Determine number of "next" and "prev" column for ring sends

      if (my_pcol == npc_0+npc_n-1) then

        np_next = npc_0

      else

        np_next = my_pcol + 1

      endif


      if (my_pcol == npc_0) then

        np_prev = npc_0+npc_n-1

      else

        np_prev = my_pcol - 1

      endif


      call check_monotony_&

      &precision&

      &(obj, nm,d,'Input1',wantdebug, success)

      if (.not.(success)) then

        call obj%timer%stop("merge_systems" // precision_suffix)

        return

      endif


      call check_monotony_&

      &precision&

      &(obj,na-nm,d(nm+1),'Input2',wantdebug, success)

      if (.not.(success)) then

        call obj%timer%stop("merge_systems" // precision_suffix)

        return

      endif

      ! Get global number of processors and my processor number.

      ! Please note that my_proc does not need to match any real processor number,

      ! it is just used for load balancing some loops.


      n_procs = np_rows*npc_n

      my_proc = my_prow*npc_n + (my_pcol-npc_0) ! Row major


      ! Local limits of the rows of Q


      l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1) ! First row of Q

      l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1) ! Last row <= nm

      l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1) ! Last row of Q


      l_rnm  = l_rqm-l_rqs+1 ! Number of local rows <= nm

      l_rows = l_rqe-l_rqs+1 ! Total number of local rows


      ! My number of local columns


      l_cols = count(p_col(1:na)==my_pcol)


      ! Get max number of local columns


      max_local_cols = 0

      do np = npc_0, npc_0+npc_n-1

        max_local_cols = max(max_local_cols,count(p_col(1:na)==np))

      enddo


      if (usegpu) then

        num = na * size_of_int

        successgpu = gpu_malloc(ndef_c_dev, num)

        check_alloc_gpu("merge_systems: ndef_c_dev", successgpu)


        num = na * size_of_int

        successgpu = gpu_malloc(idx1_dev, num)

        check_alloc_gpu("merge_systems: idx1_dev", successgpu)


        num = na * size_of_int

        successgpu = gpu_malloc(p_col_dev, num)

        check_alloc_gpu("merge_systems: p_col_dev", successgpu)


        num = na * size_of_int

        successgpu = gpu_malloc(p_col_out_dev, num)

        check_alloc_gpu("merge_systems: p_col_out_dev", successgpu)


        num = na * size_of_int

        successgpu = gpu_malloc(coltyp_dev, num)

        check_alloc_gpu("merge_systems: coltyp_dev", successgpu)


        num = na * size_of_int

        successgpu = gpu_malloc(idx2_dev, num)

        check_alloc_gpu("merge_systems: idx2_dev", successgpu)


        num = na * size_of_int

        successgpu = gpu_malloc(l_col_dev, num)

        check_alloc_gpu("merge_systems: l_col_dev", successgpu)


        num = na * size_of_int

        successgpu = gpu_malloc(l_col_out_dev, num)

        check_alloc_gpu("merge_systems: l_col_out_dev", successgpu)


        num = (na) * size_of_datatype

        successgpu = gpu_malloc(z_dev, num)

        check_alloc_gpu("merge_systems: z_dev", successgpu)


        num = (na) * size_of_datatype

        successgpu = gpu_malloc(z1_dev, num)

        check_alloc_gpu("merge_systems: z1_dev", successgpu)


        num = (na) * size_of_datatype

        successgpu = gpu_malloc(d1_dev, num)

        check_alloc_gpu("merge_systems: d1_dev", successgpu)


        num = 1 * size_of_datatype

        successgpu = gpu_malloc(rho_dev, num)

        check_alloc_gpu("merge_systems: rho_dev", successgpu)


        num = (na) * size_of_datatype

        successgpu = gpu_malloc(d1u_dev, num)

        check_alloc_gpu("merge_systems: d1u_dev", successgpu)


        num = (na) * size_of_datatype

        successgpu = gpu_malloc(dbase_dev, num)

        check_alloc_gpu("merge_systems: dbase_dev", successgpu)


        num = (na) * size_of_datatype

        successgpu = gpu_malloc(ddiff_dev, num)

        check_alloc_gpu("merge_systems: ddiff_dev", successgpu)


        num = (na) * size_of_datatype

        successgpu = gpu_malloc(zu_dev, num)

        check_alloc_gpu("merge_systems: zu_dev", successgpu)


        num = (na) * size_of_datatype

        successgpu = gpu_malloc(ev_scale_dev, num)

        check_alloc_gpu("merge_systems: ev_scale_dev", successgpu)


        num = (na) * size_of_datatype

        successgpu = gpu_malloc(d1l_dev, num)

        check_alloc_gpu("merge_systems: d1l_dev", successgpu)


        num = (na) * size_of_datatype

        successgpu = gpu_malloc(zl_dev, num)

        check_alloc_gpu("merge_systems: zl_dev", successgpu)


        num = (l_rows) * size_of_datatype

        successgpu = gpu_malloc(tmp_dev, num)

        check_alloc_gpu("merge_systems: tmp_dev", successgpu)


        num = 1 * size_of_datatype

        successgpu = gpu_malloc(zero_dev, num)

        check_alloc_gpu("merge_systems: zero_dev", successgpu)


        num = 1 * size_of_datatype

        successgpu = gpu_malloc(one_dev, num)

        check_alloc_gpu("merge_systems: one_dev", successgpu)


        num = 4 * size_of_datatype

        successgpu = gpu_malloc(qtrans_dev, num)

        check_alloc_gpu("merge_systems: qtrans_dev", successgpu)


        num = na * size_of_int

#ifdef WITH_GPU_STREAMS

        successgpu = gpu_memcpy_async(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

#else

        successgpu = gpu_memcpy(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)

#endif

        check_memcpy_gpu("merge_systems: p_col_dev", successgpu)


        num = na * size_of_int

#ifdef WITH_GPU_STREAMS

        successgpu = gpu_memcpy_async(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

#else

        successgpu = gpu_memcpy(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)

#endif

        check_memcpy_gpu("merge_systems: l_col_dev", successgpu)


        num = 1 * size_of_datatype

        beta = 0.0_rk

#ifdef WITH_GPU_STREAMS

        successgpu = gpu_memcpy_async(zero_dev, int(loc(beta),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

#else

        successgpu = gpu_memcpy(zero_dev, int(loc(beta),kind=c_intptr_t), num, gpumemcpyhosttodevice)

#endif

        check_memcpy_gpu("merge_systems: zero_dev", successgpu)


        num = 1 * size_of_datatype

        beta = 1.0_rk

#ifdef WITH_GPU_STREAMS

        successgpu = gpu_memcpy_async(one_dev, int(loc(beta),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

#else

        successgpu = gpu_memcpy(one_dev, int(loc(beta),kind=c_intptr_t), num, gpumemcpyhosttodevice)

#endif

        check_memcpy_gpu("merge_systems: one_dev", successgpu)


      endif ! useGPU


      ! Calculations start here


      beta = abs(e)

      sig  = sign(1.0_rk,e)


      ! Calculate rank-1 modifier z

      if (usegpu) then

        num = na * size_of_datatype

#ifdef WITH_GPU_STREAMS

        successgpu = gpu_memset_async(z_dev, 0, num, my_stream)

#else

        successgpu = gpu_memset(z_dev, 0, num)

#endif

        check_memcpy_gpu("merge_systems: memset z_dev", successgpu)

      else

        z(:) = 0

      endif


      if (mod((nqoff+nm-1)/nblk,np_rows)==my_prow) then

        ! nm is local on my row

        if (usegpu) then

          sig_int = 1

          nvtx_range_push("gpu_fill_z_kernel")

          call gpu_fill_z(precision_char, z_dev, q_dev, p_col_dev, l_col_dev, &

                          sig_int, na, my_pcol, l_rqm, matrixrows, sm_count, debug, my_stream)

          nvtx_range_pop("gpu_fill_z_kernel")

        else

          do i = 1, na

            if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i))

          enddo

        endif

      endif


      if (mod((nqoff+nm)/nblk,np_rows)==my_prow) then

        ! nm+1 is local on my row


        if (usegpu) then

          if (sig>0) then

            sig_int = 1

          else

            sig_int = -1

          endif


          nvtx_range_push("gpu_fill_z_kernel")

          call gpu_fill_z(precision_char, z_dev, q_dev, p_col_dev, l_col_dev, &

                          sig_int, na, my_pcol, l_rqm+1, matrixrows, sm_count, debug, my_stream)

          nvtx_range_pop("gpu_fill_z_kernel")

        else

          do i = 1, na

            if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i))

          enddo

        endif

      endif


      if (usegpu) then

        num = na * size_of_datatype

#ifdef WITH_GPU_STREAMS

        successgpu = gpu_memcpy_async(int(loc(z(1)),kind=c_intptr_t), z_dev, num, gpumemcpydevicetohost, my_stream)

#else

        successgpu = gpu_memcpy(int(loc(z(1)),kind=c_intptr_t), z_dev, num, gpumemcpydevicetohost)

#endif

        check_memcpy_gpu("merge_systems: z_dev", successgpu)

      endif


      call global_gather_&

      &precision&

      &(obj, z, na, mpi_comm_rows, mpi_comm_cols_self, npc_n, np_prev, np_next, success)

      ! global_gather reported a failure: log it and leave the routine.
      if (.not.(success)) then

        write(error_unit,*) "Error in global_gather. Aborting"

        success = .false.

        ! Stop the timer started at routine entry, as every other early-return
        ! path in this routine does; otherwise the timer region stays open.
        ! NOTE(review): device buffers allocated earlier when usegpu is set are not
        ! released on this path -- confirm against the cleanup code at the end of the routine.
        call obj%timer%stop("merge_systems" // precision_suffix)

        return

      endif


      ! Normalize z so that norm(z) = 1.  Since z is the concatenation of

      ! two normalized vectors, norm2(z) = sqrt(2).

      z = z/sqrt(2.0_rk)

      rho = 2.0_rk*beta

      ! Calculate index for merging both systems by ascending eigenvalues

      call obj%timer%start("lapack_lamrg")

      nvtx_range_push("lapack_lamrg_1")

      call precision_lamrg( int(nm,kind=blas_kind), int(na-nm,kind=blas_kind), d, &

                            1_blas_kind, 1_blas_kind, idxblas )

      nvtx_range_pop("lapack_lamrg_1")

      idx(:) = int(idxblas(:),kind=ik)

      call obj%timer%stop("lapack_lamrg")


      ! Calculate the allowable deflation tolerance


      zmax = maxval(abs(z))

      dmax = maxval(abs(d))

      eps = precision_lamch( 'E' ) ! return epsilon

      tol = 8.0_rk*eps*max(dmax,zmax)


      ! If the rank-1 modifier is small enough, no more needs to be done

      ! except to reorganize D and Q


      IF ( rho*zmax <= tol ) THEN

        ! Rearrange eigenvalues

        tmp = d

        do i=1,na

          d(i) = tmp(idx(i))

        enddo


        ! Rearrange eigenvectors

        if (usegpu) then

          nvtx_range_push("resort_ev_gpu")

          call obj%timer%start("resort_ev_gpu")

          call resort_ev_gpu_&

                              &precision&

                              (obj, idx, na, na, p_col_out, q_dev, matrixrows, matrixcols, l_rows, l_rqe, &

                              l_rqs, mpi_comm_cols_self, p_col, l_col, l_col_out)

          call obj%timer%stop("resort_ev_gpu")

          nvtx_range_pop("resort_ev_gpu")

        else ! useGPU

          call resort_ev_cpu_&

                             &precision&

                             (obj, idx, na, na, p_col_out, q    , matrixrows, matrixcols, l_rows, l_rqe, &

                              l_rqs, mpi_comm_cols_self, p_col, l_col, l_col_out)

        endif ! useGPU


        call obj%timer%stop("merge_systems" // precision_suffix)


        if (wantdebug) write(error_unit,*) "Returing early from merge_systems (RHO*zmax <= TOL): matrix is block-diagonal"

        ! tested by validate_real_double_solve_tridiagonal_1stage_blocktridi


        return

      ENDIF


      ! Merge and deflate system


      na1 = 0

      na2 = 0


      ! COLTYP:

      ! 1 : non-zero in the upper half only;

      ! 2 : dense;

      ! 3 : non-zero in the lower half only;

      ! 4 : deflated.


      coltyp(1:nm) = 1

      coltyp(nm+1:na) = 3


      nvtx_range_push("deflation_loop")

      do i=1,na


        if (rho*abs(z(idx(i))) <= tol) then


          ! Deflate due to small z component.


          na2 = na2+1

          d2(na2)   = d(idx(i))

          idx2(na2) = idx(i)

          coltyp(idx(i)) = 4


        else if (na1>0) then


          ! Check if eigenvalues are close enough to allow deflation.


          s = z(idx(i))

          c = z1(na1)


          ! Find TAU = sqrt(a**2+b**2) without overflow or

          ! destructive underflow.

          tau = precision_lapy2( c, s )

          t = d1(na1) - d(idx(i))

          c = c / tau

          s = -s / tau

          IF ( abs( t*c*s ) <= tol ) THEN


            ! Deflation is possible.


            na2 = na2+1


            z1(na1) = tau


            d2new = d(idx(i))*c**2 + d1(na1)*s**2

            d1new = d(idx(i))*s**2 + d1(na1)*c**2


            ! D(idx(i)) >= D1(na1) and C**2 + S**2 == 1.0

            ! This means that after the above transformation it must be

            !    D1(na1) <= d1new <= D(idx(i))

            !    D1(na1) <= d2new <= D(idx(i))

            !

            ! D1(na1) may get bigger but it is still smaller than the next D(idx(i+1))

            ! so there is no problem with sorting here.

            ! d2new <= D(idx(i)) which means that it might be smaller than D2(na2-1)

            ! which makes a check (and possibly a resort) necessary.

            !

            ! The above relations may not hold exactly due to numeric differences

            ! so they have to be enforced in order not to get troubles with sorting.


            if (d1new<d1(na1)  ) d1new = d1(na1)

            if (d1new>d(idx(i))) d1new = d(idx(i))


            if (d2new<d1(na1)  ) d2new = d1(na1)

            if (d2new>d(idx(i))) d2new = d(idx(i))


            d1(na1) = d1new


            do j=na2-1,1,-1

              if (d2new<d2(j)) then

                d2(j+1)   = d2(j)

                idx2(j+1) = idx2(j)

              else

                exit ! Loop

              endif

            enddo


            d2(j+1)   = d2new

            idx2(j+1) = idx(i)


            qtrans(1,1) = c; qtrans(1,2) =-s

            qtrans(2,1) = s; qtrans(2,2) = c


            nvtx_range_push("transform_columns")

            if (usegpu) then

#ifdef WITH_GPU_STREAMS

              successgpu = gpu_memcpy_async(qtrans_dev, int(loc(qtrans(1,1)),kind=c_intptr_t), &

                                            4*size_of_datatype, gpumemcpyhosttodevice, my_stream)

              if (wantdebug) successgpu = gpu_devicesynchronize()

#else

              successgpu = gpu_memcpy(qtrans_dev, int(loc(qtrans(1,1)),kind=c_intptr_t), &

                                      4*size_of_datatype, gpumemcpyhosttodevice)

#endif

              check_memcpy_gpu("transform_columns: q_dev", successgpu)


              call transform_columns_gpu_&

                                         &precision &

                                        (obj, idx(i), idx1(na1), na, tmp, l_rqs, l_rqe, &

                                          q_dev, matrixrows, matrixcols, l_rows, mpi_comm_cols_self, &

                                          p_col, l_col, qtrans_dev, &

                                          tmp_dev, zero_dev, one_dev, debug, my_stream)

            else

              call transform_columns_cpu_&

                                        &precision &

                                        (obj, idx(i), idx1(na1), na, tmp, l_rqs, l_rqe, &

                                          q    , matrixrows, matrixcols, l_rows, mpi_comm_cols_self, &

                                          p_col, l_col, qtrans)

            endif

            nvtx_range_pop("transform_columns")


            if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2

            if (coltyp(idx(i))==3 .and. coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2


            coltyp(idx(i)) = 4


          else

            na1 = na1+1

            d1(na1) = d(idx(i))

            z1(na1) = z(idx(i))

            idx1(na1) = idx(i)

          endif

        else

          na1 = na1+1

          d1(na1) = d(idx(i))

          z1(na1) = z(idx(i))

          idx1(na1) = idx(i)

        endif


      enddo ! do i=1,na

      nvtx_range_pop("deflation_loop")


      call check_monotony_&

      &precision&

      &(obj, na1,d1,'Sorted1', wantdebug, success)

      if (.not.(success)) then

        call obj%timer%stop("merge_systems" // precision_suffix)

        return

      endif

      call check_monotony_&

      &precision&

      &(obj, na2,d2,'Sorted2', wantdebug, success)

      if (.not.(success)) then

        call obj%timer%stop("merge_systems" // precision_suffix)

        return

      endif


      if (na1==1 .or. na1==2) then

        ! if(my_proc==0) print *,'--- Remark solve_tridi: na1==',na1,' proc==',myid


        if (na1==1) then

          d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation

        else ! na1==2

          call obj%timer%start("lapack_laed5_x2")

          nvtx_range_push("lapack_laed5_x2")

          call precision_laed5(1_blas_kind, d1, z1, qtrans(1,1), rho, d(1))

          call precision_laed5(2_blas_kind, d1, z1, qtrans(1,2), rho, d(2))

          nvtx_range_pop("lapack_laed5_x2")

          call obj%timer%stop("lapack_laed5_x2")


          if (usegpu) then

#ifdef WITH_GPU_STREAMS

            successgpu = gpu_memcpy_async(qtrans_dev, int(loc(qtrans(1,1)),kind=c_intptr_t), &

                                          4*size_of_datatype, gpumemcpyhosttodevice, my_stream)

            if (wantdebug) successgpu = gpu_devicesynchronize()

#else

            successgpu = gpu_memcpy(qtrans_dev, int(loc(qtrans(1,1)),kind=c_intptr_t), &

                                    4*size_of_datatype, gpumemcpyhosttodevice)

#endif

            check_memcpy_gpu("transform_columns: q_dev", successgpu)


            call transform_columns_gpu_&

                                        &precision &

                                      (obj, idx1(1), idx1(2), na, tmp, l_rqs, l_rqe, &

                                        q_dev, matrixrows, matrixcols, l_rows, mpi_comm_cols_self, &

                                        p_col, l_col, qtrans_dev, &

                                        tmp_dev, zero_dev, one_dev, debug, my_stream)

          else

            call transform_columns_cpu_&

                                       &precision&

                                       & (obj, idx1(1), idx1(2), na, tmp, l_rqs, l_rqe, q, &

                                          matrixrows, matrixcols, l_rows, mpi_comm_cols_self, &

                                          p_col, l_col, qtrans)

          endif

        endif ! na1==2


        ! Add the deflated eigenvalues

        d(na1+1:na) = d2(1:na2)


        ! Calculate arrangement of all eigenvalues  in output

        call obj%timer%start("lapack_lamrg")

        nvtx_range_push("lapack_lamrg_2")

        call precision_lamrg( int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &

                              1_blas_kind, 1_blas_kind, idxblas )

        nvtx_range_pop("lapack_lamrg_2")

        idx(:) = int(idxblas(:),kind=ik)

        call obj%timer%stop("lapack_lamrg")

        ! Rearrange eigenvalues


        tmp = d

        do i=1,na

          d(i) = tmp(idx(i))

        enddo


        ! Rearrange eigenvectors


        do i=1,na

          if (idx(i)<=na1) then

            idxq1(i) = idx1(idx(i))

          else

            idxq1(i) = idx2(idx(i)-na1)

          endif

        enddo


        if (usegpu) then

          call resort_ev_gpu_&

                         &precision&

                         &(obj, idxq1, na, na, p_col_out, q_dev, matrixrows, matrixcols, l_rows, l_rqe, &

                           l_rqs, mpi_comm_cols_self, p_col, l_col, l_col_out)

        else

          call resort_ev_cpu_&

                         &precision&

                         &(obj, idxq1, na, na, p_col_out, q    , matrixrows, matrixcols, l_rows, l_rqe, &

                           l_rqs, mpi_comm_cols_self, p_col, l_col, l_col_out)

        endif


        ! Diagnostic message for the na1==1 / na1==2 branch.
        ! The ".or." belongs inside the string literal; placed in front of it, it is
        ! parsed as a unary operator on a character value and does not compile.
        ! NOTE(review): the other "Returing early" message in this routine is guarded
        ! by wantdebug; this one is unconditional -- confirm that this is intended.
        write(error_unit,*) "Returing early from merge_systems (na1==1 .or. na1==2)"

        ! na1=1 can be tested with "mpirun -n 4 ./validate_real_double_solve_tridiagonal_1stage_gpu_blocktridi 3 3 1"

        ! na1=2 can be tested with "mpirun -n 4 ./validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz 4 4 2"

      else if (na1>2) then


        ! Solve secular equation


        if (usegpu) then

          num = (na1*sm_count) * size_of_datatype

          ! Allocate the device buffer ztmp_extended_dev (na1*sm_count elements, see num above).
          successgpu = gpu_malloc(ztmp_extended_dev, num)

          ! The label names the buffer actually being allocated, so a failure
          ! is reported against ztmp_extended_dev rather than delta_dev.
          check_alloc_gpu("merge_systems: ztmp_extended_dev", successgpu)


          call gpu_fill_array(precision_char, ztmp_extended_dev, one_dev, na1*sm_count, sm_count, debug, my_stream)


          call gpu_fill_array(precision_char, z_dev, one_dev, na1, sm_count, debug, my_stream)


          num = na1 * size_of_datatype

#ifdef WITH_GPU_STREAMS

          successgpu = gpu_memset_async(dbase_dev, 0, num, my_stream)

#else

          successgpu = gpu_memset(dbase_dev, 0, num)

#endif

          check_memcpy_gpu("merge_systems: memset dbase_dev", successgpu)


          num = na1 * size_of_datatype

#ifdef WITH_GPU_STREAMS

          successgpu = gpu_memset_async(ddiff_dev, 0, num, my_stream)

#else

          successgpu = gpu_memset(ddiff_dev, 0, num)

#endif

          check_memcpy_gpu("merge_systems: memset ddiff_dev", successgpu)

        else

          z(1:na1) = 1

! #ifdef WITH_OPENMP_TRADITIONAL

!           z_p(1:na1,:) = 1

! #endif

          dbase(1:na1) = 0

          ddiff(1:na1) = 0

        endif


        nvtx_range_push("lapack_laed4_loop")


        if (usegpu) then

          ! data transfer to GPU

#ifdef WITH_GPU_STREAMS

          num = na * size_of_datatype

          successgpu = gpu_memcpy_async(d1_dev, int(loc(d1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

          check_memcpy_gpu("merge_systems: d1_dev", successgpu)


          successgpu = gpu_memcpy_async(z1_dev, int(loc(z1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

          check_memcpy_gpu("merge_systems: z1_dev", successgpu)


          num = 1 * size_of_datatype

          successgpu = gpu_memcpy_async(rho_dev, int(loc(rho),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

          check_memcpy_gpu("merge_systems: rho_dev", successgpu)


#else

          num = na * size_of_datatype

          successgpu = gpu_memcpy(d1_dev, int(loc(d1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)

          check_memcpy_gpu("merge_systems: d1_dev", successgpu)


          successgpu = gpu_memcpy(z1_dev, int(loc(z1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)

          check_memcpy_gpu("merge_systems: z1_dev", successgpu)


          num = 1 * size_of_datatype

          successgpu = gpu_memcpy(rho_dev, int(loc(rho),kind=c_intptr_t), num, gpumemcpyhosttodevice)

          check_memcpy_gpu("merge_systems: rho_dev", successgpu)

#endif


          ! delta_dev is a temporary buffer, not used afterwards

          num = (na1*sm_count) * size_of_datatype

          successgpu = gpu_malloc(delta_dev, num)

          check_alloc_gpu("merge_systems: delta_dev", successgpu)


          call gpu_solve_secular_equation_loop (precision_char, d1_dev, z1_dev, delta_dev, rho_dev, &

                  ztmp_extended_dev, dbase_dev, ddiff_dev, my_proc, na1, n_procs, sm_count, debug, my_stream)


          call gpu_local_product(precision_char, z_dev, ztmp_extended_dev, na1, sm_count, debug, my_stream)


          successgpu = gpu_free(delta_dev)

          check_dealloc_gpu("merge_systems: delta_dev", successgpu)


          ! data transfer back to CPU

#ifdef WITH_GPU_STREAMS

          num = na * size_of_datatype

          successgpu = gpu_memcpy_async(int(loc(d1(1)),kind=c_intptr_t), d1_dev, num, gpumemcpydevicetohost, my_stream)

          check_memcpy_gpu("merge_systems: d1_dev", successgpu)


          successgpu = gpu_memcpy_async(int(loc(z1(1)),kind=c_intptr_t), z1_dev, num, gpumemcpydevicetohost, my_stream)

          check_memcpy_gpu("merge_systems: z1_dev", successgpu)


          successgpu = gpu_memcpy_async(int(loc(dbase(1)),kind=c_intptr_t), dbase_dev, num, gpumemcpydevicetohost, my_stream)

          check_memcpy_gpu("merge_systems: dbase_dev", successgpu)


          successgpu = gpu_memcpy_async(int(loc(ddiff(1)),kind=c_intptr_t), ddiff_dev, num, gpumemcpydevicetohost, my_stream)

          check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)


          successgpu = gpu_memcpy_async(int(loc(z(1)),kind=c_intptr_t), z_dev, num, gpumemcpydevicetohost, my_stream)

          check_memcpy_gpu("merge_systems: z_dev", successgpu)


          num = 1 * size_of_datatype

          successgpu = gpu_memcpy_async(int(loc(rho),kind=c_intptr_t), rho_dev, num, gpumemcpydevicetohost, my_stream)

          check_memcpy_gpu("merge_systems: rho_dev", successgpu)

#else

          num = na * size_of_datatype

          successgpu = gpu_memcpy(int(loc(d1(1)),kind=c_intptr_t), d1_dev, num, gpumemcpydevicetohost)

          check_memcpy_gpu("merge_systems: d1_dev", successgpu)


          successgpu = gpu_memcpy(int(loc(z1(1)),kind=c_intptr_t), z1_dev, num, gpumemcpydevicetohost)

          check_memcpy_gpu("merge_systems: z1_dev", successgpu)


          successgpu = gpu_memcpy(int(loc(dbase(1)),kind=c_intptr_t), dbase_dev, num, gpumemcpydevicetohost)

          check_memcpy_gpu("merge_systems: dbase_dev", successgpu)


          successgpu = gpu_memcpy(int(loc(ddiff(1)),kind=c_intptr_t), ddiff_dev, num, gpumemcpydevicetohost)

          check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)


          successgpu = gpu_memcpy(int(loc(z(1)),kind=c_intptr_t), z_dev, num, gpumemcpydevicetohost)

          check_memcpy_gpu("merge_systems: z_dev", successgpu)


          num = 1 * size_of_datatype

          successgpu = gpu_memcpy(int(loc(rho),kind=c_intptr_t), rho_dev, num, gpumemcpydevicetohost)

          check_memcpy_gpu("merge_systems: rho_dev", successgpu)

#endif

        else

!        info = 0

!        infoBLAS = int(info,kind=BLAS_KIND)

!#ifdef WITH_OPENMP_TRADITIONAL

!

!        call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)

!!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,infoBLAS,j)

!        my_thread = omp_get_thread_num()

!!$OMP DO

!#endif

          do i = my_proc+1, na1, n_procs ! work distributed over all processors

            call obj%timer%start("lapack_laed4")

            nvtx_range_push("lapack_laed4")

            call precision_laed4(int(na1,kind=blas_kind), int(i,kind=blas_kind), d1, z1, delta, &

                                rho, s, infoblas) ! s is not used!

            info = int(infoblas,kind=ik)

            nvtx_range_pop("lapack_laed4")

            call obj%timer%stop("lapack_laed4")

            if (info/=0) then

              ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2)

              ! use the more stable bisection algorithm in solve_secular_equation

              ! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection'

              call solve_secular_equation_&

                                &precision&

                                &(obj, na1, i, d1, z1, delta, rho, s) ! s is not used!

            endif


            ! Compute updated z


  !#ifdef WITH_OPENMP_TRADITIONAL

  !          do j=1,na1

  !            if (i/=j)  z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) )

  !          enddo

  !          z_p(i,my_thread) = z_p(i,my_thread)*delta(i)

  !#else

            do j=1,na1

              if (i/=j)  z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) )

            enddo

            z(i) = z(i)*delta(i)

  !#endif

            ! Store dbase/ddiff


            if (i<na1) then

              if (abs(delta(i+1)) < abs(delta(i))) then

                dbase(i) = d1(i+1)

                ddiff(i) = delta(i+1)

              else

                dbase(i) = d1(i)

                ddiff(i) = delta(i)

              endif

            else

              dbase(i) = d1(i)

              ddiff(i) = delta(i)

            endif

          enddo ! i = my_proc+1, na1, n_procs

        endif ! useGPU

        nvtx_range_pop("lapack_laed4_loop")


!#ifdef WITH_OPENMP_TRADITIONAL

!!$OMP END PARALLEL

!

!        call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)

!

!        do i = 0, max_threads-1

!          z(1:na1) = z(1:na1)*z_p(1:na1,i)

!        enddo

!#endif


        ! Combine the locally computed z contributions across processes.
        nvtx_range_push("global_product")
        call global_product_&
                  &precision&
                  (obj, z, na1, mpi_comm_rows, mpi_comm_cols_self, npc_0, npc_n, success)
        if (.not.(success)) then
          ! Leave profiling state balanced before bailing out: close the NVTX range
          ! opened above and stop the routine timer, as the check_monotony failure
          ! path of this routine does.
          ! NOTE(review): on the GPU path ztmp_extended_dev is still allocated here
          ! and is not released by this early return - confirm whether that matters.
          nvtx_range_pop("global_product")
          call obj%timer%stop("merge_systems" // precision_suffix)
          write(error_unit,*) "Error in global_product. Aborting..."
          return
        endif
        nvtx_range_pop("global_product")


        z(1:na1) = sign( sqrt( abs( z(1:na1) ) ), z1(1:na1) )


        ! Gather dbase and ddiff (one call each) under a single NVTX range.
        nvtx_range_push("global_gather_x2")
        call global_gather_&
        &precision&
        &(obj, dbase, na1, mpi_comm_rows, mpi_comm_cols_self, npc_n, np_prev, np_next, success)
        if (.not.(success)) then
          ! Close the NVTX range and stop the routine timer before the early return,
          ! as the check_monotony failure path of this routine does.
          ! NOTE(review): on the GPU path ztmp_extended_dev is still allocated here
          ! and is not released by this early return - confirm whether that matters.
          nvtx_range_pop("global_gather_x2")
          call obj%timer%stop("merge_systems" // precision_suffix)
          write(error_unit,*) "Error in global_gather. Aborting..."
          return
        endif

        call global_gather_&
        &precision&
        &(obj, ddiff, na1, mpi_comm_rows, mpi_comm_cols_self, npc_n, np_prev, np_next, success)
        if (.not.(success)) then
          ! Same cleanup as for the dbase gather above.
          nvtx_range_pop("global_gather_x2")
          call obj%timer%stop("merge_systems" // precision_suffix)
          write(error_unit,*) "Error in global_gather. Aborting..."
          return
        endif
        nvtx_range_pop("global_gather_x2")


        d(1:na1) = dbase(1:na1) - ddiff(1:na1)


        ! Calculate scale factors for eigenvectors

        if (usegpu) then

          num = na * size_of_datatype

#ifdef WITH_GPU_STREAMS

          successgpu = gpu_memset_async(ev_scale_dev, 0, num, my_stream)

#else

          successgpu = gpu_memset(ev_scale_dev, 0, num)

#endif

          check_memcpy_gpu("merge_systems: memset ev_scale_dev", successgpu)

        else  ! useGPU

          ev_scale(:) = 0.0_rk

        endif ! useGPU


        nvtx_range_push("add_tmp_loop")

        if (wantdebug) call obj%timer%start("add_tmp_loop")


        if (usegpu) then

          ! data transfer to GPU

          num = na * size_of_datatype

#ifdef WITH_GPU_STREAMS

          successgpu = gpu_memcpy_async(d1_dev, int(loc(d1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

          check_memcpy_gpu("merge_systems: d1_dev", successgpu)


          successgpu = gpu_memcpy_async(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

          check_memcpy_gpu("merge_systems: dbase_dev", successgpu)


          successgpu = gpu_memcpy_async(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

          check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)


          successgpu = gpu_memcpy_async(z_dev, int(loc(z(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

          check_memcpy_gpu("merge_systems: z_dev", successgpu)

#else

          successgpu = gpu_memcpy(d1_dev, int(loc(d1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)

          check_memcpy_gpu("merge_systems: d1_dev", successgpu)


          successgpu = gpu_memcpy(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)

          check_memcpy_gpu("merge_systems: dbase_dev", successgpu)


          successgpu = gpu_memcpy(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)

          check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)


          successgpu = gpu_memcpy(z_dev, int(loc(z(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)

          check_memcpy_gpu("merge_systems: z_dev", successgpu)

#endif


          call gpu_add_tmp_loop(precision_char, d1_dev, dbase_dev, ddiff_dev, z_dev, ev_scale_dev, ztmp_extended_dev, &

                                na1, my_proc, n_procs, sm_count, debug, my_stream)


          successgpu = gpu_free(ztmp_extended_dev)

          check_dealloc_gpu("merge_systems: ztmp_extended_dev", successgpu)


          ! data transfer back to CPU

          num = na * size_of_datatype

#ifdef WITH_GPU_STREAMS

          successgpu = gpu_memcpy_async(int(loc(d1(1)),kind=c_intptr_t), d1_dev, num, gpumemcpydevicetohost, my_stream)

          check_memcpy_gpu("merge_systems: d1_dev", successgpu)


          successgpu = gpu_memcpy_async(int(loc(dbase(1)),kind=c_intptr_t), dbase_dev, num, gpumemcpydevicetohost, my_stream)

          check_memcpy_gpu("merge_systems: dbase_dev", successgpu)


          successgpu = gpu_memcpy_async(int(loc(ddiff(1)),kind=c_intptr_t), ddiff_dev, num, gpumemcpydevicetohost, my_stream)

          check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)


          successgpu = gpu_memcpy_async(int(loc(z1(1)),kind=c_intptr_t), z1_dev, num, gpumemcpydevicetohost, my_stream)

          check_memcpy_gpu("merge_systems: z1_dev", successgpu)


          successgpu = gpu_memcpy_async(int(loc(ev_scale(1)),kind=c_intptr_t), ev_scale_dev, num, gpumemcpydevicetohost, my_stream)

          check_memcpy_gpu("merge_systems: ev_scale_dev", successgpu)

#else

          successgpu = gpu_memcpy(int(loc(d1(1)),kind=c_intptr_t), d1_dev, num, gpumemcpydevicetohost)

          check_memcpy_gpu("merge_systems: d1_dev", successgpu)


          successgpu = gpu_memcpy(int(loc(dbase(1)),kind=c_intptr_t), dbase_dev, num, gpumemcpydevicetohost)

          check_memcpy_gpu("merge_systems: dbase_dev", successgpu)


          successgpu = gpu_memcpy(int(loc(ddiff(1)),kind=c_intptr_t), ddiff_dev, num, gpumemcpydevicetohost)

          check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)


          successgpu = gpu_memcpy(int(loc(z1(1)),kind=c_intptr_t), z1_dev, num, gpumemcpydevicetohost)

          check_memcpy_gpu("merge_systems: z1_dev", successgpu)


          successgpu = gpu_memcpy(int(loc(ev_scale(1)),kind=c_intptr_t), ev_scale_dev, num, gpumemcpydevicetohost)

          check_memcpy_gpu("merge_systems: ev_scale_dev", successgpu)

#endif

        else

#ifdef WITH_OPENMP_TRADITIONAL

          call obj%timer%start("OpenMP parallel" // precision_suffix)


!$omp PARALLEL DO &

!$omp default(none) &

!$omp private(i) &

!$omp SHARED(na1, my_proc, n_procs,  &

!$OMP d1, dbase, ddiff, z, ev_scale, obj)

#endif

          do i = my_proc+1, na1, n_procs ! work distributed over all processors


            ! tmp(1:na1) = z(1:na1) / delta(1:na1,i)  ! original code

            ! tmp(1:na1) = z(1:na1) / (d1(1:na1)-d(i))! bad results


            ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i)

            ! in exactly this order, but we want to prevent compiler optimization

  !         ev_scale_val = ev_scale(i)

            call add_tmp_&

            &precision&

            &(obj, d1, dbase, ddiff, z, ev_scale(i), na1, i)

  !         ev_scale(i) = ev_scale_val

          enddo

#ifdef WITH_OPENMP_TRADITIONAL

!$OMP END PARALLEL DO


          call obj%timer%stop("OpenMP parallel" // precision_suffix)

#endif

        endif ! useGPU


        if (wantdebug) call obj%timer%stop("add_tmp_loop")

        nvtx_range_pop("add_tmp_loop")


        ! Gather the eigenvector scale factors computed in the add_tmp loop.
        nvtx_range_push("global_gather")
        call global_gather_&
                  &precision&
                  &(obj, ev_scale, na1, mpi_comm_rows, mpi_comm_cols_self, npc_n, np_prev, np_next, success)
        if (.not.(success)) then
          ! Close the NVTX range and stop the routine timer before the early return,
          ! as the check_monotony failure path of this routine does.
          nvtx_range_pop("global_gather")
          call obj%timer%stop("merge_systems" // precision_suffix)
          write(error_unit,*) "Error in global_gather. Aborting..."
          return
        endif
        nvtx_range_pop("global_gather")


        ! Add the deflated eigenvalues

        d(na1+1:na) = d2(1:na2)


        call obj%timer%start("lapack_lamrg")

        nvtx_range_push("lapack_lamrg_3")

        ! Calculate arrangement of all eigenvalues  in output

        call precision_lamrg(int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &

                             1_blas_kind, 1_blas_kind, idxblas )

        nvtx_range_pop("lapack_lamrg_3")

        idx(:) = int(idxblas(:),kind=ik)

        call obj%timer%stop("lapack_lamrg")


        ! Rearrange eigenvalues

        tmp = d

        do i=1,na

          d(i) = tmp(idx(i))

        enddo

        call check_monotony_&

        &precision&

        &(obj, na,d,'Output', wantdebug, success)


        if (.not.(success)) then

          call obj%timer%stop("merge_systems" // precision_suffix)

          write(error_unit,*) "Error in check_monotony. Aborting..."

          return

        endif

        ! Eigenvector calculations

        ! GPU only: allocate the device index buffers and upload idx.
        ! Every check names its buffer so a failure can be attributed
        ! (three of them previously reported the bare "merge_systems: ").
        if (usegpu) then
          num = 2 * size_of_int
          successgpu = gpu_malloc(nnzul_dev, num) ! packs together nnzu and nnzl
          check_alloc_gpu("merge_systems: nnzul_dev", successgpu)

          num = na * size_of_int
          successgpu = gpu_malloc(idxq1_dev, num)
          check_alloc_gpu("merge_systems: idxq1_dev", successgpu)

          num = na * size_of_int
          successgpu = gpu_malloc(idx_dev, num)
          check_alloc_gpu("merge_systems: idx_dev", successgpu)

          ! host idx(1:na) -> idx_dev
          num = na * size_of_int
#ifdef WITH_GPU_STREAMS
          my_stream = obj%gpu_setup%my_stream
          successgpu = gpu_memcpy_async(idx_dev, int(loc(idx(1)),kind=c_intptr_t), &
                             num, gpumemcpyhosttodevice, my_stream)
          check_memcpy_gpu("merge_systems: idx_dev", successgpu)
#else
          successgpu = gpu_memcpy(idx_dev, int(loc(idx(1)),kind=c_intptr_t), &
                             num, gpumemcpyhosttodevice)
          check_memcpy_gpu("merge_systems: idx_dev", successgpu)
#endif
        endif


        ! Calculate the number of columns in the new local matrix Q

        ! which are updated from non-deflated/deflated eigenvectors.

        ! idxq1/2 stores the global column numbers.


        !if (useGPU) then


        !  !nqcols1 is needed later on host !!

        !  ! memcopy back needed!!

        !else

          nqcols1 = 0 ! number of non-deflated eigenvectors

          !nqcols2 = 0 ! number of deflated eigenvectors

          nvtx_range_push("loop_idxq1")

          DO i = 1, na

            if (p_col_out(i)==my_pcol) then

              if (idx(i)<=na1) then

                nqcols1 = nqcols1+1

                idxq1(nqcols1) = i

              !else

                !nqcols2 = nqcols2+1

                !idxq2(nqcols2) = i

              endif

            endif

          enddo

          nvtx_range_pop("loop_idxq1")

        !endif


        ! GPU only: upload the idxq1 list built by the host loop above to idxq1_dev.
        if (usegpu) then
          num = na * size_of_int
#ifdef WITH_GPU_STREAMS
          my_stream = obj%gpu_setup%my_stream
          successgpu = gpu_memcpy_async(idxq1_dev, int(loc(idxq1(1)),kind=c_intptr_t), &
                             num, gpumemcpyhosttodevice, my_stream)
          ! same label as the non-stream branch (was the bare "merge_systems: ")
          check_memcpy_gpu("merge_systems: idxq1_dev", successgpu)
#else
          successgpu = gpu_memcpy(idxq1_dev, int(loc(idxq1(1)),kind=c_intptr_t), &
                             num, gpumemcpyhosttodevice)
          check_memcpy_gpu("merge_systems: idxq1_dev", successgpu)
#endif
        endif


        if (usegpu) then

          allocate(ndef_c(na), stat=istat, errmsg=errormessage)

          check_allocate("merge_systems: ndef_c",istat, errormessage)

        endif


        gemm_dim_k = max(1,l_rows)

        gemm_dim_l = max_local_cols

        gemm_dim_m = min(max_strip, max(1,nqcols1))


        if (.not. useccl) then

          allocate(qtmp1(gemm_dim_k, gemm_dim_l), stat=istat, errmsg=errormessage)

          check_allocate("merge_systems: qtmp1",istat, errormessage)


          allocate(ev(gemm_dim_l,gemm_dim_m), stat=istat, errmsg=errormessage)

          check_allocate("merge_systems: ev",istat, errormessage)


          allocate(qtmp2(gemm_dim_k, gemm_dim_m), stat=istat, errmsg=errormessage)

          check_allocate("merge_systems: qtmp2",istat, errormessage)

        endif


        if (usegpu) then

          num = (gemm_dim_k * gemm_dim_l) * size_of_datatype

          successgpu = gpu_malloc(qtmp1_dev, num)

          check_alloc_gpu("merge_systems: qtmp1_dev", successgpu)


          num = (gemm_dim_k * gemm_dim_l) * size_of_datatype

          successgpu = gpu_malloc(qtmp1_tmp_dev, num)

          check_alloc_gpu("merge_systems: qtmp1_tmp_dev", successgpu)


          num = (gemm_dim_l * gemm_dim_m) * size_of_datatype

          successgpu = gpu_malloc(ev_dev, num)

          check_alloc_gpu("merge_systems: ev_dev", successgpu)


          num = (gemm_dim_k * gemm_dim_m) * size_of_datatype

          successgpu = gpu_malloc(qtmp2_dev, num)

          check_alloc_gpu("merge_systems: qtmp2_dev", successgpu)


          if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then

            if (wantdebug) call obj%timer%start("gpu_host_register")


            if (.not. useccl) then

              num = (gemm_dim_k * gemm_dim_l) * size_of_datatype

              nvtx_range_push("gpu_host_register_qtmp1")

              successgpu = gpu_host_register(int(loc(qtmp1),kind=c_intptr_t), num, gpuhostregisterdefault)

              check_host_register_gpu("merge_systems: qtmp1", successgpu)

              nvtx_range_pop("gpu_host_register_qtmp1")


              num = (gemm_dim_l * gemm_dim_m) * size_of_datatype

              successgpu = gpu_host_register(int(loc(ev),kind=c_intptr_t), num, gpuhostregisterdefault)

              check_host_register_gpu("merge_systems: ev", successgpu)


              num = (gemm_dim_k * gemm_dim_m) * size_of_datatype

              successgpu = gpu_host_register(int(loc(qtmp2),kind=c_intptr_t), num, gpuhostregisterdefault)

              check_host_register_gpu("merge_systems: qtmp2", successgpu)

            endif


            if (wantdebug) then

              successgpu = gpu_devicesynchronize()

              call obj%timer%stop("gpu_host_register")

            endif

          endif

        endif ! useGPU


        if (usegpu) then

          num = gemm_dim_k * gemm_dim_l * size_of_datatype

#ifdef WITH_GPU_STREAMS

          successgpu = gpu_memset_async(qtmp1_dev, 0, num, my_stream)

#else

          successgpu = gpu_memset(qtmp1_dev, 0, num)

#endif

          check_memcpy_gpu("merge_systems: memset qtmp1_dev", successgpu)

        else

          nvtx_range_push("set_qtmp1_qtmp2_0")

          call obj%timer%start("set_qtmp1_qtmp2_0")

          qtmp1 = 0 ! May contain empty (unset) parts

          qtmp2 = 0 ! Not really needed

          call obj%timer%stop("set_qtmp1_qtmp2_0")

          nvtx_range_pop("set_qtmp1_qtmp2_0")

        endif


        ! Gather nonzero upper/lower components of old matrix Q

        ! which are needed for multiplication with new eigenvectors


        ! kernel compute nnzu on device

        if (usegpu) then

          ! data transfer to GPU

          num = na * size_of_int

#ifdef WITH_GPU_STREAMS

          successgpu = gpu_memcpy_async(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

          check_memcpy_gpu("merge_systems: idx1_dev", successgpu)


          successgpu = gpu_memcpy_async(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

          check_memcpy_gpu("merge_systems: coltyp_dev", successgpu)

#else

          successgpu = gpu_memcpy(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)

          check_memcpy_gpu("merge_systems: idx1_dev", successgpu)


          successgpu = gpu_memcpy(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)

          check_memcpy_gpu("merge_systems: coltyp_dev", successgpu)

#endif


          nvtx_range_push("gpu_copy_qtmp1_q_compute_nnzu_nnzl_kernel")

          if (wantdebug) call obj%timer%start("gpu_copy_qtmp1_q_compute_nnzu_nnzl_kernel")


          call gpu_copy_qtmp1_q_compute_nnzu_nnzl(precision_char, qtmp1_dev, q_dev, &

                                                  p_col_dev, l_col_dev, idx1_dev, coltyp_dev, nnzul_dev, &

                                                  na1, l_rnm, l_rqs, l_rqm, l_rows, my_pcol, gemm_dim_k, matrixrows, &

                                                  sm_count, debug, my_stream)


          if (wantdebug) call obj%timer%stop("gpu_copy_qtmp1_q_compute_nnzu_nnzl_kernel")

          nvtx_range_pop("gpu_copy_qtmp1_q_compute_nnzu_nnzl_kernel")


          ! num = 2 * size_of_int

          ! successGPU = gpu_memcpy(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, num, gpuMemcpyDeviceToHost)

          ! check_memcpy_gpu("merge_systems: nnzul_dev", successGPU)


          num = 2 * size_of_int

#ifdef WITH_GPU_STREAMS

          successgpu = gpu_memcpy_async(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, num, gpumemcpydevicetohost, my_stream)

#else

          successgpu = gpu_memcpy(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, num, gpumemcpydevicetohost)

#endif

          check_memcpy_gpu("merge_systems: nnzul_dev", successgpu)


          nnzu = nnzul(1)

          nnzl = nnzul(2)

        else ! useGPU

          nvtx_range_push("loop_compute_nnzu")

          if (wantdebug) call obj%timer%start("loop_compute_nnzu")

          nnzu = 0

          nnzl = 0

          do i = 1, na1

            if (p_col(idx1(i))==my_pcol) then

              l_idx = l_col(idx1(i))


              if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then

                nnzu = nnzu+1

                qtmp1(1:l_rnm,nnzu) = q(l_rqs:l_rqm,l_idx)

              endif


              if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then

                nnzl = nnzl+1

                qtmp1(l_rnm+1:l_rows,nnzl) = q(l_rqm+1:l_rqe,l_idx)

              endif

            endif

          enddo

          if (wantdebug) call obj%timer%stop("loop_compute_nnzu")

          nvtx_range_pop("loop_compute_nnzu")

        endif ! useGPU


        ! GPU only: upload l_col to l_col_dev, timed as "gpu_memcpy".
        if (usegpu) then
          call obj%timer%start("gpu_memcpy")
          num = na * size_of_int
#ifdef WITH_GPU_STREAMS
          my_stream = obj%gpu_setup%my_stream
          successgpu = gpu_memcpy_async(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
          ! same label as the non-stream branch (was the bare "merge_systems: ")
          check_memcpy_gpu("merge_systems: l_col_dev", successgpu)
#else
          successgpu = gpu_memcpy(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
          check_memcpy_gpu("merge_systems: l_col_dev", successgpu)
#endif
          call obj%timer%stop("gpu_memcpy")
        endif


        ! Gather deflated eigenvalues behind nonzero components


        ! compute ndef on device

        ndef = max(nnzu,nnzl)


        ! GPU only: upload idx2 and p_col to their device copies.
        ! The stream branch now uses the same labels as the non-stream branch
        ! (both were the bare "merge_systems: ").
        if (usegpu) then
          num = na * size_of_int
#ifdef WITH_GPU_STREAMS
          successgpu = gpu_memcpy_async(idx2_dev, int(loc(idx2(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
          check_memcpy_gpu("merge_systems: idx2_dev", successgpu)

          successgpu = gpu_memcpy_async(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
          check_memcpy_gpu("merge_systems: p_col_dev", successgpu)
#else
          successgpu = gpu_memcpy(idx2_dev, int(loc(idx2(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
          check_memcpy_gpu("merge_systems: idx2_dev", successgpu)

          successgpu = gpu_memcpy(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
          check_memcpy_gpu("merge_systems: p_col_dev", successgpu)
#endif
        endif


        if (usegpu) then

          ndef_c(:) = ndef


          num = na * size_of_int

#ifdef WITH_GPU_STREAMS

          successgpu = gpu_memcpy_async(ndef_c_dev, int(loc(ndef_c(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

          check_memcpy_gpu("merge_systems: ndef_c_dev 4", successgpu)

#else

          successgpu = gpu_memcpy(ndef_c_dev, int(loc(ndef_c(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)

          check_memcpy_gpu("merge_systems: ndef_c_dev", successgpu)

#endif


          call gpu_copy_q_slice_to_qtmp1 (precision_char, qtmp1_dev, q_dev, ndef_c_dev, l_col_dev, idx2_dev, p_col_dev, &

                                          na2, na, my_pcol, l_rows, l_rqs, l_rqe, matrixrows, gemm_dim_k, debug, my_stream)

        else

          do i = 1, na2

            l_idx = l_col(idx2(i))

            if (p_col(idx2(i))==my_pcol) then

              ndef = ndef+1

              qtmp1(1:l_rows,ndef) = q(l_rqs:l_rqe,l_idx)

            endif

          enddo

        endif


        l_cols_qreorg = ndef ! Number of columns in reorganized matrix

        ! GPU only: upload p_col_out and l_col_out to their device copies.
        if (usegpu) then
          num = na * size_of_int
#ifdef WITH_GPU_STREAMS
          successgpu = gpu_memcpy_async(p_col_out_dev, int(loc(p_col_out(1)),kind=c_intptr_t), &
                                        num, gpumemcpyhosttodevice, my_stream)
          ! same label as the non-stream branch (was the bare "merge_systems: ")
          check_memcpy_gpu("merge_systems: p_col_out_dev", successgpu)

          successgpu = gpu_memcpy_async(l_col_out_dev, int(loc(l_col_out(1)),kind=c_intptr_t), &
                                        num, gpumemcpyhosttodevice, my_stream)
          check_memcpy_gpu("merge_systems: l_col_out_dev", successgpu)
#else
          successgpu = gpu_memcpy(p_col_out_dev, int(loc(p_col_out(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
          check_memcpy_gpu("merge_systems: p_col_out_dev", successgpu)

          successgpu = gpu_memcpy(l_col_out_dev, int(loc(l_col_out(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
          check_memcpy_gpu("merge_systems: l_col_out_dev", successgpu)
#endif
        endif


        ! Set (output) Q to 0, it will sum up new Q


        if (usegpu) then

          call gpu_zero_q(precision_char, q_dev, p_col_out_dev, l_col_out_dev, &

                          na, my_pcol, l_rqs, l_rqe, matrixrows, debug, my_stream)

        else

          DO i = 1, na

            if(p_col_out(i)==my_pcol) q(l_rqs:l_rqe,l_col_out(i)) = 0

          enddo

        endif


       ! check memory copies


       if (usegpu) then

#ifdef WITH_GPU_STREAMS

        num = na * size_of_int

        successgpu = gpu_memcpy_async(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

        check_memcpy_gpu("merge_systems: idx1_dev", successgpu)


        successgpu = gpu_memcpy_async(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

        check_memcpy_gpu("merge_systems: p_col_dev", successgpu)


        successgpu = gpu_memcpy_async(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

        check_memcpy_gpu("merge_systems: coltyp_dev", successgpu)


        num = na * size_of_datatype

        successgpu = gpu_memcpy_async(ev_scale_dev, int(loc(ev_scale(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

        check_memcpy_gpu("merge_systems: ev_scale_dev", successgpu)


        successgpu = gpu_memcpy_async(z_dev, int(loc(z(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

        check_memcpy_gpu("merge_systems: z_dev", successgpu)


        successgpu = gpu_memcpy_async(d1_dev, int(loc(d1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

        check_memcpy_gpu("merge_systems: d1_dev", successgpu)


        successgpu = gpu_memcpy_async(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

        check_memcpy_gpu("merge_systems: dbase_dev", successgpu)


        successgpu = gpu_memcpy_async(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)

        check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)

#else

        num = na * size_of_int

        successgpu = gpu_memcpy(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)

        check_memcpy_gpu("merge_systems: idx1_dev", successgpu)


        successgpu = gpu_memcpy(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)

        check_memcpy_gpu("merge_systems: p_col_dev", successgpu)


        successgpu = gpu_memcpy(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)

        check_memcpy_gpu("merge_systems: coltyp_dev", successgpu)


        num = na * size_of_datatype

        successgpu = gpu_memcpy(ev_scale_dev, int(loc(ev_scale(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)

        check_memcpy_gpu("merge_systems: ev_scale_dev", successgpu)


        successgpu = gpu_memcpy(z_dev, int(loc(z(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)

        check_memcpy_gpu("merge_systems: z_dev", successgpu)


        successgpu = gpu_memcpy(d1_dev, int(loc(d1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)

        check_memcpy_gpu("merge_systems: d1_dev", successgpu)


        successgpu = gpu_memcpy(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)

        check_memcpy_gpu("merge_systems: dbase_dev", successgpu)


        successgpu = gpu_memcpy(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)

        check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)

#endif

        endif


        allocate(nnzu_val(na1,npc_n))

        allocate(nnzl_val(na1,npc_n))


        nnzu_val(:,:) = 0

        nnzl_val(:,:) = 0


        if (usegpu) then

          num = na1 * npc_n* size_of_int

          successgpu = gpu_malloc(nnzu_val_dev, num)

          check_alloc_gpu("merge_systems: nnzu_val_dev", successgpu)


          num = na1 * npc_n* size_of_int

          successgpu = gpu_malloc(nnzl_val_dev, num)

          check_alloc_gpu("merge_systems: nnzl_val_dev", successgpu)

        endif


        np_rem = my_pcol

        if (usegpu) then

          do np = 1, npc_n

            if (np > 1) then

              if (np_rem == npc_0) then

                np_rem = npc_0+npc_n-1

              else

                np_rem = np_rem-1

              endif

            endif

            nnzu = 0

            nnzl = 0


            call gpu_compute_nnzl_nnzu_val_part1 (p_col_dev, idx1_dev, coltyp_dev, nnzu_val_dev, nnzl_val_dev, &

                                                  na, na1, np_rem, npc_n, nnzu_save, nnzl_save, np, debug, my_stream)


          enddo ! np = 1, npc_n


          nnzu_start = 0

          nnzl_start = 0


          call gpu_compute_nnzl_nnzu_val_part2 (nnzu_val_dev, nnzl_val_dev, na, na1, nnzu_start, nnzl_start, npc_n, &

                                                debug, my_stream)

        else

          ! precompute nnzu_val, nnzl_val

          do np = 1, npc_n

            if (np > 1) then

              if (np_rem == npc_0) then

                np_rem = npc_0+npc_n-1

              else

                np_rem = np_rem-1

              endif

            endif

            nnzu = 0

            nnzl = 0

            do i=1,na1

              if (p_col(idx1(i)) == np_rem) then

                if (coltyp(idx1(i)) == 1 .or. coltyp(idx1(i)) == 2) then

                  nnzu = nnzu+1

                  nnzu_val(i,np) =  nnzu

                endif

                if (coltyp(idx1(i)) == 3 .or. coltyp(idx1(i)) == 2) then

                  nnzl = nnzl+1

                  nnzl_val(i,np) =  nnzl

                endif

              endif

            enddo

          enddo ! np = 1, npc_n

        endif


        np_rem = my_pcol


        ! is nnzu updated in main loop


        ! main loop

        nvtx_range_push("main_loop")

        do np = 1, npc_n

          nvtx_range_push("np=1,npc_n")

          ! Do a ring send of qtmp1

          if (np > 1) then


            if (np_rem == npc_0) then

              np_rem = npc_0+npc_n-1

            else

              np_rem = np_rem-1

            endif


            if (usegpu) then

              if (useccl) then

                my_stream = obj%gpu_setup%my_stream

                call gpu_copy_qtmp1_to_qtmp1_tmp (precision_char, qtmp1_dev, qtmp1_tmp_dev, gemm_dim_k, gemm_dim_l, &

                                                  debug, my_stream)


                call obj%timer%start("ccl_send_recv")

                successgpu = ccl_group_start()

                if (.not.successgpu) then

                  print *,"Error in setting up ccl_group_start!"

                  stop 1

                endif


                successgpu = ccl_send(qtmp1_tmp_dev, int(l_rows*max_local_cols,kind=c_size_t), &

                                      ccldatatype, np_next, ccl_comm_cols, my_stream)


                if (.not.successgpu) then

                  print *,"Error in ccl_send"

                  stop 1

                endif


                successgpu = ccl_recv(qtmp1_dev, int(l_rows*max_local_cols,kind=c_size_t), &

                                      ccldatatype, np_prev, ccl_comm_cols, my_stream)


                if (.not.successgpu) then

                  print *,"Error in ccl_recv"

                  stop 1

                endif


                successgpu = ccl_group_end()


                if (.not.successgpu) then

                  print *,"Error in setting up ccl_group_end!"

                  stop 1

                endif

                successgpu = gpu_stream_synchronize(my_stream)

                check_stream_synchronize_gpu("trans_ev", successgpu)

                call obj%timer%stop("ccl_send_recv")

              else ! useCCL

#ifdef WITH_MPI

                call obj%timer%start("mpi_communication")

#ifdef WITH_GPU_STREAMS

                my_stream = obj%gpu_setup%my_stream

                successgpu = gpu_stream_synchronize(my_stream)

                check_stream_synchronize_gpu("merge_systems qtmp1_dev", successgpu)


                successgpu = gpu_memcpy_async(int(loc(qtmp1(1,1)),kind=c_intptr_t), qtmp1_dev, &

                     gemm_dim_k * gemm_dim_l  * size_of_datatype, gpumemcpydevicetohost, my_stream)

                check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)


                my_stream = obj%gpu_setup%my_stream

                successgpu = gpu_stream_synchronize(my_stream)

                check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)

                ! synchronize streamsPerThread; maybe not necessary

                successgpu = gpu_stream_synchronize()

                check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)


#else

                successgpu = gpu_memcpy(int(loc(qtmp1(1,1)),kind=c_intptr_t), qtmp1_dev, &

                     gemm_dim_k * gemm_dim_l  * size_of_datatype, gpumemcpydevicetohost)

                check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)

#endif


                call mpi_sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=mpi_kind), mpi_real_precision,     &

                                          int(np_next,kind=mpi_kind), 1111_mpi_kind, int(np_prev,kind=mpi_kind), &

                                          1111_mpi_kind, int(mpi_comm_cols_self,kind=mpi_kind), mpi_status_ignore, mpierr)

#ifdef WITH_GPU_STREAMS

                my_stream = obj%gpu_setup%my_stream

                successgpu = gpu_stream_synchronize(my_stream)

                check_stream_synchronize_gpu("merge_systems qtmp1_dev", successgpu)


                successgpu = gpu_memcpy_async(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &

                     gemm_dim_k * gemm_dim_l  * size_of_datatype, gpumemcpyhosttodevice, my_stream)

                check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)


                my_stream = obj%gpu_setup%my_stream

                successgpu = gpu_stream_synchronize(my_stream)

                check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)

                ! synchronize streamsPerThread; maybe not necessary

                successgpu = gpu_stream_synchronize()

                check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)


#else

                successgpu = gpu_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &

                     gemm_dim_k * gemm_dim_l  * size_of_datatype, gpumemcpyhosttodevice)

                check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)

#endif

                call obj%timer%stop("mpi_communication")

#endif /* WITH_MPI */


              endif ! useCCL

            else ! useGPU


#ifdef WITH_MPI

              call obj%timer%start("mpi_communication")

              call mpi_sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=mpi_kind), mpi_real_precision,     &

                                          int(np_next,kind=mpi_kind), 1111_mpi_kind, int(np_prev,kind=mpi_kind), &

                                          1111_mpi_kind, int(mpi_comm_cols_self,kind=mpi_kind), mpi_status_ignore, mpierr)

              call obj%timer%stop("mpi_communication")

#endif /* WITH_MPI */


            endif ! useGPU


          endif ! (np > 1) then


          ! Gather the parts in d1 and z which are fitting to qtmp1.

          ! This also delivers nnzu/nnzl for proc np_rem

          nnzu = 0

          nnzl = 0

          if (usegpu) then


            nvtx_range_push("gpu_fill_tmp_arrays")

            call gpu_fill_tmp_arrays (precision_char, d1u_dev, d1_dev, zu_dev, z_dev, d1l_dev, zl_dev, &

                                      idx1_dev, p_col_dev, coltyp_dev, nnzu_val_dev, nnzl_val_dev, nnzul_dev, &

                                      na, np, na1, np_rem, debug, my_stream)

            if (wantdebug) successgpu = gpu_devicesynchronize()

            nvtx_range_pop("gpu_fill_tmp_arrays")


            num = 2* size_of_int

#ifdef WITH_GPU_STREAMS

            successgpu = gpu_memcpy_async(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, num, gpumemcpydevicetohost, my_stream)

            check_memcpy_gpu("merge_systems: nnzul_dev", successgpu)

#else

            successgpu = gpu_memcpy(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, num, gpumemcpydevicetohost)

            check_memcpy_gpu("merge_systems: nnzl_val", successgpu)

#endif

            nnzu = nnzul(1)

            nnzl = nnzul(2)


          else ! useGPU

            do i=1,na1

              if (p_col(idx1(i)) == np_rem) then

                if (coltyp(idx1(i)) == 1 .or. coltyp(idx1(i)) == 2) then

                  nnzu = nnzu+1

                  d1u(nnzu) = d1(i)

                  zu(nnzu) = z(i)

                endif

                if (coltyp(idx1(i)) == 3 .or. coltyp(idx1(i)) == 2) then

                  nnzl = nnzl+1

                  d1l(nnzl) = d1(i)

                  zl(nnzl) = z(i)

                endif

              endif

            enddo

          endif ! useGPU


          ! Set the deflated eigenvectors in Q (coming from proc np_rem)


          ndef = max(nnzu,nnzl) ! Remote counter in input matrix

          if (usegpu) then

            ! PETERDEBUG: idx2_dev, potential problem with garbage values?

            call gpu_update_ndef_c(ndef_c_dev, idx_dev, p_col_dev, idx2_dev, na, na1, np_rem, ndef, debug, my_stream)


          endif ! useGPU


          ndef = max(nnzu,nnzl) ! Remote counter in input matrix

          if (usegpu) then

            call gpu_copy_qtmp1_slice_to_q (precision_char, q_dev, qtmp1_dev, &

                                            l_col_out_dev, p_col_out_dev, ndef_c_dev, p_col_dev, idx2_dev, idx_dev, &

                                            l_rqs, l_rqe, l_rows, matrixrows, gemm_dim_k,  my_pcol, na1, np_rem,  na, &

                                            debug, my_stream)

          else ! ! useGPU

            ndef = max(nnzu,nnzl) ! Remote counter in input matrix

            do i = 1, na

              j = idx(i)

              if (j>na1) then

                if (p_col(idx2(j-na1)) == np_rem) then

                  ndef = ndef+1

                  if (p_col_out(i) == my_pcol) then

                    q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef)

                  endif

                endif

              endif

            enddo


          endif ! useGPU


          do ns = 0, nqcols1-1, max_strip ! "strimining" (strip mining) loop

            nvtx_range_push("ns=0,nqcols1-1,max_strip")

            ncnt = min(max_strip,nqcols1-ns) ! number of columns in this strip


            ! Get partial result from (output) Q

            if (usegpu) then

              call gpu_copy_q_slice_to_qtmp2 (precision_char, q_dev, qtmp2_dev, idxq1_dev, l_col_out_dev, &

                                              l_rows, l_rqs, l_rqe, matrixrows, matrixcols, &

                                              gemm_dim_k, gemm_dim_m, ns, ncnt, ind_ex, ind_ex2, na, debug, my_stream)

            else ! useGPU

!$omp PARALLEL DO &

!$omp default(none) &

!$omp private(i, j, k) &

!$omp SHARED(ns, q, l_rqs, l_rqe, l_col_out, idxq1, qtmp2, l_rows, ncnt)

              do i = 1, ncnt

                j = idxq1(i+ns)

                k = l_col_out(j)

                qtmp2(1:l_rows,i) = q(l_rqs:l_rqe, k)

              enddo

!$OMP END PARALLEL DO

            endif ! useGPU


            ! Compute eigenvectors of the rank-1 modified matrix.

            ! Parts for multiplying with upper half of Q:

            if (usegpu) then

              if (nnzu .ge. 1) then

                ! Calculate the j-th eigenvector of the deflated system

                ! See above why we are doing it this way!

                call gpu_fill_ev (precision_char, ev_dev, d1u_dev, dbase_dev, ddiff_dev, zu_dev, ev_scale_dev, idxq1_dev, idx_dev,&

                                  na, gemm_dim_l, gemm_dim_m, nnzu, ns, ncnt, debug, my_stream)

              endif ! nnzu


            else ! useGPU

!$omp PARALLEL DO &

!$omp default(none) &

!$omp private(i, j, k, tmp) &

!$omp shared(ncnt, nnzu, idx, idxq1, ns, d1u, dbase, ddiff, zu, ev_scale, ev)

              do i = 1, ncnt

                do k = 1, nnzu

                  j = idx(idxq1(i+ns))

                  ! Calculate the j-th eigenvector of the deflated system

                  ! See above why we are doing it this way!


                  ! kernel here

                  tmp(k) = d1u(k) - dbase(j)

                  tmp(k) = tmp(k) + ddiff(j)

                  ev(k,i) = zu(k) / tmp(k) * ev_scale(j)

                enddo

              enddo

!$OMP END PARALLEL DO

            endif ! useGPU


            ! Multiply old Q with eigenvectors (upper half)


            if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) then

              if (usegpu) then

                call obj%timer%start("gpublas_gemm")

                nvtx_range_push("gpublas_gemm_upper")

                gpuhandle = obj%gpu_setup%gpublasHandleArray(0)

                call gpublas_precision_gemm('N', 'N', l_rnm, ncnt, nnzu, 1.0_rk, &

                                            qtmp1_dev, gemm_dim_k,    &

                                            ev_dev,    gemm_dim_l, 1.0_rk, &

                                            qtmp2_dev, gemm_dim_k, gpuhandle)

                if (wantdebug) successgpu = gpu_devicesynchronize()

                nvtx_range_pop("gpublas_gemm_upper")

                call obj%timer%stop("gpublas_gemm")

              else ! useGPU

                call obj%timer%start("blas_gemm")

                call precision_gemm('N', 'N', int(l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &

                                    int(nnzu,kind=blas_kind), 1.0_rk, &

                                    qtmp1,      int(gemm_dim_k,kind=blas_kind), &

                                    ev,         int(gemm_dim_l,kind=blas_kind), 1.0_rk, &

                                    qtmp2(1,1), int(gemm_dim_k,kind=blas_kind))

                call obj%timer%stop("blas_gemm")

              endif ! useGPU

            endif ! (l_rnm>0 .and. ncnt>0 .and. nnzu>0) then


            ! Compute eigenvectors of the rank-1 modified matrix.

            ! Parts for multiplying with lower half of Q:


            if (usegpu) then

              if (nnzl .ge. 1) then

                call gpu_fill_ev (precision_char, ev_dev, d1l_dev, dbase_dev, ddiff_dev, zl_dev, ev_scale_dev, idxq1_dev, idx_dev, &

                                            na, gemm_dim_l, gemm_dim_m, nnzl, ns, ncnt, debug, my_stream)

              endif

            else ! useGPU

!$omp PARALLEL DO &

!$omp private(i, j, k, tmp)

              do i = 1, ncnt

                do k = 1, nnzl

                  j = idx(idxq1(i+ns))

                  ! Calculate the j-th eigenvector of the deflated system

                  ! See above why we are doing it this way!

                  tmp(k) = d1l(k) - dbase(j)

                  tmp(k) = tmp(k) + ddiff(j)

                  ev(k,i) = zl(k) / tmp(k) * ev_scale(j)

                enddo

              enddo

!$OMP END PARALLEL DO

            endif ! useGPU


            ! Multiply old Q with eigenvectors (lower half)


            if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) then

              if (usegpu) then

                call obj%timer%start("gpublas_gemm")

                nvtx_range_push("gpublas_gemm_lower")

                gpuhandle = obj%gpu_setup%gpublasHandleArray(0)

                call gpublas_precision_gemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, 1.0_rk, &

                                            qtmp1_dev + l_rnm*size_of_datatype, gemm_dim_k,   &

                                            ev_dev,                             gemm_dim_l, 1.0_rk, &

                                            qtmp2_dev + l_rnm*size_of_datatype, gemm_dim_k, gpuhandle)

                if (wantdebug) successgpu = gpu_devicesynchronize()

                nvtx_range_pop("gpublas_gemm_lower")

                call obj%timer%stop("gpublas_gemm")

              else ! useGPU

                call obj%timer%start("blas_gemm")

                call precision_gemm('N', 'N', int(l_rows-l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind),  &

                                    int(nnzl,kind=blas_kind), 1.0_rk, &

                                    qtmp1(l_rnm+1,1), int(gemm_dim_k,kind=blas_kind), &

                                    ev,               int(gemm_dim_l,kind=blas_kind), 1.0_rk, &

                                    qtmp2(l_rnm+1,1), int(gemm_dim_k,kind=blas_kind))

                call obj%timer%stop("blas_gemm")

              endif ! useGPU

            endif


            ! Put partial result into (output) Q

            if (usegpu) then

              call gpu_copy_qtmp2_slice_to_q (precision_char, q_dev, qtmp2_dev, idxq1_dev, l_col_out_dev, &

                                              l_rqs, l_rqe, l_rows, ncnt, gemm_dim_k, matrixrows, ns, debug, my_stream)

            else ! useGPU

!$omp PARALLEL DO &

!$omp default(none) &

!$omp private(i) &

!$omp SHARED(q, ns, l_rqs, l_rqe, l_col_out, idxq1, qtmp2, l_rows, ncnt)

              do i = 1, ncnt

                q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i)

              enddo

!$OMP END PARALLEL DO

            endif ! useGPU


            nvtx_range_pop("ns=0,nqcols1-1,max_strip")

          enddo   ! ns = 0, nqcols1-1, max_strip ! strimining loop


          nvtx_range_pop("np=1,npc_n")

        enddo    ! do np = 1, npc_n

        nvtx_range_pop("main_loop")


        deallocate(nnzu_val, nnzl_val)


        if (usegpu) then

          deallocate(ndef_c, stat=istat, errmsg=errormessage)

          check_deallocate("merge_systems: ndef_c",istat, errormessage)

        endif


        if (usegpu) then

          successgpu = gpu_free(nnzul_dev)

          check_dealloc_gpu("merge_systems: nnzul_dev", successgpu)


          successgpu = gpu_free(l_col_dev)

          check_dealloc_gpu("merge_systems: l_col_dev", successgpu)


          successgpu = gpu_free(ndef_c_dev)

          check_dealloc_gpu("merge_systems: ndef_c_dev", successgpu)


          successgpu = gpu_free(nnzu_val_dev)

          check_dealloc_gpu("merge_systems: nnzu_val_dev", successgpu)


          successgpu = gpu_free(nnzl_val_dev)

          check_dealloc_gpu("merge_systems: nnzl_val_dev", successgpu)


          successgpu = gpu_free(idx1_dev)

          check_dealloc_gpu("merge_systems: idx1_dev", successgpu)


          successgpu = gpu_free(idx2_dev)

          check_dealloc_gpu("merge_systems: idx2_dev", successgpu)


          successgpu = gpu_free(p_col_dev)

          check_dealloc_gpu("merge_systems: p_col_dev", successgpu)


          successgpu = gpu_free(p_col_out_dev)

          check_dealloc_gpu("merge_systems: p_col_out_dev", successgpu)


          successgpu = gpu_free(coltyp_dev)

          check_dealloc_gpu("merge_systems: coltyp_dev", successgpu)


          successgpu = gpu_free(idx_dev)

          check_dealloc_gpu("merge_systems: idx_dev", successgpu)


          successgpu = gpu_free(l_col_out_dev)

          check_dealloc_gpu("merge_systems: l_col_out_dev", successgpu)


          successgpu = gpu_free(idxq1_dev)

          check_dealloc_gpu("merge_systems: ", successgpu)


          successgpu = gpu_free(d1_dev)

          check_dealloc_gpu("merge_systems: d1_dev", successgpu)


          successgpu = gpu_free(z_dev)

          check_dealloc_gpu("merge_systems: z_dev", successgpu)


          successgpu = gpu_free(z1_dev)

          check_dealloc_gpu("merge_systems: z1_dev", successgpu)


          successgpu = gpu_free(rho_dev)

          check_dealloc_gpu("merge_systems: rho_dev", successgpu)


          successgpu = gpu_free(d1u_dev)

          check_dealloc_gpu("merge_systems: d1u_dev", successgpu)


          successgpu = gpu_free(dbase_dev)

          check_dealloc_gpu("merge_systems: dbase_dev", successgpu)


          successgpu = gpu_free(ddiff_dev)

          check_dealloc_gpu("merge_systems: ddiff_dev", successgpu)


          successgpu = gpu_free(zu_dev)

          check_dealloc_gpu("merge_systems: zu_dev", successgpu)


          successgpu = gpu_free(ev_scale_dev)

          check_dealloc_gpu("merge_systems: ev_scale_dev", successgpu)


          successgpu = gpu_free(d1l_dev)

          check_dealloc_gpu("merge_systems: d1l_dev", successgpu)


          successgpu = gpu_free(zl_dev)

          check_dealloc_gpu("merge_systems: zl_dev", successgpu)


          successgpu = gpu_free(qtmp1_dev)

          check_dealloc_gpu("merge_systems: qtmp1_dev", successgpu)


          successgpu = gpu_free(qtmp1_tmp_dev)

          check_dealloc_gpu("merge_systems: qtmp1_tmp_dev", successgpu)


          successgpu = gpu_free(qtmp2_dev)

          check_dealloc_gpu("merge_systems: qtmp2_dev", successgpu)


          successgpu = gpu_free(ev_dev)

          check_dealloc_gpu("merge_systems: ev_dev", successgpu)


          successgpu = gpu_free(tmp_dev)

          check_dealloc_gpu("merge_systems: tmp_dev", successgpu)


          successgpu = gpu_free(zero_dev)

          check_dealloc_gpu("merge_systems: zero_dev", successgpu)


          successgpu = gpu_free(one_dev)

          check_dealloc_gpu("merge_systems: one_dev", successgpu)


          successgpu = gpu_free(qtrans_dev)

          check_dealloc_gpu("merge_systems: qtrans_dev", successgpu)


          if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then

            if (wantdebug) call obj%timer%start("gpu_host_register")

            if (.not. useccl) then

              successgpu = gpu_host_unregister(int(loc(qtmp1),kind=c_intptr_t))

              check_host_unregister_gpu("merge_systems: qtmp1", successgpu)


              successgpu = gpu_host_unregister(int(loc(qtmp2),kind=c_intptr_t))

              check_host_unregister_gpu("merge_systems: qtmp2", successgpu)


              successgpu = gpu_host_unregister(int(loc(ev),kind=c_intptr_t))

              check_host_unregister_gpu("merge_systems: ev", successgpu)

            endif


            if (wantdebug) successgpu = gpu_devicesynchronize()

            if (wantdebug) call obj%timer%stop("gpu_host_register")

          endif

        endif ! useGPU


        if (.not. useccl) then

          deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errormessage)

          check_deallocate("merge_systems: ev, qtmp1, qtmp2",istat, errormessage)

        endif

      endif !very outer test if (na1==1 .or. na1==2) else (na1>2)


! #ifdef WITH_OPENMP_TRADITIONAL

!       deallocate(z_p, stat=istat, errmsg=errorMessage)

!       check_deallocate("merge_systems: z_p",istat, errorMessage)

! #endif


      call obj%timer%stop("merge_systems" // precision_suffix)


      return


    end

add_tmp
Definition mod_add_tmp.F90:3

check_monotony
Definition mod_check_monotony.F90:3

elpa_abstract_impl
Fortran module to provide an abstract definition of the implementation. Do not use directly....
Definition elpa_abstract_impl.F90:50

global_gather
Definition mod_global_gather.F90:3

global_product
Definition mod_global_product.F90:3

resort_ev
Definition mod_resort_ev.F90:3

solve_secular_equation
Definition mod_solve_secular_equation.F90:55

transform_columns
Definition mod_transform_columns.F90:3

v_add_s
Definition mod_v_add_s.F90:55

elpa_abstract_impl::elpa_abstract_impl_t
Definition elpa_abstract_impl.F90:73