Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2025.06.001.rc1
merge_recursive_template.F90
#ifdef SOLVE_TRIDI_GPU_BUILD
recursive subroutine merge_recursive_gpu_&
           &precision &
           (obj, np_off, nprocs, ldq, matrixcols, nblk, &
           l_col, p_col, l_col_bc, p_col_bc, limits, &
           np_cols, na, q_dev, d, e, &
           mpi_comm_all, mpi_comm_rows, mpi_comm_cols, &
           usegpu, wantdebug, success, max_threads)
#else
recursive subroutine merge_recursive_cpu_&
           &precision &
           (obj, np_off, nprocs, ldq, matrixcols, nblk, &
           l_col, p_col, l_col_bc, p_col_bc, limits, &
           np_cols, na, q, d, e, &
           mpi_comm_all, mpi_comm_rows, mpi_comm_cols, &
           usegpu, wantdebug, success, max_threads)
#endif

  use precision
!#ifdef WITH_OPENMP_TRADITIONAL
!  use elpa_omp
!#endif
  use elpa_mpi
  use elpa_utilities
#if defined(WITH_NVIDIA_GPU_VERSION) && defined(WITH_NVTX)
  use cuda_functions ! for NVTX labels
#elif defined(WITH_AMD_GPU_VERSION) && defined(WITH_ROCTX)
  use hip_functions  ! for ROCTX labels
#endif
  implicit none

  ! noff is always a multiple of nblk_ev
  ! nlen-noff is always > nblk_ev

  class(elpa_abstract_impl_t), intent(inout) :: obj
  integer(kind=ik), intent(in)               :: max_threads
  integer(kind=ik), intent(in)               :: mpi_comm_all, mpi_comm_rows, mpi_comm_cols
  integer(kind=ik), intent(in)               :: ldq, matrixcols, nblk, na, np_cols
  integer(kind=ik), intent(in)               :: l_col_bc(na), p_col_bc(na), l_col(na), p_col(na), &
                                                limits(0:np_cols)
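
  ! Note on the index arguments, as inferred from their use below: limits(np) marks the
  ! global boundary of the eigenvalue subproblem owned by process column np, so
  ! limits(np_off) is this group's offset and limits(np_off+nprocs) its upper end.
  ! l_col/p_col appear to give the local column index and owning process column of each
  ! global column in the current dense distribution, while l_col_bc/p_col_bc describe
  ! the final block-cyclic distribution used in the last merge.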

  integer(kind=c_intptr_t)                   :: q_dev
#ifdef USE_ASSUMED_SIZE
#ifdef SOLVE_TRIDI_GPU_BUILD
  real(kind=real_datatype)                   :: q(ldq,matrixcols)
#else
  real(kind=real_datatype)                   :: q(ldq,*)
#endif
#else
  real(kind=real_datatype)                   :: q(ldq,matrixcols)
#endif

#ifdef WITH_MPI
  integer(kind=MPI_KIND)                     :: mpierr, my_pcolmpi
#endif
  integer(kind=ik)                           :: my_pcol
  real(kind=real_datatype), intent(inout)    :: d(na), e(na)
  integer(kind=ik)                           :: np_off, nprocs
  integer(kind=ik)                           :: np1, np2, noff, nlen, nmid, n
  logical, intent(in)                        :: usegpu, wantdebug
  logical, intent(out)                       :: success

  success = .true.

#ifdef WITH_MPI
  call obj%timer%start("mpi_communication")
  call mpi_comm_rank(int(mpi_comm_cols,kind=mpi_kind), my_pcolmpi, mpierr)

  my_pcol = int(my_pcolmpi,kind=c_int)
  call obj%timer%stop("mpi_communication")
#endif

  if (nprocs<=1) then
    ! Safety check only
    if (wantdebug) write(error_unit,*) "ELPA1_merge_recursive: INTERNAL error merge_recursive: nprocs=",nprocs
    success = .false.
    return
  endif

  ! Split problem into 2 subproblems of size np1 / np2

  np1 = nprocs/2
  np2 = nprocs-np1
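
  ! Each half of the process-column group first merges its own subproblems
  ! recursively; the two partial results are then combined by merge_systems below.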

  if (np1 > 1) then
    if (usegpu) then
      call merge_recursive_gpu_&
           &precision &
           (obj, np_off, np1, ldq, matrixcols, nblk, &
           l_col, p_col, l_col_bc, p_col_bc, limits, &
           np_cols, na, q_dev, d, e, &
           mpi_comm_all, mpi_comm_rows, mpi_comm_cols, &
           usegpu, wantdebug, success, max_threads)
    else
      call merge_recursive_cpu_&
           &precision &
           (obj, np_off, np1, ldq, matrixcols, nblk, &
           l_col, p_col, l_col_bc, p_col_bc, limits, &
           np_cols, na, q, d, e, &
           mpi_comm_all, mpi_comm_rows, mpi_comm_cols, &
           usegpu, wantdebug, success, max_threads)
    endif ! useGPU
  endif ! (np1 > 1)

  if (.not.(success)) then
    write(error_unit,*) "Error in merge_recursive. Aborting..."
    return
  endif

  if (np2 > 1) then
    if (usegpu) then
      call merge_recursive_gpu_&
           &precision &
           (obj, np_off+np1, np2, ldq, matrixcols, nblk, &
           l_col, p_col, l_col_bc, p_col_bc, limits, &
           np_cols, na, q_dev, d, e, &
           mpi_comm_all, mpi_comm_rows, mpi_comm_cols, &
           usegpu, wantdebug, success, max_threads)
    else
      call merge_recursive_cpu_&
           &precision &
           (obj, np_off+np1, np2, ldq, matrixcols, nblk, &
           l_col, p_col, l_col_bc, p_col_bc, limits, &
           np_cols, na, q, d, e, &
           mpi_comm_all, mpi_comm_rows, mpi_comm_cols, &
           usegpu, wantdebug, success, max_threads)
    endif ! useGPU
  endif ! (np2 > 1)

  if (.not.(success)) then
134 write(error_unit,*) "Error in merge_recursice. Aborting..."
135 return
136 endif
137
138 noff = limits(np_off)
139 nmid = limits(np_off+np1) - noff
140 nlen = limits(np_off+nprocs) - noff
141
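  ! Exchange the eigenvalues d of the two half-problems between the halves, so that
  ! every process column of this group holds d(noff+1:noff+nlen) before the final merge.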
#ifdef WITH_MPI
  call obj%timer%start("mpi_communication")
  if (my_pcol==np_off) then
    do n=np_off+np1,np_off+nprocs-1
      call mpi_send(d(noff+1), int(nmid,kind=mpi_kind), mpi_real_precision, int(n,kind=mpi_kind), 12_mpi_kind, &
                    int(mpi_comm_cols,kind=mpi_kind), mpierr)
    enddo
  endif
  call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
#endif /* WITH_MPI */

  if (my_pcol>=np_off+np1 .and. my_pcol<np_off+nprocs) then
#ifdef WITH_MPI
    call obj%timer%start("mpi_communication")
    call mpi_recv(d(noff+1), int(nmid,kind=mpi_kind), mpi_real_precision, int(np_off,kind=mpi_kind), 12_mpi_kind, &
                  int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
    call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
!   d(noff+1:noff+1+nmid-1) = d(noff+1:noff+1+nmid-1)
#endif /* WITH_MPI */
  endif

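  ! Reverse direction: process column np_off+np1 sends the second half's eigenvalues
  ! to all process columns of the first half.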
  if (my_pcol==np_off+np1) then
    do n=np_off,np_off+np1-1
#ifdef WITH_MPI
      call obj%timer%start("mpi_communication")
      call mpi_send(d(noff+nmid+1), int(nlen-nmid,kind=mpi_kind), mpi_real_precision, int(n,kind=mpi_kind), &
                    15_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpierr)
      call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
#endif /* WITH_MPI */

    enddo
  endif

  if (my_pcol>=np_off .and. my_pcol<np_off+np1) then
#ifdef WITH_MPI
    call obj%timer%start("mpi_communication")
    call mpi_recv(d(noff+nmid+1), int(nlen-nmid,kind=mpi_kind), mpi_real_precision, int(np_off+np1,kind=mpi_kind), &
                  15_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
    call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
!   d(noff+nmid+1:noff+nmid+1+nlen-nmid-1) = d(noff+nmid+1:noff+nmid+1+nlen-nmid-1)
#endif /* WITH_MPI */
  endif
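
  ! merge_systems combines the two half-problems (the rank-one-update merge step of the
  ! divide-and-conquer algorithm). The index arrays passed differ depending on whether
  ! this is the final merge (block-cyclic result distribution) or an intermediate one
  ! (dense column distribution).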
  if (nprocs == np_cols) then

    ! Last merge, result distribution must be block cyclic, noff==0,
    ! p_col_bc is set so that only nev eigenvalues are calculated
    if (usegpu) then
      nvtx_range_push("merge_systems_gpu")
      call merge_systems_gpu_&
           &precision &
           (obj, nlen, nmid, d(noff+1), e(noff+nmid), q_dev, ldq, noff, &
           nblk, matrixcols, int(mpi_comm_rows,kind=ik), int(mpi_comm_cols,kind=ik), &
           l_col, p_col, &
           l_col_bc, p_col_bc, np_off, nprocs, usegpu, wantdebug, success, max_threads)
      nvtx_range_pop("merge_systems_gpu")
    else
      call merge_systems_cpu_&
           &precision &
           (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
           nblk, matrixcols, int(mpi_comm_rows,kind=ik), int(mpi_comm_cols,kind=ik), &
           l_col, p_col, &
           l_col_bc, p_col_bc, np_off, nprocs, usegpu, wantdebug, success, max_threads)
    endif
    if (.not.(success)) then
      write(error_unit,*) "Error in merge_systems: Aborting..."
      return
    endif

  else
    ! Not last merge, leave dense column distribution
    if (usegpu) then
      nvtx_range_push("merge_systems_gpu")
      call merge_systems_gpu_&
           &precision &
           (obj, nlen, nmid, d(noff+1), e(noff+nmid), q_dev, ldq, noff, &
           nblk, matrixcols, int(mpi_comm_rows,kind=ik), int(mpi_comm_cols,kind=ik), &
           l_col(noff+1), p_col(noff+1), &
           l_col(noff+1), p_col(noff+1), np_off, nprocs, usegpu, wantdebug, success, &
           max_threads)
      nvtx_range_pop("merge_systems_gpu")
    else
      call merge_systems_cpu_&
           &precision &
           (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
           nblk, matrixcols, int(mpi_comm_rows,kind=ik), int(mpi_comm_cols,kind=ik), &
           l_col(noff+1), p_col(noff+1), &
           l_col(noff+1), p_col(noff+1), np_off, nprocs, usegpu, wantdebug, success, &
           max_threads)
    endif
    if (.not.(success)) then
      write(error_unit,*) "Error in merge_systems: Aborting..."
      return
    endif
  endif

end