ELPA-2025.01.002/html/solve__tridi_2solve__tridi__template_8F90_source.html

#if 0

!    This file is part of ELPA.

!

!    The ELPA library was originally created by the ELPA consortium,

!    consisting of the following organizations:

!

!    - Max Planck Computing and Data Facility (MPCDF), formerly known as

!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),

!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte

!      Informatik,

!    - Technische Universität München, Lehrstuhl für Informatik mit

!      Schwerpunkt Wissenschaftliches Rechnen,

!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,

!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,

!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,

!      and

!    - IBM Deutschland GmbH

!

!    This particular source code file contains additions, changes and

!    enhancements authored by Intel Corporation which is not part of

!    the ELPA consortium.

!

!    More information can be found here:

!    http://elpa.mpcdf.mpg.de/

!

!    ELPA is free software: you can redistribute it and/or modify

!    it under the terms of the version 3 of the license of the

!    GNU Lesser General Public License as published by the Free

!    Software Foundation.

!

!    ELPA is distributed in the hope that it will be useful,

!    but WITHOUT ANY WARRANTY; without even the implied warranty of

!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

!    GNU Lesser General Public License for more details.

!

!    You should have received a copy of the GNU Lesser General Public License

!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>

!

!    ELPA reflects a substantial effort on the part of the original

!    ELPA consortium, and we ask you to respect the spirit of the

!    license that we chose: i.e., please contribute any changes you

!    may have back to the original ELPA library distribution, and keep

!    any derivatives of ELPA under the same license that we chose for

!    the original distribution, the GNU Lesser General Public License.

!

!

! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines

!

! Copyright of the original code rests with the authors inside the ELPA

! consortium. The copyright of any additional modifications shall rest

! with their original authors, but shall adhere to the licensing terms

! distributed along with the original code in the file "COPYING".

#endif


#include "../general/sanity.F90"

#include "../general/error_checking.inc"


! Driver for the solution of the tridiagonal eigenproblem defined by the diagonal d and the
! off-diagonal e. The same template is compiled as a CPU and as a GPU variant; the GPU variant
! is selected by the preprocessor symbol SOLVE_TRIDI_GPU_BUILD.
!
! Steps performed:
!   1. the matrix is split into np_cols sub-problems (one per process column) by subtracting
!      rank 1 modifications at the split points,
!   2. every process column solves its own sub-problem (solve_tridi_col_cpu_* or _gpu_*),
!   3. if there is more than one process column, the sub-problems are merged recursively
!      (merge_recursive_*), which works on the host arrays d, e and q.
!
! Arguments:
!   obj            ELPA object; used here for the timers, the GPU stream and the
!                  eigenvalues_only flag
!   na             length of d and e
!   nev            number of requested eigenvectors (only nev columns are set in the
!                  block cyclic index arrays)
!   d, e           CPU build: diagonal and off-diagonal, modified in place
!   q              CPU build: local part of the eigenvector matrix, dimension (ldq, matrixcols)
!   d_dev, e_dev,  GPU build: device addresses of the corresponding buffers; in the GPU build
!   q_dev          d, e and q are local host work arrays
!   ldq            leading dimension of q
!   nblk           block size handed to local_index and used for the block cyclic index arrays
!   matrixcols     number of local columns of q
!   mpi_comm_all, mpi_comm_rows, mpi_comm_cols
!                  MPI communicators; rank and size are queried on the row and column communicator
!   wantdebug      if .true. an error message is printed when a process column has zero width
!   success        set to .true. at the start; .false. if a process column has zero width or
!                  if one of the called solvers reports a failure
!   max_threads    passed through unchanged to the called solvers
#ifdef SOLVE_TRIDI_GPU_BUILD
subroutine solve_tridi_gpu_&
&precision_and_suffix &
#else
subroutine solve_tridi_cpu_&
&precision_and_suffix &
#endif
    ( obj, na, nev, &
#ifdef SOLVE_TRIDI_GPU_BUILD
      d_dev, e_dev, q_dev, &
#else
      d, e, q, &
#endif
      ldq, nblk, matrixcols, mpi_comm_all, mpi_comm_rows, &
                                           mpi_comm_cols, wantdebug, success, max_threads )

      use precision
      use elpa_abstract_impl
      use merge_recursive
      use merge_systems
      use elpa_mpi
      use elpa_utilities
      use distribute_global_column
      use elpa_gpu
      use elpa_gpu_util
      use tridi_col_gpu
      implicit none
#include "../../src/general/precision_kinds.F90"
      class(elpa_abstract_impl_t), intent(inout) :: obj
      integer(kind=ik), intent(in)               :: na, nev, ldq, nblk, matrixCols, &
                                                    mpi_comm_all, mpi_comm_rows, mpi_comm_cols

      ! Device addresses: dummy arguments in the GPU build, unused locals in the CPU build
      integer(kind=c_intptr_t)                   :: d_dev, e_dev, q_dev
#ifndef SOLVE_TRIDI_GPU_BUILD
      real(kind=real_datatype), intent(inout)    :: d(na), e(na)
#ifdef USE_ASSUMED_SIZE
      real(kind=real_datatype), intent(inout)    :: q(ldq,*)
#else
      real(kind=real_datatype), intent(inout)    :: q(ldq,matrixcols)
#endif
#else /* SOLVE_TRIDI_GPU_BUILD */
      ! Host side work copies of the device buffers (local arrays in the GPU build)
      real(kind=real_datatype)                   :: d(na), e(na)
      real(kind=real_datatype)                   :: q(ldq,matrixcols)
#endif /* SOLVE_TRIDI_GPU_BUILD */

      logical, intent(in)                        :: wantDebug
      logical                                    :: success
      integer(kind=ik), intent(in)               :: max_threads

      integer(kind=ik)                           :: i, j, n, np, nc, nev1, l_cols, l_rows
      integer(kind=ik)                           :: my_prow, my_pcol, np_rows, np_cols
      integer(kind=MPI_KIND)                     :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
      integer(kind=ik), allocatable              :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:)

      integer(kind=ik)                           :: istat
      character(200)                             :: errorMessage
      character(20)                              :: gpuString
      logical                                    :: useGPU
      integer(kind=c_intptr_t)                   :: num
      integer(kind=c_intptr_t), parameter        :: size_of_datatype = size_of_&
                                                                      &precision&
                                                                      &_&
                                                                      &math_datatype
      integer(kind=c_intptr_t), parameter        :: size_of_datatype_real = size_of_&
                                                                      &precision&
                                                                      &_real
      integer(kind=c_intptr_t)                   :: gpuHandle, my_stream
      type(c_ptr)                                :: limits_dev
      logical                                    :: successGPU

      ! The code path is fixed at compile time
      usegpu = .false.
#ifdef SOLVE_TRIDI_GPU_BUILD
      usegpu = .true.
#endif

      ! Suffix which distinguishes the timer entries of the CPU and the GPU variant
      if(usegpu) then
        gpustring = "_gpu"
      else
        gpustring = ""
      endif

      call obj%timer%start("solve_tridi" // precision_suffix // gpustring)

      call obj%timer%start("mpi_communication")
      call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind) ,my_prowmpi, mpierr)
      call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind) ,np_rowsmpi, mpierr)
      call mpi_comm_rank(int(mpi_comm_cols,kind=mpi_kind) ,my_pcolmpi, mpierr)
      call mpi_comm_size(int(mpi_comm_cols,kind=mpi_kind) ,np_colsmpi, mpierr)

      my_prow = int(my_prowmpi,kind=c_int)
      np_rows = int(np_rowsmpi,kind=c_int)
      my_pcol = int(my_pcolmpi,kind=c_int)
      np_cols = int(np_colsmpi,kind=c_int)
      call obj%timer%stop("mpi_communication")

      success = .true.

      l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
      l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q

      if (usegpu) then
        ! Further below q_dev is transferred as a dense (ldq x matrixcols) array. The local
        ! extents therefore have to match the declared ones; verify this before any device
        ! memory is touched and abort with a non-zero exit code otherwise.
        if (l_rows .ne. ldq) then
          write(error_unit,*) "ELPA1_solve_tridi: ERROR: number of local rows differs from ldq:", &
                              l_rows, ldq
          stop 1
        endif
        if (l_cols .ne. matrixcols) then
          write(error_unit,*) "ELPA1_solve_tridi: ERROR: number of local columns differs from matrixCols:", &
                              l_cols, matrixcols
          stop 1
        endif

        ! Set Q to 0 on the device. The byte count is computed in c_intptr_t arithmetic so that
        ! the product cannot overflow the default integer kind for large local matrices.
        num = int(l_rows,kind=c_intptr_t) * int(l_cols,kind=c_intptr_t) * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        successgpu = gpu_memset_async(q_dev, 0, num, my_stream)
        check_memset_gpu("solve_tridi: q_dev", successgpu)
#else
        successgpu = gpu_memset(q_dev, 0, num)
        check_memset_gpu("solve_tridi: q_dev", successgpu)
#endif
      else
        !if (.not.(obj%eigenvalues_only)) then
          ! Set Q to 0
          q(1:l_rows, 1:l_cols) = 0.0_rk
        !endif
      endif

      ! Get the limits of the subdivisions, each subdivision has as many cols
      ! as fit on the respective processor column.
      ! limits(np) is the number of columns located on the processor columns 0..np-1.

      allocate(limits(0:np_cols), stat=istat, errmsg=errormessage)
      check_allocate("solve_tridi: limits", istat, errormessage)

      limits(0) = 0
      do np=0,np_cols-1
        nc = local_index(na, np, np_cols, nblk, -1) ! number of columns on proc column np

        ! Check for the case that a column has zero width.
        ! This is not supported!
        ! Scalapack supports it but delivers no results for these columns,
        ! which is rather annoying
        if (nc==0) then
          ! The timer has to be stopped under the very same name it was started with
          call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
          if (wantdebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width'
          success = .false.
          return
        endif
        limits(np+1) = limits(np) + nc
      enddo

      ! Subdivide matrix by subtracting rank 1 modifications

      if (usegpu) then

        ! careful: only limits(1:np_cols) is copied to the device, not limits(0:np_cols) as used on the CPU
        num = (np_cols) * size_of_int
        successgpu = gpu_malloc(limits_dev, num)
        check_alloc_gpu("solve_tridi limits_dev: ", successgpu)

#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        successgpu = gpu_memcpy_async(limits_dev, int(loc(limits(1)),kind=c_intptr_t), &
                      num, gpumemcpyhosttodevice, my_stream)
        check_memcpy_gpu("solve_tridi limits_dev: ", successgpu)
#else
        successgpu = gpu_memcpy(limits_dev, int(loc(limits(1)),kind=c_intptr_t), &
                      num, gpumemcpyhosttodevice)
        check_memcpy_gpu("solve_tridi: limits_dev", successgpu)
#endif

        ! Device counterpart of the CPU loop in the else branch below
        my_stream = obj%gpu_setup%my_stream
        call gpu_update_d_precision (limits_dev, d_dev, e_dev, np_cols, na, my_stream)

        successgpu = gpu_free(limits_dev)
        check_dealloc_gpu("solve_tridi: limits_dev", successgpu)

      else
        ! At every split point n the two neighbouring diagonal entries are reduced by abs(e(n))
        do i=1,np_cols-1
          n = limits(i)
          d(n) = d(n)-abs(e(n))
          d(n+1) = d(n+1)-abs(e(n))
        enddo
      endif

      ! Solve sub problems on processor columns

      nc = limits(my_pcol) ! column after which my problem starts

      if (np_cols>1) then
        nev1 = l_cols ! all eigenvectors are needed
      else
        nev1 = min(nev,l_cols)
      endif

      if (usegpu) then
        ! The device addresses of d and e are advanced by nc elements, i.e. to element nc+1
        call solve_tridi_col_gpu_&
             &precision_and_suffix &
               (obj, l_cols, nev1, nc, d_dev + nc*size_of_datatype_real, &
                          e_dev + nc*size_of_datatype_real, q_dev, ldq, nblk,  &
                          matrixcols, mpi_comm_rows, wantdebug, success, max_threads)

        ! Download q from the device into the host array q
        num = int(ldq,kind=c_intptr_t) * int(matrixcols,kind=c_intptr_t) * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        call gpu_memcpy_async_and_stream_synchronize &
            ("solve_tridi q_dev -> q_vec aaa", q_dev, 0_c_intptr_t, &
                                                 q(1:ldq,1:matrixcols), &
                                 1, 1, num, gpumemcpydevicetohost, my_stream, .false., .false., .false.)
#else
        successgpu = gpu_memcpy(int(loc(q(1,1)),kind=c_intptr_t),  q_dev, &
                              num, gpumemcpydevicetohost)
        check_memcpy_gpu("solve_tridi aaa: q_dev", successgpu)
#endif
      else ! useGPU
        call solve_tridi_col_cpu_&
             &precision_and_suffix &
               (obj, l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk,  &
                          matrixcols, mpi_comm_rows, wantdebug, success, max_threads)
      endif ! useGPU

      if (.not.(success)) then
        call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
        return
      endif

      ! If there is only 1 processor column, we are done

      if (np_cols==1) then
        deallocate(limits, stat=istat, errmsg=errormessage)
        check_deallocate("solve_tridi: limits", istat, errormessage)

        call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
        return
      endif

      ! Set index arrays for Q columns

      ! Dense distribution scheme:
      ! global column n is the l_col(n)-th local column of processor column p_col(n)

      allocate(l_col(na), stat=istat, errmsg=errormessage)
      check_allocate("solve_tridi: l_col", istat, errormessage)

      allocate(p_col(na), stat=istat, errmsg=errormessage)
      check_allocate("solve_tridi: p_col", istat, errormessage)

      n = 0
      do np=0,np_cols-1
        nc = local_index(na, np, np_cols, nblk, -1)
        do i=1,nc
          n = n+1
          l_col(n) = i
          p_col(n) = np
        enddo
      enddo

      ! Block cyclic distribution scheme, only nev columns are set:
      ! all other entries keep the marker value -1

      allocate(l_col_bc(na), stat=istat, errmsg=errormessage)
      check_allocate("solve_tridi: l_col_bc", istat, errormessage)

      allocate(p_col_bc(na), stat=istat, errmsg=errormessage)
      check_allocate("solve_tridi: p_col_bc", istat, errormessage)

      p_col_bc(:) = -1
      l_col_bc(:) = -1

      do i = 0, na-1, nblk*np_cols
        do j = 0, np_cols-1
          do n = 1, nblk
            if (i+j*nblk+n <= min(nev,na)) then
              p_col_bc(i+j*nblk+n) = j
              l_col_bc(i+j*nblk+n) = i/np_cols + n
            endif
          enddo
        enddo
      enddo

      ! The merge below is called with the host arrays d, e and q.
      ! In the GPU build d and e are therefore downloaded from the device first.
      if (usegpu) then
        num = na * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        call gpu_memcpy_async_and_stream_synchronize &
            ("solve_tridi d_dev -> d", d_dev, 0_c_intptr_t, &
                                                 d(1:na), &
                                  1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
#else
        successgpu = gpu_memcpy(int(loc(d(1)),kind=c_intptr_t),  d_dev, &
                              num, gpumemcpydevicetohost)
        check_memcpy_gpu("solve_tridi: 1: d_dev", successgpu)
#endif
        num = na * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        call gpu_memcpy_async_and_stream_synchronize &
            ("solve_tridi e_dev -> e", e_dev, 0_c_intptr_t, &
                                                 e(1:na), &
                                  1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
#else
        successgpu = gpu_memcpy(int(loc(e(1)),kind=c_intptr_t),  e_dev, &
                              num, gpumemcpydevicetohost)
        check_memcpy_gpu("solve_tridi: 1: e_dev", successgpu)
#endif
      endif

      ! Recursively merge sub problems
      call merge_recursive_&
           &precision &
           (obj, 0, np_cols, ldq, matrixcols, nblk, &
           l_col, p_col, l_col_bc, p_col_bc, limits, &
           np_cols, na, q, d, e, &
           mpi_comm_all, mpi_comm_rows, mpi_comm_cols,&
           usegpu, wantdebug, success, max_threads)

      if (.not.(success)) then
        call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
        return
      endif

      deallocate(limits,l_col,p_col,l_col_bc,p_col_bc, stat=istat, errmsg=errormessage)
      check_deallocate("solve_tridi: limits, l_col, p_col, l_col_bc, p_col_bc", istat, errormessage)

      if (usegpu) then
        ! dirty hack: the merge was done with the host arrays, so d, e and (unless only
        ! eigenvalues are requested) q are uploaded to the device buffers again
        num = na * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        call gpu_memcpy_async_and_stream_synchronize &
            ("solve_tridi d -> d_dev", d_dev, 0_c_intptr_t, &
                                                 d(1:na), &
                                  1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
#else
        successgpu = gpu_memcpy(d_dev, int(loc(d(1)),kind=c_intptr_t),  &
                              num, gpumemcpyhosttodevice)
        check_memcpy_gpu("solve_tridi: d_dev", successgpu)
#endif
        num = na * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        call gpu_memcpy_async_and_stream_synchronize &
            ("solve_tridi e -> e_dev", e_dev, 0_c_intptr_t, &
                                                 e(1:na), &
                                 1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
#else
        successgpu = gpu_memcpy(e_dev, int(loc(e(1)),kind=c_intptr_t),  &
                              num, gpumemcpyhosttodevice)
        check_memcpy_gpu("solve_tridi: e_dev", successgpu)
#endif
        if (.not.(obj%eigenvalues_only)) then
          num = int(ldq,kind=c_intptr_t) * int(matrixcols,kind=c_intptr_t) * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
          my_stream = obj%gpu_setup%my_stream
          call gpu_memcpy_async_and_stream_synchronize &
            ("solve_tridi q_vec -> q_dev", q_dev, 0_c_intptr_t, &
                                                 q(1:ldq,1:matrixcols), &
                                 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
#else
          successgpu = gpu_memcpy(q_dev, int(loc(q(1,1)),kind=c_intptr_t),  &
                              num, gpumemcpyhosttodevice)
          check_memcpy_gpu("solve_tridi: q_dev", successgpu)
#endif
        endif ! eigenvalues_only
      endif

      call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)

    end

distribute_global_column
Definition mod_distribute_global_column.F90:55

elpa_abstract_impl
Fortran module to provide an abstract definition of the implementation. Do not use directly....
Definition elpa_abstract_impl.F90:50

merge_recursive
Definition mod_merge_recursive.F90:3

merge_systems
Definition mod_merge_systems.F90:3

elpa_abstract_impl::elpa_abstract_impl_t
Definition elpa_abstract_impl.F90:73