! Extracted from: ELPA-2025.01.002/html/elpa__hermitian__multiply__template_8F90_source.html

!

!    The ELPA library was originally created by the ELPA consortium,

!    consisting of the following organizations:

!

!    - Max Planck Computing and Data Facility (MPCDF), formerly known as

!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),

!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte

!      Informatik,

!    - Technische Universität München, Lehrstuhl für Informatik mit

!      Schwerpunkt Wissenschaftliches Rechnen ,

!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,

!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,

!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,

!      and

!    - IBM Deutschland GmbH

!

!    This particular source code file contains additions, changes and

!    enhancements authored by Intel Corporation which is not part of

!    the ELPA consortium.

!

!    More information can be found here:

!    http://elpa.mpcdf.mpg.de/

!

!    ELPA is free software: you can redistribute it and/or modify

!    it under the terms of the version 3 of the license of the

!    GNU Lesser General Public License as published by the Free

!    Software Foundation.

!

!    ELPA is distributed in the hope that it will be useful,

!    but WITHOUT ANY WARRANTY; without even the implied warranty of

!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

!    GNU Lesser General Public License for more details.

!

!    You should have received a copy of the GNU Lesser General Public License

!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>

!

!    ELPA reflects a substantial effort on the part of the original

!    ELPA consortium, and we ask you to respect the spirit of the

!    license that we chose: i.e., please contribute any changes you

!    may have back to the original ELPA library distribution, and keep

!    any derivatives of ELPA under the same license that we chose for

!    the original distribution, the GNU Lesser General Public License.

!

!

! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines

!

! Copyright of the original code rests with the authors inside the ELPA

! consortium. The copyright of any additional modifications shall rest

! with their original authors, but shall adhere to the licensing terms

! distributed along with the original code in the file "COPYING".

!

! Author: A. Marek, MPCDF


#include "../general/sanity.F90"

#include "../general/error_checking.inc"


#undef USE_CCL_HERMITIAN_MULTIPLY

#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)

#define USE_CCL_HERMITIAN_MULTIPLY

#endif


  use elpa1_compute

  use elpa_mpi

  use precision

  use elpa_abstract_impl

  use, intrinsic :: iso_c_binding

  use elpa_gpu

  use mod_check_for_gpu

  use elpa_blas_interfaces

  use elpa_utilities, only : local_index, greatest_common_divisor, check_deallocate_f, check_dealloc_gpu_f, &

                             check_host_dealloc_gpu_f, check_alloc_gpu_f, check_host_alloc_gpu_f, &

                             check_host_unregister_gpu_f, check_memcpy_gpu_f, check_allocate_f, &

                             check_host_register_gpu_f, check_alloc, error_unit

  use mod_query_gpu_usage

#ifdef WITH_GPU_STREAMS

  use elpa_gpu_util

#endif

#if defined(WITH_NVIDIA_GPU_VERSION) && defined(WITH_NVTX)

  use cuda_functions ! for NVTX labels

#elif defined(WITH_AMD_GPU_VERSION) && defined(WITH_ROCTX)

  use hip_functions  ! for ROCTX labels

#endif

#if defined(USE_CCL_HERMITIAN_MULTIPLY)

  use elpa_ccl_gpu

#endif

  use multiply_a_b_gpu

  implicit none


#include "../../src/general/precision_kinds.F90"

  ! ELPA object: options, timer, MPI setup and GPU setup are all read from it below
  class(elpa_abstract_impl_t), intent(inout)   :: obj


  ! uplo_a / uplo_c are compared against 'u'/'U' and 'l'/'L' further down;
  ! trans_a / trans_b are not referenced in the statements of this template
  character*1                                  :: uplo_a, uplo_c, trans_a, trans_b


  ! leading dimensions and column counts used to shape b and c
  integer(kind=ik), intent(in)                 :: ldb, ldbCols, ldc, ldcCols

  ! na is (re)assigned from obj%na below; ncb is used to compute l_cols
  integer(kind=ik)                             :: na, ncb

#ifndef DEVICE_POINTER

#ifdef USE_ASSUMED_SIZE

  math_datatype(kind=rck)                      :: a(obj%local_nrows,*), b(ldb,*), c(ldc,*)

#else

  math_datatype(kind=rck)                      :: a(obj%local_nrows,obj%local_ncols), b(ldb,ldbcols), c(ldc,ldccols)

#endif

#else /* DEVICE_POINTER */

  ! dummy variables

  math_datatype(kind=rck), allocatable         :: a(:,:), b(:,:), c(:,:)

  ! device pointers; converted to integer addresses (a_dev, b_dev, c_dev) via transfer() below
  type(c_ptr)                                  :: aDev, bDev, cDev

#endif /* DEVICE_POINTER */


  ! position of this process in the process grid and grid extents (copied from obj%mpi_setup)
  integer(kind=ik)                             :: my_prow, my_pcol, np_rows, np_cols, myid

  integer(kind=MPI_KIND)                       :: my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI

  integer(kind=MPI_KIND)                       :: mpierr, myidMPI

  ! local row/column counts obtained from local_index()
  integer(kind=ik)                             :: l_cols, l_rows, l_rows_np

  integer(kind=ik)                             :: n

  ! loop indices (np, nb), multiplication block factor and local row/col windows
  integer(kind=ik)                             :: np, nb, nblk_mult, lrs, lre, lcs, lce

  integer(kind=ik)                             :: gcol_min, gcol, goff

  integer(kind=ik)                             :: nstor, nr_done, noff, np_bc, n_aux_bc, nvals

  ! per-column row windows remembered between the gather and the unpack step
  integer(kind=ik), allocatable                :: lrs_save(:), lre_save(:)


  ! flags derived from uplo_a / uplo_c
  logical                                      :: a_lower, a_upper, c_lower, c_upper

  math_datatype(kind=rck)                      :: beta

  ! aux_mat is a pointer because it is mapped onto aux_host via c_f_pointer in one build variant
  math_datatype(kind=rck), pointer, contiguous :: aux_mat(:,:), tmp1(:,:)

  math_datatype(kind=rck), allocatable         :: aux_bc(:), tmp2(:,:)

  logical                                      :: wantDebug

  integer(kind=ik)                             :: istat, debug

  character(200)                               :: errorMessage

  ! suffix appended to the timer label ("_gpu" or blank)
  character(20)                                :: gpuString

  logical                                      :: success, successGPU, successGPU2

  logical                                      :: useGPU

  integer(kind=c_int)                          :: numGPU, blocking

  integer(kind=ik)                             :: mpi_comm_rows, mpi_comm_cols, mpi_comm_all

  integer(kind=ik)                             :: nblk, matrixRows, matrixCols, error

  ! device addresses of the work buffers
  integer(kind=c_intptr_t)                     :: aux_bc_dev, aux_mat_dev, tmp1_dev, tmp2_dev


  ! device addresses of the matrices a, b and c
  integer(kind=c_intptr_t)                     :: a_dev

  integer(kind=c_intptr_t)                     :: b_dev

  integer(kind=c_intptr_t)                     :: c_dev


  ! host buffer obtained from gpu_malloc_host that backs aux_mat
  type(c_ptr)                                  :: aux_host

  ! byte count passed to the GPU allocation / copy / memset calls
  integer(kind=c_intptr_t)                     :: num

  ! byte offsets added to aux_mat_dev and b_dev in the GPU GEMM call
  integer(kind=c_intptr_t)                     :: aux_off, b_off

  ! element size; the right-hand side name is assembled from continuation pieces
  integer(kind=c_intptr_t), parameter          :: size_of_datatype = size_of_&

                                                            &precision&

                                                            &_&

                                                            &math_datatype


  integer(kind=c_intptr_t)                     :: gpuHandle, my_stream

  ! value of the "gpu_hermitian_multiply" option (0/1 override of useGPU)
  integer(kind=c_int)                          :: gpu_hermitian_multiply


  logical                                      :: useCCL

#if defined(USE_CCL_HERMITIAN_MULTIPLY)

  integer(kind=c_intptr_t)                     :: ccl_comm_rows, ccl_comm_cols

  integer(kind=c_int)                          :: cclDataType

  ! multiplier applied to element counts in CCL calls (set to 1 for real, 2 for complex below)
  integer(kind=ik)                             :: k_datatype

#endif


  integer(kind=c_intptr_t)                     :: aux_dev

  integer(kind=c_int)                          :: gpu


#ifdef WITH_NVTX
  call nvtxrangepush("hermitian_multiply")
#endif

  ! Start optimistic; usegpu is decided further below.
  success = .true.
  usegpu = .false.

  ! Read the "debug" option. On failure report the problem, flag the error
  ! and leave; the NVTX range opened above is closed first so that push/pop
  ! stay balanced on this early exit as well.
  call obj%get("debug", debug, error)
  if (error .ne. elpa_ok) then
    write(error_unit,*) "elpa_hermitian_multiply: Problem getting option for debug settings. Aborting..."
    success = .false.
#ifdef WITH_NVTX
    call nvtxrangepop() ! hermitian_multiply
#endif
    return
  endif

  ! debug == 1 switches on the extra device synchronisation used later
  wantdebug = (debug == 1)


#if defined(DEVICE_POINTER)

  ! Device-pointer interface: the matrices already live on the device, so the
  ! GPU path is mandatory; turn the c_ptr handles into integer addresses.
  usegpu = .true.

  a_dev = transfer(adev, a_dev)
  b_dev = transfer(bdev, b_dev)
  c_dev = transfer(cdev, c_dev)

#else /* !DEVICE_POINTER */

#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
  ! general GPU setting as stored in the ELPA object
  if (.not.(query_gpu_usage(obj, "ELPA_MULITPLY_AB", usegpu))) then
    print *,"ELPA_MULITPLY_AB: Problem querrying settings for GPU Aborting..."
    stop 1
  endif
#endif

  ! a routine-specific option, if set, overrides the general setting
  if (obj%is_set("gpu_hermitian_multiply") == 1) then
    call obj%get("gpu_hermitian_multiply", gpu_hermitian_multiply, error)
    if (error .ne. elpa_ok) then
      print *,"Problem getting option for gpu_hermitian_mutltiply. Aborting..."
      stop 1
    endif

    select case (gpu_hermitian_multiply)
      case (0)
        usegpu = .false.
      case (1)
        usegpu = .true.
      case default
        ! any other value: keep the setting found before
    end select
  endif

#endif /* DEVICE_POINTER */

  ! suffix for the timer label
  gpustring = ""
  if (usegpu) gpustring = "_gpu"


  call obj%timer%start("elpa_hermitian_multiply_&
  &MATH_DATATYPE&
  &_&
  &PRECISION&
  &"//gpustring)

  ! matrix size, block size and local storage extents
  na         = obj%na
  nblk       = obj%nblk
  matrixrows = obj%local_nrows
  matrixcols = obj%local_ncols

  ! communicators, ranks and grid extents from the MPI setup of the object
  associate (grid => obj%mpi_setup)
    mpi_comm_all  = grid%mpi_comm_parent
    mpi_comm_rows = grid%mpi_comm_rows
    mpi_comm_cols = grid%mpi_comm_cols

    myid    = grid%myRank_comm_parent
    my_prow = grid%myRank_comm_rows
    np_rows = grid%nRanks_comm_rows
    my_pcol = grid%myRank_comm_cols
    np_cols = grid%nRanks_comm_cols
  end associate

  ! local extents on this process
  l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1) ! Local cols of b
  l_rows = local_index(na,  my_prow, np_rows, nblk, -1) ! Local rows of a and b


  ! Block factor for the matrix multiplications; always a multiple of nblk.
  ! A user-supplied "blocking_in_multiply" wins; otherwise a default is
  ! chosen from the GPU/CPU path and the rows-per-process-row ratio.
  if (obj%is_set("blocking_in_multiply") == 1) then
    call obj%get("blocking_in_multiply", blocking, error)
    if (error .ne. elpa_ok) then
      write(error_unit,*) "elpa_hermitian_multiply: Problem in getting keyword 'blocking_in_multiply'. Aborting..."
      stop 1
    endif
    nblk_mult = (blocking/nblk+1) * nblk
  else if (usegpu) then
    nblk_mult = (merge(63, 351, na/np_rows <= 256)/nblk+1)*nblk
  else
    nblk_mult = (merge(31, 63, na/np_rows <= 256)/nblk+1)*nblk
  endif


  ! Set up the GPU (if requested) and allocate the aux_mat work buffer.
  ! Changes relative to the previous version:
  !  - the "no GPU found" exit stops the running timers and closes the NVTX
  !    range before returning,
  !  - my_stream is assigned once here for every GPU build, so it is defined
  !    before the first kernel call that receives it,
  !  - the host-register check is only evaluated when a registration was
  !    actually attempted,
  !  - byte counts of whole matrices are computed in c_intptr_t arithmetic.
  useccl = .false.
  if (usegpu) then
    call obj%timer%start("check_for_gpu")
    if (check_for_gpu(obj, myid, numgpu)) then
      ! set the necessary parameters
      call set_gpu_parameters()
    else
      print *,"GPUs are requested but not detected! Aborting..."
      success = .false.
      call obj%timer%stop("check_for_gpu")
      call obj%timer%stop("elpa_hermitian_multiply_&
      &MATH_DATATYPE&
      &_&
      &PRECISION&
      &"//gpustring)
#ifdef WITH_NVTX
      call nvtxrangepop() ! hermitian_multiply
#endif
      return
    endif
    call obj%timer%stop("check_for_gpu")

    my_stream = obj%gpu_setup%my_stream

#if defined(USE_CCL_HERMITIAN_MULTIPLY)
    useccl = .true.

    ccl_comm_rows = obj%gpu_setup%ccl_comm_rows
    ccl_comm_cols = obj%gpu_setup%ccl_comm_cols

    ! k_datatype scales element counts in the CCL calls (1 real, 2 complex)
#if   REALCASE == 1 && defined(DOUBLE_PRECISION)
    ccldatatype = ccldouble
    k_datatype = 1
#elif REALCASE == 1 && defined(SINGLE_PRECISION)
    ccldatatype = cclfloat
    k_datatype = 1
#elif COMPLEXCASE == 1 && defined(DOUBLE_PRECISION)
    ccldatatype = ccldouble
    k_datatype = 2
#elif COMPLEXCASE == 1 && defined(SINGLE_PRECISION)
    ccldatatype = cclfloat
    k_datatype = 2
#endif
#endif /* defined(USE_CCL_HERMITIAN_MULTIPLY) */

#if !defined(DEVICE_POINTER)
    ! device copy of c; no upload needed since c will be overwritten anyway
    num = int(ldc,kind=c_intptr_t)*int(ldccols,kind=c_intptr_t)*size_of_datatype
    successgpu = gpu_malloc(c_dev, num)
    check_alloc_gpu("elpa_hermitian_multiply: c_dev", successgpu)

    ! device copy of b
    num = int(ldb,kind=c_intptr_t)*int(ldbcols,kind=c_intptr_t)*size_of_datatype
    successgpu = gpu_malloc(b_dev, num)
    check_alloc_gpu("elpa_hermitian_multiply: b_dev", successgpu)

#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
    successgpu = gpu_host_register(int(loc(b),kind=c_intptr_t),num,&
                  gpuhostregisterdefault)
    check_host_register_gpu("elpa_hermitian_multiply: b", successgpu)
#endif

    ! copy b to b_dev
#ifdef WITH_GPU_STREAMS
    call gpu_memcpy_async_and_stream_synchronize &
    ("elpa_hermitian_multiply: b to b_dev", b_dev, 0_c_intptr_t, &
                                       b(1:ldb,1:ldbcols), &
                                       1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
#else
    successgpu = gpu_memcpy(b_dev,int(loc(b),kind=c_intptr_t),num,&
                  gpumemcpyhosttodevice)
    check_memcpy_gpu("elpa_hermitian_multiply: b to b_dev", successgpu)
#endif
#endif /* !defined(DEVICE_POINTER) */

    ! host and device buffer for nblk_mult gathered columns
    num = int(l_rows,kind=c_intptr_t)*int(nblk_mult,kind=c_intptr_t)*size_of_datatype
#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
    successgpu = gpu_malloc_host(aux_host, num) ! aux_host is needed, because pinning host memory can be done only for 1D arrays
    check_host_alloc_gpu("elpa_hermitian_multiply: aux_host", successgpu)
    call c_f_pointer(aux_host, aux_mat, (/l_rows,nblk_mult/))
#else
    allocate(aux_mat(l_rows, nblk_mult), stat=istat, errmsg=errormessage)
    check_allocate("elpa_hermitian_multiply: aux_mat", istat, errormessage)
#endif

    successgpu = gpu_malloc(aux_mat_dev, num)
    check_alloc_gpu("elpa_hermitian_multiply: aux_mat_dev", successgpu)

    ! device buffers for the partial product (tmp1) and the reduced result (tmp2)
    num = int(nblk_mult,kind=c_intptr_t)*int(l_cols,kind=c_intptr_t)*size_of_datatype
    successgpu = gpu_malloc(tmp1_dev, num)
    check_alloc_gpu("elpa_hermitian_multiply: tmp1_dev", successgpu)

    successgpu = gpu_malloc(tmp2_dev, num)
    check_alloc_gpu("elpa_hermitian_multiply: tmp2_dev", successgpu)

  else ! useGPU
    allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errormessage)
    check_allocate("elpa_hermitian_multiply: aux_mat", istat, errormessage)
  endif ! useGPU


  ! packed broadcast buffer: up to nblk columns of at most l_rows entries
  allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errormessage)
  check_allocate("elpa_hermitian_multiply: aux_bc", istat, errormessage)

  ! first / last local row of every column packed into aux_bc
  allocate(lrs_save(nblk), stat=istat, errmsg=errormessage)
  check_allocate("elpa_hermitian_multiply: lrs_save", istat, errormessage)

  allocate(lre_save(nblk), stat=istat, errmsg=errormessage)
  check_allocate("elpa_hermitian_multiply: lre_save", istat, errormessage)

  ! triangle selectors; any character other than u/U/l/L leaves both flags false
  a_upper = (uplo_a=='u' .or. uplo_a=='U')
  a_lower = (uplo_a=='l' .or. uplo_a=='L')
  c_upper = (uplo_c=='u' .or. uplo_c=='U')
  c_lower = (uplo_c=='l' .or. uplo_c=='L')


  ! Device buffers for a and for the packed broadcast column.
  ! The byte count of a is formed in c_intptr_t arithmetic so that the
  ! product of the two local extents cannot overflow the default integer kind.
  if (usegpu) then

#if !defined(DEVICE_POINTER)
    num = int(obj%local_nrows,kind=c_intptr_t)*int(obj%local_ncols,kind=c_intptr_t) &
          *size_of_datatype
    successgpu = gpu_malloc(a_dev, num)
    check_alloc_gpu("elpa_hermitian_multiply: a_dev", successgpu)
#endif

    num = int(l_rows,kind=c_intptr_t)*int(nblk,kind=c_intptr_t)*size_of_datatype
    successgpu = gpu_malloc(aux_bc_dev, num)
    check_alloc_gpu("elpa_hermitian_multiply: aux_bc_dev", successgpu)

#if !defined(DEVICE_POINTER)
    ! upload a
    num = int(obj%local_nrows,kind=c_intptr_t)*int(obj%local_ncols,kind=c_intptr_t) &
          *size_of_datatype
#ifdef WITH_GPU_STREAMS
    my_stream = obj%gpu_setup%my_stream
    call gpu_memcpy_async_and_stream_synchronize &
    ("elpa_hermitian_multiply: a to a_dev", a_dev, 0_c_intptr_t, &
                                       a(1:obj%local_nrows,1:obj%local_ncols), &
                                       1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
#else
    successgpu = gpu_memcpy(a_dev, int(loc(a),kind=c_intptr_t), &
                  num, gpumemcpyhosttodevice)
    check_memcpy_gpu("elpa_hermitian_multiply: a to a_dev", successgpu)
#endif
#endif /* DEVICE_POINTER */

  endif !useGPU


! _________________________________________________________________________________________________________________________________


  ! main loop: build up the result matrix by processor rows

  do np = 0, np_rows-1


#ifdef WITH_NVTX

    call nvtxrangepush("do np = 0, np_rows-1")

#endif


    ! Process row np collects the result in this pass of the main loop.
    nstor   = 0 ! columns currently held in aux_mat
    nr_done = 0 ! result rows already finished
    l_rows_np = local_index(na, np, np_rows, nblk, -1) ! local rows on receiving processors

    ! start from a cleared gather buffer on the host and, if used, on the device
    aux_mat(:,:) = 0
    if (usegpu) then
      num = l_rows*nblk_mult*size_of_datatype
#ifdef WITH_GPU_STREAMS
      my_stream = obj%gpu_setup%my_stream
      successgpu = gpu_memset_async(aux_mat_dev, 0, num, my_stream)
#else
      successgpu = gpu_memset(aux_mat_dev, 0, num)
#endif
      check_memcpy_gpu("multiply: aux_mat_dev", successgpu)
    endif ! useGPU


    ! Loop over the blocks on row np; nb is the 0-based local index of the block

    do nb = 0, (l_rows_np-1)/nblk


#ifdef WITH_NVTX

      call nvtxrangepush("do nb = 0, (l_rows_np-1)/nblk")

#endif


      ! Pack the part of one block column of a that is needed for this block
      ! into the contiguous buffer aux_bc (host) or aux_bc_dev (device).
      goff  = nb*np_rows + np ! offset in the global grid of blocks


      ! Get the processor column which owns this block

      ! and the offset in blocks within this column.

      ! The corresponding block column in A is then broadcast to all for multiplication with B


      np_bc = mod(goff, np_cols) ! process column that possesses the given column of blocks ("bc" = block column)


      noff = goff/np_cols   ! offset in the local grid of blocks


      ! Gather up the complete column/row of blocks of A (for T/N case) on the owner in contiguous memory of aux_bc array

      n_aux_bc = 0

      ! if not for upper/lower cases: aux_bc_2D(1:l_rows,1:l_cols) = a(1:l_rows,1:l_cols)

      do n = 1, min(nblk, l_rows_np-nb*nblk) ! Loop over the local columns to be broadcast


        gcol = goff*nblk + n ! global column corresponding to n, needed only for a_lower and a_upper cases


        ! remember the first global column of the batch currently collected in aux_mat
        if (nstor==0 .and. n==1) gcol_min = gcol


        lrs = 1       ! 1st (start) local row number for broadcast

        lre = l_rows  ! last (end)  local row number for broadcast

        ! restrict the row window when only one triangle of a is referenced
        if (a_lower) lrs = local_index(gcol, my_prow, np_rows, nblk, +1)

        if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1)


        if (lrs <= lre) then

          nvals = lre-lrs+1

          ! only the owning process column copies data; n_aux_bc is advanced on
          ! every process so that all of them agree on the packed length
          if (usegpu) then

            if (my_pcol == np_bc) call gpu_copy_precision_a_aux_bc(a_dev, aux_bc_dev, n_aux_bc, nvals, lrs, lre, noff, &

                                                                    nblk, n, l_rows, obj%local_nrows, obj%local_ncols, my_stream)

          else ! useGPU

            if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n)

          endif ! useGPU


          n_aux_bc = n_aux_bc + nvals

        endif ! (lrs <= lre)


        ! keep the window of column n; it is read again when aux_bc is unpacked
        lrs_save(n) = lrs

        lre_save(n) = lre


      enddo ! n = 1, min(nblk, l_rows_np-nb*nblk)


#ifdef WITH_MPI
      ! Broadcast the packed block column from process column np_bc to all
      ! process columns. With CCL the device buffer is broadcast directly;
      ! otherwise the data is staged through the host buffer aux_bc and sent
      ! with mpi_bcast.

      ! copy data to host for bcast, if needed
      if (usegpu .and. .not. useccl) then
        num = l_rows*nblk*size_of_datatype
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        call gpu_memcpy_async_and_stream_synchronize &
              ("elpa_hermitian_multiply: aux_bc_dev -> aux_bc", aux_bc_dev, 0_c_intptr_t, aux_bc(1:l_rows*nblk), &
              1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
#else
        successgpu = gpu_memcpy(int(loc(aux_bc),kind=c_intptr_t), aux_bc_dev, num, gpumemcpydevicetohost)
        check_memcpy_gpu("elpa_hermitian_multiply: aux_bc_dev -> aux_bc", successgpu)
#endif
      endif ! useGPU  .and. .not. useCCL

      ! Broadcast block column
      if (useccl) then
#ifdef USE_CCL_HERMITIAN_MULTIPLY
#ifdef WITH_NVTX
        call nvtxrangepush("ccl_bcast aux_bc_dev")
#endif
        call obj%timer%start("ccl_bcast")

        my_stream = obj%gpu_setup%my_stream
        ccl_comm_cols = obj%gpu_setup%ccl_comm_cols

        ! the element count is scaled by k_datatype (1 real, 2 complex)
        successgpu = ccl_bcast(aux_bc_dev, aux_bc_dev, int(k_datatype*n_aux_bc,kind=c_size_t), ccldatatype, &
                              int(np_bc,kind=c_int), ccl_comm_cols, my_stream)

        if (.not. successgpu) then
          print *,"Error in ccl_bcast"
          stop 1
        endif

        ! wait for the broadcast; the label now names this routine
        successgpu = gpu_stream_synchronize(my_stream)
        check_stream_synchronize_gpu("elpa_hermitian_multiply: ccl_bcast", successgpu)

        call obj%timer%stop("ccl_bcast")
#ifdef WITH_NVTX
        call nvtxrangepop() ! ccl_bcast aux_bc_dev
#endif
#endif /* USE_CCL_HERMITIAN_MULTIPLY */
      else ! useCCL
        call obj%timer%start("mpi_communication")

        call mpi_bcast(aux_bc, int(n_aux_bc,kind=mpi_kind), mpi_math_datatype_precision, &
                      int(np_bc,kind=mpi_kind), int(mpi_comm_cols,kind=mpi_kind), mpierr)

        call obj%timer%stop("mpi_communication")
      endif ! useCCL

      ! copy data back to device, if needed
      if (usegpu .and. .not. useccl) then
        num = l_rows*nblk*size_of_datatype
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        call gpu_memcpy_async_and_stream_synchronize &
            ("elpa_hermitian_multiply: aux_bc -> aux_bc_dev", aux_bc_dev, 0_c_intptr_t, aux_bc(1:l_rows*nblk), &
              1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
#else
        successgpu = gpu_memcpy(aux_bc_dev, int(loc(aux_bc),kind=c_intptr_t), num, gpumemcpyhosttodevice)
        check_memcpy_gpu("elpa_hermitian_multiply: aux_bc -> aux_bc_dev", successgpu)
#endif
      endif ! useGPU .and. .not. useCCL
#endif /* WITH_MPI */


      ! Unpack the received columns from aux_bc(_dev) into the next free
      ! columns of aux_mat(_dev). Every column advances nstor, even when its
      ! row window is empty.
      if (usegpu) my_stream = obj%gpu_setup%my_stream

      n_aux_bc = 0
      do n = 1, min(nblk, l_rows_np-nb*nblk)
        nstor = nstor+1
        lrs = lrs_save(n)
        lre = lre_save(n)

        ! nothing was packed for an empty window
        if (lrs > lre) cycle

        nvals = lre-lrs+1
        if (usegpu) then
          call gpu_copy_precision_aux_bc_aux_mat(aux_bc_dev, aux_mat_dev, lrs, lre, nstor, n_aux_bc, &
                                                nvals, l_rows, nblk, nblk_mult, my_stream)
        else
          aux_mat(lrs:lre,nstor) = aux_bc(n_aux_bc+1:n_aux_bc+nvals)
        endif
        n_aux_bc = n_aux_bc + nvals
      enddo


      ! If we got nblk_mult columns in aux_mat or this is the last block

      ! do the matrix multiplication


      if (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np) then


        ! Row window of the multiply: full local range, narrowed when only
        ! one triangle of a is referenced.
        lrs = 1
        if (a_lower) lrs = local_index(gcol_min, my_prow, np_rows, nblk, +1)
        lre = l_rows
        if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1)

        ! Column window of the multiply: full local range, narrowed when only
        ! one triangle of c is requested.
        lcs = 1
        if (c_upper) lcs = local_index(gcol_min, my_pcol, np_cols, nblk, +1)
        lce = l_cols
        if (c_lower) lce = min(local_index(gcol, my_pcol, np_cols, nblk, -1),l_cols)


        if (lcs <= lce) then

          if (.not. useccl) then

            ! introduce 1-based indexing

            allocate(tmp1(nstor,1:lce-lcs+1), tmp2(nstor,1:lce-lcs+1), stat=istat, errmsg=errormessage)

            call check_alloc("elpa_hermitian_multiply_&

                            &MATH_DATATYPE ", "tmp1", istat, errormessage)

          endif


          ! Form the local contribution tmp1 (or tmp1_dev) for the current batch
          ! of nstor columns; with an empty row window the contribution is zero.
          if (lrs <= lre) then

            if (usegpu) then

              ! byte offsets of row lrs in aux_mat_dev and of element (lrs,lcs) in b_dev
              aux_off = (lrs-1)*size_of_datatype

              b_off = ((lcs-1)*ldb+lrs-1)*size_of_datatype


#ifdef WITH_NVTX

              call nvtxrangepush("gpublas")

#endif

              call obj%timer%start("gpublas")

              gpuhandle = obj%gpu_setup%gpublasHandleArray(0)

              ! tmp1_dev = aux_mat_dev^{T/N} * b_dev

              call gpublas_precision_gemm(blas_trans_or_conj, 'N', nstor, lce-lcs+1, lre-lrs+1, one, &

                                          aux_mat_dev+aux_off, l_rows, &

                                          b_dev+b_off, ldb, zero, &

                                          tmp1_dev, nstor, gpuhandle)

              ! in debug mode wait for the device; the returned flag is not checked here
              if (wantdebug) successgpu = gpu_devicesynchronize()

              call obj%timer%stop("gpublas")

#ifdef WITH_NVTX

              call nvtxrangepop() ! gpublas

#endif

            else ! useGPU

              call obj%timer%start("blas")

              ! tmp1 = aux_mat^{T/N} * b

              ! the section aux_mat(lrs:lre,1:nstor) is passed with a leading
              ! dimension equal to its own row count lre-lrs+1
              call precision_gemm(blas_trans_or_conj, 'N', int(nstor,kind=blas_kind), &

                                int(lce-lcs+1,kind=blas_kind), int(lre-lrs+1,kind=blas_kind), one, &

                                aux_mat(lrs:lre,1:nstor), int(lre-lrs+1,kind=blas_kind), &

                                b(lrs,lcs), int(ldb,kind=blas_kind), zero, &

                                tmp1, int(nstor,kind=blas_kind))

              call obj%timer%stop("blas")

            endif ! useGPU

          else ! (lrs <= lre)

            ! empty row window: this process contributes zeros to the reduction
            if (usegpu) then

              num = nstor*(lce-lcs+1)*size_of_datatype

#ifdef WITH_GPU_STREAMS

              my_stream = obj%gpu_setup%my_stream

              successgpu = gpu_memset_async(tmp1_dev, 0, num, my_stream)

              check_memcpy_gpu("multiply: tmp1_dev", successgpu)

#else

              successgpu = gpu_memset(tmp1_dev, 0, num)

              check_memcpy_gpu("multiply: tmp1_dev", successgpu)

#endif

            else ! useGPU

              tmp1 = 0

            endif ! useGPU

          endif ! (lrs <= lre)


          ! Sum up the results and send to processor row np


#ifdef WITH_MPI
          ! Sum the partial products over the row communicator onto process
          ! row np. With CCL the reduction runs on the device buffers
          ! (tmp1_dev -> tmp2_dev); otherwise the data is staged through the
          ! host arrays tmp1/tmp2 and reduced with mpi_reduce.

          ! copy data to host, if needed
          if (usegpu .and. .not. useccl) then
            num = nstor*(lce-lcs+1)*size_of_datatype
#ifdef WITH_GPU_STREAMS
            call gpu_memcpy_async_and_stream_synchronize &
            ("elpa_hermitian_multiply: tmp1_dev to tmp1", tmp1_dev, 0_c_intptr_t, &
                                                tmp1(1:nstor,1:lce-lcs+1), &
                                                1, 1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
#else
            successgpu = gpu_memcpy(int(loc(tmp1),kind=c_intptr_t), &
                            tmp1_dev, num, gpumemcpydevicetohost)
            check_memcpy_gpu("elpa_hermitian_multiply: tmp1_dev to tmp1", successgpu)
#endif
          endif ! useGPU .and. .not. useCCL

          ! MPI/ccl Reduce
          if (useccl) then
#ifdef USE_CCL_HERMITIAN_MULTIPLY
#ifdef WITH_NVTX
            call nvtxrangepush("ccl_reduce tmp1_dev")
#endif
            call obj%timer%start("ccl_reduce")
            my_stream = obj%gpu_setup%my_stream
            ccl_comm_rows = obj%gpu_setup%ccl_comm_rows

            ! the element count is scaled by k_datatype (1 real, 2 complex)
            successgpu = ccl_reduce(tmp1_dev, tmp2_dev, int(k_datatype*nstor*(lce-lcs+1),kind=c_size_t), ccldatatype, &
                                    cclsum, int(np,kind=c_int), ccl_comm_rows, my_stream)

            if (.not. successgpu) then
              print *,"Error in ccl_reduce"
              stop 1
            endif

            ! wait for the reduction; the label now names this routine
            successgpu = gpu_stream_synchronize(my_stream)
            check_stream_synchronize_gpu("elpa_hermitian_multiply: ccl_reduce", successgpu)

            call obj%timer%stop("ccl_reduce")
#ifdef WITH_NVTX
            call nvtxrangepop() ! ccl_reduce tmp1_dev
#endif
#endif /* USE_CCL_HERMITIAN_MULTIPLY */
          else ! useCCL
            call obj%timer%start("mpi_communication")
            call mpi_reduce(tmp1, tmp2, int(nstor*(lce-lcs+1),kind=mpi_kind),  mpi_math_datatype_precision, &
                          mpi_sum, int(np,kind=mpi_kind), int(mpi_comm_rows,kind=mpi_kind), mpierr)
            call obj%timer%stop("mpi_communication")
          endif ! useCCL

          ! copy data back to device, if needed
          if (usegpu .and. .not. useccl) then
            num = nstor*(lce-lcs+1)*size_of_datatype
#ifdef WITH_GPU_STREAMS
            call gpu_memcpy_async_and_stream_synchronize &
                ("elpa_hermitian_multiply: tmp2 to tmp2_dev", tmp2_dev, 0_c_intptr_t, &
                                                tmp2(1:nstor,1:lce-lcs+1), &
                                                1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
#else
            successgpu = gpu_memcpy(tmp2_dev, int(loc(tmp2),kind=c_intptr_t), &
                                    num, gpumemcpyhosttodevice)
            check_memcpy_gpu("elpa_hermitian_multiply: tmp2 to tmp2_dev", successgpu)
#endif
          endif ! useGPU .and. .not. useCCL
#else /* WITH_MPI */

          ! without MPI there is nothing to reduce: the local product is the result
          if (usegpu) then
            num = nstor*(lce-lcs+1)*size_of_datatype
            successgpu = gpu_memcpy(tmp2_dev, tmp1_dev, num, gpumemcpydevicetodevice)
            check_memcpy_gpu("elpa_hermitian_multiply: tmp1_dev to tmp2_dev", successgpu)
          endif
#endif /* WITH_MPI */


          ! Only process row np stores the reduced batch: rows
          ! nr_done+1 .. nr_done+nstor, columns lcs .. lce of c.
          if (my_prow==np) then
            if (usegpu) then
              call gpu_copy_precision_tmp2_c(tmp2_dev, c_dev, nr_done, nstor, &
                                             lcs, lce, ldc, ldccols, my_stream)
            else
#ifdef WITH_MPI
              ! reduced result arrives in tmp2
              c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,1:lce-lcs+1)
#else /* WITH_MPI */
              ! no reduction took place, tmp1 already holds the result
              c(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,1:lce-lcs+1)
#endif /* WITH_MPI */
            endif
          endif

          ! the host staging buffers exist only when CCL is not used
          if (.not. useccl) then
            deallocate(tmp1, tmp2, stat=istat, errmsg=errormessage)
            call check_alloc("elpa_hermitian_multiply_&
              &MATH_DATATYPE ", "tmp1", istat, errormessage)
          endif

        endif ! (lcs <= lce)


        ! The batch is finished: account for its rows and clear the gather
        ! buffer for the next batch.
        nr_done = nr_done+nstor
        nstor=0

        if (.not. usegpu) then
          aux_mat(:,:) = 0
        else
          num = l_rows*nblk_mult*size_of_datatype
#ifdef WITH_GPU_STREAMS
          my_stream = obj%gpu_setup%my_stream
          successgpu = gpu_memset_async(aux_mat_dev, 0, num, my_stream)
#else
          successgpu = gpu_memset(aux_mat_dev, 0, num)
#endif
          check_memcpy_gpu("multiply: aux_mat_dev", successgpu)
        endif ! useGPU

      endif ! (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np)


#ifdef WITH_NVTX

      call nvtxrangepop() ! do nb = 0, (l_rows_np-1)/nblk

#endif

    enddo ! nb = 0, (l_rows_np-1)/nblk


#ifdef WITH_NVTX

    call nvtxrangepop() ! do np = 0, np_rows-1

#endif

  enddo ! main loop: np = 0, np_rows-1


!_______________________________________________


  ! Download the result from c_dev into c (not needed for the device-pointer
  ! interface). Changes relative to the previous version:
  !  - the stream is synchronized before its status is checked; previously
  !    the check tested a flag left over from an earlier call,
  !  - the element count is formed in c_intptr_t arithmetic.
  if (usegpu) then
#if !defined(DEVICE_POINTER)
    ! copy result c_dev back to CPU
    num = int(ldc,kind=c_intptr_t)*int(ldccols,kind=c_intptr_t)
#ifdef WITH_GPU_STREAMS
    my_stream = obj%gpu_setup%my_stream
    successgpu = gpu_stream_synchronize(my_stream)
    check_stream_synchronize_gpu("elpa_hermitian_multiply: c_dev -> c", successgpu)
    call gpu_memcpy_async_and_stream_synchronize &
        ("elpa_hermitian_multiply: c_dev to c", c_dev, 0_c_intptr_t, c(1:ldc,1:ldccols), &
          1, 1, num*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
#else
    successgpu = gpu_memcpy(int(loc(c),kind=c_intptr_t), c_dev, num*size_of_datatype, gpumemcpydevicetohost)
    check_memcpy_gpu("elpa_hermitian_multiply: c_dev -> c", successgpu)
#endif
#endif /* !defined(DEVICE_POINTER) */
  endif ! useGPU


!______________________________________________________________________________________________


  ! Release every buffer acquired by this routine, then stop the routine timer.
  if (.not. usegpu) then
    deallocate(aux_mat, stat=istat, errmsg=errormessage)
    check_deallocate("elpa_hermitian_multiply: aux_mat", istat, errormessage)
  else
#if !defined(DEVICE_POINTER)
    ! device copies of b and c exist only when host arrays were passed in
    successgpu = gpu_free(b_dev)
    check_dealloc_gpu("elpa_hermitian_multiply: b_dev", successgpu)
#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
    successgpu = gpu_host_unregister(int(loc(b),kind=c_intptr_t))
    check_host_unregister_gpu("elpa_hermitian_multiply: b", successgpu)
#endif

    successgpu = gpu_free(c_dev)
    check_dealloc_gpu("elpa_hermitian_multiply: c_dev", successgpu)
#endif /* !defined(DEVICE_POINTER) */

    ! aux_mat is either an ordinary allocation or a view onto aux_host
#if defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
    deallocate(aux_mat, stat=istat, errmsg=errormessage)
    check_deallocate("elpa_hermitian_multiply: aux_mat", istat, errormessage)
#else
    nullify(aux_mat)

    successgpu = gpu_free_host(aux_host)
    check_host_dealloc_gpu("elpa_hermitian_multiply: aux_host", successgpu)
#endif

    successgpu = gpu_free(aux_mat_dev)
    check_dealloc_gpu("elpa_hermitian_multiply: aux_mat_dev", successgpu)

    successgpu = gpu_free(tmp1_dev)
    check_dealloc_gpu("elpa_hermitian_multiply: tmp1_dev", successgpu)

    successgpu = gpu_free(tmp2_dev)
    check_dealloc_gpu("elpa_hermitian_multiply: tmp2_dev", successgpu)

    successgpu = gpu_free(aux_bc_dev)
    check_dealloc_gpu("elpa_hermitian_multiply: aux_bc_dev", successgpu)

#if !defined(DEVICE_POINTER)
    successgpu = gpu_free(a_dev)
    check_dealloc_gpu("elpa_hermitian_multiply: a_dev", successgpu)
#endif
  endif ! useGPU

  deallocate(aux_bc, lrs_save, lre_save, stat=istat, errmsg=errormessage)
  check_deallocate("elpa_hermitian_multiply: aux_bc, lrs_save, lre_save", istat, errormessage)

  call obj%timer%stop("elpa_hermitian_multiply_&
  &MATH_DATATYPE&
  &_&
  &PRECISION&
  &"//gpustring)

#ifdef WITH_NVTX
  call nvtxrangepop() ! multiply
#endif

! Doxygen cross-references (not part of the template source):
!   set_gpu_parameters
!     void set_gpu_parameters(int *gpuMemcpyHostToDevice, int *gpuMemcpyDeviceToHost)
!     Definition: gpu_vendor_agnostic_layer.c:62
!
!   elpa_abstract_impl
!     Fortran module to provide an abstract definition of the implementation. Do not use directly....
!     Definition: elpa_abstract_impl.F90:50