54#include "../general/sanity.F90"
55#include "../general/error_checking.inc"
57#undef USE_CCL_HERMITIAN_MULTIPLY
58#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
59#define USE_CCL_HERMITIAN_MULTIPLY
66 use,
intrinsic :: iso_c_binding
69 use elpa_blas_interfaces
70 use elpa_utilities,
only : local_index, greatest_common_divisor, check_deallocate_f, check_dealloc_gpu_f, &
71 check_host_dealloc_gpu_f, check_alloc_gpu_f, check_host_alloc_gpu_f, &
72 check_host_unregister_gpu_f, check_memcpy_gpu_f, check_allocate_f, &
73 check_host_register_gpu_f, check_alloc, error_unit
74 use mod_query_gpu_usage
75#ifdef WITH_GPU_STREAMS
78#if defined(WITH_NVIDIA_GPU_VERSION) && defined(WITH_NVTX)
80#elif defined(WITH_AMD_GPU_VERSION) && defined(WITH_ROCTX)
83#if defined(USE_CCL_HERMITIAN_MULTIPLY)
89#include "../../src/general/precision_kinds.F90"
90 class(elpa_abstract_impl_t),
intent(inout) :: obj
92 character*1 :: uplo_a, uplo_c, trans_a, trans_b
94 integer(kind=ik),
intent(in) :: ldb, ldbCols, ldc, ldcCols
95 integer(kind=ik) :: na, ncb
97#ifdef USE_ASSUMED_SIZE
98 math_datatype(kind=rck) :: a(obj%local_nrows,*), b(ldb,*), c(ldc,*)
100 math_datatype(kind=rck) :: a(obj%local_nrows,obj%local_ncols), b(ldb,ldbcols), c(ldc,ldccols)
102#else /* DEVICE_POINTER */
104 math_datatype(kind=rck),
allocatable :: a(:,:), b(:,:), c(:,:)
105 type(c_ptr) :: aDev, bDev, cDev
106#endif /* DEVICE_POINTER */
108 integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, myid
109 integer(kind=MPI_KIND) :: my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
110 integer(kind=MPI_KIND) :: mpierr, myidMPI
111 integer(kind=ik) :: l_cols, l_rows, l_rows_np
112 integer(kind=ik) :: n
113 integer(kind=ik) :: np, nb, nblk_mult, lrs, lre, lcs, lce
114 integer(kind=ik) :: gcol_min, gcol, goff
115 integer(kind=ik) :: nstor, nr_done, noff, np_bc, n_aux_bc, nvals
116 integer(kind=ik),
allocatable :: lrs_save(:), lre_save(:)
118 logical :: a_lower, a_upper, c_lower, c_upper
119 math_datatype(kind=rck) :: beta
120 math_datatype(kind=rck),
pointer,
contiguous :: aux_mat(:,:), tmp1(:,:)
121 math_datatype(kind=rck),
allocatable :: aux_bc(:), tmp2(:,:)
123 integer(kind=ik) :: istat, debug
124 character(200) :: errorMessage
125 character(20) :: gpuString
126 logical :: success, successGPU, successGPU2
128 integer(kind=c_int) :: numGPU, blocking
129 integer(kind=ik) :: mpi_comm_rows, mpi_comm_cols, mpi_comm_all
130 integer(kind=ik) :: nblk, matrixRows, matrixCols, error
131 integer(kind=c_intptr_t) :: aux_bc_dev, aux_mat_dev, tmp1_dev, tmp2_dev
133 integer(kind=c_intptr_t) :: a_dev
134 integer(kind=c_intptr_t) :: b_dev
135 integer(kind=c_intptr_t) :: c_dev
137 type(c_ptr) :: aux_host
138 integer(kind=c_intptr_t) :: num
139 integer(kind=c_intptr_t) :: aux_off, b_off
140 integer(kind=c_intptr_t),
parameter :: size_of_datatype = size_of_&
145 integer(kind=c_intptr_t) :: gpuHandle, my_stream
146 integer(kind=c_int) :: gpu_hermitian_multiply
149#if defined(USE_CCL_HERMITIAN_MULTIPLY)
150 integer(kind=c_intptr_t) :: ccl_comm_rows, ccl_comm_cols
151 integer(kind=c_int) :: cclDataType
152 integer(kind=ik) :: k_datatype
155 integer(kind=c_intptr_t) :: aux_dev
156 integer(kind=c_int) :: gpu
159 call nvtxrangepush(
"hermitian_multiply")
165 call obj%get(
"debug", debug, error)
166 if (error .ne. elpa_ok)
then
167 write(error_unit,*)
"elpa_hermitian_multiply: Problem getting option for debug settings. Aborting..."
178#if !defined(DEVICE_POINTER)
180#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
181 if (.not.(query_gpu_usage(obj,
"ELPA_MULITPLY_AB", usegpu)))
then
182 print *,
"ELPA_MULITPLY_AB: Problem querrying settings for GPU Aborting..."
188 if (obj%is_set(
"gpu_hermitian_multiply") == 1)
then
189 call obj%get(
"gpu_hermitian_multiply", gpu_hermitian_multiply, error)
190 if (error .ne. elpa_ok)
then
191 print *,
"Problem getting option for gpu_hermitian_mutltiply. Aborting..."
194 if (usegpu .and. gpu_hermitian_multiply .eq. 0)
then
196 else if (.not.(usegpu) .and. gpu_hermitian_multiply .eq. 1)
then
205#else /* DEVICE_POINTER */
209 a_dev = transfer(adev, a_dev)
210 b_dev = transfer(bdev, b_dev)
211 c_dev = transfer(cdev, c_dev)
213#endif /* DEVICE_POINTER */
221 call obj%timer%start(
"elpa_hermitian_multiply_&
229 matrixrows = obj%local_nrows
230 matrixcols = obj%local_ncols
232 mpi_comm_all = obj%mpi_setup%mpi_comm_parent
233 mpi_comm_cols = obj%mpi_setup%mpi_comm_cols
234 mpi_comm_rows = obj%mpi_setup%mpi_comm_rows
236 myid = obj%mpi_setup%myRank_comm_parent
237 my_prow = obj%mpi_setup%myRank_comm_rows
238 my_pcol = obj%mpi_setup%myRank_comm_cols
240 np_rows = obj%mpi_setup%nRanks_comm_rows
241 np_cols = obj%mpi_setup%nRanks_comm_cols
243 l_rows = local_index(na, my_prow, np_rows, nblk, -1)
244 l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1)
248 if (obj%is_set(
"blocking_in_multiply") == 1)
then
249 call obj%get(
"blocking_in_multiply", blocking, error)
250 if (error .ne. elpa_ok)
then
251 write(error_unit,*)
"elpa_hermitian_multiply: Problem in getting keyword 'blocking_in_multiply'. Aborting..."
254 nblk_mult = (blocking/nblk+1) * nblk
257 if (na/np_rows <= 256)
then
258 nblk_mult = (63/nblk+1)*nblk
260 nblk_mult = (351/nblk+1)*nblk
263 if (na/np_rows <= 256)
then
264 nblk_mult = (31/nblk+1)*nblk
266 nblk_mult = (63/nblk+1)*nblk
273 call obj%timer%start(
"check_for_gpu")
274 if (check_for_gpu(obj, myid, numgpu))
then
278 print *,
"GPUs are requested but not detected! Aborting..."
282 call obj%timer%stop(
"check_for_gpu")
284#if defined(USE_CCL_HERMITIAN_MULTIPLY)
287 ccl_comm_rows = obj%gpu_setup%ccl_comm_rows
288 ccl_comm_cols = obj%gpu_setup%ccl_comm_cols
290#if REALCASE == 1 && defined(DOUBLE_PRECISION)
291 ccldatatype = ccldouble
293#elif REALCASE == 1 && defined(SINGLE_PRECISION)
294 ccldatatype = cclfloat
296#elif COMPLEXCASE == 1 && defined(DOUBLE_PRECISION)
297 ccldatatype = ccldouble
299#elif COMPLEXCASE == 1 && defined(SINGLE_PRECISION)
300 ccldatatype = cclfloat
303#endif /* defined(USE_CCL_HERMITIAN_MULTIPLY) */
305#if !defined(DEVICE_POINTER)
306 num = ldc*ldccols*size_of_datatype
307 successgpu = gpu_malloc(c_dev, num)
308 check_alloc_gpu(
"elpa_hermitian_multiply: c_dev", successgpu)
312#if !defined(DEVICE_POINTER)
314 num = ldb*ldbcols*size_of_datatype
315 successgpu = gpu_malloc(b_dev, num)
316 check_alloc_gpu(
"elpa_hermitian_multiply: b_dev", successgpu)
318#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
319 successgpu = gpu_host_register(int(loc(b),kind=c_intptr_t),num,&
320 gpuhostregisterdefault)
323 check_host_register_gpu(
"elpa_hermitian_multiply: b", successgpu)
324#ifdef WITH_GPU_STREAMS
325 my_stream = obj%gpu_setup%my_stream
326 call gpu_memcpy_async_and_stream_synchronize &
327 (
"elpa_hermitian_multiply: b to b_dev", b_dev, 0_c_intptr_t, &
328 b(1:ldb,1:ldbcols), &
329 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
331 successgpu = gpu_memcpy(b_dev,int(loc(b),kind=c_intptr_t),num,&
332 gpumemcpyhosttodevice)
333 check_memcpy_gpu(
"elpa_hermitian_multiply: b to b_dev", successgpu)
336#else /* DEVICE_POINTER */
338#endif /* DEVICE_POINTER */
340 num = l_rows*nblk_mult*size_of_datatype
341#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
342 successgpu = gpu_malloc_host(aux_host, num)
343 check_host_alloc_gpu(
"elpa_hermitian_multiply: aux_host", successgpu)
344 call c_f_pointer(aux_host, aux_mat, (/l_rows,nblk_mult/))
346 allocate(aux_mat(l_rows, nblk_mult), stat=istat, errmsg=errormessage)
347 check_allocate(
"elpa_hermitian_multiply: aux_mat", istat, errormessage)
350 successgpu = gpu_malloc(aux_mat_dev, num)
351 check_alloc_gpu(
"elpa_hermitian_multiply: aux_mat_dev", successgpu)
353 num = nblk_mult*l_cols*size_of_datatype
354 successgpu = gpu_malloc(tmp1_dev, num)
355 check_alloc_gpu(
"elpa_hermitian_multiply: tmp1_dev", successgpu)
357 num = nblk_mult*l_cols*size_of_datatype
358 successgpu = gpu_malloc(tmp2_dev, num)
359 check_alloc_gpu(
"elpa_hermitian_multiply: tmp2_dev", successgpu)
362 allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errormessage)
363 check_allocate(
"elpa_hermitian_multiply: aux_mat", istat, errormessage)
366 allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errormessage)
367 check_allocate(
"elpa_hermitian_multiply: aux_bc", istat, errormessage)
369 allocate(lrs_save(nblk), stat=istat, errmsg=errormessage)
370 check_allocate(
"elpa_hermitian_multiply: lrs_save", istat, errormessage)
372 allocate(lre_save(nblk), stat=istat, errmsg=errormessage)
373 check_allocate(
"elpa_hermitian_multiply: lre_save", istat, errormessage)
380 if (uplo_a==
'u' .or. uplo_a==
'U') a_upper = .true.
381 if (uplo_a==
'l' .or. uplo_a==
'L') a_lower = .true.
382 if (uplo_c==
'u' .or. uplo_c==
'U') c_upper = .true.
383 if (uplo_c==
'l' .or. uplo_c==
'L') c_lower = .true.
387#if !defined(DEVICE_POINTER)
388 num = obj%local_nrows*obj%local_ncols*size_of_datatype
389 successgpu = gpu_malloc(a_dev, num)
390 check_alloc_gpu(
"elpa_hermitian_multiply: a_dev", successgpu)
393 num = l_rows*nblk*size_of_datatype
394 successgpu = gpu_malloc(aux_bc_dev, num)
395 check_alloc_gpu(
"elpa_hermitian_multiply: aux_bc_dev", successgpu)
397 num = obj%local_nrows*obj%local_ncols*size_of_datatype
398#if !defined(DEVICE_POINTER)
400#ifdef WITH_GPU_STREAMS
401 my_stream = obj%gpu_setup%my_stream
402 call gpu_memcpy_async_and_stream_synchronize &
403 (
"elpa_hermitian_multiply: a to a_dev", a_dev, 0_c_intptr_t, &
404 a(1:obj%local_nrows,1:obj%local_ncols), &
405 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
407 successgpu = gpu_memcpy(a_dev, int(loc(a),kind=c_intptr_t), &
408 num, gpumemcpyhosttodevice)
409 check_memcpy_gpu(
"elpa_hermitian_multiply: a to a_dev", successgpu)
411#endif /* DEVICE_POINTER */
420 call nvtxrangepush(
"do np = 0, np_rows-1")
425 l_rows_np = local_index(na, np, np_rows, nblk, -1)
432 num = l_rows*nblk_mult*size_of_datatype
433#ifdef WITH_GPU_STREAMS
434 my_stream = obj%gpu_setup%my_stream
435 successgpu = gpu_memset_async(aux_mat_dev, 0, num, my_stream)
436 check_memcpy_gpu(
"multiply: aux_mat_dev", successgpu)
438 successgpu = gpu_memset(aux_mat_dev, 0, num)
439 check_memcpy_gpu(
"multiply: aux_mat_dev", successgpu)
444 do nb = 0, (l_rows_np-1)/nblk
447 call nvtxrangepush(
"do nb = 0, (l_rows_np-1)/nblk")
450 goff = nb*np_rows + np
456 np_bc = mod(goff, np_cols)
463 do n = 1, min(nblk, l_rows_np-nb*nblk)
467 if (nstor==0 .and. n==1) gcol_min = gcol
471 if (a_lower) lrs = local_index(gcol, my_prow, np_rows, nblk, +1)
472 if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1)
477 if (my_pcol == np_bc)
call gpu_copy_precision_a_aux_bc(a_dev, aux_bc_dev, n_aux_bc, nvals, lrs, lre, noff, &
478 nblk, n, l_rows, obj%local_nrows, obj%local_ncols, my_stream)
480 if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n)
483 n_aux_bc = n_aux_bc + nvals
493 if (usegpu .and. .not. useccl)
then
494 num = l_rows*nblk*size_of_datatype
495#ifdef WITH_GPU_STREAMS
496 my_stream = obj%gpu_setup%my_stream
497 call gpu_memcpy_async_and_stream_synchronize &
498 (
"elpa_hermitian_multiply: aux_bc_dev -> aux_bc", aux_bc_dev, 0_c_intptr_t, aux_bc(1:l_rows*nblk), &
499 1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
501 successgpu = gpu_memcpy(int(loc(aux_bc),kind=c_intptr_t), aux_bc_dev, num, gpumemcpydevicetohost)
502 check_memcpy_gpu(
"elpa_hermitian_multiply: aux_bc_dev -> aux_bc", successgpu)
508#ifdef USE_CCL_HERMITIAN_MULTIPLY
510 call nvtxrangepush(
"ccl_bcast aux_bc_dev")
512 call obj%timer%start(
"ccl_bcast")
514 my_stream = obj%gpu_setup%my_stream
515 ccl_comm_cols = obj%gpu_setup%ccl_comm_cols
517 successgpu = ccl_bcast(aux_bc_dev, aux_bc_dev, int(k_datatype*n_aux_bc,kind=c_size_t), ccldatatype, &
518 int(np_bc,kind=c_int), ccl_comm_cols, my_stream)
520 if (.not. successgpu)
then
521 print *,
"Error in ccl_bcast"
525 successgpu = gpu_stream_synchronize(my_stream)
526 check_stream_synchronize_gpu(
"elpa_cholesky: ccl_bcast", successgpu)
528 call obj%timer%stop(
"ccl_bcast")
532#endif /* USE_CCL_HERMITIAN_MULTIPLY */
534 call obj%timer%start(
"mpi_communication")
536 call mpi_bcast(aux_bc, int(n_aux_bc,kind=mpi_kind), mpi_math_datatype_precision, &
537 int(np_bc,kind=mpi_kind), int(mpi_comm_cols,kind=mpi_kind), mpierr)
539 call obj%timer%stop(
"mpi_communication")
543 if (usegpu .and. .not. useccl)
then
544 num = l_rows*nblk*size_of_datatype
545#ifdef WITH_GPU_STREAMS
546 my_stream = obj%gpu_setup%my_stream
547 call gpu_memcpy_async_and_stream_synchronize &
548 (
"elpa_hermitian_multiply: aux_bc -> aux_bc_dev", aux_bc_dev, 0_c_intptr_t, aux_bc(1:l_rows*nblk), &
549 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
551 successgpu = gpu_memcpy(aux_bc_dev, int(loc(aux_bc),kind=c_intptr_t), num, gpumemcpyhosttodevice)
552 check_memcpy_gpu(
"elpa_hermitian_multiply: aux_bc -> aux_bc_dev", successgpu)
561 my_stream = obj%gpu_setup%my_stream
562 do n = 1, min(nblk, l_rows_np-nb*nblk)
568 call gpu_copy_precision_aux_bc_aux_mat(aux_bc_dev, aux_mat_dev, lrs, lre, nstor, n_aux_bc, &
569 nvals, l_rows, nblk, nblk_mult, my_stream)
571 n_aux_bc = n_aux_bc + nvals
576 do n = 1, min(nblk, l_rows_np-nb*nblk)
582 aux_mat(lrs:lre,nstor) = aux_bc(n_aux_bc+1:n_aux_bc+nvals)
583 n_aux_bc = n_aux_bc + nvals
591 if (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np)
then
595 if (a_lower) lrs = local_index(gcol_min, my_prow, np_rows, nblk, +1)
596 if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1)
600 if (c_upper) lcs = local_index(gcol_min, my_pcol, np_cols, nblk, +1)
601 if (c_lower) lce = min(local_index(gcol, my_pcol, np_cols, nblk, -1),l_cols)
604 if (.not. useccl)
then
606 allocate(tmp1(nstor,1:lce-lcs+1), tmp2(nstor,1:lce-lcs+1), stat=istat, errmsg=errormessage)
607 call check_alloc(
"elpa_hermitian_multiply_&
608 &MATH_DATATYPE ",
"tmp1", istat, errormessage)
613 aux_off = (lrs-1)*size_of_datatype
614 b_off = ((lcs-1)*ldb+lrs-1)*size_of_datatype
617 call nvtxrangepush(
"gpublas")
619 call obj%timer%start(
"gpublas")
620 gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
622 call gpublas_precision_gemm(blas_trans_or_conj,
'N', nstor, lce-lcs+1, lre-lrs+1, one, &
623 aux_mat_dev+aux_off, l_rows, &
624 b_dev+b_off, ldb, zero, &
625 tmp1_dev, nstor, gpuhandle)
626 if (wantdebug) successgpu = gpu_devicesynchronize()
627 call obj%timer%stop(
"gpublas")
632 call obj%timer%start(
"blas")
634 call precision_gemm(blas_trans_or_conj,
'N', int(nstor,kind=blas_kind), &
635 int(lce-lcs+1,kind=blas_kind), int(lre-lrs+1,kind=blas_kind), one, &
636 aux_mat(lrs:lre,1:nstor), int(lre-lrs+1,kind=blas_kind), &
637 b(lrs,lcs), int(ldb,kind=blas_kind), zero, &
638 tmp1, int(nstor,kind=blas_kind))
639 call obj%timer%stop(
"blas")
643 num = nstor*(lce-lcs+1)*size_of_datatype
644#ifdef WITH_GPU_STREAMS
645 my_stream = obj%gpu_setup%my_stream
646 successgpu = gpu_memset_async(tmp1_dev, 0, num, my_stream)
647 check_memcpy_gpu(
"multiply: tmp1_dev", successgpu)
649 successgpu = gpu_memset(tmp1_dev, 0, num)
650 check_memcpy_gpu(
"multiply: tmp1_dev", successgpu)
661 if (usegpu .and. .not. useccl)
then
662 num = nstor*(lce-lcs+1)*size_of_datatype
663#ifdef WITH_GPU_STREAMS
664 call gpu_memcpy_async_and_stream_synchronize &
665 (
"elpa_hermitian_multiply: tmp1_dev to tmp1", tmp1_dev, 0_c_intptr_t, &
667 tmp1(1:nstor,1:lce-lcs+1), &
668 1, 1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
670 successgpu = gpu_memcpy(int(loc(tmp1),kind=c_intptr_t), &
671 tmp1_dev, num, gpumemcpydevicetohost)
672 check_memcpy_gpu(
"elpa_hermitian_multiply: tmp1_dev to tmp1", successgpu)
678#ifdef USE_CCL_HERMITIAN_MULTIPLY
680 call nvtxrangepush(
"ccl_reduce tmp1_dev")
682 call obj%timer%start(
"ccl_reduce")
683 my_stream = obj%gpu_setup%my_stream
684 ccl_comm_rows = obj%gpu_setup%ccl_comm_rows
686 successgpu = ccl_reduce(tmp1_dev, tmp2_dev, int(k_datatype*nstor*(lce-lcs+1),kind=c_size_t), ccldatatype, &
687 cclsum, int(np,kind=c_int), ccl_comm_rows, my_stream)
689 if (.not. successgpu)
then
690 print *,
"Error in ccl_reduce"
694 successgpu = gpu_stream_synchronize(my_stream)
695 check_stream_synchronize_gpu(
"elpa_cholesky: ccl_reduce", successgpu)
697 call obj%timer%stop(
"ccl_reduce")
701#endif /* USE_CCL_HERMITIAN_MULTIPLY */
703 call obj%timer%start(
"mpi_communication")
704 call mpi_reduce(tmp1, tmp2, int(nstor*(lce-lcs+1),kind=mpi_kind), mpi_math_datatype_precision, &
705 mpi_sum, int(np,kind=mpi_kind), int(mpi_comm_rows,kind=mpi_kind), mpierr)
706 call obj%timer%stop(
"mpi_communication")
710 if (usegpu .and. .not. useccl)
then
711 num = nstor*(lce-lcs+1)*size_of_datatype
712#ifdef WITH_GPU_STREAMS
713 call gpu_memcpy_async_and_stream_synchronize &
714 (
"elpa_hermitian_multiply: tmp2 to tmp2_dev", tmp2_dev, 0_c_intptr_t, &
716 tmp2(1:nstor,1:lce-lcs+1), &
717 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
719 successgpu = gpu_memcpy(tmp2_dev, int(loc(tmp2),kind=c_intptr_t), &
720 num, gpumemcpyhosttodevice)
721 check_memcpy_gpu(
"elpa_hermitian_multiply: tmp2 to tmp2_dev", successgpu)
727 num = nstor*(lce-lcs+1)*size_of_datatype
728 successgpu = gpu_memcpy(tmp2_dev, tmp1_dev, num, gpumemcpydevicetodevice)
729 check_memcpy_gpu(
"elpa_hermitian_multiply: tmp2 to tmp2_dev", successgpu)
735 if (my_prow==np)
call gpu_copy_precision_tmp2_c(tmp2_dev, c_dev, nr_done, nstor, &
736 lcs, lce, ldc, ldccols, my_stream)
740 if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,1:lce-lcs+1)
743 if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,1:lce-lcs+1)
748 if (.not. useccl)
then
749 deallocate(tmp1, tmp2, stat=istat, errmsg=errormessage)
750 call check_alloc(
"elpa_hermitian_multiply_&
751 &MATH_DATATYPE ",
"tmp1", istat, errormessage)
755 nr_done = nr_done+nstor
758 num = l_rows*nblk_mult*size_of_datatype
759#ifdef WITH_GPU_STREAMS
760 my_stream = obj%gpu_setup%my_stream
761 successgpu = gpu_memset_async(aux_mat_dev, 0, num, my_stream)
762 check_memcpy_gpu(
"multiply: aux_mat_dev", successgpu)
764 successgpu = gpu_memset(aux_mat_dev, 0, num)
765 check_memcpy_gpu(
"multiply: aux_mat_dev", successgpu)
785#if !defined(DEVICE_POINTER)
788#ifdef WITH_GPU_STREAMS
789 check_stream_synchronize_gpu(
"elpa_hermitian_multiply: c_dev -> c", successgpu)
790 call gpu_memcpy_async_and_stream_synchronize &
791 (
"elpa_hermitian_multiply: c_dev to c", c_dev, 0_c_intptr_t, c(1:ldc,1:ldccols), &
792 1, 1, num*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
794 successgpu = gpu_memcpy(int(loc(c),kind=c_intptr_t), c_dev, num*size_of_datatype, gpumemcpydevicetohost)
795 check_memcpy_gpu(
"elpa_hermitian_multiply: c_dev -> c", successgpu)
797#endif /* !defined(DEVICE_POINTER) */
804#if !defined(DEVICE_POINTER)
805 successgpu = gpu_free(b_dev)
806 check_dealloc_gpu(
"elpa_hermitian_multiply: b_dev", successgpu)
807#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
808 successgpu = gpu_host_unregister(int(loc(b),kind=c_intptr_t))
809 check_host_unregister_gpu(
"elpa_hermitian_multiply: b", successgpu)
812 successgpu = gpu_free(c_dev)
813 check_dealloc_gpu(
"elpa_hermitian_multiply: c_dev", successgpu)
815#else /* DEVICE_POINTER */
817#endif /* DEVICE_POINTER */
819#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
823 successgpu = gpu_free_host(aux_host)
824 check_host_dealloc_gpu(
"elpa_hermitian_multiply: aux_host", successgpu)
826 deallocate(aux_mat, stat=istat, errmsg=errormessage)
827 check_deallocate(
"elpa_hermitian_multiply: aux_mat", istat, errormessage)
833 successgpu = gpu_free(aux_mat_dev)
834 check_dealloc_gpu(
"elpa_hermitian_multiply: aux_mat_dev", successgpu)
836 successgpu = gpu_free(tmp1_dev)
837 check_dealloc_gpu(
"elpa_hermitian_multiply: tmp1_dev", successgpu)
839 successgpu = gpu_free(tmp2_dev)
840 check_dealloc_gpu(
"elpa_hermitian_multiply: tmp2_dev", successgpu)
842 successgpu = gpu_free(aux_bc_dev)
843 check_dealloc_gpu(
"elpa_hermitian_multiply: aux_bc_dev", successgpu)
845#if !defined(DEVICE_POINTER)
846 successgpu = gpu_free(a_dev)
847 check_dealloc_gpu(
"elpa_hermitian_multiply: a_dev", successgpu)
854 deallocate(aux_mat, stat=istat, errmsg=errormessage)
855 check_deallocate(
"elpa_hermitian_multiply: aux_mat", istat, errormessage)
858 deallocate(aux_bc, lrs_save, lre_save, stat=istat, errmsg=errormessage)
859 check_deallocate(
"elpa_hermitian_multiply: aux_bc, lrs_save, lre_save", istat, errormessage)
861 call obj%timer%stop(
"elpa_hermitian_multiply_&
void set_gpu_parameters(int *gpuMemcpyHostToDevice, int *gpuMemcpyDeviceToHost)
Definition gpu_vendor_agnostic_layer.c:62
Fortran module to provide an abstract definition of the implementation. Do not use directly....
Definition elpa_abstract_impl.F90:50