54#include "../general/sanity.F90"
55#include "../general/error_checking.inc"
57#undef USE_CCL_MULTIPLY
58#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
59#define USE_CCL_MULTIPLY
64#ifdef USE_CCL_MULTIPLY
67#ifndef USE_CCL_MULTIPLY
78 use,
intrinsic :: iso_c_binding
81 use elpa_blas_interfaces
82 use elpa_utilities,
only : local_index, check_deallocate_f, check_dealloc_gpu_f, &
83 check_host_dealloc_gpu_f, check_alloc_gpu_f, check_host_alloc_gpu_f, &
84 check_host_unregister_gpu_f, check_memcpy_gpu_f, check_allocate_f, &
85 check_host_register_gpu_f, check_alloc, error_unit
86 use mod_query_gpu_usage
87#ifdef WITH_GPU_STREAMS
90#ifdef WITH_NVIDIA_GPU_VERSION
93#if defined(USE_CCL_MULTIPLY)
99#include "../../src/general/precision_kinds.F90"
100 class(elpa_abstract_impl_t),
intent(inout) :: obj
102 character*1 :: uplo_a, uplo_c, trans_a
104 integer(kind=ik),
intent(in) :: ldb, ldbCols, ldc, ldcCols
105 integer(kind=ik) :: na, ncb
106#ifndef DEVICE_POINTER
108#ifdef USE_ASSUMED_SIZE
109 math_datatype(kind=rck) :: a(obj%local_nrows,*), b(ldb,*), c(ldc,*)
111 math_datatype(kind=rck) :: a(obj%local_nrows,obj%local_ncols), b(ldb,ldbcols), c(ldc,ldccols)
113#else /* DEVICE_POINTER */
115 math_datatype(kind=rck),
allocatable :: a(:,:), b(:,:), c(:,:)
116 type(c_ptr) :: aDev, bDev, cDev
117#endif /* DEVICE_POINTER */
119 integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, myid
120 integer(kind=MPI_KIND) :: my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
121 integer(kind=MPI_KIND) :: mpierr, myidMPI
122 integer(kind=ik) :: l_cols, l_rows, l_rows_np
123 integer(kind=ik) :: np, n, nb, nblk_mult, lrs, lre, lcs, lce
124 integer(kind=ik) :: gcol_min, gcol, goff
125 integer(kind=ik) :: nstor, nr_done, noff, np_bc, n_aux_bc, nvals
126 integer(kind=ik) :: np_br, noff_n
127 integer(kind=ik),
allocatable :: lrs_save(:), lre_save(:)
129 logical :: a_lower, a_upper, c_lower, c_upper
130 math_datatype(kind=rck),
pointer,
contiguous :: aux_mat(:,:), tmp1(:,:)
131 math_datatype(kind=rck),
allocatable :: aux_bc(:), tmp2(:,:)
132 integer(kind=ik) :: istat
133 character(200) :: errorMessage
134 character(20) :: gpuString
136 logical :: successGPU
138 integer(kind=c_int) :: numGPU, blocking
139 integer(kind=ik) :: mpi_comm_rows, mpi_comm_cols, mpi_comm_all
140 integer(kind=ik) :: nblk, matrixRows, matrixCols, error
141 integer(kind=c_intptr_t) :: aux_mat_dev, tmp1_dev
143 integer(kind=c_intptr_t) :: b_dev
145 integer(kind=c_intptr_t) :: a_dev
148 integer(kind=c_intptr_t) :: c_dev
152 integer(kind=c_intptr_t) :: tmp2_dev, aux_bc_dev
154 type(c_ptr) :: aux_host, tmp1_host
155 integer(kind=c_intptr_t) :: num
156 integer(kind=c_intptr_t) :: aux_off, b_off
157 integer(kind=c_intptr_t),
parameter :: size_of_datatype = size_of_&
162 integer(kind=c_intptr_t) :: gpuHandle, my_stream
163 integer(kind=c_int) :: gpu_hermitian_multiply
165#if defined(USE_CCL_MULTIPLY)
166 integer(kind=c_intptr_t) :: ccl_comm_rows, ccl_comm_cols
169 math_datatype(kind=rck),
allocatable :: a_tmp(:,:), c_tmp(:,:)
171 integer(kind=c_intptr_t) :: aux_dev
172 integer(kind=c_int) :: gpu
173 integer(kind=c_int) :: gpu_multiply_a_b
179#if !defined(DEVICE_POINTER)
181#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
182 if (.not.(query_gpu_usage(obj,
"ELPA_MULITPLY_AB", usegpu)))
then
183 print *,
"ELPA_MULITPLY_AB: Problem querrying settings for GPU Aborting..."
189 if (obj%is_set(
"gpu_hermitian_multiply") == 1)
then
190 call obj%get(
"gpu_hermitian_multiply", gpu_hermitian_multiply, error)
191 if (error .ne. elpa_ok)
then
192 print *,
"Problem getting option for gpu_hermitian_mutltiply. Aborting..."
195 if (usegpu .and. gpu_hermitian_multiply .eq. 0)
then
197 else if (.not.(usegpu) .and. gpu_hermitian_multiply .eq. 1)
then
206#else /* DEVICE_POINTER */
208#endif /* DEVICE_POINTER */
213#if !defined(DEVICE_POINTER)
215#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
216 if (.not.(query_gpu_usage(obj,
"ELPA_MULITPLY_AB", usegpu)))
then
217 print *,
"ELPA_MULITPLY_AB: Problem querrying settings for GPU Aborting..."
223 if (obj%is_set(
"gpu_hermitian_multiply") == 1)
then
224 call obj%get(
"gpu_hermitian_multiply", gpu_hermitian_multiply, error)
225 if (error .ne. elpa_ok)
then
226 print *,
"Problem getting option for gpu_hermitian_mutltiply. Aborting..."
229 if (usegpu .and. gpu_hermitian_multiply .eq. 0)
then
231 else if (.not.(usegpu) .and. gpu_hermitian_multiply .eq. 1)
then
240#else /* DEVICE_POINTER */
244 c_dev = transfer(cdev, c_dev)
245 a_dev = transfer(adev, a_dev)
247 allocate(a_tmp(obj%local_nrows,obj%local_ncols), stat=istat, errmsg=errormessage)
248 check_allocate(
"elpa_mult_at_b: a_tmp", istat, errormessage)
250 num = obj%local_nrows*obj%local_ncols*size_of_datatype
251#ifdef WITH_GPU_STREAMS
252 successgpu = gpu_host_register(int(loc(a_tmp),kind=c_intptr_t), &
253 obj%local_nrows*obj%local_ncols * size_of_datatype,&
254 gpuhostregisterdefault)
255 check_host_register_gpu(
"elpa_mult_at_b: a_tmp", successgpu)
257 my_stream = obj%gpu_setup%my_stream
258 successgpu = gpu_stream_synchronize(my_stream)
259 check_stream_synchronize_gpu(
"elpa_mult_at_b: a_dev to a_tmp", successgpu)
261 successgpu = gpu_memcpy_async(int(loc(a_tmp),kind=c_intptr_t), adev, num,&
262 gpumemcpydevicetohost, my_stream)
263 check_memcpy_gpu(
"elpa_mult_at_b: a_dev -> a_tmp", successgpu)
265 my_stream = obj%gpu_setup%my_stream
266 successgpu = gpu_stream_synchronize(my_stream)
267 check_stream_synchronize_gpu(
"elpa_mult_at_b: a_dev -> a_tmp", successgpu)
269 successgpu = gpu_stream_synchronize()
270 check_stream_synchronize_gpu(
"elpa_mult_at_b: a_dev -> a_tmp", successgpu)
272 successgpu = gpu_memcpy(int(loc(a_tmp),kind=c_intptr_t), adev, num,&
273 gpumemcpydevicetohost)
274 check_memcpy_gpu(
"elpa_mult_at_b: a_dev -> a_tmp", successgpu)
277 allocate(c_tmp(ldc,ldccols), stat=istat, errmsg=errormessage)
278 check_allocate(
"elpa_mult_at_b: c_tmp", istat, errormessage)
280#ifdef WITH_GPU_STREAMS
281 successgpu = gpu_host_register(int(loc(c_tmp),kind=c_intptr_t),&
282 ldc*ldccols*size_of_datatype, &
283 gpuhostregisterdefault)
289 b_dev = transfer(bdev, b_dev)
291#endif /* DEVICE_POINTER */
299 call obj%timer%start(
"elpa_mult_at_b_&
307 matrixrows = obj%local_nrows
308 matrixcols = obj%local_ncols
310 mpi_comm_all = obj%mpi_setup%mpi_comm_parent
311 mpi_comm_cols = obj%mpi_setup%mpi_comm_cols
312 mpi_comm_rows = obj%mpi_setup%mpi_comm_rows
314 myid = obj%mpi_setup%myRank_comm_parent
315 my_prow = obj%mpi_setup%myRank_comm_rows
316 my_pcol = obj%mpi_setup%myRank_comm_cols
318 np_rows = obj%mpi_setup%nRanks_comm_rows
319 np_cols = obj%mpi_setup%nRanks_comm_cols
321 l_rows = local_index(na, my_prow, np_rows, nblk, -1)
322 l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1)
326 if (obj%is_set(
"blocking_in_multiply") == 1)
then
327 call obj%get(
"blocking_in_multiply", blocking, error)
328 if (error .ne. elpa_ok)
then
329 write(error_unit,*)
"ELPA_HERMITIAN_MULTIPLY: Problem in getting keyword 'blocking_in_multiply'. Aborting..."
332 nblk_mult = (blocking/nblk+1) * nblk
335 if (na/np_rows <= 256)
then
336 nblk_mult = (63/nblk+1)*nblk
338 nblk_mult = (351/nblk+1)*nblk
341 if (na/np_rows <= 256)
then
342 nblk_mult = (31/nblk+1)*nblk
344 nblk_mult = (63/nblk+1)*nblk
350 call obj%timer%start(
"check_for_gpu")
351 if (check_for_gpu(obj, myid, numgpu))
then
355 print *,
"GPUs are requested but not detected! Aborting..."
359 call obj%timer%stop(
"check_for_gpu")
362#if !defined(DEVICE_POINTER)
363 num = ldc*ldccols*size_of_datatype
364 successgpu = gpu_malloc(c_dev, num)
365 check_alloc_gpu(
"elpa_mult_at_b: c_dev", successgpu)
370#if !defined(DEVICE_POINTER)
372 num = ldb*ldbcols*size_of_datatype
373 successgpu = gpu_malloc(b_dev, num)
374 check_alloc_gpu(
"elpa_mult_at_b: b_dev", successgpu)
376#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
377 successgpu = gpu_host_register(int(loc(b),kind=c_intptr_t),num,&
378 gpuhostregisterdefault)
381 check_host_register_gpu(
"elpa_mult_at_b: b", successgpu)
382#ifdef WITH_GPU_STREAMS
383 my_stream = obj%gpu_setup%my_stream
384 call gpu_memcpy_async_and_stream_synchronize &
385 (
"elpa_mult_at_b: b to b_dev", b_dev, 0_c_intptr_t, &
386 b(1:ldb,1:ldbcols), &
387 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
389 successgpu = gpu_memcpy(b_dev,int(loc(b),kind=c_intptr_t),num,&
390 gpumemcpyhosttodevice)
391 check_memcpy_gpu(
"elpa_mult_at_b: b to b_dev", successgpu)
394#else /* DEVICE_POINTER */
396#endif /* DEVICE_POINTER */
398 num = l_rows*nblk_mult*size_of_datatype
399#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
400 successgpu = gpu_malloc_host(aux_host, num)
401 check_host_alloc_gpu(
"elpa_mult_at_b: aux_host", successgpu)
402 call c_f_pointer(aux_host, aux_mat, (/l_rows,nblk_mult/))
404 allocate(aux_mat(l_rows, nblk_mult), stat=istat, errmsg=errormessage)
405 check_allocate(
"elpa_mult_at_b: aux_mat", istat, errormessage)
408 successgpu = gpu_malloc(aux_mat_dev, num)
409 check_alloc_gpu(
"elpa_mult_at_b: aux_mat_dev", successgpu)
411 num = nblk_mult*l_cols*size_of_datatype
412 successgpu = gpu_malloc(tmp1_dev, num)
413 check_alloc_gpu(
"elpa_mult_at_b: tmp1_dev", successgpu)
421 num = nblk_mult*l_cols*size_of_datatype
422 successgpu = gpu_malloc(tmp2_dev, num)
423 check_alloc_gpu(
"elpa_mult_at_b: tmp2_dev", successgpu)
426 allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errormessage)
427 check_allocate(
"elpa_mult_at_b: aux_mat", istat, errormessage)
430 allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errormessage)
431 check_allocate(
"elpa_mult_at_b: aux_bc", istat, errormessage)
433 allocate(lrs_save(nblk), stat=istat, errmsg=errormessage)
434 check_allocate(
"elpa_mult_at_b: lrs_save", istat, errormessage)
436 allocate(lre_save(nblk), stat=istat, errmsg=errormessage)
437 check_allocate(
"elpa_mult_at_b: lre_save", istat, errormessage)
444 if (uplo_a==
'u' .or. uplo_a==
'U') a_upper = .true.
445 if (uplo_a==
'l' .or. uplo_a==
'L') a_lower = .true.
446 if (uplo_c==
'u' .or. uplo_c==
'U') c_upper = .true.
447 if (uplo_c==
'l' .or. uplo_c==
'L') c_lower = .true.
452#if !defined(DEVICE_POINTER)
453 num = obj%local_nrows*obj%local_ncols*size_of_datatype
454 successgpu = gpu_malloc(a_dev, num)
455 check_alloc_gpu(
"elpa_mult_at_b: a_dev", successgpu)
458 num = l_rows*nblk*size_of_datatype
459 successgpu = gpu_malloc(aux_bc_dev, num)
460 check_alloc_gpu(
"elpa_mult_at_b: aux_bc_dev", successgpu)
462 num = obj%local_nrows*obj%local_ncols*size_of_datatype
463#if !defined(DEVICE_POINTER)
465#ifdef WITH_GPU_STREAMS
466 my_stream = obj%gpu_setup%my_stream
467 call gpu_memcpy_async_and_stream_synchronize &
468 (
"elpa_mult_at_b: a to a_dev", a_dev, 0_c_intptr_t, &
469 a(1:obj%local_nrows,1:obj%local_ncols), &
470 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
472 successgpu = gpu_memcpy(a_dev, int(loc(a),kind=c_intptr_t), &
473 num, gpumemcpyhosttodevice)
474 check_memcpy_gpu(
"elpa_mult_at_b: a to a_dev", successgpu)
476#endif /* DEVICE_POINTER */
484 call nvtxrangepush(
"do np = 0, np_rows-1")
489 l_rows_np = local_index(na, np, np_rows, nblk, -1)
496 num = l_rows*nblk_mult*size_of_datatype
497#ifdef WITH_GPU_STREAMS
498 my_stream = obj%gpu_setup%my_stream
499 successgpu = gpu_memset_async(aux_mat_dev, 0, num, my_stream)
500 check_memcpy_gpu(
"hermitian_multiply: aux_mat_dev", successgpu)
502 successgpu = gpu_memset(aux_mat_dev, 0, num)
503 check_memcpy_gpu(
"hermitian_multiply: aux_mat_dev", successgpu)
508 do nb = 0, (l_rows_np-1)/nblk
511 call nvtxrangepush(
"do nb = 0, (l_rows_np-1)/nblk")
514 goff = nb*np_rows + np
521 np_br = mod(goff,np_rows)
522 np_bc = mod(goff,np_cols)
525 noff_n = goff/np_rows
529 do n = 1, min(l_rows_np-nb*nblk, nblk)
533 if (nstor==0 .and. n==1) gcol_min = gcol
537 if (a_lower) lrs = local_index(gcol, my_prow, np_rows, nblk, +1)
538 if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1)
544 if (my_pcol == np_bc)
call gpu_copy_precision_a_aux_bc(a_dev, aux_bc_dev, n_aux_bc, nvals, lrs, lre, noff, &
545 nblk, n, l_rows, obj%local_nrows, obj%local_ncols, my_stream)
547 if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n)
550#ifndef DEVICE_POINTER
551 if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n)
553 if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a_tmp(lrs:lre,noff*nblk+n)
556 n_aux_bc = n_aux_bc + nvals
566#ifdef USE_CCL_MULTIPLY
568#ifdef WITH_GPU_STREAMS
569 my_stream = obj%gpu_setup%my_stream
570 ccl_comm_cols = obj%gpu_setup%ccl_comm_cols
571 successgpu = ccl_group_start()
573 if (.not.successgpu)
then
574 print *,
"Error in setting up ccl_group_start!"
578 successgpu = ccl_bcast(aux_bc_dev, aux_bc_dev, &
581 int(n_aux_bc,kind=c_size_t), &
584 int(2*n_aux_bc,kind=c_size_t), &
587#ifdef DOUBLE_PRECISION
590#ifdef SINGLE_PRECISION
595#ifdef DOUBLE_PRECISION
598#ifdef SINGLE_PRECISION
601#endif /* COMPLEXCASE */
602 int(np_bc,kind=c_int), ccl_comm_cols, my_stream)
604 if (.not.successgpu)
then
605 print *,
"Error in ccl_bcast"
609 successgpu = ccl_group_end()
610 if (.not.successgpu)
then
611 print *,
"Error in setting up ccl_group_end!"
614#endif /* WITH_GPU_STREAMS */
616#endif /* USE_CCL_MULTIPLY */
618#if defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY)
621 num = l_rows*nblk*size_of_datatype
622#ifdef WITH_GPU_STREAMS
623 my_stream = obj%gpu_setup%my_stream
624 call gpu_memcpy_async_and_stream_synchronize &
625 (
"elpa_mult_at_b: aux_bc_dev -> aux_bc", aux_bc_dev, 0_c_intptr_t, &
626 aux_bc(1:l_rows*nblk), &
627 1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
629 num = l_rows*nblk*size_of_datatype
630 successgpu = gpu_memcpy(int(loc(aux_bc),kind=c_intptr_t), aux_bc_dev, num,&
631 gpumemcpydevicetohost)
632 check_memcpy_gpu(
"elpa_mult_at_b: aux_bc_dev -> aux_bc", successgpu)
636#endif /* defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY) */
639 call obj%timer%start(
"mpi_communication")
641 call mpi_bcast(aux_bc, int(n_aux_bc,kind=mpi_kind), &
642 mpi_math_datatype_precision, &
643 int(np_bc,kind=mpi_kind), int(mpi_comm_cols,kind=mpi_kind), mpierr)
645 call obj%timer%stop(
"mpi_communication")
647#if defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY)
650 num = l_rows*nblk*size_of_datatype
651#ifdef WITH_GPU_STREAMS
652 my_stream = obj%gpu_setup%my_stream
653 call gpu_memcpy_async_and_stream_synchronize &
654 (
"elpa_mult_at_b: aux_bc -> aux_bc_dev", aux_bc_dev, 0_c_intptr_t, &
655 aux_bc(1:l_rows*nblk), &
656 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
658 successgpu = gpu_memcpy(aux_bc_dev, int(loc(aux_bc),kind=c_intptr_t), num,&
659 gpumemcpyhosttodevice)
660 check_memcpy_gpu(
"elpa_mult_at_b: aux_bc -> aux_bc_dev", successgpu)
664#endif /* defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY) */
666#ifdef USE_CCL_MULTIPLY
668#endif /* USE_CCL_MULTIPLY */
679 my_stream = obj%gpu_setup%my_stream
680 do n = 1, min(l_rows_np-nb*nblk,nblk)
686 call gpu_copy_precision_aux_bc_aux_mat(aux_bc_dev, aux_mat_dev, lrs, lre, nstor, n_aux_bc, &
687 nvals, l_rows, nblk, nblk_mult, my_stream)
689 n_aux_bc = n_aux_bc + nvals
696 do n = 1, min(l_rows_np-nb*nblk,nblk)
702 aux_mat(lrs:lre,nstor) = aux_bc(n_aux_bc+1:n_aux_bc+nvals)
703 n_aux_bc = n_aux_bc + nvals
714 if (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np)
then
718 if (a_lower) lrs = local_index(gcol_min, my_prow, np_rows, nblk, +1)
719 if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1)
723 if (c_upper) lcs = local_index(gcol_min, my_pcol, np_cols, nblk, +1)
724 if (c_lower) lce = min(local_index(gcol, my_pcol, np_cols, nblk, -1),l_cols)
727#if defined(MORE_GPU) && defined(USE_CCL_MULTIPLY)
728 if (.not.usegpu)
then
731 allocate(tmp1(nstor,1:lce-lcs+1), tmp2(nstor,1:lce-lcs+1), stat=istat, errmsg=errormessage)
732 call check_alloc(
"elpa_mult_at_b_&
733 &MATH_DATATYPE ",
"tmp1", istat, errormessage)
734#if defined(MORE_GPU) && defined(USE_CCL_MULTIPLY)
741 num = l_rows*nblk_mult*size_of_datatype
742#ifdef WITH_GPU_STREAMS
743 my_stream = obj%gpu_setup%my_stream
744 call gpu_memcpy_async_and_stream_synchronize &
745 (
"elpa_mult_at_b: aux_mat to aux_mat_dev", aux_mat_dev, 0_c_intptr_t, &
746 aux_mat(1:l_rows,1:nblk_mult), &
747 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
749 successgpu = gpu_memcpy(aux_mat_dev, int(loc(aux_mat),kind=c_intptr_t), &
750 num, gpumemcpyhosttodevice)
751 check_memcpy_gpu(
"elpa_mult_at_b: aux_mat to aux_mat_dev", successgpu)
755 aux_off = (lrs-1)*size_of_datatype
756 b_off = ((lcs-1)*ldb+lrs-1)*size_of_datatype
758 call obj%timer%start(
"gpublas")
759 gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
760 call gpublas_precision_gemm(blas_trans_or_conj,
'N', nstor, lce-lcs+1, &
761 lre-lrs+1, one, aux_mat_dev+aux_off, l_rows, b_dev+b_off, ldb, zero, &
762 tmp1_dev, nstor, gpuhandle)
763 call obj%timer%stop(
"gpublas")
766 num = nstor*(lce-lcs+1)*size_of_datatype
767#ifdef WITH_GPU_STREAMS
768 my_stream = obj%gpu_setup%my_stream
769 successgpu = gpu_stream_synchronize(my_stream)
770 check_stream_synchronize_gpu(
"elpa_mult_at_b: tmp1_dev to tmp1", successgpu)
772 successgpu = gpu_memcpy_async(int(loc(tmp1),kind=c_intptr_t), &
773 tmp1_dev, num, gpumemcpydevicetohost, my_stream)
774 check_memcpy_gpu(
"elpa_mult_at_b: tmp1_dev to tmp1", successgpu)
776 my_stream = obj%gpu_setup%my_stream
777 successgpu = gpu_stream_synchronize(my_stream)
778 check_stream_synchronize_gpu(
"elpa_mult_at_b: tmp1_dev to tmp1", successgpu)
780 successgpu = gpu_stream_synchronize()
781 check_stream_synchronize_gpu(
"elpa_mult_at_b: tmp1_dev to tmp1", successgpu)
783 successgpu = gpu_memcpy(int(loc(tmp1),kind=c_intptr_t), &
784 tmp1_dev, num, gpumemcpydevicetohost)
785 check_memcpy_gpu(
"elpa_mult_at_b: tmp1_dev to tmp1", successgpu)
790 num = nstor*(lce-lcs+1)*size_of_datatype
792 call obj%timer%start(
"blas")
793 call precision_gemm(blas_trans_or_conj,
'N', int(nstor,kind=blas_kind), &
794 int(lce-lcs+1,kind=blas_kind), int(lre-lrs+1,kind=blas_kind), &
795 one, aux_mat(lrs:lre,1:nstor), int(lre-lrs+1,kind=blas_kind), &
796 b(lrs,lcs), int(ldb,kind=blas_kind), zero, tmp1, &
797 int(nstor,kind=blas_kind))
798 call obj%timer%stop(
"blas")
803 num = nstor*(lce-lcs+1)*size_of_datatype
804#ifdef WITH_GPU_STREAMS
805 my_stream = obj%gpu_setup%my_stream
806 successgpu = gpu_memset_async(tmp1_dev, 0, num, my_stream)
807 check_memcpy_gpu(
"hermitian_multiply: tmp1_dev", successgpu)
809 successgpu = gpu_memset(tmp1_dev, 0, num)
810 check_memcpy_gpu(
"hermitian_multiply: tmp1_dev", successgpu)
825#ifdef USE_CCL_MULTIPLY
826#ifdef WITH_GPU_STREAMS
827 my_stream = obj%gpu_setup%my_stream
828 ccl_comm_rows = obj%gpu_setup%ccl_comm_rows
830 successgpu = ccl_group_start()
831 if (.not.successgpu)
then
832 print *,
"Error in setting up ccl_group_start!"
836 successgpu = ccl_reduce(tmp1_dev, tmp2_dev, &
839 int(nstor*(lce-lcs+1),kind=c_size_t), &
842 int(2*nstor*(lce-lcs+1),kind=c_size_t), &
845#ifdef DOUBLE_PRECISION
848#ifdef SINGLE_PRECISION
853#ifdef DOUBLE_PRECISION
856#ifdef SINGLE_PRECISION
859#endif /* COMPLEXCASE */
860 cclsum, int(np,kind=c_int), ccl_comm_rows, my_stream)
863 if (.not.successgpu)
then
864 print *,
"Error in ccl_reduce"
868 successgpu = ccl_group_end()
869 if (.not.successgpu)
then
870 print *,
"Error in setting up ccl_group_end!"
873#endif /* WITH_GPU_STREAMS */
875#endif /* USE_CCL_MULTIPLY */
877#if defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY)
879 num = nstor*(lce-lcs+1)*size_of_datatype
880#ifdef WITH_GPU_STREAMS
881 call gpu_memcpy_async_and_stream_synchronize &
882 (
"elpa_mult_at_b: tmp1_dev to tmp1", tmp1_dev, 0_c_intptr_t, &
884 tmp1(1:nstor,1:lce-lcs+1), &
885 1, 1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
887 successgpu = gpu_memcpy(int(loc(tmp1),kind=c_intptr_t), &
888 tmp1_dev, num, gpumemcpydevicetohost)
889 check_memcpy_gpu(
"elpa_mult_at_b: tmp1_dev to tmp1", successgpu)
891#endif /* defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY) */
893#if !defined(USE_CCL_MULTIPLY)
895 call obj%timer%start(
"mpi_communication")
896 call mpi_reduce(tmp1, tmp2, int(nstor*(lce-lcs+1),kind=mpi_kind), mpi_math_datatype_precision, &
897 mpi_sum, int(np,kind=mpi_kind), int(mpi_comm_rows,kind=mpi_kind), mpierr)
898 call obj%timer%stop(
"mpi_communication")
899#endif /* !defined(USE_CCL_MULTIPLY) */
901#if defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY)
903 num = nstor*(lce-lcs+1)*size_of_datatype
904#ifdef WITH_GPU_STREAMS
905 call gpu_memcpy_async_and_stream_synchronize &
906 (
"elpa_mult_at_b: tmp2 to tmp2_dev", tmp2_dev, 0_c_intptr_t, &
908 tmp2(1:nstor,1:lce-lcs+1), &
909 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
911 successgpu = gpu_memcpy(tmp2_dev, int(loc(tmp2),kind=c_intptr_t), &
912 num, gpumemcpyhosttodevice)
913 check_memcpy_gpu(
"elpa_mult_at_b: tmp2 to tmp2_dev", successgpu)
915#endif /* defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY) */
920 num = nstor*(lce-lcs+1)*size_of_datatype
921 successgpu = gpu_memcpy(tmp2_dev, tmp1_dev, &
922 num, gpumemcpydevicetodevice)
923 check_memcpy_gpu(
"elpa_mult_at_b: tmp2 to tmp2_dev", successgpu)
931 call obj%timer%start(
"mpi_communication")
932 call mpi_reduce(tmp1, tmp2, int(nstor*(lce-lcs+1),kind=mpi_kind), mpi_math_datatype_precision, &
933 mpi_sum, int(np,kind=mpi_kind), int(mpi_comm_rows,kind=mpi_kind), mpierr)
934 call obj%timer%stop(
"mpi_communication")
940 if (my_prow==np)
call gpu_copy_precision_tmp2_c(tmp2_dev, c_dev, nr_done, nstor, lcs, lce, ldc, ldccols, my_stream)
944#ifndef DEVICE_POINTER
945 if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,1:lce-lcs+1)
947 if (my_prow==np) c_tmp(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,1:lce-lcs+1)
951#ifndef DEVICE_POINTER
952 if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,1:lce-lcs+1)
954 if (my_prow==np) c_tmp(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,1:lce-lcs+1)
963 if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,1:lce-lcs+1)
966 if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,1:lce-lcs+1)
970#if defined(MORE_GPU) && defined(USE_CCL_MULTIPLY)
971 if (.not.usegpu)
then
973 deallocate(tmp1, tmp2, stat=istat, errmsg=errormessage)
974 call check_alloc(
"elpa_mult_at_b_&
975 &MATH_DATATYPE ",
"tmp1", istat, errormessage)
976#if defined(MORE_GPU) && defined(USE_CCL_MULTIPLY)
981 nr_done = nr_done+nstor
984 num = l_rows*nblk_mult*size_of_datatype
985#ifdef WITH_GPU_STREAMS
986 my_stream = obj%gpu_setup%my_stream
987 successgpu = gpu_memset_async(aux_mat_dev, 0, num, my_stream)
988 check_memcpy_gpu(
"hermitian_multiply: aux_mat_dev", successgpu)
990 successgpu = gpu_memset(aux_mat_dev, 0, num)
991 check_memcpy_gpu(
"hermitian_multiply: aux_mat_dev", successgpu)
1021#if !defined(DEVICE_POINTER)
1022 successgpu = gpu_free(b_dev)
1023 check_dealloc_gpu(
"elpa_multiply_a_b: b_dev", successgpu)
1024#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
1025 successgpu = gpu_host_unregister(int(loc(b),kind=c_intptr_t))
1026 check_host_unregister_gpu(
"elpa_multiply_a_b: b", successgpu)
1031 num = ldc*ldccols*size_of_datatype
1032#ifdef WITH_GPU_STREAMS
1033 check_stream_synchronize_gpu(
"elpa_mult_at_b: c_dev -> c", successgpu)
1034 call gpu_memcpy_async_and_stream_synchronize &
1035 (
"elpa_mult_at_b: c_dev to c", c_dev, 0_c_intptr_t, &
1036 c(1:ldc,1:ldccols), &
1037 1, 1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
1039 successgpu = gpu_memcpy(int(loc(c),kind=c_intptr_t), c_dev, num,&
1040 gpumemcpydevicetohost)
1041 check_memcpy_gpu(
"elpa_mult_at_b: c_dev -> c", successgpu)
1044 successgpu = gpu_free(c_dev)
1045 check_dealloc_gpu(
"elpa_multiply_a_b: c_dev", successgpu)
1049#endif /* MORE_GPU */
1051#else /* DEVICE_POINTER */
1064#ifdef WITH_GPU_STREAMS
1065 successgpu = gpu_host_unregister(int(loc(a_tmp),kind=c_intptr_t))
1066 check_host_unregister_gpu(
"elpa_mult_at_b: a_tmp", successgpu)
1068 deallocate(a_tmp, stat=istat, errmsg=errormessage)
1069 check_deallocate(
"elpa_mult_at_b: a_tmp", istat, errormessage)
1071 num = ldc*ldccols*size_of_datatype
1072#ifdef WITH_GPU_STREAMS
1073 my_stream = obj%gpu_setup%my_stream
1074 successgpu = gpu_stream_synchronize(my_stream)
1075 check_stream_synchronize_gpu(
"elpa_mult_at_b: c_tmp to c", successgpu)
1077 successgpu = gpu_memcpy_async(cdev,int(loc(c_tmp),kind=c_intptr_t),num,&
1078 gpumemcpyhosttodevice, my_stream)
1079 check_memcpy_gpu(
"elpa_mult_at_b: c_tmp -> c", successgpu)
1081 my_stream = obj%gpu_setup%my_stream
1082 successgpu = gpu_stream_synchronize(my_stream)
1083 check_stream_synchronize_gpu(
"elpa_mult_at_b: c_tmp -> c", successgpu)
1085 successgpu = gpu_stream_synchronize()
1086 check_stream_synchronize_gpu(
"elpa_mult_at_b: c_tmp -> c", successgpu)
1088 successgpu = gpu_memcpy(cdev,int(loc(c_tmp),kind=c_intptr_t),num,&
1089 gpumemcpyhosttodevice)
1090 check_memcpy_gpu(
"elpa_mult_at_b: c_tmp -> c", successgpu)
1092#ifdef WITH_GPU_STREAMS
1093 successgpu = gpu_host_unregister(int(loc(c_tmp),kind=c_intptr_t))
1094 check_host_unregister_gpu(
"elpa_multiply_a_b: c_tmp", successgpu)
1097 deallocate(c_tmp, stat=istat, errmsg=errormessage)
1098 check_deallocate(
"elpa_mult_at_b: c_tmp", istat, errormessage)
1099#endif /* MORE_GPU */
1101#endif /* DEVICE_POINTER */
1102#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
1106 successgpu = gpu_free_host(aux_host)
1107 check_host_dealloc_gpu(
"elpa_multiply_a_b: aux_host", successgpu)
1112 deallocate(aux_mat, stat=istat, errmsg=errormessage)
1113 check_deallocate(
"elpa_mult_at_b: aux_mat", istat, errormessage)
1119 successgpu = gpu_free(aux_mat_dev)
1120 check_dealloc_gpu(
"elpa_multiply_a_b: aux_mat_dev", successgpu)
1122 successgpu = gpu_free(tmp1_dev)
1123 check_dealloc_gpu(
"elpa_multiply_a_b: tmp1_dev", successgpu)
1126 successgpu = gpu_free(tmp2_dev)
1127 check_dealloc_gpu(
"elpa_multiply_a_b: tmp2_dev", successgpu)
1129 successgpu = gpu_free(aux_bc_dev)
1130 check_dealloc_gpu(
"elpa_multiply_a_b: aux_bc_dev", successgpu)
1133#if !defined(DEVICE_POINTER)
1135 successgpu = gpu_free(a_dev)
1136 check_dealloc_gpu(
"elpa_mult_at_b: a_dev", successgpu)
1143 deallocate(aux_mat, stat=istat, errmsg=errormessage)
1144 check_deallocate(
"elpa_mult_at_b: aux_mat", istat, errormessage)
1147 deallocate(aux_bc, lrs_save, lre_save, stat=istat, errmsg=errormessage)
1148 check_deallocate(
"elpa_mult_at_b: aux_bc, lrs_save, lre_save", istat, errormessage)
1150 call obj%timer%stop(
"elpa_mult_at_b_&
void set_gpu_parameters(int *gpuMemcpyHostToDevice, int *gpuMemcpyDeviceToHost)
Definition gpu_vendor_agnostic_layer.c:62
Fortran module to provide an abstract definition of the implementation. Do not use directly....
Definition elpa_abstract_impl.F90:50