#include "../general/sanity.F90"
#include "../general/error_checking.inc"

#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
  use, intrinsic :: iso_c_binding
  use elpa_blas_interfaces
  use elpa_utilities, only : local_index, greatest_common_divisor, least_common_multiple, &
                             check_deallocate_f, check_dealloc_gpu_f, &
                             check_host_dealloc_gpu_f, check_alloc_gpu_f, check_host_alloc_gpu_f, &
                             check_host_unregister_gpu_f, check_memcpy_gpu_f, check_allocate_f, &
                             check_host_register_gpu_f, check_alloc, error_unit
  use mod_query_gpu_usage
#ifdef WITH_GPU_STREAMS
#if defined(WITH_NVIDIA_GPU_VERSION) && defined(WITH_NVTX)
#elif defined(WITH_AMD_GPU_VERSION) && defined(WITH_ROCTX)
#include "../../src/general/precision_kinds.F90"
  class(elpa_abstract_impl_t), intent(inout) :: obj

  character*1 :: trans_a, trans_b
  character(1, c_char) :: trans_a_cchar, trans_b_cchar
  integer(kind=ik), intent(in) :: ldb, ldbCols, ldc, ldcCols
  integer(kind=ik) :: lda, ldaCols
  integer(kind=ik) :: na, ncb

  type(c_ptr) :: aDev, bDev, cDev
  MATH_DATATYPE(kind=rck), allocatable :: a(:,:), b(:,:), c(:,:)
#else /* DEVICE_POINTER */
#ifdef USE_ASSUMED_SIZE
  MATH_DATATYPE(kind=rck) :: a(obj%local_nrows,*), b(ldb,*), c(ldc,*)
#else
  MATH_DATATYPE(kind=rck) :: a(obj%local_nrows,obj%local_ncols), b(ldb,ldbCols), c(ldc,ldcCols)
#endif /* USE_ASSUMED_SIZE */
#endif /* DEVICE_POINTER */
  MATH_DATATYPE(kind=rck), allocatable :: at_full(:,:), bt_full(:,:)
  MATH_DATATYPE(kind=rck), allocatable :: buf_send(:,:), buf_recv(:,:), buf_self(:,:), at_col(:,:), bt_row(:,:)

  integer(kind=ik) :: my_pdir, my_pdir_t, np_dirs, np_dirs_t
  integer(kind=ik) :: np_rows_fine, np_cols_fine, np_dirs_fine, np_fine, np_t_fine, np_bc_fine, &
                      np_ab_fine, np_ab_t_fine, dnp_ab, dnp_ab_t
  integer(kind=ik) :: LCM, nblk_mult_rows, nblk_mult_cols, i_block_loc_fine, j_block_loc_fine
  integer(kind=ik) :: nblk_mult_rows_max, nblk_mult_cols_max, nblk_rows_cut, nblk_cols_cut
  integer(kind=ik) :: nblk_mult_max, nblk_mult_min
  integer(kind=ik) :: l_cols, l_rows
  integer(kind=ik) :: l_rows_max, l_cols_max, l_rows_min, l_cols_min
  integer(kind=ik) :: l_rows_source, l_cols_source
  integer(kind=ik) :: np, nb, nblk_mult, lrs, lre, lcs, lce
  integer(kind=ik) :: i_block_loc, j_block_loc
  integer(kind=ik) :: np_bc
  integer(kind=ik) :: np_t
  integer(kind=ik), allocatable :: lrs_save(:), lre_save(:)

  logical :: a_transposed, b_transposed
  integer(kind=ik) :: a_transposed_int, b_transposed_int

  MATH_DATATYPE(kind=rck) :: beta
  integer(kind=ik) :: beta_int
  MATH_DATATYPE(kind=rck), pointer, contiguous :: aux_a_full(:,:), aux_b_full(:,:), tmp1_full(:,:)

  integer(kind=ik) :: istat, debug
  character(200) :: errorMessage
  character(20) :: gpuString, tnString
  logical :: success, successGPU, successGPU2
  logical :: isSquareGrid, first_call
  integer(kind=c_int) :: numGPU, blocking, SM_count

  integer(kind=MPI_KIND) :: mpierr
  integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, myid
  integer(kind=ik) :: mpi_comm_rows, mpi_comm_cols, mpi_comm_all
  integer(kind=ik) :: mpi_comm_dirs

  integer(kind=ik) :: nblk, nblk_cut, nblk_cut_row, nblk_cut_col
  integer(kind=ik) :: error
  integer(kind=c_intptr_t) :: aux_a_full_dev, aux_b_full_dev, tmp1_full_dev
  integer(kind=c_intptr_t) :: at_col_dev, bt_row_dev
  integer(kind=c_intptr_t) :: buf_send_dev, buf_recv_dev, buf_self_dev

  integer(kind=c_intptr_t) :: a_dev
  integer(kind=c_intptr_t) :: b_dev
  integer(kind=c_intptr_t) :: c_dev

  integer(kind=c_intptr_t) :: num, num_a, num_b, num_r, num_c
  integer(kind=c_intptr_t) :: aux_off, b_off
  integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
                                                            &PRECISION&
                                                            &_&
                                                            &MATH_DATATYPE
  integer(kind=c_intptr_t) :: gpuHandle, my_stream
  integer(kind=c_int) :: gpu_pxgemm_multiply

  integer(kind=c_intptr_t) :: ccl_comm_rows, ccl_comm_cols, ccl_comm_all, ccl_comm_dirs
  integer(kind=c_int) :: cclDataType
  integer(kind=ik) :: k_datatype

  nvtx_range_push("elpa_pxgemm_multiply")

  a_transposed = .false.
  b_transposed = .false.
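
  ! Interpret the BLAS-style transpose flags: 't'/'T' (and 'c'/'C' in the
  ! complex case) mark the corresponding operand as (conjugate-)transposed.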
  if (trans_a=='t' .or. trans_a=='T' .or. trans_a=='c' .or. trans_a=='C') then
    a_transposed = .true.
    trans_a_cchar = BLAS_TRANS_OR_CONJ

  if (trans_b=='t' .or. trans_b=='T' .or. trans_b=='c' .or. trans_b=='C') then
    b_transposed = .true.
    trans_b_cchar = BLAS_TRANS_OR_CONJ

  call obj%get("debug", debug, error)
  if (error .ne. ELPA_OK) then
    write(error_unit,*) "elpa_pxgemm_multiply: Problem getting option for debug settings. Aborting..."
#if defined(DEVICE_POINTER)
#else /* DEVICE_POINTER */
#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
  if (.not.(query_gpu_usage(obj, "elpa_pxgemm_multiply", useGPU))) then
    print *, "elpa_pxgemm_multiply: Problem querying settings for GPU. Aborting..."

  if (obj%is_set("gpu_pxgemm_multiply") == 1) then
    call obj%get("gpu_pxgemm_multiply", gpu_pxgemm_multiply, error)
    if (error .ne. ELPA_OK) then
      print *, "Problem getting option for gpu_pxgemm_multiply. Aborting..."

    if (useGPU .and. gpu_pxgemm_multiply .eq. 0) then
    else if (.not.(useGPU) .and. gpu_pxgemm_multiply .eq. 1) then
#endif /* DEVICE_POINTER */
  if (a_transposed .and. b_transposed) then
  else if (.not. a_transposed .and. b_transposed) then
  else if (a_transposed .and. .not. b_transposed) then
  else if (.not. a_transposed .and. .not. b_transposed) then

  call obj%timer%start("elpa_pxgemm_multiply"//trim(tnString)//trim(gpuString))

  lda = obj%local_nrows
  ldaCols = obj%local_ncols

  mpi_comm_all  = obj%mpi_setup%mpi_comm_parent
  mpi_comm_cols = obj%mpi_setup%mpi_comm_cols
  mpi_comm_rows = obj%mpi_setup%mpi_comm_rows

  myid    = obj%mpi_setup%myRank_comm_parent
  my_prow = obj%mpi_setup%myRank_comm_rows
  my_pcol = obj%mpi_setup%myRank_comm_cols

  np_rows = obj%mpi_setup%nRanks_comm_rows
  np_cols = obj%mpi_setup%nRanks_comm_cols

  l_rows = local_index(na, my_prow, np_rows, nblk, -1)
  l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1)
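
  ! l_rows / l_cols: numbers of rows of a and columns of b (and c) that this
  ! rank holds locally in the block-cyclic distribution.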
  call obj%timer%start("check_for_gpu")
  if (check_for_gpu(obj, myid, numGPU)) then
    print *, "GPUs are requested but not detected! Aborting..."
  call obj%timer%stop("check_for_gpu")

  SM_count  = obj%gpu_setup%gpuSMcount
  gpuHandle = obj%gpu_setup%gpublasHandleArray(0)

#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream

#if defined(USE_CCL_PXGEMM)
  my_stream     = obj%gpu_setup%my_stream
  ccl_comm_rows = obj%gpu_setup%ccl_comm_rows
  ccl_comm_cols = obj%gpu_setup%ccl_comm_cols
  ccl_comm_all  = obj%gpu_setup%ccl_comm_all

#if REALCASE == 1 && defined(DOUBLE_PRECISION)
  cclDataType = cclDouble
  k_datatype = 1
#elif REALCASE == 1 && defined(SINGLE_PRECISION)
  cclDataType = cclFloat
  k_datatype = 1
#elif COMPLEXCASE == 1 && defined(DOUBLE_PRECISION)
  cclDataType = cclDouble
  k_datatype = 2
#elif COMPLEXCASE == 1 && defined(SINGLE_PRECISION)
  cclDataType = cclFloat
  k_datatype = 2
#endif
#endif /* defined(USE_CCL_PXGEMM) */
#if defined(DEVICE_POINTER)
  a_dev = transfer(aDev, a_dev)
  b_dev = transfer(bDev, b_dev)
  c_dev = transfer(cDev, c_dev)

#else /* DEVICE_POINTER */
  successGPU = gpu_malloc(a_dev, lda*ldaCols*size_of_datatype)
  check_alloc_gpu("elpa_pxgemm_multiply: a_dev", successGPU)

  successGPU = gpu_malloc(b_dev, ldb*ldbCols*size_of_datatype)
  check_alloc_gpu("elpa_pxgemm_multiply: b_dev", successGPU)

  successGPU = gpu_malloc(c_dev, ldc*ldcCols*size_of_datatype)
  check_alloc_gpu("elpa_pxgemm_multiply: c_dev", successGPU)

  call obj%timer%start("gpu_memcpy")
  nvtx_range_push("gpu_memcpy: a->a_dev, b->b_dev")

  num = lda*ldaCols*size_of_datatype
#ifdef WITH_GPU_STREAMS
  call gpu_memcpy_async_and_stream_synchronize &
       ("elpa_pxgemm_multiply: a to a_dev", a_dev, 0_c_intptr_t, &
        a(1:lda,1:ldaCols), &
        1, 1, num, gpuMemcpyHostToDevice, my_stream, .false., .false., .false.)
#else
  successGPU = gpu_memcpy(a_dev, int(loc(a),kind=c_intptr_t), num, gpuMemcpyHostToDevice)
  check_memcpy_gpu("elpa_pxgemm_multiply: a to a_dev", successGPU)
#endif

#ifdef WITH_GPU_STREAMS
  call gpu_memcpy_async_and_stream_synchronize &
       ("elpa_pxgemm_multiply: b to b_dev", b_dev, 0_c_intptr_t, &
        b(1:ldb,1:ldbCols), &
        1, 1, num, gpuMemcpyHostToDevice, my_stream, .false., .false., .false.)
#else
  successGPU = gpu_memcpy(b_dev, int(loc(b),kind=c_intptr_t), num, gpuMemcpyHostToDevice)
  check_memcpy_gpu("elpa_pxgemm_multiply: b to b_dev", successGPU)
#endif

  nvtx_range_pop("gpu_memcpy: a->a_dev, b->b_dev")
  call obj%timer%stop("gpu_memcpy")

#ifdef WITH_GPU_STREAMS
  successGPU = gpu_memset_async(c_dev, 0, ldc*ldcCols*size_of_datatype, my_stream)
#else
  successGPU = gpu_memset(c_dev, 0, ldc*ldcCols*size_of_datatype)
#endif
  check_memcpy_gpu("elpa_pxgemm_multiply: memset c_dev", successGPU)
#endif /* DEVICE_POINTER */
  isSquareGrid = .false.
  if (np_rows == np_cols) isSquareGrid = .true.

#ifdef WITH_GPU_STREAMS
  successGPU = gpu_stream_synchronize(my_stream)
  check_stream_synchronize_gpu("elpa_pxgemm_multiply: stream synchronize", successGPU)
#endif

  call mpi_allreduce(l_rows, l_rows_max, 1_MPI_KIND, MPI_INTEGER, MPI_MAX, int(mpi_comm_all,kind=MPI_KIND), mpierr)
  call mpi_allreduce(l_cols, l_cols_max, 1_MPI_KIND, MPI_INTEGER, MPI_MAX, int(mpi_comm_all,kind=MPI_KIND), mpierr)
  call mpi_allreduce(l_rows, l_rows_min, 1_MPI_KIND, MPI_INTEGER, MPI_MIN, int(mpi_comm_all,kind=MPI_KIND), mpierr)
  call mpi_allreduce(l_cols, l_cols_min, 1_MPI_KIND, MPI_INTEGER, MPI_MIN, int(mpi_comm_all,kind=MPI_KIND), mpierr)
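
  ! Global extrema of the local dimensions: all ranks must agree on a common
  ! panel size below, even when the block-cyclic distribution leaves them with
  ! differently sized local blocks.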
  nblk_mult = greatest_common_divisor(l_rows_max, l_cols_max)

  if (isSquareGrid) then
    if (.not. a_transposed .and. b_transposed .or. &
              a_transposed .and. .not. b_transposed) then
      if (wantDebug .and. myid==0) print *, "elpa_pxgemm_multiply NEW: SQUARE_GRID start: (a_transposed XOR b_transposed)"
      allocate(aux_a_full(l_rows_max, nblk_mult), stat=istat, errmsg=errorMessage)
      check_allocate("elpa_pxgemm_multiply: aux_a_full", istat, errorMessage)

      allocate(aux_b_full(nblk_mult, l_cols_max), stat=istat, errmsg=errorMessage)
      check_allocate("elpa_pxgemm_multiply: aux_b_full", istat, errorMessage)

      allocate(tmp1_full(l_rows_max, l_cols_max), stat=istat, errmsg=errorMessage)
      check_allocate("elpa_pxgemm_multiply: tmp1_full", istat, errorMessage)

      successGPU = gpu_malloc(aux_a_full_dev, nblk_mult*nblk_mult*size_of_datatype)
      check_alloc_gpu("elpa_pxgemm_multiply: aux_a_full_dev", successGPU)

      successGPU = gpu_malloc(aux_b_full_dev, nblk_mult*nblk_mult*size_of_datatype)
      check_alloc_gpu("elpa_pxgemm_multiply: aux_b_full_dev", successGPU)

      successGPU = gpu_malloc(tmp1_full_dev, nblk_mult*nblk_mult*size_of_datatype)
      check_alloc_gpu("elpa_pxgemm_multiply: tmp1_full_dev", successGPU)

      if (a_transposed) then
        mpi_comm_dirs = mpi_comm_rows
        ccl_comm_dirs = obj%gpu_setup%ccl_comm_rows
      else if (b_transposed) then
        mpi_comm_dirs = mpi_comm_cols
        ccl_comm_dirs = obj%gpu_setup%ccl_comm_cols
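
      ! TN/NT on a square grid: in each pass np, every rank pads its local
      ! block of a and b to a zero-filled nblk_mult x nblk_mult square; the
      ! transposed operand is broadcast along its process direction from rank
      ! np_t, the padded panels are multiplied locally, and the partial
      ! products are reduced onto the rank owning the corresponding panel of c.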
      call obj%timer%start("main_loop_square_grid_tn_nt")

      nvtx_range_push("do np = 0, np_dirs-1")
      if (wantDebug .and. myid==0) print *, "np = ", np
        if (a_transposed) then
          if (np_t == my_pcol) call gpu_copy_and_set_zeros_aux_full(PRECISION_CHAR, a_dev, aux_a_full_dev, l_rows, l_cols, &
                                                                    nblk_mult, debug, my_stream)
          call gpu_copy_and_set_zeros_aux_full(PRECISION_CHAR, b_dev, aux_b_full_dev, l_rows, l_cols, &
                                               nblk_mult, debug, my_stream)

          if (np_t == my_pcol) then
            aux_a_full(1:l_rows,1:l_cols) = a(1:l_rows,1:l_cols)
            if (l_rows < nblk_mult) aux_a_full(l_rows+1:nblk_mult,1:l_cols) = 0
            if (l_cols < nblk_mult) aux_a_full(1:l_rows,l_cols+1:nblk_mult) = 0
            if (l_rows < nblk_mult .and. l_cols < nblk_mult) aux_a_full(l_rows+1:nblk_mult,l_cols+1:nblk_mult) = 0

          aux_b_full(1:l_rows,1:l_cols) = b(1:l_rows,1:l_cols)
          if (l_rows < nblk_mult) aux_b_full(l_rows+1:nblk_mult,1:l_cols) = 0
          if (l_cols < nblk_mult) aux_b_full(1:l_rows,l_cols+1:nblk_mult) = 0
          if (l_rows < nblk_mult .and. l_cols < nblk_mult) aux_b_full(l_rows+1:nblk_mult,l_cols+1:nblk_mult) = 0

        else if (b_transposed) then
          if (np_t == my_prow) call gpu_copy_and_set_zeros_aux_full(PRECISION_CHAR, b_dev, aux_b_full_dev, l_rows, l_cols, &
                                                                    nblk_mult, debug, my_stream)
          call gpu_copy_and_set_zeros_aux_full(PRECISION_CHAR, a_dev, aux_a_full_dev, l_rows, l_cols, &
                                               nblk_mult, debug, my_stream)

          if (np_t == my_prow) then
            aux_b_full(1:l_rows,1:l_cols) = b(1:l_rows,1:l_cols)
            if (l_rows < nblk_mult) aux_b_full(l_rows+1:nblk_mult,1:l_cols) = 0
            if (l_cols < nblk_mult) aux_b_full(1:l_rows,l_cols+1:nblk_mult) = 0
            if (l_rows < nblk_mult .and. l_cols < nblk_mult) aux_b_full(l_rows+1:nblk_mult,l_cols+1:nblk_mult) = 0

          aux_a_full(1:l_rows,1:l_cols) = a(1:l_rows,1:l_cols)
          if (l_rows < nblk_mult) aux_a_full(l_rows+1:nblk_mult,1:l_cols) = 0
          if (l_cols < nblk_mult) aux_a_full(1:l_rows,l_cols+1:nblk_mult) = 0
          if (l_rows < nblk_mult .and. l_cols < nblk_mult) aux_a_full(l_rows+1:nblk_mult,l_cols+1:nblk_mult) = 0
        if (useGPU .and. .not. useCCL) then
          num = nblk_mult*nblk_mult
#ifdef WITH_GPU_STREAMS
          if (a_transposed) then
            call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_a_full_dev -> aux_a_full", &
                 aux_a_full_dev, 0_c_intptr_t, aux_a_full(1:nblk_mult,1:nblk_mult), &
                 1, 1, num*size_of_datatype, gpuMemcpyDeviceToHost, my_stream, .false., .false., .false.)
          else if (b_transposed) then
            call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_b_full_dev -> aux_b_full", &
                 aux_b_full_dev, 0_c_intptr_t, aux_b_full(1:nblk_mult,1:nblk_mult), &
                 1, 1, num*size_of_datatype, gpuMemcpyDeviceToHost, my_stream, .false., .true., .false.)
          endif
#else
          if (a_transposed) then
            successGPU = gpu_memcpy(int(loc(aux_a_full),kind=c_intptr_t), aux_a_full_dev, &
                                    num*size_of_datatype, gpuMemcpyDeviceToHost)
            check_memcpy_gpu("elpa_pxgemm_multiply: aux_a_full_dev -> aux_a_full", successGPU)
          else if (b_transposed) then
            successGPU = gpu_memcpy(int(loc(aux_b_full),kind=c_intptr_t), aux_b_full_dev, &
                                    num*size_of_datatype, gpuMemcpyDeviceToHost)
            check_memcpy_gpu("elpa_pxgemm_multiply: aux_b_full_dev -> aux_b_full", successGPU)
          endif
#endif
          call obj%timer%start("ccl_bcast")
          nvtx_range_push("ccl_bcast aux_a/b_full")

          if (a_transposed) then
            successGPU = ccl_bcast(aux_a_full_dev, aux_a_full_dev, int(k_datatype*nblk_mult*nblk_mult,kind=c_size_t), &
                                   cclDataType, int(np_t,kind=c_int), ccl_comm_cols, my_stream)
          else if (b_transposed) then
            successGPU = ccl_bcast(aux_b_full_dev, aux_b_full_dev, int(k_datatype*nblk_mult*nblk_mult,kind=c_size_t), &
                                   cclDataType, int(np_t,kind=c_int), ccl_comm_rows, my_stream)

          if (.not. successGPU) then
            print *, "Error in ccl_bcast"

          successGPU = gpu_stream_synchronize(my_stream)
          check_stream_synchronize_gpu("elpa_pxgemm_multiply: ccl_bcast", successGPU)

          nvtx_range_pop("ccl_bcast aux_a/b_full")
          call obj%timer%stop("ccl_bcast")
          call obj%timer%start("mpi_communication")
          nvtx_range_push("MPI_Bcast(aux_a/b_full)")

          if (a_transposed) then
            call mpi_bcast(aux_a_full, int(nblk_mult*nblk_mult,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
                           int(np_t,kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
          else if (b_transposed) then
            call mpi_bcast(aux_b_full, int(nblk_mult*nblk_mult,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
                           int(np_t,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), mpierr)

          nvtx_range_pop("MPI_Bcast(aux_a/b_full)")
          call obj%timer%stop("mpi_communication")
        if (useGPU .and. .not. useCCL) then
          num = nblk_mult*nblk_mult
#ifdef WITH_GPU_STREAMS
          if (a_transposed) then
            call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_a_full -> aux_a_full_dev", &
                 aux_a_full_dev, 0_c_intptr_t, aux_a_full(1:nblk_mult,1:nblk_mult), &
                 1, 1, num*size_of_datatype, gpuMemcpyHostToDevice, my_stream, .false., .false., .false.)
          else if (b_transposed) then
            call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_b_full -> aux_b_full_dev", &
                 aux_b_full_dev, 0_c_intptr_t, aux_b_full(1:nblk_mult,1:nblk_mult), &
                 1, 1, num*size_of_datatype, gpuMemcpyHostToDevice, my_stream, .false., .true., .false.)
          endif
#else
          if (a_transposed) then
            successGPU = gpu_memcpy(aux_a_full_dev, int(loc(aux_a_full),kind=c_intptr_t), &
                                    num*size_of_datatype, gpuMemcpyHostToDevice)
            check_memcpy_gpu("elpa_pxgemm_multiply: aux_a_full -> aux_a_full_dev", successGPU)
          else if (b_transposed) then
            successGPU = gpu_memcpy(aux_b_full_dev, int(loc(aux_b_full),kind=c_intptr_t), &
                                    num*size_of_datatype, gpuMemcpyHostToDevice)
            check_memcpy_gpu("elpa_pxgemm_multiply: aux_b_full -> aux_b_full_dev", successGPU)
          endif
#endif
        call obj%timer%start("gpublas")
        nvtx_range_push("gpublas")
        call gpublas_PRECISION_GEMM(trans_a_cchar, trans_b_cchar, &
                                    nblk_mult, nblk_mult, nblk_mult, one, &
                                    aux_a_full_dev, nblk_mult, &
                                    aux_b_full_dev, nblk_mult, zero, &
                                    tmp1_full_dev, nblk_mult, gpuHandle)
        if (wantDebug) successGPU = gpu_devicesynchronize()
        nvtx_range_pop("gpublas")
        call obj%timer%stop("gpublas")

        call obj%timer%start("blas")
        call PRECISION_GEMM(trans_a, trans_b, &
                            int(nblk_mult, kind=BLAS_KIND), &
                            int(nblk_mult, kind=BLAS_KIND), &
                            int(nblk_mult, kind=BLAS_KIND), one, &
                            aux_a_full, int(nblk_mult, kind=BLAS_KIND), &
                            aux_b_full, int(nblk_mult, kind=BLAS_KIND), zero, &
                            tmp1_full, int(nblk_mult, kind=BLAS_KIND))
        call obj%timer%stop("blas")
        num = nblk_mult*nblk_mult
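
        ! Bring the partial product back to the host (plain-MPI path) and sum
        ! it over the direction communicator; the reduced panel ends up on
        ! rank np, which copies its l_rows x l_cols part into c below.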
        if (useGPU .and. .not. useCCL) then
          call obj%timer%start("gpu_memcpy")
#ifdef WITH_GPU_STREAMS
          call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: tmp1_full_dev -> tmp1_full", &
               tmp1_full_dev, 0_c_intptr_t, tmp1_full(1:nblk_mult,1:nblk_mult), &
               1, 1, num*size_of_datatype, gpuMemcpyDeviceToHost, my_stream, .false., .false., .false.)
#else
          successGPU = gpu_memcpy(int(loc(tmp1_full),kind=c_intptr_t), tmp1_full_dev, num*size_of_datatype, gpuMemcpyDeviceToHost)
          check_memcpy_gpu("elpa_pxgemm_multiply: tmp1_full_dev -> tmp1_full", successGPU)
#endif
          call obj%timer%stop("gpu_memcpy")

        call obj%timer%start("ccl_reduce")
        nvtx_range_push("ccl_reduce tmp1_full_dev")

        successGPU = ccl_reduce(tmp1_full_dev, tmp1_full_dev, int(k_datatype*num,kind=c_size_t), cclDataType, &
                                cclSum, int(np, kind=c_int), ccl_comm_dirs, my_stream)

        if (.not. successGPU) then
          print *, "Error in ccl_reduce"

        successGPU = gpu_stream_synchronize(my_stream)
        check_stream_synchronize_gpu("elpa_pxgemm_multiply: ccl_reduce", successGPU)

        nvtx_range_pop("ccl_reduce tmp1_full_dev")
        call obj%timer%stop("ccl_reduce")

        call obj%timer%start("mpi_communication")
        if (my_pdir==np) then
          call mpi_reduce(MPI_IN_PLACE, tmp1_full, int(num,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
                          MPI_SUM, int(np,kind=MPI_KIND), int(mpi_comm_dirs,kind=MPI_KIND), mpierr)
        else
          call mpi_reduce(tmp1_full, tmp1_full, int(num,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
                          MPI_SUM, int(np,kind=MPI_KIND), int(mpi_comm_dirs,kind=MPI_KIND), mpierr)
        endif
        call obj%timer%stop("mpi_communication")
        if (useGPU .and. .not. useCCL) then
          call obj%timer%start("gpu_memcpy")
#ifdef WITH_GPU_STREAMS
          call gpu_memcpy_async_and_stream_synchronize &
               ("elpa_pxgemm_multiply: tmp1_full -> tmp1_full_dev", tmp1_full_dev, 0_c_intptr_t, &
                tmp1_full(1:nblk_mult,1:nblk_mult), &
                1, 1, num*size_of_datatype, gpuMemcpyHostToDevice, my_stream, .false., .true., .false.)
#else
          successGPU = gpu_memcpy(tmp1_full_dev, int(loc(tmp1_full),kind=c_intptr_t), &
                                  num*size_of_datatype, gpuMemcpyHostToDevice)
          check_memcpy_gpu("elpa_pxgemm_multiply: tmp1_full -> tmp1_full_dev", successGPU)
#endif
          call obj%timer%stop("gpu_memcpy")

        if (my_pdir==np) then
          nvtx_range_push("gpu_copy_aux_full")
          call gpu_copy_aux_full(PRECISION_CHAR, c_dev, tmp1_full_dev, l_rows, l_cols, ldc, nblk_mult, debug, my_stream)
          nvtx_range_pop("gpu_copy_aux_full")

          c(1:l_rows,1:l_cols) = tmp1_full(1:l_rows,1:l_cols)

      nvtx_range_pop("do np = 0, np_dirs-1")

      call obj%timer%stop("main_loop_square_grid_tn_nt")
#if !defined(DEVICE_POINTER)
      call obj%timer%start("gpu_memcpy")
      nvtx_range_push("gpu_memcpy: c_dev->c")

#ifdef WITH_GPU_STREAMS
      call gpu_memcpy_async_and_stream_synchronize &
           ("elpa_pxgemm_multiply: c_dev -> c", c_dev, 0_c_intptr_t, c(1:ldc,1:ldcCols), &
            1, 1, num*size_of_datatype, gpuMemcpyDeviceToHost, my_stream, .false., .true., .false.)
#else
      successGPU = gpu_memcpy(int(loc(c),kind=c_intptr_t), c_dev, num*size_of_datatype, gpuMemcpyDeviceToHost)
      check_memcpy_gpu("elpa_pxgemm_multiply: c_dev -> c", successGPU)
#endif
      nvtx_range_pop("gpu_memcpy: c_dev->c")
      call obj%timer%stop("gpu_memcpy")
#endif /* !defined(DEVICE_POINTER) */

      deallocate(tmp1_full, stat=istat, errmsg=errorMessage)
      call check_alloc("elpa_pxgemm_multiply", "tmp1_full", istat, errorMessage)

      successGPU = gpu_free(tmp1_full_dev)
      check_dealloc_gpu("elpa_pxgemm_multiply: tmp1_full_dev", successGPU)
    if ((.not. a_transposed) .and. (.not. b_transposed) .or. &
        a_transposed .and. b_transposed) then
      if (wantDebug .and. myid==0) print *, "elpa_pxgemm_multiply NEW: SQUARE_GRID start: NN or TT"
      allocate(aux_a_full(l_rows, nblk_mult), stat=istat, errmsg=errorMessage)
      check_allocate("elpa_pxgemm_multiply: aux_a_full", istat, errorMessage)

      allocate(aux_b_full(nblk_mult, l_cols), stat=istat, errmsg=errorMessage)
      check_allocate("elpa_pxgemm_multiply: aux_b_full", istat, errorMessage)

      if (useGPU .and. l_rows /= ldc) then
        print *, "elpa_pxgemm_multiply: Error: case ldc != lda is not implemented yet for NN and TT on GPU"

      successGPU = gpu_malloc(aux_a_full_dev, l_rows*nblk_mult*size_of_datatype)
      check_alloc_gpu("elpa_pxgemm_multiply: aux_a_full_dev", successGPU)

      successGPU = gpu_malloc(aux_b_full_dev, nblk_mult*l_cols*size_of_datatype)
      check_alloc_gpu("elpa_pxgemm_multiply: aux_b_full_dev", successGPU)

      if (a_transposed) then
        if (.not. useCCL) then
          allocate(at_col(l_rows, l_cols), stat=istat, errmsg=errorMessage)
          check_allocate("elpa_pxgemm_multiply: at_col", istat, errorMessage)

        successGPU = gpu_malloc(at_col_dev, l_rows*l_cols*size_of_datatype)
        check_alloc_gpu("elpa_pxgemm_multiply: at_col_dev", successGPU)
      if (b_transposed) then
        if (.not. useCCL) then
          allocate(bt_row(l_rows, l_cols), stat=istat, errmsg=errorMessage)
          check_allocate("elpa_pxgemm_multiply: bt_row", istat, errorMessage)

        successGPU = gpu_malloc(bt_row_dev, l_rows*l_cols*size_of_datatype)
        check_alloc_gpu("elpa_pxgemm_multiply: bt_row_dev", successGPU)

      if (a_transposed .or. b_transposed) then
        LCM = least_common_multiple(np_rows, np_cols)*nblk
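
        ! Buffers for the explicit transpose (elpa_transpose_row_or_col): one
        ! block of the LCM decomposition, at most
        ! nblk_mult_rows_max x nblk_mult_cols_max entries per send/recv/self.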
        successGPU = gpu_malloc(buf_send_dev, nblk_mult_rows_max*nblk_mult_cols_max*size_of_datatype)
        check_alloc_gpu("elpa_pxgemm_multiply: buf_send_dev", successGPU)

        successGPU = gpu_malloc(buf_recv_dev, nblk_mult_rows_max*nblk_mult_cols_max*size_of_datatype)
        check_alloc_gpu("elpa_pxgemm_multiply: buf_recv_dev", successGPU)

        successGPU = gpu_malloc(buf_self_dev, nblk_mult_rows_max*nblk_mult_cols_max*size_of_datatype)
        check_alloc_gpu("elpa_pxgemm_multiply: buf_self_dev", successGPU)

        allocate(buf_send(nblk_mult_rows_max, nblk_mult_cols_max), stat=istat, errmsg=errorMessage)
        check_allocate("elpa_pxgemm_multiply: buf_send", istat, errorMessage)

        allocate(buf_recv(nblk_mult_rows_max, nblk_mult_cols_max), stat=istat, errmsg=errorMessage)
        check_allocate("elpa_pxgemm_multiply: buf_recv", istat, errorMessage)

        allocate(buf_self(nblk_mult_rows_max, nblk_mult_cols_max), stat=istat, errmsg=errorMessage)
        check_allocate("elpa_pxgemm_multiply: buf_self", istat, errorMessage)

#if defined(DEVICE_POINTER)
        allocate(a(obj%local_nrows, obj%local_ncols), stat=istat, errmsg=errorMessage)
        check_allocate("elpa_pxgemm_multiply: a", istat, errorMessage)

        allocate(b(ldb, ldbCols), stat=istat, errmsg=errorMessage)
        check_allocate("elpa_pxgemm_multiply: b", istat, errorMessage)

        successGPU = gpu_memcpy(int(loc(a),kind=c_intptr_t), a_dev, &
                                obj%local_nrows*obj%local_ncols*size_of_datatype, gpuMemcpyDeviceToHost)
        check_memcpy_gpu("elpa_pxgemm_multiply: a_dev -> a", successGPU)

        successGPU = gpu_memcpy(int(loc(b),kind=c_intptr_t), b_dev, &
                                ldb*ldbCols*size_of_datatype, gpuMemcpyDeviceToHost)
        check_memcpy_gpu("elpa_pxgemm_multiply: b_dev -> b", successGPU)
#endif /* defined(DEVICE_POINTER) */
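
      ! NN/TT main loop: for TT, a (and/or b) is transposed explicitly each
      ! pass (at_col/bt_row), so the local kernel is always a plain NN GEMM;
      ! panels of a are broadcast along process columns from rank np_bc, panels
      ! of b along process rows from rank np, and every rank accumulates into c.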
      call obj%timer%start("main_loop_square_grid_nn_tt")

      nvtx_range_push("np = 0, np_rows-1")

        if (a_transposed) then
          call elpa_transpose_row_or_col_ccl_&
               &MATH_DATATYPE&
               &_&
               &PRECISION &
               (obj, 'R', a_dev, at_col_dev, buf_send_dev, buf_recv_dev, buf_self_dev, np, l_rows, l_cols, &
                nblk_mult_rows_max, nblk_mult_cols_max, debug)

          call elpa_transpose_row_or_col_&
               &MATH_DATATYPE&
               &_&
               &PRECISION &
               (obj, 'R', a, at_col, buf_send, buf_recv, buf_self, np, l_rows, l_cols, &
                nblk_mult_rows_max, nblk_mult_cols_max, debug)

        if (b_transposed) then
          call elpa_transpose_row_or_col_ccl_&
               &MATH_DATATYPE&
               &_&
               &PRECISION &
               (obj, 'C', b_dev, bt_row_dev, buf_send_dev, buf_recv_dev, buf_self_dev, np, l_rows, l_cols, &
                nblk_mult_rows_max, nblk_mult_cols_max, debug)

          call elpa_transpose_row_or_col_&
               &MATH_DATATYPE&
               &_&
               &PRECISION &
               (obj, 'C', b, bt_row, buf_send, buf_recv, buf_self, np, l_rows, l_cols, &
                nblk_mult_rows_max, nblk_mult_cols_max, debug)
        if (np_bc == my_pcol) then
          if (a_transposed) then
            if (.not. useCCL) then
#ifdef WITH_GPU_STREAMS
              call gpu_memcpy_async_and_stream_synchronize &
                   ("elpa_pxgemm_multiply: at_col -> at_col_dev", at_col_dev, 0_c_intptr_t, at_col(1:l_rows,1:l_cols), &
                    1, 1, num*size_of_datatype, gpuMemcpyHostToDevice, my_stream, .false., .true., .false.)
#else
              successGPU = gpu_memcpy(at_col_dev, int(loc(at_col),kind=c_intptr_t), num*size_of_datatype, gpuMemcpyHostToDevice)
              check_memcpy_gpu("elpa_pxgemm_multiply: at_col -> at_col_dev", successGPU)
#endif

            call gpu_copy_aux_full(PRECISION_CHAR, aux_a_full_dev, at_col_dev, &
                                   l_rows, l_cols, l_rows, obj%local_nrows, debug, my_stream)
          else
            call gpu_copy_aux_full(PRECISION_CHAR, aux_a_full_dev, a_dev, &
                                   l_rows, l_cols, l_rows, obj%local_nrows, debug, my_stream)

          if (a_transposed) then
            aux_a_full(1:l_rows,1:l_cols) = at_col(1:l_rows,1:l_cols)
          else
            aux_a_full(1:l_rows,1:l_cols) = a(1:l_rows,1:l_cols)

        if (np == my_prow) then
          if (b_transposed) then
            if (.not. useCCL) then
#ifdef WITH_GPU_STREAMS
              call gpu_memcpy_async_and_stream_synchronize &
                   ("elpa_pxgemm_multiply: bt_row -> bt_row_dev", bt_row_dev, 0_c_intptr_t, bt_row(1:l_rows,1:l_cols), &
                    1, 1, num*size_of_datatype, gpuMemcpyHostToDevice, my_stream, .false., .true., .false.)
#else
              successGPU = gpu_memcpy(bt_row_dev, int(loc(bt_row),kind=c_intptr_t), num*size_of_datatype, gpuMemcpyHostToDevice)
              check_memcpy_gpu("elpa_pxgemm_multiply: bt_row -> bt_row_dev", successGPU)
#endif

            call gpu_copy_aux_full(PRECISION_CHAR, aux_b_full_dev, bt_row_dev, &
                                   l_rows, l_cols, nblk_mult, ldb, debug, my_stream)
          else
            call gpu_copy_aux_full(PRECISION_CHAR, aux_b_full_dev, b_dev, &
                                   l_rows, l_cols, nblk_mult, ldb, debug, my_stream)

          if (b_transposed) then
            aux_b_full(1:l_rows,1:l_cols) = bt_row(1:l_rows,1:l_cols)
          else
            aux_b_full(1:l_rows,1:l_cols) = b(1:l_rows,1:l_cols)
        if (useGPU .and. .not. useCCL) then
          num_a = l_rows*nblk_mult
          num_b = nblk_mult*l_cols
#ifdef WITH_GPU_STREAMS
          call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_a_full_dev -> aux_a_full", &
               aux_a_full_dev, 0_c_intptr_t, aux_a_full(1:l_rows,1:nblk_mult), &
               1, 1, num_a*size_of_datatype, gpuMemcpyDeviceToHost, my_stream, .false., .false., .false.)
          call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_b_full_dev -> aux_b_full", &
               aux_b_full_dev, 0_c_intptr_t, aux_b_full(1:nblk_mult,1:l_cols), &
               1, 1, num_b*size_of_datatype, gpuMemcpyDeviceToHost, my_stream, .false., .true., .false.)
#else
          successGPU = gpu_memcpy(int(loc(aux_a_full),kind=c_intptr_t), aux_a_full_dev, num_a*size_of_datatype, gpuMemcpyDeviceToHost)
          check_memcpy_gpu("elpa_pxgemm_multiply: aux_a_full_dev -> aux_a_full", successGPU)

          successGPU = gpu_memcpy(int(loc(aux_b_full),kind=c_intptr_t), aux_b_full_dev, num_b*size_of_datatype, gpuMemcpyDeviceToHost)
          check_memcpy_gpu("elpa_pxgemm_multiply: aux_b_full_dev -> aux_b_full", successGPU)
#endif

        call obj%timer%start("ccl_bcast")
        nvtx_range_push("ccl_bcast aux_a_full_dev, aux_b_full_dev")

        successGPU = ccl_bcast(aux_a_full_dev, aux_a_full_dev, int(k_datatype*l_rows*nblk_mult,kind=c_size_t), cclDataType, &
                               int(np_bc,kind=c_int), ccl_comm_cols, my_stream)

        successGPU2 = ccl_bcast(aux_b_full_dev, aux_b_full_dev, int(k_datatype*nblk_mult*l_cols,kind=c_size_t), cclDataType, &
                                int(np,kind=c_int), ccl_comm_rows, my_stream)

        if (.not. (successGPU .and. successGPU2)) then
          print *, "Error in ccl_bcast"

        successGPU = gpu_stream_synchronize(my_stream)
        check_stream_synchronize_gpu("elpa_pxgemm_multiply: ccl_bcast", successGPU)

        nvtx_range_pop("ccl_bcast aux_a_full_dev, aux_b_full_dev")
        call obj%timer%stop("ccl_bcast")

        call obj%timer%start("mpi_communication")
        nvtx_range_push("MPI_Bcast: aux_a_full, aux_b_full")

        call mpi_bcast(aux_a_full, int(l_rows*nblk_mult,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
                       int(np_bc,kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
        call mpi_bcast(aux_b_full, int(nblk_mult*l_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
                       int(np,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), mpierr)

        nvtx_range_pop("MPI_Bcast: aux_a_full, aux_b_full")
        call obj%timer%stop("mpi_communication")
        call obj%timer%start("mpi_communication")
        nvtx_range_push("MPI_Bcast: l_cols_source, l_rows_source")
        l_cols_source = l_cols
        l_rows_source = l_rows

        call mpi_bcast(l_cols_source, int(1,kind=MPI_KIND), MPI_INTEGER, &
                       int(np_bc,kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
        call mpi_bcast(l_rows_source, int(1,kind=MPI_KIND), MPI_INTEGER, &
                       int(np,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), mpierr)
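
        ! nblk_mult_min: effective inner (k) dimension of this pass's GEMM,
        ! bounded by the local sizes on the two broadcast roots.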
        nblk_mult_min = min(l_rows_source, l_cols_source)
        nvtx_range_pop("MPI_Bcast: l_cols_source, l_rows_source")
        call obj%timer%stop("mpi_communication")
        if (useGPU .and. .not. useCCL) then
          num_a = l_rows*nblk_mult
          num_b = nblk_mult*l_cols
#ifdef WITH_GPU_STREAMS
          call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_a_full -> aux_a_full_dev", &
               aux_a_full_dev, 0_c_intptr_t, aux_a_full(1:l_rows,1:nblk_mult), &
               1, 1, num_a*size_of_datatype, gpuMemcpyHostToDevice, my_stream, .false., .false., .false.)
          call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_b_full -> aux_b_full_dev", &
               aux_b_full_dev, 0_c_intptr_t, aux_b_full(1:nblk_mult,1:l_cols), &
               1, 1, num_b*size_of_datatype, gpuMemcpyHostToDevice, my_stream, .false., .true., .false.)
#else
          successGPU = gpu_memcpy(aux_a_full_dev, int(loc(aux_a_full),kind=c_intptr_t), num_a*size_of_datatype, gpuMemcpyHostToDevice)
          check_memcpy_gpu("elpa_pxgemm_multiply: aux_a_full -> aux_a_full_dev", successGPU)

          successGPU = gpu_memcpy(aux_b_full_dev, int(loc(aux_b_full),kind=c_intptr_t), num_b*size_of_datatype, gpuMemcpyHostToDevice)
          check_memcpy_gpu("elpa_pxgemm_multiply: aux_b_full -> aux_b_full_dev", successGPU)
#endif

#else /* WITH_MPI */
        nblk_mult_min = l_rows
#endif /* WITH_MPI */
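
        ! beta switches the GEMM from overwrite (zero, first pass over np) to
        ! accumulate (one) on all later passes.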
        if (np>0) beta = one

        call obj%timer%start("gpublas")
        nvtx_range_push("gpublas")
        call gpublas_PRECISION_GEMM('N', 'N', &
                                    l_rows, l_cols, nblk_mult_min, one, &
                                    aux_a_full_dev, l_rows, &
                                    aux_b_full_dev, nblk_mult, beta, &
                                    c_dev, l_rows, gpuHandle)
        if (wantDebug) successGPU = gpu_devicesynchronize()
        nvtx_range_pop("gpublas")
        call obj%timer%stop("gpublas")

        call obj%timer%start("blas")
        call PRECISION_GEMM('N', 'N', &
                            int(l_rows, kind=BLAS_KIND), &
                            int(l_cols, kind=BLAS_KIND), &
                            int(nblk_mult_min, kind=BLAS_KIND), one, &
                            aux_a_full(1:l_rows, 1:nblk_mult_min), int(l_rows,kind=BLAS_KIND), &
                            aux_b_full(1:nblk_mult_min, 1:l_cols), int(nblk_mult_min,kind=BLAS_KIND), beta, &
                            c(1:l_rows, 1:l_cols), int(l_rows,kind=BLAS_KIND))
        call obj%timer%stop("blas")

      nvtx_range_pop("np = 0, np_rows-1")

      call obj%timer%stop("main_loop_square_grid_nn_tt")
#if !defined(DEVICE_POINTER)
      call obj%timer%start("gpu_memcpy")
      nvtx_range_push("gpu_memcpy: c_dev->c")

#ifdef WITH_GPU_STREAMS
      call gpu_memcpy_async_and_stream_synchronize &
           ("elpa_pxgemm_multiply: c_dev -> c", c_dev, 0_c_intptr_t, c(1:l_rows,1:l_cols), &
            1, 1, num*size_of_datatype, gpuMemcpyDeviceToHost, my_stream, .false., .true., .false.)
#else
      successGPU = gpu_memcpy(int(loc(c),kind=c_intptr_t), c_dev, num*size_of_datatype, gpuMemcpyDeviceToHost)
      check_memcpy_gpu("elpa_pxgemm_multiply: c_dev -> c", successGPU)
#endif
      nvtx_range_pop("gpu_memcpy: c_dev->c")
      call obj%timer%stop("gpu_memcpy")
#endif /* !defined(DEVICE_POINTER) */
      if (a_transposed) then
        if (.not. useCCL) then
          deallocate(at_col, stat=istat, errmsg=errorMessage)
          call check_alloc("elpa_pxgemm_multiply", "at_col", istat, errorMessage)

        successGPU = gpu_free(at_col_dev)
        check_dealloc_gpu("elpa_pxgemm_multiply: at_col_dev", successGPU)

      if (b_transposed) then
        if (.not. useCCL) then
          deallocate(bt_row, stat=istat, errmsg=errorMessage)
          call check_alloc("elpa_pxgemm_multiply", "bt_row", istat, errorMessage)

        successGPU = gpu_free(bt_row_dev)
        check_dealloc_gpu("elpa_pxgemm_multiply: bt_row_dev", successGPU)

      if (a_transposed .or. b_transposed) then
        successGPU = gpu_free(buf_send_dev)
        check_dealloc_gpu("elpa_pxgemm_multiply: buf_send_dev", successGPU)

        successGPU = gpu_free(buf_recv_dev)
        check_dealloc_gpu("elpa_pxgemm_multiply: buf_recv_dev", successGPU)

        successGPU = gpu_free(buf_self_dev)
        check_dealloc_gpu("elpa_pxgemm_multiply: buf_self_dev", successGPU)

        deallocate(buf_send, buf_recv, buf_self, stat=istat, errmsg=errorMessage)
        call check_alloc("elpa_pxgemm_multiply", "buf_send, buf_recv, buf_self", istat, errorMessage)

#if defined(DEVICE_POINTER)
        deallocate(a, b, stat=istat, errmsg=errorMessage)
        call check_alloc("elpa_pxgemm_multiply", "a, b", istat, errorMessage)
  if (.not. isSquareGrid) then
    LCM = least_common_multiple(np_rows, np_cols)*nblk
    nblk_mult_rows_max = (l_rows_max*np_rows+LCM-1)/LCM*nblk
    nblk_mult_cols_max = (l_cols_max*np_cols+LCM-1)/LCM*nblk
    nblk_mult = max(nblk_mult_rows_max, nblk_mult_cols_max)

    if (wantDebug .and. myid==0) then
      print *, "LCM = ", LCM
      print *, "l_rows_max", l_rows_max
      print *, "l_cols_max", l_cols_max
      print *, "l_rows", l_rows
      print *, "l_cols", l_cols
      print *, "np_rows", np_rows
      print *, "np_cols", np_cols
      print *, "nblk_mult = ", nblk_mult

    np_rows_fine = least_common_multiple(np_rows, np_cols)
    np_cols_fine = np_rows_fine
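
    ! "Fine" grid: both directions are refined to lcm(np_rows, np_cols)
    ! virtual ranks, so the row and column decompositions become commensurate;
    ! a fine index is mapped back to a real rank via mod(np_fine, np_rows/np_cols).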
    if ((.not. a_transposed) .and. (.not. b_transposed) .or. &
        a_transposed .and. b_transposed) then
      if (wantDebug .and. myid==0) print *, "elpa_pxgemm_multiply NEW: NON-SQUARE_GRID start: NN or TT or universal"

      allocate(aux_a_full(l_rows, nblk_mult), stat=istat, errmsg=errorMessage)
      check_allocate("elpa_pxgemm_multiply: aux_a_full", istat, errorMessage)

      allocate(aux_b_full(nblk_mult, l_cols), stat=istat, errmsg=errorMessage)
      check_allocate("elpa_pxgemm_multiply: aux_b_full", istat, errorMessage)

      successGPU = gpu_malloc(aux_a_full_dev, l_rows*nblk_mult*size_of_datatype)
      check_alloc_gpu("elpa_pxgemm_multiply: aux_a_full_dev", successGPU)

      successGPU = gpu_malloc(aux_b_full_dev, nblk_mult*l_cols*size_of_datatype)
      check_alloc_gpu("elpa_pxgemm_multiply: aux_b_full_dev", successGPU)

      if (useGPU .and. l_rows /= ldc) then
        print *, "elpa_pxgemm_multiply: Error: case ldc != lda is not implemented yet for NN and TT on GPU"
      if (a_transposed) then
        if (.not. useCCL) then
          allocate(at_col(l_rows, l_cols), stat=istat, errmsg=errorMessage)
          check_allocate("elpa_pxgemm_multiply: at_col", istat, errorMessage)

        successGPU = gpu_malloc(at_col_dev, l_rows*l_cols*size_of_datatype)
        check_alloc_gpu("elpa_pxgemm_multiply: at_col_dev", successGPU)

      if (b_transposed) then
        if (.not. useCCL) then
          allocate(bt_row(l_rows, l_cols), stat=istat, errmsg=errorMessage)
          check_allocate("elpa_pxgemm_multiply: bt_row", istat, errorMessage)

        successGPU = gpu_malloc(bt_row_dev, l_rows*l_cols*size_of_datatype)
        check_alloc_gpu("elpa_pxgemm_multiply: bt_row_dev", successGPU)

      if (a_transposed .or. b_transposed) then
        successGPU = gpu_malloc(buf_send_dev, nblk_mult_rows_max*nblk_mult_cols_max*size_of_datatype)
        check_alloc_gpu("elpa_pxgemm_multiply: buf_send_dev", successGPU)

        successGPU = gpu_malloc(buf_recv_dev, nblk_mult_rows_max*nblk_mult_cols_max*size_of_datatype)
        check_alloc_gpu("elpa_pxgemm_multiply: buf_recv_dev", successGPU)

        successGPU = gpu_malloc(buf_self_dev, nblk_mult_rows_max*nblk_mult_cols_max*size_of_datatype)
        check_alloc_gpu("elpa_pxgemm_multiply: buf_self_dev", successGPU)

        allocate(buf_send(nblk_mult_rows_max, nblk_mult_cols_max), stat=istat, errmsg=errorMessage)
        check_allocate("elpa_pxgemm_multiply: buf_send", istat, errorMessage)

        allocate(buf_recv(nblk_mult_rows_max, nblk_mult_cols_max), stat=istat, errmsg=errorMessage)
        check_allocate("elpa_pxgemm_multiply: buf_recv", istat, errorMessage)

        allocate(buf_self(nblk_mult_rows_max, nblk_mult_cols_max), stat=istat, errmsg=errorMessage)
        check_allocate("elpa_pxgemm_multiply: buf_self", istat, errorMessage)

#if defined(DEVICE_POINTER)
        allocate(a(obj%local_nrows, obj%local_ncols), stat=istat, errmsg=errorMessage)
        check_allocate("elpa_pxgemm_multiply: a", istat, errorMessage)

        allocate(b(ldb, ldbCols), stat=istat, errmsg=errorMessage)
        check_allocate("elpa_pxgemm_multiply: b", istat, errorMessage)

        successGPU = gpu_memcpy(int(loc(a),kind=c_intptr_t), a_dev, &
                                obj%local_nrows*obj%local_ncols*size_of_datatype, gpuMemcpyDeviceToHost)
        check_memcpy_gpu("elpa_pxgemm_multiply: a_dev -> a", successGPU)

        successGPU = gpu_memcpy(int(loc(b),kind=c_intptr_t), b_dev, &
                                ldb*ldbCols*size_of_datatype, gpuMemcpyDeviceToHost)
        check_memcpy_gpu("elpa_pxgemm_multiply: b_dev -> b", successGPU)
#endif /* defined(DEVICE_POINTER) */
      call obj%timer%start("main_loop_nn_tt")

      do np_fine = 0, np_rows_fine-1
        nvtx_range_push("np_fine = 0, np_rows_fine-1")
        if (wantDebug .and. myid==0) print *, "np_fine = ", np_fine

        np    = mod(np_fine, np_rows)
        np_bc = mod(np_bc_fine, np_cols)
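
        ! Map the fine-grid indices back to the real grid: np / np_bc are the
        ! process row / column acting as broadcast roots in this pass
        ! (np_bc_fine is the fine column index set up for this iteration).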
        if (a_transposed) then
          call elpa_transpose_row_or_col_ccl_&
               &MATH_DATATYPE&
               &_&
               &PRECISION &
               (obj, 'R', a_dev, at_col_dev, buf_send_dev, buf_recv_dev, buf_self_dev, np_fine, l_rows, l_cols, &
                nblk_mult_rows_max, nblk_mult_cols_max, debug)

          call elpa_transpose_row_or_col_&
               &MATH_DATATYPE&
               &_&
               &PRECISION &
               (obj, 'R', a, at_col, buf_send, buf_recv, buf_self, np_fine, l_rows, l_cols, &
                nblk_mult_rows_max, nblk_mult_cols_max, debug)

        if (b_transposed) then
          call elpa_transpose_row_or_col_ccl_&
               &MATH_DATATYPE&
               &_&
               &PRECISION &
               (obj, 'C', b_dev, bt_row_dev, buf_send_dev, buf_recv_dev, buf_self_dev, np_fine, l_rows, l_cols, &
                nblk_mult_rows_max, nblk_mult_cols_max, debug)

          call elpa_transpose_row_or_col_&
               &MATH_DATATYPE&
               &_&
               &PRECISION &
               (obj, 'C', b, bt_row, buf_send, buf_recv, buf_self, np_fine, l_rows, l_cols, &
                nblk_mult_rows_max, nblk_mult_cols_max, debug)

        if (mod(np_bc_fine,np_cols) == my_pcol) then
          if (a_transposed) then
            if (.not. useCCL) then
#ifdef WITH_GPU_STREAMS
              call gpu_memcpy_async_and_stream_synchronize &
                   ("elpa_pxgemm_multiply: at_col -> at_col_dev", at_col_dev, 0_c_intptr_t, at_col(1:l_rows,1:l_cols), &
                    1, 1, num*size_of_datatype, gpuMemcpyHostToDevice, my_stream, .false., .true., .false.)
#else
              successGPU = gpu_memcpy(at_col_dev, int(loc(at_col),kind=c_intptr_t), num*size_of_datatype, gpuMemcpyHostToDevice)
              check_memcpy_gpu("elpa_pxgemm_multiply: at_col -> at_col_dev", successGPU)
#endif

            call gpu_copy_and_set_zeros_aux_a_full(PRECISION_CHAR, at_col_dev, aux_a_full_dev, l_rows, l_cols, &
                                                   nblk_mult_cols, nblk, np_bc_fine, np_cols_fine, np_cols, debug, my_stream)
          else
            call gpu_copy_and_set_zeros_aux_a_full(PRECISION_CHAR, a_dev, aux_a_full_dev, l_rows, l_cols, &
                                                   nblk_mult_cols, nblk, np_bc_fine, np_cols_fine, np_cols, debug, my_stream)

          do j_block_loc_fine = 0, (nblk_mult_cols+nblk-1)/nblk - 1
            nblk_cut = min(nblk, nblk_mult_cols-j_block_loc_fine*nblk)
            j_block_loc = (np_bc_fine + j_block_loc_fine*np_cols_fine)/np_cols
            if (a_transposed) then
              aux_a_full(1:l_rows, 1+j_block_loc_fine*nblk : nblk_cut+j_block_loc_fine*nblk) = &
              at_col(1:l_rows, 1+j_block_loc*nblk : nblk_cut+j_block_loc*nblk)
            else
              aux_a_full(1:l_rows, 1+j_block_loc_fine*nblk : nblk_cut+j_block_loc_fine*nblk) = &
              a(1:l_rows, 1+j_block_loc*nblk : nblk_cut+j_block_loc*nblk)
        if (mod(np_fine,np_rows) == my_prow) then
          if (b_transposed) then
            if (.not. useCCL) then
#ifdef WITH_GPU_STREAMS
              call gpu_memcpy_async_and_stream_synchronize &
                   ("elpa_pxgemm_multiply: bt_row -> bt_row_dev", bt_row_dev, 0_c_intptr_t, bt_row(1:l_rows,1:l_cols), &
                    1, 1, num*size_of_datatype, gpuMemcpyHostToDevice, my_stream, .false., .true., .false.)
#else
              successGPU = gpu_memcpy(bt_row_dev, int(loc(bt_row),kind=c_intptr_t), num*size_of_datatype, gpuMemcpyHostToDevice)
              check_memcpy_gpu("elpa_pxgemm_multiply: bt_row -> bt_row_dev", successGPU)
#endif

            call gpu_copy_and_set_zeros_aux_b_full(PRECISION_CHAR, bt_row_dev, aux_b_full_dev, l_rows, l_cols, nblk_mult, &
                                                   nblk_mult_rows, nblk, np_fine, np_rows_fine, np_rows, SM_count, debug, my_stream)
          else
            call gpu_copy_and_set_zeros_aux_b_full(PRECISION_CHAR, b_dev, aux_b_full_dev, l_rows, l_cols, nblk_mult, &
                                                   nblk_mult_rows, nblk, np_fine, np_rows_fine, np_rows, SM_count, debug, my_stream)

          do i_block_loc_fine = 0, (nblk_mult_rows+nblk-1)/nblk - 1
            nblk_cut = min(nblk, nblk_mult_rows-i_block_loc_fine*nblk)
            i_block_loc = (np_fine + i_block_loc_fine*np_rows_fine)/np_rows
            if (b_transposed) then
              aux_b_full(1+i_block_loc_fine*nblk : nblk_cut+i_block_loc_fine*nblk, 1:l_cols) = &
              bt_row(1+i_block_loc*nblk : nblk_cut+i_block_loc*nblk, 1:l_cols)
            else
              aux_b_full(1+i_block_loc_fine*nblk : nblk_cut+i_block_loc_fine*nblk, 1:l_cols) = &
              b(1+i_block_loc*nblk : nblk_cut+i_block_loc*nblk, 1:l_cols)
        num_a = l_rows*nblk_mult
        num_b = nblk_mult*l_cols
        if (useGPU .and. .not. useCCL) then
#ifdef WITH_GPU_STREAMS
          call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_a_full_dev -> aux_a_full", &
               aux_a_full_dev, 0_c_intptr_t, aux_a_full(1:l_rows,1:nblk_mult), &
               1, 1, num_a*size_of_datatype, gpuMemcpyDeviceToHost, my_stream, .false., .false., .false.)
          call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_b_full_dev -> aux_b_full", &
               aux_b_full_dev, 0_c_intptr_t, aux_b_full(1:nblk_mult,1:l_cols), &
               1, 1, num_b*size_of_datatype, gpuMemcpyDeviceToHost, my_stream, .false., .true., .false.)
#else
          successGPU = gpu_memcpy(int(loc(aux_a_full),kind=c_intptr_t), aux_a_full_dev, num_a*size_of_datatype, &
                                  gpuMemcpyDeviceToHost)
          check_memcpy_gpu("elpa_pxgemm_multiply: aux_a_full_dev -> aux_a_full", successGPU)

          successGPU = gpu_memcpy(int(loc(aux_b_full),kind=c_intptr_t), aux_b_full_dev, num_b*size_of_datatype, &
                                  gpuMemcpyDeviceToHost)
          check_memcpy_gpu("elpa_pxgemm_multiply: aux_b_full_dev -> aux_b_full", successGPU)
#endif

        call obj%timer%start("ccl_bcast")
        nvtx_range_push("ccl_bcast aux_a_full_dev, aux_b_full_dev")

        successGPU = ccl_bcast(aux_a_full_dev, aux_a_full_dev, int(k_datatype*l_rows*nblk_mult,kind=c_size_t), cclDataType, &
                               int(np_bc,kind=c_int), ccl_comm_cols, my_stream)

        successGPU2 = ccl_bcast(aux_b_full_dev, aux_b_full_dev, int(k_datatype*nblk_mult*l_cols,kind=c_size_t), cclDataType, &
                                int(np,kind=c_int), ccl_comm_rows, my_stream)

        if (.not. (successGPU .and. successGPU2)) then
          print *, "Error in ccl_bcast"

        call mpi_bcast(nblk_mult_rows, 1_MPI_KIND, MPI_INTEGER, &
                       int(np,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), mpierr)

        successGPU = gpu_stream_synchronize(my_stream)
        check_stream_synchronize_gpu("elpa_pxgemm_multiply: ccl_bcast", successGPU)

        nvtx_range_pop("ccl_bcast aux_a_full_dev, aux_b_full_dev")
        call obj%timer%stop("ccl_bcast")

        call obj%timer%start("mpi_communication")
        nvtx_range_push("MPI_Bcast: aux_a_full, aux_b_full")

        call mpi_bcast(aux_a_full, int(l_rows*nblk_mult,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
                       int(np_bc,kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)

        call mpi_bcast(aux_b_full, int(nblk_mult*l_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
                       int(np,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), mpierr)
        call mpi_bcast(nblk_mult_rows, 1_MPI_KIND, MPI_INTEGER, &
                       int(np,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), mpierr)

        nvtx_range_pop("MPI_Bcast: aux_a_full, aux_b_full")
        call obj%timer%stop("mpi_communication")
        if (useGPU .and. .not. useCCL) then
          num_a = l_rows*nblk_mult
          num_b = nblk_mult*l_cols
#ifdef WITH_GPU_STREAMS
          call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_a_full -> aux_a_full_dev", &
               aux_a_full_dev, 0_c_intptr_t, aux_a_full(1:l_rows,1:nblk_mult), &
               1, 1, num_a*size_of_datatype, gpuMemcpyHostToDevice, my_stream, .false., .false., .false.)
          call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_b_full -> aux_b_full_dev", &
               aux_b_full_dev, 0_c_intptr_t, aux_b_full(1:nblk_mult,1:l_cols), &
               1, 1, num_b*size_of_datatype, gpuMemcpyHostToDevice, my_stream, .false., .true., .false.)
#else
          successGPU = gpu_memcpy(aux_a_full_dev, int(loc(aux_a_full),kind=c_intptr_t), num_a*size_of_datatype, &
                                  gpuMemcpyHostToDevice)
          check_memcpy_gpu("elpa_pxgemm_multiply: aux_a_full -> aux_a_full_dev", successGPU)

          successGPU = gpu_memcpy(aux_b_full_dev, int(loc(aux_b_full),kind=c_intptr_t), num_b*size_of_datatype, &
                                  gpuMemcpyHostToDevice)
          check_memcpy_gpu("elpa_pxgemm_multiply: aux_b_full -> aux_b_full_dev", successGPU)
#endif
#endif /* WITH_MPI */
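
        ! As in the square-grid case: overwrite c on the first fine pass
        ! (beta = zero), accumulate on all subsequent passes.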
        if (np_fine>0) beta = one

        call obj%timer%start("gpublas")
        nvtx_range_push("gpublas")
        call gpublas_PRECISION_GEMM('N', 'N', &
                                    l_rows, l_cols, nblk_mult_rows, one, &
                                    aux_a_full_dev, l_rows, &
                                    aux_b_full_dev, nblk_mult, beta, &
                                    c_dev, l_rows, gpuHandle)
        if (wantDebug) successGPU = gpu_devicesynchronize()
        nvtx_range_pop("gpublas")
        call obj%timer%stop("gpublas")

        call obj%timer%start("blas")
        call PRECISION_GEMM('N', 'N', &
                            int(l_rows, kind=BLAS_KIND), &
                            int(l_cols, kind=BLAS_KIND), &
                            int(nblk_mult_rows, kind=BLAS_KIND), one, &
                            aux_a_full(1:l_rows, 1:nblk_mult), int(l_rows,kind=BLAS_KIND), &
                            aux_b_full(1:nblk_mult, 1:l_cols), int(nblk_mult,kind=BLAS_KIND), beta, &
                            c(1:l_rows, 1:l_cols), int(l_rows,kind=BLAS_KIND))
        call obj%timer%stop("blas")

        nvtx_range_pop("np_fine = 0, np_rows_fine-1")

      call obj%timer%stop("main_loop_nn_tt")
#if !defined(DEVICE_POINTER)
      call obj%timer%start("gpu_memcpy")
      nvtx_range_push("gpu_memcpy: c_dev->c")

#ifdef WITH_GPU_STREAMS
      call gpu_memcpy_async_and_stream_synchronize &
           ("elpa_pxgemm_multiply: c_dev -> c", c_dev, 0_c_intptr_t, c(1:l_rows,1:l_cols), &
            1, 1, num*size_of_datatype, gpuMemcpyDeviceToHost, my_stream, .false., .true., .false.)
#else
      successGPU = gpu_memcpy(int(loc(c),kind=c_intptr_t), c_dev, num*size_of_datatype, gpuMemcpyDeviceToHost)
      check_memcpy_gpu("elpa_pxgemm_multiply: c_dev -> c", successGPU)
#endif
      nvtx_range_pop("gpu_memcpy: c_dev->c")
      call obj%timer%stop("gpu_memcpy")
#endif /* !defined(DEVICE_POINTER) */
      if (a_transposed) then
        if (.not. useCCL) then
          deallocate(at_col, stat=istat, errmsg=errorMessage)
          call check_alloc("elpa_pxgemm_multiply", "at_col", istat, errorMessage)

        successGPU = gpu_free(at_col_dev)
        check_dealloc_gpu("elpa_pxgemm_multiply: at_col_dev", successGPU)

      if (b_transposed) then
        if (.not. useCCL) then
          deallocate(bt_row, stat=istat, errmsg=errorMessage)
          call check_alloc("elpa_pxgemm_multiply", "bt_row", istat, errorMessage)

        successGPU = gpu_free(bt_row_dev)
        check_dealloc_gpu("elpa_pxgemm_multiply: bt_row_dev", successGPU)

      if (a_transposed .or. b_transposed) then
        successGPU = gpu_free(buf_send_dev)
        check_dealloc_gpu("elpa_pxgemm_multiply: buf_send_dev", successGPU)

        successGPU = gpu_free(buf_recv_dev)
        check_dealloc_gpu("elpa_pxgemm_multiply: buf_recv_dev", successGPU)

        successGPU = gpu_free(buf_self_dev)
        check_dealloc_gpu("elpa_pxgemm_multiply: buf_self_dev", successGPU)

        deallocate(buf_send, buf_recv, buf_self, stat=istat, errmsg=errorMessage)
        call check_alloc("elpa_pxgemm_multiply", "buf_send, buf_recv, buf_self", istat, errorMessage)

#if defined(DEVICE_POINTER)
        deallocate(a, b, stat=istat, errmsg=errorMessage)
        call check_alloc("elpa_pxgemm_multiply", "a, b", istat, errorMessage)
    if (.not. a_transposed .and. b_transposed .or. &
        a_transposed .and. .not. b_transposed) then
      if (wantDebug .and. myid==0) print *, "elpa_pxgemm_multiply NEW: NON-SQUARE_GRID start: TN or NT"

      if (a_transposed) then
        mpi_comm_dirs = mpi_comm_rows
        ccl_comm_dirs = obj%gpu_setup%ccl_comm_rows
      else if (b_transposed) then
        mpi_comm_dirs = mpi_comm_cols
        ccl_comm_dirs = obj%gpu_setup%ccl_comm_cols

      np_dirs_fine = least_common_multiple(np_rows, np_cols)

      nblk_mult_rows = nblk_mult
      nblk_mult_cols = nblk_mult

      nblk_mult_max = (na+LCM-1)/LCM * nblk
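
      ! nblk_mult_max: maximal panel extent along the transposed direction,
      ! i.e. ceil(na/LCM) blocks of width nblk on the fine grid.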
      if (a_transposed) then
        allocate(aux_a_full(nblk_mult, nblk_mult), stat=istat, errmsg=errorMessage)
        check_allocate("elpa_pxgemm_multiply: aux_a_full", istat, errorMessage)

        allocate(aux_b_full(nblk_mult, nblk_mult_max*(np_dirs_fine/np_dirs_t)), stat=istat, errmsg=errorMessage)
        check_allocate("elpa_pxgemm_multiply: aux_b_full", istat, errorMessage)

        allocate(tmp1_full(nblk_mult, nblk_mult_max*(np_dirs_fine/np_dirs_t)), stat=istat, errmsg=errorMessage)
        check_allocate("elpa_pxgemm_multiply: tmp1_full", istat, errorMessage)

        successGPU = gpu_malloc(aux_a_full_dev, nblk_mult*nblk_mult*size_of_datatype)
        check_alloc_gpu("elpa_pxgemm_multiply: aux_a_full_dev", successGPU)

        successGPU = gpu_malloc(aux_b_full_dev, nblk_mult*nblk_mult_max*(np_dirs_fine/np_dirs_t)*size_of_datatype)
        check_alloc_gpu("elpa_pxgemm_multiply: aux_b_full_dev", successGPU)

        successGPU = gpu_malloc(tmp1_full_dev, nblk_mult*nblk_mult_max*(np_dirs_fine/np_dirs_t)*size_of_datatype)
        check_alloc_gpu("elpa_pxgemm_multiply: tmp1_full_dev", successGPU)

      else if (b_transposed) then
        allocate(aux_a_full(nblk_mult_max*(np_dirs_fine/np_dirs_t), nblk_mult), stat=istat, errmsg=errorMessage)
        check_allocate("elpa_pxgemm_multiply: aux_a_full", istat, errorMessage)

        allocate(aux_b_full(nblk_mult, nblk_mult), stat=istat, errmsg=errorMessage)
        check_allocate("elpa_pxgemm_multiply: aux_b_full", istat, errorMessage)

        allocate(tmp1_full(nblk_mult_max*(np_dirs_fine/np_dirs_t), nblk_mult), stat=istat, errmsg=errorMessage)
        check_allocate("elpa_pxgemm_multiply: tmp1_full", istat, errorMessage)

        successGPU = gpu_malloc(aux_a_full_dev, nblk_mult_max*(np_dirs_fine/np_dirs_t)*nblk_mult*size_of_datatype)
        check_alloc_gpu("elpa_pxgemm_multiply: aux_a_full_dev", successGPU)

        successGPU = gpu_malloc(aux_b_full_dev, nblk_mult*nblk_mult*size_of_datatype)
        check_alloc_gpu("elpa_pxgemm_multiply: aux_b_full_dev", successGPU)

        successGPU = gpu_malloc(tmp1_full_dev, nblk_mult_max*(np_dirs_fine/np_dirs_t)*nblk_mult*size_of_datatype)
        check_alloc_gpu("elpa_pxgemm_multiply: tmp1_full_dev", successGPU)
      call obj%timer%start("main_loop_tn_nt")

      do np_fine = 0, np_dirs_fine-1
        nvtx_range_push("do np_fine")
        if (wantDebug .and. myid==0) print *, "np_fine = ", np_fine

        np   = mod(np_fine, np_dirs)
        np_t = mod(np_t_fine, np_dirs_t)

        do np_ab_fine = 0, np_dirs_fine-1
          if (mod(np_ab_fine,np_dirs)/=my_pdir) cycle
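
          ! Gather this rank's fine-grid blocks of a and b into the padded aux
          ! buffers (GPU kernel below, CPU loops further down), zeroing unused
          ! block parts so the following GEMM sees well-defined data.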
          call obj%timer%start("gpu_copy_and_set_zeros_aux_ab_full_tn")
          call gpu_copy_and_set_zeros_aux_ab_full_tn_nt(PRECISION_CHAR, a_transposed_int, &
               a_dev, b_dev, aux_a_full_dev, aux_b_full_dev, &
               l_rows, l_cols, nblk_mult_max, nblk_mult, nblk, &
               np_ab_fine, np_rows, my_prow, &
               np_t_fine, np_cols, my_pcol, &
               np_dirs_fine, SM_count, debug, my_stream)
          call obj%timer%stop("gpu_copy_and_set_zeros_aux_ab_full_tn")
1647 call obj%timer%start(
"copy_and_set_zeros_aux_ab_full_tn")
1649 if (a_transposed)
then
1650 if (mod(np_t_fine,np_cols) == my_pcol)
then
1651 do j_block_loc_fine = 0, nblk_mult_max/nblk-1
1652 j_block_loc = (np_t_fine + j_block_loc_fine*np_cols_fine)/np_cols
1654 do i_block_loc_fine = 0, nblk_mult/nblk-1
1655 i_block_loc = (np_ab_fine + i_block_loc_fine*np_rows_fine)/np_rows
1657 nblk_cols_cut = min(nblk, l_cols - j_block_loc*nblk)
1658 nblk_rows_cut = min(nblk, l_rows - i_block_loc*nblk)
1660 if (nblk_rows_cut>0 .and. nblk_cols_cut>0)
then
1661 aux_a_full(1+i_block_loc_fine*nblk : nblk_rows_cut+i_block_loc_fine*nblk, &
1662 1+j_block_loc_fine*nblk : nblk_cols_cut+j_block_loc_fine*nblk) = &
1663 a(1+i_block_loc*nblk : nblk_rows_cut+i_block_loc*nblk, &
1664 1+j_block_loc*nblk : nblk_cols_cut+j_block_loc*nblk)
1667 call set_zeros_in_unused_block_part_&
1671 (aux_a_full, nblk, nblk_rows_cut, nblk_cols_cut, &
1672 i_block_loc_fine, j_block_loc_fine, 0, 0)
1679 do dnp_ab_t = 0, np_dirs_fine/np_dirs_t-1
1680 np_ab_t_fine = dnp_ab_t*np_dirs_t + my_pdir_t
1682 do j_block_loc_fine = 0, nblk_mult_max/nblk-1
1683 j_block_loc = (np_ab_t_fine + j_block_loc_fine*np_cols_fine)/np_cols
1685 do i_block_loc_fine = 0, nblk_mult/nblk-1
1686 i_block_loc = (np_ab_fine + i_block_loc_fine*np_rows_fine)/np_rows
1688 nblk_rows_cut = min(nblk, l_rows - i_block_loc*nblk)
1689 nblk_cols_cut = min(nblk, l_cols - j_block_loc*nblk)
1691 if (nblk_rows_cut>0 .and. nblk_cols_cut>0)
then
1692 aux_b_full(1+i_block_loc_fine*nblk : nblk_rows_cut+i_block_loc_fine*nblk, &
1693 1 +j_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max : &
1694 nblk_cols_cut+j_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max) = &
1695 b(1+i_block_loc*nblk :nblk_rows_cut+i_block_loc*nblk, &
1696 1+j_block_loc*nblk :nblk_cols_cut+j_block_loc*nblk)
1699 call set_zeros_in_unused_block_part_&
1703 (aux_b_full, nblk, nblk_rows_cut, nblk_cols_cut, &
1704 i_block_loc_fine, j_block_loc_fine, 0, dnp_ab_t*nblk_mult_max)
1709 else if (b_transposed)
then
1711 if (mod(np_t_fine, np_rows) == my_prow)
then
1712 do i_block_loc_fine = 0, nblk_mult_max/nblk-1
1713 i_block_loc = (np_t_fine + i_block_loc_fine*np_rows_fine)/np_rows
1715 do j_block_loc_fine = 0, nblk_mult/nblk-1
1716 j_block_loc = (np_ab_fine + j_block_loc_fine*np_cols_fine)/np_cols
1718 nblk_rows_cut = min(nblk, l_rows - i_block_loc*nblk)
1719 nblk_cols_cut = min(nblk, l_cols - j_block_loc*nblk)
1721 if (nblk_rows_cut>0 .and. nblk_cols_cut>0)
then
1722 aux_b_full(1+i_block_loc_fine*nblk : nblk_rows_cut+i_block_loc_fine*nblk, &
1723 1+j_block_loc_fine*nblk : nblk_cols_cut+j_block_loc_fine*nblk) = &
1724 b(1+i_block_loc*nblk : nblk_rows_cut+i_block_loc*nblk, &
1725 1+j_block_loc*nblk : nblk_cols_cut+j_block_loc*nblk)
1728 call set_zeros_in_unused_block_part_&
1732 (aux_b_full, nblk, nblk_rows_cut, nblk_cols_cut, &
1733 i_block_loc_fine, j_block_loc_fine, 0, 0)
1739 do dnp_ab_t = 0, np_dirs_fine/np_dirs_t-1
1740 np_ab_t_fine = dnp_ab_t*np_dirs_t + my_pdir_t
1742 do i_block_loc_fine = 0, nblk_mult_max/nblk-1
1743 i_block_loc = (np_ab_t_fine + i_block_loc_fine*np_rows_fine)/np_rows
1745 do j_block_loc_fine = 0, nblk_mult/nblk-1
1746 j_block_loc = (np_ab_fine + j_block_loc_fine*np_cols_fine)/np_cols
1748 nblk_rows_cut = min(nblk, l_rows - i_block_loc*nblk)
1749 nblk_cols_cut = min(nblk, l_cols - j_block_loc*nblk)
1751 if (nblk_rows_cut>0 .and. nblk_cols_cut>0) then
1752 aux_a_full(1 +i_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max : &
1753 nblk_rows_cut+i_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max, &
1754 1+j_block_loc_fine*nblk : nblk_cols_cut+j_block_loc_fine*nblk) = &
1755 a(1+i_block_loc*nblk : nblk_rows_cut+i_block_loc*nblk, &
1756 1+j_block_loc*nblk : nblk_cols_cut+j_block_loc*nblk)
1759 call set_zeros_in_unused_block_part_&
1760 &MATH_DATATYPE&
1761 &_&
1762 &PRECISION &
1763 (aux_a_full, nblk, nblk_rows_cut, nblk_cols_cut, &
1764 i_block_loc_fine, j_block_loc_fine, dnp_ab_t*nblk_mult_max, 0)
1770 call obj%timer%stop("copy_and_set_zeros_aux_ab_full_tn")
1771 endif
#ifdef WITH_MPI
1778 if (usegpu .and. .not. useccl) then
1779 if (my_pdir_t == np_t) then
1780 call obj%timer%start("gpu_memcpy")
1781 num = nblk_mult*nblk_mult
1782 if (a_transposed) then
1783#ifdef WITH_GPU_STREAMS
1784 call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_a_full_dev -> aux_a_full", &
1785 aux_a_full_dev, 0_c_intptr_t, aux_a_full(1:nblk_mult,1:nblk_mult),&
1786 1, 1, num*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
1787#else
1788 successgpu = gpu_memcpy(int(loc(aux_a_full),kind=c_intptr_t), aux_a_full_dev, &
1789 num*size_of_datatype, gpumemcpydevicetohost)
1790 check_memcpy_gpu("elpa_pxgemm_multiply: aux_a_full_dev -> aux_a_full", successgpu)
1791#endif
1792 else if (b_transposed) then
1793#ifdef WITH_GPU_STREAMS
1794 call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_b_full_dev -> aux_b_full", &
1795 aux_b_full_dev, 0_c_intptr_t, aux_b_full(1:nblk_mult,1:nblk_mult),&
1796 1, 1, num*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
1797#else
1798 successgpu = gpu_memcpy(int(loc(aux_b_full),kind=c_intptr_t), aux_b_full_dev, &
1799 nblk_mult*nblk_mult*size_of_datatype, gpumemcpydevicetohost)
1800 check_memcpy_gpu("elpa_pxgemm_multiply: aux_b_full_dev -> aux_b_full", successgpu)
1801#endif
1802 endif
1803 call obj%timer%stop("gpu_memcpy")
1804 endif
1805 endif

1808 if (useccl) then
1809 call obj%timer%start("ccl_bcast")
1810 nvtx_range_push("ccl_bcast")
1812 if (a_transposed) then
1813 successgpu = ccl_bcast(aux_a_full_dev, aux_a_full_dev, int(k_datatype*nblk_mult*nblk_mult,kind=c_size_t), &
1814 ccldatatype, int(np_t,kind=c_int), ccl_comm_cols, my_stream)
1815 else if (b_transposed) then
1816 successgpu = ccl_bcast(aux_b_full_dev, aux_b_full_dev, int(k_datatype*nblk_mult*nblk_mult,kind=c_size_t), &
1817 ccldatatype, int(np_t,kind=c_int), ccl_comm_rows, my_stream)
1818 endif
1820 if (.not. successgpu) then
1821 print *,"Error in ccl_bcast"
1822 stop 1
1823 endif
1825 successgpu = gpu_stream_synchronize(my_stream)
1826 check_stream_synchronize_gpu("elpa_pxgemm_multiply: ccl_bcast", successgpu)
1828 nvtx_range_pop("ccl_bcast aux_a_full_dev, aux_b_full_dev")
1829 call obj%timer%stop("ccl_bcast")
1830 else
1831 call obj%timer%start("mpi_bcast")
1832 nvtx_range_push("MPI_Bcast(aux_a/b_full)")
1834 if (a_transposed) then
1835 call mpi_bcast(aux_a_full, int(nblk_mult*nblk_mult,kind=mpi_kind), mpi_math_datatype_precision, &
1836 int(np_t,kind=mpi_kind), int(mpi_comm_cols,kind=mpi_kind), mpierr)
1837 else if (b_transposed) then
1838 call mpi_bcast(aux_b_full, int(nblk_mult*nblk_mult,kind=mpi_kind), mpi_math_datatype_precision, &
1839 int(np_t,kind=mpi_kind), int(mpi_comm_rows,kind=mpi_kind), mpierr)
1840 endif
1842 nvtx_range_pop("MPI_Bcast(aux_a/b_full)")
1843 call obj%timer%stop("mpi_bcast")
1844 endif
1846 if (usegpu .and. .not. useccl) then
1847 call obj%timer%start("gpu_memcpy")
1848 num = nblk_mult*nblk_mult
1849 if (a_transposed) then
1850#ifdef WITH_GPU_STREAMS
1851 call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_a_full -> aux_a_full_dev", &
1852 aux_a_full_dev, 0_c_intptr_t, aux_a_full(1:nblk_mult,1:nblk_mult), &
1853 1, 1, num*size_of_datatype, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
1854#else
1855 successgpu = gpu_memcpy(aux_a_full_dev, int(loc(aux_a_full),kind=c_intptr_t), &
1856 nblk_mult*nblk_mult*size_of_datatype, gpumemcpyhosttodevice)
1857 check_memcpy_gpu("elpa_pxgemm_multiply: aux_a_full -> aux_a_full_dev", successgpu)
1858#endif
1859 else if (b_transposed) then
1860#ifdef WITH_GPU_STREAMS
1861 call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_b_full -> aux_b_full_dev", &
1862 aux_b_full_dev, 0_c_intptr_t, aux_b_full(1:nblk_mult,1:nblk_mult), &
1863 1, 1, num*size_of_datatype, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
1864#else
1865 successgpu = gpu_memcpy(aux_b_full_dev, int(loc(aux_b_full),kind=c_intptr_t), &
1866 nblk_mult*nblk_mult*size_of_datatype, gpumemcpyhosttodevice)
1867 check_memcpy_gpu("elpa_pxgemm_multiply: aux_b_full -> aux_b_full_dev", successgpu)
1868#endif
1869 endif
1870 call obj%timer%stop("gpu_memcpy")
1871 endif
1872#endif /* WITH_MPI */
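! Local multiply: for tn the gemm uses m = k = nblk_mult and
! n = nblk_mult_max*(np_dirs_fine/np_dirs_t); for nt the roles of m and n
! are swapped (see the leading dimensions passed below).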
1874 if (usegpu) then
1875 call obj%timer%start("gpublas")
1876 nvtx_range_push("gpublas")
1877 if (a_transposed) then
1878 call gpublas_precision_gemm(trans_a_cchar, trans_b_cchar, &
1879 nblk_mult, &
1880 nblk_mult_max*(np_dirs_fine/np_dirs_t), &
1881 nblk_mult, one, &
1882 aux_a_full_dev, nblk_mult, &
1883 aux_b_full_dev, nblk_mult, zero, &
1884 tmp1_full_dev , nblk_mult, gpuhandle)
1885 else if (b_transposed) then
1886 call gpublas_precision_gemm(trans_a_cchar, trans_b_cchar, &
1887 nblk_mult_max*(np_dirs_fine/np_dirs_t), &
1888 nblk_mult, &
1889 nblk_mult, one, &
1890 aux_a_full_dev, nblk_mult_max*(np_dirs_fine/np_dirs_t), &
1891 aux_b_full_dev, nblk_mult, zero, &
1892 tmp1_full_dev , nblk_mult_max*(np_dirs_fine/np_dirs_t), gpuhandle)
1893 endif
1894 if (wantdebug) successgpu = gpu_devicesynchronize()
1895 nvtx_range_pop("gpublas")
1896 call obj%timer%stop("gpublas")
1897 else
1898 call obj%timer%start("blas")
1899 if (a_transposed) then
1900 call precision_gemm(trans_a, trans_b, &
1901 int(nblk_mult, kind=blas_kind), &
1902 int(nblk_mult_max*(np_dirs_fine/np_dirs_t), kind=blas_kind), &
1903 int(nblk_mult, kind=blas_kind), one, &
1904 aux_a_full(1:nblk_mult,1:nblk_mult), int(nblk_mult, kind=blas_kind), &
1905 aux_b_full(1:nblk_mult,1:nblk_mult_max*(np_dirs_fine/np_dirs_t)), &
1906 int(nblk_mult, kind=blas_kind), zero, &
1907 tmp1_full(1:nblk_mult,1:nblk_mult_max*(np_dirs_fine/np_dirs_t)), &
1908 int(nblk_mult, kind=blas_kind))
1909 else if (b_transposed) then
1910 call precision_gemm(trans_a, trans_b, &
1911 int(nblk_mult_max*(np_dirs_fine/np_dirs_t), kind=blas_kind), &
1912 int(nblk_mult, kind=blas_kind), &
1913 int(nblk_mult, kind=blas_kind), one, &
1914 aux_a_full(1:nblk_mult_max*(np_dirs_fine/np_dirs_t),1:nblk_mult),&
1915 int(nblk_mult_max*(np_dirs_fine/np_dirs_t), kind=blas_kind), &
1916 aux_b_full(1:nblk_mult,1:nblk_mult), int(nblk_mult, kind=blas_kind), zero, &
1917 tmp1_full(1:nblk_mult_max*(np_dirs_fine/np_dirs_t),1:nblk_mult), &
1918 int(nblk_mult_max*(np_dirs_fine/np_dirs_t), kind=blas_kind))
1919 endif
1920 call obj%timer%stop("blas")
1921 endif
1924 if (a_transposed) then
1925 num_r = nblk_mult
1926 num_c = nblk_mult_max*(np_dirs_fine/np_dirs_t)
1927 else if (b_transposed) then
1928 num_r = nblk_mult_max*(np_dirs_fine/np_dirs_t)
1929 num_c = nblk_mult
1930 endif
1932 num = nblk_mult*nblk_mult_max*(np_dirs_fine/np_dirs_t)
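! Reduction phase: the partial products in tmp1_full are summed across
! mpi_comm_dirs onto rank np, either device-side via ccl_reduce or via
! MPI_Reduce with MPI_IN_PLACE on the root.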
1933#ifdef WITH_MPI
1934 if (usegpu .and. .not. useccl) then
1935 call obj%timer%start("gpu_memcpy")
1936#ifdef WITH_GPU_STREAMS
1937 call gpu_memcpy_async_and_stream_synchronize &
1938 ("elpa_pxgemm_multiply: tmp1_full_dev -> tmp1_full", tmp1_full_dev, 0_c_intptr_t, &
1939 tmp1_full(1:num_r,1:num_c), &
1940 1, 1, num*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
1941#else
1942 successgpu = gpu_memcpy(int(loc(tmp1_full),kind=c_intptr_t), tmp1_full_dev, &
1943 num*size_of_datatype, gpumemcpydevicetohost)
1944 check_memcpy_gpu("elpa_pxgemm_multiply: tmp1_full_dev -> tmp1_full", successgpu)
1945#endif
1946 call obj%timer%stop("gpu_memcpy")
1947 endif

1949 if (useccl) then
1950 call obj%timer%start("ccl_reduce")
1951 nvtx_range_push("ccl_reduce tmp1_full_dev")
1953 successgpu = ccl_reduce(tmp1_full_dev, tmp1_full_dev, int(k_datatype*num, kind=c_size_t), &
1954 ccldatatype, cclsum, int(np, kind=c_int), ccl_comm_dirs, my_stream)
1956 if (.not. successgpu) then
1957 print *,"Error in ccl_reduce"
1958 stop 1
1959 endif
1961 successgpu = gpu_stream_synchronize(my_stream)
1962 check_stream_synchronize_gpu("elpa_pxgemm_multiply: ccl_reduce", successgpu)
1964 nvtx_range_pop("ccl_reduce tmp1_full_dev")
1965 call obj%timer%stop("ccl_reduce")
1966 else
1967 call obj%timer%start("mpi_reduce")
1968 if (my_pdir==np) then
1969 call mpi_reduce(mpi_in_place, tmp1_full, int(num, kind=mpi_kind), &
1970 mpi_math_datatype_precision, mpi_sum, int(np,kind=mpi_kind), int(mpi_comm_dirs,kind=mpi_kind), mpierr)
1971 else
1972 call mpi_reduce(tmp1_full , tmp1_full, int(num, kind=mpi_kind), &
1973 mpi_math_datatype_precision, mpi_sum, int(np,kind=mpi_kind), int(mpi_comm_dirs,kind=mpi_kind), mpierr)
1974 endif
1975 call obj%timer%stop("mpi_reduce")
1976 endif
1978 if (usegpu .and. .not. useccl) then
1979 call obj%timer%start("gpu_memcpy")
1980#ifdef WITH_GPU_STREAMS
1981 call gpu_memcpy_async_and_stream_synchronize &
1982 ("elpa_pxgemm_multiply: tmp1_full -> tmp1_full_dev", tmp1_full_dev, 0_c_intptr_t, &
1983 tmp1_full(1:num_r,1:num_c), &
1984 1, 1, num*size_of_datatype, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
1985#else
1986 successgpu = gpu_memcpy(tmp1_full_dev, int(loc(tmp1_full),kind=c_intptr_t), &
1987 num*size_of_datatype, gpumemcpyhosttodevice)
1988 check_memcpy_gpu("elpa_pxgemm_multiply: tmp1_full -> tmp1_full_dev", successgpu)
1989#endif
1990 call obj%timer%stop("gpu_memcpy")
1991 endif
1992#endif /* WITH_MPI */
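! Update of c on the receiving rank: the first contribution (dnp_ab == 0)
! overwrites the target blocks (beta_int = 0, first_call = .true.), all
! later contributions are accumulated.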
1995 if (my_pdir==np) then
1996 if (usegpu) then
1997 call obj%timer%start("gpu_update_c_tn_nt")
1999 dnp_ab = np_ab_fine/np_dirs
2000 if (dnp_ab == 0) beta_int = 0
2004 call gpu_update_c_tn_nt(precision_char, a_transposed_int, &
2005 c_dev, tmp1_full_dev, beta_int, &
2006 l_rows, l_cols, nblk_mult_max, nblk_mult, nblk, &
2007 np_rows, np_cols, np_dirs_fine, &
2008 np_dirs_t, my_pdir_t, np_fine, &
2009 sm_count, debug, my_stream)
2010 call obj%timer%stop("gpu_update_c_tn_nt")
2011 else
2013 call obj%timer%start("update_c_tn_nt")
2014 first_call = .false.
2015 dnp_ab = np_ab_fine/np_dirs
2016 if (dnp_ab == 0) first_call = .true.
2018 do dnp_ab_t = 0, np_dirs_fine/np_dirs_t-1
2019 np_ab_t_fine = dnp_ab_t*np_dirs_t + my_pdir_t
2021 if (a_transposed) then
2022 do j_block_loc_fine = 0, nblk_mult_max/nblk-1
2023 j_block_loc = (np_ab_t_fine + j_block_loc_fine*np_cols_fine)/np_cols
2025 do i_block_loc_fine = 0, nblk_mult/nblk-1
2026 i_block_loc = (np_fine + i_block_loc_fine*np_rows_fine)/np_rows
2028 nblk_rows_cut = min(nblk, l_rows - i_block_loc*nblk)
2029 nblk_cols_cut = min(nblk, l_cols - j_block_loc*nblk)
2031 if (nblk_rows_cut>0 .and. nblk_cols_cut>0) then
2032 if (first_call) then
2033 c(1+i_block_loc*nblk:nblk_rows_cut+i_block_loc*nblk, &
2034 1+j_block_loc*nblk:nblk_cols_cut+j_block_loc*nblk) = &
2035 tmp1_full(1+i_block_loc_fine*nblk :nblk_rows_cut+i_block_loc_fine*nblk, &
2036 1 +j_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max : &
2037 nblk_cols_cut+j_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max)
2038 else
2039 c(1+i_block_loc*nblk:nblk_rows_cut+i_block_loc*nblk, &
2040 1+j_block_loc*nblk:nblk_cols_cut+j_block_loc*nblk) = &
2041 c(1+i_block_loc*nblk:nblk_rows_cut+i_block_loc*nblk, &
2042 1+j_block_loc*nblk:nblk_cols_cut+j_block_loc*nblk) + &
2043 tmp1_full(1+i_block_loc_fine*nblk :nblk_rows_cut+i_block_loc_fine*nblk, &
2044 1 +j_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max : &
2045 nblk_cols_cut+j_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max)
2050 else if (b_transposed) then
2051 do i_block_loc_fine = 0, nblk_mult_max/nblk-1
2052 i_block_loc = (np_ab_t_fine + i_block_loc_fine*np_rows_fine)/np_rows
2054 do j_block_loc_fine = 0, nblk_mult/nblk-1
2055 j_block_loc = (np_fine + j_block_loc_fine*np_cols_fine)/np_cols
2057 nblk_rows_cut = min(nblk, l_rows - i_block_loc*nblk)
2058 nblk_cols_cut = min(nblk, l_cols - j_block_loc*nblk)
2060 if (nblk_rows_cut>0 .and. nblk_cols_cut>0) then
2061 if (first_call) then
2062 c(1+i_block_loc*nblk : nblk_rows_cut+i_block_loc*nblk, &
2063 1+j_block_loc*nblk : nblk_cols_cut+j_block_loc*nblk) = &
2064 tmp1_full(1 +i_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max : &
2065 nblk_rows_cut+i_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max, &
2066 1+j_block_loc_fine*nblk : nblk_cols_cut+j_block_loc_fine*nblk)
2067 else
2068 c(1+i_block_loc*nblk : nblk_rows_cut+i_block_loc*nblk, &
2069 1+j_block_loc*nblk : nblk_cols_cut+j_block_loc*nblk) = &
2070 c(1+i_block_loc*nblk : nblk_rows_cut+i_block_loc*nblk, &
2071 1+j_block_loc*nblk : nblk_cols_cut+j_block_loc*nblk) + &
2072 tmp1_full(1 +i_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max : &
2073 nblk_rows_cut+i_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max, &
2074 1+j_block_loc_fine*nblk : nblk_cols_cut+j_block_loc_fine*nblk)
2081 call obj%timer%stop("update_c_tn_nt")
2082 endif
2083 endif

2088 nvtx_range_pop("do np = 0, np_dirs-1")
2089 enddo
2090 call obj%timer%stop("main_loop_tn_nt")
2092#if !defined(DEVICE_POINTER)
2093 if (usegpu) then
2094 call obj%timer%start("gpu_memcpy")
2095 nvtx_range_push("gpu_memcpy: c_dev->c")
2096#ifdef WITH_GPU_STREAMS
2097 call gpu_memcpy_async_and_stream_synchronize &
2098 ("elpa_pxgemm_multiply: c_dev -> c", c_dev, 0_c_intptr_t, c(1:ldc,1:ldccols), &
2099 1, 1, ldc*ldccols*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
2100#else
2101 successgpu = gpu_memcpy(int(loc(c),kind=c_intptr_t), c_dev, &
2102 ldc*ldccols*size_of_datatype, gpumemcpydevicetohost)
2103 check_memcpy_gpu("elpa_pxgemm_multiply: c_dev -> c", successgpu)
2104#endif
2105 nvtx_range_pop("gpu_memcpy: c_dev->c")
2106 call obj%timer%stop("gpu_memcpy")
2107 endif
2108#endif /* DEVICE_POINTER */
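! Release the host work buffers and their device counterparts.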
2110 deallocate(tmp1_full, stat=istat, errmsg=errormessage)
2111 call check_alloc("elpa_pxgemm_multiply", "tmp1_full", istat, errormessage)
2114 successgpu = gpu_free(tmp1_full_dev)
2115 check_dealloc_gpu("elpa_pxgemm_multiply: tmp1_full_dev", successgpu)
2125 deallocate(aux_a_full, stat=istat, errmsg=errormessage)
2126 call check_alloc("elpa_pxgemm_multiply", "aux_a_full", istat, errormessage)
2128 deallocate(aux_b_full, stat=istat, errmsg=errormessage)
2129 call check_alloc("elpa_pxgemm_multiply", "aux_b_full", istat, errormessage)
2132 successgpu = gpu_free(aux_a_full_dev)
2133 check_dealloc_gpu("elpa_pxgemm_multiply: aux_a_full_dev", successgpu)
2135 successgpu = gpu_free(aux_b_full_dev)
2136 check_dealloc_gpu("elpa_pxgemm_multiply: aux_b_full_dev", successgpu)
2142#if !defined(DEVICE_POINTER)
2143 successgpu = gpu_free(a_dev)
2144 check_dealloc_gpu("elpa_pxgemm_multiply: a_dev", successgpu)
2146 successgpu = gpu_free(b_dev)
2147 check_dealloc_gpu("elpa_pxgemm_multiply: b_dev", successgpu)
2149 successgpu = gpu_free(c_dev)
2150 check_dealloc_gpu("elpa_pxgemm_multiply: c_dev", successgpu)
2151#endif /* DEVICE_POINTER */
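! Final check for a pending GPU runtime error before leaving the routine.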
2154#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION)
2156 successgpu = gpu_get_last_error()
2157 if (.not. successgpu) then
2158 write(error_unit,*) "elpa_pxgemm_multiply: GPU error detected via gpu_get_last_error(). Aborting..."
2159 write(error_unit,*) "Rerun the program with the debug option e.g. 'export ELPA_DEFAULT_debug=1'"
2160 stop 1
2161 endif
2165 call obj%timer%stop("elpa_pxgemm_multiply"//trim(tnstring)//trim(gpustring))
2167 nvtx_range_pop("elpa_pxgemm_multiply")