Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2025.01.002
elpa_pxgemm_multiply_template.F90
1! Copyright 2024, P. Karpov, MPCDF
2!
3! This file is part of ELPA.
4!
5! The ELPA library was originally created by the ELPA consortium,
6! consisting of the following organizations:
7!
8! - Max Planck Computing and Data Facility (MPCDF), formerly known as
9! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
10! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
11! Informatik,
12! - Technische Universität München, Lehrstuhl für Informatik mit
13! Schwerpunkt Wissenschaftliches Rechnen,
14! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
15! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
16! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
17! and
18! - IBM Deutschland GmbH
19!
20! This particular source code file contains additions, changes and
21! enhancements authored by Intel Corporation which is not part of
22! the ELPA consortium.
23!
24! More information can be found here:
25! http://elpa.mpcdf.mpg.de/
26!
27! ELPA is free software: you can redistribute it and/or modify
28! it under the terms of version 3 of the
29! GNU Lesser General Public License as published by the Free
30! Software Foundation.
31!
32! ELPA is distributed in the hope that it will be useful,
33! but WITHOUT ANY WARRANTY; without even the implied warranty of
34! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
35! GNU Lesser General Public License for more details.
36!
37! You should have received a copy of the GNU Lesser General Public License
38! along with ELPA. If not, see <http://www.gnu.org/licenses/>
39!
40! ELPA reflects a substantial effort on the part of the original
41! ELPA consortium, and we ask you to respect the spirit of the
42! license that we chose: i.e., please contribute any changes you
43! may have back to the original ELPA library distribution, and keep
44! any derivatives of ELPA under the same license that we chose for
45! the original distribution, the GNU Lesser General Public License.
46
47! This file was written by P. Karpov, MPCDF
48
49#include "../general/sanity.F90"
50#include "../general/error_checking.inc"
51
52
53#undef USE_CCL_PXGEMM
54#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
55#define USE_CCL_PXGEMM
56#endif
57
58 use elpa1_compute
59 use elpa_mpi
60 use precision
62 use, intrinsic :: iso_c_binding
63 use elpa_gpu
64 use mod_check_for_gpu
65 use elpa_blas_interfaces
66 use elpa_utilities, only : local_index, greatest_common_divisor, least_common_multiple, &
67 check_deallocate_f, check_dealloc_gpu_f, &
68 check_host_dealloc_gpu_f, check_alloc_gpu_f, check_host_alloc_gpu_f, &
69 check_host_unregister_gpu_f, check_memcpy_gpu_f, check_allocate_f, &
70 check_host_register_gpu_f, check_alloc, error_unit
73 use mod_query_gpu_usage
74#ifdef WITH_GPU_STREAMS
75 use elpa_gpu_util
76#endif
77#if defined(WITH_NVIDIA_GPU_VERSION) && defined(WITH_NVTX)
78 use cuda_functions ! for NVTX labels
79#elif defined(WITH_AMD_GPU_VERSION) && defined(WITH_ROCTX)
80 use hip_functions ! for ROCTX labels
81#endif
82 use elpa_ccl_gpu
83 use multiply_a_b_gpu
84 implicit none
85
86#include "../../src/general/precision_kinds.F90"
87 class(elpa_abstract_impl_t), intent(inout) :: obj
88
89 character*1 :: trans_a, trans_b
90 character(1, c_char) :: trans_a_cchar, trans_b_cchar
91 integer(kind=ik), intent(in) :: ldb, ldbCols, ldc, ldcCols
92 integer(kind=ik) :: lda, ldaCols
93 integer(kind=ik) :: na, ncb
94#ifdef DEVICE_POINTER
95 type(c_ptr) :: aDev, bDev, cDev
96 math_datatype(kind=rck), allocatable :: a(:,:), b(:,:), c(:,:) ! used for d_ptr and (.not. USE_CCL_PXGEMM)
97#else /* DEVICE_POINTER */
98#ifdef USE_ASSUMED_SIZE
99 math_datatype(kind=rck) :: a(obj%local_nrows,*), b(ldb,*), c(ldc,*)
100#else
101 math_datatype(kind=rck) :: a(obj%local_nrows,obj%local_ncols), b(ldb,ldbcols), c(ldc,ldccols)
102#endif
103#endif /* DEVICE_POINTER */
104 math_datatype(kind=rck), allocatable :: at_full(:,:), bt_full(:,:)
105 math_datatype(kind=rck), allocatable :: buf_send(:,:), buf_recv(:,:), buf_self(:,:), at_col(:,:), bt_row(:,:) ! needed for TT case
106
107 integer(kind=ik) :: my_pdir, my_pdir_t, np_dirs, np_dirs_t
108 integer(kind=ik) :: np_rows_fine, np_cols_fine, np_dirs_fine, np_fine, np_t_fine, np_bc_fine, &
109 np_ab_fine, np_ab_t_fine, dnp_ab, dnp_ab_t
110 integer(kind=ik) :: LCM, nblk_mult_rows, nblk_mult_cols, i_block_loc_fine, j_block_loc_fine
111 integer(kind=ik) :: nblk_mult_rows_max, nblk_mult_cols_max, nblk_rows_cut, nblk_cols_cut
112 integer(kind=ik) :: nblk_mult_max, nblk_mult_min
113 integer(kind=ik) :: l_cols, l_rows
114 integer(kind=ik) :: l_rows_max, l_cols_max, l_rows_min, l_cols_min
115 integer(kind=ik) :: l_rows_source, l_cols_source
116 integer(kind=ik) :: np, nb, nblk_mult, lrs, lre, lcs, lce
117 integer(kind=ik) :: i_block_loc, j_block_loc
118 integer(kind=ik) :: np_bc
119 integer(kind=ik) :: np_t
120 integer(kind=ik), allocatable :: lrs_save(:), lre_save(:)
121
122 logical :: a_transposed, b_transposed
123 integer(kind=ik) :: a_transposed_int, b_transposed_int
124
125 math_datatype(kind=rck) :: beta
126 integer(kind=ik) :: beta_int
127 math_datatype(kind=rck), pointer, contiguous :: aux_a_full(:,:), aux_b_full(:,:), tmp1_full(:,:)
128 logical :: wantDebug
129 integer(kind=ik) :: istat, debug
130 character(200) :: errorMessage
131 character(20) :: gpuString, tnString
132 logical :: success, successGPU, successGPU2
133 logical :: useGPU
134 logical :: isSquareGrid, first_call
135 integer(kind=c_int) :: numGPU, blocking, SM_count
136
137 ! MPI-related
138 integer(kind=MPI_KIND) :: mpierr
139 integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, myid
140 integer(kind=ik) :: mpi_comm_rows, mpi_comm_cols, mpi_comm_all
141 integer(kind=ik) :: mpi_comm_dirs
142
143 integer(kind=ik) :: nblk, nblk_cut, nblk_cut_row, nblk_cut_col
144 integer(kind=ik) :: error
145 integer(kind=c_intptr_t) :: aux_a_full_dev, aux_b_full_dev, tmp1_full_dev
146 integer(kind=c_intptr_t) :: at_col_dev, bt_row_dev
147 integer(kind=c_intptr_t) :: buf_send_dev, buf_recv_dev, buf_self_dev
148
149 integer(kind=c_intptr_t) :: a_dev
150 integer(kind=c_intptr_t) :: b_dev
151 integer(kind=c_intptr_t) :: c_dev
152
153 integer(kind=c_intptr_t) :: num, num_a, num_b, num_r, num_c
154 integer(kind=c_intptr_t) :: aux_off, b_off
155 integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
156 &precision&
157 &_&
158 &math_datatype
159
160 integer(kind=c_intptr_t) :: gpuHandle, my_stream
161 integer(kind=c_int) :: gpu_pxgemm_multiply
162
163 logical :: useCCL
164 integer(kind=c_intptr_t) :: ccl_comm_rows, ccl_comm_cols, ccl_comm_all, ccl_comm_dirs
165 integer(kind=c_int) :: cclDataType
166 integer(kind=ik) :: k_datatype
167
168 nvtx_range_push("elpa_pxgemm_multiply")
169
170 success = .true.
171 usegpu = .false.
172
173 a_transposed = .false.
174 b_transposed = .false.
175 a_transposed_int = 0
176 b_transposed_int = 0
177 trans_a_cchar = 'N'
178 trans_b_cchar = 'N'
179 if (trans_a=='t' .or. trans_a=='T' .or. trans_a=='c' .or. trans_a=='C') then
180 a_transposed = .true.
181 a_transposed_int = 1
182 trans_a_cchar = blas_trans_or_conj
183 endif
184 if (trans_b=='t' .or. trans_b=='T' .or. trans_b=='c' .or. trans_b=='C') then
185 b_transposed = .true.
186 b_transposed_int = 1
187 trans_b_cchar = blas_trans_or_conj
188 endif
189
190 call obj%get("debug", debug, error)
191 if (error .ne. elpa_ok) then
192 write(error_unit,*) "elpa_pxgemm_multiply: Problem getting option for debug settings. Aborting..."
193 success = .false.
194 return
195 endif
196 if (debug == 1) then
197 wantdebug = .true.
198 else
199 wantdebug = .false.
200 endif
201
202#if defined(DEVICE_POINTER)
203 usegpu = .true.
204#else /* DEVICE_POINTER */
205#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
206 if (.not.(query_gpu_usage(obj, "elpa_pxgemm_multiply", usegpu))) then
207 print *,"elpa_pxgemm_multiply: Problem querying settings for GPU. Aborting..."
208 stop 1
209 endif
210#endif
211
212 ! check whether the above setting should be overridden
213 if (obj%is_set("gpu_pxgemm_multiply") == 1) then
214 call obj%get("gpu_pxgemm_multiply", gpu_pxgemm_multiply, error)
215 if (error .ne. elpa_ok) then
216 print *,"Problem getting option for gpu_pxgemm_multiply. Aborting..."
217 stop 1
218 endif
219 if (usegpu .and. gpu_pxgemm_multiply .eq. 0) then
220 usegpu = .false.
221 else if (.not.(usegpu) .and. gpu_pxgemm_multiply .eq. 1) then
222 usegpu = .true.
223 else
224 endif
225 else
226 ! no override by user
227 ! keep setting as found before
228 endif
229#endif /* DEVICE_POINTER */
230
231 if(usegpu) then
232 gpustring = "_gpu"
233 else
234 gpustring = ""
235 endif
236
237 if (a_transposed .and. b_transposed) then
238 tnstring = "_tt"
239 else if (.not. a_transposed .and. b_transposed) then
240 tnstring = "_nt"
241 else if (a_transposed .and. .not. b_transposed) then
242 tnstring = "_tn"
243 else if (.not. a_transposed .and. .not. b_transposed) then
244 tnstring = "_nn"
245 endif
246
247 call obj%timer%start("elpa_pxgemm_multiply"//trim(tnstring)//trim(gpustring))
248
249 na = obj%na
250 nblk = obj%nblk
251 lda = obj%local_nrows
252 ldacols = obj%local_ncols
253
254 mpi_comm_all = obj%mpi_setup%mpi_comm_parent
255 mpi_comm_cols = obj%mpi_setup%mpi_comm_cols
256 mpi_comm_rows = obj%mpi_setup%mpi_comm_rows
257
258 myid = obj%mpi_setup%myRank_comm_parent
259 my_prow = obj%mpi_setup%myRank_comm_rows
260 my_pcol = obj%mpi_setup%myRank_comm_cols
261
262 np_rows = obj%mpi_setup%nRanks_comm_rows
263 np_cols = obj%mpi_setup%nRanks_comm_cols
264
265 l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and b
266 l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1) ! Local cols of b
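 ! Illustration (assuming the usual 1D block-cyclic layout): local_index(n, my_p, np, nblk, -1)
 ! counts how many of the n global indices land on process my_p. With hypothetical values
 ! na=10, nblk=2, np_rows=2: process row 0 owns global rows {1,2, 5,6, 9,10} -> l_rows = 6,
 ! process row 1 owns global rows {3,4, 7,8} -> l_rows = 4.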
267
268 useccl = .false.
269 if (usegpu) then
270 call obj%timer%start("check_for_gpu")
271 if (check_for_gpu(obj, myid, numgpu)) then
272 ! set the necessary parameters
273 call set_gpu_parameters()
274 else
275 print *,"GPUs are requested but not detected! Aborting..."
276 success = .false.
277 return
278 endif
279 call obj%timer%stop("check_for_gpu")
280
281 sm_count = obj%gpu_setup%gpuSMcount
282 gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
283
284#ifdef WITH_GPU_STREAMS
285 my_stream = obj%gpu_setup%my_stream
286#endif
287
288#if defined(USE_CCL_PXGEMM)
289 useccl = .true.
290
291 my_stream = obj%gpu_setup%my_stream
292 ccl_comm_rows = obj%gpu_setup%ccl_comm_rows
293 ccl_comm_cols = obj%gpu_setup%ccl_comm_cols
294 ccl_comm_all = obj%gpu_setup%ccl_comm_all
295
296#if REALCASE == 1 && defined(DOUBLE_PRECISION)
297 ccldatatype = ccldouble
298 k_datatype = 1
299#elif REALCASE == 1 && defined(SINGLE_PRECISION)
300 ccldatatype = cclfloat
301 k_datatype = 1
302#elif COMPLEXCASE == 1 && defined(DOUBLE_PRECISION)
303 ccldatatype = ccldouble
304 k_datatype = 2
305#elif COMPLEXCASE == 1 && defined(SINGLE_PRECISION)
306 ccldatatype = cclfloat
307 k_datatype = 2
308#endif
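 ! NCCL/RCCL have no complex datatype, so complex arrays are communicated as pairs of reals;
 ! k_datatype is that multiplier (1 for real, 2 for complex) and is applied to every element
 ! count passed to ccl_bcast/ccl_reduce below.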
309#endif /* defined(USE_CCL_PXGEMM) */
310
311
312#if defined(DEVICE_POINTER)
313 a_dev = transfer(adev, a_dev)
314 b_dev = transfer(bdev, b_dev)
315 c_dev = transfer(cdev, c_dev)
316
317#else /* DEVICE_POINTER */
318 successgpu = gpu_malloc(a_dev, lda*ldacols*size_of_datatype)
319 check_alloc_gpu("elpa_pxgemm_multiply: a_dev", successgpu)
320
321 successgpu = gpu_malloc(b_dev, ldb*ldbcols*size_of_datatype)
322 check_alloc_gpu("elpa_pxgemm_multiply: b_dev", successgpu)
323
324 successgpu = gpu_malloc(c_dev, ldc*ldccols*size_of_datatype)
325 check_alloc_gpu("elpa_pxgemm_multiply: c_dev", successgpu)
326
327 call obj%timer%start("gpu_memcpy")
328 nvtx_range_push("gpu_memcpy: a->a_dev, b->b_dev")
329 ! copy a to a_dev
330 num = lda*ldacols*size_of_datatype
331#ifdef WITH_GPU_STREAMS
332 call gpu_memcpy_async_and_stream_synchronize &
333 ("elpa_pxgemm_multiply: a to a_dev", a_dev, 0_c_intptr_t, &
334 a(1:lda,1:ldacols), &
335 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
336#else
337 successgpu = gpu_memcpy(a_dev, int(loc(a),kind=c_intptr_t), num, gpumemcpyhosttodevice)
338 check_memcpy_gpu("elpa_pxgemm_multiply: a to a_dev", successgpu)
339#endif
340
341 ! copy b to b_dev
342#ifdef WITH_GPU_STREAMS
343 call gpu_memcpy_async_and_stream_synchronize &
344 ("elpa_pxgemm_multiply: b to b_dev", b_dev, 0_c_intptr_t, &
345 b(1:ldb,1:ldbcols), &
346 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
347#else
348 successgpu = gpu_memcpy(b_dev,int(loc(b),kind=c_intptr_t), num, gpumemcpyhosttodevice)
349 check_memcpy_gpu("elpa_pxgemm_multiply: b to b_dev", successgpu)
350#endif
351
352 nvtx_range_pop("gpu_memcpy: a->a_dev, b->b_dev")
353 call obj%timer%stop("gpu_memcpy")
354
355 ! this is needed to protect against NaNs until we introduce a beta parameter and copy c to c_dev (c = alpha*a*b + beta*c)
356#ifdef WITH_GPU_STREAMS
357 successgpu = gpu_memset_async(c_dev, 0, ldc*ldccols*size_of_datatype, my_stream)
358#else
359 successgpu = gpu_memset(c_dev, 0, ldc*ldccols*size_of_datatype)
360#endif
361 check_memcpy_gpu("elpa_pxgemm_multiply: memset c_dev", successgpu)
362#endif /* DEVICE_POINTER */
363 endif ! useGPU
364
365 issquaregrid = .false.
366 if (np_rows == np_cols) issquaregrid = .true.
367
368#if WITH_MPI
369 ! overlap async memcopy to GPU with MPI
370#ifdef WITH_GPU_STREAMS
371 if (usegpu) then
372 successgpu = gpu_stream_synchronize(my_stream)
373 check_stream_synchronize_gpu("elpa_pxgemm_multiply: stream synchronize", successgpu)
374 endif
375#endif
376
377 ! l_rows_max = l_rows
378 call mpi_allreduce(l_rows, l_rows_max, 1_mpi_kind, mpi_integer, mpi_max, int(mpi_comm_all,kind=mpi_kind), mpierr)
379 call mpi_allreduce(l_cols, l_cols_max, 1_mpi_kind, mpi_integer, mpi_max, int(mpi_comm_all,kind=mpi_kind), mpierr)
380 call mpi_allreduce(l_rows, l_rows_min, 1_mpi_kind, mpi_integer, mpi_min, int(mpi_comm_all,kind=mpi_kind), mpierr)
381 call mpi_allreduce(l_cols, l_cols_min, 1_mpi_kind, mpi_integer, mpi_min, int(mpi_comm_all,kind=mpi_kind), mpierr)
382#else /* WITH_MPI */
383 l_rows_max = l_rows
384 l_cols_max = l_cols
385 l_rows_min = l_rows
386 l_cols_min = l_cols
387#endif /* WITH_MPI */
388
389 nblk_mult = greatest_common_divisor(l_rows_max, l_cols_max)
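! nblk_mult is a block size dividing both maximal local dimensions, e.g. gcd(12,8) = 4.
! For the square-grid path below, l_rows_max and l_cols_max are expected to coincide
! (cf. the commented assignment "nblk_mult = l_rows_max"), in which case the GCD is simply
! that common value.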
390!______________________________________________________________________________________________
391
392 if (issquaregrid) then
393 !nblk_mult = l_rows_max ! = l_cols_max
394!_______________________________________________
395
396 if (.not. a_transposed .and. b_transposed .or. &
397 a_transposed .and. .not. b_transposed ) then
398 if (wantdebug .and. myid==0) print *, "elpa_pxgemm_multiply NEW: SQUARE_GRID start: (a_transposed XOR b_transposed)" ! PETERDEBUG
399
400 allocate(aux_a_full(l_rows_max, nblk_mult), stat=istat, errmsg=errormessage)
401 check_allocate("elpa_pxgemm_multiply: aux_a_full", istat, errormessage)
402
403 allocate(aux_b_full(nblk_mult, l_cols_max), stat=istat, errmsg=errormessage)
404 check_allocate("elpa_pxgemm_multiply: aux_b_full", istat, errormessage)
405
406 allocate(tmp1_full(l_rows_max, l_cols_max), stat=istat, errmsg=errormessage)
407 check_allocate("elpa_pxgemm_multiply: tmp1_full", istat, errormessage)
408
409 ! PETERDEBUG: is it possible to use the original GPU memory, without copying?
410 if (usegpu) then
411 successgpu = gpu_malloc(aux_a_full_dev, nblk_mult*nblk_mult*size_of_datatype)
412 check_alloc_gpu("elpa_pxgemm_multiply: aux_a_full_dev", successgpu)
413
414 successgpu = gpu_malloc(aux_b_full_dev, nblk_mult*nblk_mult*size_of_datatype)
415 check_alloc_gpu("elpa_pxgemm_multiply: aux_b_full_dev", successgpu)
416
417 successgpu = gpu_malloc(tmp1_full_dev, nblk_mult*nblk_mult*size_of_datatype)
418 check_alloc_gpu("elpa_pxgemm_multiply: tmp1_full_dev", successgpu)
419 endif
420
421 ! dir = row/col for TN/NT
422 if (a_transposed) then
423 my_pdir = my_prow
424 np_dirs = np_rows
425 mpi_comm_dirs = mpi_comm_rows
426 ccl_comm_dirs = obj%gpu_setup%ccl_comm_rows
427 else if (b_transposed) then
428 my_pdir = my_pcol
429 np_dirs = np_cols
430 mpi_comm_dirs = mpi_comm_cols
431 ccl_comm_dirs = obj%gpu_setup%ccl_comm_cols
432 endif
433
434 call obj%timer%start("main_loop_square_grid_tn_nt")
435
436 ! main loop: build up the result matrix by processor rows/cols for TN/NT
437 do np = 0, np_dirs-1
438 nvtx_range_push("do np = 0, np_dirs-1")
439 if (wantdebug .and. myid==0) print *, "np = ", np ! PETERDEBUG
440
441 ! In this turn, procs of row/col np assemble the result for TN/NT case
442
443 np_t=np ! the np that possesses the given "col of a"/"row of b" for TN/NT
444
445 if (a_transposed) then
446 if (usegpu) then
447 if (np_t == my_pcol) call gpu_copy_and_set_zeros_aux_full(precision_char, a_dev, aux_a_full_dev, l_rows, l_cols, &
448 nblk_mult, debug, my_stream)
449 call gpu_copy_and_set_zeros_aux_full(precision_char, b_dev, aux_b_full_dev, l_rows, l_cols, &
450 nblk_mult, debug, my_stream)
451 else ! useGPU
452 if (np_t == my_pcol) then
453 aux_a_full(1:l_rows,1:l_cols) = a(1:l_rows,1:l_cols)
454 if (l_rows < nblk_mult) aux_a_full(l_rows+1:nblk_mult,1:l_cols) = 0
455 if (l_cols < nblk_mult) aux_a_full(1:l_rows,l_cols+1:nblk_mult) = 0
456 if (l_rows < nblk_mult .and. l_cols < nblk_mult) aux_a_full(l_rows+1:nblk_mult,l_cols+1:nblk_mult) = 0
457 endif
458 aux_b_full(1:l_rows,1:l_cols) = b(1:l_rows,1:l_cols)
459 if (l_rows < nblk_mult) aux_b_full(l_rows+1:nblk_mult,1:l_cols) = 0
460 if (l_cols < nblk_mult) aux_b_full(1:l_rows,l_cols+1:nblk_mult) = 0
461 if (l_rows < nblk_mult .and. l_cols < nblk_mult) aux_b_full(l_rows+1:nblk_mult,l_cols+1:nblk_mult) = 0
462 endif ! useGPU
463 else if (b_transposed) then
464 if (usegpu) then
465 if (np_t == my_prow) call gpu_copy_and_set_zeros_aux_full(precision_char, b_dev, aux_b_full_dev, l_rows, l_cols, &
466 nblk_mult, debug, my_stream)
467 call gpu_copy_and_set_zeros_aux_full(precision_char, a_dev, aux_a_full_dev, l_rows, l_cols, &
468 nblk_mult, debug, my_stream)
469
470 else
471 if (np_t == my_prow) then
472 aux_b_full(1:l_rows,1:l_cols) = b(1:l_rows,1:l_cols)
473 if (l_rows < nblk_mult) aux_b_full(l_rows+1:nblk_mult,1:l_cols) = 0
474 if (l_cols < nblk_mult) aux_b_full(1:l_rows,l_cols+1:nblk_mult) = 0
475 if (l_rows < nblk_mult .and. l_cols < nblk_mult) aux_b_full(l_rows+1:nblk_mult,l_cols+1:nblk_mult) = 0
476 endif
477 aux_a_full(1:l_rows,1:l_cols) = a(1:l_rows,1:l_cols) ! aux_a_full -> aux_ab_nontransposed_full
478 if (l_rows < nblk_mult) aux_a_full(l_rows+1:nblk_mult,1:l_cols) = 0
479 if (l_cols < nblk_mult) aux_a_full(1:l_rows,l_cols+1:nblk_mult) = 0
480 if (l_rows < nblk_mult .and. l_cols < nblk_mult) aux_a_full(l_rows+1:nblk_mult,l_cols+1:nblk_mult) = 0
481 endif ! useGPU
482 endif
483
484 ! aux_a_full/aux_b_full -> aux_ab_trans (aux_ab) ! auxiliary buffer for a matrix that is to be transposed: a/b in TN/NT case
485 ! aux_b_full/aux_a_full -> aux_ab_nontr (aux_ba) ! auxiliary buffer for a matrix that is not to be transposed: b/a in TN/NT case
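 ! Rough sketch of this TN/NT loop: for each process row/col np, the operand to be transposed
 ! is broadcast from the np_t-th process column/row (padded to an nblk_mult x nblk_mult block),
 ! every process performs a local GEMM with the corresponding trans flag, and the partial
 ! products are reduced onto the processes of row/col np, which copy their tile into C.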
486
487#ifdef WITH_MPI
488 ! copy data to host for bcast, if needed
489 if (usegpu .and. .not. useccl) then
490 num = nblk_mult*nblk_mult
491#ifdef WITH_GPU_STREAMS
492 if (a_transposed) then
493 call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_a_full_dev -> aux_a_full", &
494 aux_a_full_dev, 0_c_intptr_t, aux_a_full(1:nblk_mult,1:nblk_mult), &
495 1, 1, num*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .false., .false.)
496 else if (b_transposed) then
497 call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_b_full_dev -> aux_b_full", &
498 aux_b_full_dev, 0_c_intptr_t, aux_b_full(1:nblk_mult,1:nblk_mult), &
499 1, 1, num*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
500 endif
501#else
502 if (a_transposed) then
503 successgpu = gpu_memcpy(int(loc(aux_a_full),kind=c_intptr_t), aux_a_full_dev, &
504 num*size_of_datatype, gpumemcpydevicetohost)
505 check_memcpy_gpu("elpa_pxgemm_multiply: aux_a_full_dev -> aux_a_full", successgpu)
506 else if (b_transposed) then
507 successgpu = gpu_memcpy(int(loc(aux_b_full),kind=c_intptr_t), aux_b_full_dev, &
508 num*size_of_datatype, gpumemcpydevicetohost)
509 check_memcpy_gpu("elpa_pxgemm_multiply: aux_b_full_dev -> aux_b_full", successgpu)
510 endif
511#endif
512 endif ! (useGPU .and. .not. useCCL)
513
514
515 ! Broadcast processor column
516 if (useccl) then
517 call obj%timer%start("ccl_bcast")
518 nvtx_range_push("ccl_bcast aux_a/b_full")
519
520 if (a_transposed) then
521 successgpu = ccl_bcast(aux_a_full_dev, aux_a_full_dev, int(k_datatype*nblk_mult*nblk_mult,kind=c_size_t), &
522 ccldatatype, int(np_t,kind=c_int), ccl_comm_cols, my_stream)
523 else if (b_transposed) then
524 successgpu = ccl_bcast(aux_b_full_dev, aux_b_full_dev, int(k_datatype*nblk_mult*nblk_mult,kind=c_size_t), &
525 ccldatatype, int(np_t,kind=c_int), ccl_comm_rows, my_stream)
526 endif
527
528 if (.not. successgpu) then
529 print *,"Error in ccl_bcast"
530 stop 1
531 endif
532
533 successgpu = gpu_stream_synchronize(my_stream)
534 check_stream_synchronize_gpu("elpa_pxgemm_multiply: ccl_bcast", successgpu)
535
536 nvtx_range_pop("ccl_bcast aux_a/b_full")
537 call obj%timer%stop("ccl_bcast")
538 else ! useCCL
539 call obj%timer%start("mpi_communication")
540 nvtx_range_push("MPI_Bcast(aux_a/b_full)")
541
542 if (a_transposed) then
543 call mpi_bcast(aux_a_full, int(nblk_mult*nblk_mult,kind=mpi_kind), mpi_math_datatype_precision, &
544 int(np_t,kind=mpi_kind), int(mpi_comm_cols,kind=mpi_kind), mpierr)
545 else if (b_transposed) then
546 call mpi_bcast(aux_b_full, int(nblk_mult*nblk_mult,kind=mpi_kind), mpi_math_datatype_precision, &
547 int(np_t,kind=mpi_kind), int(mpi_comm_rows,kind=mpi_kind), mpierr)
548 endif
549
550 nvtx_range_pop("MPI_Bcast(aux_a/b_full)")
551 call obj%timer%stop("mpi_communication")
552 endif ! useCCL
553
554 ! copy data back to device, if needed
555 if (usegpu .and. .not. useccl) then
556 num = nblk_mult*nblk_mult
557#ifdef WITH_GPU_STREAMS
558 if (a_transposed) then
559 call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_a_full -> aux_a_full_dev", &
560 aux_a_full_dev, 0_c_intptr_t, aux_a_full(1:nblk_mult,1:nblk_mult), &
561 1, 1, num*size_of_datatype, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
562 else if (b_transposed) then
563 call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_b_full -> aux_b_full_dev", &
564 aux_b_full_dev, 0_c_intptr_t, aux_b_full(1:nblk_mult,1:nblk_mult), &
565 1, 1, num*size_of_datatype, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
566 endif
567#else
568 if (a_transposed) then
569 successgpu = gpu_memcpy(aux_a_full_dev, int(loc(aux_a_full),kind=c_intptr_t), &
570 num*size_of_datatype, gpumemcpyhosttodevice)
571 check_memcpy_gpu("elpa_pxgemm_multiply: aux_a_full -> aux_a_full_dev", successgpu)
572 else if (b_transposed) then
573 successgpu = gpu_memcpy(aux_b_full_dev, int(loc(aux_b_full),kind=c_intptr_t), &
574 num*size_of_datatype, gpumemcpyhosttodevice)
575 check_memcpy_gpu("elpa_pxgemm_multiply: aux_b_full -> aux_b_full_dev", successgpu)
576 endif
577#endif
578 endif ! (useGPU .and. .not. useCCL)
579#endif /* WITH_MPI */
580
581 if (usegpu) then
582 call obj%timer%start("gpublas")
583 nvtx_range_push("gpublas")
584 call gpublas_precision_gemm(trans_a_cchar, trans_b_cchar, &
585 nblk_mult, nblk_mult, nblk_mult, one, &
586 aux_a_full_dev, nblk_mult, &
587 aux_b_full_dev, nblk_mult, zero, &
588 tmp1_full_dev , nblk_mult, gpuhandle)
589 if (wantdebug) successgpu = gpu_devicesynchronize()
590 nvtx_range_pop("gpublas")
591 call obj%timer%stop("gpublas")
592 else ! useGPU
593 call obj%timer%start("blas")
594 call precision_gemm(trans_a, trans_b, &
595 int(nblk_mult, kind=blas_kind), &
596 int(nblk_mult, kind=blas_kind), &
597 int(nblk_mult, kind=blas_kind), one, &
598 aux_a_full, int(nblk_mult, kind=blas_kind), &
599 aux_b_full, int(nblk_mult, kind=blas_kind), zero, &
600 tmp1_full , int(nblk_mult, kind=blas_kind))
601 call obj%timer%stop("blas")
602 endif ! useGPU
603
604#ifdef WITH_MPI
605 num = nblk_mult*nblk_mult
606
607 ! copy data to host for mpi_reduce, if needed
608 if (usegpu .and. .not. useccl) then
609 call obj%timer%start("gpu_memcpy")
610#ifdef WITH_GPU_STREAMS
611 call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: tmp1_full_dev -> tmp1_full", &
612 tmp1_full_dev, 0_c_intptr_t, tmp1_full(1:nblk_mult,1:nblk_mult), &
613 1, 1, num*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .false., .false.)
614#else
615 successgpu = gpu_memcpy(int(loc(tmp1_full),kind=c_intptr_t), tmp1_full_dev, num*size_of_datatype, gpumemcpydevicetohost)
616 check_memcpy_gpu("elpa_pxgemm_multiply: tmp1_full_dev -> tmp1_full", successgpu)
617#endif
618 call obj%timer%stop("gpu_memcpy")
619 endif ! (useGPU .and. .not. useCCL)
620
621
622 if (useccl) then
623 call obj%timer%start("ccl_reduce")
624 nvtx_range_push("ccl_reduce tmp1_full_dev")
625
626 successgpu = ccl_reduce(tmp1_full_dev, tmp1_full_dev, int(k_datatype*num,kind=c_size_t), ccldatatype, &
627 cclsum, int(np, kind=c_int), ccl_comm_dirs, my_stream)
628
629 if (.not. successgpu) then
630 print *,"Error in ccl_reduce"
631 stop 1
632 endif
633
634 successgpu = gpu_stream_synchronize(my_stream)
635 check_stream_synchronize_gpu("elpa_pxgemm_multiply: ccl_reduce", successgpu)
636
637 nvtx_range_pop("ccl_reduce tmp1_full_dev")
638 call obj%timer%stop("ccl_reduce")
639 else ! useCCL
640 call obj%timer%start("mpi_communication")
641 if (my_pdir==np) then
642 call mpi_reduce(mpi_in_place, tmp1_full, int(num,kind=mpi_kind), mpi_math_datatype_precision, &
643 mpi_sum, int(np,kind=mpi_kind), int(mpi_comm_dirs,kind=mpi_kind), mpierr)
644 else
645 call mpi_reduce(tmp1_full , tmp1_full, int(num,kind=mpi_kind), mpi_math_datatype_precision, &
646 mpi_sum, int(np,kind=mpi_kind), int(mpi_comm_dirs,kind=mpi_kind), mpierr)
647 endif
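 ! Only the root (my_pdir==np) receives the reduced sum: it passes MPI_IN_PLACE so that its
 ! own tmp1_full serves as both contribution and result; on non-root ranks the receive-buffer
 ! argument of MPI_Reduce is ignored anyway.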
648 call obj%timer%stop("mpi_communication")
649 endif ! useCCL
650
651 if (usegpu .and. .not. useccl) then
652 call obj%timer%start("gpu_memcpy")
653#ifdef WITH_GPU_STREAMS
654 call gpu_memcpy_async_and_stream_synchronize &
655 ("elpa_pxgemm_multiply: tmp1_full -> tmp1_full_dev", 0_c_intptr_t, tmp1_full_dev, &
656 tmp1_full(1:nblk_mult,1:nblk_mult), &
657 1, 1, num*size_of_datatype, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
658#else
659 successgpu = gpu_memcpy(tmp1_full_dev, int(loc(tmp1_full),kind=c_intptr_t), &
660 num*size_of_datatype, gpumemcpyhosttodevice)
661 check_memcpy_gpu("elpa_pxgemm_multiply: tmp1_full -> tmp1_full_dev", successgpu)
662#endif
663 call obj%timer%stop("gpu_memcpy")
664 endif ! (useGPU .and. .not. useCCL)
665#endif /* WITH_MPI */
666
667 ! Put the result into C
668 if (my_pdir==np) then
669 if (usegpu) then
670 nvtx_range_push("gpu_copy_aux_full")
671 call gpu_copy_aux_full(precision_char, c_dev, tmp1_full_dev, l_rows, l_cols, ldc, nblk_mult, debug, my_stream)
672 nvtx_range_pop("gpu_copy_aux_full")
673 else ! useGPU
674 c(1:l_rows,1:l_cols) = tmp1_full(1:l_rows,1:l_cols)
675 endif ! useGPU
676 endif
677
678 nvtx_range_pop("do np = 0, np_dirs-1")
679 enddo ! np = 0, np_dirs-1
680 call obj%timer%stop("main_loop_square_grid_tn_nt")
681
682#if !defined(DEVICE_POINTER)
683 if (usegpu) then
684 call obj%timer%start("gpu_memcpy")
685 nvtx_range_push("gpu_memcpy: c_dev->c")
686 num = ldc*ldccols
687#ifdef WITH_GPU_STREAMS
688 call gpu_memcpy_async_and_stream_synchronize &
689 ("elpa_pxgemm_multiply: c_dev -> c", c_dev, 0_c_intptr_t, c(1:ldc,1:ldccols), &
690 1, 1, num*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
691#else
692 successgpu = gpu_memcpy(int(loc(c),kind=c_intptr_t), c_dev, num*size_of_datatype, gpumemcpydevicetohost)
693 check_memcpy_gpu("elpa_pxgemm_multiply: c_dev -> c", successgpu)
694#endif
695 nvtx_range_pop("gpu_memcpy: c_dev->c")
696 call obj%timer%stop("gpu_memcpy")
697 endif ! useGPU
698#endif /* !defined(DEVICE_POINTER) */
699
700 deallocate(tmp1_full, stat=istat, errmsg=errormessage)
701 call check_alloc("elpa_pxgemm_multiply", "tmp1_full", istat, errormessage)
702
703 if (usegpu) then
704 successgpu = gpu_free(tmp1_full_dev)
705 check_dealloc_gpu("elpa_pxgemm_multiply: tmp1_full_dev", successgpu)
706 endif
707
708 endif ! (.not. a_transposed .and. b_transposed)
709
710!_______________________________________________
711
712 if ((.not. a_transposed) .and. (.not. b_transposed) .or. &
713 a_transposed .and. b_transposed) then
714 if (wantdebug .and. myid==0) print *, "elpa_pxgemm_multiply NEW: SQUARE_GRID start: NN or TT" ! PETERDEBUG
715
716 allocate(aux_a_full(l_rows, nblk_mult), stat=istat, errmsg=errormessage)
717 check_allocate("elpa_pxgemm_multiply: aux_a_full", istat, errormessage)
718
719 allocate(aux_b_full(nblk_mult, l_cols), stat=istat, errmsg=errormessage)
720 check_allocate("elpa_pxgemm_multiply: aux_b_full", istat, errormessage)
721
722 if (usegpu .and. l_rows /= ldc) then
723 print *, "elpa_pxgemm_multiply: Error: case ldc != lda is not implemented yet for NN and TT on GPU"
724 stop 1
725 endif
726
727 ! PETERDEBUG: is it possible to use the original GPU memory, without copying?
728 if (usegpu) then
729 successgpu = gpu_malloc(aux_a_full_dev, l_rows*nblk_mult*size_of_datatype)
730 check_alloc_gpu("elpa_pxgemm_multiply: aux_a_full_dev", successgpu)
731
732 successgpu = gpu_malloc(aux_b_full_dev, nblk_mult*l_cols*size_of_datatype)
733 check_alloc_gpu("elpa_pxgemm_multiply: aux_b_full_dev", successgpu)
734 endif
735
736 if (a_transposed) then
737 if (.not. useccl) then
738 allocate(at_col(l_rows, l_cols), stat=istat, errmsg=errormessage)
739 check_allocate("elpa_pxgemm_multiply: at_col", istat, errormessage)
740 endif
741
742 if (usegpu) then
743 successgpu = gpu_malloc(at_col_dev, l_rows*l_cols*size_of_datatype)
744 check_alloc_gpu("elpa_pxgemm_multiply: at_col_dev", successgpu)
745 endif
746 endif
747
748 if (b_transposed) then
749 if (.not. useccl) then
750 allocate(bt_row(l_rows, l_cols), stat=istat, errmsg=errormessage)
751 check_allocate("elpa_pxgemm_multiply: bt_row", istat, errormessage)
752 endif
753
754 if (usegpu) then
755 successgpu = gpu_malloc(bt_row_dev, l_rows*l_cols*size_of_datatype)
756 check_alloc_gpu("elpa_pxgemm_multiply: bt_row_dev", successgpu)
757 endif
758 endif
759
760 if (a_transposed .or. b_transposed) then
761 lcm = least_common_multiple(np_rows, np_cols)*nblk
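 ! In a block-cyclic layout the row-ownership pattern repeats every np_rows*nblk global rows
 ! and the column pattern every np_cols*nblk columns, so the joint pattern repeats with period
 ! lcm = least_common_multiple(np_rows, np_cols)*nblk; with hypothetical values np_rows=2,
 ! np_cols=3, nblk=16 this gives lcm = 6*16 = 96.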
762 ! max nblk_mult_rows, nblk_mult_cols is achieved at the 0-th MPI process
763 nblk_mult_rows_max = find_nblk_mult_dirs(l_rows_max, nblk, np_rows, 0, lcm)
764 nblk_mult_cols_max = find_nblk_mult_dirs(l_cols_max, nblk, np_cols, 0, lcm)
765
766 if (useccl) then
767 successgpu = gpu_malloc(buf_send_dev, nblk_mult_rows_max*nblk_mult_cols_max*size_of_datatype)
768 check_alloc_gpu("elpa_pxgemm_multiply: buf_send_dev", successgpu)
769
770 successgpu = gpu_malloc(buf_recv_dev, nblk_mult_rows_max*nblk_mult_cols_max*size_of_datatype)
771 check_alloc_gpu("elpa_pxgemm_multiply: buf_recv_dev", successgpu)
772
773 successgpu = gpu_malloc(buf_self_dev, nblk_mult_rows_max*nblk_mult_cols_max*size_of_datatype)
774 check_alloc_gpu("elpa_pxgemm_multiply: buf_self_dev", successgpu)
775 else ! useCCL
776 allocate(buf_send(nblk_mult_rows_max, nblk_mult_cols_max), stat=istat, errmsg=errormessage)
777 check_allocate("elpa_pxgemm_multiply: buf_send", istat, errormessage)
778
779 allocate(buf_recv(nblk_mult_rows_max, nblk_mult_cols_max), stat=istat, errmsg=errormessage)
780 check_allocate("elpa_pxgemm_multiply: buf_recv", istat, errormessage)
781
782 allocate(buf_self(nblk_mult_rows_max, nblk_mult_cols_max), stat=istat, errmsg=errormessage)
783 check_allocate("elpa_pxgemm_multiply: buf_self", istat, errormessage)
784
785#if defined(DEVICE_POINTER)
786 allocate(a(obj%local_nrows, obj%local_ncols), stat=istat, errmsg=errormessage)
787 check_allocate("elpa_pxgemm_multiply: a", istat, errormessage)
788
789 allocate(b(ldb, ldbcols), stat=istat, errmsg=errormessage)
790 check_allocate("elpa_pxgemm_multiply: b", istat, errormessage)
791
792 successgpu = gpu_memcpy(int(loc(a),kind=c_intptr_t), a_dev, &
793 obj%local_nrows*obj%local_ncols*size_of_datatype, gpumemcpydevicetohost)
794 check_memcpy_gpu("elpa_pxgemm_multiply: a_dev -> a", successgpu)
795
796 successgpu = gpu_memcpy(int(loc(b),kind=c_intptr_t), b_dev, &
797 ldb*ldbcols*size_of_datatype, gpumemcpydevicetohost)
798 check_memcpy_gpu("elpa_pxgemm_multiply: b_dev -> b", successgpu)
799#endif /* defined(DEVICE_POINTER) */
800 endif ! useCCL
801 endif ! (a_transposed .or. b_transposed)
802
803 ! main loop: iterate through np, which are process rows for matrix A and process cols for matrix B
804 call obj%timer%start("main_loop_square_grid_nn_tt")
805
806 do np = 0, np_rows-1 ! np_rows=np_cols
807 nvtx_range_push("np = 0, np_rows-1")
808
809 ! In this turn, procs of row np assemble the result
810 np_bc=np ! the np that possesses the given column of a
811
812 if (a_transposed) then
813 if (useccl) then
814 call elpa_transpose_row_or_col_ccl_&
815 &math_datatype&
816 &_&
817 &precision&
818 (obj, 'R', a_dev, at_col_dev, buf_send_dev, buf_recv_dev, buf_self_dev, np, l_rows, l_cols, &
819 nblk_mult_rows_max, nblk_mult_cols_max, debug)
820 else
821 call elpa_transpose_row_or_col_&
822 &math_datatype&
823 &_&
824 &precision&
825 (obj, 'R', a, at_col, buf_send, buf_recv, buf_self, np, l_rows, l_cols, &
826 nblk_mult_rows_max, nblk_mult_cols_max, debug)
827 endif
828 endif
829
830 if (b_transposed) then
831 if (useccl) then
832 call elpa_transpose_row_or_col_ccl_&
833 &math_datatype&
834 &_&
835 &precision&
836 (obj, 'C', b_dev, bt_row_dev, buf_send_dev, buf_recv_dev, buf_self_dev, np, l_rows, l_cols, &
837 nblk_mult_rows_max, nblk_mult_cols_max, debug)
838 else
839 call elpa_transpose_row_or_col_&
840 &math_datatype&
841 &_&
842 &precision&
843 (obj, 'C', b, bt_row, buf_send, buf_recv, buf_self, np, l_rows, l_cols, &
844 nblk_mult_rows_max, nblk_mult_cols_max, debug)
845 endif
846 endif
847
848 if (np_bc == my_pcol) then
849 if (usegpu) then
850 if (a_transposed) then
851 if (.not. useccl) then
852 num = l_rows*l_cols
853#ifdef WITH_GPU_STREAMS
854 call gpu_memcpy_async_and_stream_synchronize &
855 ("elpa_pxgemm_multiply: at_col -> at_col_dev", at_col_dev, 0_c_intptr_t, at_col(1:l_rows,1:l_cols), &
856 1, 1, num*size_of_datatype, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
857#else
858 successgpu = gpu_memcpy(at_col_dev, int(loc(at_col),kind=c_intptr_t), num*size_of_datatype, gpumemcpyhosttodevice)
859 check_memcpy_gpu("elpa_pxgemm_multiply: at_col -> at_col_dev", successgpu)
860#endif
861 endif ! (.not. useCCL)
862
863 call gpu_copy_aux_full(precision_char, aux_a_full_dev, at_col_dev, &
864 l_rows, l_cols, l_rows, obj%local_nrows, debug, my_stream)
865 else
866 call gpu_copy_aux_full(precision_char, aux_a_full_dev, a_dev, &
867 l_rows, l_cols, l_rows, obj%local_nrows, debug, my_stream)
868 ! call gpu_copy_and_set_zeros_aux_full(PRECISION_CHAR, a_dev, aux_a_full_dev, & ! PETERDEBUG: cleanup this kernel
869 ! l_rows, l_cols, l_rows, debug, my_stream)
870 !l_rows, l_cols, nblk_mult, debug, my_stream)
871 endif
872 else ! useGPU
873 if (a_transposed) then
874 aux_a_full(1:l_rows,1:l_cols) = at_col(1:l_rows,1:l_cols)
875 else
876 aux_a_full(1:l_rows,1:l_cols) = a(1:l_rows,1:l_cols)
877 endif
878 endif ! useGPU
879 endif
880 if (np == my_prow) then
881 if (usegpu) then
882 if (b_transposed) then
883 if (.not. useccl) then
884 num = l_rows*l_cols
885#ifdef WITH_GPU_STREAMS
886 call gpu_memcpy_async_and_stream_synchronize &
887 ("elpa_pxgemm_multiply: bt_row -> bt_row_dev", bt_row_dev, 0_c_intptr_t, bt_row(1:l_rows,1:l_cols), &
888 1, 1, num*size_of_datatype, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
889#else
890 successgpu = gpu_memcpy(bt_row_dev, int(loc(bt_row),kind=c_intptr_t), num*size_of_datatype, gpumemcpyhosttodevice)
891 check_memcpy_gpu("elpa_pxgemm_multiply: bt_row -> bt_row_dev", successgpu)
892#endif
893 endif ! (.not. useCCL)
894
895 call gpu_copy_aux_full(precision_char, aux_b_full_dev, bt_row_dev, &
896 l_rows, l_cols, nblk_mult, ldb, debug, my_stream)
897 else
898 call gpu_copy_aux_full(precision_char, aux_b_full_dev, b_dev, &
899 l_rows, l_cols, nblk_mult, ldb, debug, my_stream)
900 ! call gpu_copy_and_set_zeros_aux_full(PRECISION_CHAR, b_dev, aux_b_full_dev, & ! PETERDEBUG
901 ! l_rows, l_cols, l_rows, debug, my_stream)
902 !l_rows, l_cols, nblk_mult, debug, my_stream)
903 endif
904 else ! useGPU
905 if (b_transposed) then
906 aux_b_full(1:l_rows,1:l_cols) = bt_row(1:l_rows,1:l_cols)
907 else
908 aux_b_full(1:l_rows,1:l_cols) = b(1:l_rows,1:l_cols)
909 endif
910 endif ! useGPU
911 endif
912
913#ifdef WITH_MPI
914 ! copy data to host for bcast, if needed
915 if (usegpu .and. .not. useccl) then
916 !num = nblk_mult*nblk_mult
917 num_a = l_rows*nblk_mult
918 num_b = nblk_mult*l_cols
919#ifdef WITH_GPU_STREAMS
920 call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_a_full_dev -> aux_a_full", &
921 aux_a_full_dev, 0_c_intptr_t, aux_a_full(1:l_rows,1:nblk_mult), &
922 1, 1, num_a*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .false., .false.)
923 call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_b_full_dev -> aux_b_full", &
924 aux_b_full_dev, 0_c_intptr_t, aux_b_full(1:nblk_mult,1:l_cols), &
925 1, 1, num_b*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
926#else
927 successgpu = gpu_memcpy(int(loc(aux_a_full),kind=c_intptr_t),aux_a_full_dev,num_a*size_of_datatype,gpumemcpydevicetohost)
928 check_memcpy_gpu("elpa_pxgemm_multiply: aux_a_full_dev -> aux_a_full", successgpu)
929
930 successgpu = gpu_memcpy(int(loc(aux_b_full),kind=c_intptr_t),aux_b_full_dev,num_b*size_of_datatype,gpumemcpydevicetohost)
931 check_memcpy_gpu("elpa_pxgemm_multiply: aux_b_full_dev -> aux_b_full", successgpu)
932#endif
933 endif ! (useGPU .and. .not. useCCL)
934
935 ! Broadcast processor column
936 if (useccl) then
937 call obj%timer%start("ccl_bcast")
938 nvtx_range_push("ccl_bcast aux_a_full_dev, aux_b_full_dev")
939
940 successgpu = ccl_bcast(aux_a_full_dev, aux_a_full_dev, int(k_datatype*l_rows*nblk_mult,kind=c_size_t), ccldatatype, &
941 int(np_bc,kind=c_int), ccl_comm_cols, my_stream)
942
943 successgpu2 = ccl_bcast(aux_b_full_dev, aux_b_full_dev, int(k_datatype*nblk_mult*l_cols,kind=c_size_t), ccldatatype, &
944 int(np ,kind=c_int), ccl_comm_rows, my_stream)
945
946 if (.not. (successgpu .and. successgpu2)) then
947 print *,"Error in ccl_bcast"
948 stop 1
949 endif
950
951 successgpu = gpu_stream_synchronize(my_stream)
952 check_stream_synchronize_gpu("elpa_pxgemm_multiply: ccl_bcast", successgpu)
953
954 nvtx_range_pop("ccl_bcast aux_a_full_dev, aux_b_full_dev")
955 call obj%timer%stop("ccl_bcast")
956 else ! useCCL
957 call obj%timer%start("mpi_communication")
958 nvtx_range_push("MPI_Bcast: aux_a_full, aux_b_full")
959
960 call mpi_bcast(aux_a_full, int(l_rows*nblk_mult,kind=mpi_kind), mpi_math_datatype_precision, &
961 int(np_bc,kind=mpi_kind), int(mpi_comm_cols,kind=mpi_kind), mpierr)
962 call mpi_bcast(aux_b_full, int(nblk_mult*l_cols,kind=mpi_kind), mpi_math_datatype_precision, &
963 int(np ,kind=mpi_kind), int(mpi_comm_rows,kind=mpi_kind), mpierr)
964
965 nvtx_range_pop("MPI_Bcast: aux_a_full, aux_b_full")
966 call obj%timer%stop("mpi_communication")
967 endif ! useCCL
968
969 call obj%timer%start("mpi_communication")
970 nvtx_range_push("MPI_Bcast: l_cols_source, l_rows_source")
971 l_cols_source = l_cols
972 l_rows_source = l_rows
973
974 call mpi_bcast(l_cols_source, int(1,kind=mpi_kind), mpi_integer, &
975 int(np_bc,kind=mpi_kind), int(mpi_comm_cols,kind=mpi_kind), mpierr)
976 call mpi_bcast(l_rows_source, int(1,kind=mpi_kind), mpi_integer, &
977 int(np ,kind=mpi_kind), int(mpi_comm_rows,kind=mpi_kind), mpierr)
978
979 nblk_mult_min = min(l_rows_source, l_cols_source)
980 nvtx_range_pop("MPI_Bcast: l_cols_source, l_rows_source")
981 call obj%timer%stop("mpi_communication")
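 ! l_cols_source/l_rows_source are the local sizes of the broadcast roots (np_bc along the
 ! column communicator, np along the row communicator); their minimum, nblk_mult_min, is used
 ! as the inner (k) dimension of the GEMM below, so the product only touches the columns of
 ! aux_a_full and rows of aux_b_full actually filled by the roots.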
982
983 ! copy data back to device, if needed
984 if (usegpu .and. .not. useccl) then
985 num_a = l_rows*nblk_mult
986 num_b = nblk_mult*l_cols
987#ifdef WITH_GPU_STREAMS
988 call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_a_full -> aux_a_full_dev", &
989 aux_a_full_dev, 0_c_intptr_t, aux_a_full(1:l_rows,1:nblk_mult), &
990 1, 1, num_a*size_of_datatype, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
991 call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_b_full -> aux_b_full_dev", &
992 aux_b_full_dev, 0_c_intptr_t, aux_b_full(1:nblk_mult,1:l_cols), &
993 1, 1, num_b*size_of_datatype, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
994#else
995 successgpu = gpu_memcpy(aux_a_full_dev,int(loc(aux_a_full),kind=c_intptr_t),num_a*size_of_datatype,gpumemcpyhosttodevice)
996 check_memcpy_gpu("elpa_pxgemm_multiply: aux_a_full -> aux_a_full_dev", successgpu)
997
998 successgpu = gpu_memcpy(aux_b_full_dev,int(loc(aux_b_full),kind=c_intptr_t),num_b*size_of_datatype,gpumemcpyhosttodevice)
999 check_memcpy_gpu("elpa_pxgemm_multiply: aux_b_full -> aux_b_full_dev", successgpu)
1000#endif
1001 endif ! (useGPU .and. .not. useCCL)
1002#else /* WITH_MPI */
1003 nblk_mult_min= l_rows
1004#endif /* WITH_MPI */
1005
1006 beta = zero
1007 if (np>0) beta = one
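 ! beta = 0 on the first pass (np = 0) so c is overwritten; beta = 1 afterwards, i.e. the GEMM
 ! below accumulates c := aux_a_full * aux_b_full + c over the np loop.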
1008 if (usegpu) then
1009 call obj%timer%start("gpublas")
1010 nvtx_range_push("gpublas")
1011 call gpublas_precision_gemm('N', 'N', &
1012 l_rows, l_cols, nblk_mult_min, one, &
1013 aux_a_full_dev, l_rows, &
1014 aux_b_full_dev, nblk_mult, beta, &
1015 c_dev , l_rows, gpuhandle)
1016 if (wantdebug) successgpu = gpu_devicesynchronize()
1017 nvtx_range_pop("gpublas")
1018 call obj%timer%stop("gpublas")
1019 else ! useGPU
1020 call obj%timer%start("blas")
1021 call precision_gemm('N', 'N', &
1022 int(l_rows, kind=blas_kind), &
1023 int(l_cols, kind=blas_kind), &
1024 int(nblk_mult_min, kind=blas_kind), one, &
1025 aux_a_full(1:l_rows, 1:nblk_mult_min), int(l_rows,kind=blas_kind), &
1026 aux_b_full(1:nblk_mult_min, 1:l_cols), int(nblk_mult_min,kind=blas_kind), beta, &
1027 c(1:l_rows, 1:l_cols), int(l_rows,kind=blas_kind))
1028 call obj%timer%stop("blas")
1029 endif ! useGPU
1030
1031 nvtx_range_pop("np = 0, np_rows-1")
1032 enddo ! np = 0, np_rows-1
1033
1034 call obj%timer%stop("main_loop_square_grid_nn_tt")
1035
1036#if !defined(DEVICE_POINTER)
1037 ! Put the result into C
1038 if (usegpu) then
1039 call obj%timer%start("gpu_memcpy")
1040 nvtx_range_push("gpu_memcpy: c_dev->c")
1041 num = l_rows*l_cols
1042#ifdef WITH_GPU_STREAMS
1043 call gpu_memcpy_async_and_stream_synchronize &
1044 ("elpa_pxgemm_multiply: c_dev -> c", c_dev, 0_c_intptr_t, c(1:l_rows,1:l_cols), &
1045 1, 1, num*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
1046#else
1047 successgpu = gpu_memcpy(int(loc(c),kind=c_intptr_t), c_dev, num*size_of_datatype, gpumemcpydevicetohost)
1048 check_memcpy_gpu("elpa_pxgemm_multiply: c_dev -> c", successgpu)
1049#endif
1050 nvtx_range_pop("gpu_memcpy: c_dev->c")
1051 call obj%timer%stop("gpu_memcpy")
1052 endif ! useGPU
1053#endif /* !defined(DEVICE_POINTER) */
1054
1055 if (a_transposed) then
1056 if (.not. useccl) then
1057 deallocate(at_col, stat=istat, errmsg=errormessage)
1058 call check_alloc("elpa_pxgemm_multiply", "at_col", istat, errormessage)
1059 endif
1060
1061 if (usegpu) then
1062 successgpu = gpu_free(at_col_dev)
1063 check_dealloc_gpu("elpa_pxgemm_multiply: at_col_dev", successgpu)
1064 endif
1065 endif
1066
1067 if (b_transposed) then
1068 if (.not. useccl) then
1069 deallocate(bt_row, stat=istat, errmsg=errormessage)
1070 call check_alloc("elpa_pxgemm_multiply", "bt_row", istat, errormessage)
1071 endif
1072
1073 if (usegpu) then
1074 successgpu = gpu_free(bt_row_dev)
1075 check_dealloc_gpu("elpa_pxgemm_multiply: bt_row_dev", successgpu)
1076 endif
1077 endif
1078
1079 if (a_transposed .or. b_transposed) then
1080 if (useccl) then
1081 successgpu = gpu_free(buf_send_dev)
1082 check_dealloc_gpu("elpa_pxgemm_multiply: buf_send_dev", successgpu)
1083
1084 successgpu = gpu_free(buf_recv_dev)
1085 check_dealloc_gpu("elpa_pxgemm_multiply: buf_recv_dev", successgpu)
1086
1087 successgpu = gpu_free(buf_self_dev)
1088 check_dealloc_gpu("elpa_pxgemm_multiply: buf_self_dev", successgpu)
1089 else
1090 deallocate(buf_send, buf_recv, buf_self, stat=istat, errmsg=errormessage)
1091 call check_alloc("elpa_pxgemm_multiply", "buf_send, buf_recv, buf_self", istat, errormessage)
1092
1093#if defined(DEVICE_POINTER)
1094 deallocate(a, b, stat=istat, errmsg=errormessage)
1095 call check_alloc("elpa_pxgemm_multiply", "a, b", istat, errormessage)
1096#endif
1097 endif
1098 endif ! (a_transposed .or. b_transposed)
1099
1100 endif ! ((.not. a_transposed) .and. (.not. b_transposed) .or. a_transposed .and. b_transposed)
1101
1102!_______________________________________________
1103
1104 endif ! isSquareGrid
1105
1106! _________________________________________________________________________________________________________________________________
1107
1108 if (.not. issquaregrid) then
1109
1110 lcm = least_common_multiple(np_rows, np_cols)*nblk
1111 nblk_mult_rows_max = (l_rows_max*np_rows+lcm-1)/lcm*nblk
1112 nblk_mult_cols_max = (l_cols_max*np_cols+lcm-1)/lcm*nblk
1113 nblk_mult = max(nblk_mult_rows_max, nblk_mult_cols_max)
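 ! The expressions for nblk_mult_rows_max/nblk_mult_cols_max above use integer ceiling division,
 ! (x + lcm - 1)/lcm = ceil(x/lcm), i.e. they compute ceil(l_rows_max*np_rows/lcm)*nblk and
 ! ceil(l_cols_max*np_cols/lcm)*nblk. With hypothetical values l_rows_max=100, np_rows=2,
 ! np_cols=3, nblk=16: lcm = 96 and nblk_mult_rows_max = (200+95)/96 * 16 = 3*16 = 48.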
1114
1115 if (wantdebug .and. myid==0) then ! PETERDEBUG
1116 print *, "LCM = ", lcm
1117 print *, "l_rows_max", l_rows_max
1118 print *, "l_cols_max", l_cols_max
1119 print *, "l_rows", l_rows
1120 print *, "l_cols", l_cols
1121 print *, "np_rows", np_rows
1122 print *, "np_cols", np_cols
1123 print *, "nblk_mult = ", nblk_mult
1124 endif
1125
1126 np_rows_fine = least_common_multiple(np_rows, np_cols)
1127 np_cols_fine = np_rows_fine
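! For a non-square grid the algorithm works on a virtual ("fine") square grid of size
! lcm(np_rows,np_cols) x lcm(np_rows,np_cols); each fine index is mapped back to a real process
! via np = mod(np_fine, np_rows) and np_bc = mod(np_bc_fine, np_cols) further below.
! With hypothetical values np_rows=2, np_cols=3: np_rows_fine = 6, and fine index 4 maps to
! process row mod(4,2)=0 and process column mod(4,3)=1.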
1128
1129! ___________________________________________________________________
1130
1131 if ((.not. a_transposed) .and. (.not. b_transposed) .or. &
1132 a_transposed .and. b_transposed) then
1133 if (wantdebug .and. myid==0) print *, "elpa_pxgemm_multiply NEW: NON-SQUARE_GRID start: NN or TT or universal" ! PETERDEBUG
1134
1135 allocate(aux_a_full(l_rows, nblk_mult), stat=istat, errmsg=errormessage)
1136 check_allocate("elpa_pxgemm_multiply: aux_a_full", istat, errormessage)
1137
1138 allocate(aux_b_full(nblk_mult, l_cols), stat=istat, errmsg=errormessage)
1139 check_allocate("elpa_pxgemm_multiply: aux_b_full", istat, errormessage)
1140
1141 if (usegpu) then
1142 successgpu = gpu_malloc(aux_a_full_dev, l_rows*nblk_mult*size_of_datatype)
1143 check_alloc_gpu("elpa_pxgemm_multiply: aux_a_full_dev", successgpu)
1144
1145 successgpu = gpu_malloc(aux_b_full_dev, nblk_mult*l_cols*size_of_datatype)
1146 check_alloc_gpu("elpa_pxgemm_multiply: aux_b_full_dev", successgpu)
1147 endif
1148
1149 if (usegpu .and. l_rows /= ldc) then
1150 print *, "elpa_pxgemm_multiply: Error: case ldc != lda is not implemented yet for NN and TT on GPU"
1151 stop 1
1152 endif
1153
1154 if (a_transposed) then
1155 ! PETERDEBUG: this is as memory-consuming as the whole matrix. Can we do something about it?
1156 if (.not. useccl) then
1157 allocate(at_col(l_rows, l_cols), stat=istat, errmsg=errormessage) ! l_rows*nblk_mult as aux_a_full (maybe they are the same??)
1158 check_allocate("elpa_pxgemm_multiply: at_col", istat, errormessage)
1159 ! PETERDEBUG: it seems we can't do much about it. Consider a 2x2 grid:
1160 ! process (0,1) sends data to process (1,0), hence the latter has to hold the whole local matrix.
1161 ! Of course, process (1,1) doesn't need the space for the matrix, but that's not significant
1162 ! for load-balance considerations
1163 endif
1164
1165 if (usegpu) then
1166 ! PETERDEBUG: this is as memory-consuming as the whole matrix. Can we do something about it?
1167 successgpu = gpu_malloc(at_col_dev, l_rows*l_cols*size_of_datatype) ! l_rows*nblk_mult as aux_a_full_dev
1168 check_alloc_gpu("elpa_pxgemm_multiply: at_col_dev", successgpu)
1169 endif
1170 endif
1171
1172 if (b_transposed) then
1173 if (.not. useccl) then
1174 ! PETERDEBUG: this is as memory-consuming as the whole matrix. Can we do something about it?
1175 allocate(bt_row(l_rows, l_cols), stat=istat, errmsg=errormessage)
1176 check_allocate("elpa_pxgemm_multiply: bt_row", istat, errormessage)
1177 endif
1178
1179 if (usegpu) then
1180 ! PETERDEBUG: this is as memory-consuming as the whole matrix. Can we do something about it?
1181 successgpu = gpu_malloc(bt_row_dev, l_rows*l_cols*size_of_datatype)
1182 check_alloc_gpu("elpa_pxgemm_multiply: bt_row_dev", successgpu)
1183 endif
1184 endif
1185
1186 if (a_transposed .or. b_transposed) then
1187 ! max nblk_mult_rows, nblk_mult_cols is achieved at the 0-th MPI process
1188 nblk_mult_rows_max = find_nblk_mult_dirs(l_rows_max, nblk, np_rows, 0, lcm)
1189 nblk_mult_cols_max = find_nblk_mult_dirs(l_cols_max, nblk, np_cols, 0, lcm)
1190
1191 if (useccl) then
1192 successgpu = gpu_malloc(buf_send_dev, nblk_mult_rows_max*nblk_mult_cols_max*size_of_datatype)
1193 check_alloc_gpu("elpa_pxgemm_multiply: buf_send_dev", successgpu)
1194
1195 successgpu = gpu_malloc(buf_recv_dev, nblk_mult_rows_max*nblk_mult_cols_max*size_of_datatype)
1196 check_alloc_gpu("elpa_pxgemm_multiply: buf_recv_dev", successgpu)
1197
1198 successgpu = gpu_malloc(buf_self_dev, nblk_mult_rows_max*nblk_mult_cols_max*size_of_datatype)
1199 check_alloc_gpu("elpa_pxgemm_multiply: buf_self_dev", successgpu)
1200 else ! useCCL
1201 allocate(buf_send(nblk_mult_rows_max, nblk_mult_cols_max), stat=istat, errmsg=errormessage)
1202 check_allocate("elpa_pxgemm_multiply: buf_send", istat, errormessage)
1203
1204 allocate(buf_recv(nblk_mult_rows_max, nblk_mult_cols_max), stat=istat, errmsg=errormessage)
1205 check_allocate("elpa_pxgemm_multiply: buf_recv", istat, errormessage)
1206
1207 allocate(buf_self(nblk_mult_rows_max, nblk_mult_cols_max), stat=istat, errmsg=errormessage)
1208 check_allocate("elpa_pxgemm_multiply: buf_self", istat, errormessage)
1209
1210#if defined(DEVICE_POINTER)
1211 allocate(a(obj%local_nrows, obj%local_ncols), stat=istat, errmsg=errormessage)
1212 check_allocate("elpa_pxgemm_multiply: a", istat, errormessage)
1213
1214 allocate(b(ldb, ldbcols), stat=istat, errmsg=errormessage)
1215 check_allocate("elpa_pxgemm_multiply: b", istat, errormessage)
1216
1217 successgpu = gpu_memcpy(int(loc(a),kind=c_intptr_t), a_dev, &
1218 obj%local_nrows*obj%local_ncols*size_of_datatype, gpumemcpydevicetohost)
1219 check_memcpy_gpu("elpa_pxgemm_multiply: a_dev -> a", successgpu)
1220
1221 successgpu = gpu_memcpy(int(loc(b),kind=c_intptr_t), b_dev, &
1222 ldb*ldbcols*size_of_datatype, gpumemcpydevicetohost)
1223 check_memcpy_gpu("elpa_pxgemm_multiply: b_dev -> b", successgpu)
1224#endif /* defined(DEVICE_POINTER) */
1225 endif ! useCCL
1226 endif
1227
1228 call obj%timer%start("main_loop_nn_tt")
1229
1230 ! main loop: iterate through np_fine, which are "virtual" process rows for matrix A and process cols for matrix B
1231 do np_fine = 0, np_rows_fine-1 ! np_rows_fine=np_cols_fine
1232 nvtx_range_push("np_fine = 0, np_rows_fine-1")
1233 if (wantdebug .and. myid==0) print *, "np_fine = ", np_fine ! PETERDEBUG
1234
1235 ! In this turn, procs of row np_fine assemble the result
1236
1237 np_bc_fine=np_fine ! the np_fine that possesses the given column of a
1238
1239 np = mod(np_fine, np_rows)
1240 np_bc = mod(np_bc_fine, np_cols)
1241
1242 nblk_mult_rows = find_nblk_mult_dirs(l_rows, nblk, np_rows, np_fine , lcm)
1243 nblk_mult_cols = find_nblk_mult_dirs(l_cols, nblk, np_cols, np_bc_fine, lcm)
1244
1245 if (a_transposed) then
1246 if (useccl) then
1247 call elpa_transpose_row_or_col_ccl_&
1248 &math_datatype&
1249 &_&
1250 &precision&
1251 (obj, 'R', a_dev, at_col_dev, buf_send_dev, buf_recv_dev, buf_self_dev, np_fine, l_rows, l_cols, &
1252 nblk_mult_rows_max, nblk_mult_cols_max, debug)
1253 else
1254 call elpa_transpose_row_or_col_&
1255 &math_datatype&
1256 &_&
1257 &precision&
1258 (obj, 'R', a, at_col, buf_send, buf_recv, buf_self, np_fine, l_rows, l_cols, &
1259 nblk_mult_rows_max, nblk_mult_cols_max, debug)
1260 endif
1261 endif
1262
1263 if (b_transposed) then
1264 if (useccl) then
1265 call elpa_transpose_row_or_col_ccl_&
1266 &math_datatype&
1267 &_&
1268 &precision&
1269 (obj, 'C', b_dev, bt_row_dev, buf_send_dev, buf_recv_dev, buf_self_dev, np_fine, l_rows, l_cols, &
1270 nblk_mult_rows_max, nblk_mult_cols_max, debug)
1271 else
1272 call elpa_transpose_row_or_col_&
1273 &math_datatype&
1274 &_&
1275 &precision&
1276 (obj, 'C', b, bt_row, buf_send, buf_recv, buf_self, np_fine, l_rows, l_cols, &
1277 nblk_mult_rows_max, nblk_mult_cols_max, debug)
1278 endif
1279 endif
1280
1281 if (mod(np_bc_fine,np_cols) == my_pcol) then
1282 if (usegpu) then
1283 if (a_transposed) then
1284 if (.not. useccl) then
1285 num = l_rows*l_cols
1286#ifdef WITH_GPU_STREAMS
1287 call gpu_memcpy_async_and_stream_synchronize &
1288 ("elpa_pxgemm_multiply: at_col -> at_col_dev", at_col_dev, 0_c_intptr_t, at_col(1:l_rows,1:l_cols), &
1289 1, 1, num*size_of_datatype, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
1290#else
1291 successgpu = gpu_memcpy(at_col_dev, int(loc(at_col),kind=c_intptr_t), num*size_of_datatype, gpumemcpyhosttodevice)
1292 check_memcpy_gpu("elpa_pxgemm_multiply: at_col -> at_col_dev", successgpu)
1293#endif
1294 endif ! (.not. useCCL)
1295
1296 call gpu_copy_and_set_zeros_aux_a_full(precision_char, at_col_dev, aux_a_full_dev, l_rows, l_cols, &
1297 nblk_mult_cols, nblk, np_bc_fine, np_cols_fine, np_cols, debug, my_stream)
1298 else ! (a_transposed .and. b_transposed)
1299 call gpu_copy_and_set_zeros_aux_a_full(precision_char, a_dev , aux_a_full_dev, l_rows, l_cols, &
1300 nblk_mult_cols, nblk, np_bc_fine, np_cols_fine, np_cols, debug, my_stream)
1301 endif ! a_transposed
1302 else ! useGPU
1303 do j_block_loc_fine = 0, (nblk_mult_cols+nblk-1)/nblk - 1
1304 nblk_cut = min(nblk, nblk_mult_cols-j_block_loc_fine*nblk)
1305 j_block_loc = (np_bc_fine + j_block_loc_fine*np_cols_fine)/np_cols
1306 if (a_transposed) then
1307 aux_a_full(1:l_rows, 1+j_block_loc_fine*nblk: nblk_cut+j_block_loc_fine*nblk) = &
1308 at_col(1:l_rows, 1+j_block_loc*nblk : nblk_cut+j_block_loc*nblk)
1309 else ! a_transposed
1310 aux_a_full(1:l_rows, 1+j_block_loc_fine*nblk: nblk_cut+j_block_loc_fine*nblk) = &
1311 a(1:l_rows, 1+j_block_loc*nblk : nblk_cut+j_block_loc*nblk)
1312 endif ! a_transposed
1313 enddo ! j_block_loc_fine
1314 endif ! useGPU
1315 endif ! (mod(np_bc_fine,np_cols) == my_pcol)
1316
1317 if (mod(np_fine,np_rows) == my_prow) then
1318 if (usegpu) then
1319 if (b_transposed) then
1320 if (.not. useccl) then
1321 num = l_rows*l_cols
1322#ifdef WITH_GPU_STREAMS
1323 call gpu_memcpy_async_and_stream_synchronize &
1324 ("elpa_pxgemm_multiply: bt_row -> bt_row_dev", bt_row_dev, 0_c_intptr_t, bt_row(1:l_rows,1:l_cols), &
1325 1, 1, num*size_of_datatype, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
1326#else
1327 successgpu = gpu_memcpy(bt_row_dev, int(loc(bt_row),kind=c_intptr_t), num*size_of_datatype, gpumemcpyhosttodevice)
1328 check_memcpy_gpu("elpa_pxgemm_multiply: bt_row -> bt_row_dev", successgpu)
1329#endif
1330 endif ! (.not. useCCL)
1331
1332 call gpu_copy_and_set_zeros_aux_b_full(precision_char, bt_row_dev, aux_b_full_dev, l_rows, l_cols, nblk_mult, &
1333 nblk_mult_rows, nblk, np_fine, np_rows_fine, np_rows, sm_count, debug, my_stream)
1334 else ! b_transposed
1335 call gpu_copy_and_set_zeros_aux_b_full(precision_char, b_dev, aux_b_full_dev, l_rows, l_cols, nblk_mult, &
1336 nblk_mult_rows, nblk, np_fine, np_rows_fine, np_rows, sm_count, debug, my_stream)
1337 endif ! b_transposed
1338 else ! useGPU
1339 do i_block_loc_fine = 0, (nblk_mult_rows+nblk-1)/nblk - 1
1340 nblk_cut = min(nblk, nblk_mult_rows-i_block_loc_fine*nblk)
1341 i_block_loc = (np_fine + i_block_loc_fine*np_rows_fine)/np_rows
1342 if (b_transposed) then
1343 aux_b_full(1+i_block_loc_fine*nblk: nblk_cut+i_block_loc_fine*nblk, 1:l_cols) = &
1344 bt_row(1+i_block_loc*nblk : nblk_cut+i_block_loc*nblk , 1:l_cols)
1345 else ! b_transposed
1346 aux_b_full(1+i_block_loc_fine*nblk: nblk_cut+i_block_loc_fine*nblk, 1:l_cols) = &
1347 b(1+i_block_loc*nblk : nblk_cut+i_block_loc*nblk , 1:l_cols)
1348 endif ! b_transposed
1349 enddo ! i_block_loc_fine
1350 endif ! useGPU
1351 endif ! (mod(np_fine,np_rows) == my_prow)
1352
1353
1354 ! copy data to host for bcast, if needed
1355 num_a = l_rows*nblk_mult
1356 num_b = nblk_mult*l_cols
1357 if (usegpu .and. .not. useccl) then
1358#ifdef WITH_GPU_STREAMS
1359 call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_a_full_dev -> aux_a_full", &
1360 aux_a_full_dev, 0_c_intptr_t, aux_a_full(1:l_rows,1:nblk_mult), &
1361 1, 1, num_a*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .false., .false.)
1362 call gpu_memcpy_async_and_stream_synchronize ("elpa_pxgemm_multiply: aux_b_full_dev -> aux_b_full", &
1363 aux_b_full_dev, 0_c_intptr_t, aux_b_full(1:nblk_mult,1:l_cols), &
1364 1, 1, num_b*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
1365#else
1366 successgpu = gpu_memcpy(int(loc(aux_a_full),kind=c_intptr_t), aux_a_full_dev, num_a*size_of_datatype, &
1367 gpumemcpydevicetohost)
1368 check_memcpy_gpu("elpa_pxgemm_multiply: aux_a_full_dev -> aux_a_full", successgpu)
1369
1370 successgpu = gpu_memcpy(int(loc(aux_b_full),kind=c_intptr_t), aux_b_full_dev, num_b*size_of_datatype, &
1371 gpumemcpydevicetohost)
1372 check_memcpy_gpu("elpa_pxgemm_multiply: aux_b_full_dev -> aux_b_full", successgpu)
1373#endif
1374 endif ! (useGPU .and. .not. useCCL)
1375
1376#ifdef WITH_MPI
1377      ! Broadcast the aux_a panel within the column communicator (root np_bc) and the aux_b panel within the row communicator (root np)
1378 if (useccl) then
1379 call obj%timer%start("ccl_bcast")
1380 nvtx_range_push("ccl_bcast aux_a_full_dev, aux_b_full_dev")
1381
1382 successgpu = ccl_bcast(aux_a_full_dev, aux_a_full_dev, int(k_datatype*l_rows*nblk_mult,kind=c_size_t), ccldatatype, &
1383 int(np_bc,kind=c_int), ccl_comm_cols, my_stream)
1384
1385 successgpu2 = ccl_bcast(aux_b_full_dev, aux_b_full_dev, int(k_datatype*nblk_mult*l_cols,kind=c_size_t), ccldatatype, &
1386 int(np ,kind=c_int), ccl_comm_rows, my_stream)
1387
1388 if (.not. (successgpu .and. successgpu2)) then
1389 print *,"Error in ccl_bcast"
1390 stop 1
1391 endif
1392
1393 call mpi_bcast(nblk_mult_rows, 1_mpi_kind, mpi_integer, &
1394 int(np ,kind=mpi_kind), int(mpi_comm_rows,kind=mpi_kind), mpierr)
1395
1396 successgpu = gpu_stream_synchronize(my_stream)
1397 check_stream_synchronize_gpu("elpa_pxgemm_multiply: ccl_bcast", successgpu)
1398
1399 nvtx_range_pop("ccl_bcast aux_a_full_dev, aux_b_full_dev")
1400 call obj%timer%stop("ccl_bcast")
1401 else ! useCCL
1402 call obj%timer%start("mpi_communication")
1403 nvtx_range_push("MPI_Bcast: aux_a_full, aux_b_full")
1404
1405 call mpi_bcast(aux_a_full, int(l_rows*nblk_mult,kind=mpi_kind), mpi_math_datatype_precision, &
1406 int(np_bc,kind=mpi_kind), int(mpi_comm_cols,kind=mpi_kind), mpierr)
1407
1408 call mpi_bcast(aux_b_full, int(nblk_mult*l_cols,kind=mpi_kind), mpi_math_datatype_precision, &
1409 int(np ,kind=mpi_kind), int(mpi_comm_rows,kind=mpi_kind), mpierr)
1410 call mpi_bcast(nblk_mult_rows, 1_mpi_kind, mpi_integer, &
1411 int(np ,kind=mpi_kind), int(mpi_comm_rows,kind=mpi_kind), mpierr)
1412
1413 nvtx_range_pop("MPI_Bcast: aux_a_full, aux_b_full")
1414 call obj%timer%stop("mpi_communication")
1415 endif ! useCCL
1416
1417
1418 ! copy data back to device, if needed
1419 if (usegpu .and. .not. useccl) then
1420 num_a = l_rows*nblk_mult
1421 num_b = nblk_mult*l_cols
1422#ifdef WITH_GPU_STREAMS
1423 call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_a_full -> aux_a_full_dev", &
1424 aux_a_full_dev, 0_c_intptr_t, aux_a_full(1:l_rows,1:nblk_mult), &
1425 1, 1, num_a*size_of_datatype, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
1426 call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_b_full -> aux_b_full_dev", &
1427 aux_b_full_dev, 0_c_intptr_t, aux_b_full(1:nblk_mult,1:l_cols), &
1428 1, 1, num_b*size_of_datatype, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
1429#else
1430 successgpu = gpu_memcpy(aux_a_full_dev,int(loc(aux_a_full),kind=c_intptr_t), num_a*size_of_datatype, &
1431 gpumemcpyhosttodevice)
1432 check_memcpy_gpu("elpa_pxgemm_multiply: aux_a_full -> aux_a_full_dev", successgpu)
1433
1434 successgpu = gpu_memcpy(aux_b_full_dev,int(loc(aux_b_full),kind=c_intptr_t), num_b*size_of_datatype, &
1435 gpumemcpyhosttodevice)
1436 check_memcpy_gpu("elpa_pxgemm_multiply: aux_b_full -> aux_b_full_dev", successgpu)
1437#endif
1438 endif ! (useGPU .and. .not. useCCL)
1439#endif /* WITH_MPI */
1440
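          ! Accumulation over the fine process rows: the first pass (np_fine = 0) overwrites the
          ! local block of C (beta = 0), every later pass adds its contribution (beta = 1).
          ! Each pass multiplies the first nblk_mult_rows columns of aux_a_full with the first
          ! nblk_mult_rows rows of aux_b_full.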
1441 beta = zero
1442 if (np_fine>0) beta = one
1443 if (usegpu) then
1444 call obj%timer%start("gpublas")
1445 nvtx_range_push("gpublas")
1446 call gpublas_precision_gemm('N', 'N', &
1447 l_rows, l_cols, nblk_mult_rows, one, &
1448 aux_a_full_dev, l_rows, &
1449 aux_b_full_dev, nblk_mult, beta, &
1450 c_dev , l_rows, gpuhandle)
1451 if (wantdebug) successgpu = gpu_devicesynchronize()
1452 nvtx_range_pop("gpublas")
1453 call obj%timer%stop("gpublas")
1454 else ! useGPU
1455 call obj%timer%start("blas")
1456 call precision_gemm('N', 'N', &
1457 int(l_rows, kind=blas_kind), &
1458 int(l_cols, kind=blas_kind), &
1459 int(nblk_mult_rows, kind=blas_kind), one, &
1460 aux_a_full(1:l_rows, 1:nblk_mult), int(l_rows,kind=blas_kind), &
1461 aux_b_full(1:nblk_mult, 1:l_cols), int(nblk_mult,kind=blas_kind), beta, &
1462 c(1:l_rows, 1:l_cols) , int(l_rows,kind=blas_kind))
1463 call obj%timer%stop("blas")
1464 endif ! useGPU
1465
1466 nvtx_range_pop("np_fine = 0, np_rows_fine-1")
1467 enddo ! np_fine = 0, np_rows_fine-1
1468
1469 call obj%timer%stop("main_loop_nn_tt")
1470
1471#if !defined(DEVICE_POINTER)
1472 ! Put the result into C
1473 if (usegpu) then
1474 call obj%timer%start("gpu_memcpy")
1475 nvtx_range_push("gpu_memcpy: c_dev->c")
1476 num = l_rows*l_cols
1477#ifdef WITH_GPU_STREAMS
1478 call gpu_memcpy_async_and_stream_synchronize &
1479 ("elpa_pxgemm_multiply: c_dev -> c", c_dev, 0_c_intptr_t, c(1:l_rows,1:l_cols), &
1480 1, 1, num*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
1481#else
1482 successgpu = gpu_memcpy(int(loc(c),kind=c_intptr_t), c_dev, num*size_of_datatype, gpumemcpydevicetohost)
1483 check_memcpy_gpu("elpa_pxgemm_multiply: c_dev -> c", successgpu)
1484#endif
1485 nvtx_range_pop("gpu_memcpy: c_dev->c")
1486 call obj%timer%stop("gpu_memcpy")
1487 endif ! useGPU
1488#endif /* DEVICE_POINTER */
1489
1490
1491 if (a_transposed) then
1492 if (.not. useccl) then
1493 deallocate(at_col, stat=istat, errmsg=errormessage)
1494 call check_alloc("elpa_pxgemm_multiply", "at_col", istat, errormessage)
1495 endif
1496
1497 if (usegpu) then
1498 successgpu = gpu_free(at_col_dev)
1499 check_dealloc_gpu("elpa_pxgemm_multiply: at_col_dev", successgpu)
1500 endif
1501 endif
1502
1503 if (b_transposed) then
1504 if (.not. useccl) then
1505 deallocate(bt_row, stat=istat, errmsg=errormessage)
1506 call check_alloc("elpa_pxgemm_multiply", "bt_row", istat, errormessage)
1507 endif
1508
1509 if (usegpu) then
1510 successgpu = gpu_free(bt_row_dev)
1511 check_dealloc_gpu("elpa_pxgemm_multiply: bt_row_dev", successgpu)
1512 endif
1513 endif
1514
1515 if (a_transposed .or. b_transposed) then
1516 if (useccl) then
1517 successgpu = gpu_free(buf_send_dev)
1518 check_dealloc_gpu("elpa_pxgemm_multiply: buf_send_dev", successgpu)
1519
1520 successgpu = gpu_free(buf_recv_dev)
1521 check_dealloc_gpu("elpa_pxgemm_multiply: buf_recv_dev", successgpu)
1522
1523 successgpu = gpu_free(buf_self_dev)
1524 check_dealloc_gpu("elpa_pxgemm_multiply: buf_self_dev", successgpu)
1525 else
1526 deallocate(buf_send, buf_recv, buf_self, stat=istat, errmsg=errormessage)
1527 call check_alloc("elpa_pxgemm_multiply", "buf_send, buf_recv, buf_self", istat, errormessage)
1528
1529#if defined(DEVICE_POINTER)
1530 deallocate(a, b, stat=istat, errmsg=errormessage)
1531 call check_alloc("elpa_pxgemm_multiply", "a, b", istat, errormessage)
1532#endif
1533 endif
1534 endif ! (a_transposed .or. b_transposed)
1535
1536 endif ! universal
1537
1538! ___________________________________________________________________
1539    if ((.not. a_transposed .and. b_transposed) .or. &
1540        (a_transposed .and. .not. b_transposed)) then
1541 if (wantdebug .and. myid==0) print *, "elpa_pxgemm_multiply NEW: NON-SQUARE_GRID start: TN or NT" ! PETERDEBUG
1542
1543 ! dir = row/col for TN/NT
1544 if (a_transposed) then
1545 my_pdir = my_prow
1546 my_pdir_t = my_pcol
1547 np_dirs = np_rows
1548 np_dirs_t = np_cols
1549 mpi_comm_dirs = mpi_comm_rows
1550 ccl_comm_dirs = obj%gpu_setup%ccl_comm_rows
1551 else if (b_transposed) then
1552 my_pdir = my_pcol
1553 my_pdir_t = my_prow
1554 np_dirs = np_cols
1555 np_dirs_t = np_rows
1556 mpi_comm_dirs = mpi_comm_cols
1557 ccl_comm_dirs = obj%gpu_setup%ccl_comm_cols
1558 endif
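    ! For TN the assembly direction runs over the process rows (A is the transposed operand),
    ! for NT over the process columns; the *_t quantities always refer to the perpendicular direction.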
1559
1560 np_dirs_fine = least_common_multiple(np_rows, np_cols)
1561
1562 nblk_mult_rows = nblk_mult
1563 nblk_mult_cols = nblk_mult
1564
1565 nblk_mult_max = (na+lcm-1)/lcm * nblk
1566 !nblk_mult_cols_max = (na+LCM-1)/LCM * nblk
1567 !nblk_mult_rows_max = (na+LCM-1)/LCM * nblk
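    ! Illustration (example values, not taken from the code): for np_rows = 2 and np_cols = 3 the
    ! LCM is 6, so np_dirs_fine = 6 virtual directions are distributed over the np_dirs physical
    ! rows/cols and each physical process serves np_dirs_fine/np_dirs of them;
    ! nblk_mult_max = ceil(na/LCM)*nblk bounds the panel width contributed per fine direction.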
1568
1569 if (a_transposed) then
1570 allocate(aux_a_full(nblk_mult, nblk_mult), stat=istat, errmsg=errormessage)
1571 check_allocate("elpa_pxgemm_multiply: aux_a_full", istat, errormessage)
1572
1573 allocate(aux_b_full(nblk_mult, nblk_mult_max*(np_dirs_fine/np_dirs_t)), stat=istat, errmsg=errormessage)
1574 check_allocate("elpa_pxgemm_multiply: aux_b_full", istat, errormessage)
1575
1576 allocate(tmp1_full(nblk_mult, nblk_mult_max*(np_dirs_fine/np_dirs_t)), stat=istat, errmsg=errormessage)
1577 check_allocate("elpa_pxgemm_multiply: tmp1_full", istat, errormessage)
1578
1579 if (usegpu) then
1580 successgpu = gpu_malloc(aux_a_full_dev, nblk_mult*nblk_mult*size_of_datatype)
1581 check_alloc_gpu("elpa_pxgemm_multiply: aux_a_full_dev", successgpu)
1582
1583 successgpu = gpu_malloc(aux_b_full_dev, nblk_mult*nblk_mult_max*(np_dirs_fine/np_dirs_t)*size_of_datatype)
1584 check_alloc_gpu("elpa_pxgemm_multiply: aux_b_full_dev", successgpu)
1585
1586 successgpu = gpu_malloc(tmp1_full_dev , nblk_mult*nblk_mult_max*(np_dirs_fine/np_dirs_t)*size_of_datatype)
1587 check_alloc_gpu("elpa_pxgemm_multiply: tmp1_full_dev", successgpu)
1588 endif ! useGPU
1589 else if (b_transposed) then
1590 allocate(aux_a_full(nblk_mult_max*(np_dirs_fine/np_dirs_t), nblk_mult), stat=istat, errmsg=errormessage)
1591 check_allocate("elpa_pxgemm_multiply: aux_a_full", istat, errormessage)
1592
1593 allocate(aux_b_full(nblk_mult, nblk_mult), stat=istat, errmsg=errormessage)
1594 check_allocate("elpa_pxgemm_multiply: aux_b_full", istat, errormessage)
1595
1596 allocate(tmp1_full(nblk_mult_max*(np_dirs_fine/np_dirs_t), nblk_mult), stat=istat, errmsg=errormessage)
1597 check_allocate("elpa_pxgemm_multiply: tmp1_full", istat, errormessage)
1598 if (usegpu) then
1599 successgpu = gpu_malloc(aux_a_full_dev, nblk_mult_max*(np_dirs_fine/np_dirs_t)*nblk_mult*size_of_datatype)
1600 check_alloc_gpu("elpa_pxgemm_multiply: aux_a_full_dev", successgpu)
1601
1602 successgpu = gpu_malloc(aux_b_full_dev, nblk_mult*nblk_mult*size_of_datatype)
1603 check_alloc_gpu("elpa_pxgemm_multiply: aux_b_full_dev", successgpu)
1604
1605 successgpu = gpu_malloc(tmp1_full_dev , nblk_mult_max*(np_dirs_fine/np_dirs_t)*nblk_mult*size_of_datatype)
1606 check_alloc_gpu("elpa_pxgemm_multiply: tmp1_full_dev", successgpu)
1607 endif ! useGPU
1608 endif ! b_transposed
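    ! Buffer shapes (as allocated above): for TN, aux_a_full holds the local nblk_mult x nblk_mult
    ! panel of A, while aux_b_full lines up np_dirs_fine/np_dirs_t fine panels of B side by side,
    ! each at most nblk_mult_max columns wide; for NT the roles of the two buffers are mirrored.
    ! tmp1_full receives the corresponding local GEMM result before the reduction.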
1609
1610 call obj%timer%start("main_loop_tn_nt")
1611
1612    ! main loop: build up the result matrix C, looping over the (virtual) fine process rows/cols for TN/NT
1613 do np_fine = 0, np_dirs_fine-1
1614 nvtx_range_push("do np_fine")
1615 if (wantdebug .and. myid==0) print *, "np_fine = ", np_fine ! PETERDEBUG
1616
1617        ! In this iteration, the processes of row/col np assemble the result for the TN/NT case
1618
1619        np_t_fine=np_fine ! np that possesses the given "col of a"/"row of b" for TN/NT
1620
1621 !np = mod(np_fine, np_rows)
1622 !np_t = mod(np_t_fine, np_cols)
1623
1624 np = mod(np_fine, np_dirs)
1625 np_t = mod(np_t_fine, np_dirs_t)
1626
1627        ! For non-square grids we work on LCMxLCM blocks.
1628        ! One such block consists of an nblk x nblk block within the first LCMxLCM block together with all its copies in the other LCMxLCM blocks.
1629 ! TN: loop over fine rows of a
1630 ! NT: loop over fine cols of b
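          ! Block-index bookkeeping used below: on the fine grid, fine process np owns the global
          ! blocks g = np + k*np_dirs_fine (k = 0, 1, ...); in the physical block-cyclic layout the
          ! same global block g sits at local block index g/np_dirs on process mod(g,np_dirs).
          ! Hence, e.g., i_block_loc = (np_ab_fine + i_block_loc_fine*np_rows_fine)/np_rows converts
          ! a fine-local block index into the corresponding physical-local block index.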
1631 do np_ab_fine = 0, np_dirs_fine-1
1632 !do dnp_ab = 0, np_dirs_fine/np_dirs-1
1633
1634 ! consider only the fine rows/cols that belong to the current process
1635 if (mod(np_ab_fine,np_dirs)/=my_pdir) cycle
1636
1637 if (usegpu) then
1638 call obj%timer%start("gpu_copy_and_set_zeros_aux_ab_full_tn")
1639 call gpu_copy_and_set_zeros_aux_ab_full_tn_nt (precision_char, a_transposed_int, &
1640 a_dev, b_dev, aux_a_full_dev, aux_b_full_dev, &
1641 l_rows, l_cols, nblk_mult_max, nblk_mult, nblk, &
1642 np_ab_fine, np_rows, my_prow, &
1643 np_t_fine , np_cols, my_pcol, &
1644 np_dirs_fine, sm_count, debug, my_stream)
1645 call obj%timer%stop("gpu_copy_and_set_zeros_aux_ab_full_tn")
1646 else ! useGPU
1647 call obj%timer%start("copy_and_set_zeros_aux_ab_full_tn")
1648
1649 if (a_transposed) then
1650 if (mod(np_t_fine,np_cols) == my_pcol) then
1651 do j_block_loc_fine = 0, nblk_mult_max/nblk-1
1652 j_block_loc = (np_t_fine + j_block_loc_fine*np_cols_fine)/np_cols
1653
1654 do i_block_loc_fine = 0, nblk_mult/nblk-1
1655 i_block_loc = (np_ab_fine + i_block_loc_fine*np_rows_fine)/np_rows
1656
1657 nblk_cols_cut = min(nblk, l_cols - j_block_loc*nblk)
1658 nblk_rows_cut = min(nblk, l_rows - i_block_loc*nblk)
1659
1660 if (nblk_rows_cut>0 .and. nblk_cols_cut>0) then
1661 aux_a_full(1+i_block_loc_fine*nblk : nblk_rows_cut+i_block_loc_fine*nblk, &
1662 1+j_block_loc_fine*nblk : nblk_cols_cut+j_block_loc_fine*nblk) = &
1663 a(1+i_block_loc*nblk : nblk_rows_cut+i_block_loc*nblk, &
1664 1+j_block_loc*nblk : nblk_cols_cut+j_block_loc*nblk)
1665 endif
1666
1667 call set_zeros_in_unused_block_part_&
1668 &math_datatype&
1669 &_&
1670 &precision &
1671 (aux_a_full, nblk, nblk_rows_cut, nblk_cols_cut, &
1672 i_block_loc_fine, j_block_loc_fine, 0, 0)
1673
1674
1675 enddo ! i_block_loc_fine
1676 enddo ! j_block_loc_fine
1677 endif ! (mod(np_t_fine,np_cols) == my_pcol)
1678
1679 do dnp_ab_t = 0, np_dirs_fine/np_dirs_t-1
1680 np_ab_t_fine = dnp_ab_t*np_dirs_t + my_pdir_t
1681
1682 do j_block_loc_fine = 0, nblk_mult_max/nblk-1
1683 j_block_loc = (np_ab_t_fine + j_block_loc_fine*np_cols_fine)/np_cols
1684
1685 do i_block_loc_fine = 0, nblk_mult/nblk-1
1686 i_block_loc = (np_ab_fine + i_block_loc_fine*np_rows_fine)/np_rows
1687
1688 nblk_rows_cut = min(nblk, l_rows - i_block_loc*nblk)
1689 nblk_cols_cut = min(nblk, l_cols - j_block_loc*nblk)
1690
1691 if (nblk_rows_cut>0 .and. nblk_cols_cut>0) then
1692 aux_b_full(1+i_block_loc_fine*nblk : nblk_rows_cut+i_block_loc_fine*nblk, &
1693 1 +j_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max : &
1694 nblk_cols_cut+j_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max) = &
1695 b(1+i_block_loc*nblk :nblk_rows_cut+i_block_loc*nblk, &
1696 1+j_block_loc*nblk :nblk_cols_cut+j_block_loc*nblk)
1697 endif
1698
1699 call set_zeros_in_unused_block_part_&
1700 &math_datatype&
1701 &_&
1702 &precision &
1703 (aux_b_full, nblk, nblk_rows_cut, nblk_cols_cut, &
1704 i_block_loc_fine, j_block_loc_fine, 0, dnp_ab_t*nblk_mult_max)
1705
1706 enddo ! i_block_loc_fine
1707 enddo ! j_block_loc_fine
1708 enddo ! np_ab_t_fine
1709 else if (b_transposed) then
1710
1711 if (mod(np_t_fine, np_rows) == my_prow) then
1712 do i_block_loc_fine = 0, nblk_mult_max/nblk-1
1713 i_block_loc = (np_t_fine + i_block_loc_fine*np_rows_fine)/np_rows
1714
1715 do j_block_loc_fine = 0, nblk_mult/nblk-1
1716 j_block_loc = (np_ab_fine + j_block_loc_fine*np_cols_fine)/np_cols
1717
1718 nblk_rows_cut = min(nblk, l_rows - i_block_loc*nblk)
1719 nblk_cols_cut = min(nblk, l_cols - j_block_loc*nblk)
1720
1721 if (nblk_rows_cut>0 .and. nblk_cols_cut>0) then
1722 aux_b_full(1+i_block_loc_fine*nblk : nblk_rows_cut+i_block_loc_fine*nblk, &
1723 1+j_block_loc_fine*nblk : nblk_cols_cut+j_block_loc_fine*nblk) = &
1724 b(1+i_block_loc*nblk : nblk_rows_cut+i_block_loc*nblk, &
1725 1+j_block_loc*nblk : nblk_cols_cut+j_block_loc*nblk)
1726 endif
1727
1728 call set_zeros_in_unused_block_part_&
1729 &math_datatype&
1730 &_&
1731 &precision &
1732 (aux_b_full, nblk, nblk_rows_cut, nblk_cols_cut, &
1733 i_block_loc_fine, j_block_loc_fine, 0, 0)
1734
1735 enddo ! j_block_loc_fine
1736 enddo ! i_block_loc_fine
1737 endif ! (mod(np_t_fine, np_rows) == my_prow)
1738
1739 do dnp_ab_t = 0, np_dirs_fine/np_dirs_t-1
1740 np_ab_t_fine = dnp_ab_t*np_dirs_t + my_pdir_t
1741
1742 do i_block_loc_fine = 0, nblk_mult_max/nblk-1
1743 i_block_loc = (np_ab_t_fine + i_block_loc_fine*np_rows_fine)/np_rows
1744
1745 do j_block_loc_fine = 0, nblk_mult/nblk-1
1746 j_block_loc = (np_ab_fine + j_block_loc_fine*np_cols_fine)/np_cols
1747
1748 nblk_rows_cut = min(nblk, l_rows - i_block_loc*nblk)
1749 nblk_cols_cut = min(nblk, l_cols - j_block_loc*nblk)
1750
1751 if (nblk_rows_cut>0 .and. nblk_cols_cut>0) then
1752 aux_a_full(1 +i_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max : &
1753 nblk_rows_cut+i_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max, &
1754 1+j_block_loc_fine*nblk : nblk_cols_cut+j_block_loc_fine*nblk) = &
1755 a(1+i_block_loc*nblk : nblk_rows_cut+i_block_loc*nblk, &
1756 1+j_block_loc*nblk : nblk_cols_cut+j_block_loc*nblk)
1757 endif
1758
1759 call set_zeros_in_unused_block_part_&
1760 &math_datatype&
1761 &_&
1762 &precision &
1763 (aux_a_full, nblk, nblk_rows_cut, nblk_cols_cut, &
1764 i_block_loc_fine, j_block_loc_fine, dnp_ab_t*nblk_mult_max, 0)
1765
1766 enddo ! j_block_loc_fine
1767 enddo ! i_block_loc_fine
1768 enddo ! dnp_ab_t
1769 endif ! b_transposed
1770 call obj%timer%stop("copy_and_set_zeros_aux_ab_full_tn")
1771 endif ! useGPU
1772
1773        ! aux_a_full/aux_b_full -> aux_ab_trans (aux_ab) ! auxiliary buffer for the matrix that is to be transposed: a/b in the TN/NT case
1774        ! aux_b_full/aux_a_full -> aux_ab_nontr (aux_ba) ! auxiliary buffer for the matrix that is not transposed: b/a in the TN/NT case
1775 !enddo ! dnp_ab -> later
1776
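          ! Communication pattern for this fine pass (see the calls below): the panel of the
          ! transposed operand (A for TN, B for NT) is broadcast from root np_t along the
          ! perpendicular communicator, the local GEMM is performed, and the partial results in
          ! tmp1_full are summed by a reduce onto root np of mpi_comm_dirs/ccl_comm_dirs.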
1777#ifdef WITH_MPI
1778 if (usegpu .and. .not. useccl) then
1779 if (my_pdir_t == np_t) then
1780 call obj%timer%start("gpu_memcpy")
1781 num = nblk_mult*nblk_mult
1782 if (a_transposed) then
1783#ifdef WITH_GPU_STREAMS
1784 call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_a_full_dev -> aux_a_full", &
1785 aux_a_full_dev, 0_c_intptr_t, aux_a_full(1:nblk_mult,1:nblk_mult),&
1786 1, 1, num*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
1787#else
1788 successgpu = gpu_memcpy(int(loc(aux_a_full),kind=c_intptr_t), aux_a_full_dev, &
1789 num*size_of_datatype, gpumemcpydevicetohost)
1790 check_memcpy_gpu("elpa_pxgemm_multiply: aux_a_full_dev -> aux_a_full", successgpu)
1791#endif
1792 else if (b_transposed) then
1793#ifdef WITH_GPU_STREAMS
1794 call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_b_full_dev -> aux_b_full", &
1795 aux_b_full_dev, 0_c_intptr_t, aux_b_full(1:nblk_mult,1:nblk_mult),&
1796 1, 1, num*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
1797#else
1798 successgpu = gpu_memcpy(int(loc(aux_b_full),kind=c_intptr_t), aux_b_full_dev, &
1799 nblk_mult*nblk_mult*size_of_datatype, gpumemcpydevicetohost)
1800 check_memcpy_gpu("elpa_pxgemm_multiply: aux_b_full_dev -> aux_b_full", successgpu)
1801#endif
1802 endif ! b_transposed
1803 call obj%timer%stop("gpu_memcpy")
1804 endif ! (my_pdir_t == np_t)
1805 endif ! (useGPU .and. .not. useCCL)
1806
1807 ! Broadcast processor column/row
1808 if (useccl) then
1809 call obj%timer%start("ccl_bcast")
1810 nvtx_range_push("ccl_bcast")
1811
1812 if (a_transposed) then
1813 successgpu = ccl_bcast(aux_a_full_dev, aux_a_full_dev, int(k_datatype*nblk_mult*nblk_mult,kind=c_size_t), &
1814 ccldatatype, int(np_t,kind=c_int), ccl_comm_cols, my_stream)
1815 else if (b_transposed) then
1816 successgpu = ccl_bcast(aux_b_full_dev, aux_b_full_dev, int(k_datatype*nblk_mult*nblk_mult,kind=c_size_t), &
1817 ccldatatype, int(np_t,kind=c_int), ccl_comm_rows, my_stream)
1818 endif
1819
1820 if (.not. successgpu) then
1821 print *,"Error in ccl_bcast"
1822 stop 1
1823 endif
1824
1825 successgpu = gpu_stream_synchronize(my_stream)
1826 check_stream_synchronize_gpu("elpa_pxgemm_multiply: ccl_bcast", successgpu)
1827
1828          nvtx_range_pop("ccl_bcast")
1829 call obj%timer%stop("ccl_bcast")
1830 else ! useCCL
1831 call obj%timer%start("mpi_bcast")
1832 nvtx_range_push("MPI_Bcast(aux_a/b_full)")
1833
1834 if (a_transposed) then
1835 call mpi_bcast(aux_a_full, int(nblk_mult*nblk_mult,kind=mpi_kind), mpi_math_datatype_precision, &
1836 int(np_t,kind=mpi_kind), int(mpi_comm_cols,kind=mpi_kind), mpierr)
1837 else if (b_transposed) then
1838 call mpi_bcast(aux_b_full, int(nblk_mult*nblk_mult,kind=mpi_kind), mpi_math_datatype_precision, &
1839 int(np_t,kind=mpi_kind), int(mpi_comm_rows,kind=mpi_kind), mpierr)
1840 endif
1841
1842 nvtx_range_pop("MPI_Bcast(aux_a/b_full)")
1843 call obj%timer%stop("mpi_bcast")
1844 endif ! useCCL
1845
1846 if (usegpu .and. .not. useccl) then
1847 call obj%timer%start("gpu_memcpy")
1848 num = nblk_mult*nblk_mult
1849 if (a_transposed) then
1850#ifdef WITH_GPU_STREAMS
1851 call gpu_memcpy_async_and_stream_synchronize("elpa_pxgemm_multiply: aux_a_full -> aux_a_full_dev", &
1852 aux_a_full_dev, 0_c_intptr_t, aux_a_full(1:nblk_mult,1:nblk_mult), &
1853 1, 1, num*size_of_datatype, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
1854#else
1855 successgpu = gpu_memcpy(aux_a_full_dev, int(loc(aux_a_full),kind=c_intptr_t), &
1856 nblk_mult*nblk_mult*size_of_datatype, gpumemcpyhosttodevice)
1857 check_memcpy_gpu("elpa_pxgemm_multiply: aux_a_full -> aux_a_full_dev", successgpu)
1858#endif
1859 else if (b_transposed) then
1860#ifdef WITH_GPU_STREAMS
1861 call gpu_memcpy_async_and_stream_synchronize ("elpa_pxgemm_multiply: aux_b_full -> aux_b_full_dev", &
1862 aux_b_full_dev, 0_c_intptr_t, aux_b_full(1:nblk_mult,1:nblk_mult), &
1863 1, 1, num*size_of_datatype, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
1864#else
1865 successgpu = gpu_memcpy(aux_b_full_dev, int(loc(aux_b_full),kind=c_intptr_t), &
1866 nblk_mult*nblk_mult*size_of_datatype, gpumemcpyhosttodevice)
1867 check_memcpy_gpu("elpa_pxgemm_multiply: aux_b_full -> aux_b_full_dev", successgpu)
1868#endif
1869 endif ! b_transposed
1870 call obj%timer%stop("gpu_memcpy")
1871 endif ! (useGPU .and. .not. useCCL)
1872#endif /* WITH_MPI */
1873
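          ! Local multiply: tmp1_full = op(aux_a_full) * op(aux_b_full), with op(.) set by
          ! trans_a/trans_b as passed by the caller; for TN the result tile is
          ! nblk_mult x nblk_mult_max*(np_dirs_fine/np_dirs_t), for NT it has the mirrored shape.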
1874 if (usegpu) then
1875 call obj%timer%start("gpublas")
1876 nvtx_range_push("gpublas")
1877 if (a_transposed) then
1878 call gpublas_precision_gemm(trans_a_cchar, trans_b_cchar, &
1879 nblk_mult, &
1880 nblk_mult_max*(np_dirs_fine/np_dirs_t), &
1881 nblk_mult, one, &
1882 aux_a_full_dev, nblk_mult, &
1883 aux_b_full_dev, nblk_mult, zero, &
1884 tmp1_full_dev , nblk_mult, gpuhandle)
1885 else if (b_transposed) then
1886 call gpublas_precision_gemm(trans_a_cchar, trans_b_cchar, &
1887 nblk_mult_max*(np_dirs_fine/np_dirs_t), &
1888 nblk_mult, &
1889 nblk_mult, one, &
1890 aux_a_full_dev, nblk_mult_max*(np_dirs_fine/np_dirs_t), &
1891 aux_b_full_dev, nblk_mult, zero, &
1892 tmp1_full_dev , nblk_mult_max*(np_dirs_fine/np_dirs_t), gpuhandle)
1893 endif ! b_transposed
1894 if (wantdebug) successgpu = gpu_devicesynchronize()
1895 nvtx_range_pop("gpublas")
1896 call obj%timer%stop("gpublas")
1897 else ! useGPU
1898 call obj%timer%start("blas")
1899 if (a_transposed) then
1900 call precision_gemm(trans_a, trans_b, &
1901 int(nblk_mult, kind=blas_kind), &
1902 int(nblk_mult_max*(np_dirs_fine/np_dirs_t), kind=blas_kind), &
1903 int(nblk_mult, kind=blas_kind), one, &
1904 aux_a_full(1:nblk_mult,1:nblk_mult), int(nblk_mult, kind=blas_kind), &
1905 aux_b_full(1:nblk_mult,1:nblk_mult_max*(np_dirs_fine/np_dirs_t)), &
1906 int(nblk_mult, kind=blas_kind), zero, &
1907 tmp1_full(1:nblk_mult,1:nblk_mult_max*(np_dirs_fine/np_dirs_t)), &
1908 int(nblk_mult, kind=blas_kind))
1909 else if (b_transposed) then
1910 call precision_gemm(trans_a, trans_b, &
1911 int(nblk_mult_max*(np_dirs_fine/np_dirs_t), kind=blas_kind), &
1912 int(nblk_mult, kind=blas_kind), &
1913 int(nblk_mult, kind=blas_kind), one, &
1914 aux_a_full(1:nblk_mult_max*(np_dirs_fine/np_dirs_t),1:nblk_mult),&
1915 int(nblk_mult_max*(np_dirs_fine/np_dirs_t), kind=blas_kind), &
1916 aux_b_full(1:nblk_mult,1:nblk_mult), int(nblk_mult, kind=blas_kind), zero, &
1917 tmp1_full(1:nblk_mult_max*(np_dirs_fine/np_dirs_t),1:nblk_mult), &
1918 int(nblk_mult_max*(np_dirs_fine/np_dirs_t), kind=blas_kind))
1919 endif ! b_transposed
1920 call obj%timer%stop("blas")
1921 endif ! useGPU
1922
1923#ifdef WITH_MPI
1924 if (a_transposed) then
1925 num_r = nblk_mult
1926 num_c = nblk_mult_max*(np_dirs_fine/np_dirs_t)
1927 else if (b_transposed) then
1928 num_r = nblk_mult_max*(np_dirs_fine/np_dirs_t)
1929 num_c = nblk_mult
1930 endif
1931
1932 num = nblk_mult*nblk_mult_max*(np_dirs_fine/np_dirs_t)
1933
1934 if (usegpu .and. .not. useccl) then
1935 call obj%timer%start("gpu_memcpy")
1936#ifdef WITH_GPU_STREAMS
1937 call gpu_memcpy_async_and_stream_synchronize &
1938 ("elpa_pxgemm_multiply: tmp1_full_dev -> tmp1_full", tmp1_full_dev, 0_c_intptr_t, &
1939 tmp1_full(1:num_r,1:num_c), &
1940 1, 1, num*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
1941#else
1942 successgpu = gpu_memcpy(int(loc(tmp1_full),kind=c_intptr_t), tmp1_full_dev, &
1943 num*size_of_datatype, gpumemcpydevicetohost)
1944 check_memcpy_gpu("elpa_pxgemm_multiply: tmp1_full_dev -> tmp1_full", successgpu)
1945#endif
1946 call obj%timer%stop("gpu_memcpy")
1947 endif ! (useGPU .and. .not. useCCL)
1948
1949 if (useccl) then
1950 call obj%timer%start("ccl_reduce")
1951 nvtx_range_push("ccl_reduce tmp1_full_dev")
1952
1953 successgpu = ccl_reduce(tmp1_full_dev, tmp1_full_dev, int(k_datatype*num, kind=c_size_t), &
1954 ccldatatype, cclsum, int(np, kind=c_int), ccl_comm_dirs, my_stream)
1955
1956 if (.not. successgpu) then
1957 print *,"Error in ccl_reduce"
1958 stop 1
1959 endif
1960
1961 successgpu = gpu_stream_synchronize(my_stream)
1962            check_stream_synchronize_gpu("elpa_pxgemm_multiply: ccl_reduce", successgpu)
1963
1964            nvtx_range_pop("ccl_reduce tmp1_full_dev")
1965 call obj%timer%stop("ccl_reduce")
1966 else ! useCCL
1967 call obj%timer%start("mpi_reduce")
1968 if (my_pdir==np) then
1969 call mpi_reduce(mpi_in_place, tmp1_full, int(num, kind=mpi_kind), &
1970 mpi_math_datatype_precision, mpi_sum, int(np,kind=mpi_kind), int(mpi_comm_dirs,kind=mpi_kind), mpierr)
1971 else
1972 call mpi_reduce(tmp1_full , tmp1_full, int(num, kind=mpi_kind), &
1973 mpi_math_datatype_precision, mpi_sum, int(np,kind=mpi_kind), int(mpi_comm_dirs,kind=mpi_kind), mpierr)
1974 endif
1975 call obj%timer%stop("mpi_reduce")
1976 endif ! useCCL
1977
1978 if (usegpu .and. .not. useccl) then
1979 call obj%timer%start("gpu_memcpy")
1980#ifdef WITH_GPU_STREAMS
1981 call gpu_memcpy_async_and_stream_synchronize &
1982                 ("elpa_pxgemm_multiply: tmp1_full -> tmp1_full_dev", tmp1_full_dev, 0_c_intptr_t, &
1983 tmp1_full(1:num_r,1:num_c), &
1984 1, 1, num*size_of_datatype, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
1985#else
1986 successgpu = gpu_memcpy(tmp1_full_dev, int(loc(tmp1_full),kind=c_intptr_t), &
1987 num*size_of_datatype, gpumemcpyhosttodevice)
1988 check_memcpy_gpu("elpa_pxgemm_multiply: tmp1_full -> tmp1_full_dev", successgpu)
1989#endif
1990 call obj%timer%stop("gpu_memcpy")
1991 endif ! (useGPU .and. .not. useCCL)
1992#endif /* WITH_MPI */
1993
1994
1995 if (my_pdir==np) then
1996 if (usegpu) then
1997 call obj%timer%start("gpu_update_c_tn_nt")
1998 beta_int = 1
1999 dnp_ab = np_ab_fine/np_dirs
2000 if (dnp_ab == 0) beta_int = 0
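                ! beta_int mirrors first_call in the CPU branch below: on the first fine pass handled
                ! by this process (dnp_ab == 0) the C blocks are initialized, on later passes the
                ! kernel accumulates into them.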
2001                ! Peter: within ELPA, ldc = l_rows (i.e. c_dev and tmp1_full_dev have the same leading dimension),
2002                ! but in general ldc may differ from l_rows, in which case an extra parameter in the
2003                ! kernel would be needed.
2004 call gpu_update_c_tn_nt(precision_char, a_transposed_int, &
2005 c_dev, tmp1_full_dev, beta_int, &
2006 l_rows, l_cols, nblk_mult_max, nblk_mult, nblk, &
2007 np_rows, np_cols, np_dirs_fine, &
2008 np_dirs_t, my_pdir_t, np_fine, &
2009 sm_count, debug, my_stream)
2010 call obj%timer%stop("gpu_update_c_tn_nt")
2011 else
2012 ! Put the result into C
2013 call obj%timer%start("update_c_tn_nt")
2014 first_call = .false.
2015 dnp_ab = np_ab_fine/np_dirs
2016 if (dnp_ab == 0) first_call = .true.
2017
2018 do dnp_ab_t = 0, np_dirs_fine/np_dirs_t-1
2019 np_ab_t_fine = dnp_ab_t*np_dirs_t + my_pdir_t
2020
2021 if (a_transposed) then
2022 do j_block_loc_fine = 0, nblk_mult_max/nblk-1
2023 j_block_loc = (np_ab_t_fine + j_block_loc_fine*np_cols_fine)/np_cols
2024
2025 do i_block_loc_fine = 0, nblk_mult/nblk-1
2026 i_block_loc = (np_fine + i_block_loc_fine*np_rows_fine)/np_rows
2027
2028 nblk_rows_cut = min(nblk, l_rows - i_block_loc*nblk)
2029 nblk_cols_cut = min(nblk, l_cols - j_block_loc*nblk)
2030
2031 if (nblk_rows_cut>0 .and. nblk_cols_cut>0) then
2032 if (first_call) then
2033 c(1+i_block_loc*nblk:nblk_rows_cut+i_block_loc*nblk, &
2034 1+j_block_loc*nblk:nblk_cols_cut+j_block_loc*nblk) = &
2035 tmp1_full(1+i_block_loc_fine*nblk :nblk_rows_cut+i_block_loc_fine*nblk, &
2036 1 +j_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max : &
2037 nblk_cols_cut+j_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max)
2038 else
2039 c(1+i_block_loc*nblk:nblk_rows_cut+i_block_loc*nblk, &
2040 1+j_block_loc*nblk:nblk_cols_cut+j_block_loc*nblk) = &
2041 c(1+i_block_loc*nblk:nblk_rows_cut+i_block_loc*nblk, &
2042 1+j_block_loc*nblk:nblk_cols_cut+j_block_loc*nblk) + &
2043 tmp1_full(1+i_block_loc_fine*nblk :nblk_rows_cut+i_block_loc_fine*nblk, &
2044 1 +j_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max : &
2045 nblk_cols_cut+j_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max)
2046 endif
2047 endif
2048 enddo ! i_block_loc_fine
2049 enddo ! j_block_loc_fine
2050 else if (b_transposed) then
2051 do i_block_loc_fine = 0, nblk_mult_max/nblk-1
2052 i_block_loc = (np_ab_t_fine + i_block_loc_fine*np_rows_fine)/np_rows
2053
2054 do j_block_loc_fine = 0, nblk_mult/nblk-1
2055 j_block_loc = (np_fine + j_block_loc_fine*np_cols_fine)/np_cols
2056
2057 nblk_rows_cut = min(nblk, l_rows - i_block_loc*nblk)
2058 nblk_cols_cut = min(nblk, l_cols - j_block_loc*nblk)
2059
2060 if (nblk_rows_cut>0 .and. nblk_cols_cut>0) then
2061 if (first_call) then
2062 c(1+i_block_loc*nblk : nblk_rows_cut+i_block_loc*nblk, &
2063 1+j_block_loc*nblk : nblk_cols_cut+j_block_loc*nblk) = &
2064 tmp1_full(1 +i_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max : &
2065 nblk_rows_cut+i_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max, &
2066 1+j_block_loc_fine*nblk : nblk_cols_cut+j_block_loc_fine*nblk)
2067 else
2068 c(1+i_block_loc*nblk : nblk_rows_cut+i_block_loc*nblk, &
2069 1+j_block_loc*nblk : nblk_cols_cut+j_block_loc*nblk) = &
2070 c(1+i_block_loc*nblk : nblk_rows_cut+i_block_loc*nblk, &
2071 1+j_block_loc*nblk : nblk_cols_cut+j_block_loc*nblk) + &
2072 tmp1_full(1 +i_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max : &
2073 nblk_rows_cut+i_block_loc_fine*nblk+dnp_ab_t*nblk_mult_max, &
2074 1+j_block_loc_fine*nblk : nblk_cols_cut+j_block_loc_fine*nblk)
2075 endif
2076 endif
2077 enddo ! j_block_loc_fine
2078 enddo ! i_block_loc_fine
2079 endif
2080 enddo ! dnp_ab_t = 0, np_dirs_fine/np_dirs_t-1
2081 call obj%timer%stop("update_c_tn_nt")
2082 endif ! useGPU
2083 endif ! (my_pdir==np)
2084
2085
2086 enddo ! do np_ab_fine = 0, np_dirs_fine-1
2087
2088      nvtx_range_pop("do np_fine")
2089    enddo ! np_fine = 0, np_dirs_fine-1
2090 call obj%timer%stop("main_loop_tn_nt")
2091
2092#if !defined(DEVICE_POINTER)
2093 if (usegpu) then
2094 call obj%timer%start("gpu_memcpy")
2095 nvtx_range_push("gpu_memcpy: c_dev->c")
2096#ifdef WITH_GPU_STREAMS
2097 call gpu_memcpy_async_and_stream_synchronize &
2098 ("elpa_pxgemm_multiply: c_dev -> c", c_dev, 0_c_intptr_t, c(1:ldc,1:ldccols), &
2099 1, 1, ldc*ldccols*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
2100#else
2101 successgpu = gpu_memcpy(int(loc(c),kind=c_intptr_t), c_dev, &
2102 ldc*ldccols*size_of_datatype, gpumemcpydevicetohost)
2103 check_memcpy_gpu("elpa_pxgemm_multiply: c_dev -> c", successgpu)
2104#endif
2105 nvtx_range_pop("gpu_memcpy: c_dev->c")
2106 call obj%timer%stop("gpu_memcpy")
2107 endif ! useGPU
2108#endif /* DEVICE_POINTER */
2109
2110 deallocate(tmp1_full, stat=istat, errmsg=errormessage)
2111 call check_alloc("elpa_pxgemm_multiply", "tmp1_full", istat, errormessage)
2112
2113 if (usegpu) then
2114 successgpu = gpu_free(tmp1_full_dev)
2115 check_dealloc_gpu("elpa_pxgemm_multiply: tmp1_full_dev", successgpu)
2116 endif
2117    endif ! (TN or NT: exactly one of a_transposed, b_transposed)
2118
2119! ___________________________________________________________________
2120
2121 endif ! (.not. isSquareGrid)
2122
2123!______________________________________________________________________________________________
2124
2125 deallocate(aux_a_full, stat=istat, errmsg=errormessage)
2126 call check_alloc("elpa_pxgemm_multiply", "aux_a_full", istat, errormessage)
2127
2128 deallocate(aux_b_full, stat=istat, errmsg=errormessage)
2129 call check_alloc("elpa_pxgemm_multiply", "aux_b_full", istat, errormessage)
2130
2131 if (usegpu) then
2132 successgpu = gpu_free(aux_a_full_dev)
2133 check_dealloc_gpu("elpa_pxgemm_multiply: aux_a_full_dev", successgpu)
2134
2135 successgpu = gpu_free(aux_b_full_dev)
2136 check_dealloc_gpu("elpa_pxgemm_multiply: aux_b_full_dev", successgpu)
2137 endif
2138
2139!______________________________________________________________________________________________
2140
2141 if (usegpu) then
2142#if !defined(DEVICE_POINTER)
2143 successgpu = gpu_free(a_dev)
2144 check_dealloc_gpu("elpa_pxgemm_multiply: a_dev", successgpu)
2145
2146 successgpu = gpu_free(b_dev)
2147 check_dealloc_gpu("elpa_pxgemm_multiply: b_dev", successgpu)
2148
2149 successgpu = gpu_free(c_dev)
2150 check_dealloc_gpu("elpa_pxgemm_multiply: c_dev", successgpu)
2151#endif /* DEVICE_POINTER */
2152 endif ! useGPU
2153
2154#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION)
2155 if (usegpu) then
2156 successgpu = gpu_get_last_error()
2157 if (.not. successgpu) then
2158        write(error_unit,*) "elpa_pxgemm_multiply: GPU error detected via gpu_get_last_error(). Aborting..."
2159 write(error_unit,*) "Rerun the program with the debug option e.g. 'export ELPA_DEFAULT_debug=1'"
2160 stop 1
2161 endif
2162 endif
2163#endif
2164
2165 call obj%timer%stop("elpa_pxgemm_multiply"//trim(tnstring)//trim(gpustring))
2166
2167 nvtx_range_pop("elpa_pxgemm_multiply")