Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2024.05.001.rc1
elpa_multiply_a_b_template.F90
1!
2! The ELPA library was originally created by the ELPA consortium,
3! consisting of the following organizations:
4!
5! - Max Planck Computing and Data Facility (MPCDF), formerly known as
6! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
7! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
8! Informatik,
9! - Technische Universität München, Lehrstuhl für Informatik mit
10! Schwerpunkt Wissenschaftliches Rechnen,
11! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
12! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
13! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
14! and
15! - IBM Deutschland GmbH
16!
17! This particular source code file contains additions, changes and
18! enhancements authored by Intel Corporation, which is not part of
19! the ELPA consortium.
20!
21! More information can be found here:
22! http://elpa.mpcdf.mpg.de/
23!
24! ELPA is free software: you can redistribute it and/or modify
25! it under the terms of the version 3 of the license of the
26! GNU Lesser General Public License as published by the Free
27! Software Foundation.
28!
29! ELPA is distributed in the hope that it will be useful,
30! but WITHOUT ANY WARRANTY; without even the implied warranty of
31! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
32! GNU Lesser General Public License for more details.
33!
34! You should have received a copy of the GNU Lesser General Public License
35! along with ELPA. If not, see <http://www.gnu.org/licenses/>
36!
37! ELPA reflects a substantial effort on the part of the original
38! ELPA consortium, and we ask you to respect the spirit of the
39! license that we chose: i.e., please contribute any changes you
40! may have back to the original ELPA library distribution, and keep
41! any derivatives of ELPA under the same license that we chose for
42! the original distribution, the GNU Lesser General Public License.
43!
44!
45! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
46!
47! Copyright of the original code rests with the authors inside the ELPA
48! consortium. The copyright of any additional modifications shall rest
49! with their original authors, but shall adhere to the licensing terms
50! distributed along with the original code in the file "COPYING".
51!
52! Author: A. Marek, MPCDF
53
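! Usage sketch (illustrative, not part of the original source): this template
! is expanded once per math datatype and precision into the routine behind
! ELPA's public hermitian_multiply interface. Through the documented Fortran
! API, a call looks roughly like the following, assuming an already
! configured elpa_t handle "e" and block-cyclically distributed matrices
! a, b and c:
!
!   call e%hermitian_multiply(uplo_a, uplo_c, ncb, a, b, nrows_b, ncols_b, &
!                             c, nrows_c, ncols_c, error)
!
! Here uplo_a/uplo_c are 'U', 'L', or any other character for a full matrix,
! and ncb is the number of columns of b and c.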
54#include "../general/sanity.F90"
55#include "../general/error_checking.inc"
56
57#undef USE_CCL_MULTIPLY
58#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
59#define USE_CCL_MULTIPLY
60#endif
61
62
63
64#ifdef USE_CCL_MULTIPLY
65#define MORE_GPU
66#endif
67#ifndef USE_CCL_MULTIPLY
68#define MORE_GPU
69#endif
70#ifdef DEVICE_POINTER
71#define MORE_GPU
72#endif
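! Note: taken together, the three conditionals above define MORE_GPU in every
! configuration -- it is set both with and without USE_CCL_MULTIPLY (and
! again for DEVICE_POINTER) -- so the #ifdef MORE_GPU paths below are
! always compiled in.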
73
74 use elpa1_compute
75 use elpa_mpi
76 use precision
78 use, intrinsic :: iso_c_binding
79 use elpa_gpu
80 use mod_check_for_gpu
81 use elpa_blas_interfaces
82 use elpa_utilities, only : local_index, check_deallocate_f, check_dealloc_gpu_f, &
83 check_host_dealloc_gpu_f, check_alloc_gpu_f, check_host_alloc_gpu_f, &
84 check_host_unregister_gpu_f, check_memcpy_gpu_f, check_allocate_f, &
85 check_host_register_gpu_f, check_alloc, error_unit
86 use mod_query_gpu_usage
87#ifdef WITH_GPU_STREAMS
88 use elpa_gpu_util
89#endif
90#ifdef WITH_NVIDIA_GPU_VERSION
91 use cuda_functions ! for NVTX labels
92#endif
93#if defined(USE_CCL_MULTIPLY)
94 use elpa_ccl_gpu
95#endif
96 use multiply_a_b_gpu
97 implicit none
98
99#include "../../src/general/precision_kinds.F90"
100 class(elpa_abstract_impl_t), intent(inout) :: obj
101
102 character(len=1) :: uplo_a, uplo_c, trans_a
103
104 integer(kind=ik), intent(in) :: ldb, ldbCols, ldc, ldcCols
105 integer(kind=ik) :: na, ncb
106#ifndef DEVICE_POINTER
107! forget the device-pointer case for the moment; implement later
108#ifdef USE_ASSUMED_SIZE
109 math_datatype(kind=rck) :: a(obj%local_nrows,*), b(ldb,*), c(ldc,*)
110#else
111 math_datatype(kind=rck) :: a(obj%local_nrows,obj%local_ncols), b(ldb,ldbcols), c(ldc,ldccols)
112#endif
113#else /* DEVICE_POINTER */
114 ! dummy variables
115 math_datatype(kind=rck), allocatable :: a(:,:), b(:,:), c(:,:)
116 type(c_ptr) :: aDev, bDev, cDev
117#endif /* DEVICE_POINTER */
118
119 integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, myid
120 integer(kind=MPI_KIND) :: my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
121 integer(kind=MPI_KIND) :: mpierr, myidMPI
122 integer(kind=ik) :: l_cols, l_rows, l_rows_np
123 integer(kind=ik) :: np, n, nb, nblk_mult, lrs, lre, lcs, lce
124 integer(kind=ik) :: gcol_min, gcol, goff
125 integer(kind=ik) :: nstor, nr_done, noff, np_bc, n_aux_bc, nvals
126 integer(kind=ik) :: np_br, noff_n ! PETERDEBUG
127 integer(kind=ik), allocatable :: lrs_save(:), lre_save(:)
128
129 logical :: a_lower, a_upper, c_lower, c_upper
130 math_datatype(kind=rck), pointer, contiguous :: aux_mat(:,:), tmp1(:,:)
131 math_datatype(kind=rck), allocatable :: aux_bc(:), tmp2(:,:)
132 integer(kind=ik) :: istat
133 character(200) :: errorMessage
134 character(20) :: gpuString
135 logical :: success
136 logical :: successGPU
137 logical :: useGPU
138 integer(kind=c_int) :: numGPU, blocking
139 integer(kind=ik) :: mpi_comm_rows, mpi_comm_cols, mpi_comm_all
140 integer(kind=ik) :: nblk, matrixRows, matrixCols, error
141 integer(kind=c_intptr_t) :: aux_mat_dev, tmp1_dev
142!#ifndef DEVICE_POINTER
143 integer(kind=c_intptr_t) :: b_dev
144#ifdef MORE_GPU
145 integer(kind=c_intptr_t) :: a_dev
146#endif
147#ifdef MORE_GPU
148 integer(kind=c_intptr_t) :: c_dev
149#endif
150!#endif
151#ifdef MORE_GPU
152 integer(kind=c_intptr_t) :: tmp2_dev, aux_bc_dev
153#endif
154 type(c_ptr) :: aux_host, tmp1_host
155 integer(kind=c_intptr_t) :: num
156 integer(kind=c_intptr_t) :: aux_off, b_off
157 integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
158 &precision&
159 &_&
160 &math_datatype
161
162 integer(kind=c_intptr_t) :: gpuHandle, my_stream
163 integer(kind=c_int) :: gpu_hermitian_multiply
164
165#if defined(USE_CCL_MULTIPLY)
166 integer(kind=c_intptr_t) :: ccl_comm_rows, ccl_comm_cols
167#endif
168#ifdef DEVICE_POINTER
169 math_datatype(kind=rck), allocatable :: a_tmp(:,:), c_tmp(:,:)
170#endif
171 integer(kind=c_intptr_t) :: aux_dev
172 integer(kind=c_int) :: gpu
173 integer(kind=c_int) :: gpu_multiply_a_b
174 success = .true.
175 usegpu = .false.
176
177 trans_a='N' ! PETERDEBUG: propagate it up, accept as an option
178
209
210 ! assumption: when DEVICE_POINTER is defined -> MORE_GPU
211 ! when DEVICE_POINTER is defined -> useGPU = .true.
212
213#if !defined(DEVICE_POINTER)
214
215#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
216 if (.not.(query_gpu_usage(obj, "ELPA_MULTIPLY_AB", usegpu))) then
217 print *,"ELPA_MULTIPLY_AB: Problem querying settings for GPU. Aborting..."
218 stop 1
219 endif
220#endif
221
222 ! check whether the above setting should be overridden
223 if (obj%is_set("gpu_hermitian_multiply") == 1) then
224 call obj%get("gpu_hermitian_multiply", gpu_hermitian_multiply, error)
225 if (error .ne. elpa_ok) then
226 print *,"Problem getting option for gpu_hermitian_multiply. Aborting..."
227 stop 1
228 endif
229 if (usegpu .and. gpu_hermitian_multiply .eq. 0) then
230 usegpu = .false.
231 else if (.not.(usegpu) .and. gpu_hermitian_multiply .eq. 1) then
232 usegpu = .true.
233 else
234 endif
235 else
236 ! no override by user
237 ! keep setting as found before
238 endif
239
240#else /* DEVICE_POINTER */
241 usegpu = .true.
242
243#ifdef MORE_GPU
244 c_dev = transfer(cdev, c_dev)
245 a_dev = transfer(adev, a_dev)
246#else /* MORE_GPU */
247 allocate(a_tmp(obj%local_nrows,obj%local_ncols), stat=istat, errmsg=errormessage)
248 check_allocate("elpa_mult_at_b: a_tmp", istat, errormessage)
249
250 num = obj%local_nrows*obj%local_ncols*size_of_datatype
251#ifdef WITH_GPU_STREAMS
252 successgpu = gpu_host_register(int(loc(a_tmp),kind=c_intptr_t), &
253 obj%local_nrows*obj%local_ncols * size_of_datatype,&
254 gpuhostregisterdefault)
255 check_host_register_gpu("elpa_mult_at_b: a_tmp", successgpu)
256
257 my_stream = obj%gpu_setup%my_stream
258 successgpu = gpu_stream_synchronize(my_stream)
259 check_stream_synchronize_gpu("elpa_mult_at_b: a_dev to a_tmp", successgpu)
260
261 successgpu = gpu_memcpy_async(int(loc(a_tmp),kind=c_intptr_t), adev, num,&
262 gpumemcpydevicetohost, my_stream)
263 check_memcpy_gpu("elpa_mult_at_b: a_dev -> a_tmp", successgpu)
264
265 my_stream = obj%gpu_setup%my_stream
266 successgpu = gpu_stream_synchronize(my_stream)
267 check_stream_synchronize_gpu("elpa_mult_at_b: a_dev -> a_tmp", successgpu)
268 ! synchronize streamsPerThread; maybe not necessary
269 successgpu = gpu_stream_synchronize()
270 check_stream_synchronize_gpu("elpa_mult_at_b: a_dev -> a_tmp", successgpu)
271#else
272 successgpu = gpu_memcpy(int(loc(a_tmp),kind=c_intptr_t), adev, num,&
273 gpumemcpydevicetohost)
274 check_memcpy_gpu("elpa_mult_at_b: a_dev -> a_tmp", successgpu)
275#endif
276
277 allocate(c_tmp(ldc,ldccols), stat=istat, errmsg=errormessage)
278 check_allocate("elpa_mult_at_b: c_tmp", istat, errormessage)
279
280#ifdef WITH_GPU_STREAMS
281 successgpu = gpu_host_register(int(loc(c_tmp),kind=c_intptr_t),&
282 ldc*ldccols*size_of_datatype, &
283 gpuhostregisterdefault)
284#endif
285
286#endif /* MORE_GPU */
287
288
289 b_dev = transfer(bdev, b_dev)
290
291#endif /* DEVICE_POINTER */
292
293 if(usegpu) then
294 gpustring = "_gpu"
295 else
296 gpustring = ""
297 endif
298
299 call obj%timer%start("elpa_mult_at_b_&
300 &MATH_DATATYPE&
301 &_&
302 &PRECISION&
303 &"//gpustring)
304
305 na = obj%na
306 nblk = obj%nblk
307 matrixrows = obj%local_nrows
308 matrixcols = obj%local_ncols
309
310 mpi_comm_all = obj%mpi_setup%mpi_comm_parent
311 mpi_comm_cols = obj%mpi_setup%mpi_comm_cols
312 mpi_comm_rows = obj%mpi_setup%mpi_comm_rows
313
314 myid = obj%mpi_setup%myRank_comm_parent
315 my_prow = obj%mpi_setup%myRank_comm_rows
316 my_pcol = obj%mpi_setup%myRank_comm_cols
317
318 np_rows = obj%mpi_setup%nRanks_comm_rows
319 np_cols = obj%mpi_setup%nRanks_comm_cols
320
321 l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and b
322 l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1) ! Local cols of b
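! Illustration of the block-cyclic counting done by local_index (numbers are
! made up, not from this file): with na = 100, nblk = 8 and np_rows = 4 there
! are 12 full blocks of 8 rows plus one partial block of 4 rows; block b is
! stored on process row mod(b, 4), so process row 0 holds blocks 0, 4, 8 and
! the partial block 12, i.e. l_rows = 8+8+8+4 = 28, while process rows 1..3
! hold three full blocks each, i.e. l_rows = 24 (and 28 + 3*24 = 100).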
323
324 ! Block factor for matrix multiplications, must be a multiple of nblk
325
326 if (obj%is_set("blocking_in_multiply") == 1) then
327 call obj%get("blocking_in_multiply", blocking, error)
328 if (error .ne. elpa_ok) then
329 write(error_unit,*) "ELPA_HERMITIAN_MULTIPLY: Problem in getting keyword 'blocking_in_multiply'. Aborting..."
330 stop 1
331 endif
332 nblk_mult = (blocking/nblk+1) * nblk
333 else ! is_set
334 if (usegpu) then
335 if (na/np_rows <= 256) then
336 nblk_mult = (63/nblk+1)*nblk
337 else
338 nblk_mult = (351/nblk+1)*nblk
339 endif
340 else ! useGPU
341 if (na/np_rows <= 256) then
342 nblk_mult = (31/nblk+1)*nblk
343 else
344 nblk_mult = (63/nblk+1)*nblk
345 endif
346 endif ! useGPU
347 endif ! is_set
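! Worked example of the blocking arithmetic above (illustrative numbers):
! (x/nblk+1)*nblk is the smallest multiple of nblk strictly greater than x
! (integer division). With nblk = 32 and a user setting blocking = 100:
!   nblk_mult = (100/32 + 1) * 32 = (3 + 1) * 32 = 128
! and with the GPU default for large matrices (na/np_rows > 256):
!   nblk_mult = (351/32 + 1) * 32 = (10 + 1) * 32 = 352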
348
349 if (usegpu) then
350 call obj%timer%start("check_for_gpu")
351 if (check_for_gpu(obj, myid, numgpu)) then
352 ! set the necessary parameters
353 call set_gpu_parameters()
354 else
355 print *,"GPUs are requested but not detected! Aborting..."
356 success = .false.
357 return
358 endif
359 call obj%timer%stop("check_for_gpu")
360
361#ifdef MORE_GPU
362#if !defined(DEVICE_POINTER)
363 num = ldc*ldccols*size_of_datatype
364 successgpu = gpu_malloc(c_dev, num)
365 check_alloc_gpu("elpa_mult_at_b: c_dev", successgpu)
366 ! no copy from c to c_dev needed since c will be overwritten anyway
367#endif
368#endif /* MORE_GPU */
369
370#if !defined(DEVICE_POINTER)
371 ! copy b to b_dev
372 num = ldb*ldbcols*size_of_datatype
373 successgpu = gpu_malloc(b_dev, num)
374 check_alloc_gpu("elpa_mult_at_b: b_dev", successgpu)
375
376#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
377 successgpu = gpu_host_register(int(loc(b),kind=c_intptr_t),num,&
378 gpuhostregisterdefault)
379#endif
380
381 check_host_register_gpu("elpa_mult_at_b: b", successgpu)
382#ifdef WITH_GPU_STREAMS
383 my_stream = obj%gpu_setup%my_stream
384 call gpu_memcpy_async_and_stream_synchronize &
385 ("elpa_mult_at_b: b to b_dev", b_dev, 0_c_intptr_t, &
386 b(1:ldb,1:ldbcols), &
387 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
388#else
389 successgpu = gpu_memcpy(b_dev,int(loc(b),kind=c_intptr_t),num,&
390 gpumemcpyhosttodevice)
391 check_memcpy_gpu("elpa_mult_at_b: b to b_dev", successgpu)
392#endif
393
394#else /* DEVICE_POINTER */
395
396#endif /* DEVICE_POINTER */
397
398 num = l_rows*nblk_mult*size_of_datatype
399#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
400 successgpu = gpu_malloc_host(aux_host, num)
401 check_host_alloc_gpu("elpa_mult_at_b: aux_host", successgpu)
402 call c_f_pointer(aux_host, aux_mat, (/l_rows,nblk_mult/))
403#else
404 allocate(aux_mat(l_rows, nblk_mult), stat=istat, errmsg=errormessage)
405 check_allocate("elpa_mult_at_b: aux_mat", istat, errormessage)
406#endif
407
408 successgpu = gpu_malloc(aux_mat_dev, num)
409 check_alloc_gpu("elpa_mult_at_b: aux_mat_dev", successgpu)
410
411 num = nblk_mult*l_cols*size_of_datatype
412 successgpu = gpu_malloc(tmp1_dev, num)
413 check_alloc_gpu("elpa_mult_at_b: tmp1_dev", successgpu)
414
415!#ifdef MORE_GPU
416! allocate(tmp2(nblk_mult,l_cols), stat=istat, errmsg=errorMessage)
417! check_allocate("elpa_mult_at_b: tmp2", istat, errorMessage)
418!#endif
419
420#ifdef MORE_GPU
421 num = nblk_mult*l_cols*size_of_datatype
422 successgpu = gpu_malloc(tmp2_dev, num)
423 check_alloc_gpu("elpa_mult_at_b: tmp2_dev", successgpu)
424#endif
425 else ! useGPU
426 allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errormessage)
427 check_allocate("elpa_mult_at_b: aux_mat", istat, errormessage)
428 endif ! useGPU
429
430 allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errormessage)
431 check_allocate("elpa_mult_at_b: aux_bc", istat, errormessage)
432
433 allocate(lrs_save(nblk), stat=istat, errmsg=errormessage)
434 check_allocate("elpa_mult_at_b: lrs_save", istat, errormessage)
435
436 allocate(lre_save(nblk), stat=istat, errmsg=errormessage)
437 check_allocate("elpa_mult_at_b: lre_save", istat, errormessage)
438
439 a_lower = .false.
440 a_upper = .false.
441 c_lower = .false.
442 c_upper = .false.
443
444 if (uplo_a=='u' .or. uplo_a=='U') a_upper = .true.
445 if (uplo_a=='l' .or. uplo_a=='L') a_lower = .true.
446 if (uplo_c=='u' .or. uplo_c=='U') c_upper = .true.
447 if (uplo_c=='l' .or. uplo_c=='L') c_lower = .true.
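! With these flags only the indicated triangle of a is referenced (lrs/lre
! are restricted per column in the gather loop below) and only the indicated
! part of c is computed (lcs/lce are restricted per block row). 'U' and 'L'
! follow the usual BLAS convention; any other character selects the full
! matrix.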
448
449#ifdef MORE_GPU
450 if (usegpu) then
451
452#if !defined(DEVICE_POINTER)
453 num = obj%local_nrows*obj%local_ncols*size_of_datatype
454 successgpu = gpu_malloc(a_dev, num)
455 check_alloc_gpu("elpa_mult_at_b: a_dev", successgpu)
456#endif
457
458 num = l_rows*nblk*size_of_datatype
459 successgpu = gpu_malloc(aux_bc_dev, num)
460 check_alloc_gpu("elpa_mult_at_b: aux_bc_dev", successgpu)
461
462 num = obj%local_nrows*obj%local_ncols*size_of_datatype
463#if !defined(DEVICE_POINTER)
464
465#ifdef WITH_GPU_STREAMS
466 my_stream = obj%gpu_setup%my_stream
467 call gpu_memcpy_async_and_stream_synchronize &
468 ("elpa_mult_at_b: a to a_dev", a_dev, 0_c_intptr_t, &
469 a(1:obj%local_nrows,1:obj%local_ncols), &
470 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
471#else
472 successgpu = gpu_memcpy(a_dev, int(loc(a),kind=c_intptr_t), &
473 num, gpumemcpyhosttodevice)
474 check_memcpy_gpu("elpa_mult_at_b: a to a_dev", successgpu)
475#endif
476#endif /* DEVICE_POINTER */
477 endif !useGPU
478#endif /* MORE_GPU */
479
480 ! main loop: build up the result matrix by processor rows
481 do np = 0, np_rows-1
482
483#ifdef WITH_NVTX
484 call nvtxrangepush("do np = 0, np_rows-1")
485#endif
486
487 ! In this round, the processes in row np assemble the result
488
489 l_rows_np = local_index(na, np, np_rows, nblk, -1) ! local rows on receiving processors
490
491 nr_done = 0 ! Number of rows done
492 nstor = 0 ! Number of columns stored in aux_mat
493
494 aux_mat = 0
495 if (usegpu) then
496 num = l_rows*nblk_mult*size_of_datatype
497#ifdef WITH_GPU_STREAMS
498 my_stream = obj%gpu_setup%my_stream
499 successgpu = gpu_memset_async(aux_mat_dev, 0, num, my_stream)
500 check_memcpy_gpu("hermitian_multiply: aux_mat_dev", successgpu)
501#else
502 successgpu = gpu_memset(aux_mat_dev, 0, num)
503 check_memcpy_gpu("hermitian_multiply: aux_mat_dev", successgpu)
504#endif
505 endif ! useGPU
506
507 ! Loop over the blocks on row np
508 do nb = 0, (l_rows_np-1)/nblk
509
510#ifdef WITH_NVTX
511 call nvtxrangepush("do nb = 0, (l_rows_np-1)/nblk")
512#endif
513
514 goff = nb*np_rows + np ! Global offset in blocks corresponding to nb
515 ! rename: goff -> block_offset_gl
516
517 ! Get the processor row (if trans_a='N') or column (if trans_a='T' or 'H') which owns this block
518 ! and the offset in blocks within this row/column.
519 ! The corresponding block row/column in A is then broadcast to all for multiplication with B
520
521 np_br = mod(goff,np_rows) ! np that possesses the given block row; trans_a='N'; "br"=block row ! PETERDEBUG
522 np_bc = mod(goff,np_cols) ! np that possesses the given block column; trans_a='T'; "bc"=block column; rename: np_bc -> np_col_b
523
524 noff = goff/np_cols ! noff -> noff_t
525 noff_n = goff/np_rows ! PETERDEBUG
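! Worked example of the ownership arithmetic above (illustrative numbers):
! with np_rows = 2, np_cols = 3 and nblk = 4, the block handled in iteration
! np = 1, nb = 2 is goff = nb*np_rows + np = 5. Its block column is owned by
! process column np_bc = mod(5,3) = 2, where it is local block noff = 5/3 = 1,
! i.e. local columns noff*nblk+1 .. noff*nblk+nblk = 5..8; the corresponding
! global columns are gcol = goff*nblk + n = 21..24 for n = 1..4.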
526
527 ! Gather the complete block column of A on the owner into the contiguous aux_bc array
528 n_aux_bc = 0
529 do n = 1, min(l_rows_np-nb*nblk, nblk) ! Loop over local rows/columns to be broadcast
530
531 gcol = goff*nblk + n ! global column corresponding to n
532
533 if (nstor==0 .and. n==1) gcol_min = gcol
534
535 lrs = 1 ! 1st local row number for broadcast
536 lre = l_rows ! last local row number for broadcast
537 if (a_lower) lrs = local_index(gcol, my_prow, np_rows, nblk, +1)
538 if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1)
539
540 if (lrs <= lre) then
541 nvals = lre-lrs+1
542#ifdef MORE_GPU
543 if (usegpu) then
544 if (my_pcol == np_bc) call gpu_copy_precision_a_aux_bc(a_dev, aux_bc_dev, n_aux_bc, nvals, lrs, lre, noff, &
545 nblk, n, l_rows, obj%local_nrows, obj%local_ncols, my_stream)
546 else ! useGPU
547 if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n)
548 endif ! useGPU
549#else /* MORE_GPU */
550#ifndef DEVICE_POINTER
551 if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n)
552#else
553 if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a_tmp(lrs:lre,noff*nblk+n)
554#endif
555#endif /* MORE_GPU */
556 n_aux_bc = n_aux_bc + nvals
557 endif ! (lrs <= lre)
558
559 lrs_save(n) = lrs
560 lre_save(n) = lre
561
562 enddo ! n = 1, min(l_rows_np-nb*nblk, nblk)
563
564#ifdef WITH_MPI
565! CCL only with MPI
566#ifdef USE_CCL_MULTIPLY
567 if (usegpu) then ! useGPU-USE_CCL_MULTIPLY
568#ifdef WITH_GPU_STREAMS
569 my_stream = obj%gpu_setup%my_stream
570 ccl_comm_cols = obj%gpu_setup%ccl_comm_cols
571 successgpu = ccl_group_start()
572
573 if (.not.successgpu) then
574 print *,"Error in setting up ccl_group_start!"
575 stop 1
576 endif
577
578 successgpu = ccl_bcast(aux_bc_dev, aux_bc_dev, &
579
580#if REALCASE == 1
581 int(n_aux_bc,kind=c_size_t), &
582#endif
583#if COMPLEXCASE == 1
584 int(2*n_aux_bc,kind=c_size_t), &
585#endif
586#if REALCASE == 1
587#ifdef DOUBLE_PRECISION
588 ccldouble, &
589#endif
590#ifdef SINGLE_PRECISION
591 cclfloat, &
592#endif
593#endif /* REALCASE */
594#if COMPLEXCASE == 1
595#ifdef DOUBLE_PRECISION
596 ccldouble, &
597#endif
598#ifdef SINGLE_PRECISION
599 cclfloat, &
600#endif
601#endif /* COMPLEXCASE */
602 int(np_bc,kind=c_int), ccl_comm_cols, my_stream)
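! Note on the element count passed above: NCCL/RCCL provide no complex
! datatypes, so in the COMPLEXCASE a buffer of n complex numbers is
! broadcast as 2*n reals of the matching precision (ccldouble/cclfloat);
! the ccl_reduce further below uses the same convention.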
603
604 if (.not.successgpu) then
605 print *,"Error in ccl_bcast"
606 stop 1
607 endif
608
609 successgpu = ccl_group_end()
610 if (.not.successgpu) then
611 print *,"Error in setting up ccl_group_end!"
612 stop 1
613 endif
614#endif /* WITH_GPU_STREAMS */
615 else ! useGPU-USE_CCL_MULTIPLY
616#endif /* USE_CCL_MULTIPLY */
617
618#if defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY)
619 if (usegpu) then
620 ! copy data to host for Bcast
621 num = l_rows*nblk*size_of_datatype
622#ifdef WITH_GPU_STREAMS
623 my_stream = obj%gpu_setup%my_stream
624 call gpu_memcpy_async_and_stream_synchronize &
625 ("elpa_mult_at_b: aux_bc_dev -> aux_bc", aux_bc_dev, 0_c_intptr_t, &
626 aux_bc(1:l_rows*nblk), &
627 1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
628#else
629 num = l_rows*nblk*size_of_datatype
630 successgpu = gpu_memcpy(int(loc(aux_bc),kind=c_intptr_t), aux_bc_dev, num,&
631 gpumemcpydevicetohost)
632 check_memcpy_gpu("elpa_mult_at_b: aux_bc_dev -> aux_bc", successgpu)
633#endif
634
635 endif ! useGPU
636#endif /* defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY) */
637
638 ! Broadcast block column
639 call obj%timer%start("mpi_communication")
640
641 call mpi_bcast(aux_bc, int(n_aux_bc,kind=mpi_kind), &
642 mpi_math_datatype_precision, &
643 int(np_bc,kind=mpi_kind), int(mpi_comm_cols,kind=mpi_kind), mpierr)
644
645 call obj%timer%stop("mpi_communication")
646
647#if defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY)
648 if (usegpu) then
649 ! copy data back to device
650 num = l_rows*nblk*size_of_datatype
651#ifdef WITH_GPU_STREAMS
652 my_stream = obj%gpu_setup%my_stream
653 call gpu_memcpy_async_and_stream_synchronize &
654 ("elpa_mult_at_b: aux_bc -> aux_bc_dev", aux_bc_dev, 0_c_intptr_t, &
655 aux_bc(1:l_rows*nblk), &
656 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
657#else
658 successgpu = gpu_memcpy(aux_bc_dev, int(loc(aux_bc),kind=c_intptr_t), num,&
659 gpumemcpyhosttodevice)
660 check_memcpy_gpu("elpa_mult_at_b: aux_bc -> aux_bc_dev", successgpu)
661#endif
662
663 endif ! useGPU
664#endif /* defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY) */
665
666#ifdef USE_CCL_MULTIPLY
667 endif ! useGPU-USE_CCL_MULTIPLY
668#endif /* USE_CCL_MULTIPLY */
669
670#else /* WITH_MPI */
671
672#endif /* WITH_MPI */
673 ! Insert what we got in aux_mat
674
675
676#ifdef MORE_GPU
677 if (usegpu) then
678 n_aux_bc = 0
679 my_stream = obj%gpu_setup%my_stream
680 do n = 1, min(l_rows_np-nb*nblk,nblk)
681 nstor = nstor+1
682 lrs = lrs_save(n)
683 lre = lre_save(n)
684 if (lrs <= lre) then
685 nvals = lre-lrs+1
686 call gpu_copy_precision_aux_bc_aux_mat(aux_bc_dev, aux_mat_dev, lrs, lre, nstor, n_aux_bc, &
687 nvals, l_rows, nblk, nblk_mult, my_stream)
688
689 n_aux_bc = n_aux_bc + nvals
690 endif
691 enddo
692 else ! useGPU
693#endif /* MORE_GPU */
694
695 n_aux_bc = 0
696 do n = 1, min(l_rows_np-nb*nblk,nblk)
697 nstor = nstor+1
698 lrs = lrs_save(n)
699 lre = lre_save(n)
700 if (lrs<=lre) then
701 nvals = lre-lrs+1
702 aux_mat(lrs:lre,nstor) = aux_bc(n_aux_bc+1:n_aux_bc+nvals)
703 n_aux_bc = n_aux_bc + nvals
704 endif
705 enddo
706
707#ifdef MORE_GPU
708 endif ! useGPU
709#endif /* MORE_GPU */
710
711 ! If we got nblk_mult columns in aux_mat or this is the last block
712 ! do the matrix multiplication
713
714 if (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np) then
715
716 lrs = 1 ! 1st local row number for multiply
717 lre = l_rows ! last local row number for multiply
718 if (a_lower) lrs = local_index(gcol_min, my_prow, np_rows, nblk, +1)
719 if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1)
720
721 lcs = 1 ! 1st local col number for multiply
722 lce = l_cols ! last local col number for multiply
723 if (c_upper) lcs = local_index(gcol_min, my_pcol, np_cols, nblk, +1)
724 if (c_lower) lce = min(local_index(gcol, my_pcol, np_cols, nblk, -1),l_cols)
725
726 if (lcs <= lce) then
727#if defined(MORE_GPU) && defined(USE_CCL_MULTIPLY)
728 if (.not.usegpu) then
729#endif
730 ! introduce 1-based indexing
731 allocate(tmp1(nstor,1:lce-lcs+1), tmp2(nstor,1:lce-lcs+1), stat=istat, errmsg=errormessage)
732 call check_alloc("elpa_mult_at_b_&
733 &MATH_DATATYPE ", "tmp1", istat, errormessage)
734#if defined(MORE_GPU) && defined(USE_CCL_MULTIPLY)
735 endif
736#endif
737
738 if (lrs <= lre) then
739 if (usegpu) then
740#ifndef MORE_GPU
741 num = l_rows*nblk_mult*size_of_datatype
742#ifdef WITH_GPU_STREAMS
743 my_stream = obj%gpu_setup%my_stream
744 call gpu_memcpy_async_and_stream_synchronize &
745 ("elpa_mult_at_b: aux_mat to aux_mat_dev", aux_mat_dev, 0_c_intptr_t, &
746 aux_mat(1:l_rows,1:nblk_mult), &
747 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
748#else
749 successgpu = gpu_memcpy(aux_mat_dev, int(loc(aux_mat),kind=c_intptr_t), &
750 num, gpumemcpyhosttodevice)
751 check_memcpy_gpu("elpa_mult_at_b: aux_mat to aux_mat_dev", successgpu)
752#endif
753
754#endif /* MORE_GPU */
755 aux_off = (lrs-1)*size_of_datatype
756 b_off = ((lcs-1)*ldb+lrs-1)*size_of_datatype
757
758 call obj%timer%start("gpublas")
759 gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
760 call gpublas_precision_gemm(blas_trans_or_conj, 'N', nstor, lce-lcs+1, &
761 lre-lrs+1, one, aux_mat_dev+aux_off, l_rows, b_dev+b_off, ldb, zero, &
762 tmp1_dev, nstor, gpuhandle)
763 call obj%timer%stop("gpublas")
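! Shape of the multiplication above (the CPU branch below is analogous):
! tmp1(1:nstor, 1:lce-lcs+1) = op(aux_mat(lrs:lre, 1:nstor)) * b(lrs:lre, lcs:lce)
! with m = nstor, n = lce-lcs+1, k = lre-lrs+1, where op is the transpose
! (blas_trans_or_conj = 'T') in the real case and the conjugate transpose
! ('C') in the complex case. Each process row thus computes a partial
! product, which the reduce below sums onto process row np.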
764
765#ifndef MORE_GPU
766 num = nstor*(lce-lcs+1)*size_of_datatype
767#ifdef WITH_GPU_STREAMS
768 my_stream = obj%gpu_setup%my_stream
769 successgpu = gpu_stream_synchronize(my_stream)
770 check_stream_synchronize_gpu("elpa_mult_at_b: tmp1_dev to tmp1", successgpu)
771
772 successgpu = gpu_memcpy_async(int(loc(tmp1),kind=c_intptr_t), &
773 tmp1_dev, num, gpumemcpydevicetohost, my_stream)
774 check_memcpy_gpu("elpa_mult_at_b: tmp1_dev to tmp1", successgpu)
775
776 my_stream = obj%gpu_setup%my_stream
777 successgpu = gpu_stream_synchronize(my_stream)
778 check_stream_synchronize_gpu("elpa_mult_at_b: tmp1_dev to tmp1", successgpu)
779 ! synchronize streamsPerThread; maybe not necessary
780 successgpu = gpu_stream_synchronize()
781 check_stream_synchronize_gpu("elpa_mult_at_b: tmp1_dev to tmp1", successgpu)
782#else
783 successgpu = gpu_memcpy(int(loc(tmp1),kind=c_intptr_t), &
784 tmp1_dev, num, gpumemcpydevicetohost)
785 check_memcpy_gpu("elpa_mult_at_b: tmp1_dev to tmp1", successgpu)
786#endif
787#endif /* MORE_GPU */
788
789
790 num = nstor*(lce-lcs+1)*size_of_datatype
791 else ! useGPU
792 call obj%timer%start("blas")
793 call precision_gemm(blas_trans_or_conj, 'N', int(nstor,kind=blas_kind), &
794 int(lce-lcs+1,kind=blas_kind), int(lre-lrs+1,kind=blas_kind), &
795 one, aux_mat(lrs:lre,1:nstor), int(lre-lrs+1,kind=blas_kind), &
796 b(lrs,lcs), int(ldb,kind=blas_kind), zero, tmp1, &
797 int(nstor,kind=blas_kind))
798 call obj%timer%stop("blas")
799 endif ! useGPU
800 else ! (lrs <= lre)
801#ifdef MORE_GPU
802 if (usegpu) then
803 num = nstor*(lce-lcs+1)*size_of_datatype
804#ifdef WITH_GPU_STREAMS
805 my_stream = obj%gpu_setup%my_stream
806 successgpu = gpu_memset_async(tmp1_dev, 0, num, my_stream)
807 check_memcpy_gpu("hermitian_multiply: tmp1_dev", successgpu)
808#else
809 successgpu = gpu_memset(tmp1_dev, 0, num)
810 check_memcpy_gpu("hermitian_multiply: tmp1_dev", successgpu)
811#endif
812 else ! useGPU
813 tmp1 = 0
814 endif ! useGPU
815#else /* MORE_GPU */
816 tmp1 = 0
817#endif /* MORE_GPU */
818 endif ! (lrs <= lre)
819
820 ! Sum up the results and send to processor row np
821
822 if (usegpu) then
823#ifdef WITH_MPI
824
825#ifdef USE_CCL_MULTIPLY
826#ifdef WITH_GPU_STREAMS
827 my_stream = obj%gpu_setup%my_stream
828 ccl_comm_rows = obj%gpu_setup%ccl_comm_rows
829
830 successgpu = ccl_group_start()
831 if (.not.successgpu) then
832 print *,"Error in setting up ccl_group_start!"
833 stop 1
834 endif
835
836 successgpu = ccl_reduce(tmp1_dev, tmp2_dev, &
837
838#if REALCASE == 1
839 int(nstor*(lce-lcs+1),kind=c_size_t), &
840#endif
841#if COMPLEXCASE == 1
842 int(2*nstor*(lce-lcs+1),kind=c_size_t), &
843#endif
844#if REALCASE == 1
845#ifdef DOUBLE_PRECISION
846 ccldouble, &
847#endif
848#ifdef SINGLE_PRECISION
849 cclfloat, &
850#endif
851#endif /* REALCASE */
852#if COMPLEXCASE == 1
853#ifdef DOUBLE_PRECISION
854 ccldouble, &
855#endif
856#ifdef SINGLE_PRECISION
857 cclfloat, &
858#endif
859#endif /* COMPLEXCASE */
860 cclsum, int(np,kind=c_int), ccl_comm_rows, my_stream)
861
862
863 if (.not.successgpu) then
864 print *,"Error in ccl_reduce"
865 stop 1
866 endif
867
868 successgpu = ccl_group_end()
869 if (.not.successgpu) then
870 print *,"Error in setting up ccl_group_end!"
871 stop 1
872 endif
873#endif /* WITH_GPU_STREAMS */
874
875#endif /* USE_CCL_MULTIPLY */
876
877#if defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY)
878 ! copy data to host
879 num = nstor*(lce-lcs+1)*size_of_datatype
880#ifdef WITH_GPU_STREAMS
881 call gpu_memcpy_async_and_stream_synchronize &
882 ("elpa_mult_at_b: tmp1_dev to tmp1", tmp1_dev, 0_c_intptr_t, &
883 !tmp1(1:nblk_mult,1:l_cols), &
884 tmp1(1:nstor,1:lce-lcs+1), &
885 1, 1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
886#else
887 successgpu = gpu_memcpy(int(loc(tmp1),kind=c_intptr_t), &
888 tmp1_dev, num, gpumemcpydevicetohost)
889 check_memcpy_gpu("elpa_mult_at_b: tmp1_dev to tmp1", successgpu)
890#endif
891#endif /* defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY) */
892
893#if !defined(USE_CCL_MULTIPLY)
894 ! communication already done before with CCL
895 call obj%timer%start("mpi_communication")
896 call mpi_reduce(tmp1, tmp2, int(nstor*(lce-lcs+1),kind=mpi_kind), mpi_math_datatype_precision, &
897 mpi_sum, int(np,kind=mpi_kind), int(mpi_comm_rows,kind=mpi_kind), mpierr)
898 call obj%timer%stop("mpi_communication")
899#endif /* !defined(USE_CCL_MULTIPLY) */
900
901#if defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY)
902 ! copy data to device
903 num = nstor*(lce-lcs+1)*size_of_datatype
904#ifdef WITH_GPU_STREAMS
905 call gpu_memcpy_async_and_stream_synchronize &
906 ("elpa_mult_at_b: tmp2 to tmp2_dev", tmp2_dev, 0_c_intptr_t, &
907 !tmp2(1:nblk_mult,1:l_cols), &
908 tmp2(1:nstor,1:lce-lcs+1), &
909 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
910#else
911 successgpu = gpu_memcpy(tmp2_dev, int(loc(tmp2),kind=c_intptr_t), &
912 num, gpumemcpyhosttodevice)
913 check_memcpy_gpu("elpa_mult_at_b: tmp2 to tmp2_dev", successgpu)
914#endif
915#endif /* defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY) */
916
917#else /* WITH_MPI */
918
919#ifdef MORE_GPU
920 num = nstor*(lce-lcs+1)*size_of_datatype
921 successgpu = gpu_memcpy(tmp2_dev, tmp1_dev, &
922 num, gpumemcpydevicetodevice)
923 check_memcpy_gpu("elpa_mult_at_b: tmp2 to tmp2_dev", successgpu)
924#else /* MORE_GPU */
925 tmp2 = tmp1
926#endif /* MORE_GPU */
927#endif /* WITH_MPI */
928
929 else ! useGPU
930#ifdef WITH_MPI
931 call obj%timer%start("mpi_communication")
932 call mpi_reduce(tmp1, tmp2, int(nstor*(lce-lcs+1),kind=mpi_kind), mpi_math_datatype_precision, &
933 mpi_sum, int(np,kind=mpi_kind), int(mpi_comm_rows,kind=mpi_kind), mpierr)
934 call obj%timer%stop("mpi_communication")
935#endif
936 endif ! useGPU
937
938 if (usegpu) then
939#ifdef MORE_GPU
940 if (my_prow==np) call gpu_copy_precision_tmp2_c(tmp2_dev, c_dev, nr_done, nstor, lcs, lce, ldc, ldccols, my_stream)
941#else /* MORE_GPU */
942#ifdef WITH_MPI
943 ! Put the result into C
944#ifndef DEVICE_POINTER
945 if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,1:lce-lcs+1)
946#else
947 if (my_prow==np) c_tmp(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,1:lce-lcs+1)
948#endif
949#else /* WITH_MPI */
950 ! Put the result into C
951#ifndef DEVICE_POINTER
952 if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,1:lce-lcs+1)
953#else
954 if (my_prow==np) c_tmp(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,1:lce-lcs+1)
955#endif
956 !tmp2(:,:) = 0.
957#endif /* WITH_MPI */
958
959#endif /* MORE_GPU */
960 else ! useGPU
961#ifdef WITH_MPI
962 ! Put the result into C
963 if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,1:lce-lcs+1)
964#else /* WITH_MPI */
965 ! Put the result into C
966 if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,1:lce-lcs+1)
967 !tmp2(:,:) = 0.
968#endif /* WITH_MPI */
969 endif ! useGPU
970#if defined(MORE_GPU) && defined(USE_CCL_MULTIPLY)
971 if (.not.usegpu) then
972#endif
973 deallocate(tmp1, tmp2, stat=istat, errmsg=errormessage)
974 check_deallocate("elpa_mult_at_b: tmp1, tmp2", istat, errormessage)
976#if defined(MORE_GPU) && defined(USE_CCL_MULTIPLY)
977 endif
978#endif
979 endif ! (lcs <= lce)
980
981 nr_done = nr_done+nstor
982 nstor=0
983 if (usegpu) then
984 num = l_rows*nblk_mult*size_of_datatype
985#ifdef WITH_GPU_STREAMS
986 my_stream = obj%gpu_setup%my_stream
987 successgpu = gpu_memset_async(aux_mat_dev, 0, num, my_stream)
988 check_memcpy_gpu("hermitian_multiply: aux_mat_dev", successgpu)
989#else
990 successgpu = gpu_memset(aux_mat_dev, 0, num)
991 check_memcpy_gpu("hermitian_multiply: aux_mat_dev", successgpu)
992#endif
993
994#ifndef MORE_GPU
995 ! additionally set HOST array to zero
996 aux_mat(:,:) = 0
997#endif
998 else
999 aux_mat(:,:) = 0
1000 endif
1001 endif ! (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np)
1002
1003#ifdef WITH_NVTX
1004 call nvtxrangepop() ! do nb = 0, (l_rows_np-1)/nblk
1005#endif
1006 enddo ! nb = 0, (l_rows_np-1)/nblk
1007
1008#ifdef WITH_NVTX
1009 call nvtxrangepop() ! do np = 0, np_rows-1
1010#endif
1011 enddo ! main loop: np = 0, np_rows-1
1012
1013!#ifdef MORE_GPU
1014! if (useGPU) then
1015! deallocate(tmp2, stat=istat, errmsg=errorMessage)
1016! check_deallocate("elpa_mult_at_b: tmp1, tmp2", istat, errorMessage)
1017! endif
1018!#endif
1019
1020 if (usegpu) then
1021#if !defined(DEVICE_POINTER)
1022 successgpu = gpu_free(b_dev)
1023 check_dealloc_gpu("elpa_multiply_a_b: b_dev", successgpu)
1024#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
1025 successgpu = gpu_host_unregister(int(loc(b),kind=c_intptr_t))
1026 check_host_unregister_gpu("elpa_multiply_a_b: b", successgpu)
1027#endif
1028
1029#ifdef MORE_GPU
1030 ! copy result c_dev back to CPU
1031 num = ldc*ldccols*size_of_datatype
1032#ifdef WITH_GPU_STREAMS
1034 call gpu_memcpy_async_and_stream_synchronize &
1035 ("elpa_mult_at_b: c_dev to c", c_dev, 0_c_intptr_t, &
1036 c(1:ldc,1:ldccols), &
1037 1, 1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
1038#else
1039 successgpu = gpu_memcpy(int(loc(c),kind=c_intptr_t), c_dev, num,&
1040 gpumemcpydevicetohost)
1041 check_memcpy_gpu("elpa_mult_at_b: c_dev -> c", successgpu)
1042
1043#endif
1044 successgpu = gpu_free(c_dev)
1045 check_dealloc_gpu("elpa_multiply_a_b: c_dev", successgpu)
1046
1047#else /* MORE_GPU */
1048 ! c is on the host; no copy back needed
1049#endif /* MORE_GPU */
1050
1051#else /* DEVICE_POINTER */
1052 !successGPU = gpu_free(b_dev)
1053 !check_dealloc_gpu("elpa_multiply_a_b: b_dev", successGPU)
1054
1055 !num = ldc*ldcCols*size_of_datatype
1056 !successGPU = gpu_memcpy(cDev, c_dev, num,&
1057 ! gpuMemcpyDeviceToDevice)
1058 !check_memcpy_gpu("elpa_mult_at_b: c_dev -> cDev", successGPU)
1059
1060 !successGPU = gpu_free(c_dev)
1061 !check_dealloc_gpu("elpa_multiply_a_b: c_dev", successGPU)
1062
1063#ifndef MORE_GPU
1064#ifdef WITH_GPU_STREAMS
1065 successgpu = gpu_host_unregister(int(loc(a_tmp),kind=c_intptr_t))
1066 check_host_unregister_gpu("elpa_mult_at_b: a_tmp", successgpu)
1067#endif
1068 deallocate(a_tmp, stat=istat, errmsg=errormessage)
1069 check_deallocate("elpa_mult_at_b: a_tmp", istat, errormessage)
1070
1071 num = ldc*ldccols*size_of_datatype
1072#ifdef WITH_GPU_STREAMS
1073 my_stream = obj%gpu_setup%my_stream
1074 successgpu = gpu_stream_synchronize(my_stream)
1075 check_stream_synchronize_gpu("elpa_mult_at_b: c_tmp to c", successgpu)
1076
1077 successgpu = gpu_memcpy_async(cdev,int(loc(c_tmp),kind=c_intptr_t),num,&
1078 gpumemcpyhosttodevice, my_stream)
1079 check_memcpy_gpu("elpa_mult_at_b: c_tmp -> c", successgpu)
1080
1081 my_stream = obj%gpu_setup%my_stream
1082 successgpu = gpu_stream_synchronize(my_stream)
1083 check_stream_synchronize_gpu("elpa_mult_at_b: c_tmp -> c", successgpu)
1084 ! synchronize streamsPerThread; maybe not necessary
1085 successgpu = gpu_stream_synchronize()
1086 check_stream_synchronize_gpu("elpa_mult_at_b: c_tmp -> c", successgpu)
1087#else
1088 successgpu = gpu_memcpy(cdev,int(loc(c_tmp),kind=c_intptr_t),num,&
1089 gpumemcpyhosttodevice)
1090 check_memcpy_gpu("elpa_mult_at_b: c_tmp -> c", successgpu)
1091#endif
1092#ifdef WITH_GPU_STREAMS
1093 successgpu = gpu_host_unregister(int(loc(c_tmp),kind=c_intptr_t))
1094 check_host_unregister_gpu("elpa_multiply_a_b: c_tmp", successgpu)
1095#endif
1096
1097 deallocate(c_tmp, stat=istat, errmsg=errormessage)
1098 check_deallocate("elpa_mult_at_b: c_tmp", istat, errormessage)
1099#endif /* MORE_GPU */
1100
1101#endif /* DEVICE_POINTER */
1102#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
1103 nullify(aux_mat)
1104 !nullify(tmp1)
1105
1106 successgpu = gpu_free_host(aux_host)
1107 check_host_dealloc_gpu("elpa_multiply_a_b: aux_host", successgpu)
1108
1109 !successGPU = gpu_free_host(tmp1_host)
1110 !check_host_dealloc_gpu("elpa_multiply_a_b: tmp1_host", successGPU)
1111#else
1112 deallocate(aux_mat, stat=istat, errmsg=errormessage)
1113 check_deallocate("elpa_mult_at_b: aux_mat", istat, errormessage)
1114
1115 !deallocate(tmp1, stat=istat, errmsg=errorMessage)
1116 !check_deallocate("elpa_mult_at_b: tmp1", istat, errorMessage)
1117#endif
1118
1119 successgpu = gpu_free(aux_mat_dev)
1120 check_dealloc_gpu("elpa_multiply_a_b: aux_mat_dev", successgpu)
1121
1122 successgpu = gpu_free(tmp1_dev)
1123 check_dealloc_gpu("elpa_multiply_a_b: tmp1_dev", successgpu)
1124
1125#ifdef MORE_GPU
1126 successgpu = gpu_free(tmp2_dev)
1127 check_dealloc_gpu("elpa_multiply_a_b: tmp2_dev", successgpu)
1128
1129 successgpu = gpu_free(aux_bc_dev)
1130 check_dealloc_gpu("elpa_multiply_a_b: aux_bc_dev", successgpu)
1131#endif
1132
1133#if !defined(DEVICE_POINTER)
1134#ifdef MORE_GPU
1135 successgpu = gpu_free(a_dev)
1136 check_dealloc_gpu("elpa_mult_at_b: a_dev", successgpu)
1137#endif
1138#else
1139 !successGPU = gpu_free(a_dev)
1140 !check_dealloc_gpu("elpa_mult_at_b: a_dev", successGPU)
1141#endif
1142 else ! useGPU
1143 deallocate(aux_mat, stat=istat, errmsg=errormessage)
1144 check_deallocate("elpa_mult_at_b: aux_mat", istat, errormessage)
1145 endif ! useGPU
1146
1147 deallocate(aux_bc, lrs_save, lre_save, stat=istat, errmsg=errormessage)
1148 check_deallocate("elpa_mult_at_b: aux_bc, lrs_save, lre_save", istat, errormessage)
1149
1150 call obj%timer%stop("elpa_mult_at_b_&
1151 &MATH_DATATYPE&
1152 &_&
1153 &PRECISION&
1154 &"//gpustring)
1155