Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2024.03.001
elpa_multiply_a_b_template.F90
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
!   Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!   Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
!   Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!   Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!   and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! Author: A. Marek, MPCDF

#include "../general/sanity.F90"
#include "../general/error_checking.inc"

#undef USE_CCL_MULTIPLY
#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
#define USE_CCL_MULTIPLY
#endif



#ifdef USE_CCL_MULTIPLY
#define MORE_GPU
#endif
#ifndef USE_CCL_MULTIPLY
#define MORE_GPU
#endif
#ifdef DEVICE_POINTER
#define MORE_GPU
#endif

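! Note: as the three blocks above are written, MORE_GPU ends up defined on
! every compilation path (both the #ifdef and the #ifndef USE_CCL_MULTIPLY
! branch define it), so the #ifndef MORE_GPU fallbacks further down are
! effectively dead code.
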
  use elpa1_compute
  use elpa_mpi
  use precision
  use elpa_abstract_impl
  use, intrinsic :: iso_c_binding
  use elpa_gpu
  use mod_check_for_gpu
  use elpa_blas_interfaces
  use elpa_utilities, only : local_index, check_deallocate_f, check_dealloc_gpu_f, &
                             check_host_dealloc_gpu_f, check_alloc_gpu_f, check_host_alloc_gpu_f, &
                             check_host_unregister_gpu_f, check_memcpy_gpu_f, check_allocate_f, &
                             check_host_register_gpu_f, check_alloc, error_unit
  use mod_query_gpu_usage
#ifdef WITH_GPU_STREAMS
  use elpa_gpu_util
#endif
#ifdef WITH_NVIDIA_GPU_VERSION
  use cuda_functions ! for NVTX labels
#endif
#if defined(USE_CCL_MULTIPLY)
  use elpa_ccl_gpu
#endif
  use multiply_a_b_gpu
  implicit none

#include "../../src/general/precision_kinds.F90"
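! Roles of the main dummy arguments, as inferred from their use below:
!   obj            - ELPA object carrying the matrix layout and the MPI/GPU setup
!   uplo_a, uplo_c - 'U'/'L' if only the upper/lower triangle of A resp. C is
!                    referenced/computed, anything else for full matrices
!   ldb, ldbCols, ldc, ldcCols - leading dimensions and column counts of b and c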
  class(elpa_abstract_impl_t), intent(inout) :: obj

  character*1                          :: uplo_a, uplo_c

  integer(kind=ik), intent(in)         :: ldb, ldbCols, ldc, ldcCols
  integer(kind=ik)                     :: na, ncb
#ifndef DEVICE_POINTER
  ! the DEVICE_POINTER case is left aside here for the moment; to be implemented later
#ifdef USE_ASSUMED_SIZE
  MATH_DATATYPE(kind=rck)              :: a(obj%local_nrows,*), b(ldb,*), c(ldc,*)
#else
  MATH_DATATYPE(kind=rck)              :: a(obj%local_nrows,obj%local_ncols), b(ldb,ldbCols), c(ldc,ldcCols)
#endif
#else /* DEVICE_POINTER */
  ! dummy variables
  MATH_DATATYPE(kind=rck), allocatable :: a(:,:), b(:,:), c(:,:)
  type(c_ptr)                          :: aDev, bDev, cDev
#endif /* DEVICE_POINTER */

  integer(kind=ik)                     :: my_prow, my_pcol, np_rows, np_cols, myid
  integer(kind=MPI_KIND)               :: my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
  integer(kind=MPI_KIND)               :: mpierr, myidMPI
  integer(kind=ik)                     :: l_cols, l_rows, l_rows_np
  integer(kind=ik)                     :: np, n, nb, nblk_mult, lrs, lre, lcs, lce
  integer(kind=ik)                     :: gcol_min, gcol, goff
  integer(kind=ik)                     :: nstor, nr_done, noff, np_bc, n_aux_bc, nvals
  integer(kind=ik), allocatable        :: lrs_save(:), lre_save(:)

  logical                              :: a_lower, a_upper, c_lower, c_upper
  MATH_DATATYPE(kind=rck), pointer, contiguous :: aux_mat(:,:), tmp1(:,:)
  MATH_DATATYPE(kind=rck), allocatable :: aux_bc(:), tmp2(:,:)
  integer(kind=ik)                     :: istat
  character(200)                       :: errorMessage
  character(20)                        :: gpuString
  logical                              :: success
  logical                              :: successGPU
  logical                              :: useGPU
  integer(kind=c_int)                  :: numGPU, blocking
  integer(kind=ik)                     :: mpi_comm_rows, mpi_comm_cols, mpi_comm_all
  integer(kind=ik)                     :: nblk, matrixRows, matrixCols, error
  integer(kind=c_intptr_t)             :: aux_mat_dev, tmp1_dev
!#ifndef DEVICE_POINTER
  integer(kind=c_intptr_t)             :: b_dev
#ifdef MORE_GPU
  integer(kind=c_intptr_t)             :: a_dev
#endif
#ifdef MORE_GPU
  integer(kind=c_intptr_t)             :: c_dev
#endif
!#endif
#ifdef MORE_GPU
  integer(kind=c_intptr_t)             :: tmp2_dev, aux_bc_dev
#endif
  type(c_ptr)                          :: aux_host, tmp1_host
  integer(kind=c_intptr_t)             :: num
  integer(kind=c_intptr_t)             :: aux_off, b_off
  integer(kind=c_intptr_t), parameter  :: size_of_datatype = size_of_&
                                          &PRECISION&
                                          &_&
                                          &MATH_DATATYPE

  integer(kind=c_intptr_t)             :: gpuHandle, my_stream
  integer(kind=c_int)                  :: gpu_hermitian_multiply

#if defined(USE_CCL_MULTIPLY)
  integer(kind=c_intptr_t)             :: ccl_comm_rows, ccl_comm_cols
#endif
#ifdef DEVICE_POINTER
  MATH_DATATYPE(kind=rck), allocatable :: a_tmp(:,:), c_tmp(:,:)
#endif
  integer(kind=c_intptr_t)             :: aux_dev
  integer(kind=c_int)                  :: gpu
  integer(kind=c_int)                  :: gpu_multiply_a_b

  success = .true.
  useGPU  = .false.


  ! assumption: when DEVICE_POINTER is set -> MORE_GPU
  !             when DEVICE_POINTER is set -> useGPU = .true.

#if !defined(DEVICE_POINTER)

#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
  if (.not.(query_gpu_usage(obj, "ELPA_MULTIPLY_AB", useGPU))) then
    print *,"ELPA_MULTIPLY_AB: Problem querying settings for GPU. Aborting..."
    stop 1
  endif
#endif

  ! check whether the above setting should be overridden
  if (obj%is_set("gpu_hermitian_multiply") == 1) then
    call obj%get("gpu_hermitian_multiply", gpu_hermitian_multiply, error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option for gpu_hermitian_multiply. Aborting..."
      stop 1
    endif
    if (useGPU .and. gpu_hermitian_multiply .eq. 0) then
      useGPU = .false.
    else if (.not.(useGPU) .and. gpu_hermitian_multiply .eq. 1) then
      useGPU = .true.
    endif
  else
    ! no override by the user: keep the setting found above
  endif

#else /* DEVICE_POINTER */
  useGPU = .true.

#ifdef MORE_GPU
  c_dev = transfer(cDev, c_dev)
  a_dev = transfer(aDev, a_dev)
#else /* MORE_GPU */
  allocate(a_tmp(obj%local_nrows,obj%local_ncols), stat=istat, errmsg=errorMessage)
  check_allocate("elpa_mult_at_b: a_tmp", istat, errorMessage)

  num = obj%local_nrows*obj%local_ncols*size_of_datatype
#ifdef WITH_GPU_STREAMS
  successGPU = gpu_host_register(int(loc(a_tmp),kind=c_intptr_t), &
                obj%local_nrows*obj%local_ncols * size_of_datatype,&
                gpuHostRegisterDefault)
  check_host_register_gpu("elpa_mult_at_b: a_tmp", successGPU)

  my_stream = obj%gpu_setup%my_stream
  successGPU = gpu_stream_synchronize(my_stream)
  check_stream_synchronize_gpu("elpa_mult_at_b: a_dev to a_tmp", successGPU)

  successGPU = gpu_memcpy_async(int(loc(a_tmp),kind=c_intptr_t), aDev, num,&
               gpuMemcpyDeviceToHost, my_stream)
  check_memcpy_gpu("elpa_mult_at_b: a_dev -> a_tmp", successGPU)

  my_stream = obj%gpu_setup%my_stream
  successGPU = gpu_stream_synchronize(my_stream)
  check_stream_synchronize_gpu("elpa_mult_at_b: a_dev -> a_tmp", successGPU)
  ! synchronize streamsPerThread; maybe not necessary
  successGPU = gpu_stream_synchronize()
  check_stream_synchronize_gpu("elpa_mult_at_b: a_dev -> a_tmp", successGPU)
#else
  successGPU = gpu_memcpy(int(loc(a_tmp),kind=c_intptr_t), aDev, num,&
               gpuMemcpyDeviceToHost)
  check_memcpy_gpu("elpa_mult_at_b: a_dev -> a_tmp", successGPU)
#endif

  allocate(c_tmp(ldc,ldcCols), stat=istat, errmsg=errorMessage)
  check_allocate("elpa_mult_at_b: c_tmp", istat, errorMessage)

#ifdef WITH_GPU_STREAMS
  successGPU = gpu_host_register(int(loc(c_tmp),kind=c_intptr_t),&
                ldc*ldcCols*size_of_datatype, &
                gpuHostRegisterDefault)
#endif

#endif /* MORE_GPU */


  b_dev = transfer(bDev, b_dev)

#endif /* DEVICE_POINTER */

  if (useGPU) then
    gpuString = "_gpu"
  else
    gpuString = ""
  endif

  call obj%timer%start("elpa_mult_at_b_&
  &MATH_DATATYPE&
  &_&
  &PRECISION&
  &"//gpuString)

  na   = obj%na
  nblk = obj%nblk
  matrixRows = obj%local_nrows
  matrixCols = obj%local_ncols

  mpi_comm_all  = obj%mpi_setup%mpi_comm_parent
  mpi_comm_cols = obj%mpi_setup%mpi_comm_cols
  mpi_comm_rows = obj%mpi_setup%mpi_comm_rows

  myid    = obj%mpi_setup%myRank_comm_parent
  my_prow = obj%mpi_setup%myRank_comm_rows
  my_pcol = obj%mpi_setup%myRank_comm_cols

  np_rows = obj%mpi_setup%nRanks_comm_rows
  np_cols = obj%mpi_setup%nRanks_comm_cols

  l_rows = local_index(na,  my_prow, np_rows, nblk, -1) ! Local rows of a and b
  l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1) ! Local cols of b

  ! Block factor for matrix multiplications, must be a multiple of nblk

  if (obj%is_set("blocking_in_multiply") == 1) then
    call obj%get("blocking_in_multiply", blocking, error)
    if (error .ne. ELPA_OK) then
      write(error_unit,*) "ELPA_HERMITIAN_MULTIPLY: Problem in getting keyword 'blocking_in_multiply'. Aborting..."
      stop 1
    endif
    nblk_mult = (blocking/nblk+1) * nblk
  else ! is_set
    if (useGPU) then
      if (na/np_rows <= 256) then
        nblk_mult = (63/nblk+1)*nblk
      else
        nblk_mult = (351/nblk+1)*nblk
      endif
    else ! useGPU
      if (na/np_rows <= 256) then
        nblk_mult = (31/nblk+1)*nblk
      else
        nblk_mult = (63/nblk+1)*nblk
      endif
    endif ! useGPU
  endif ! is_set
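
  ! Illustrative example of the blocking choice above: with nblk = 16 and
  ! blocking_in_multiply = 100, nblk_mult = (100/16 + 1) * 16 = 112 (integer
  ! division); the formula always rounds up to the next-larger multiple of nblk.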

  if (useGPU) then
    call obj%timer%start("check_for_gpu")
    if (check_for_gpu(obj, myid, numGPU)) then
      ! set the necessary parameters
      call set_gpu_parameters()
    else
      print *,"GPUs are requested but not detected! Aborting..."
      success = .false.
      return
    endif
    call obj%timer%stop("check_for_gpu")

#ifdef MORE_GPU
#if !defined(DEVICE_POINTER)
    num = ldc*ldcCols*size_of_datatype
    successGPU = gpu_malloc(c_dev, num)
    check_alloc_gpu("elpa_mult_at_b: c_dev", successGPU)
    ! no copy from c to c_dev needed, since c will be overwritten anyway
#endif
#endif /* MORE_GPU */

#if !defined(DEVICE_POINTER)
    ! copy b to b_dev
    num = ldb*ldbCols*size_of_datatype
    successGPU = gpu_malloc(b_dev, num)
    check_alloc_gpu("elpa_mult_at_b: b_dev", successGPU)

#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
    successGPU = gpu_host_register(int(loc(b),kind=c_intptr_t),num,&
                 gpuHostRegisterDefault)
#endif

    check_host_register_gpu("elpa_mult_at_b: b", successGPU)
#ifdef WITH_GPU_STREAMS
    my_stream = obj%gpu_setup%my_stream
    call gpu_memcpy_async_and_stream_synchronize &
         ("elpa_mult_at_b: b to b_dev", b_dev, 0_c_intptr_t, &
          b(1:ldb,1:ldbCols), &
          1, 1, num, gpuMemcpyHostToDevice, my_stream, .false., .true., .false.)
#else
    successGPU = gpu_memcpy(b_dev,int(loc(b),kind=c_intptr_t),num,&
                 gpuMemcpyHostToDevice)
    check_memcpy_gpu("elpa_mult_at_b: b to b_dev", successGPU)
#endif

#else /* DEVICE_POINTER */

#endif /* DEVICE_POINTER */

    num = l_rows*nblk_mult*size_of_datatype
#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
    successGPU = gpu_malloc_host(aux_host, num)
    check_host_alloc_gpu("elpa_mult_at_b: aux_host", successGPU)
    call c_f_pointer(aux_host, aux_mat, (/l_rows,nblk_mult/))
#else
    allocate(aux_mat(l_rows, nblk_mult), stat=istat, errmsg=errorMessage)
    check_allocate("elpa_mult_at_b: aux_mat", istat, errorMessage)
#endif

    successGPU = gpu_malloc(aux_mat_dev, num)
    check_alloc_gpu("elpa_mult_at_b: aux_mat_dev", successGPU)

    num = nblk_mult*l_cols*size_of_datatype
    successGPU = gpu_malloc(tmp1_dev, num)
    check_alloc_gpu("elpa_mult_at_b: tmp1_dev", successGPU)

!#ifdef MORE_GPU
!    allocate(tmp2(nblk_mult,l_cols), stat=istat, errmsg=errorMessage)
!    check_allocate("elpa_mult_at_b: tmp2", istat, errorMessage)
!#endif

#ifdef MORE_GPU
    num = nblk_mult*l_cols*size_of_datatype
    successGPU = gpu_malloc(tmp2_dev, num)
    check_alloc_gpu("elpa_mult_at_b: tmp2_dev", successGPU)
#endif
  else ! useGPU
    allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errorMessage)
    check_allocate("elpa_mult_at_b: aux_mat", istat, errorMessage)
  endif ! useGPU

  allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errorMessage)
  check_allocate("elpa_mult_at_b: aux_bc", istat, errorMessage)

  allocate(lrs_save(nblk), stat=istat, errmsg=errorMessage)
  check_allocate("elpa_mult_at_b: lrs_save", istat, errorMessage)

  allocate(lre_save(nblk), stat=istat, errmsg=errorMessage)
  check_allocate("elpa_mult_at_b: lre_save", istat, errorMessage)

  a_lower = .false.
  a_upper = .false.
  c_lower = .false.
  c_upper = .false.

  if (uplo_a=='u' .or. uplo_a=='U') a_upper = .true.
  if (uplo_a=='l' .or. uplo_a=='L') a_lower = .true.
  if (uplo_c=='u' .or. uplo_c=='U') c_upper = .true.
  if (uplo_c=='l' .or. uplo_c=='L') c_lower = .true.
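
  ! When A is triangular (a_upper/a_lower), only the stored triangle of each
  ! block column enters the broadcast below (via lrs/lre); when C is triangular
  ! (c_upper/c_lower), only the corresponding local columns of the result are
  ! computed (via lcs/lce).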

#ifdef MORE_GPU
  if (useGPU) then

#if !defined(DEVICE_POINTER)
    num = obj%local_nrows*obj%local_ncols*size_of_datatype
    successGPU = gpu_malloc(a_dev, num)
    check_alloc_gpu("elpa_mult_at_b: a_dev", successGPU)
#endif

    num = l_rows*nblk*size_of_datatype
    successGPU = gpu_malloc(aux_bc_dev, num)
    check_alloc_gpu("elpa_mult_at_b: aux_bc_dev", successGPU)

    num = obj%local_nrows*obj%local_ncols*size_of_datatype
#if !defined(DEVICE_POINTER)

#ifdef WITH_GPU_STREAMS
    my_stream = obj%gpu_setup%my_stream
    call gpu_memcpy_async_and_stream_synchronize &
         ("elpa_mult_at_b: a to a_dev", a_dev, 0_c_intptr_t, &
          a(1:obj%local_nrows,1:obj%local_ncols), &
          1, 1, num, gpuMemcpyHostToDevice, my_stream, .false., .true., .false.)
#else
    successGPU = gpu_memcpy(a_dev, int(loc(a),kind=c_intptr_t), &
                 num, gpuMemcpyHostToDevice)
    check_memcpy_gpu("elpa_mult_at_b: a to a_dev", successGPU)
#endif
#endif /* DEVICE_POINTER */
  endif ! useGPU
#endif /* MORE_GPU */

  ! Build up the result matrix by processor rows
  do np = 0, np_rows-1

#ifdef WITH_NVTX
    call nvtxRangePush("do np = 0, np_rows-1")
#endif

    ! In this turn, procs of row np assemble the result

    l_rows_np = local_index(na, np, np_rows, nblk, -1) ! local rows on receiving processors

    nr_done = 0 ! Number of rows done
    aux_mat = 0
    if (useGPU) then
      num = l_rows*nblk_mult*size_of_datatype
#ifdef WITH_GPU_STREAMS
      my_stream = obj%gpu_setup%my_stream
      successGPU = gpu_memset_async(aux_mat_dev, 0, num, my_stream)
      check_memcpy_gpu("hermitian_multiply: aux_mat_dev", successGPU)
#else
      successGPU = gpu_memset(aux_mat_dev, 0, num)
      check_memcpy_gpu("hermitian_multiply: aux_mat_dev", successGPU)
#endif

    endif
    nstor = 0 ! Number of columns stored in aux_mat

    ! Loop over the blocks on row np
    do nb = 0, (l_rows_np-1)/nblk

#ifdef WITH_NVTX
      call nvtxRangePush("do nb = 0, (l_rows_np-1)/nblk")
#endif

      goff = nb*np_rows + np ! Global offset in blocks corresponding to nb

      ! Get the processor column which owns this block (A is transposed, so we need the column)
      ! and the offset in blocks within this column.
      ! The corresponding block column in A is then broadcast to all for multiplication with B

      np_bc = mod(goff,np_cols) ! "bc" = block column
      noff  = goff/np_cols
      n_aux_bc = 0

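      ! Illustrative example of the owner computation above: with np_cols = 4
      ! and global block number goff = 10, the block column is owned by process
      ! column np_bc = mod(10,4) = 2 and sits at local block offset
      ! noff = 10/4 = 2 within that column.
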
      ! Gather up the complete block column of A on the owner
      do n = 1, min(l_rows_np-nb*nblk,nblk) ! Loop over columns to be broadcast

        gcol = goff*nblk + n ! global column corresponding to n
        if (nstor==0 .and. n==1) gcol_min = gcol

        lrs = 1      ! 1st local row number for broadcast
        lre = l_rows ! last local row number for broadcast
        if (a_lower) lrs = local_index(gcol, my_prow, np_rows, nblk, +1)
        if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1)

        if (lrs <= lre) then
          nvals = lre-lrs+1
#ifdef MORE_GPU
          if (useGPU) then
            if (my_pcol == np_bc) call gpu_copy_PRECISION_a_aux_bc(a_dev, aux_bc_dev, n_aux_bc, nvals, lrs, lre, noff, &
                                       nblk, n, l_rows, obj%local_nrows, obj%local_ncols, my_stream)
          else ! useGPU
            if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n)
          endif ! useGPU
#else /* MORE_GPU */
#ifndef DEVICE_POINTER
          if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n)
#else
          if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a_tmp(lrs:lre,noff*nblk+n)
          !print *,"done test 2"
#endif
#endif /* MORE_GPU */
          n_aux_bc = n_aux_bc + nvals
        endif

        lrs_save(n) = lrs
        lre_save(n) = lre

      enddo

#ifdef WITH_MPI
! CCL only with MPI
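! With NCCL/RCCL available (USE_CCL_MULTIPLY) the broadcast below stays
! device-to-device, so aux_bc need not be staged through the host; note that
! in the complex case twice the number of real words is passed as the count.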
#ifdef USE_CCL_MULTIPLY
      if (useGPU) then
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        ccl_comm_cols = obj%gpu_setup%ccl_comm_cols
        successGPU = ccl_group_start()

        if (.not.successGPU) then
          print *,"Error in setting up ccl_group_start!"
          stop 1
        endif

        successGPU = ccl_bcast(aux_bc_dev, aux_bc_dev, &
#if REALCASE == 1
                     int(n_aux_bc,kind=c_size_t), &
#endif
#if COMPLEXCASE == 1
                     int(2*n_aux_bc,kind=c_size_t), &
#endif
#if REALCASE == 1
#ifdef DOUBLE_PRECISION
                     cclDouble, &
#endif
#ifdef SINGLE_PRECISION
                     cclFloat, &
#endif
#endif /* REALCASE */
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION
                     cclDouble, &
#endif
#ifdef SINGLE_PRECISION
                     cclFloat, &
#endif
#endif /* COMPLEXCASE */
                     int(np_bc,kind=c_int), ccl_comm_cols, my_stream)

        if (.not.successGPU) then
          print *,"Error in ccl_bcast"
          stop 1
        endif

        successGPU = ccl_group_end()
        if (.not.successGPU) then
          print *,"Error in setting up ccl_group_end!"
          stop 1
        endif
#endif /* WITH_GPU_STREAMS */
      else ! useGPU
#endif /* USE_CCL_MULTIPLY */

#if defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY)
      if (useGPU) then
        ! copy data to host for Bcast
        num = l_rows*nblk*size_of_datatype
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        call gpu_memcpy_async_and_stream_synchronize &
             ("elpa_mult_at_b: aux_bc_dev -> aux_bc", aux_bc_dev, 0_c_intptr_t, &
              aux_bc(1:l_rows*nblk), &
              1, num, gpuMemcpyDeviceToHost, my_stream, .false., .true., .false.)
#else
        num = l_rows*nblk*size_of_datatype
        successGPU = gpu_memcpy(int(loc(aux_bc),kind=c_intptr_t), aux_bc_dev, num,&
                     gpuMemcpyDeviceToHost)
        check_memcpy_gpu("elpa_mult_at_b: aux_bc_dev -> aux_bc", successGPU)
#endif

      endif
#endif /* defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY) */

      ! Broadcast block column
      call obj%timer%start("mpi_communication")
#if REALCASE == 1
      call mpi_bcast(aux_bc, int(n_aux_bc,kind=MPI_KIND), &
                     MPI_REAL_PRECISION, &
                     int(np_bc,kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
#endif
#if COMPLEXCASE == 1
      call mpi_bcast(aux_bc, int(n_aux_bc,kind=MPI_KIND), &
                     MPI_COMPLEX_PRECISION, &
                     int(np_bc,kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
#endif
      call obj%timer%stop("mpi_communication")

#if defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY)
      if (useGPU) then
        ! copy data back to device
        num = l_rows*nblk*size_of_datatype
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        call gpu_memcpy_async_and_stream_synchronize &
             ("elpa_mult_at_b: aux_bc -> aux_bc_dev", aux_bc_dev, 0_c_intptr_t, &
              aux_bc(1:l_rows*nblk), &
              1, num, gpuMemcpyHostToDevice, my_stream, .false., .true., .false.)
#else
        successGPU = gpu_memcpy(aux_bc_dev, int(loc(aux_bc),kind=c_intptr_t), num,&
                     gpuMemcpyHostToDevice)
        check_memcpy_gpu("elpa_mult_at_b: aux_bc -> aux_bc_dev", successGPU)
#endif

      endif ! useGPU
#endif /* defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY) */

#ifdef USE_CCL_MULTIPLY
      endif ! useGPU
#endif /* USE_CCL_MULTIPLY */

#else /* WITH_MPI */

#endif /* WITH_MPI */
      ! Insert what we got in aux_mat


#ifdef MORE_GPU
      if (useGPU) then
        n_aux_bc = 0
        my_stream = obj%gpu_setup%my_stream
        do n = 1, min(l_rows_np-nb*nblk,nblk)
          nstor = nstor+1
          lrs = lrs_save(n)
          lre = lre_save(n)
          if (lrs <= lre) then
            nvals = lre-lrs+1
            call gpu_copy_PRECISION_aux_bc_aux_mat(aux_bc_dev, aux_mat_dev, lrs, lre, nstor, n_aux_bc, &
                                                   nvals, l_rows, nblk, nblk_mult, my_stream)

            n_aux_bc = n_aux_bc + nvals
          endif
        enddo
      else ! useGPU
#endif /* MORE_GPU */

      n_aux_bc = 0
      do n = 1, min(l_rows_np-nb*nblk,nblk)
        nstor = nstor+1
        lrs = lrs_save(n)
        lre = lre_save(n)
        if (lrs<=lre) then
          nvals = lre-lrs+1
          aux_mat(lrs:lre,nstor) = aux_bc(n_aux_bc+1:n_aux_bc+nvals)
          n_aux_bc = n_aux_bc + nvals
        endif
      enddo

#ifdef MORE_GPU
      endif ! useGPU
#endif /* MORE_GPU */

      ! If we got nblk_mult columns in aux_mat or this is the last block
      ! do the matrix multiplication

      if (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np) then

        lrs = 1      ! 1st local row number for multiply
        lre = l_rows ! last local row number for multiply
        if (a_lower) lrs = local_index(gcol_min, my_prow, np_rows, nblk, +1)
        if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1)

        lcs = 1      ! 1st local col number for multiply
        lce = l_cols ! last local col number for multiply
        if (c_upper) lcs = local_index(gcol_min, my_pcol, np_cols, nblk, +1)
        if (c_lower) lce = min(local_index(gcol, my_pcol, np_cols, nblk, -1),l_cols)

        if (lcs <= lce) then
#if defined(MORE_GPU) && defined(USE_CCL_MULTIPLY)
          if (.not.useGPU) then
#endif
            ! introduce 1-based indexing
            allocate(tmp1(nstor,1:lce-lcs+1), tmp2(nstor,1:lce-lcs+1), stat=istat, errmsg=errorMessage)
            call check_alloc("elpa_mult_at_b_&
                 &MATH_DATATYPE ", "tmp1", istat, errorMessage)
#if defined(MORE_GPU) && defined(USE_CCL_MULTIPLY)
          endif
#endif

          if (lrs <= lre) then
            if (useGPU) then
#ifndef MORE_GPU
              num = l_rows*nblk_mult*size_of_datatype
#ifdef WITH_GPU_STREAMS
              my_stream = obj%gpu_setup%my_stream
              call gpu_memcpy_async_and_stream_synchronize &
                   ("elpa_mult_at_b: aux_mat to aux_mat_dev", aux_mat_dev, 0_c_intptr_t, &
                    aux_mat(1:l_rows,1:nblk_mult), &
                    1, 1, num, gpuMemcpyHostToDevice, my_stream, .false., .true., .false.)
#else
              successGPU = gpu_memcpy(aux_mat_dev, int(loc(aux_mat),kind=c_intptr_t), &
                           num, gpuMemcpyHostToDevice)
              check_memcpy_gpu("elpa_mult_at_b: aux_mat to aux_mat_dev", successGPU)
#endif

#endif /* MORE_GPU */
              aux_off = (lrs-1)*size_of_datatype
              b_off = ((lcs-1)*ldb+lrs-1)*size_of_datatype
              call obj%timer%start("gpublas")
              gpuHandle = obj%gpu_setup%gpublasHandleArray(0)
              call gpublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', nstor, lce-lcs+1, &
                   lre-lrs+1, one, aux_mat_dev+aux_off, l_rows, b_dev+b_off, ldb, zero, &
                   tmp1_dev, nstor, gpuHandle)
              call obj%timer%stop("gpublas")

#ifndef MORE_GPU
              num = nstor*(lce-lcs+1)*size_of_datatype
#ifdef WITH_GPU_STREAMS
              my_stream = obj%gpu_setup%my_stream
              successGPU = gpu_stream_synchronize(my_stream)
              check_stream_synchronize_gpu("elpa_mult_at_b: tmp1_dev to tmp1", successGPU)

              successGPU = gpu_memcpy_async(int(loc(tmp1),kind=c_intptr_t), &
                           tmp1_dev, num, gpuMemcpyDeviceToHost, my_stream)
              check_memcpy_gpu("elpa_mult_at_b: tmp1_dev to tmp1", successGPU)

              my_stream = obj%gpu_setup%my_stream
              successGPU = gpu_stream_synchronize(my_stream)
              check_stream_synchronize_gpu("elpa_mult_at_b: tmp1_dev to tmp1", successGPU)
              ! synchronize streamsPerThread; maybe not necessary
              successGPU = gpu_stream_synchronize()
              check_stream_synchronize_gpu("elpa_mult_at_b: tmp1_dev to tmp1", successGPU)
#else
              successGPU = gpu_memcpy(int(loc(tmp1),kind=c_intptr_t), &
                           tmp1_dev, num, gpuMemcpyDeviceToHost)
              check_memcpy_gpu("elpa_mult_at_b: tmp1_dev to tmp1", successGPU)
#endif
#endif /* MORE_GPU */


              num = nstor*(lce-lcs+1)*size_of_datatype
            else ! useGPU
              call obj%timer%start("blas")
              call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', int(nstor,kind=BLAS_KIND), &
                   int(lce-lcs+1,kind=BLAS_KIND), int(lre-lrs+1,kind=BLAS_KIND), &
                   one, aux_mat(lrs:lre,1:nstor), int(lre-lrs+1,kind=BLAS_KIND), &
                   b(lrs,lcs), int(ldb,kind=BLAS_KIND), zero, tmp1, &
                   int(nstor,kind=BLAS_KIND))
              call obj%timer%stop("blas")
            endif ! useGPU
          else ! (lrs <= lre)
#ifdef MORE_GPU
            if (useGPU) then
              num = nstor*(lce-lcs+1)*size_of_datatype
#ifdef WITH_GPU_STREAMS
              my_stream = obj%gpu_setup%my_stream
              successGPU = gpu_memset_async(tmp1_dev, 0, num, my_stream)
              check_memcpy_gpu("hermitian_multiply: tmp1_dev", successGPU)
#else
              successGPU = gpu_memset(tmp1_dev, 0, num)
              check_memcpy_gpu("hermitian_multiply: tmp1_dev", successGPU)
#endif
            else ! useGPU
              tmp1 = 0
            endif ! useGPU
#else /* MORE_GPU */
            tmp1 = 0
#endif /* MORE_GPU */
          endif ! (lrs <= lre)

          ! Sum up the results and send to processor row np

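          ! The partial products tmp1 computed on each process row are summed
          ! into tmp2 by a reduction (CCL or MPI below) whose root is process
          ! row np, the owner of the corresponding rows of C.
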
          if (useGPU) then
#ifdef WITH_MPI

#ifdef USE_CCL_MULTIPLY
#ifdef WITH_GPU_STREAMS
            my_stream = obj%gpu_setup%my_stream
            ccl_comm_rows = obj%gpu_setup%ccl_comm_rows

            successGPU = ccl_group_start()
            if (.not.successGPU) then
              print *,"Error in setting up ccl_group_start!"
              stop 1
            endif

            successGPU = ccl_reduce(tmp1_dev, tmp2_dev, &
#if REALCASE == 1
                         int(nstor*(lce-lcs+1),kind=c_size_t), &
#endif
#if COMPLEXCASE == 1
                         int(2*nstor*(lce-lcs+1),kind=c_size_t), &
#endif
#if REALCASE == 1
#ifdef DOUBLE_PRECISION
                         cclDouble, &
#endif
#ifdef SINGLE_PRECISION
                         cclFloat, &
#endif
#endif /* REALCASE */
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION
                         cclDouble, &
#endif
#ifdef SINGLE_PRECISION
                         cclFloat, &
#endif
#endif /* COMPLEXCASE */
                         cclSum, int(np,kind=c_int), ccl_comm_rows, my_stream)


            if (.not.successGPU) then
              print *,"Error in ccl_reduce"
              stop 1
            endif

            successGPU = ccl_group_end()
            if (.not.successGPU) then
              print *,"Error in setting up ccl_group_end!"
              stop 1
            endif
#endif /* WITH_GPU_STREAMS */

#endif /* USE_CCL_MULTIPLY */

#if defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY)
            ! copy data to host
            num = nstor*(lce-lcs+1)*size_of_datatype
#ifdef WITH_GPU_STREAMS
            call gpu_memcpy_async_and_stream_synchronize &
                 ("elpa_mult_at_b: tmp1_dev to tmp1", tmp1_dev, 0_c_intptr_t, &
                  !tmp1(1:nblk_mult,1:l_cols), &
                  tmp1(1:nstor,1:lce-lcs+1), &
                  1, 1, num, gpuMemcpyDeviceToHost, my_stream, .false., .true., .false.)
#else
            successGPU = gpu_memcpy(int(loc(tmp1),kind=c_intptr_t), &
                         tmp1_dev, num, gpuMemcpyDeviceToHost)
            check_memcpy_gpu("elpa_mult_at_b: tmp1_dev to tmp1", successGPU)
#endif
#endif /* defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY) */

#if !defined(USE_CCL_MULTIPLY)
            ! communication already done before with CCL
            call obj%timer%start("mpi_communication")
            call mpi_reduce(tmp1, tmp2, int(nstor*(lce-lcs+1),kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
                            mpi_sum, int(np,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), mpierr)
            call obj%timer%stop("mpi_communication")
#endif /* !defined(USE_CCL_MULTIPLY) */

#if defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY)
            ! copy data to device
            num = nstor*(lce-lcs+1)*size_of_datatype
#ifdef WITH_GPU_STREAMS
            call gpu_memcpy_async_and_stream_synchronize &
                 ("elpa_mult_at_b: tmp2 to tmp2_dev", tmp2_dev, 0_c_intptr_t, &
                  !tmp2(1:nblk_mult,1:l_cols), &
                  tmp2(1:nstor,1:lce-lcs+1), &
                  1, 1, num, gpuMemcpyHostToDevice, my_stream, .false., .true., .false.)
#else
            successGPU = gpu_memcpy(tmp2_dev, int(loc(tmp2),kind=c_intptr_t), &
                         num, gpuMemcpyHostToDevice)
            check_memcpy_gpu("elpa_mult_at_b: tmp2 to tmp2_dev", successGPU)
#endif
#endif /* defined(MORE_GPU) && !defined(USE_CCL_MULTIPLY) */

#else /* WITH_MPI */

#ifdef MORE_GPU
            num = nstor*(lce-lcs+1)*size_of_datatype
            successGPU = gpu_memcpy(tmp2_dev, tmp1_dev, &
                         num, gpuMemcpyDeviceToDevice)
            check_memcpy_gpu("elpa_mult_at_b: tmp2 to tmp2_dev", successGPU)
#else /* MORE_GPU */
            tmp2 = tmp1
#endif /* MORE_GPU */
#endif /* WITH_MPI */

          else ! useGPU
#ifdef WITH_MPI
            call obj%timer%start("mpi_communication")
            call mpi_reduce(tmp1, tmp2, int(nstor*(lce-lcs+1),kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
                            mpi_sum, int(np,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), mpierr)
            call obj%timer%stop("mpi_communication")
#endif
          endif ! useGPU

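          ! The reduced block (tmp2 with MPI, tmp1 without) is now written into
          ! rows nr_done+1 .. nr_done+nstor, columns lcs .. lce of C on process
          ! row np.
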
          if (useGPU) then
#ifdef MORE_GPU
            if (my_prow==np) call gpu_copy_PRECISION_tmp2_c(tmp2_dev, c_dev, nr_done, nstor, lcs, lce, ldc, ldcCols, my_stream)
#else /* MORE_GPU */
#ifdef WITH_MPI
            ! Put the result into C
#ifndef DEVICE_POINTER
            if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,1:lce-lcs+1)
#else
            if (my_prow==np) c_tmp(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,1:lce-lcs+1)
#endif
#else /* WITH_MPI */
            ! Put the result into C
#ifndef DEVICE_POINTER
            if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,1:lce-lcs+1)
#else
            if (my_prow==np) c_tmp(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,1:lce-lcs+1)
#endif
            !tmp2(:,:) = 0.
#endif /* WITH_MPI */

#endif /* MORE_GPU */
          else ! useGPU
#ifdef WITH_MPI
            ! Put the result into C
            if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,1:lce-lcs+1)
#else /* WITH_MPI */
            ! Put the result into C
            if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,1:lce-lcs+1)
            !tmp2(:,:) = 0.
#endif /* WITH_MPI */
          endif ! useGPU
#if defined(MORE_GPU) && defined(USE_CCL_MULTIPLY)
          if (.not.useGPU) then
#endif
            deallocate(tmp1, tmp2, stat=istat, errmsg=errorMessage)
            call check_alloc("elpa_mult_at_b_&
                 &MATH_DATATYPE ", "tmp1", istat, errorMessage)
#if defined(MORE_GPU) && defined(USE_CCL_MULTIPLY)
          endif
#endif
        endif ! (lcs <= lce)

        nr_done = nr_done+nstor
        nstor = 0
        if (useGPU) then
          num = l_rows*nblk_mult*size_of_datatype
#ifdef WITH_GPU_STREAMS
          my_stream = obj%gpu_setup%my_stream
          successGPU = gpu_memset_async(aux_mat_dev, 0, num, my_stream)
          check_memcpy_gpu("hermitian_multiply: aux_mat_dev", successGPU)
#else
          successGPU = gpu_memset(aux_mat_dev, 0, num)
          check_memcpy_gpu("hermitian_multiply: aux_mat_dev", successGPU)
#endif

#ifndef MORE_GPU
          ! additionally set the HOST array to zero
          aux_mat(:,:) = 0
#endif
        else
          aux_mat(:,:) = 0
        endif
      endif ! (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np)

#ifdef WITH_NVTX
      call nvtxRangePop() ! do nb = 0, (l_rows_np-1)/nblk
#endif
    enddo ! nb = 0, (l_rows_np-1)/nblk

#ifdef WITH_NVTX
    call nvtxRangePop() ! do np = 0, np_rows-1
#endif
  enddo ! np = 0, np_rows-1

!#ifdef MORE_GPU
!  if (useGPU) then
!    deallocate(tmp2, stat=istat, errmsg=errorMessage)
!    check_deallocate("elpa_mult_at_b: tmp1, tmp2", istat, errorMessage)
!  endif
!#endif

  if (useGPU) then
#if !defined(DEVICE_POINTER)
    successGPU = gpu_free(b_dev)
    check_dealloc_gpu("elpa_multiply_a_b: b_dev", successGPU)
#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
    successGPU = gpu_host_unregister(int(loc(b),kind=c_intptr_t))
    check_host_unregister_gpu("elpa_multiply_a_b: b", successGPU)
#endif

#ifdef MORE_GPU
    ! copy result c_dev back to CPU
    num = ldc*ldcCols*size_of_datatype
#ifdef WITH_GPU_STREAMS
    check_stream_synchronize_gpu("elpa_mult_at_b: c_dev -> c", successGPU)
    call gpu_memcpy_async_and_stream_synchronize &
         ("elpa_mult_at_b: c_dev to c", c_dev, 0_c_intptr_t, &
          c(1:ldc,1:ldcCols), &
          1, 1, num, gpuMemcpyDeviceToHost, my_stream, .false., .true., .false.)
#else
    successGPU = gpu_memcpy(int(loc(c),kind=c_intptr_t), c_dev, num,&
                 gpuMemcpyDeviceToHost)
    check_memcpy_gpu("elpa_mult_at_b: c_dev -> c", successGPU)

#endif
    successGPU = gpu_free(c_dev)
    check_dealloc_gpu("elpa_multiply_a_b: c_dev", successGPU)

#else /* MORE_GPU */
    ! c is on the host, no copy back needed
#endif /* MORE_GPU */

#else /* DEVICE_POINTER */
    !successGPU = gpu_free(b_dev)
    !check_dealloc_gpu("elpa_multiply_a_b: b_dev", successGPU)

    !num = ldc*ldcCols*size_of_datatype
    !successGPU = gpu_memcpy(cDev, c_dev, num,&
    !             gpuMemcpyDeviceToDevice)
    !check_memcpy_gpu("elpa_mult_at_b: c_dev -> cDev", successGPU)

    !successGPU = gpu_free(c_dev)
    !check_dealloc_gpu("elpa_multiply_a_b: c_dev", successGPU)

#ifndef MORE_GPU
#ifdef WITH_GPU_STREAMS
    successGPU = gpu_host_unregister(int(loc(a_tmp),kind=c_intptr_t))
    check_host_unregister_gpu("elpa_mult_at_b: a_tmp", successGPU)
#endif
    deallocate(a_tmp, stat=istat, errmsg=errorMessage)
    check_deallocate("elpa_mult_at_b: a_tmp", istat, errorMessage)

    num = ldc*ldcCols*size_of_datatype
#ifdef WITH_GPU_STREAMS
    my_stream = obj%gpu_setup%my_stream
    successGPU = gpu_stream_synchronize(my_stream)
    check_stream_synchronize_gpu("elpa_mult_at_b: c_tmp to c", successGPU)

    successGPU = gpu_memcpy_async(cDev,int(loc(c_tmp),kind=c_intptr_t),num,&
                 gpuMemcpyHostToDevice, my_stream)
    check_memcpy_gpu("elpa_mult_at_b: c_tmp -> c", successGPU)

    my_stream = obj%gpu_setup%my_stream
    successGPU = gpu_stream_synchronize(my_stream)
    check_stream_synchronize_gpu("elpa_mult_at_b: c_tmp -> c", successGPU)
    ! synchronize streamsPerThread; maybe not necessary
    successGPU = gpu_stream_synchronize()
    check_stream_synchronize_gpu("elpa_mult_at_b: c_tmp -> c", successGPU)
#else
    successGPU = gpu_memcpy(cDev,int(loc(c_tmp),kind=c_intptr_t),num,&
                 gpuMemcpyHostToDevice)
    check_memcpy_gpu("elpa_mult_at_b: c_tmp -> c", successGPU)
#endif
#ifdef WITH_GPU_STREAMS
    successGPU = gpu_host_unregister(int(loc(c_tmp),kind=c_intptr_t))
    check_host_unregister_gpu("elpa_multiply_a_b: c_tmp", successGPU)
#endif

    deallocate(c_tmp, stat=istat, errmsg=errorMessage)
    check_deallocate("elpa_mult_at_b: c_tmp", istat, errorMessage)
#endif /* MORE_GPU */

#endif /* DEVICE_POINTER */
#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
    nullify(aux_mat)
    !nullify(tmp1)

    successGPU = gpu_free_host(aux_host)
    check_host_dealloc_gpu("elpa_multiply_a_b: aux_host", successGPU)

    !successGPU = gpu_free_host(tmp1_host)
    !check_host_dealloc_gpu("elpa_multiply_a_b: tmp1_host", successGPU)
#else
    deallocate(aux_mat, stat=istat, errmsg=errorMessage)
    check_deallocate("elpa_mult_at_b: aux_mat", istat, errorMessage)

    !deallocate(tmp1, stat=istat, errmsg=errorMessage)
    !check_deallocate("elpa_mult_at_b: tmp1", istat, errorMessage)
#endif

    successGPU = gpu_free(aux_mat_dev)
    check_dealloc_gpu("elpa_multiply_a_b: aux_mat_dev", successGPU)

    successGPU = gpu_free(tmp1_dev)
    check_dealloc_gpu("elpa_multiply_a_b: tmp1_dev", successGPU)

#ifdef MORE_GPU
    successGPU = gpu_free(tmp2_dev)
    check_dealloc_gpu("elpa_multiply_a_b: tmp2_dev", successGPU)

    successGPU = gpu_free(aux_bc_dev)
    check_dealloc_gpu("elpa_multiply_a_b: aux_bc_dev", successGPU)
#endif

#if !defined(DEVICE_POINTER)
#ifdef MORE_GPU
    successGPU = gpu_free(a_dev)
    check_dealloc_gpu("elpa_mult_at_b: a_dev", successGPU)
#endif
#else
    !successGPU = gpu_free(a_dev)
    !check_dealloc_gpu("elpa_mult_at_b: a_dev", successGPU)
#endif
  else ! useGPU
    deallocate(aux_mat, stat=istat, errmsg=errorMessage)
    check_deallocate("elpa_mult_at_b: aux_mat", istat, errorMessage)
  endif ! useGPU

  deallocate(aux_bc, lrs_save, lre_save, stat=istat, errmsg=errorMessage)
  check_deallocate("elpa_mult_at_b: aux_bc, lrs_save, lre_save", istat, errorMessage)

  call obj%timer%stop("elpa_mult_at_b_&
  &MATH_DATATYPE&
  &_&
  &PRECISION&
  &"//gpuString)