Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2025.01.002
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
elpa_hermitian_multiply_template.F90
Go to the documentation of this file.
1!
2! The ELPA library was originally created by the ELPA consortium,
3! consisting of the following organizations:
4!
5! - Max Planck Computing and Data Facility (MPCDF), formerly known as
6! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
7! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
8! Informatik,
9! - Technische Universität München, Lehrstuhl für Informatik mit
10! Schwerpunkt Wissenschaftliches Rechnen ,
11! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
12! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
13! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
14! and
15! - IBM Deutschland GmbH
16!
17! This particular source code file contains additions, changes and
18! enhancements authored by Intel Corporation which is not part of
19! the ELPA consortium.
20!
21! More information can be found here:
22! http://elpa.mpcdf.mpg.de/
23!
24! ELPA is free software: you can redistribute it and/or modify
25! it under the terms of the version 3 of the license of the
26! GNU Lesser General Public License as published by the Free
27! Software Foundation.
28!
29! ELPA is distributed in the hope that it will be useful,
30! but WITHOUT ANY WARRANTY; without even the implied warranty of
31! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
32! GNU Lesser General Public License for more details.
33!
34! You should have received a copy of the GNU Lesser General Public License
35! along with ELPA. If not, see <http://www.gnu.org/licenses/>
36!
37! ELPA reflects a substantial effort on the part of the original
38! ELPA consortium, and we ask you to respect the spirit of the
39! license that we chose: i.e., please contribute any changes you
40! may have back to the original ELPA library distribution, and keep
41! any derivatives of ELPA under the same license that we chose for
42! the original distribution, the GNU Lesser General Public License.
43!
44!
45! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
46!
47! Copyright of the original code rests with the authors inside the ELPA
48! consortium. The copyright of any additional modifications shall rest
49! with their original authors, but shall adhere to the licensing terms
50! distributed along with the original code in the file "COPYING".
51!
52! Author: A. Marek, MPCDF
53
54#include "../general/sanity.F90"
55#include "../general/error_checking.inc"
56
57#undef USE_CCL_HERMITIAN_MULTIPLY
58#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
59#define USE_CCL_HERMITIAN_MULTIPLY
60#endif
61
62 use elpa1_compute
63 use elpa_mpi
64 use precision
66 use, intrinsic :: iso_c_binding
67 use elpa_gpu
68 use mod_check_for_gpu
69 use elpa_blas_interfaces
70 use elpa_utilities, only : local_index, greatest_common_divisor, check_deallocate_f, check_dealloc_gpu_f, &
71 check_host_dealloc_gpu_f, check_alloc_gpu_f, check_host_alloc_gpu_f, &
72 check_host_unregister_gpu_f, check_memcpy_gpu_f, check_allocate_f, &
73 check_host_register_gpu_f, check_alloc, error_unit
74 use mod_query_gpu_usage
75#ifdef WITH_GPU_STREAMS
76 use elpa_gpu_util
77#endif
78#if defined(WITH_NVIDIA_GPU_VERSION) && defined(WITH_NVTX)
79 use cuda_functions ! for NVTX labels
80#elif defined(WITH_AMD_GPU_VERSION) && defined(WITH_ROCTX)
81 use hip_functions ! for ROCTX labels
82#endif
83#if defined(USE_CCL_HERMITIAN_MULTIPLY)
84 use elpa_ccl_gpu
85#endif
86 use multiply_a_b_gpu
87 implicit none
88
89#include "../../src/general/precision_kinds.F90"
90 class(elpa_abstract_impl_t), intent(inout) :: obj
91
92 character*1 :: uplo_a, uplo_c, trans_a, trans_b
93
94 integer(kind=ik), intent(in) :: ldb, ldbCols, ldc, ldcCols
95 integer(kind=ik) :: na, ncb
96#ifndef DEVICE_POINTER
97#ifdef USE_ASSUMED_SIZE
98 math_datatype(kind=rck) :: a(obj%local_nrows,*), b(ldb,*), c(ldc,*)
99#else
100 math_datatype(kind=rck) :: a(obj%local_nrows,obj%local_ncols), b(ldb,ldbcols), c(ldc,ldccols)
101#endif
102#else /* DEVICE_POINTER */
103 ! dummy variables
104 math_datatype(kind=rck), allocatable :: a(:,:), b(:,:), c(:,:)
105 type(c_ptr) :: aDev, bDev, cDev
106#endif /* DEVICE_POINTER */
107
108 integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, myid
109 integer(kind=MPI_KIND) :: my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
110 integer(kind=MPI_KIND) :: mpierr, myidMPI
111 integer(kind=ik) :: l_cols, l_rows, l_rows_np
112 integer(kind=ik) :: n
113 integer(kind=ik) :: np, nb, nblk_mult, lrs, lre, lcs, lce
114 integer(kind=ik) :: gcol_min, gcol, goff
115 integer(kind=ik) :: nstor, nr_done, noff, np_bc, n_aux_bc, nvals
116 integer(kind=ik), allocatable :: lrs_save(:), lre_save(:)
117
118 logical :: a_lower, a_upper, c_lower, c_upper
119 math_datatype(kind=rck) :: beta
120 math_datatype(kind=rck), pointer, contiguous :: aux_mat(:,:), tmp1(:,:)
121 math_datatype(kind=rck), allocatable :: aux_bc(:), tmp2(:,:)
122 logical :: wantDebug
123 integer(kind=ik) :: istat, debug
124 character(200) :: errorMessage
125 character(20) :: gpuString
126 logical :: success, successGPU, successGPU2
127 logical :: useGPU
128 integer(kind=c_int) :: numGPU, blocking
129 integer(kind=ik) :: mpi_comm_rows, mpi_comm_cols, mpi_comm_all
130 integer(kind=ik) :: nblk, matrixRows, matrixCols, error
131 integer(kind=c_intptr_t) :: aux_bc_dev, aux_mat_dev, tmp1_dev, tmp2_dev
132
133 integer(kind=c_intptr_t) :: a_dev
134 integer(kind=c_intptr_t) :: b_dev
135 integer(kind=c_intptr_t) :: c_dev
136
137 type(c_ptr) :: aux_host
138 integer(kind=c_intptr_t) :: num
139 integer(kind=c_intptr_t) :: aux_off, b_off
140 integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
141 &precision&
142 &_&
143 &math_datatype
144
145 integer(kind=c_intptr_t) :: gpuHandle, my_stream
146 integer(kind=c_int) :: gpu_hermitian_multiply
147
148 logical :: useCCL
149#if defined(USE_CCL_HERMITIAN_MULTIPLY)
150 integer(kind=c_intptr_t) :: ccl_comm_rows, ccl_comm_cols
151 integer(kind=c_int) :: cclDataType
152 integer(kind=ik) :: k_datatype
153#endif
154
155 integer(kind=c_intptr_t) :: aux_dev
156 integer(kind=c_int) :: gpu
157
158#ifdef WITH_NVTX
159 call nvtxrangepush("hermitian_multiply")
160#endif
161
162 success = .true.
163 usegpu = .false.
164
165 call obj%get("debug", debug, error)
166 if (error .ne. elpa_ok) then
167 write(error_unit,*) "elpa_hermitian_multiply: Problem getting option for debug settings. Aborting..."
168 success = .false.
169 return
170 endif
171 if (debug == 1) then
172 wantdebug = .true.
173 else
174 wantdebug = .false.
175 endif
176
177
178#if !defined(DEVICE_POINTER)
179
180#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
181 if (.not.(query_gpu_usage(obj, "ELPA_MULITPLY_AB", usegpu))) then
182 print *,"ELPA_MULITPLY_AB: Problem querrying settings for GPU Aborting..."
183 stop 1
184 endif
185#endif
186
187 ! check whether the above setting should be overriden
188 if (obj%is_set("gpu_hermitian_multiply") == 1) then
189 call obj%get("gpu_hermitian_multiply", gpu_hermitian_multiply, error)
190 if (error .ne. elpa_ok) then
191 print *,"Problem getting option for gpu_hermitian_mutltiply. Aborting..."
192 stop 1
193 endif
194 if (usegpu .and. gpu_hermitian_multiply .eq. 0) then
195 usegpu = .false.
196 else if (.not.(usegpu) .and. gpu_hermitian_multiply .eq. 1) then
197 usegpu = .true.
198 else
199 endif
200 else
201 ! no override by user
202 ! keep seeting as found before
203 endif
204
205#else /* DEVICE_POINTER */
206
207 usegpu = .true.
208
209 a_dev = transfer(adev, a_dev)
210 b_dev = transfer(bdev, b_dev)
211 c_dev = transfer(cdev, c_dev)
212
213#endif /* DEVICE_POINTER */
214
215 if(usegpu) then
216 gpustring = "_gpu"
217 else
218 gpustring = ""
219 endif
220
221 call obj%timer%start("elpa_hermitian_multiply_&
222 &MATH_DATATYPE&
223 &_&
224 &PRECISION&
225 &"//gpustring)
226
227 na = obj%na
228 nblk = obj%nblk
229 matrixrows = obj%local_nrows
230 matrixcols = obj%local_ncols
231
232 mpi_comm_all = obj%mpi_setup%mpi_comm_parent
233 mpi_comm_cols = obj%mpi_setup%mpi_comm_cols
234 mpi_comm_rows = obj%mpi_setup%mpi_comm_rows
235
236 myid = obj%mpi_setup%myRank_comm_parent
237 my_prow = obj%mpi_setup%myRank_comm_rows
238 my_pcol = obj%mpi_setup%myRank_comm_cols
239
240 np_rows = obj%mpi_setup%nRanks_comm_rows
241 np_cols = obj%mpi_setup%nRanks_comm_cols
242
243 l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and b
244 l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1) ! Local cols of b
245
246 ! Block factor for matrix multiplications, must be a multiple of nblk
247
248 if (obj%is_set("blocking_in_multiply") == 1) then
249 call obj%get("blocking_in_multiply", blocking, error)
250 if (error .ne. elpa_ok) then
251 write(error_unit,*) "elpa_hermitian_multiply: Problem in getting keyword 'blocking_in_multiply'. Aborting..."
252 stop 1
253 endif
254 nblk_mult = (blocking/nblk+1) * nblk
255 else ! is_set
256 if (usegpu) then
257 if (na/np_rows <= 256) then
258 nblk_mult = (63/nblk+1)*nblk
259 else
260 nblk_mult = (351/nblk+1)*nblk
261 endif
262 else ! useGPU
263 if (na/np_rows <= 256) then
264 nblk_mult = (31/nblk+1)*nblk
265 else
266 nblk_mult = (63/nblk+1)*nblk
267 endif
268 endif ! useGPU
269 endif ! is_set
270
271 useccl = .false.
272 if (usegpu) then
273 call obj%timer%start("check_for_gpu")
274 if (check_for_gpu(obj, myid, numgpu)) then
275 ! set the neccessary parameters
276 call set_gpu_parameters()
277 else
278 print *,"GPUs are requested but not detected! Aborting..."
279 success = .false.
280 return
281 endif
282 call obj%timer%stop("check_for_gpu")
283
284#if defined(USE_CCL_HERMITIAN_MULTIPLY)
285 useccl = .true.
286
287 ccl_comm_rows = obj%gpu_setup%ccl_comm_rows
288 ccl_comm_cols = obj%gpu_setup%ccl_comm_cols
289
290#if REALCASE == 1 && defined(DOUBLE_PRECISION)
291 ccldatatype = ccldouble
292 k_datatype = 1
293#elif REALCASE == 1 && defined(SINGLE_PRECISION)
294 ccldatatype = cclfloat
295 k_datatype = 1
296#elif COMPLEXCASE == 1 && defined(DOUBLE_PRECISION)
297 ccldatatype = ccldouble
298 k_datatype = 2
299#elif COMPLEXCASE == 1 && defined(SINGLE_PRECISION)
300 ccldatatype = cclfloat
301 k_datatype = 2
302#endif
303#endif /* defined(USE_CCL_HERMITIAN_MULTIPLY) */
304
305#if !defined(DEVICE_POINTER)
306 num = ldc*ldccols*size_of_datatype
307 successgpu = gpu_malloc(c_dev, num)
308 check_alloc_gpu("elpa_hermitian_multiply: c_dev", successgpu)
309 ! no copy from c to c_dev needed since c will be overwritten anyway
310#endif
311
312#if !defined(DEVICE_POINTER)
313 ! copy b to b_dev
314 num = ldb*ldbcols*size_of_datatype
315 successgpu = gpu_malloc(b_dev, num)
316 check_alloc_gpu("elpa_hermitian_multiply: b_dev", successgpu)
317
318#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
319 successgpu = gpu_host_register(int(loc(b),kind=c_intptr_t),num,&
320 gpuhostregisterdefault)
321#endif
322
323 check_host_register_gpu("elpa_hermitian_multiply: b", successgpu)
324#ifdef WITH_GPU_STREAMS
325 my_stream = obj%gpu_setup%my_stream
326 call gpu_memcpy_async_and_stream_synchronize &
327 ("elpa_hermitian_multiply: b to b_dev", b_dev, 0_c_intptr_t, &
328 b(1:ldb,1:ldbcols), &
329 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
330#else
331 successgpu = gpu_memcpy(b_dev,int(loc(b),kind=c_intptr_t),num,&
332 gpumemcpyhosttodevice)
333 check_memcpy_gpu("elpa_hermitian_multiply: b to b_dev", successgpu)
334#endif
335
336#else /* DEVICE_POINTER */
337
338#endif /* DEVICE_POINTER */
339
340 num = l_rows*nblk_mult*size_of_datatype
341#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
342 successgpu = gpu_malloc_host(aux_host, num) ! aux_host is needed, because pinning host memory can be done only for 1D arrays
343 check_host_alloc_gpu("elpa_hermitian_multiply: aux_host", successgpu)
344 call c_f_pointer(aux_host, aux_mat, (/l_rows,nblk_mult/))
345#else
346 allocate(aux_mat(l_rows, nblk_mult), stat=istat, errmsg=errormessage)
347 check_allocate("elpa_hermitian_multiply: aux_mat", istat, errormessage)
348#endif
349
350 successgpu = gpu_malloc(aux_mat_dev, num)
351 check_alloc_gpu("elpa_hermitian_multiply: aux_mat_dev", successgpu)
352
353 num = nblk_mult*l_cols*size_of_datatype
354 successgpu = gpu_malloc(tmp1_dev, num)
355 check_alloc_gpu("elpa_hermitian_multiply: tmp1_dev", successgpu)
356
357 num = nblk_mult*l_cols*size_of_datatype
358 successgpu = gpu_malloc(tmp2_dev, num)
359 check_alloc_gpu("elpa_hermitian_multiply: tmp2_dev", successgpu)
360
361 else ! useGPU
362 allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errormessage)
363 check_allocate("elpa_hermitian_multiply: aux_mat", istat, errormessage)
364 endif ! useGPU
365
366 allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errormessage)
367 check_allocate("elpa_hermitian_multiply: aux_bc", istat, errormessage)
368
369 allocate(lrs_save(nblk), stat=istat, errmsg=errormessage)
370 check_allocate("elpa_hermitian_multiply: lrs_save", istat, errormessage)
371
372 allocate(lre_save(nblk), stat=istat, errmsg=errormessage)
373 check_allocate("elpa_hermitian_multiply: lre_save", istat, errormessage)
374
375 a_lower = .false.
376 a_upper = .false.
377 c_lower = .false.
378 c_upper = .false.
379
380 if (uplo_a=='u' .or. uplo_a=='U') a_upper = .true.
381 if (uplo_a=='l' .or. uplo_a=='L') a_lower = .true.
382 if (uplo_c=='u' .or. uplo_c=='U') c_upper = .true.
383 if (uplo_c=='l' .or. uplo_c=='L') c_lower = .true.
384
385 if (usegpu) then
386
387#if !defined(DEVICE_POINTER)
388 num = obj%local_nrows*obj%local_ncols*size_of_datatype
389 successgpu = gpu_malloc(a_dev, num)
390 check_alloc_gpu("elpa_hermitian_multiply: a_dev", successgpu)
391#endif
392
393 num = l_rows*nblk*size_of_datatype
394 successgpu = gpu_malloc(aux_bc_dev, num)
395 check_alloc_gpu("elpa_hermitian_multiply: aux_bc_dev", successgpu)
396
397 num = obj%local_nrows*obj%local_ncols*size_of_datatype
398#if !defined(DEVICE_POINTER)
399
400#ifdef WITH_GPU_STREAMS
401 my_stream = obj%gpu_setup%my_stream
402 call gpu_memcpy_async_and_stream_synchronize &
403 ("elpa_hermitian_multiply: a to a_dev", a_dev, 0_c_intptr_t, &
404 a(1:obj%local_nrows,1:obj%local_ncols), &
405 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
406#else
407 successgpu = gpu_memcpy(a_dev, int(loc(a),kind=c_intptr_t), &
408 num, gpumemcpyhosttodevice)
409 check_memcpy_gpu("elpa_hermitian_multiply: a to a_dev", successgpu)
410#endif
411#endif /* DEVICE_POINTER */
412 endif !useGPU
413
414! _________________________________________________________________________________________________________________________________
415
416 ! main loop: build up the result matrix by processor rows
417 do np = 0, np_rows-1
418
419#ifdef WITH_NVTX
420 call nvtxrangepush("do np = 0, np_rows-1")
421#endif
422
423 ! In this turn, procs of row np assemble the result
424
425 l_rows_np = local_index(na, np, np_rows, nblk, -1) ! local rows on receiving processors
426
427 nr_done = 0 ! Number of rows done
428 nstor = 0 ! Number of columns stored in aux_mat
429
430 aux_mat = 0
431 if (usegpu) then
432 num = l_rows*nblk_mult*size_of_datatype
433#ifdef WITH_GPU_STREAMS
434 my_stream = obj%gpu_setup%my_stream
435 successgpu = gpu_memset_async(aux_mat_dev, 0, num, my_stream)
436 check_memcpy_gpu("multiply: aux_mat_dev", successgpu)
437#else
438 successgpu = gpu_memset(aux_mat_dev, 0, num)
439 check_memcpy_gpu("multiply: aux_mat_dev", successgpu)
440#endif
441 endif ! useGPU
442
443 ! Loop over the blocks on row np; nb is the 0-based local index of the block
444 do nb = 0, (l_rows_np-1)/nblk
445
446#ifdef WITH_NVTX
447 call nvtxrangepush("do nb = 0, (l_rows_np-1)/nblk")
448#endif
449
450 goff = nb*np_rows + np ! offset in the global grid of blocks
451
452 ! Get the processor column which owns this block
453 ! and the offset in blocks within this column.
454 ! The corresponding block column in A is then broadcast to all for multiplication with B
455
456 np_bc = mod(goff, np_cols) ! np, that posesses the given column of blocks; trans_a='T'; "bc"=block column; rename: np_bc -> np_col_b / np_col_curr
457
458 noff = goff/np_cols ! offset in the local grid of blocks
459
460 ! Gather up the complete column/row of blocks of A (for T/N case) on the owner in contigous memory of aux_bc array
461 n_aux_bc = 0
462 ! if not for upper/lower cases: aux_bc_2D(1:l_rows,1:l_cols) = a(1:l_rows,1:l_cols)
463 do n = 1, min(nblk, l_rows_np-nb*nblk) ! Loop over local columns for to be broadcast
464
465 gcol = goff*nblk + n ! global column corresponding to n, needed only for a_lower and a_upper cases
466
467 if (nstor==0 .and. n==1) gcol_min = gcol
468
469 lrs = 1 ! 1st (start) local row number for broadcast
470 lre = l_rows ! last (end) local row number for broadcast
471 if (a_lower) lrs = local_index(gcol, my_prow, np_rows, nblk, +1)
472 if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1)
473
474 if (lrs <= lre) then
475 nvals = lre-lrs+1
476 if (usegpu) then
477 if (my_pcol == np_bc) call gpu_copy_precision_a_aux_bc(a_dev, aux_bc_dev, n_aux_bc, nvals, lrs, lre, noff, &
478 nblk, n, l_rows, obj%local_nrows, obj%local_ncols, my_stream)
479 else ! useGPU
480 if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n)
481 endif ! useGPU
482
483 n_aux_bc = n_aux_bc + nvals
484 endif ! (lrs <= lre)
485
486 lrs_save(n) = lrs
487 lre_save(n) = lre
488
489 enddo ! n = 1, min(nblk, l_rows_np-nb*nblk)
490
491#ifdef WITH_MPI
492 ! copy data to host for bcast, if needed
493 if (usegpu .and. .not. useccl) then
494 num = l_rows*nblk*size_of_datatype
495#ifdef WITH_GPU_STREAMS
496 my_stream = obj%gpu_setup%my_stream
497 call gpu_memcpy_async_and_stream_synchronize &
498 ("elpa_hermitian_multiply: aux_bc_dev -> aux_bc", aux_bc_dev, 0_c_intptr_t, aux_bc(1:l_rows*nblk), &
499 1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
500#else
501 successgpu = gpu_memcpy(int(loc(aux_bc),kind=c_intptr_t), aux_bc_dev, num, gpumemcpydevicetohost)
502 check_memcpy_gpu("elpa_hermitian_multiply: aux_bc_dev -> aux_bc", successgpu)
503#endif
504 endif ! useGPU .and. .not. useCCL
505
506 ! Broadcast block column
507 if (useccl) then
508#ifdef USE_CCL_HERMITIAN_MULTIPLY
509#ifdef WITH_NVTX
510 call nvtxrangepush("ccl_bcast aux_bc_dev")
511#endif
512 call obj%timer%start("ccl_bcast")
513
514 my_stream = obj%gpu_setup%my_stream
515 ccl_comm_cols = obj%gpu_setup%ccl_comm_cols
516
517 successgpu = ccl_bcast(aux_bc_dev, aux_bc_dev, int(k_datatype*n_aux_bc,kind=c_size_t), ccldatatype, &
518 int(np_bc,kind=c_int), ccl_comm_cols, my_stream)
519
520 if (.not. successgpu) then
521 print *,"Error in ccl_bcast"
522 stop 1
523 endif
524
525 successgpu = gpu_stream_synchronize(my_stream)
526 check_stream_synchronize_gpu("elpa_cholesky: ccl_bcast", successgpu)
527
528 call obj%timer%stop("ccl_bcast")
529#ifdef WITH_NVTX
530 call nvtxrangepop() ! ccl_bcast aux_bc_dev
531#endif
532#endif /* USE_CCL_HERMITIAN_MULTIPLY */
533 else ! useCCL
534 call obj%timer%start("mpi_communication")
535
536 call mpi_bcast(aux_bc, int(n_aux_bc,kind=mpi_kind), mpi_math_datatype_precision, &
537 int(np_bc,kind=mpi_kind), int(mpi_comm_cols,kind=mpi_kind), mpierr)
538
539 call obj%timer%stop("mpi_communication")
540 endif ! useCCL
541
542 ! copy data back to device, if needed
543 if (usegpu .and. .not. useccl) then
544 num = l_rows*nblk*size_of_datatype
545#ifdef WITH_GPU_STREAMS
546 my_stream = obj%gpu_setup%my_stream
547 call gpu_memcpy_async_and_stream_synchronize &
548 ("elpa_hermitian_multiply: aux_bc -> aux_bc_dev", aux_bc_dev, 0_c_intptr_t, aux_bc(1:l_rows*nblk), &
549 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
550#else
551 successgpu = gpu_memcpy(aux_bc_dev, int(loc(aux_bc),kind=c_intptr_t), num, gpumemcpyhosttodevice)
552 check_memcpy_gpu("elpa_hermitian_multiply: aux_bc -> aux_bc_dev", successgpu)
553#endif
554 endif ! useGPU .and. .not. useCCL
555#endif /* WITH_MPI */
556
557
558 ! Copy what we got in aux_mat
559 if (usegpu) then
560 n_aux_bc = 0
561 my_stream = obj%gpu_setup%my_stream
562 do n = 1, min(nblk, l_rows_np-nb*nblk)
563 nstor = nstor+1
564 lrs = lrs_save(n)
565 lre = lre_save(n)
566 if (lrs <= lre) then
567 nvals = lre-lrs+1
568 call gpu_copy_precision_aux_bc_aux_mat(aux_bc_dev, aux_mat_dev, lrs, lre, nstor, n_aux_bc, &
569 nvals, l_rows, nblk, nblk_mult, my_stream)
570
571 n_aux_bc = n_aux_bc + nvals
572 endif
573 enddo
574 else ! useGPU
575 n_aux_bc = 0
576 do n = 1, min(nblk, l_rows_np-nb*nblk)
577 nstor = nstor+1
578 lrs = lrs_save(n)
579 lre = lre_save(n)
580 if (lrs<=lre) then
581 nvals = lre-lrs+1
582 aux_mat(lrs:lre,nstor) = aux_bc(n_aux_bc+1:n_aux_bc+nvals)
583 n_aux_bc = n_aux_bc + nvals
584 endif
585 enddo
586 endif ! useGPU
587
588 ! If we got nblk_mult columns in aux_mat or this is the last block
589 ! do the matrix multiplication
590
591 if (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np) then
592
593 lrs = 1 ! 1st local row number for multiply
594 lre = l_rows ! last local row number for multiply
595 if (a_lower) lrs = local_index(gcol_min, my_prow, np_rows, nblk, +1)
596 if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1)
597
598 lcs = 1 ! 1st local col number for multiply
599 lce = l_cols ! last local col number for multiply
600 if (c_upper) lcs = local_index(gcol_min, my_pcol, np_cols, nblk, +1)
601 if (c_lower) lce = min(local_index(gcol, my_pcol, np_cols, nblk, -1),l_cols)
602
603 if (lcs <= lce) then
604 if (.not. useccl) then
605 ! introduce 1-based indexing
606 allocate(tmp1(nstor,1:lce-lcs+1), tmp2(nstor,1:lce-lcs+1), stat=istat, errmsg=errormessage)
607 call check_alloc("elpa_hermitian_multiply_&
608 &MATH_DATATYPE ", "tmp1", istat, errormessage)
609 endif
610
611 if (lrs <= lre) then
612 if (usegpu) then
613 aux_off = (lrs-1)*size_of_datatype
614 b_off = ((lcs-1)*ldb+lrs-1)*size_of_datatype
615
616#ifdef WITH_NVTX
617 call nvtxrangepush("gpublas")
618#endif
619 call obj%timer%start("gpublas")
620 gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
621 ! tmp1_dev = aux_mat_dev^{T/N} * b_dev
622 call gpublas_precision_gemm(blas_trans_or_conj, 'N', nstor, lce-lcs+1, lre-lrs+1, one, &
623 aux_mat_dev+aux_off, l_rows, &
624 b_dev+b_off, ldb, zero, &
625 tmp1_dev, nstor, gpuhandle)
626 if (wantdebug) successgpu = gpu_devicesynchronize()
627 call obj%timer%stop("gpublas")
628#ifdef WITH_NVTX
629 call nvtxrangepop() ! gpublas
630#endif
631 else ! useGPU
632 call obj%timer%start("blas")
633 ! tmp1 = aux_mat^{T/N} * b
634 call precision_gemm(blas_trans_or_conj, 'N', int(nstor,kind=blas_kind), &
635 int(lce-lcs+1,kind=blas_kind), int(lre-lrs+1,kind=blas_kind), one, &
636 aux_mat(lrs:lre,1:nstor), int(lre-lrs+1,kind=blas_kind), &
637 b(lrs,lcs), int(ldb,kind=blas_kind), zero, &
638 tmp1, int(nstor,kind=blas_kind))
639 call obj%timer%stop("blas")
640 endif ! useGPU
641 else ! (lrs <= lre)
642 if (usegpu) then
643 num = nstor*(lce-lcs+1)*size_of_datatype
644#ifdef WITH_GPU_STREAMS
645 my_stream = obj%gpu_setup%my_stream
646 successgpu = gpu_memset_async(tmp1_dev, 0, num, my_stream)
647 check_memcpy_gpu("multiply: tmp1_dev", successgpu)
648#else
649 successgpu = gpu_memset(tmp1_dev, 0, num)
650 check_memcpy_gpu("multiply: tmp1_dev", successgpu)
651#endif
652 else ! useGPU
653 tmp1 = 0
654 endif ! useGPU
655 endif ! (lrs <= lre)
656
657 ! Sum up the results and send to processor row np
658
659#ifdef WITH_MPI
660 ! copy data to host, if needed
661 if (usegpu .and. .not. useccl) then
662 num = nstor*(lce-lcs+1)*size_of_datatype
663#ifdef WITH_GPU_STREAMS
664 call gpu_memcpy_async_and_stream_synchronize &
665 ("elpa_hermitian_multiply: tmp1_dev to tmp1", tmp1_dev, 0_c_intptr_t, &
666 !tmp1(1:nblk_mult,1:l_cols), &
667 tmp1(1:nstor,1:lce-lcs+1), &
668 1, 1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
669#else
670 successgpu = gpu_memcpy(int(loc(tmp1),kind=c_intptr_t), &
671 tmp1_dev, num, gpumemcpydevicetohost)
672 check_memcpy_gpu("elpa_hermitian_multiply: tmp1_dev to tmp1", successgpu)
673#endif
674 endif ! useGPU .and. .not. useCCL
675
676 ! MPI/ccl Reduce
677 if (useccl) then
678#ifdef USE_CCL_HERMITIAN_MULTIPLY
679#ifdef WITH_NVTX
680 call nvtxrangepush("ccl_reduce tmp1_dev")
681#endif
682 call obj%timer%start("ccl_reduce")
683 my_stream = obj%gpu_setup%my_stream
684 ccl_comm_rows = obj%gpu_setup%ccl_comm_rows
685
686 successgpu = ccl_reduce(tmp1_dev, tmp2_dev, int(k_datatype*nstor*(lce-lcs+1),kind=c_size_t), ccldatatype, &
687 cclsum, int(np,kind=c_int), ccl_comm_rows, my_stream)
688
689 if (.not. successgpu) then
690 print *,"Error in ccl_reduce"
691 stop 1
692 endif
693
694 successgpu = gpu_stream_synchronize(my_stream)
695 check_stream_synchronize_gpu("elpa_cholesky: ccl_reduce", successgpu)
696
697 call obj%timer%stop("ccl_reduce")
698#ifdef WITH_NVTX
699 call nvtxrangepop() ! ccl_reduce tmp1_dev
700#endif
701#endif /* USE_CCL_HERMITIAN_MULTIPLY */
702 else ! useCCL
703 call obj%timer%start("mpi_communication")
704 call mpi_reduce(tmp1, tmp2, int(nstor*(lce-lcs+1),kind=mpi_kind), mpi_math_datatype_precision, &
705 mpi_sum, int(np,kind=mpi_kind), int(mpi_comm_rows,kind=mpi_kind), mpierr)
706 call obj%timer%stop("mpi_communication")
707 endif ! useCCL
708
709 ! copy data back to device, if needed
710 if (usegpu .and. .not. useccl) then
711 num = nstor*(lce-lcs+1)*size_of_datatype
712#ifdef WITH_GPU_STREAMS
713 call gpu_memcpy_async_and_stream_synchronize &
714 ("elpa_hermitian_multiply: tmp2 to tmp2_dev", tmp2_dev, 0_c_intptr_t, &
715 !tmp2(1:nblk_mult,1:l_cols), &
716 tmp2(1:nstor,1:lce-lcs+1), &
717 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .true., .false.)
718#else
719 successgpu = gpu_memcpy(tmp2_dev, int(loc(tmp2),kind=c_intptr_t), &
720 num, gpumemcpyhosttodevice)
721 check_memcpy_gpu("elpa_hermitian_multiply: tmp2 to tmp2_dev", successgpu)
722#endif
723 endif ! useGPU .and. .not. useCCL
724#else /* WITH_MPI */
725
726 if (usegpu) then
727 num = nstor*(lce-lcs+1)*size_of_datatype
728 successgpu = gpu_memcpy(tmp2_dev, tmp1_dev, num, gpumemcpydevicetodevice)
729 check_memcpy_gpu("elpa_hermitian_multiply: tmp2 to tmp2_dev", successgpu)
730 endif
731#endif /* WITH_MPI */
732
733
734 if (usegpu) then
735 if (my_prow==np) call gpu_copy_precision_tmp2_c(tmp2_dev, c_dev, nr_done, nstor, &
736 lcs, lce, ldc, ldccols, my_stream)
737 else ! useGPU
738#ifdef WITH_MPI
739 ! Put the result into C
740 if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,1:lce-lcs+1)
741#else /* WITH_MPI */
742 ! Put the result into C
743 if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,1:lce-lcs+1)
744 !tmp2(:,:) = 0.
745#endif /* WITH_MPI */
746 endif ! useGPU
747
748 if (.not. useccl) then
749 deallocate(tmp1, tmp2, stat=istat, errmsg=errormessage)
750 call check_alloc("elpa_hermitian_multiply_&
751 &MATH_DATATYPE ", "tmp1", istat, errormessage)
752 endif
753 endif ! (lcs <= lce)
754
755 nr_done = nr_done+nstor
756 nstor=0
757 if (usegpu) then
758 num = l_rows*nblk_mult*size_of_datatype
759#ifdef WITH_GPU_STREAMS
760 my_stream = obj%gpu_setup%my_stream
761 successgpu = gpu_memset_async(aux_mat_dev, 0, num, my_stream)
762 check_memcpy_gpu("multiply: aux_mat_dev", successgpu)
763#else
764 successgpu = gpu_memset(aux_mat_dev, 0, num)
765 check_memcpy_gpu("multiply: aux_mat_dev", successgpu)
766#endif
767 else ! useGPU
768 aux_mat(:,:) = 0
769 endif ! useGPU
770 endif ! (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np)
771
772#ifdef WITH_NVTX
773 call nvtxrangepop() ! do nb = 0, (l_rows_np-1)/nblk
774#endif
775 enddo ! nb = 0, (l_rows_np-1)/nblk
776
777#ifdef WITH_NVTX
778 call nvtxrangepop() ! do np = 0, np_rows-1
779#endif
780 enddo ! main loop: np = 0, np_rows-1
781
782!_______________________________________________
783
784 if (usegpu) then
785#if !defined(DEVICE_POINTER)
786 ! copy result c_dev back to CPU
787 num = ldc*ldccols
788#ifdef WITH_GPU_STREAMS
789 check_stream_synchronize_gpu("elpa_hermitian_multiply: c_dev -> c", successgpu)
790 call gpu_memcpy_async_and_stream_synchronize &
791 ("elpa_hermitian_multiply: c_dev to c", c_dev, 0_c_intptr_t, c(1:ldc,1:ldccols), &
792 1, 1, num*size_of_datatype, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
793#else
794 successgpu = gpu_memcpy(int(loc(c),kind=c_intptr_t), c_dev, num*size_of_datatype, gpumemcpydevicetohost)
795 check_memcpy_gpu("elpa_hermitian_multiply: c_dev -> c", successgpu)
796#endif
797#endif /* !defined(DEVICE_POINTER) */
798 endif ! useGPU
799
800
801!______________________________________________________________________________________________
802
803 if (usegpu) then
804#if !defined(DEVICE_POINTER)
805 successgpu = gpu_free(b_dev)
806 check_dealloc_gpu("elpa_hermitian_multiply: b_dev", successgpu)
807#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
808 successgpu = gpu_host_unregister(int(loc(b),kind=c_intptr_t))
809 check_host_unregister_gpu("elpa_hermitian_multiply: b", successgpu)
810#endif
811
812 successgpu = gpu_free(c_dev)
813 check_dealloc_gpu("elpa_hermitian_multiply: c_dev", successgpu)
814
815#else /* DEVICE_POINTER */
816
817#endif /* DEVICE_POINTER */
818
819#if !defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) && !defined(WITH_SYCL_GPU_VERSION)
820 nullify(aux_mat)
821 !nullify(tmp1)
822
823 successgpu = gpu_free_host(aux_host)
824 check_host_dealloc_gpu("elpa_hermitian_multiply: aux_host", successgpu)
825#else
826 deallocate(aux_mat, stat=istat, errmsg=errormessage)
827 check_deallocate("elpa_hermitian_multiply: aux_mat", istat, errormessage)
828
829 !deallocate(tmp1, stat=istat, errmsg=errorMessage)
830 !check_deallocate("elpa_hermitian_multiply: tmp1", istat, errorMessage)
831#endif
832
833 successgpu = gpu_free(aux_mat_dev)
834 check_dealloc_gpu("elpa_hermitian_multiply: aux_mat_dev", successgpu)
835
836 successgpu = gpu_free(tmp1_dev)
837 check_dealloc_gpu("elpa_hermitian_multiply: tmp1_dev", successgpu)
838
839 successgpu = gpu_free(tmp2_dev)
840 check_dealloc_gpu("elpa_hermitian_multiply: tmp2_dev", successgpu)
841
842 successgpu = gpu_free(aux_bc_dev)
843 check_dealloc_gpu("elpa_hermitian_multiply: aux_bc_dev", successgpu)
844
845#if !defined(DEVICE_POINTER)
846 successgpu = gpu_free(a_dev)
847 check_dealloc_gpu("elpa_hermitian_multiply: a_dev", successgpu)
848#else
849 !successGPU = gpu_free(a_dev)
850 !check_dealloc_gpu("elpa_hermitian_multiply: a_dev", successGPU)
851#endif
852
853 else ! useGPU
854 deallocate(aux_mat, stat=istat, errmsg=errormessage)
855 check_deallocate("elpa_hermitian_multiply: aux_mat", istat, errormessage)
856 endif ! useGPU
857
858 deallocate(aux_bc, lrs_save, lre_save, stat=istat, errmsg=errormessage)
859 check_deallocate("elpa_hermitian_multiply: aux_bc, lrs_save, lre_save", istat, errormessage)
860
861 call obj%timer%stop("elpa_hermitian_multiply_&
862 &MATH_DATATYPE&
863 &_&
864 &PRECISION&
865 &"//gpustring)
866
867#ifdef WITH_NVTX
868 call nvtxrangepop() ! multiply
869#endif
void set_gpu_parameters(int *gpuMemcpyHostToDevice, int *gpuMemcpyDeviceToHost)
Definition gpu_vendor_agnostic_layer.c:62
Fortran module to provide an abstract definition of the implementation. Do not use directly....
Definition elpa_abstract_impl.F90:50