Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2025.06.001.rc1
Loading...
Searching...
No Matches
merge_systems_template.F90
Go to the documentation of this file.
1#if 0
2! This file is part of ELPA.
3!
4! The ELPA library was originally created by the ELPA consortium,
5! consisting of the following organizations:
6!
7! - Max Planck Computing and Data Facility (MPCDF), formerly known as
8! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
9! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
10! Informatik,
11! - Technische Universität München, Lehrstuhl für Informatik mit
12! Schwerpunkt Wissenschaftliches Rechnen ,
13! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
15! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
16! and
17! - IBM Deutschland GmbH
18!
19! This particular source code file contains additions, changes and
20! enhancements authored by Intel Corporation which is not part of
21! the ELPA consortium.
22!
23! More information can be found here:
24! http://elpa.mpcdf.mpg.de/
25!
26! ELPA is free software: you can redistribute it and/or modify
27! it under the terms of the version 3 of the license of the
28! GNU Lesser General Public License as published by the Free
29! Software Foundation.
30!
31! ELPA is distributed in the hope that it will be useful,
32! but WITHOUT ANY WARRANTY; without even the implied warranty of
33! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34! GNU Lesser General Public License for more details.
35!
36! You should have received a copy of the GNU Lesser General Public License
37! along with ELPA. If not, see <http://www.gnu.org/licenses/>
38!
39! ELPA reflects a substantial effort on the part of the original
40! ELPA consortium, and we ask you to respect the spirit of the
41! license that we chose: i.e., please contribute any changes you
42! may have back to the original ELPA library distribution, and keep
43! any derivatives of ELPA under the same license that we chose for
44! the original distribution, the GNU Lesser General Public License.
45!
46!
47! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
48!
49! Copyright of the original code rests with the authors inside the ELPA
50! consortium. The copyright of any additional modifications shall rest
51! with their original authors, but shall adhere to the licensing terms
52! distributed along with the original code in the file "COPYING".
53#endif
54
55#include "../general/sanity.F90"
56#include "../general/error_checking.inc"
57
58#ifdef SOLVE_TRIDI_GPU_BUILD
59 subroutine merge_systems_gpu_&
60 &precision &
61 (obj, na, nm, d, e, q_dev, &
62 matrixrows, nqoff, nblk, matrixcols, mpi_comm_rows, mpi_comm_cols_self, &
63 l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, usegpu, wantdebug, success, max_threads)
64#else
65 subroutine merge_systems_cpu_&
66 &precision &
67 (obj, na, nm, d, e, q, &
68 matrixrows, nqoff, nblk, matrixcols, mpi_comm_rows, mpi_comm_cols_self, &
69 l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, usegpu, wantdebug, success, max_threads)
70#endif
71
72
73 use elpa_gpu
74 use, intrinsic :: iso_c_binding
75 use precision
77 use elpa_blas_interfaces
80 use resort_ev
83 use add_tmp
84 use v_add_s
85 use elpa_utilities
86 use elpa_mpi
88 use elpa_ccl_gpu
89 use merge_systems_gpu_new
90#if defined(WITH_NVIDIA_GPU_VERSION) && defined(WITH_NVTX)
91 use cuda_functions ! for NVTX labels
92#elif defined(WITH_AMD_GPU_VERSION) && defined(WITH_ROCTX)
93 use hip_functions ! for ROCTX labels
94#endif
95
96#ifdef WITH_OPENMP_TRADITIONAL
97 use omp_lib
98#endif
99 implicit none
100#include "../general/precision_kinds.F90"
101 class(elpa_abstract_impl_t), intent(inout) :: obj
102 integer(kind=ik), intent(in) :: na ! this is rather na_local, not global na
103 integer(kind=ik), intent(in) :: nm, matrixRows, nqoff, nblk, matrixCols, mpi_comm_rows, &
104 mpi_comm_cols_self, npc_0, npc_n
105 integer(kind=ik), intent(in) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na)
106
107#ifdef SOLVE_TRIDI_GPU_BUILD
108 integer(kind=c_intptr_t) :: d_dev, e_dev ! shifted; for e_dev only one element is used
109 integer(kind=c_intptr_t) :: q_dev
110#else
111 integer(kind=c_intptr_t) :: d_dev, e_dev ! dummy variables
112 integer(kind=c_intptr_t) :: q_dev
113#endif
114
115 real(kind=real_datatype), intent(inout) :: d(na), e
116#if defined(USE_ASSUMED_SIZE) && !defined(SOLVE_TRIDI_GPU_BUILD)
117 real(kind=real_datatype) :: q(matrixrows,*)
118#else
119 real(kind=real_datatype) :: q(matrixrows,matrixcols)
120#endif
121 logical, intent(in) :: useGPU, wantDebug
122 integer(kind=c_int) :: SM_count
123
124 logical, intent(out) :: success
125
126 ! TODO: play with max_strip. If it was larger, matrices being multiplied
127 ! might be larger as well!
128 ! Peter: two out of three GEMM dimensions are already "big" and filling the GPU computation rate.
129 ! Increasing max_strip doesn't improve the performance.
130 integer(kind=ik) :: max_strip
131
132
133 real(kind=real_datatype) :: beta, sig, s, c, t, tau, rho, eps, tol, &
134 qtrans(2,2), dmax, zmax, d1new, d2new
135 real(kind=real_datatype) :: z(na), d1(na), d2(na), z1(na), delta(na), &
136 dbase(na), ddiff(na), ev_scale(na), tmp(na)
137 real(kind=real_datatype) :: d1u(na), zu(na), d1l(na), zl(na)
138 real(kind=real_datatype), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
139#ifdef WITH_OPENMP_TRADITIONAL
140 real(kind=real_datatype), allocatable :: z_p(:,:)
141 integer(kind=ik) :: my_thread
142#endif
143
144 integer(kind=ik) :: i, j, k, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &
145 l_rqm, ns, lc1, lc2, info
146 integer(kind=BLAS_KIND) :: infoBLAS
147 integer(kind=ik) :: sig_int
148 integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
149 l_cols_qreorg, np, l_idx, nqcols1 !, nqcols2
150 integer(kind=ik) :: nnzu_save, nnzl_save
151 integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
152 np_cols
153 integer(kind=MPI_KIND) :: mpierr
154 integer(kind=MPI_KIND) :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI
155 integer(kind=ik) :: np_next, np_prev, np_rem
156 integer(kind=ik) :: idx(na), idx1(na), idx2(na)
157 integer(kind=BLAS_KIND) :: idxBLAS(NA)
158 integer(kind=ik) :: coltyp(na), idxq1(na) !, idxq2(na)
159
160 integer(kind=ik) :: istat, debug
161 character(200) :: errorMessage
162 integer(kind=ik) :: gemm_dim_k, gemm_dim_l, gemm_dim_m
163
164 integer(kind=c_intptr_t) :: num
165 integer(kind=C_intptr_t) :: qtmp1_dev, qtmp1_tmp_dev, qtmp2_dev, ev_dev
166 integer(kind=c_intptr_t) :: z1_dev, delta_dev, rho_dev
167 integer(kind=c_intptr_t) :: d1u_dev, dbase_dev, ddiff_dev, zu_dev, ev_scale_dev
168 integer(kind=c_intptr_t) :: d1l_dev, zl_dev, z_dev, d1_dev, ztmp_extended_dev
169 integer(kind=c_intptr_t) :: idx1_dev, p_col_dev, coltyp_dev, p_col_out_dev, ndef_c_dev
170 integer(kind=c_intptr_t) :: idxq1_dev, l_col_out_dev, idx_dev, idx2_dev, l_col_dev
171 integer(kind=c_intptr_t) :: nnzul_dev
172 integer(kind=c_intptr_t) :: tmp_dev, zero_dev, one_dev, qtrans_dev ! for transform_columns_gpu
173
174 integer(kind=c_intptr_t) :: nnzu_val_dev, nnzl_val_dev
175 logical :: successGPU
176 integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
177 &precision&
178 &_real
179 integer(kind=c_intptr_t) :: gpuHandle
180 integer(kind=ik), intent(in) :: max_threads
181 integer(kind=c_intptr_t) :: my_stream
182 integer(kind=ik) :: l_col_out_tmp
183 integer(kind=ik), allocatable :: nnzu_val(:,:), nnzl_val(:,:)
184 integer(kind=ik) :: nnzul(2)
185
186 integer(kind=ik) :: nnzu_start, nnzl_start
187
188 integer(kind=ik), allocatable :: ndef_c(:)
189
190 integer(kind=ik) :: ii,jj, indx, ind_ex, ind_ex2, p_col_tmp, index2, counter1, counter2
191
192 logical :: useCCL
193 integer(kind=c_intptr_t) :: ccl_comm_rows, ccl_comm_cols
194 integer(kind=c_int) :: cclDataType
195
196 call obj%timer%start("merge_systems" // precision_suffix)
197 success = .true.
198
199 call obj%timer%start("mpi_communication")
200 call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind) ,my_prowmpi, mpierr)
201 call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind) ,np_rowsmpi, mpierr)
202 call mpi_comm_rank(int(mpi_comm_cols_self,kind=mpi_kind) ,my_pcolmpi, mpierr)
203 call mpi_comm_size(int(mpi_comm_cols_self,kind=mpi_kind) ,np_colsmpi, mpierr)
204
205 my_prow = int(my_prowmpi,kind=c_int)
206 np_rows = int(np_rowsmpi,kind=c_int)
207 my_pcol = int(my_pcolmpi,kind=c_int)
208 np_cols = int(np_colsmpi,kind=c_int)
209
210 call obj%timer%stop("mpi_communication")
211
212 if (wantdebug) then
213 debug = 1
214 else
215 debug = 0
216 endif
217
218 if (usegpu) then
219 max_strip=128
220 else
221 max_strip=128
222 endif
223 if (wantdebug) print *, "max_strip = ", max_strip
224
225 useccl = obj%gpu_setup%useCCL
226
227 if (usegpu) then
228#ifdef WITH_GPU_STREAMS
229 my_stream = obj%gpu_setup%my_stream
230#endif
231 sm_count = obj%gpu_setup%gpuSMcount
232
233 if (useccl) then
234 my_stream = obj%gpu_setup%my_stream
235 ccl_comm_rows = obj%gpu_setup%ccl_comm_rows
236 ccl_comm_cols = obj%gpu_setup%ccl_comm_cols
237#if defined(DOUBLE_PRECISION)
238 ccldatatype = ccldouble
239#endif
240#if defined(SINGLE_PRECISION)
241 ccldatatype = cclfloat
242#endif
243 endif ! useCCL
244 endif ! useGPU
245
246! #ifdef WITH_OPENMP_TRADITIONAL
247! allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errorMessage)
248! check_allocate("merge_systems: z_p",istat, errorMessage)
249! #endif
250
251 ! If my processor column isn't in the requested set, do nothing
252 if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then
253 call obj%timer%stop("merge_systems" // precision_suffix)
254 return
255 endif
256
257 ! Determine number of "next" and "prev" column for ring sends
258 if (my_pcol == npc_0+npc_n-1) then
259 np_next = npc_0
260 else
261 np_next = my_pcol + 1
262 endif
263
264 if (my_pcol == npc_0) then
265 np_prev = npc_0+npc_n-1
266 else
267 np_prev = my_pcol - 1
268 endif
269
270 call check_monotony_&
271 &precision&
272 &(obj, nm,d,'Input1',wantdebug, success)
273 if (.not.(success)) then
274 call obj%timer%stop("merge_systems" // precision_suffix)
275 return
276 endif
277
278 call check_monotony_&
279 &precision&
280 &(obj,na-nm,d(nm+1),'Input2',wantdebug, success)
281 if (.not.(success)) then
282 call obj%timer%stop("merge_systems" // precision_suffix)
283 return
284 endif
285 ! Get global number of processors and my processor number.
286 ! Please note that my_proc does not need to match any real processor number,
287 ! it is just used for load balancing some loops.
288
289 n_procs = np_rows*npc_n
290 my_proc = my_prow*npc_n + (my_pcol-npc_0) ! Row major
291
292
293 ! Local limits of the rows of Q
294
295 l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1) ! First row of Q
296 l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1) ! Last row <= nm
297 l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1) ! Last row of Q
298
299 l_rnm = l_rqm-l_rqs+1 ! Number of local rows <= nm
300 l_rows = l_rqe-l_rqs+1 ! Total number of local rows
301
302
303 ! My number of local columns
304
305 l_cols = count(p_col(1:na)==my_pcol)
306
307 ! Get max number of local columns
308
309 max_local_cols = 0
310 do np = npc_0, npc_0+npc_n-1
311 max_local_cols = max(max_local_cols,count(p_col(1:na)==np))
312 enddo
313
314
315 if (usegpu) then
316 num = na * size_of_int
317 successgpu = gpu_malloc(ndef_c_dev, num)
318 check_alloc_gpu("merge_systems: ndef_c_dev", successgpu)
319
320 num = na * size_of_int
321 successgpu = gpu_malloc(idx1_dev, num)
322 check_alloc_gpu("merge_systems: idx1_dev", successgpu)
323
324 num = na * size_of_int
325 successgpu = gpu_malloc(p_col_dev, num)
326 check_alloc_gpu("merge_systems: p_col_dev", successgpu)
327
328 num = na * size_of_int
329 successgpu = gpu_malloc(p_col_out_dev, num)
330 check_alloc_gpu("merge_systems: p_col_out_dev", successgpu)
331
332 num = na * size_of_int
333 successgpu = gpu_malloc(coltyp_dev, num)
334 check_alloc_gpu("merge_systems: coltyp_dev", successgpu)
335
336 num = na * size_of_int
337 successgpu = gpu_malloc(idx2_dev, num)
338 check_alloc_gpu("merge_systems: idx2_dev", successgpu)
339
340 num = na * size_of_int
341 successgpu = gpu_malloc(l_col_dev, num)
342 check_alloc_gpu("merge_systems: l_col_dev", successgpu)
343
344 num = na * size_of_int
345 successgpu = gpu_malloc(l_col_out_dev, num)
346 check_alloc_gpu("merge_systems: l_col_out_dev", successgpu)
347
348 num = (na) * size_of_datatype
349 successgpu = gpu_malloc(z_dev, num)
350 check_alloc_gpu("merge_systems: z_dev", successgpu)
351
352 num = (na) * size_of_datatype
353 successgpu = gpu_malloc(z1_dev, num)
354 check_alloc_gpu("merge_systems: z1_dev", successgpu)
355
356 num = (na) * size_of_datatype
357 successgpu = gpu_malloc(d1_dev, num)
358 check_alloc_gpu("merge_systems: d1_dev", successgpu)
359
360 num = 1 * size_of_datatype
361 successgpu = gpu_malloc(rho_dev, num)
362 check_alloc_gpu("merge_systems: rho_dev", successgpu)
363
364 num = (na) * size_of_datatype
365 successgpu = gpu_malloc(d1u_dev, num)
366 check_alloc_gpu("merge_systems: d1u_dev", successgpu)
367
368 num = (na) * size_of_datatype
369 successgpu = gpu_malloc(dbase_dev, num)
370 check_alloc_gpu("merge_systems: dbase_dev", successgpu)
371
372 num = (na) * size_of_datatype
373 successgpu = gpu_malloc(ddiff_dev, num)
374 check_alloc_gpu("merge_systems: ddiff_dev", successgpu)
375
376 num = (na) * size_of_datatype
377 successgpu = gpu_malloc(zu_dev, num)
378 check_alloc_gpu("merge_systems: zu_dev", successgpu)
379
380 num = (na) * size_of_datatype
381 successgpu = gpu_malloc(ev_scale_dev, num)
382 check_alloc_gpu("merge_systems: ev_scale_dev", successgpu)
383
384 num = (na) * size_of_datatype
385 successgpu = gpu_malloc(d1l_dev, num)
386 check_alloc_gpu("merge_systems: d1l_dev", successgpu)
387
388 num = (na) * size_of_datatype
389 successgpu = gpu_malloc(zl_dev, num)
390 check_alloc_gpu("merge_systems: zl_dev", successgpu)
391
392 num = (l_rows) * size_of_datatype
393 successgpu = gpu_malloc(tmp_dev, num)
394 check_alloc_gpu("merge_systems: tmp_dev", successgpu)
395
396 num = 1 * size_of_datatype
397 successgpu = gpu_malloc(zero_dev, num)
398 check_alloc_gpu("merge_systems: zero_dev", successgpu)
399
400 num = 1 * size_of_datatype
401 successgpu = gpu_malloc(one_dev, num)
402 check_alloc_gpu("merge_systems: one_dev", successgpu)
403
404 num = 4 * size_of_datatype
405 successgpu = gpu_malloc(qtrans_dev, num)
406 check_alloc_gpu("merge_systems: qtrans_dev", successgpu)
407
408 num = na * size_of_int
409#ifdef WITH_GPU_STREAMS
410 successgpu = gpu_memcpy_async(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
411#else
412 successgpu = gpu_memcpy(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
413#endif
414 check_memcpy_gpu("merge_systems: p_col_dev", successgpu)
415
416 num = na * size_of_int
417#ifdef WITH_GPU_STREAMS
418 successgpu = gpu_memcpy_async(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
419#else
420 successgpu = gpu_memcpy(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
421#endif
422 check_memcpy_gpu("merge_systems: l_col_dev", successgpu)
423
424 num = 1 * size_of_datatype
425 beta = 0.0_rk
426#ifdef WITH_GPU_STREAMS
427 successgpu = gpu_memcpy_async(zero_dev, int(loc(beta),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
428#else
429 successgpu = gpu_memcpy(zero_dev, int(loc(beta),kind=c_intptr_t), num, gpumemcpyhosttodevice)
430#endif
431 check_memcpy_gpu("merge_systems: zero_dev", successgpu)
432
433 num = 1 * size_of_datatype
434 beta = 1.0_rk
435#ifdef WITH_GPU_STREAMS
436 successgpu = gpu_memcpy_async(one_dev, int(loc(beta),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
437#else
438 successgpu = gpu_memcpy(one_dev, int(loc(beta),kind=c_intptr_t), num, gpumemcpyhosttodevice)
439#endif
440 check_memcpy_gpu("merge_systems: one_dev", successgpu)
441
442 endif ! useGPU
443
444 ! Calculations start here
445
446 beta = abs(e)
447 sig = sign(1.0_rk,e)
448
449 ! Calculate rank-1 modifier z
450 if (usegpu) then
451 num = na * size_of_datatype
452#ifdef WITH_GPU_STREAMS
453 successgpu = gpu_memset_async(z_dev, 0, num, my_stream)
454#else
455 successgpu = gpu_memset(z_dev, 0, num)
456#endif
457 check_memcpy_gpu("merge_systems: memset z_dev", successgpu)
458 else
459 z(:) = 0
460 endif
461
462
463 if (mod((nqoff+nm-1)/nblk,np_rows)==my_prow) then
464 ! nm is local on my row
465 if (usegpu) then
466 sig_int = 1
467 nvtx_range_push("gpu_fill_z_kernel")
468 call gpu_fill_z(precision_char, z_dev, q_dev, p_col_dev, l_col_dev, &
469 sig_int, na, my_pcol, l_rqm, matrixrows, sm_count, debug, my_stream)
470 nvtx_range_pop("gpu_fill_z_kernel")
471 else
472 do i = 1, na
473 if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i))
474 enddo
475 endif
476 endif
477
478 if (mod((nqoff+nm)/nblk,np_rows)==my_prow) then
479 ! nm+1 is local on my row
480
481 if (usegpu) then
482 if (sig>0) then
483 sig_int = 1
484 else
485 sig_int = -1
486 endif
487
488 nvtx_range_push("gpu_fill_z_kernel")
489 call gpu_fill_z(precision_char, z_dev, q_dev, p_col_dev, l_col_dev, &
490 sig_int, na, my_pcol, l_rqm+1, matrixrows, sm_count, debug, my_stream)
491 nvtx_range_pop("gpu_fill_z_kernel")
492 else
493 do i = 1, na
494 if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i))
495 enddo
496 endif
497 endif
498
499 if (usegpu) then
500 num = na * size_of_datatype
501#ifdef WITH_GPU_STREAMS
502 successgpu = gpu_memcpy_async(int(loc(z(1)),kind=c_intptr_t), z_dev, num, gpumemcpydevicetohost, my_stream)
503#else
504 successgpu = gpu_memcpy(int(loc(z(1)),kind=c_intptr_t), z_dev, num, gpumemcpydevicetohost)
505#endif
506 check_memcpy_gpu("merge_systems: z_dev", successgpu)
507 endif
508
509
510 call global_gather_&
511 &precision&
512 &(obj, z, na, mpi_comm_rows, mpi_comm_cols_self, npc_n, np_prev, np_next, success)
513 if (.not.(success)) then
514 write(error_unit,*) "Error in global_gather. Aborting"
515 success = .false.
516 return
517 endif
518
519 ! Normalize z so that norm(z) = 1. Since z is the concatenation of
520 ! two normalized vectors, norm2(z) = sqrt(2).
521 z = z/sqrt(2.0_rk)
522 rho = 2.0_rk*beta
523 ! Calculate index for merging both systems by ascending eigenvalues
524 call obj%timer%start("lapack_lamrg")
525 nvtx_range_push("lapack_lamrg_1")
526 call precision_lamrg( int(nm,kind=blas_kind), int(na-nm,kind=blas_kind), d, &
527 1_blas_kind, 1_blas_kind, idxblas )
528 nvtx_range_pop("lapack_lamrg_1")
529 idx(:) = int(idxblas(:),kind=ik)
530 call obj%timer%stop("lapack_lamrg")
531
532 ! Calculate the allowable deflation tolerance
533
534 zmax = maxval(abs(z))
535 dmax = maxval(abs(d))
536 eps = precision_lamch( 'E' ) ! return epsilon
537 tol = 8.0_rk*eps*max(dmax,zmax)
538
539 ! If the rank-1 modifier is small enough, no more needs to be done
540 ! except to reorganize D and Q
541
542 IF ( rho*zmax <= tol ) THEN
543 ! Rearrange eigenvalues
544 tmp = d
545 do i=1,na
546 d(i) = tmp(idx(i))
547 enddo
548
549 ! Rearrange eigenvectors
550 if (usegpu) then
551 nvtx_range_push("resort_ev_gpu")
552 call obj%timer%start("resort_ev_gpu")
553 call resort_ev_gpu_&
554 &precision&
555 (obj, idx, na, na, p_col_out, q_dev, matrixrows, matrixcols, l_rows, l_rqe, &
556 l_rqs, mpi_comm_cols_self, p_col, l_col, l_col_out)
557 call obj%timer%stop("resort_ev_gpu")
558 nvtx_range_pop("resort_ev_gpu")
559 else ! useGPU
560 call resort_ev_cpu_&
561 &precision&
562 (obj, idx, na, na, p_col_out, q , matrixrows, matrixcols, l_rows, l_rqe, &
563 l_rqs, mpi_comm_cols_self, p_col, l_col, l_col_out)
564 endif ! useGPU
565
566 call obj%timer%stop("merge_systems" // precision_suffix)
567
568 if (wantdebug) write(error_unit,*) "Returing early from merge_systems (RHO*zmax <= TOL): matrix is block-diagonal"
569 ! tested by validate_real_double_solve_tridiagonal_1stage_blocktridi
570
571 return
572 ENDIF
573
574 ! Merge and deflate system
575
576 na1 = 0
577 na2 = 0
578
579 ! COLTYP:
580 ! 1 : non-zero in the upper half only;
581 ! 2 : dense;
582 ! 3 : non-zero in the lower half only;
583 ! 4 : deflated.
584
585 coltyp(1:nm) = 1
586 coltyp(nm+1:na) = 3
587
588 nvtx_range_push("deflation_loop")
589 do i=1,na
590
591 if (rho*abs(z(idx(i))) <= tol) then
592
593 ! Deflate due to small z component.
594
595 na2 = na2+1
596 d2(na2) = d(idx(i))
597 idx2(na2) = idx(i)
598 coltyp(idx(i)) = 4
599
600 else if (na1>0) then
601
602 ! Check if eigenvalues are close enough to allow deflation.
603
604 s = z(idx(i))
605 c = z1(na1)
606
607 ! Find TAU = sqrt(a**2+b**2) without overflow or
608 ! destructive underflow.
609 tau = precision_lapy2( c, s )
610 t = d1(na1) - d(idx(i))
611 c = c / tau
612 s = -s / tau
613 IF ( abs( t*c*s ) <= tol ) THEN
614
615 ! Deflation is possible.
616
617 na2 = na2+1
618
619 z1(na1) = tau
620
621 d2new = d(idx(i))*c**2 + d1(na1)*s**2
622 d1new = d(idx(i))*s**2 + d1(na1)*c**2
623
624 ! D(idx(i)) >= D1(na1) and C**2 + S**2 == 1.0
625 ! This means that after the above transformation it must be
626 ! D1(na1) <= d1new <= D(idx(i))
627 ! D1(na1) <= d2new <= D(idx(i))
628 !
629 ! D1(na1) may get bigger but it is still smaller than the next D(idx(i+1))
630 ! so there is no problem with sorting here.
631 ! d2new <= D(idx(i)) which means that it might be smaller than D2(na2-1)
632 ! which makes a check (and possibly a resort) necessary.
633 !
634 ! The above relations may not hold exactly due to numeric differences
635 ! so they have to be enforced in order not to get troubles with sorting.
636
637
638 if (d1new<d1(na1) ) d1new = d1(na1)
639 if (d1new>d(idx(i))) d1new = d(idx(i))
640
641 if (d2new<d1(na1) ) d2new = d1(na1)
642 if (d2new>d(idx(i))) d2new = d(idx(i))
643
644 d1(na1) = d1new
645
646 do j=na2-1,1,-1
647 if (d2new<d2(j)) then
648 d2(j+1) = d2(j)
649 idx2(j+1) = idx2(j)
650 else
651 exit ! Loop
652 endif
653 enddo
654
655 d2(j+1) = d2new
656 idx2(j+1) = idx(i)
657
658 qtrans(1,1) = c; qtrans(1,2) =-s
659 qtrans(2,1) = s; qtrans(2,2) = c
660
661 nvtx_range_push("transform_columns")
662 if (usegpu) then
663#ifdef WITH_GPU_STREAMS
664 successgpu = gpu_memcpy_async(qtrans_dev, int(loc(qtrans(1,1)),kind=c_intptr_t), &
665 4*size_of_datatype, gpumemcpyhosttodevice, my_stream)
666 if (wantdebug) successgpu = gpu_devicesynchronize()
667#else
668 successgpu = gpu_memcpy(qtrans_dev, int(loc(qtrans(1,1)),kind=c_intptr_t), &
669 4*size_of_datatype, gpumemcpyhosttodevice)
670#endif
671 check_memcpy_gpu("transform_columns: q_dev", successgpu)
672
673 call transform_columns_gpu_&
674 &precision &
675 (obj, idx(i), idx1(na1), na, tmp, l_rqs, l_rqe, &
676 q_dev, matrixrows, matrixcols, l_rows, mpi_comm_cols_self, &
677 p_col, l_col, qtrans_dev, &
678 tmp_dev, zero_dev, one_dev, debug, my_stream)
679 else
680 call transform_columns_cpu_&
681 &precision &
682 (obj, idx(i), idx1(na1), na, tmp, l_rqs, l_rqe, &
683 q , matrixrows, matrixcols, l_rows, mpi_comm_cols_self, &
684 p_col, l_col, qtrans)
685 endif
686 nvtx_range_pop("transform_columns")
687
688 if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2
689 if (coltyp(idx(i))==3 .and. coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2
690
691 coltyp(idx(i)) = 4
692
693 else
694 na1 = na1+1
695 d1(na1) = d(idx(i))
696 z1(na1) = z(idx(i))
697 idx1(na1) = idx(i)
698 endif
699 else
700 na1 = na1+1
701 d1(na1) = d(idx(i))
702 z1(na1) = z(idx(i))
703 idx1(na1) = idx(i)
704 endif
705
706 enddo ! do i=1,na
707 nvtx_range_pop("deflation_loop")
708
709 call check_monotony_&
710 &precision&
711 &(obj, na1,d1,'Sorted1', wantdebug, success)
712 if (.not.(success)) then
713 call obj%timer%stop("merge_systems" // precision_suffix)
714 return
715 endif
716 call check_monotony_&
717 &precision&
718 &(obj, na2,d2,'Sorted2', wantdebug, success)
719 if (.not.(success)) then
720 call obj%timer%stop("merge_systems" // precision_suffix)
721 return
722 endif
723
724 if (na1==1 .or. na1==2) then
725 ! if(my_proc==0) print *,'--- Remark solve_tridi: na1==',na1,' proc==',myid
726
727 if (na1==1) then
728 d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation
729 else ! na1==2
730 call obj%timer%start("lapack_laed5_x2")
731 nvtx_range_push("lapack_laed5_x2")
732 call precision_laed5(1_blas_kind, d1, z1, qtrans(1,1), rho, d(1))
733 call precision_laed5(2_blas_kind, d1, z1, qtrans(1,2), rho, d(2))
734 nvtx_range_pop("lapack_laed5_x2")
735 call obj%timer%stop("lapack_laed5_x2")
736
737 if (usegpu) then
738#ifdef WITH_GPU_STREAMS
739 successgpu = gpu_memcpy_async(qtrans_dev, int(loc(qtrans(1,1)),kind=c_intptr_t), &
740 4*size_of_datatype, gpumemcpyhosttodevice, my_stream)
741 if (wantdebug) successgpu = gpu_devicesynchronize()
742#else
743 successgpu = gpu_memcpy(qtrans_dev, int(loc(qtrans(1,1)),kind=c_intptr_t), &
744 4*size_of_datatype, gpumemcpyhosttodevice)
745#endif
746 check_memcpy_gpu("transform_columns: q_dev", successgpu)
747
748 call transform_columns_gpu_&
749 &precision &
750 (obj, idx1(1), idx1(2), na, tmp, l_rqs, l_rqe, &
751 q_dev, matrixrows, matrixcols, l_rows, mpi_comm_cols_self, &
752 p_col, l_col, qtrans_dev, &
753 tmp_dev, zero_dev, one_dev, debug, my_stream)
754 else
755 call transform_columns_cpu_&
756 &precision&
757 & (obj, idx1(1), idx1(2), na, tmp, l_rqs, l_rqe, q, &
758 matrixrows, matrixcols, l_rows, mpi_comm_cols_self, &
759 p_col, l_col, qtrans)
760 endif
761 endif ! na1==2
762
763 ! Add the deflated eigenvalues
764 d(na1+1:na) = d2(1:na2)
765
766 ! Calculate arrangement of all eigenvalues in output
767 call obj%timer%start("lapack_lamrg")
768 nvtx_range_push("lapack_lamrg_2")
769 call precision_lamrg( int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &
770 1_blas_kind, 1_blas_kind, idxblas )
771 nvtx_range_pop("lapack_lamrg_2")
772 idx(:) = int(idxblas(:),kind=ik)
773 call obj%timer%stop("lapack_lamrg")
774 ! Rearrange eigenvalues
775
776 tmp = d
777 do i=1,na
778 d(i) = tmp(idx(i))
779 enddo
780
781 ! Rearrange eigenvectors
782
783 do i=1,na
784 if (idx(i)<=na1) then
785 idxq1(i) = idx1(idx(i))
786 else
787 idxq1(i) = idx2(idx(i)-na1)
788 endif
789 enddo
790
791 if (usegpu) then
792 call resort_ev_gpu_&
793 &precision&
794 &(obj, idxq1, na, na, p_col_out, q_dev, matrixrows, matrixcols, l_rows, l_rqe, &
795 l_rqs, mpi_comm_cols_self, p_col, l_col, l_col_out)
796 else
797 call resort_ev_cpu_&
798 &precision&
799 &(obj, idxq1, na, na, p_col_out, q , matrixrows, matrixcols, l_rows, l_rqe, &
800 l_rqs, mpi_comm_cols_self, p_col, l_col, l_col_out)
801 endif
802
803 write(error_unit,*) .or."Returing early from merge_systems (na1==1 na1==2)"
804 ! na=1 can be tested with "mpirun -n 4 ./validate_real_double_solve_tridiagonal_1stage_gpu_blocktridi 3 3 1"
805 ! na=2 can be tested with "mpirun -n 4 ./validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz 4 4 2"
806 else if (na1>2) then
807
808 ! Solve secular equation
809
810 if (usegpu) then
811 num = (na1*sm_count) * size_of_datatype
812 successgpu = gpu_malloc(ztmp_extended_dev, num)
813 check_alloc_gpu("merge_systems: delta_dev", successgpu)
814
815 call gpu_fill_array(precision_char, ztmp_extended_dev, one_dev, na1*sm_count, sm_count, debug, my_stream)
816
817 call gpu_fill_array(precision_char, z_dev, one_dev, na1, sm_count, debug, my_stream)
818
819 num = na1 * size_of_datatype
820#ifdef WITH_GPU_STREAMS
821 successgpu = gpu_memset_async(dbase_dev, 0, num, my_stream)
822#else
823 successgpu = gpu_memset(dbase_dev, 0, num)
824#endif
825 check_memcpy_gpu("merge_systems: memset dbase_dev", successgpu)
826
827 num = na1 * size_of_datatype
828#ifdef WITH_GPU_STREAMS
829 successgpu = gpu_memset_async(ddiff_dev, 0, num, my_stream)
830#else
831 successgpu = gpu_memset(ddiff_dev, 0, num)
832#endif
833 check_memcpy_gpu("merge_systems: memset ddiff_dev", successgpu)
834 else
835 z(1:na1) = 1
836! #ifdef WITH_OPENMP_TRADITIONAL
837! z_p(1:na1,:) = 1
838! #endif
839 dbase(1:na1) = 0
840 ddiff(1:na1) = 0
841 endif
842
843 nvtx_range_push("lapack_laed4_loop")
844
845 if (usegpu) then
846 ! data transfer to GPU
847#ifdef WITH_GPU_STREAMS
848 num = na * size_of_datatype
849 successgpu = gpu_memcpy_async(d1_dev, int(loc(d1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
850 check_memcpy_gpu("merge_systems: d1_dev", successgpu)
851
852 successgpu = gpu_memcpy_async(z1_dev, int(loc(z1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
853 check_memcpy_gpu("merge_systems: z1_dev", successgpu)
854
855 num = 1 * size_of_datatype
856 successgpu = gpu_memcpy_async(rho_dev, int(loc(rho),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
857 check_memcpy_gpu("merge_systems: rho_dev", successgpu)
858
859#else
860 num = na * size_of_datatype
861 successgpu = gpu_memcpy(d1_dev, int(loc(d1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
862 check_memcpy_gpu("merge_systems: d1_dev", successgpu)
863
864 successgpu = gpu_memcpy(z1_dev, int(loc(z1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
865 check_memcpy_gpu("merge_systems: z1_dev", successgpu)
866
867 num = 1 * size_of_datatype
868 successgpu = gpu_memcpy(rho_dev, int(loc(rho),kind=c_intptr_t), num, gpumemcpyhosttodevice)
869 check_memcpy_gpu("merge_systems: rho_dev", successgpu)
870#endif
871
872 ! delta_dev is a temporary buffer, not used afterwards
873 num = (na1*sm_count) * size_of_datatype
874 successgpu = gpu_malloc(delta_dev, num)
875 check_alloc_gpu("merge_systems: delta_dev", successgpu)
876
877 call gpu_solve_secular_equation_loop (precision_char, d1_dev, z1_dev, delta_dev, rho_dev, &
878 ztmp_extended_dev, dbase_dev, ddiff_dev, my_proc, na1, n_procs, sm_count, debug, my_stream)
879
880 call gpu_local_product(precision_char, z_dev, ztmp_extended_dev, na1, sm_count, debug, my_stream)
881
882 successgpu = gpu_free(delta_dev)
883 check_dealloc_gpu("merge_systems: delta_dev", successgpu)
884
885 ! data transfer back to CPU
886#ifdef WITH_GPU_STREAMS
887 num = na * size_of_datatype
888 successgpu = gpu_memcpy_async(int(loc(d1(1)),kind=c_intptr_t), d1_dev, num, gpumemcpydevicetohost, my_stream)
889 check_memcpy_gpu("merge_systems: d1_dev", successgpu)
890
891 successgpu = gpu_memcpy_async(int(loc(z1(1)),kind=c_intptr_t), z1_dev, num, gpumemcpydevicetohost, my_stream)
892 check_memcpy_gpu("merge_systems: z1_dev", successgpu)
893
894 successgpu = gpu_memcpy_async(int(loc(dbase(1)),kind=c_intptr_t), dbase_dev, num, gpumemcpydevicetohost, my_stream)
895 check_memcpy_gpu("merge_systems: dbase_dev", successgpu)
896
897 successgpu = gpu_memcpy_async(int(loc(ddiff(1)),kind=c_intptr_t), ddiff_dev, num, gpumemcpydevicetohost, my_stream)
898 check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)
899
900 successgpu = gpu_memcpy_async(int(loc(z(1)),kind=c_intptr_t), z_dev, num, gpumemcpydevicetohost, my_stream)
901 check_memcpy_gpu("merge_systems: z_dev", successgpu)
902
903 num = 1 * size_of_datatype
904 successgpu = gpu_memcpy_async(int(loc(rho),kind=c_intptr_t), rho_dev, num, gpumemcpydevicetohost, my_stream)
905 check_memcpy_gpu("merge_systems: rho_dev", successgpu)
906#else
907 num = na * size_of_datatype
908 successgpu = gpu_memcpy(int(loc(d1(1)),kind=c_intptr_t), d1_dev, num, gpumemcpydevicetohost)
909 check_memcpy_gpu("merge_systems: d1_dev", successgpu)
910
911 successgpu = gpu_memcpy(int(loc(z1(1)),kind=c_intptr_t), z1_dev, num, gpumemcpydevicetohost)
912 check_memcpy_gpu("merge_systems: z1_dev", successgpu)
913
914 successgpu = gpu_memcpy(int(loc(dbase(1)),kind=c_intptr_t), dbase_dev, num, gpumemcpydevicetohost)
915 check_memcpy_gpu("merge_systems: dbase_dev", successgpu)
916
917 successgpu = gpu_memcpy(int(loc(ddiff(1)),kind=c_intptr_t), ddiff_dev, num, gpumemcpydevicetohost)
918 check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)
919
920 successgpu = gpu_memcpy(int(loc(z(1)),kind=c_intptr_t), z_dev, num, gpumemcpydevicetohost)
921 check_memcpy_gpu("merge_systems: z_dev", successgpu)
922
923 num = 1 * size_of_datatype
924 successgpu = gpu_memcpy(int(loc(rho),kind=c_intptr_t), rho_dev, num, gpumemcpydevicetohost)
925 check_memcpy_gpu("merge_systems: rho_dev", successgpu)
926#endif
927 else
928! info = 0
929! infoBLAS = int(info,kind=BLAS_KIND)
930!#ifdef WITH_OPENMP_TRADITIONAL
931!
932! call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
933!!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,infoBLAS,j)
934! my_thread = omp_get_thread_num()
935!!$OMP DO
936!#endif
937 do i = my_proc+1, na1, n_procs ! work distributed over all processors
938 call obj%timer%start("lapack_laed4")
939 nvtx_range_push("lapack_laed4")
940 call precision_laed4(int(na1,kind=blas_kind), int(i,kind=blas_kind), d1, z1, delta, &
941 rho, s, infoblas) ! s is not used!
942 info = int(infoblas,kind=ik)
943 nvtx_range_pop("lapack_laed4")
944 call obj%timer%stop("lapack_laed4")
945 if (info/=0) then
946 ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2)
947 ! use the more stable bisection algorithm in solve_secular_equation
948 ! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection'
949 call solve_secular_equation_&
950 &precision&
951 &(obj, na1, i, d1, z1, delta, rho, s) ! s is not used!
952 endif
953
954 ! Compute updated z
955
956 !#ifdef WITH_OPENMP_TRADITIONAL
957 ! do j=1,na1
958 ! if (i/=j) z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) )
959 ! enddo
960 ! z_p(i,my_thread) = z_p(i,my_thread)*delta(i)
961 !#else
962 do j=1,na1
963 if (i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) )
964 enddo
965 z(i) = z(i)*delta(i)
966 !#endif
967 ! Store dbase/ddiff
968
969 if (i<na1) then
970 if (abs(delta(i+1)) < abs(delta(i))) then
971 dbase(i) = d1(i+1)
972 ddiff(i) = delta(i+1)
973 else
974 dbase(i) = d1(i)
975 ddiff(i) = delta(i)
976 endif
977 else
978 dbase(i) = d1(i)
979 ddiff(i) = delta(i)
980 endif
981 enddo ! i = my_proc+1, na1, n_procs
982 endif ! useGPU
983 nvtx_range_pop("lapack_laed4_loop")
984
985!#ifdef WITH_OPENMP_TRADITIONAL
986!!$OMP END PARALLEL
987!
988! call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
989!
990! do i = 0, max_threads-1
991! z(1:na1) = z(1:na1)*z_p(1:na1,i)
992! enddo
993!#endif
994
995 nvtx_range_push("global_product")
996 call global_product_&
997 &precision&
998 (obj, z, na1, mpi_comm_rows, mpi_comm_cols_self, npc_0, npc_n, success)
999 if (.not.(success)) then
1000 write(error_unit,*) "Error in global_product. Aborting..."
1001 return
1002 endif
1003 nvtx_range_pop("global_product")
1004
1005 z(1:na1) = sign( sqrt( abs( z(1:na1) ) ), z1(1:na1) )
1006
1007 nvtx_range_push("global_gather_x2")
1008 call global_gather_&
1009 &precision&
1010 &(obj, dbase, na1, mpi_comm_rows, mpi_comm_cols_self, npc_n, np_prev, np_next, success)
1011 if (.not.(success)) then
1012 write(error_unit,*) "Error in global_gather. Aborting..."
1013 return
1014 endif
1015 call global_gather_&
1016 &precision&
1017 &(obj, ddiff, na1, mpi_comm_rows, mpi_comm_cols_self, npc_n, np_prev, np_next, success)
1018 if (.not.(success)) then
1019 write(error_unit,*) "Error in global_gather. Aborting..."
1020 return
1021 endif
1022 nvtx_range_pop("global_gather_x2")
1023
1024 d(1:na1) = dbase(1:na1) - ddiff(1:na1)
1025
1026 ! Calculate scale factors for eigenvectors
1027 if (usegpu) then
1028 num = na * size_of_datatype
1029#ifdef WITH_GPU_STREAMS
1030 successgpu = gpu_memset_async(ev_scale_dev, 0, num, my_stream)
1031#else
1032 successgpu = gpu_memset(ev_scale_dev, 0, num)
1033#endif
1034 check_memcpy_gpu("merge_systems: memset ev_scale_dev", successgpu)
1035 else ! useGPU
1036 ev_scale(:) = 0.0_rk
1037 endif ! useGPU
1038
1039
1040 nvtx_range_push("add_tmp_loop")
1041 if (wantdebug) call obj%timer%start("add_tmp_loop")
1042
1043 if (usegpu) then
1044 ! data transfer to GPU
1045 num = na * size_of_datatype
1046#ifdef WITH_GPU_STREAMS
1047 successgpu = gpu_memcpy_async(d1_dev, int(loc(d1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1048 check_memcpy_gpu("merge_systems: d1_dev", successgpu)
1049
1050 successgpu = gpu_memcpy_async(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1051 check_memcpy_gpu("merge_systems: dbase_dev", successgpu)
1052
1053 successgpu = gpu_memcpy_async(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1054 check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)
1055
1056 successgpu = gpu_memcpy_async(z_dev, int(loc(z(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1057 check_memcpy_gpu("merge_systems: z_dev", successgpu)
1058#else
1059 successgpu = gpu_memcpy(d1_dev, int(loc(d1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1060 check_memcpy_gpu("merge_systems: d1_dev", successgpu)
1061
1062 successgpu = gpu_memcpy(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1063 check_memcpy_gpu("merge_systems: dbase_dev", successgpu)
1064
1065 successgpu = gpu_memcpy(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1066 check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)
1067
1068 successgpu = gpu_memcpy(z_dev, int(loc(z(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1069 check_memcpy_gpu("merge_systems: z_dev", successgpu)
1070#endif
1071
1072 call gpu_add_tmp_loop(precision_char, d1_dev, dbase_dev, ddiff_dev, z_dev, ev_scale_dev, ztmp_extended_dev, &
1073 na1, my_proc, n_procs, sm_count, debug, my_stream)
1074
1075 successgpu = gpu_free(ztmp_extended_dev)
1076 check_dealloc_gpu("merge_systems: ztmp_extended_dev", successgpu)
1077
1078 ! data transfer back to CPU
1079 num = na * size_of_datatype
1080#ifdef WITH_GPU_STREAMS
1081 successgpu = gpu_memcpy_async(int(loc(d1(1)),kind=c_intptr_t), d1_dev, num, gpumemcpydevicetohost, my_stream)
1082 check_memcpy_gpu("merge_systems: d1_dev", successgpu)
1083
1084 successgpu = gpu_memcpy_async(int(loc(dbase(1)),kind=c_intptr_t), dbase_dev, num, gpumemcpydevicetohost, my_stream)
1085 check_memcpy_gpu("merge_systems: dbase_dev", successgpu)
1086
1087 successgpu = gpu_memcpy_async(int(loc(ddiff(1)),kind=c_intptr_t), ddiff_dev, num, gpumemcpydevicetohost, my_stream)
1088 check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)
1089
1090 successgpu = gpu_memcpy_async(int(loc(z1(1)),kind=c_intptr_t), z1_dev, num, gpumemcpydevicetohost, my_stream)
1091 check_memcpy_gpu("merge_systems: z1_dev", successgpu)
1092
1093 successgpu = gpu_memcpy_async(int(loc(ev_scale(1)),kind=c_intptr_t), ev_scale_dev, num, gpumemcpydevicetohost, my_stream)
1094 check_memcpy_gpu("merge_systems: ev_scale_dev", successgpu)
1095#else
1096 successgpu = gpu_memcpy(int(loc(d1(1)),kind=c_intptr_t), d1_dev, num, gpumemcpydevicetohost)
1097 check_memcpy_gpu("merge_systems: d1_dev", successgpu)
1098
1099 successgpu = gpu_memcpy(int(loc(dbase(1)),kind=c_intptr_t), dbase_dev, num, gpumemcpydevicetohost)
1100 check_memcpy_gpu("merge_systems: dbase_dev", successgpu)
1101
1102 successgpu = gpu_memcpy(int(loc(ddiff(1)),kind=c_intptr_t), ddiff_dev, num, gpumemcpydevicetohost)
1103 check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)
1104
1105 successgpu = gpu_memcpy(int(loc(z1(1)),kind=c_intptr_t), z1_dev, num, gpumemcpydevicetohost)
1106 check_memcpy_gpu("merge_systems: z1_dev", successgpu)
1107
1108 successgpu = gpu_memcpy(int(loc(ev_scale(1)),kind=c_intptr_t), ev_scale_dev, num, gpumemcpydevicetohost)
1109 check_memcpy_gpu("merge_systems: ev_scale_dev", successgpu)
1110#endif
1111 else
1112#ifdef WITH_OPENMP_TRADITIONAL
1113 call obj%timer%start("OpenMP parallel" // precision_suffix)
1114
1115!$omp PARALLEL DO &
1116!$omp default(none) &
1117!$omp private(i) &
1118!$omp SHARED(na1, my_proc, n_procs, &
1119!$OMP d1, dbase, ddiff, z, ev_scale, obj)
1120#endif
1121 do i = my_proc+1, na1, n_procs ! work distributed over all processors
1122
1123 ! tmp(1:na1) = z(1:na1) / delta(1:na1,i) ! original code
1124 ! tmp(1:na1) = z(1:na1) / (d1(1:na1)-d(i))! bad results
1125
1126 ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i)
1127 ! in exactly this order, but we want to prevent compiler optimization
1128 ! ev_scale_val = ev_scale(i)
1129 call add_tmp_&
1130 &precision&
1131 &(obj, d1, dbase, ddiff, z, ev_scale(i), na1, i)
1132 ! ev_scale(i) = ev_scale_val
1133 enddo
1134#ifdef WITH_OPENMP_TRADITIONAL
1135!$OMP END PARALLEL DO
1136
1137 call obj%timer%stop("OpenMP parallel" // precision_suffix)
1138#endif
1139 endif ! useGPU
1140
1141 if (wantdebug) call obj%timer%stop("add_tmp_loop")
1142 nvtx_range_pop("add_tmp_loop")
1143
1144
1145 nvtx_range_push("global_gather")
1146 call global_gather_&
1147 &precision&
1148 &(obj, ev_scale, na1, mpi_comm_rows, mpi_comm_cols_self, npc_n, np_prev, np_next, success)
1149 if (.not.(success)) then
1150 write(error_unit,*) "Error in global_gather. Aborting..."
1151 return
1152 endif
1153 nvtx_range_pop("global_gather")
1154
1155 ! Add the deflated eigenvalues
1156 d(na1+1:na) = d2(1:na2)
1157
1158 call obj%timer%start("lapack_lamrg")
1159 nvtx_range_push("lapack_lamrg_3")
1160 ! Calculate arrangement of all eigenvalues in output
1161 call precision_lamrg(int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &
1162 1_blas_kind, 1_blas_kind, idxblas )
1163 nvtx_range_pop("lapack_lamrg_3")
1164 idx(:) = int(idxblas(:),kind=ik)
1165 call obj%timer%stop("lapack_lamrg")
1166
1167 ! Rearrange eigenvalues
1168 tmp = d
1169 do i=1,na
1170 d(i) = tmp(idx(i))
1171 enddo
1172 call check_monotony_&
1173 &precision&
1174 &(obj, na,d,'Output', wantdebug, success)
1175
1176 if (.not.(success)) then
1177 call obj%timer%stop("merge_systems" // precision_suffix)
1178 write(error_unit,*) "Error in check_monotony. Aborting..."
1179 return
1180 endif
1181 ! Eigenvector calculations
1182 if (usegpu) then
1183 num = 2 * size_of_int
1184 successgpu = gpu_malloc(nnzul_dev, num) ! packs together nnzu and nnzl
1185 check_alloc_gpu("merge_systems: ", successgpu)
1186
1187 num = na * size_of_int
1188 successgpu = gpu_malloc(idxq1_dev, num)
1189 check_alloc_gpu("merge_systems: ", successgpu)
1190
1191 num = na * size_of_int
1192 successgpu = gpu_malloc(idx_dev, num)
1193 check_alloc_gpu("merge_systems: idx_dev", successgpu)
1194
1195 num = na * size_of_int
1196#ifdef WITH_GPU_STREAMS
1197 my_stream = obj%gpu_setup%my_stream
1198 successgpu = gpu_memcpy_async(idx_dev, int(loc(idx(1)),kind=c_intptr_t), &
1199 num, gpumemcpyhosttodevice, my_stream)
1200 check_memcpy_gpu("merge_systems: ", successgpu)
1201#else
1202 successgpu = gpu_memcpy(idx_dev, int(loc(idx(1)),kind=c_intptr_t), &
1203 num, gpumemcpyhosttodevice)
1204 check_memcpy_gpu("merge_systems: idx_dev", successgpu)
1205#endif
1206 endif
1207
1208 ! Calculate the number of columns in the new local matrix Q
1209 ! which are updated from non-deflated/deflated eigenvectors.
1210 ! idxq1/2 stores the global column numbers.
1211
1212 !if (useGPU) then
1213
1214
1215
1216 ! !nqcols1 is needed later on host !!
1217 ! ! memcopy back needed!!
1218 !else
1219 nqcols1 = 0 ! number of non-deflated eigenvectors
1220 !nqcols2 = 0 ! number of deflated eigenvectors
1221 nvtx_range_push("loop_idxq1")
1222 DO i = 1, na
1223 if (p_col_out(i)==my_pcol) then
1224 if (idx(i)<=na1) then
1225 nqcols1 = nqcols1+1
1226 idxq1(nqcols1) = i
1227 !else
1228 !nqcols2 = nqcols2+1
1229 !idxq2(nqcols2) = i
1230 endif
1231 endif
1232 enddo
1233 nvtx_range_pop("loop_idxq1")
1234 !endif
1235
1236 if (usegpu) then
1237 num = na * size_of_int
1238#ifdef WITH_GPU_STREAMS
1239 my_stream = obj%gpu_setup%my_stream
1240 successgpu = gpu_memcpy_async(idxq1_dev, int(loc(idxq1(1)),kind=c_intptr_t), &
1241 num, gpumemcpyhosttodevice, my_stream)
1242 check_memcpy_gpu("merge_systems: ", successgpu)
1243#else
1244 successgpu = gpu_memcpy(idxq1_dev, int(loc(idxq1(1)),kind=c_intptr_t), &
1245 num, gpumemcpyhosttodevice)
1246 check_memcpy_gpu("merge_systems: idxq1_dev", successgpu)
1247#endif
1248 endif
1249
1250
1251 if (usegpu) then
1252 allocate(ndef_c(na), stat=istat, errmsg=errormessage)
1253 check_allocate("merge_systems: ndef_c",istat, errormessage)
1254 endif
1255
1256
1257 gemm_dim_k = max(1,l_rows)
1258 gemm_dim_l = max_local_cols
1259 gemm_dim_m = min(max_strip, max(1,nqcols1))
1260
1261 if (.not. useccl) then
1262 allocate(qtmp1(gemm_dim_k, gemm_dim_l), stat=istat, errmsg=errormessage)
1263 check_allocate("merge_systems: qtmp1",istat, errormessage)
1264
1265 allocate(ev(gemm_dim_l,gemm_dim_m), stat=istat, errmsg=errormessage)
1266 check_allocate("merge_systems: ev",istat, errormessage)
1267
1268 allocate(qtmp2(gemm_dim_k, gemm_dim_m), stat=istat, errmsg=errormessage)
1269 check_allocate("merge_systems: qtmp2",istat, errormessage)
1270 endif
1271
1272 if (usegpu) then
1273 num = (gemm_dim_k * gemm_dim_l) * size_of_datatype
1274 successgpu = gpu_malloc(qtmp1_dev, num)
1275 check_alloc_gpu("merge_systems: qtmp1_dev", successgpu)
1276
1277 num = (gemm_dim_k * gemm_dim_l) * size_of_datatype
1278 successgpu = gpu_malloc(qtmp1_tmp_dev, num)
1279 check_alloc_gpu("merge_systems: qtmp1_tmp_dev", successgpu)
1280
1281 num = (gemm_dim_l * gemm_dim_m) * size_of_datatype
1282 successgpu = gpu_malloc(ev_dev, num)
1283 check_alloc_gpu("merge_systems: ev_dev", successgpu)
1284
1285 num = (gemm_dim_k * gemm_dim_m) * size_of_datatype
1286 successgpu = gpu_malloc(qtmp2_dev, num)
1287 check_alloc_gpu("merge_systems: qtmp2_dev", successgpu)
1288
1289 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
1290 if (wantdebug) call obj%timer%start("gpu_host_register")
1291
1292 if (.not. useccl) then
1293 num = (gemm_dim_k * gemm_dim_l) * size_of_datatype
1294 nvtx_range_push("gpu_host_register_qtmp1")
1295 successgpu = gpu_host_register(int(loc(qtmp1),kind=c_intptr_t), num, gpuhostregisterdefault)
1296 check_host_register_gpu("merge_systems: qtmp1", successgpu)
1297 nvtx_range_pop("gpu_host_register_qtmp1")
1298
1299 num = (gemm_dim_l * gemm_dim_m) * size_of_datatype
1300 successgpu = gpu_host_register(int(loc(ev),kind=c_intptr_t), num, gpuhostregisterdefault)
1301 check_host_register_gpu("merge_systems: ev", successgpu)
1302
1303 num = (gemm_dim_k * gemm_dim_m) * size_of_datatype
1304 successgpu = gpu_host_register(int(loc(qtmp2),kind=c_intptr_t), num, gpuhostregisterdefault)
1305 check_host_register_gpu("merge_systems: qtmp2", successgpu)
1306 endif
1307
1308 if (wantdebug) then
1309 successgpu = gpu_devicesynchronize()
1310 call obj%timer%stop("gpu_host_register")
1311 endif
1312 endif
1313 endif ! useGPU
1314
1315 if (usegpu) then
1316 num = gemm_dim_k * gemm_dim_l * size_of_datatype
1317#ifdef WITH_GPU_STREAMS
1318 successgpu = gpu_memset_async(qtmp1_dev, 0, num, my_stream)
1319#else
1320 successgpu = gpu_memset(qtmp1_dev, 0, num)
1321#endif
1322 check_memcpy_gpu("merge_systems: memset qtmp1_dev", successgpu)
1323 else
1324 nvtx_range_push("set_qtmp1_qtmp2_0")
1325 call obj%timer%start("set_qtmp1_qtmp2_0")
1326 qtmp1 = 0 ! May contain empty (unset) parts
1327 qtmp2 = 0 ! Not really needed
1328 call obj%timer%stop("set_qtmp1_qtmp2_0")
1329 nvtx_range_pop("set_qtmp1_qtmp2_0")
1330 endif
1331
1332 ! Gather nonzero upper/lower components of old matrix Q
1333 ! which are needed for multiplication with new eigenvectors
1334
1335 ! kernel compute nnzu on device
1336 if (usegpu) then
1337 ! data transfer to GPU
1338 num = na * size_of_int
1339#ifdef WITH_GPU_STREAMS
1340 successgpu = gpu_memcpy_async(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1341 check_memcpy_gpu("merge_systems: idx1_dev", successgpu)
1342
1343 successgpu = gpu_memcpy_async(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1344 check_memcpy_gpu("merge_systems: coltyp_dev", successgpu)
1345#else
1346 successgpu = gpu_memcpy(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1347 check_memcpy_gpu("merge_systems: idx1_dev", successgpu)
1348
1349 successgpu = gpu_memcpy(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1350 check_memcpy_gpu("merge_systems: coltyp_dev", successgpu)
1351#endif
1352
1353 nvtx_range_push("gpu_copy_qtmp1_q_compute_nnzu_nnzl_kernel")
1354 if (wantdebug) call obj%timer%start("gpu_copy_qtmp1_q_compute_nnzu_nnzl_kernel")
1355
1356 call gpu_copy_qtmp1_q_compute_nnzu_nnzl(precision_char, qtmp1_dev, q_dev, &
1357 p_col_dev, l_col_dev, idx1_dev, coltyp_dev, nnzul_dev, &
1358 na1, l_rnm, l_rqs, l_rqm, l_rows, my_pcol, gemm_dim_k, matrixrows, &
1359 sm_count, debug, my_stream)
1360
1361 if (wantdebug) call obj%timer%stop("gpu_copy_qtmp1_q_compute_nnzu_nnzl_kernel")
1362 nvtx_range_pop("gpu_copy_qtmp1_q_compute_nnzu_nnzl_kernel")
1363
1364 ! num = 2 * size_of_int
1365 ! successGPU = gpu_memcpy(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, num, gpuMemcpyDeviceToHost)
1366 ! check_memcpy_gpu("merge_systems: nnzul_dev", successGPU)
1367
1368 num = 2 * size_of_int
1369#ifdef WITH_GPU_STREAMS
1370 successgpu = gpu_memcpy_async(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, num, gpumemcpydevicetohost, my_stream)
1371#else
1372 successgpu = gpu_memcpy(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, num, gpumemcpydevicetohost)
1373#endif
1374 check_memcpy_gpu("merge_systems: nnzul_dev", successgpu)
1375
1376 nnzu = nnzul(1)
1377 nnzl = nnzul(2)
1378 else ! useGPU
1379 nvtx_range_push("loop_compute_nnzu")
1380 if (wantdebug) call obj%timer%start("loop_compute_nnzu")
1381 nnzu = 0
1382 nnzl = 0
1383 do i = 1, na1
1384 if (p_col(idx1(i))==my_pcol) then
1385 l_idx = l_col(idx1(i))
1386
1387 if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then
1388 nnzu = nnzu+1
1389 qtmp1(1:l_rnm,nnzu) = q(l_rqs:l_rqm,l_idx)
1390 endif
1391
1392 if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then
1393 nnzl = nnzl+1
1394 qtmp1(l_rnm+1:l_rows,nnzl) = q(l_rqm+1:l_rqe,l_idx)
1395 endif
1396 endif
1397 enddo
1398 if (wantdebug) call obj%timer%stop("loop_compute_nnzu")
1399 nvtx_range_pop("loop_compute_nnzu")
1400 endif ! useGPU
1401
1402 if (usegpu) then
1403 call obj%timer%start("gpu_memcpy")
1404 num = na * size_of_int
1405#ifdef WITH_GPU_STREAMS
1406 my_stream = obj%gpu_setup%my_stream
1407 successgpu = gpu_memcpy_async(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1408 check_memcpy_gpu("merge_systems: ", successgpu)
1409#else
1410 successgpu = gpu_memcpy(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1411 check_memcpy_gpu("merge_systems: l_col_dev", successgpu)
1412#endif
1413 call obj%timer%stop("gpu_memcpy")
1414 endif
1415
1416 ! Gather deflated eigenvalues behind nonzero components
1417
1418 ! compute ndef on device
1419 ndef = max(nnzu,nnzl)
1420
1421 if (usegpu) then
1422 num = na * size_of_int
1423#ifdef WITH_GPU_STREAMS
1424 successgpu = gpu_memcpy_async(idx2_dev, int(loc(idx2(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1425 check_memcpy_gpu("merge_systems: ", successgpu)
1426
1427 successgpu = gpu_memcpy_async(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1428 check_memcpy_gpu("merge_systems: ", successgpu)
1429#else
1430 successgpu = gpu_memcpy(idx2_dev, int(loc(idx2(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1431 check_memcpy_gpu("merge_systems: idx2_dev", successgpu)
1432
1433 successgpu = gpu_memcpy(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1434 check_memcpy_gpu("merge_systems: p_col_dev", successgpu)
1435#endif
1436 endif
1437
1438
1439 if (usegpu) then
1440 ndef_c(:) = ndef
1441
1442 num = na * size_of_int
1443#ifdef WITH_GPU_STREAMS
1444 successgpu = gpu_memcpy_async(ndef_c_dev, int(loc(ndef_c(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1445 check_memcpy_gpu("merge_systems: ndef_c_dev 4", successgpu)
1446#else
1447 successgpu = gpu_memcpy(ndef_c_dev, int(loc(ndef_c(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1448 check_memcpy_gpu("merge_systems: ndef_c_dev", successgpu)
1449#endif
1450
1451 call gpu_copy_q_slice_to_qtmp1 (precision_char, qtmp1_dev, q_dev, ndef_c_dev, l_col_dev, idx2_dev, p_col_dev, &
1452 na2, na, my_pcol, l_rows, l_rqs, l_rqe, matrixrows, gemm_dim_k, debug, my_stream)
1453 else
1454 do i = 1, na2
1455 l_idx = l_col(idx2(i))
1456 if (p_col(idx2(i))==my_pcol) then
1457 ndef = ndef+1
1458 qtmp1(1:l_rows,ndef) = q(l_rqs:l_rqe,l_idx)
1459 endif
1460 enddo
1461 endif
1462
1463 l_cols_qreorg = ndef ! Number of columns in reorganized matrix
1464 if (usegpu) then
1465 num = na * size_of_int
1466#ifdef WITH_GPU_STREAMS
1467 successgpu = gpu_memcpy_async(p_col_out_dev, int(loc(p_col_out(1)),kind=c_intptr_t), &
1468 num, gpumemcpyhosttodevice, my_stream)
1469 check_memcpy_gpu("merge_systems: ", successgpu)
1470
1471 successgpu = gpu_memcpy_async(l_col_out_dev, int(loc(l_col_out(1)),kind=c_intptr_t), &
1472 num, gpumemcpyhosttodevice, my_stream)
1473 check_memcpy_gpu("merge_systems: l_col_out_dev", successgpu)
1474#else
1475 successgpu = gpu_memcpy(p_col_out_dev, int(loc(p_col_out(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1476 check_memcpy_gpu("merge_systems: p_col_out_dev", successgpu)
1477
1478 successgpu = gpu_memcpy(l_col_out_dev, int(loc(l_col_out(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1479 check_memcpy_gpu("merge_systems: l_col_out_dev", successgpu)
1480#endif
1481 endif
1482
1483
1484 ! Set (output) Q to 0, it will sum up new Q
1485
1486
1487 if (usegpu) then
1488 call gpu_zero_q(precision_char, q_dev, p_col_out_dev, l_col_out_dev, &
1489 na, my_pcol, l_rqs, l_rqe, matrixrows, debug, my_stream)
1490 else
1491 DO i = 1, na
1492 if(p_col_out(i)==my_pcol) q(l_rqs:l_rqe,l_col_out(i)) = 0
1493 enddo
1494 endif
1495
1496 ! check memory copies
1497
1498 if (usegpu) then
1499#ifdef WITH_GPU_STREAMS
1500 num = na * size_of_int
1501 successgpu = gpu_memcpy_async(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1502 check_memcpy_gpu("merge_systems: idx1_dev", successgpu)
1503
1504 successgpu = gpu_memcpy_async(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1505 check_memcpy_gpu("merge_systems: p_col_dev", successgpu)
1506
1507 successgpu = gpu_memcpy_async(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1508 check_memcpy_gpu("merge_systems: coltyp_dev", successgpu)
1509
1510 num = na * size_of_datatype
1511 successgpu = gpu_memcpy_async(ev_scale_dev, int(loc(ev_scale(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1512 check_memcpy_gpu("merge_systems: ev_scale_dev", successgpu)
1513
1514 successgpu = gpu_memcpy_async(z_dev, int(loc(z(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1515 check_memcpy_gpu("merge_systems: z_dev", successgpu)
1516
1517 successgpu = gpu_memcpy_async(d1_dev, int(loc(d1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1518 check_memcpy_gpu("merge_systems: d1_dev", successgpu)
1519
1520 successgpu = gpu_memcpy_async(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1521 check_memcpy_gpu("merge_systems: dbase_dev", successgpu)
1522
1523 successgpu = gpu_memcpy_async(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1524 check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)
1525#else
1526 num = na * size_of_int
1527 successgpu = gpu_memcpy(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1528 check_memcpy_gpu("merge_systems: idx1_dev", successgpu)
1529
1530 successgpu = gpu_memcpy(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1531 check_memcpy_gpu("merge_systems: p_col_dev", successgpu)
1532
1533 successgpu = gpu_memcpy(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1534 check_memcpy_gpu("merge_systems: coltyp_dev", successgpu)
1535
1536 num = na * size_of_datatype
1537 successgpu = gpu_memcpy(ev_scale_dev, int(loc(ev_scale(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1538 check_memcpy_gpu("merge_systems: ev_scale_dev", successgpu)
1539
1540 successgpu = gpu_memcpy(z_dev, int(loc(z(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1541 check_memcpy_gpu("merge_systems: z_dev", successgpu)
1542
1543 successgpu = gpu_memcpy(d1_dev, int(loc(d1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1544 check_memcpy_gpu("merge_systems: d1_dev", successgpu)
1545
1546 successgpu = gpu_memcpy(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1547 check_memcpy_gpu("merge_systems: dbase_dev", successgpu)
1548
1549 successgpu = gpu_memcpy(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1550 check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)
1551#endif
1552 endif
1553
1554 allocate(nnzu_val(na1,npc_n))
1555 allocate(nnzl_val(na1,npc_n))
1556
1557 nnzu_val(:,:) = 0
1558 nnzl_val(:,:) = 0
1559
1560 if (usegpu) then
1561 num = na1 * npc_n* size_of_int
1562 successgpu = gpu_malloc(nnzu_val_dev, num)
1563 check_alloc_gpu("merge_systems: nnzu_val_dev", successgpu)
1564
1565 num = na1 * npc_n* size_of_int
1566 successgpu = gpu_malloc(nnzl_val_dev, num)
1567 check_alloc_gpu("merge_systems: nnzl_val_dev", successgpu)
1568 endif
1569
1570
1571 np_rem = my_pcol
1572 if (usegpu) then
1573 do np = 1, npc_n
1574 if (np > 1) then
1575 if (np_rem == npc_0) then
1576 np_rem = npc_0+npc_n-1
1577 else
1578 np_rem = np_rem-1
1579 endif
1580 endif
1581 nnzu = 0
1582 nnzl = 0
1583
1584 call gpu_compute_nnzl_nnzu_val_part1 (p_col_dev, idx1_dev, coltyp_dev, nnzu_val_dev, nnzl_val_dev, &
1585 na, na1, np_rem, npc_n, nnzu_save, nnzl_save, np, debug, my_stream)
1586
1587 enddo ! np = 1, npc_n
1588
1589 nnzu_start = 0
1590 nnzl_start = 0
1591
1592 call gpu_compute_nnzl_nnzu_val_part2 (nnzu_val_dev, nnzl_val_dev, na, na1, nnzu_start, nnzl_start, npc_n, &
1593 debug, my_stream)
1594 else
1595 ! precompute nnzu_val, nnzl_val
1596 do np = 1, npc_n
1597 if (np > 1) then
1598 if (np_rem == npc_0) then
1599 np_rem = npc_0+npc_n-1
1600 else
1601 np_rem = np_rem-1
1602 endif
1603 endif
1604 nnzu = 0
1605 nnzl = 0
1606 do i=1,na1
1607 if (p_col(idx1(i)) == np_rem) then
1608 if (coltyp(idx1(i)) == 1 .or. coltyp(idx1(i)) == 2) then
1609 nnzu = nnzu+1
1610 nnzu_val(i,np) = nnzu
1611 endif
1612 if (coltyp(idx1(i)) == 3 .or. coltyp(idx1(i)) == 2) then
1613 nnzl = nnzl+1
1614 nnzl_val(i,np) = nnzl
1615 endif
1616 endif
1617 enddo
1618 enddo ! np = 1, npc_n
1619 endif
1620
1621 np_rem = my_pcol
1622
1623 ! is nnzu updated in main loop
1624
1625 ! main loop
1626 nvtx_range_push("main_loop")
1627 do np = 1, npc_n
1628 nvtx_range_push("np=1,npc_n")
1629 ! Do a ring send of qtmp1
1630 if (np > 1) then
1631
1632 if (np_rem == npc_0) then
1633 np_rem = npc_0+npc_n-1
1634 else
1635 np_rem = np_rem-1
1636 endif
1637
1638 if (usegpu) then
1639 if (useccl) then
1640 my_stream = obj%gpu_setup%my_stream
1641 call gpu_copy_qtmp1_to_qtmp1_tmp (precision_char, qtmp1_dev, qtmp1_tmp_dev, gemm_dim_k, gemm_dim_l, &
1642 debug, my_stream)
1643
1644 call obj%timer%start("ccl_send_recv")
1645 successgpu = ccl_group_start()
1646 if (.not.successgpu) then
1647 print *,"Error in setting up ccl_group_start!"
1648 stop 1
1649 endif
1650
1651 successgpu = ccl_send(qtmp1_tmp_dev, int(l_rows*max_local_cols,kind=c_size_t), &
1652 ccldatatype, np_next, ccl_comm_cols, my_stream)
1653
1654 if (.not.successgpu) then
1655 print *,"Error in ccl_send"
1656 stop 1
1657 endif
1658
1659 successgpu = ccl_recv(qtmp1_dev, int(l_rows*max_local_cols,kind=c_size_t), &
1660 ccldatatype, np_prev, ccl_comm_cols, my_stream)
1661
1662
1663 if (.not.successgpu) then
1664 print *,"Error in ccl_recv"
1665 stop 1
1666 endif
1667
1668 successgpu = ccl_group_end()
1669
1670 if (.not.successgpu) then
1671 print *,"Error in setting up ccl_group_end!"
1672 stop 1
1673 endif
1674 successgpu = gpu_stream_synchronize(my_stream)
1675 check_stream_synchronize_gpu("trans_ev", successgpu)
1676 call obj%timer%stop("ccl_send_recv")
1677 else ! useCCL
1678#ifdef WITH_MPI
1679 call obj%timer%start("mpi_communication")
1680#ifdef WITH_GPU_STREAMS
1681 my_stream = obj%gpu_setup%my_stream
1682 successgpu = gpu_stream_synchronize(my_stream)
1683 check_stream_synchronize_gpu("merge_systems qtmp1_dev", successgpu)
1684
1685 successgpu = gpu_memcpy_async(int(loc(qtmp1(1,1)),kind=c_intptr_t), qtmp1_dev, &
1686 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpydevicetohost, my_stream)
1687 check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
1688
1689 my_stream = obj%gpu_setup%my_stream
1690 successgpu = gpu_stream_synchronize(my_stream)
1691 check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)
1692 ! synchronize streamsPerThread; maybe not neccessary
1693 successgpu = gpu_stream_synchronize()
1694 check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)
1695
1696#else
1697 successgpu = gpu_memcpy(int(loc(qtmp1(1,1)),kind=c_intptr_t), qtmp1_dev, &
1698 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpydevicetohost)
1699 check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
1700#endif
1701
1702
1703 call mpi_sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=mpi_kind), mpi_real_precision, &
1704 int(np_next,kind=mpi_kind), 1111_mpi_kind, int(np_prev,kind=mpi_kind), &
1705 1111_mpi_kind, int(mpi_comm_cols_self,kind=mpi_kind), mpi_status_ignore, mpierr)
1706#ifdef WITH_GPU_STREAMS
1707 my_stream = obj%gpu_setup%my_stream
1708 successgpu = gpu_stream_synchronize(my_stream)
1709 check_stream_synchronize_gpu("merge_systems qtmp1_dev", successgpu)
1710
1711 successgpu = gpu_memcpy_async(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
1712 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpyhosttodevice, my_stream)
1713 check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
1714
1715 my_stream = obj%gpu_setup%my_stream
1716 successgpu = gpu_stream_synchronize(my_stream)
1717 check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)
1718 ! synchronize streamsPerThread; maybe not neccessary
1719 successgpu = gpu_stream_synchronize()
1720 check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)
1721
1722#else
1723 successgpu = gpu_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
1724 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpyhosttodevice)
1725 check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
1726#endif
1727 call obj%timer%stop("mpi_communication")
1728#endif /* WITH_MPI */
1729
1730 endif ! useCCL
1731 else ! useGPU
1732
1733#ifdef WITH_MPI
1734 call obj%timer%start("mpi_communication")
1735 call mpi_sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=mpi_kind), mpi_real_precision, &
1736 int(np_next,kind=mpi_kind), 1111_mpi_kind, int(np_prev,kind=mpi_kind), &
1737 1111_mpi_kind, int(mpi_comm_cols_self,kind=mpi_kind), mpi_status_ignore, mpierr)
1738 call obj%timer%stop("mpi_communication")
1739#endif /* WITH_MPI */
1740
1741 endif ! useGPU
1742
1743 endif ! (np > 1) then
1744
1745 ! Gather the parts in d1 and z which are fitting to qtmp1.
1746 ! This also delivers nnzu/nnzl for proc np_rem
1747 nnzu = 0
1748 nnzl = 0
1749 if (usegpu) then
1750
1751 nvtx_range_push("gpu_fill_tmp_arrays")
1752 call gpu_fill_tmp_arrays (precision_char, d1u_dev, d1_dev, zu_dev, z_dev, d1l_dev, zl_dev, &
1753 idx1_dev, p_col_dev, coltyp_dev, nnzu_val_dev, nnzl_val_dev, nnzul_dev, &
1754 na, np, na1, np_rem, debug, my_stream)
1755 if (wantdebug) successgpu = gpu_devicesynchronize()
1756 nvtx_range_pop("gpu_fill_tmp_arrays")
1757
1758 num = 2* size_of_int
1759#ifdef WITH_GPU_STREAMS
1760 successgpu = gpu_memcpy_async(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, num, gpumemcpydevicetohost, my_stream)
1761 check_memcpy_gpu("merge_systems: nnzul_dev", successgpu)
1762#else
1763 successgpu = gpu_memcpy(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, num, gpumemcpydevicetohost)
1764 check_memcpy_gpu("merge_systems: nnzl_val", successgpu)
1765#endif
1766 nnzu = nnzul(1)
1767 nnzl = nnzul(2)
1768
1769 else ! useGPU
1770 do i=1,na1
1771 if (p_col(idx1(i)) == np_rem) then
1772 if (coltyp(idx1(i)) == 1 .or. coltyp(idx1(i)) == 2) then
1773 nnzu = nnzu+1
1774 d1u(nnzu) = d1(i)
1775 zu(nnzu) = z(i)
1776 endif
1777 if (coltyp(idx1(i)) == 3 .or. coltyp(idx1(i)) == 2) then
1778 nnzl = nnzl+1
1779 d1l(nnzl) = d1(i)
1780 zl(nnzl) = z(i)
1781 endif
1782 endif
1783 enddo
1784 endif ! useGPU
1785
1786 ! Set the deflated eigenvectors in Q (comming from proc np_rem)
1787
1788
1789 ndef = max(nnzu,nnzl) ! Remote counter in input matrix
1790 if (usegpu) then
1791 ! PETERDEBUG: idx2_dev, potential problem with garbage values?
1792 call gpu_update_ndef_c(ndef_c_dev, idx_dev, p_col_dev, idx2_dev, na, na1, np_rem, ndef, debug, my_stream)
1793
1794 endif ! useGPU
1795
1796 ndef = max(nnzu,nnzl) ! Remote counter in input matrix
1797 if (usegpu) then
1798 call gpu_copy_qtmp1_slice_to_q (precision_char, q_dev, qtmp1_dev, &
1799 l_col_out_dev, p_col_out_dev, ndef_c_dev, p_col_dev, idx2_dev, idx_dev, &
1800 l_rqs, l_rqe, l_rows, matrixrows, gemm_dim_k, my_pcol, na1, np_rem, na, &
1801 debug, my_stream)
1802 else ! ! useGPU
1803 ndef = max(nnzu,nnzl) ! Remote counter in input matrix
1804 do i = 1, na
1805 j = idx(i)
1806 if (j>na1) then
1807 if (p_col(idx2(j-na1)) == np_rem) then
1808 ndef = ndef+1
1809 if (p_col_out(i) == my_pcol) then
1810 q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef)
1811 endif
1812 endif
1813 endif
1814 enddo
1815
1816 endif ! useGPU
1817
1818
1819 do ns = 0, nqcols1-1, max_strip ! "strimining" (strip mining) loop
1820 nvtx_range_push("ns=0,nqcols1-1,max_strip")
1821 ncnt = min(max_strip,nqcols1-ns) ! number of columns in this strip
1822
1823 ! Get partial result from (output) Q
1824 if (usegpu) then
1825 call gpu_copy_q_slice_to_qtmp2 (precision_char, q_dev, qtmp2_dev, idxq1_dev, l_col_out_dev, &
1826 l_rows, l_rqs, l_rqe, matrixrows, matrixcols, &
1827 gemm_dim_k, gemm_dim_m, ns, ncnt, ind_ex, ind_ex2, na, debug, my_stream)
1828 else ! useGPU
1829!$omp PARALLEL DO &
1830!$omp default(none) &
1831!$omp private(i, j, k) &
1832!$omp SHARED(ns, q, l_rqs, l_rqe, l_col_out, idxq1, qtmp2, l_rows, ncnt)
1833 do i = 1, ncnt
1834 j = idxq1(i+ns)
1835 k = l_col_out(j)
1836 qtmp2(1:l_rows,i) = q(l_rqs:l_rqe, k)
1837 enddo
1838!$OMP END PARALLEL DO
1839 endif ! useGPU
1840
1841 ! Compute eigenvectors of the rank-1 modified matrix.
1842 ! Parts for multiplying with upper half of Q:
1843 if (usegpu) then
1844 if (nnzu .ge. 1) then
1845 ! Calculate the j-th eigenvector of the deflated system
1846 ! See above why we are doing it this way!
1847 call gpu_fill_ev (precision_char, ev_dev, d1u_dev, dbase_dev, ddiff_dev, zu_dev, ev_scale_dev, idxq1_dev, idx_dev,&
1848 na, gemm_dim_l, gemm_dim_m, nnzu, ns, ncnt, debug, my_stream)
1849 endif ! nnzu
1850
1851 else ! useGPU
1852!$omp PARALLEL DO &
1853!$omp default(none) &
1854!$omp private(i, j, k, tmp) &
1855!$omp shared(ncnt, nnzu, idx, idxq1, ns, d1u, dbase, ddiff, zu, ev_scale, ev)
1856 do i = 1, ncnt
1857 do k = 1, nnzu
1858 j = idx(idxq1(i+ns))
1859 ! Calculate the j-th eigenvector of the deflated system
1860 ! See above why we are doing it this way!
1861
1862 ! kernel here
1863 tmp(k) = d1u(k) - dbase(j)
1864 tmp(k) = tmp(k) + ddiff(j)
1865 ev(k,i) = zu(k) / tmp(k) * ev_scale(j)
1866 enddo
1867 enddo
1868!$OMP END PARALLEL DO
1869 endif ! useGPU
1870
1871
1872 ! Multiply old Q with eigenvectors (upper half)
1873
1874 if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) then
1875 if (usegpu) then
1876 call obj%timer%start("gpublas_gemm")
1877 nvtx_range_push("gpublas_gemm_upper")
1878 gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
1879 call gpublas_precision_gemm('N', 'N', l_rnm, ncnt, nnzu, 1.0_rk, &
1880 qtmp1_dev, gemm_dim_k, &
1881 ev_dev, gemm_dim_l, 1.0_rk, &
1882 qtmp2_dev, gemm_dim_k, gpuhandle)
1883 if (wantdebug) successgpu = gpu_devicesynchronize()
1884 nvtx_range_pop("gpublas_gemm_upper")
1885 call obj%timer%stop("gpublas_gemm")
1886 else ! useGPU
1887 call obj%timer%start("blas_gemm")
1888 call precision_gemm('N', 'N', int(l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &
1889 int(nnzu,kind=blas_kind), 1.0_rk, &
1890 qtmp1, int(gemm_dim_k,kind=blas_kind), &
1891 ev, int(gemm_dim_l,kind=blas_kind), 1.0_rk, &
1892 qtmp2(1,1), int(gemm_dim_k,kind=blas_kind))
1893 call obj%timer%stop("blas_gemm")
1894 endif ! useGPU
1895 endif ! (l_rnm>0 .and. ncnt>0 .and. nnzu>0) then
1896
1897
1898
1899
1900 ! Compute eigenvectors of the rank-1 modified matrix.
1901 ! Parts for multiplying with lower half of Q:
1902
1903
1904 if (usegpu) then
1905 if (nnzl .ge. 1) then
1906 call gpu_fill_ev (precision_char, ev_dev, d1l_dev, dbase_dev, ddiff_dev, zl_dev, ev_scale_dev, idxq1_dev, idx_dev, &
1907 na, gemm_dim_l, gemm_dim_m, nnzl, ns, ncnt, debug, my_stream)
1908 endif
1909 else ! useGPU
1910!$omp PARALLEL DO &
1911!$omp private(i, j, k, tmp)
1912 do i = 1, ncnt
1913 do k = 1, nnzl
1914 j = idx(idxq1(i+ns))
1915 ! Calculate the j-th eigenvector of the deflated system
1916 ! See above why we are doing it this way!
1917 tmp(k) = d1l(k) - dbase(j)
1918 tmp(k) = tmp(k) + ddiff(j)
1919 ev(k,i) = zl(k) / tmp(k) * ev_scale(j)
1920 enddo
1921 enddo
1922!$OMP END PARALLEL DO
1923 endif ! useGPU
1924
1925
1926 ! Multiply old Q with eigenvectors (lower half)
1927
1928 if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) then
1929 if (usegpu) then
1930 call obj%timer%start("gpublas_gemm")
1931 nvtx_range_push("gpublas_gemm_lower")
1932 gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
1933 call gpublas_precision_gemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, 1.0_rk, &
1934 qtmp1_dev + l_rnm*size_of_datatype, gemm_dim_k, &
1935 ev_dev, gemm_dim_l, 1.0_rk, &
1936 qtmp2_dev + l_rnm*size_of_datatype, gemm_dim_k, gpuhandle)
1937 if (wantdebug) successgpu = gpu_devicesynchronize()
1938 nvtx_range_pop("gpublas_gemm_lower")
1939 call obj%timer%stop("gpublas_gemm")
1940 else ! useGPU
1941 call obj%timer%start("blas_gemm")
1942 call precision_gemm('N', 'N', int(l_rows-l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &
1943 int(nnzl,kind=blas_kind), 1.0_rk, &
1944 qtmp1(l_rnm+1,1), int(gemm_dim_k,kind=blas_kind), &
1945 ev, int(gemm_dim_l,kind=blas_kind), 1.0_rk, &
1946 qtmp2(l_rnm+1,1), int(gemm_dim_k,kind=blas_kind))
1947 call obj%timer%stop("blas_gemm")
1948 endif ! useGPU
1949 endif
1950
1951
1952 ! Put partial result into (output) Q
1953 if (usegpu) then
1954 call gpu_copy_qtmp2_slice_to_q (precision_char, q_dev, qtmp2_dev, idxq1_dev, l_col_out_dev, &
1955 l_rqs, l_rqe, l_rows, ncnt, gemm_dim_k, matrixrows, ns, debug, my_stream)
1956 else ! useGPU
1957!$omp PARALLEL DO &
1958!$omp default(none) &
1959!$omp private(i) &
1960!$omp SHARED(q, ns, l_rqs, l_rqe, l_col_out, idxq1, qtmp2, l_rows, ncnt)
1961 do i = 1, ncnt
1962 q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i)
1963 enddo
1964!$OMP END PARALLEL DO
1965 endif ! useGPU
1966
1967 nvtx_range_pop("ns=0,nqcols1-1,max_strip")
1968 enddo ! ns = 0, nqcols1-1, max_strip ! strimining loop
1969
1970 nvtx_range_pop("np=1,npc_n")
1971 enddo ! do np = 1, npc_n
1972 nvtx_range_pop("main_loop")
1973
1974
1975 deallocate(nnzu_val, nnzl_val)
1976
1977
1978 if (usegpu) then
1979 deallocate(ndef_c, stat=istat, errmsg=errormessage)
1980 check_deallocate("merge_systems: ndef_c",istat, errormessage)
1981 endif
1982
1983 if (usegpu) then
1984 successgpu = gpu_free(nnzul_dev)
1985 check_dealloc_gpu("merge_systems: nnzul_dev", successgpu)
1986
1987 successgpu = gpu_free(l_col_dev)
1988 check_dealloc_gpu("merge_systems: l_col_dev", successgpu)
1989
1990 successgpu = gpu_free(ndef_c_dev)
1991 check_dealloc_gpu("merge_systems: ndef_c_dev", successgpu)
1992
1993 successgpu = gpu_free(nnzu_val_dev)
1994 check_dealloc_gpu("merge_systems: nnzu_val_dev", successgpu)
1995
1996 successgpu = gpu_free(nnzl_val_dev)
1997 check_dealloc_gpu("merge_systems: nnzl_val_dev", successgpu)
1998
1999 successgpu = gpu_free(idx1_dev)
2000 check_dealloc_gpu("merge_systems: idx1_dev", successgpu)
2001
2002 successgpu = gpu_free(idx2_dev)
2003 check_dealloc_gpu("merge_systems: idx2_dev", successgpu)
2004
2005 successgpu = gpu_free(p_col_dev)
2006 check_dealloc_gpu("merge_systems: p_col_dev", successgpu)
2007
2008 successgpu = gpu_free(p_col_out_dev)
2009 check_dealloc_gpu("merge_systems: p_col_out_dev", successgpu)
2010
2011 successgpu = gpu_free(coltyp_dev)
2012 check_dealloc_gpu("merge_systems: coltyp_dev", successgpu)
2013
2014 successgpu = gpu_free(idx_dev)
2015 check_dealloc_gpu("merge_systems: idx_dev", successgpu)
2016
2017 successgpu = gpu_free(l_col_out_dev)
2018 check_dealloc_gpu("merge_systems: l_col_out_dev", successgpu)
2019
2020 successgpu = gpu_free(idxq1_dev)
2021 check_dealloc_gpu("merge_systems: ", successgpu)
2022
2023 successgpu = gpu_free(d1_dev)
2024 check_dealloc_gpu("merge_systems: d1_dev", successgpu)
2025
2026 successgpu = gpu_free(z_dev)
2027 check_dealloc_gpu("merge_systems: z_dev", successgpu)
2028
2029 successgpu = gpu_free(z1_dev)
2030 check_dealloc_gpu("merge_systems: z1_dev", successgpu)
2031
2032 successgpu = gpu_free(rho_dev)
2033 check_dealloc_gpu("merge_systems: rho_dev", successgpu)
2034
2035 successgpu = gpu_free(d1u_dev)
2036 check_dealloc_gpu("merge_systems: d1u_dev", successgpu)
2037
2038 successgpu = gpu_free(dbase_dev)
2039 check_dealloc_gpu("merge_systems: dbase_dev", successgpu)
2040
2041 successgpu = gpu_free(ddiff_dev)
2042 check_dealloc_gpu("merge_systems: ddiff_dev", successgpu)
2043
2044 successgpu = gpu_free(zu_dev)
2045 check_dealloc_gpu("merge_systems: zu_dev", successgpu)
2046
2047 successgpu = gpu_free(ev_scale_dev)
2048 check_dealloc_gpu("merge_systems: ev_scale_dev", successgpu)
2049
2050 successgpu = gpu_free(d1l_dev)
2051 check_dealloc_gpu("merge_systems: d1l_dev", successgpu)
2052
2053 successgpu = gpu_free(zl_dev)
2054 check_dealloc_gpu("merge_systems: zl_dev", successgpu)
2055
2056 successgpu = gpu_free(qtmp1_dev)
2057 check_dealloc_gpu("merge_systems: qtmp1_dev", successgpu)
2058
2059 successgpu = gpu_free(qtmp1_tmp_dev)
2060 check_dealloc_gpu("merge_systems: qtmp1_tmp_dev", successgpu)
2061
2062 successgpu = gpu_free(qtmp2_dev)
2063 check_dealloc_gpu("merge_systems: qtmp2_dev", successgpu)
2064
2065 successgpu = gpu_free(ev_dev)
2066 check_dealloc_gpu("merge_systems: ev_dev", successgpu)
2067
2068 successgpu = gpu_free(tmp_dev)
2069 check_dealloc_gpu("merge_systems: tmp_dev", successgpu)
2070
2071 successgpu = gpu_free(zero_dev)
2072 check_dealloc_gpu("merge_systems: zero_dev", successgpu)
2073
2074 successgpu = gpu_free(one_dev)
2075 check_dealloc_gpu("merge_systems: one_dev", successgpu)
2076
2077 successgpu = gpu_free(qtrans_dev)
2078 check_dealloc_gpu("merge_systems: qtrans_dev", successgpu)
2079
2080 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
2081 if (wantdebug) call obj%timer%start("gpu_host_register")
2082 if (.not. useccl) then
2083 successgpu = gpu_host_unregister(int(loc(qtmp1),kind=c_intptr_t))
2084 check_host_unregister_gpu("merge_systems: qtmp1", successgpu)
2085
2086 successgpu = gpu_host_unregister(int(loc(qtmp2),kind=c_intptr_t))
2087 check_host_unregister_gpu("merge_systems: qtmp2", successgpu)
2088
2089 successgpu = gpu_host_unregister(int(loc(ev),kind=c_intptr_t))
2090 check_host_unregister_gpu("merge_systems: ev", successgpu)
2091 endif
2092
2093 if (wantdebug) successgpu = gpu_devicesynchronize()
2094 if (wantdebug) call obj%timer%stop("gpu_host_register")
2095 endif
2096 endif ! useGPU
2097
2098 if (.not. useccl) then
2099 deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errormessage)
2100 check_deallocate("merge_systems: ev, qtmp1, qtmp2",istat, errormessage)
2101 endif
2102 endif !very outer test if (na1==1 .or. na1==2) else (na1>2)
2103
2104! #ifdef WITH_OPENMP_TRADITIONAL
2105! deallocate(z_p, stat=istat, errmsg=errorMessage)
2106! check_deallocate("merge_systems: z_p",istat, errorMessage)
2107! #endif
2108
2109 call obj%timer%stop("merge_systems" // precision_suffix)
2110
2111 return
2112
2113 end
Definition mod_add_tmp.F90:3
Definition mod_check_monotony.F90:3
Fortran module to provide an abstract definition of the implementation. Do not use directly....
Definition elpa_abstract_impl.F90:50
Definition mod_global_gather.F90:3
Definition mod_global_product.F90:3
Definition mod_resort_ev.F90:3
Definition mod_solve_secular_equation.F90:55
Definition mod_transform_columns.F90:3
Definition mod_v_add_s.F90:55
Definition elpa_abstract_impl.F90:73