Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2025.01.001
Loading...
Searching...
No Matches
merge_systems_template.F90
Go to the documentation of this file.
1#if 0
2! This file is part of ELPA.
3!
4! The ELPA library was originally created by the ELPA consortium,
5! consisting of the following organizations:
6!
7! - Max Planck Computing and Data Facility (MPCDF), formerly known as
8! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
9! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
10! Informatik,
11! - Technische Universität München, Lehrstuhl für Informatik mit
12! Schwerpunkt Wissenschaftliches Rechnen ,
13! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
15! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
16! and
17! - IBM Deutschland GmbH
18!
19! This particular source code file contains additions, changes and
20! enhancements authored by Intel Corporation which is not part of
21! the ELPA consortium.
22!
23! More information can be found here:
24! http://elpa.mpcdf.mpg.de/
25!
26! ELPA is free software: you can redistribute it and/or modify
27! it under the terms of the version 3 of the license of the
28! GNU Lesser General Public License as published by the Free
29! Software Foundation.
30!
31! ELPA is distributed in the hope that it will be useful,
32! but WITHOUT ANY WARRANTY; without even the implied warranty of
33! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34! GNU Lesser General Public License for more details.
35!
36! You should have received a copy of the GNU Lesser General Public License
37! along with ELPA. If not, see <http://www.gnu.org/licenses/>
38!
39! ELPA reflects a substantial effort on the part of the original
40! ELPA consortium, and we ask you to respect the spirit of the
41! license that we chose: i.e., please contribute any changes you
42! may have back to the original ELPA library distribution, and keep
43! any derivatives of ELPA under the same license that we chose for
44! the original distribution, the GNU Lesser General Public License.
45!
46!
47! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
48!
49! Copyright of the original code rests with the authors inside the ELPA
50! consortium. The copyright of any additional modifications shall rest
51! with their original authors, but shall adhere to the licensing terms
52! distributed along with the original code in the file "COPYING".
53#endif
54
55#include "../general/sanity.F90"
56#include "../general/error_checking.inc"
57
58#ifdef SOLVE_TRIDI_GPU_BUILD
59 subroutine merge_systems_gpu_&
60 &precision &
61 (obj, na, nm, d, e, q, matrixrows, nqoff, nblk, matrixcols, mpi_comm_rows, mpi_comm_cols, &
62 l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, usegpu, wantdebug, success, max_threads)
63#else
64 subroutine merge_systems_cpu_&
65 &precision &
66 (obj, na, nm, d, e, q, matrixrows, nqoff, nblk, matrixcols, mpi_comm_rows, mpi_comm_cols, &
67 l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, usegpu, wantdebug, success, max_threads)
68#endif
69
70
71 use elpa_gpu
72 use, intrinsic :: iso_c_binding
73 use precision
75 use elpa_blas_interfaces
78 use resort_ev
81 use add_tmp
82 use v_add_s
83 use elpa_utilities
84 use elpa_mpi
86#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
87 use elpa_ccl_gpu
88#endif
89 use merge_systems_gpu
90
91
92#ifdef WITH_OPENMP_TRADITIONAL
93 use omp_lib
94#endif
95 implicit none
96#include "../general/precision_kinds.F90"
97 class(elpa_abstract_impl_t), intent(inout) :: obj
98 integer(kind=ik), intent(in) :: na, nm, matrixRows, nqoff, nblk, matrixCols, mpi_comm_rows, &
99 mpi_comm_cols, npc_0, npc_n
100 integer(kind=ik), intent(in) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na)
101 real(kind=real_datatype), intent(inout) :: d(na), e
102#ifdef USE_ASSUMED_SIZE
103 real(kind=real_datatype), intent(inout) :: q(matrixrows,*)
104#else
105 real(kind=real_datatype), intent(inout) :: q(matrixrows,matrixcols)
106#endif
107 logical, intent(in) :: useGPU, wantDebug
108
109 logical, intent(out) :: success
110
111 ! TODO: play with max_strip. If it was larger, matrices being multiplied
112 ! might be larger as well!
113 integer(kind=ik), parameter :: max_strip=128
114
115
116 real(kind=real_datatype) :: beta, sig, s, c, t, tau, rho, eps, tol, &
117 qtrans(2,2), dmax, zmax, d1new, d2new
118 real(kind=real_datatype) :: z(na), d1(na), d2(na), z1(na), delta(na), &
119 dbase(na), ddiff(na), ev_scale(na), tmp(na)
120 real(kind=real_datatype) :: d1u(na), zu(na), d1l(na), zl(na)
121 real(kind=real_datatype), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
122#ifdef WITH_OPENMP_TRADITIONAL
123 real(kind=real_datatype), allocatable :: z_p(:,:)
124#endif
125
126 integer(kind=ik) :: i, j, k, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &
127 l_rqm, ns, info
128 integer(kind=BLAS_KIND) :: infoBLAS
129 integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
130 l_cols_qreorg, np, l_idx, nqcols1 !, nqcols2
131 integer(kind=ik) :: nnzu_save, nnzl_save
132 integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
133 np_cols
134 integer(kind=MPI_KIND) :: mpierr
135 integer(kind=MPI_KIND) :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI
136 integer(kind=ik) :: np_next, np_prev, np_rem
137 integer(kind=ik) :: idx(na), idx1(na), idx2(na)
138 integer(kind=BLAS_KIND) :: idxBLAS(NA)
139 integer(kind=ik) :: coltyp(na), idxq1(na) !, idxq2(na)
140
141 integer(kind=ik) :: istat
142 character(200) :: errorMessage
143 integer(kind=ik) :: gemm_dim_k, gemm_dim_l, gemm_dim_m
144
145 integer(kind=c_intptr_t) :: num
146 integer(kind=C_intptr_T) :: qtmp1_dev, qtmp1_tmp_dev, qtmp2_dev, ev_dev, q_dev
147 integer(kind=c_intptr_t) :: d1u_dev, dbase_dev, ddiff_dev, zu_dev, ev_scale_dev
148 integer(kind=c_intptr_t) :: d1l_dev, zl_dev, z_dev, d1_dev
149 integer(kind=c_intptr_t) :: idx1_dev, p_col_dev, coltyp_dev, p_col_out_dev, ndef_c_dev
150 integer(kind=c_intptr_t) :: idxq1_dev, l_col_out_dev, idx_dev, idx2_dev, l_col_dev
151 integer(kind=c_intptr_t) :: nnzul_dev
152
153 integer(kind=c_intptr_t) :: nnzu_val_dev, nnzl_val_dev
154 logical :: successGPU
155 integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
156 &precision&
157 &_real
158 integer(kind=c_intptr_t) :: gpuHandle
159 integer(kind=ik), intent(in) :: max_threads
160 integer(kind=c_intptr_t) :: my_stream
161 integer(kind=ik) :: l_col_out_tmp
162 integer(kind=ik), allocatable :: nnzu_val(:,:), nnzl_val(:,:)
163 integer(kind=ik) :: nnzul(2)
164
165 integer(kind=ik) :: nnzu_start, nnzl_start
166
167 integer(kind=ik), allocatable :: ndef_c(:)
168
169 !real(kind=REAL_DATATYPE), allocatable :: qtmp11(:,:)
170 integer(kind=ik) :: ii,jj, indx, ind_ex, ind_ex2, p_col_tmp, index2, counter1, counter2
171#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
172 integer(kind=c_intptr_t) :: ccl_comm_rows, ccl_comm_cols
173#endif
174#ifdef WITH_OPENMP_TRADITIONAL
175 integer(kind=ik) :: my_thread
176
177 allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errormessage)
178 check_allocate("merge_systems: z_p",istat, errormessage)
179#endif
180
181 call obj%timer%start("merge_systems" // precision_suffix)
182 success = .true.
183
184 call obj%timer%start("mpi_communication")
185 call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind) ,my_prowmpi, mpierr)
186 call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind) ,np_rowsmpi, mpierr)
187 call mpi_comm_rank(int(mpi_comm_cols,kind=mpi_kind) ,my_pcolmpi, mpierr)
188 call mpi_comm_size(int(mpi_comm_cols,kind=mpi_kind) ,np_colsmpi, mpierr)
189
190 my_prow = int(my_prowmpi,kind=c_int)
191 np_rows = int(np_rowsmpi,kind=c_int)
192 my_pcol = int(my_pcolmpi,kind=c_int)
193 np_cols = int(np_colsmpi,kind=c_int)
194
195 call obj%timer%stop("mpi_communication")
196
197
198 ! If my processor column isn't in the requested set, do nothing
199
200 if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then
201 call obj%timer%stop("merge_systems" // precision_suffix)
202 return
203 endif
204 ! Determine number of "next" and "prev" column for ring sends
205
206 if (my_pcol == npc_0+npc_n-1) then
207 np_next = npc_0
208 else
209 np_next = my_pcol + 1
210 endif
211
212 if (my_pcol == npc_0) then
213 np_prev = npc_0+npc_n-1
214 else
215 np_prev = my_pcol - 1
216 endif
217 call check_monotony_&
218 &precision&
219 &(obj, nm,d,'Input1',wantdebug, success)
220 if (.not.(success)) then
221 call obj%timer%stop("merge_systems" // precision_suffix)
222 return
223 endif
224 call check_monotony_&
225 &precision&
226 &(obj,na-nm,d(nm+1),'Input2',wantdebug, success)
227 if (.not.(success)) then
228 call obj%timer%stop("merge_systems" // precision_suffix)
229 return
230 endif
231 ! Get global number of processors and my processor number.
232 ! Please note that my_proc does not need to match any real processor number,
233 ! it is just used for load balancing some loops.
234
235 n_procs = np_rows*npc_n
236 my_proc = my_prow*npc_n + (my_pcol-npc_0) ! Row major
237
238
239 ! Local limits of the rows of Q
240
241 l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1) ! First row of Q
242 l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1) ! Last row <= nm
243 l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1) ! Last row of Q
244
245 l_rnm = l_rqm-l_rqs+1 ! Number of local rows <= nm
246 l_rows = l_rqe-l_rqs+1 ! Total number of local rows
247
248
249 ! My number of local columns
250
251 l_cols = count(p_col(1:na)==my_pcol)
252
253 ! Get max number of local columns
254
255 max_local_cols = 0
256 do np = npc_0, npc_0+npc_n-1
257 max_local_cols = max(max_local_cols,count(p_col(1:na)==np))
258 enddo
259
260 ! Calculations start here
261
262 beta = abs(e)
263 sig = sign(1.0_rk,e)
264
265 ! Calculate rank-1 modifier z
266
267 z(:) = 0
268
269 if (mod((nqoff+nm-1)/nblk,np_rows)==my_prow) then
270 ! nm is local on my row
271 do i = 1, na
272 if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i))
273 enddo
274 endif
275
276 if (mod((nqoff+nm)/nblk,np_rows)==my_prow) then
277 ! nm+1 is local on my row
278 do i = 1, na
279 if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i))
280 enddo
281 endif
282
283 call global_gather_&
284 &precision&
285 &(obj, z, na, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
286 if (.not.(success)) then
287 write(error_unit,*) "Error in global_gather. ABorting"
288 success = .false.
289 return
290 endif
291 ! Normalize z so that norm(z) = 1. Since z is the concatenation of
292 ! two normalized vectors, norm2(z) = sqrt(2).
293 z = z/sqrt(2.0_rk)
294 rho = 2.0_rk*beta
295 ! Calculate index for merging both systems by ascending eigenvalues
296 call obj%timer%start("lapack_lamrg")
297 call precision_lamrg( int(nm,kind=blas_kind), int(na-nm,kind=blas_kind), d, &
298 1_blas_kind, 1_blas_kind, idxblas )
299 idx(:) = int(idxblas(:),kind=ik)
300 call obj%timer%stop("lapack_lamrg")
301
302 ! Calculate the allowable deflation tolerance
303
304 zmax = maxval(abs(z))
305 dmax = maxval(abs(d))
306 eps = precision_lamch( 'E' ) ! return epsilon
307 tol = 8.0_rk*eps*max(dmax,zmax)
308
309 ! If the rank-1 modifier is small enough, no more needs to be done
310 ! except to reorganize D and Q
311
312 IF ( rho*zmax <= tol ) THEN
313
314 ! Rearrange eigenvalues
315
316 tmp = d
317 do i=1,na
318 d(i) = tmp(idx(i))
319 enddo
320
321 ! Rearrange eigenvectors
322 call resort_ev_&
323 &precision &
324 (obj, idx, na, na, p_col_out, q, matrixrows, matrixcols, l_rows, l_rqe, &
325 l_rqs, mpi_comm_cols, p_col, l_col, l_col_out)
326
327 call obj%timer%stop("merge_systems" // precision_suffix)
328
329 return
330 ENDIF
331
332 ! Merge and deflate system
333
334 na1 = 0
335 na2 = 0
336
337 ! COLTYP:
338 ! 1 : non-zero in the upper half only;
339 ! 2 : dense;
340 ! 3 : non-zero in the lower half only;
341 ! 4 : deflated.
342
343 coltyp(1:nm) = 1
344 coltyp(nm+1:na) = 3
345
346 do i=1,na
347
348 if (rho*abs(z(idx(i))) <= tol) then
349
350 ! Deflate due to small z component.
351
352 na2 = na2+1
353 d2(na2) = d(idx(i))
354 idx2(na2) = idx(i)
355 coltyp(idx(i)) = 4
356
357 else if (na1>0) then
358
359 ! Check if eigenvalues are close enough to allow deflation.
360
361 s = z(idx(i))
362 c = z1(na1)
363
364 ! Find sqrt(a**2+b**2) without overflow or
365 ! destructive underflow.
366 tau = precision_lapy2( c, s )
367 t = d1(na1) - d(idx(i))
368 c = c / tau
369 s = -s / tau
370 IF ( abs( t*c*s ) <= tol ) THEN
371
372 ! Deflation is possible.
373
374 na2 = na2+1
375
376 z1(na1) = tau
377
378 d2new = d(idx(i))*c**2 + d1(na1)*s**2
379 d1new = d(idx(i))*s**2 + d1(na1)*c**2
380
381 ! D(idx(i)) >= D1(na1) and C**2 + S**2 == 1.0
382 ! This means that after the above transformation it must be
383 ! D1(na1) <= d1new <= D(idx(i))
384 ! D1(na1) <= d2new <= D(idx(i))
385 !
386 ! D1(na1) may get bigger but it is still smaller than the next D(idx(i+1))
387 ! so there is no problem with sorting here.
388 ! d2new <= D(idx(i)) which means that it might be smaller than D2(na2-1)
389 ! which makes a check (and possibly a resort) necessary.
390 !
391 ! The above relations may not hold exactly due to numeric differences
392 ! so they have to be enforced in order not to get troubles with sorting.
393
394
395 if (d1new<d1(na1) ) d1new = d1(na1)
396 if (d1new>d(idx(i))) d1new = d(idx(i))
397
398 if (d2new<d1(na1) ) d2new = d1(na1)
399 if (d2new>d(idx(i))) d2new = d(idx(i))
400
401 d1(na1) = d1new
402
403 do j=na2-1,1,-1
404 if (d2new<d2(j)) then
405 d2(j+1) = d2(j)
406 idx2(j+1) = idx2(j)
407 else
408 exit ! Loop
409 endif
410 enddo
411
412 d2(j+1) = d2new
413 idx2(j+1) = idx(i)
414
415 qtrans(1,1) = c; qtrans(1,2) =-s
416 qtrans(2,1) = s; qtrans(2,2) = c
417 call transform_columns_&
418 &precision &
419 (obj, idx(i), idx1(na1), na, tmp, l_rqs, l_rqe, &
420 q, matrixrows, matrixcols, l_rows, mpi_comm_cols, &
421 p_col, l_col, qtrans)
422 if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2
423 if (coltyp(idx(i))==3 .and. coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2
424
425 coltyp(idx(i)) = 4
426
427 else
428 na1 = na1+1
429 d1(na1) = d(idx(i))
430 z1(na1) = z(idx(i))
431 idx1(na1) = idx(i)
432 endif
433 else
434 na1 = na1+1
435 d1(na1) = d(idx(i))
436 z1(na1) = z(idx(i))
437 idx1(na1) = idx(i)
438 endif
439
440 enddo ! do i=1,na
441
442 call check_monotony_&
443 &precision&
444 &(obj, na1,d1,'Sorted1', wantdebug, success)
445 if (.not.(success)) then
446 call obj%timer%stop("merge_systems" // precision_suffix)
447 return
448 endif
449 call check_monotony_&
450 &precision&
451 &(obj, na2,d2,'Sorted2', wantdebug, success)
452 if (.not.(success)) then
453 call obj%timer%stop("merge_systems" // precision_suffix)
454 return
455 endif
456
457 if (na1==1 .or. na1==2) then
458 ! if(my_proc==0) print *,'--- Remark solve_tridi: na1==',na1,' proc==',myid
459
460 if (na1==1) then
461 d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation
462 else ! na1==2
463 call obj%timer%start("lapack_laed5_x2")
464 call precision_laed5(1_blas_kind, d1, z1, qtrans(1,1), rho, d(1))
465 call precision_laed5(2_blas_kind, d1, z1, qtrans(1,2), rho, d(2))
466 call obj%timer%stop("lapack_laed5_x2")
467 call transform_columns_&
468 &precision&
469 &(obj, idx1(1), idx1(2), na, tmp, l_rqs, l_rqe, q, &
470 matrixrows, matrixcols, l_rows, mpi_comm_cols, &
471 p_col, l_col, qtrans)
472
473 endif
474
475 ! Add the deflated eigenvalues
476 d(na1+1:na) = d2(1:na2)
477
478 ! Calculate arrangement of all eigenvalues in output
479 call obj%timer%start("lapack_lamrg")
480 call precision_lamrg( int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &
481 1_blas_kind, 1_blas_kind, idxblas )
482 idx(:) = int(idxblas(:),kind=ik)
483 call obj%timer%stop("lapack_lamrg")
484 ! Rearrange eigenvalues
485
486 tmp = d
487 do i=1,na
488 d(i) = tmp(idx(i))
489 enddo
490
491 ! Rearrange eigenvectors
492
493 do i=1,na
494 if (idx(i)<=na1) then
495 idxq1(i) = idx1(idx(i))
496 else
497 idxq1(i) = idx2(idx(i)-na1)
498 endif
499 enddo
500 call resort_ev_&
501 &precision&
502 &(obj, idxq1, na, na, p_col_out, q, matrixrows, matrixcols, l_rows, l_rqe, &
503 l_rqs, mpi_comm_cols, p_col, l_col, l_col_out)
504
505 else if (na1>2) then
506
507 ! Solve secular equation
508
509 z(1:na1) = 1
510#ifdef WITH_OPENMP_TRADITIONAL
511 z_p(1:na1,:) = 1
512#endif
513 dbase(1:na1) = 0
514 ddiff(1:na1) = 0
515
516 info = 0
517 infoblas = int(info,kind=blas_kind)
518!#ifdef WITH_OPENMP_TRADITIONAL
519!
520! call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
521!!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,infoBLAS,j)
522! my_thread = omp_get_thread_num()
523!!$OMP DO
524!#endif
525 DO i = my_proc+1, na1, n_procs ! work distributed over all processors
526 call obj%timer%start("lapack_laed4")
527 call precision_laed4(int(na1,kind=blas_kind), int(i,kind=blas_kind), d1, z1, delta, &
528 rho, s, infoblas) ! s is not used!
529 info = int(infoblas,kind=ik)
530 call obj%timer%stop("lapack_laed4")
531 if (info/=0) then
532 ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2)
533 ! use the more stable bisection algorithm in solve_secular_equation
534 ! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection'
535 call solve_secular_equation_&
536 &precision&
537 &(obj, na1, i, d1, z1, delta, rho, s)
538 endif
539
540 ! Compute updated z
541
542!#ifdef WITH_OPENMP_TRADITIONAL
543! do j=1,na1
544! if (i/=j) z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) )
545! enddo
546! z_p(i,my_thread) = z_p(i,my_thread)*delta(i)
547!#else
548 do j=1,na1
549 if (i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) )
550 enddo
551 z(i) = z(i)*delta(i)
552!#endif
553 ! store dbase/ddiff
554
555 if (i<na1) then
556 if (abs(delta(i+1)) < abs(delta(i))) then
557 dbase(i) = d1(i+1)
558 ddiff(i) = delta(i+1)
559 else
560 dbase(i) = d1(i)
561 ddiff(i) = delta(i)
562 endif
563 else
564 dbase(i) = d1(i)
565 ddiff(i) = delta(i)
566 endif
567 enddo ! i = my_proc+1, na1, n_procs
568!#ifdef WITH_OPENMP_TRADITIONAL
569!!$OMP END PARALLEL
570!
571! call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
572!
573! do i = 0, max_threads-1
574! z(1:na1) = z(1:na1)*z_p(1:na1,i)
575! enddo
576!#endif
577
578 call global_product_&
579 &precision&
580 (obj, z, na1, mpi_comm_rows, mpi_comm_cols, npc_0, npc_n, success)
581 if (.not.(success)) then
582 write(error_unit,*) "Error in global_product. Aborting..."
583 return
584 endif
585 z(1:na1) = sign( sqrt( abs( z(1:na1) ) ), z1(1:na1) )
586
587 call global_gather_&
588 &precision&
589 &(obj, dbase, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
590 if (.not.(success)) then
591 write(error_unit,*) "Error in global_gather. Aborting..."
592 return
593 endif
594 call global_gather_&
595 &precision&
596 &(obj, ddiff, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
597 if (.not.(success)) then
598 write(error_unit,*) "Error in global_gather. Aborting..."
599 return
600 endif
601 d(1:na1) = dbase(1:na1) - ddiff(1:na1)
602
603 ! Calculate scale factors for eigenvectors
604 ev_scale(:) = 0.0_rk
605
606#ifdef WITH_OPENMP_TRADITIONAL
607
608 call obj%timer%start("OpenMP parallel" // precision_suffix)
609
610!$omp PARALLEL DO &
611!$omp default(none) &
612!$omp private(i) &
613!$omp SHARED(na1, my_proc, n_procs, &
614!$OMP d1, dbase, ddiff, z, ev_scale, obj)
615
616#endif
617 DO i = my_proc+1, na1, n_procs ! work distributed over all processors
618
619 ! tmp(1:na1) = z(1:na1) / delta(1:na1,i) ! original code
620 ! tmp(1:na1) = z(1:na1) / (d1(1:na1)-d(i))! bad results
621
622 ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i)
623 ! in exactly this order, but we want to prevent compiler optimization
624! ev_scale_val = ev_scale(i)
625 call add_tmp_&
626 &precision&
627 &(obj, d1, dbase, ddiff, z, ev_scale(i), na1, i)
628! ev_scale(i) = ev_scale_val
629 enddo
630#ifdef WITH_OPENMP_TRADITIONAL
631!$OMP END PARALLEL DO
632
633 call obj%timer%stop("OpenMP parallel" // precision_suffix)
634
635#endif
636
637 call global_gather_&
638 &precision&
639 &(obj, ev_scale, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
640 if (.not.(success)) then
641 write(error_unit,*) "Error in global_gather. Aborting..."
642 return
643 endif
644 ! Add the deflated eigenvalues
645 d(na1+1:na) = d2(1:na2)
646
647 call obj%timer%start("lapack_lamrg")
648 ! Calculate arrangement of all eigenvalues in output
649 call precision_lamrg(int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &
650 1_blas_kind, 1_blas_kind, idxblas )
651 idx(:) = int(idxblas(:),kind=ik)
652 call obj%timer%stop("lapack_lamrg")
653 ! Rearrange eigenvalues
654 tmp = d
655 do i=1,na
656 d(i) = tmp(idx(i))
657 enddo
658 call check_monotony_&
659 &precision&
660 &(obj, na,d,'Output', wantdebug, success)
661
662 if (.not.(success)) then
663 call obj%timer%stop("merge_systems" // precision_suffix)
664 return
665 endif
666 ! Eigenvector calculations
667 if (usegpu) then
668 num = 2 * size_of_int
669 successgpu = gpu_malloc(nnzul_dev, num)
670 check_alloc_gpu("merge_systems: ", successgpu)
671
672 num = na * size_of_int
673 successgpu = gpu_malloc(idxq1_dev, num)
674 check_alloc_gpu("merge_systems: ", successgpu)
675
676 num = na * size_of_int
677 successgpu = gpu_malloc(idx_dev, num)
678 check_alloc_gpu("merge_systems: idx_dev", successgpu)
679
680 num = na * size_of_int
681#ifdef WITH_GPU_STREAMS
682 my_stream = obj%gpu_setup%my_stream
683 successgpu = gpu_memcpy_async(idx_dev, int(loc(idx(1)),kind=c_intptr_t), &
684 num, gpumemcpyhosttodevice, my_stream)
685 check_memcpy_gpu("merge_systems: ", successgpu)
686#else
687 successgpu = gpu_memcpy(idx_dev, int(loc(idx(1)),kind=c_intptr_t), &
688 num, gpumemcpyhosttodevice)
689 check_memcpy_gpu("merge_systems: idx_dev", successgpu)
690#endif
691 endif
692
693 ! Calculate the number of columns in the new local matrix Q
694 ! which are updated from non-deflated/deflated eigenvectors.
695 ! idxq1/2 stores the global column numbers.
696
697 !if (useGPU) then
698
699
700
701 ! !nqcols1 is needed later on host !!
702 ! ! memcopy back needed!!
703 !else
704 nqcols1 = 0 ! number of non-deflated eigenvectors
705 !nqcols2 = 0 ! number of deflated eigenvectors
706 DO i = 1, na
707 if (p_col_out(i)==my_pcol) then
708 if (idx(i)<=na1) then
709 nqcols1 = nqcols1+1
710 idxq1(nqcols1) = i
711 !else
712 !nqcols2 = nqcols2+1
713 !idxq2(nqcols2) = i
714 endif
715 endif
716 enddo
717 !endif
718
719 if (usegpu) then
720 num = na * size_of_int
721#ifdef WITH_GPU_STREAMS
722 my_stream = obj%gpu_setup%my_stream
723 successgpu = gpu_memcpy_async(idxq1_dev, int(loc(idxq1(1)),kind=c_intptr_t), &
724 num, gpumemcpyhosttodevice, my_stream)
725 check_memcpy_gpu("merge_systems: ", successgpu)
726#else
727 successgpu = gpu_memcpy(idxq1_dev, int(loc(idxq1(1)),kind=c_intptr_t), &
728 num, gpumemcpyhosttodevice)
729 check_memcpy_gpu("merge_systems: idxq1_dev", successgpu)
730#endif
731 endif
732
733
734
735 if (usegpu) then
736 allocate(ndef_c(na), stat=istat, errmsg=errormessage)
737 check_allocate("merge_systems: ndef_c",istat, errormessage)
738 endif
739
740
741 gemm_dim_k = max(1,l_rows)
742 gemm_dim_l = max_local_cols
743 gemm_dim_m = min(max_strip,max(1,nqcols1))
744
745 allocate(qtmp1(gemm_dim_k, gemm_dim_l), stat=istat, errmsg=errormessage)
746 check_allocate("merge_systems: qtmp1",istat, errormessage)
747
748 allocate(ev(gemm_dim_l,gemm_dim_m), stat=istat, errmsg=errormessage)
749 check_allocate("merge_systems: ev",istat, errormessage)
750
751 allocate(qtmp2(gemm_dim_k, gemm_dim_m), stat=istat, errmsg=errormessage)
752 check_allocate("merge_systems: qtmp2",istat, errormessage)
753
754 qtmp1 = 0 ! May contain empty (unset) parts
755 qtmp2 = 0 ! Not really needed
756
757
758 ! check memory copies
759
760 if (usegpu) then
761 num = na * size_of_int
762 successgpu = gpu_malloc(ndef_c_dev, num)
763 check_alloc_gpu("merge_systems: ndef_c_dev", successgpu)
764
765 num = na * size_of_int
766 successgpu = gpu_malloc(idx1_dev, num)
767 check_alloc_gpu("merge_systems: idx1_dev", successgpu)
768
769 num = na * size_of_int
770 successgpu = gpu_malloc(p_col_dev, num)
771 check_alloc_gpu("merge_systems: p_col_dev", successgpu)
772
773 num = na * size_of_int
774 successgpu = gpu_malloc(p_col_out_dev, num)
775 check_alloc_gpu("merge_systems: p_col_out_dev", successgpu)
776
777 num = na * size_of_int
778 successgpu = gpu_malloc(coltyp_dev, num)
779 check_alloc_gpu("merge_systems: coltyp_dev", successgpu)
780
781 num = na * size_of_int
782 successgpu = gpu_malloc(idx2_dev, num)
783 check_alloc_gpu("merge_systems: idx2_dev", successgpu)
784
785 num = na * size_of_int
786 successgpu = gpu_malloc(l_col_out_dev, num)
787 check_alloc_gpu("merge_systems: l_col_out_dev", successgpu)
788
789 num = (na) * size_of_datatype
790 successgpu = gpu_malloc(z_dev, num)
791 check_alloc_gpu("merge_systems: z_dev", successgpu)
792
793 num = (na) * size_of_datatype
794 successgpu = gpu_malloc(d1_dev, num)
795 check_alloc_gpu("merge_systems: d1_dev", successgpu)
796
797 num = (na) * size_of_datatype
798 successgpu = gpu_malloc(d1u_dev, num)
799 check_alloc_gpu("merge_systems: d1u_dev", successgpu)
800
801 num = (na) * size_of_datatype
802 successgpu = gpu_malloc(dbase_dev, num)
803 check_alloc_gpu("merge_systems: dbase_dev", successgpu)
804
805 num = (na) * size_of_datatype
806 successgpu = gpu_malloc(ddiff_dev, num)
807 check_alloc_gpu("merge_systems: ddiff_dev", successgpu)
808
809 num = (na) * size_of_datatype
810 successgpu = gpu_malloc(zu_dev, num)
811 check_alloc_gpu("merge_systems: zu_dev", successgpu)
812
813 num = (na) * size_of_datatype
814 successgpu = gpu_malloc(ev_scale_dev, num)
815 check_alloc_gpu("merge_systems: ev_scale_dev", successgpu)
816
817 num = (na) * size_of_datatype
818 successgpu = gpu_malloc(d1l_dev, num)
819 check_alloc_gpu("merge_systems: d1l_dev", successgpu)
820
821 num = (na) * size_of_datatype
822 successgpu = gpu_malloc(zl_dev, num)
823 check_alloc_gpu("merge_systems: zl_dev", successgpu)
824
825 num = (matrixrows*matrixcols) * size_of_datatype
826 successgpu = gpu_malloc(q_dev, num)
827 check_alloc_gpu("merge_systems: q_dev", successgpu)
828
829 num = (gemm_dim_k * gemm_dim_l) * size_of_datatype
830#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
831 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
832 successgpu = gpu_host_register(int(loc(qtmp1),kind=c_intptr_t),num,&
833 gpuhostregisterdefault)
834 check_host_register_gpu("merge_systems: qtmp1", successgpu)
835 endif
836#endif
837 successgpu = gpu_malloc(qtmp1_dev, num)
838 check_alloc_gpu("merge_systems: qtmp1_dev", successgpu)
839
840 successgpu = gpu_malloc(qtmp1_tmp_dev, num)
841 check_alloc_gpu("merge_systems: qtmp1_tmp_dev", successgpu)
842
843 num = (gemm_dim_l * gemm_dim_m) * size_of_datatype
844#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
845 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
846 successgpu = gpu_host_register(int(loc(ev),kind=c_intptr_t),num,&
847 gpuhostregisterdefault)
848 check_host_register_gpu("merge_systems: ev", successgpu)
849 endif
850#endif
851 successgpu = gpu_malloc(ev_dev, num)
852 check_alloc_gpu("merge_systems: ev_dev", successgpu)
853
854
855 num = (gemm_dim_k * gemm_dim_m) * size_of_datatype
856#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
857 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
858 successgpu = gpu_host_register(int(loc(qtmp2),kind=c_intptr_t),num,&
859 gpuhostregisterdefault)
860 check_host_register_gpu("merge_systems: qtmp2", successgpu)
861 endif
862#endif
863 successgpu = gpu_malloc(qtmp2_dev, num)
864 check_alloc_gpu("merge_systems: qtmp2_dev", successgpu)
865 endif !useGPU
866
867 ! Gather nonzero upper/lower components of old matrix Q
868 ! which are needed for multiplication with new eigenvectors
869
870
871 ! kernel compute nnzu on device
872 nnzu = 0
873 nnzl = 0
874 do i = 1, na1
875 l_idx = l_col(idx1(i))
876 if (p_col(idx1(i))==my_pcol) then
877 if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then
878 nnzu = nnzu+1
879 qtmp1(1:l_rnm,nnzu) = q(l_rqs:l_rqm,l_idx)
880 endif
881 if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then
882 nnzl = nnzl+1
883 qtmp1(l_rnm+1:l_rows,nnzl) = q(l_rqm+1:l_rqe,l_idx)
884 endif
885 endif
886 enddo
887
888 if (usegpu) then
889
890 num = gemm_dim_k * gemm_dim_l * size_of_datatype
891#ifdef WITH_GPU_STREAMS
892 my_stream = obj%gpu_setup%my_stream
893 successgpu = gpu_memcpy_async(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
894 num, gpumemcpyhosttodevice, my_stream)
895 check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
896#else
897 successgpu = gpu_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
898 num, gpumemcpyhosttodevice)
899 check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
900#endif
901
902
903 num = matrixrows*matrixcols*size_of_datatype
904#ifdef WITH_GPU_STREAMS
905 successgpu = gpu_memcpy_async(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
906 num, gpumemcpyhosttodevice, my_stream)
907 check_memcpy_gpu("merge_systems: q_dev", successgpu)
908#else
909 successgpu = gpu_memcpy(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
910 num, gpumemcpyhosttodevice)
911 check_memcpy_gpu("merge_systems: q_dev", successgpu)
912#endif
913
914
915
916 num = na * size_of_int
917 successgpu = gpu_malloc(l_col_dev, num)
918 check_alloc_gpu("merge_systems: l_col_dev", successgpu)
919
920 num = na * size_of_int
921#ifdef WITH_GPU_STREAMS
922 my_stream = obj%gpu_setup%my_stream
923 successgpu = gpu_memcpy_async(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), &
924 num, gpumemcpyhosttodevice, my_stream)
925 check_memcpy_gpu("merge_systems: ", successgpu)
926#else
927 successgpu = gpu_memcpy(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), &
928 num, gpumemcpyhosttodevice)
929 check_memcpy_gpu("merge_systems: l_col_dev", successgpu)
930#endif
931 endif
932
933 ! Gather deflated eigenvalues behind nonzero components
934
935 ! compute ndef on device
936 ndef = max(nnzu,nnzl)
937
938 if (usegpu) then
939 num = na * size_of_int
940#ifdef WITH_GPU_STREAMS
941 my_stream = obj%gpu_setup%my_stream
942 successgpu = gpu_memcpy_async(idx2_dev, int(loc(idx2(1)),kind=c_intptr_t), &
943 num, gpumemcpyhosttodevice, my_stream)
944 check_memcpy_gpu("merge_systems: ", successgpu)
945#else
946 successgpu = gpu_memcpy(idx2_dev, int(loc(idx2(1)),kind=c_intptr_t), &
947 num, gpumemcpyhosttodevice)
948 check_memcpy_gpu("merge_systems: idx2_dev", successgpu)
949#endif
950 num = na * size_of_int
951#ifdef WITH_GPU_STREAMS
952 my_stream = obj%gpu_setup%my_stream
953
954 successgpu = gpu_memcpy_async(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
955 num, gpumemcpyhosttodevice, my_stream)
956 check_memcpy_gpu("merge_systems: ", successgpu)
957#else
958 successgpu = gpu_memcpy(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
959 num, gpumemcpyhosttodevice)
960 check_memcpy_gpu("merge_systems: p_col_dev", successgpu)
961#endif
962 endif
963
964
965
966 if (usegpu) then
967 ndef_c(:) = ndef
968
969 num = na * size_of_int
970#ifdef WITH_GPU_STREAMS
971 my_stream = obj%gpu_setup%my_stream
972 successgpu = gpu_memcpy_async(ndef_c_dev, int(loc(ndef_c(1)),kind=c_intptr_t), &
973 num, gpumemcpyhosttodevice, my_stream)
974 check_memcpy_gpu("merge_systems: ndef_c_dev 4", successgpu)
975
976#else
977 successgpu = gpu_memcpy(ndef_c_dev, int(loc(ndef_c(1)),kind=c_intptr_t), &
978 num, gpumemcpyhosttodevice)
979 check_memcpy_gpu("merge_systems: ndef_c_dev", successgpu)
980#endif
981
982 call gpu_copy_q_slice_to_qtmp1_precision (qtmp1_dev, q_dev, ndef_c_dev, l_col_dev, idx2_dev, p_col_dev, na2, na, &
983 my_pcol, l_rows, l_rqs, l_rqe, matrixrows, gemm_dim_k, my_stream)
984 else
985 do i = 1, na2
986 l_idx = l_col(idx2(i))
987 if (p_col(idx2(i))==my_pcol) then
988 ndef = ndef+1
989 qtmp1(1:l_rows,ndef) = q(l_rqs:l_rqe,l_idx)
990 endif
991 enddo
992 endif
993
994 l_cols_qreorg = ndef ! Number of columns in reorganized matrix
995 if (usegpu) then
996 num = na * size_of_int
997#ifdef WITH_GPU_STREAMS
998 my_stream = obj%gpu_setup%my_stream
999 successgpu = gpu_memcpy_async(p_col_out_dev, int(loc(p_col_out(1)),kind=c_intptr_t), &
1000 num, gpumemcpyhosttodevice, my_stream)
1001 check_memcpy_gpu("merge_systems: ", successgpu)
1002
1003#else
1004 successgpu = gpu_memcpy(p_col_out_dev, int(loc(p_col_out(1)),kind=c_intptr_t), &
1005 num, gpumemcpyhosttodevice)
1006 check_memcpy_gpu("merge_systems: p_col_out_dev", successgpu)
1007#endif
1008 num = na * size_of_int
1009#ifdef WITH_GPU_STREAMS
1010 my_stream = obj%gpu_setup%my_stream
1011 successgpu = gpu_memcpy_async(l_col_out_dev, int(loc(l_col_out(1)),kind=c_intptr_t), &
1012 num, gpumemcpyhosttodevice, my_stream)
1013 check_memcpy_gpu("merge_systems: l_col_out_dev", successgpu)
1014
1015#else
1016 successgpu = gpu_memcpy(l_col_out_dev, int(loc(l_col_out(1)),kind=c_intptr_t), &
1017 num, gpumemcpyhosttodevice)
1018 check_memcpy_gpu("merge_systems: l_col_out_dev", successgpu)
1019#endif
1020 endif
1021
1022
1023 ! Set (output) Q to 0, it will sum up new Q
1024
1025
1026 if (usegpu) then
1027 call gpu_zero_q_precision (q_dev, p_col_out_dev, l_col_out_dev, na, my_pcol, l_rqs, l_rqe, matrixrows, my_stream)
1028 else
1029 DO i = 1, na
1030 if(p_col_out(i)==my_pcol) q(l_rqs:l_rqe,l_col_out(i)) = 0
1031 enddo
1032 endif
1033
1034 ! check memory copies
1035
1036 if (usegpu) then
1037 num = na * size_of_int
1038#ifdef WITH_GPU_STREAMS
1039 my_stream = obj%gpu_setup%my_stream
1040 successgpu = gpu_memcpy_async(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), &
1041 num, gpumemcpyhosttodevice, my_stream)
1042 check_memcpy_gpu("merge_systems: ", successgpu)
1043#else
1044 successgpu = gpu_memcpy(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), &
1045 num, gpumemcpyhosttodevice)
1046 check_memcpy_gpu("merge_systems: idx1_dev", successgpu)
1047#endif
1048
1049 num = na * size_of_int
1050#ifdef WITH_GPU_STREAMS
1051 my_stream = obj%gpu_setup%my_stream
1052 successgpu = gpu_memcpy_async(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
1053 num, gpumemcpyhosttodevice, my_stream)
1054 check_memcpy_gpu("merge_systems: ", successgpu)
1055#else
1056 successgpu = gpu_memcpy(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
1057 num, gpumemcpyhosttodevice)
1058 check_memcpy_gpu("merge_systems: p_col_dev", successgpu)
1059#endif
1060
1061 num = na * size_of_int
1062#ifdef WITH_GPU_STREAMS
1063 my_stream = obj%gpu_setup%my_stream
1064 successgpu = gpu_memcpy_async(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), &
1065 num, gpumemcpyhosttodevice, my_stream)
1066 check_memcpy_gpu("merge_systems: ", successgpu)
1067#else
1068 successgpu = gpu_memcpy(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), &
1069 num, gpumemcpyhosttodevice)
1070 check_memcpy_gpu("merge_systems: coltyp_dev", successgpu)
1071#endif
1072
1073 num = na*size_of_datatype
1074#ifdef WITH_GPU_STREAMS
1075 my_stream = obj%gpu_setup%my_stream
1076 successgpu = gpu_memcpy_async(ev_scale_dev, int(loc(ev_scale(1)),kind=c_intptr_t), &
1077 num, gpumemcpyhosttodevice, my_stream)
1078 check_memcpy_gpu("merge_systems: ev_scale_dev", successgpu)
1079#else
1080 successgpu = gpu_memcpy(ev_scale_dev, int(loc(ev_scale(1)),kind=c_intptr_t), &
1081 num, gpumemcpyhosttodevice)
1082 check_memcpy_gpu("merge_systems: ev_scale_dev", successgpu)
1083#endif
1084
1085 num = na*size_of_datatype
1086#ifdef WITH_GPU_STREAMS
1087 my_stream = obj%gpu_setup%my_stream
1088 successgpu = gpu_memcpy_async(z_dev, int(loc(z(1)),kind=c_intptr_t), &
1089 num, gpumemcpyhosttodevice, my_stream)
1090 check_memcpy_gpu("merge_systems: z_dev", successgpu)
1091#else
1092 successgpu = gpu_memcpy(z_dev, int(loc(z(1)),kind=c_intptr_t), &
1093 num, gpumemcpyhosttodevice)
1094 check_memcpy_gpu("merge_systems: z_dev", successgpu)
1095#endif
1096
1097 num = na*size_of_datatype
1098#ifdef WITH_GPU_STREAMS
1099 my_stream = obj%gpu_setup%my_stream
1100 successgpu = gpu_memcpy_async(d1_dev, int(loc(d1(1)),kind=c_intptr_t), &
1101 num, gpumemcpyhosttodevice, my_stream)
1102#else
1103 successgpu = gpu_memcpy(d1_dev, int(loc(d1(1)),kind=c_intptr_t), &
1104 num, gpumemcpyhosttodevice)
1105 check_memcpy_gpu("merge_systems: d1_dev", successgpu)
1106#endif
1107
1108 num = gemm_dim_l * gemm_dim_m * size_of_datatype
1109#ifdef WITH_GPU_STREAMS
1110 my_stream = obj%gpu_setup%my_stream
1111 successgpu = gpu_memcpy_async(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
1112 num, gpumemcpyhosttodevice, my_stream)
1113 check_memcpy_gpu("merge_systems: ev_dev", successgpu)
1114#else
1115 !TODO the previous loop could be possible to do on device and thus
1116 !copy less
1117 successgpu = gpu_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
1118 num, gpumemcpyhosttodevice)
1119 check_memcpy_gpu("merge_systems: ev_dev", successgpu)
1120#endif
1121
1122 num = na*size_of_datatype
1123#ifdef WITH_GPU_STREAMS
1124 my_stream = obj%gpu_setup%my_stream
1125 successgpu = gpu_memcpy_async(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), &
1126 num, gpumemcpyhosttodevice, my_stream)
1127 check_memcpy_gpu("merge_systems: dbase_dev", successgpu)
1128#else
1129 successgpu = gpu_memcpy(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), &
1130 num, gpumemcpyhosttodevice)
1131 check_memcpy_gpu("merge_systems: dbase_dev", successgpu)
1132#endif
1133
1134 num = na*size_of_datatype
1135#ifdef WITH_GPU_STREAMS
1136 successgpu = gpu_stream_synchronize(my_stream)
1137 successgpu = gpu_memcpy_async(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), &
1138 num, gpumemcpyhosttodevice, my_stream)
1139 check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)
1140#else
1141 successgpu = gpu_memcpy(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), &
1142 num, gpumemcpyhosttodevice)
1143 check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)
1144#endif
1145 endif
1146
1147 allocate(nnzu_val(na1,npc_n))
1148 allocate(nnzl_val(na1,npc_n))
1149
1150 nnzu_val(:,:) = 0
1151 nnzl_val(:,:) = 0
1152
1153 if (usegpu) then
1154 num = na1 * npc_n* size_of_int
1155 successgpu = gpu_malloc(nnzu_val_dev, num)
1156 check_alloc_gpu("merge_systems: nnzu_val_dev", successgpu)
1157
1158 num = na1 * npc_n* size_of_int
1159 successgpu = gpu_malloc(nnzl_val_dev, num)
1160 check_alloc_gpu("merge_systems: nnzl_val_dev", successgpu)
1161 endif
1162
1163
1164 np_rem = my_pcol
1165 if (usegpu) then
1166 do np = 1, npc_n
1167 if (np > 1) then
1168 if (np_rem == npc_0) then
1169 np_rem = npc_0+npc_n-1
1170 else
1171 np_rem = np_rem-1
1172 endif
1173 endif
1174 nnzu = 0
1175 nnzl = 0
1176
1177 call gpu_compute_nnzl_nnzu_val_part1 (p_col_dev, idx1_dev, coltyp_dev, nnzu_val_dev, nnzl_val_dev, &
1178 na, na1, np_rem, npc_n, nnzu_save, nnzl_save, np, my_stream)
1179
1180 enddo ! np = 1, npc_n
1181
1182
1183 nnzu_start = 0
1184 nnzl_start = 0
1185
1186 call gpu_compute_nnzl_nnzu_val_part2 (nnzu_val_dev, nnzl_val_dev, na, na1, nnzu_start, nnzl_start, npc_n, my_stream)
1187 else
1188 ! precompute nnzu_val, nnzl_val
1189 do np = 1, npc_n
1190 if (np > 1) then
1191 if (np_rem == npc_0) then
1192 np_rem = npc_0+npc_n-1
1193 else
1194 np_rem = np_rem-1
1195 endif
1196 endif
1197 nnzu = 0
1198 nnzl = 0
1199 do i=1,na1
1200 if (p_col(idx1(i)) == np_rem) then
1201 if (coltyp(idx1(i)) == 1 .or. coltyp(idx1(i)) == 2) then
1202 nnzu = nnzu+1
1203 nnzu_val(i,np) = nnzu
1204 endif
1205 if (coltyp(idx1(i)) == 3 .or. coltyp(idx1(i)) == 2) then
1206 nnzl = nnzl+1
1207 nnzl_val(i,np) = nnzl
1208 endif
1209 endif
1210 enddo
1211 enddo ! np = 1, npc_n
1212 endif
1213
1214 np_rem = my_pcol
1215
1216 ! is nnzu updated in main loop
1217 ! main loop
1218
1219 do np = 1, npc_n
1220 ! Do a ring send of qtmp1
1221
1222
1223 if (np > 1) then
1224
1225 if (np_rem == npc_0) then
1226 np_rem = npc_0+npc_n-1
1227 else
1228 np_rem = np_rem-1
1229 endif
1230
1231 if (usegpu) then
1232#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
1233 my_stream = obj%gpu_setup%my_stream
1234 call gpu_copy_qtmp1_to_qtmp1_tmp_precision (qtmp1_dev, qtmp1_tmp_dev, gemm_dim_k, gemm_dim_l, my_stream)
1235
1236 call obj%timer%start("ccl_send_recv")
1237 ccl_comm_cols = obj%gpu_setup%ccl_comm_cols
1238 successgpu = ccl_group_start() ! PETERDEBUG: should this be moved outside of the loop or deleted?
1239 if (.not.successgpu) then
1240 print *,"Error in setting up nccl_group_start!"
1241 stop
1242 endif
1243 successgpu = ccl_send(qtmp1_tmp_dev, int(l_rows*max_local_cols,kind=c_size_t), &
1244#ifdef DOUBLE_PRECISION
1245 ccldouble, &
1246#endif
1247#ifdef SINGLE_PRECISION
1248 cclfloat, &
1249#endif
1250 np_next, ccl_comm_cols, my_stream)
1251 if (.not.successgpu) then
1252 print *,"Error in nccl_send"
1253 stop
1254 endif
1255 successgpu = ccl_recv(qtmp1_dev, int(l_rows*max_local_cols,kind=c_size_t), &
1256#ifdef DOUBLE_PRECISION
1257 ccldouble, &
1258#endif
1259#ifdef SINGLE_PRECISION
1260 cclfloat, &
1261#endif
1262 np_prev, ccl_comm_cols, my_stream)
1263
1264
1265 if (.not.successgpu) then
1266 print *,"Error in ccl_send/ccl_recv"
1267 stop
1268 endif
1269 successgpu = ccl_group_end()
1270 if (.not.successgpu) then
1271 print *,"Error in setting up ccl_group_end!"
1272 stop
1273 endif
1274 successgpu = gpu_stream_synchronize(my_stream)
1275 check_stream_synchronize_gpu("trans_ev", successgpu)
1276 call obj%timer%stop("ccl_send_recv")
1277
1278#else /* defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL) */
1279
1280#ifdef WITH_MPI
1281 call obj%timer%start("mpi_communication")
1282#ifdef WITH_GPU_STREAMS
1283 my_stream = obj%gpu_setup%my_stream
1284 successgpu = gpu_stream_synchronize(my_stream)
1285 check_stream_synchronize_gpu("merge_systems qtmp1_dev", successgpu)
1286
1287 successgpu = gpu_memcpy_async(int(loc(qtmp1(1,1)),kind=c_intptr_t), qtmp1_dev, &
1288 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpydevicetohost, my_stream)
1289 check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
1290
1291 my_stream = obj%gpu_setup%my_stream
1292 successgpu = gpu_stream_synchronize(my_stream)
1293 check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)
1294 ! synchronize streamsPerThread; maybe not neccessary
1295 successgpu = gpu_stream_synchronize()
1296 check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)
1297
1298#else
1299 successgpu = gpu_memcpy(int(loc(qtmp1(1,1)),kind=c_intptr_t), qtmp1_dev, &
1300 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpydevicetohost)
1301 check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
1302#endif
1303
1304
1305 call mpi_sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=mpi_kind), mpi_real_precision, &
1306 int(np_next,kind=mpi_kind), 1111_mpi_kind, int(np_prev,kind=mpi_kind), &
1307 1111_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
1308#ifdef WITH_GPU_STREAMS
1309 my_stream = obj%gpu_setup%my_stream
1310 successgpu = gpu_stream_synchronize(my_stream)
1311 check_stream_synchronize_gpu("merge_systems qtmp1_dev", successgpu)
1312
1313 successgpu = gpu_memcpy_async(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
1314 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpyhosttodevice, my_stream)
1315 check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
1316
1317 my_stream = obj%gpu_setup%my_stream
1318 successgpu = gpu_stream_synchronize(my_stream)
1319 check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)
1320 ! synchronize streamsPerThread; maybe not neccessary
1321 successgpu = gpu_stream_synchronize()
1322 check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)
1323
1324#else
1325 successgpu = gpu_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
1326 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpyhosttodevice)
1327 check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
1328#endif
1329 call obj%timer%stop("mpi_communication")
1330#endif /* WITH_MPI */
1331
1332#endif /* defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL) */
1333 else ! useGPU
1334#ifdef WITH_MPI
1335 call obj%timer%start("mpi_communication")
1336 call mpi_sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=mpi_kind), mpi_real_precision, &
1337 int(np_next,kind=mpi_kind), 1111_mpi_kind, int(np_prev,kind=mpi_kind), &
1338 1111_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
1339 call obj%timer%stop("mpi_communication")
1340#endif /* WITH_MPI */
1341
1342 endif ! useGPU
1343
1344 endif ! (np > 1) then
1345
1346 ! Gather the parts in d1 and z which are fitting to qtmp1.
1347 ! This also delivers nnzu/nnzl for proc np_rem
1348 nnzu = 0
1349 nnzl = 0
1350 if (usegpu) then
1351
1352 my_stream = obj%gpu_setup%my_stream
1353 call gpu_fill_tmp_arrays_precision (idx1_dev, p_col_dev, coltyp_dev, nnzu_val_dev, nnzl_val_dev, nnzul_dev, &
1354 d1u_dev, d1_dev, zu_dev, z_dev, d1l_dev, zl_dev, na, np, na1, np_rem, my_stream)
1355
1356 num = 2* size_of_int
1357#ifdef WITH_GPU_STREAMS
1358 successgpu = gpu_memcpy_async(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, &
1359 num, gpumemcpydevicetohost, my_stream)
1360 check_memcpy_gpu("merge_systems: nnzul_dev", successgpu)
1361#else
1362 successgpu = gpu_memcpy(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, &
1363 num, gpumemcpydevicetohost)
1364 check_memcpy_gpu("merge_systems: nnzl_val", successgpu)
1365#endif
1366 nnzu = nnzul(1)
1367 nnzl = nnzul(2)
1368
1369 else ! useGPU
1370 do i=1,na1
1371 if (p_col(idx1(i)) == np_rem) then
1372 if (coltyp(idx1(i)) == 1 .or. coltyp(idx1(i)) == 2) then
1373 nnzu = nnzu+1
1374 d1u(nnzu) = d1(i)
1375 zu(nnzu) = z(i)
1376 endif
1377 if (coltyp(idx1(i)) == 3 .or. coltyp(idx1(i)) == 2) then
1378 nnzl = nnzl+1
1379 d1l(nnzl) = d1(i)
1380 zl(nnzl) = z(i)
1381 endif
1382 endif
1383 enddo
1384 endif ! useGPU
1385
1386 ! Set the deflated eigenvectors in Q (comming from proc np_rem)
1387
1388
1389 ndef = max(nnzu,nnzl) ! Remote counter in input matrix
1390 if (usegpu) then
1391 ! PETERDEBUG: idx2_dev, problem with garbage values
1392 call gpu_update_ndef_c(ndef_c_dev, idx_dev, p_col_dev, idx2_dev, na, na1, np_rem, ndef, my_stream)
1393
1394 endif ! useGPU
1395
1396 ndef = max(nnzu,nnzl) ! Remote counter in input matrix
1397 if (usegpu) then
1398 call gpu_copy_qtmp1_slice_to_q_precision (q_dev, qtmp1_dev, l_col_out_dev, p_col_out_dev, ndef_c_dev, p_col_dev, &
1399 idx2_dev, idx_dev, l_rqs, l_rqe, l_rows, matrixrows, &
1400 gemm_dim_k, my_pcol, na1, np_rem, na, my_stream)
1401 else ! ! useGPU
1402 ndef = max(nnzu,nnzl) ! Remote counter in input matrix
1403 do i = 1, na
1404 j = idx(i)
1405 if (j>na1) then
1406 if (p_col(idx2(j-na1)) == np_rem) then
1407 ndef = ndef+1
1408 if (p_col_out(i) == my_pcol) then
1409 q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef)
1410 endif
1411 endif
1412 endif
1413 enddo
1414
1415 endif ! useGPU
1416
1417
1418 do ns = 0, nqcols1-1, max_strip ! strimining loop
1419 ncnt = min(max_strip,nqcols1-ns) ! number of columns in this strip
1420
1421 ! Get partial result from (output) Q
1422 if (usegpu) then
1423 call gpu_copy_q_slice_to_qtmp2_precision (q_dev, qtmp2_dev, idxq1_dev, l_col_out_dev, l_rows, l_rqs, l_rqe, &
1424 matrixrows, matrixcols, gemm_dim_k, gemm_dim_m, ns, &
1425 ncnt, ind_ex, ind_ex2, na, my_stream)
1426 else ! useGPU
1427!$omp PARALLEL DO &
1428!$omp default(none) &
1429!$omp private(i, j, k) &
1430!$omp SHARED(ns, q, l_rqs, l_rqe, l_col_out, idxq1, qtmp2, l_rows, ncnt)
1431 do i = 1, ncnt
1432 j = idxq1(i+ns)
1433 k = l_col_out(j)
1434 qtmp2(1:l_rows,i) = q(l_rqs:l_rqe, k)
1435 enddo
1436!$OMP END PARALLEL DO
1437 endif ! useGPU
1438
1439 ! Compute eigenvectors of the rank-1 modified matrix.
1440 ! Parts for multiplying with upper half of Q:
1441 if (usegpu) then
1442 if (nnzu .ge. 1) then
1443 ! Calculate the j-th eigenvector of the deflated system
1444 ! See above why we are doing it this way!
1445 call gpu_fill_ev_precision (ev_dev, d1u_dev, dbase_dev, ddiff_dev, zu_dev, ev_scale_dev, idxq1_dev, idx_dev, &
1446 na, gemm_dim_l, gemm_dim_m, nnzu, ns, ncnt, my_stream)
1447 endif ! nnzu
1448
1449 else ! useGPU
1450!$omp PARALLEL DO &
1451!$omp default(none) &
1452!$omp private(i, j, k, tmp) &
1453!$omp shared(ncnt, nnzu, idx, idxq1, ns, d1u, dbase, ddiff, zu, ev_scale, ev)
1454 do i = 1, ncnt
1455 do k = 1, nnzu
1456 j = idx(idxq1(i+ns))
1457 ! Calculate the j-th eigenvector of the deflated system
1458 ! See above why we are doing it this way!
1459
1460 ! kernel here
1461 tmp(k) = d1u(k) - dbase(j)
1462 tmp(k) = tmp(k) + ddiff(j)
1463 ev(k,i) = zu(k) / tmp(k) * ev_scale(j)
1464 enddo
1465 enddo
1466!$OMP END PARALLEL DO
1467 endif ! useGPU
1468
1469
1470 ! Multiply old Q with eigenvectors (upper half)
1471
1472 if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) then
1473 if (usegpu) then
1474 call obj%timer%start("gpublas_gemm")
1475 gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
1476 call gpublas_precision_gemm('N', 'N', l_rnm, ncnt, nnzu, &
1477 1.0_rk, qtmp1_dev, ubound(qtmp1,dim=1), &
1478 ev_dev, ubound(ev,dim=1), &
1479 1.0_rk, qtmp2_dev, ubound(qtmp2,dim=1), gpuhandle)
1480 if (wantdebug) successgpu = gpu_devicesynchronize()
1481 call obj%timer%stop("gpublas_gemm")
1482 else ! useGPU
1483 call obj%timer%start("blas_gemm")
1484 call precision_gemm('N', 'N', int(l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &
1485 int(nnzu,kind=blas_kind), &
1486 1.0_rk, qtmp1, int(ubound(qtmp1,dim=1),kind=blas_kind), &
1487 ev, int(ubound(ev,dim=1),kind=blas_kind), &
1488 1.0_rk, qtmp2(1,1), int(ubound(qtmp2,dim=1),kind=blas_kind))
1489 call obj%timer%stop("blas_gemm")
1490 endif ! useGPU
1491 endif ! (l_rnm>0 .and. ncnt>0 .and. nnzu>0) then
1492
1493
1494
1495
1496 ! Compute eigenvectors of the rank-1 modified matrix.
1497 ! Parts for multiplying with lower half of Q:
1498
1499
1500 if (usegpu) then
1501 if (nnzl .ge. 1) then
1502 call gpu_fill_ev_precision (ev_dev, d1l_dev, dbase_dev, ddiff_dev, zl_dev, ev_scale_dev, idxq1_dev, idx_dev, &
1503 na, gemm_dim_l, gemm_dim_m, nnzl, ns, ncnt, my_stream)
1504 endif
1505 else ! useGPU
1506!$omp PARALLEL DO &
1507!$omp private(i, j, k, tmp)
1508 do i = 1, ncnt
1509 do k = 1, nnzl
1510 j = idx(idxq1(i+ns))
1511 ! Calculate the j-th eigenvector of the deflated system
1512 ! See above why we are doing it this way!
1513 tmp(k) = d1l(k) - dbase(j)
1514 tmp(k) = tmp(k) + ddiff(j)
1515 ev(k,i) = zl(k) / tmp(k) * ev_scale(j)
1516 enddo
1517 enddo
1518!$OMP END PARALLEL DO
1519 endif ! useGPU
1520
1521
1522 ! Multiply old Q with eigenvectors (lower half)
1523
1524 if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) then
1525 if (usegpu) then
1526 call obj%timer%start("gpublas_gemm")
1527 gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
1528 call gpublas_precision_gemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, &
1529 1.0_rk, qtmp1_dev + l_rnm * size_of_datatype, ubound(qtmp1,dim=1), &
1530 ev_dev, ubound(ev,dim=1), &
1531 1.0_rk, qtmp2_dev + l_rnm * size_of_datatype, ubound(qtmp2,dim=1), gpuhandle)
1532 if (wantdebug) successgpu = gpu_devicesynchronize()
1533 call obj%timer%stop("gpublas_gemm")
1534 else ! useGPU
1535 call obj%timer%start("blas_gemm")
1536 call precision_gemm('N', 'N', int(l_rows-l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &
1537 int(nnzl,kind=blas_kind), &
1538 1.0_rk, qtmp1(l_rnm+1,1), int(ubound(qtmp1,dim=1),kind=blas_kind), &
1539 ev, int(ubound(ev,dim=1),kind=blas_kind), &
1540 1.0_rk, qtmp2(l_rnm+1,1), int(ubound(qtmp2,dim=1),kind=blas_kind))
1541 call obj%timer%stop("blas_gemm")
1542 endif ! useGPU
1543 endif
1544
1545
1546 ! Put partial result into (output) Q
1547 if (usegpu) then
1548 call gpu_copy_qtmp2_slice_to_q_precision(q_dev, qtmp2_dev, idxq1_dev, l_col_out_dev, l_rqs, l_rqe, l_rows, ncnt, &
1549 gemm_dim_k, matrixrows, ns, my_stream)
1550 else ! useGPU
1551!$omp PARALLEL DO &
1552!$omp default(none) &
1553!$omp private(i) &
1554!$omp SHARED(q, ns, l_rqs, l_rqe, l_col_out, idxq1, qtmp2, l_rows, ncnt)
1555 do i = 1, ncnt
1556 q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i)
1557 enddo
1558!$OMP END PARALLEL DO
1559 endif ! useGPU
1560
1561
1562 enddo !ns = 0, nqcols1-1, max_strip ! strimining loop
1563 enddo !do np = 1, npc_n
1564
1565
1566 deallocate(nnzu_val, nnzl_val)
1567
1568
1569 if (usegpu) then
1570
1571 num = matrixrows*matrixcols*size_of_datatype
1572#ifdef WITH_GPU_STREAMS
1573 successgpu = gpu_memcpy_async(int(loc(q(1,1)),kind=c_intptr_t), q_dev, &
1574 num, gpumemcpydevicetohost, my_stream)
1575 check_memcpy_gpu("merge_systems: q_dev", successgpu)
1576
1577 my_stream = obj%gpu_setup%my_stream
1578 successgpu = gpu_stream_synchronize(my_stream)
1579 check_stream_synchronize_gpu("merge_systems: q_dev", successgpu)
1580#else
1581 successgpu = gpu_memcpy(int(loc(q(1,1)),kind=c_intptr_t), q_dev, &
1582 num, gpumemcpydevicetohost)
1583 check_memcpy_gpu("merge_systems: q_dev", successgpu)
1584#endif
1585#ifdef WITH_GPU_STREAMS
1586 successgpu = gpu_memcpy_async(int(loc(ev(1,1)),kind=c_intptr_t), ev_dev, &
1587 gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpydevicetohost, my_stream)
1588 check_memcpy_gpu("merge_systems: ev_dev", successgpu)
1589
1590 successgpu = gpu_stream_synchronize(my_stream)
1591 check_stream_synchronize_gpu("merge_systems: ev_dev", successgpu)
1592#else
1593 !TODO the previous loop could be possible to do on device and thus
1594 !copy less
1595 successgpu = gpu_memcpy(int(loc(ev(1,1)),kind=c_intptr_t), ev_dev, &
1596 gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpydevicetohost)
1597 check_memcpy_gpu("merge_systems: ev_dev", successgpu)
1598#endif
1599 endif
1600
1601 if (usegpu) then
1602 deallocate(ndef_c, stat=istat, errmsg=errormessage)
1603 check_deallocate("merge_systems: ndef_c",istat, errormessage)
1604 endif
1605
1606 if (usegpu) then
1607 successgpu = gpu_free(nnzul_dev)
1608 check_dealloc_gpu("merge_systems: nnzul_dev", successgpu)
1609
1610 successgpu = gpu_free(l_col_dev)
1611 check_dealloc_gpu("merge_systems: l_col_dev", successgpu)
1612
1613 successgpu = gpu_free(ndef_c_dev)
1614 check_dealloc_gpu("merge_systems: ndef_c_dev", successgpu)
1615
1616 successgpu = gpu_free(nnzu_val_dev)
1617 check_dealloc_gpu("merge_systems: nnzu_val_dev", successgpu)
1618
1619 successgpu = gpu_free(nnzl_val_dev)
1620 check_dealloc_gpu("merge_systems: nnzl_val_dev", successgpu)
1621
1622 successgpu = gpu_free(idx1_dev)
1623 check_dealloc_gpu("merge_systems: idx1_dev", successgpu)
1624
1625 successgpu = gpu_free(idx2_dev)
1626 check_dealloc_gpu("merge_systems: idx2_dev", successgpu)
1627
1628 successgpu = gpu_free(p_col_dev)
1629 check_dealloc_gpu("merge_systems: p_col_dev", successgpu)
1630
1631 successgpu = gpu_free(p_col_out_dev)
1632 check_dealloc_gpu("merge_systems: p_col_out_dev", successgpu)
1633
1634 successgpu = gpu_free(coltyp_dev)
1635 check_dealloc_gpu("merge_systems: coltyp_dev", successgpu)
1636
1637 successgpu = gpu_free(idx_dev)
1638 check_dealloc_gpu("merge_systems: idx_dev", successgpu)
1639
1640 successgpu = gpu_free(l_col_out_dev)
1641 check_dealloc_gpu("merge_systems: l_col_out_dev", successgpu)
1642
1643 successgpu = gpu_free(idxq1_dev)
1644 check_dealloc_gpu("merge_systems: ", successgpu)
1645
1646 successgpu = gpu_free(q_dev)
1647 check_dealloc_gpu("merge_systems: q_dev", successgpu)
1648
1649 successgpu = gpu_free(d1_dev)
1650 check_dealloc_gpu("merge_systems: d1_dev", successgpu)
1651
1652 successgpu = gpu_free(z_dev)
1653 check_dealloc_gpu("merge_systems: z_dev", successgpu)
1654
1655 successgpu = gpu_free(d1u_dev)
1656 check_dealloc_gpu("merge_systems: d1u_dev", successgpu)
1657
1658 successgpu = gpu_free(dbase_dev)
1659 check_dealloc_gpu("merge_systems: dbase_dev", successgpu)
1660
1661 successgpu = gpu_free(ddiff_dev)
1662 check_dealloc_gpu("merge_systems: ddiff_dev", successgpu)
1663
1664 successgpu = gpu_free(zu_dev)
1665 check_dealloc_gpu("merge_systems: zu_dev", successgpu)
1666
1667 successgpu = gpu_free(ev_scale_dev)
1668 check_dealloc_gpu("merge_systems: ev_scale_dev", successgpu)
1669
1670 successgpu = gpu_free(d1l_dev)
1671 check_dealloc_gpu("merge_systems: d1l_dev", successgpu)
1672
1673 successgpu = gpu_free(zl_dev)
1674 check_dealloc_gpu("merge_systems: zl_dev", successgpu)
1675
1676
1677#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
1678 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
1679 successgpu = gpu_host_unregister(int(loc(qtmp1),kind=c_intptr_t))
1680 check_host_unregister_gpu("merge_systems: qtmp1", successgpu)
1681 endif
1682#endif
1683 successgpu = gpu_free(qtmp1_dev)
1684 check_dealloc_gpu("merge_systems: qtmp1_dev", successgpu)
1685
1686 successgpu = gpu_free(qtmp1_tmp_dev)
1687 check_dealloc_gpu("merge_systems: qtmp1_tmp_dev", successgpu)
1688
1689#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
1690 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
1691 successgpu = gpu_host_unregister(int(loc(qtmp2),kind=c_intptr_t))
1692 check_host_unregister_gpu("merge_systems: qtmp2", successgpu)
1693 endif
1694#endif
1695 successgpu = gpu_free(qtmp2_dev)
1696 check_dealloc_gpu("merge_systems: qtmp2_dev", successgpu)
1697
1698#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
1699 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
1700 successgpu = gpu_host_unregister(int(loc(ev),kind=c_intptr_t))
1701 check_host_unregister_gpu("merge_systems: ev", successgpu)
1702 endif
1703#endif
1704 successgpu = gpu_free(ev_dev)
1705 check_dealloc_gpu("merge_systems: ev_dev", successgpu)
1706 endif ! useGPU
1707
1708 deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errormessage)
1709 check_deallocate("merge_systems: ev, qtmp1, qtmp2",istat, errormessage)
1710 endif !very outer test (na1==1 .or. na1==2)
1711#ifdef WITH_OPENMP_TRADITIONAL
1712 deallocate(z_p, stat=istat, errmsg=errormessage)
1713 check_deallocate("merge_systems: z_p",istat, errormessage)
1714#endif
1715
1716 call obj%timer%stop("merge_systems" // precision_suffix)
1717
1718 return
1719
1720 end
Definition mod_add_tmp.F90:3
Definition mod_check_monotony.F90:3
Fortran module to provide an abstract definition of the implementation. Do not use directly....
Definition elpa_abstract_impl.F90:50
Definition mod_global_gather.F90:3
Definition mod_global_product.F90:3
Definition mod_resort_ev.F90:3
Definition mod_solve_secular_equation.F90:55
Definition mod_transform_columns.F90:3
Definition mod_v_add_s.F90:55
Definition elpa_abstract_impl.F90:73