Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2025.01.002
Loading...
Searching...
No Matches
merge_systems_template.F90
Go to the documentation of this file.
1#if 0
2! This file is part of ELPA.
3!
4! The ELPA library was originally created by the ELPA consortium,
5! consisting of the following organizations:
6!
7! - Max Planck Computing and Data Facility (MPCDF), formerly known as
8! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
9! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
10! Informatik,
11! - Technische Universität München, Lehrstuhl für Informatik mit
12! Schwerpunkt Wissenschaftliches Rechnen ,
13! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
15! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
16! and
17! - IBM Deutschland GmbH
18!
19! This particular source code file contains additions, changes and
20! enhancements authored by Intel Corporation which is not part of
21! the ELPA consortium.
22!
23! More information can be found here:
24! http://elpa.mpcdf.mpg.de/
25!
26! ELPA is free software: you can redistribute it and/or modify
27! it under the terms of the version 3 of the license of the
28! GNU Lesser General Public License as published by the Free
29! Software Foundation.
30!
31! ELPA is distributed in the hope that it will be useful,
32! but WITHOUT ANY WARRANTY; without even the implied warranty of
33! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34! GNU Lesser General Public License for more details.
35!
36! You should have received a copy of the GNU Lesser General Public License
37! along with ELPA. If not, see <http://www.gnu.org/licenses/>
38!
39! ELPA reflects a substantial effort on the part of the original
40! ELPA consortium, and we ask you to respect the spirit of the
41! license that we chose: i.e., please contribute any changes you
42! may have back to the original ELPA library distribution, and keep
43! any derivatives of ELPA under the same license that we chose for
44! the original distribution, the GNU Lesser General Public License.
45!
46!
47! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
48!
49! Copyright of the original code rests with the authors inside the ELPA
50! consortium. The copyright of any additional modifications shall rest
51! with their original authors, but shall adhere to the licensing terms
52! distributed along with the original code in the file "COPYING".
53#endif
54
55#include "../general/sanity.F90"
56#include "../general/error_checking.inc"
57
58#ifdef SOLVE_TRIDI_GPU_BUILD
59 subroutine merge_systems_gpu_&
60 &precision &
61 (obj, na, nm, d, e, q, matrixrows, nqoff, nblk, matrixcols, mpi_comm_rows, mpi_comm_cols, &
62 l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, usegpu, wantdebug, success, max_threads)
63#else
64 subroutine merge_systems_cpu_&
65 &precision &
66 (obj, na, nm, d, e, q, matrixrows, nqoff, nblk, matrixcols, mpi_comm_rows, mpi_comm_cols, &
67 l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, usegpu, wantdebug, success, max_threads)
68#endif
69
70
71 use elpa_gpu
72 use, intrinsic :: iso_c_binding
73 use precision
75 use elpa_blas_interfaces
78 use resort_ev
81 use add_tmp
82 use v_add_s
83 use elpa_utilities
84 use elpa_mpi
86#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
87 use elpa_ccl_gpu
88#endif
89 use merge_systems_gpu
90
91
92#ifdef WITH_OPENMP_TRADITIONAL
93 use omp_lib
94#endif
95 implicit none
96#include "../general/precision_kinds.F90"
97 class(elpa_abstract_impl_t), intent(inout) :: obj
98 integer(kind=ik), intent(in) :: na, nm, matrixRows, nqoff, nblk, matrixCols, mpi_comm_rows, &
99 mpi_comm_cols, npc_0, npc_n
100 integer(kind=ik), intent(in) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na)
101 real(kind=real_datatype), intent(inout) :: d(na), e
102#ifdef USE_ASSUMED_SIZE
103 real(kind=real_datatype), intent(inout) :: q(matrixrows,*)
104#else
105 real(kind=real_datatype), intent(inout) :: q(matrixrows,matrixcols)
106#endif
107 logical, intent(in) :: useGPU, wantDebug
108
109 logical, intent(out) :: success
110
111 ! TODO: play with max_strip. If it was larger, matrices being multiplied
112 ! might be larger as well!
113 integer(kind=ik), parameter :: max_strip=128
114
115
116 real(kind=real_datatype) :: beta, sig, s, c, t, tau, rho, eps, tol, &
117 qtrans(2,2), dmax, zmax, d1new, d2new
118 real(kind=real_datatype) :: z(na), d1(na), d2(na), z1(na), delta(na), &
119 dbase(na), ddiff(na), ev_scale(na), tmp(na)
120 real(kind=real_datatype) :: d1u(na), zu(na), d1l(na), zl(na)
121 real(kind=real_datatype), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
122#ifdef WITH_OPENMP_TRADITIONAL
123 real(kind=real_datatype), allocatable :: z_p(:,:)
124#endif
125
126 integer(kind=ik) :: i, j, k, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &
127 l_rqm, ns, info
128 integer(kind=BLAS_KIND) :: infoBLAS
129 integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
130 l_cols_qreorg, np, l_idx, nqcols1 !, nqcols2
131 integer(kind=ik) :: nnzu_save, nnzl_save
132 integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
133 np_cols
134 integer(kind=MPI_KIND) :: mpierr
135 integer(kind=MPI_KIND) :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI
136 integer(kind=ik) :: np_next, np_prev, np_rem
137 integer(kind=ik) :: idx(na), idx1(na), idx2(na)
138 integer(kind=BLAS_KIND) :: idxBLAS(NA)
139 integer(kind=ik) :: coltyp(na), idxq1(na) !, idxq2(na)
140
141 integer(kind=ik) :: istat
142 character(200) :: errorMessage
143 integer(kind=ik) :: gemm_dim_k, gemm_dim_l, gemm_dim_m
144
145 integer(kind=c_intptr_t) :: num
146 integer(kind=C_intptr_T) :: qtmp1_dev, qtmp1_tmp_dev, qtmp2_dev, ev_dev, q_dev
147 integer(kind=c_intptr_t) :: d1u_dev, dbase_dev, ddiff_dev, zu_dev, ev_scale_dev
148 integer(kind=c_intptr_t) :: d1l_dev, zl_dev, z_dev, d1_dev
149 integer(kind=c_intptr_t) :: idx1_dev, p_col_dev, coltyp_dev, p_col_out_dev, ndef_c_dev
150 integer(kind=c_intptr_t) :: idxq1_dev, l_col_out_dev, idx_dev, idx2_dev, l_col_dev
151 integer(kind=c_intptr_t) :: nnzul_dev
152
153 integer(kind=c_intptr_t) :: nnzu_val_dev, nnzl_val_dev
154 logical :: successGPU
155 integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
156 &precision&
157 &_real
158 integer(kind=c_intptr_t) :: gpuHandle
159 integer(kind=ik), intent(in) :: max_threads
160 integer(kind=c_intptr_t) :: my_stream
161 integer(kind=ik) :: l_col_out_tmp
162 integer(kind=ik), allocatable :: nnzu_val(:,:), nnzl_val(:,:)
163 integer(kind=ik) :: nnzul(2)
164
165 integer(kind=ik) :: nnzu_start, nnzl_start
166
167 integer(kind=ik), allocatable :: ndef_c(:)
168
169 !real(kind=REAL_DATATYPE), allocatable :: qtmp11(:,:)
170 integer(kind=ik) :: ii,jj, indx, ind_ex, ind_ex2, p_col_tmp, index2, counter1, counter2
171#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
172 integer(kind=c_intptr_t) :: ccl_comm_rows, ccl_comm_cols
173#endif
174 logical :: useCCL
175#ifdef WITH_OPENMP_TRADITIONAL
176 integer(kind=ik) :: my_thread
177
178 allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errormessage)
179 check_allocate("merge_systems: z_p",istat, errormessage)
180#endif
181
182 call obj%timer%start("merge_systems" // precision_suffix)
183 success = .true.
184
185 call obj%timer%start("mpi_communication")
186 call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind) ,my_prowmpi, mpierr)
187 call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind) ,np_rowsmpi, mpierr)
188 call mpi_comm_rank(int(mpi_comm_cols,kind=mpi_kind) ,my_pcolmpi, mpierr)
189 call mpi_comm_size(int(mpi_comm_cols,kind=mpi_kind) ,np_colsmpi, mpierr)
190
191 my_prow = int(my_prowmpi,kind=c_int)
192 np_rows = int(np_rowsmpi,kind=c_int)
193 my_pcol = int(my_pcolmpi,kind=c_int)
194 np_cols = int(np_colsmpi,kind=c_int)
195
196 call obj%timer%stop("mpi_communication")
197
198
199 useccl = obj%gpu_setup%useCCL
200 ! If my processor column isn't in the requested set, do nothing
201
202 if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then
203 call obj%timer%stop("merge_systems" // precision_suffix)
204 return
205 endif
206 ! Determine number of "next" and "prev" column for ring sends
207
208 if (my_pcol == npc_0+npc_n-1) then
209 np_next = npc_0
210 else
211 np_next = my_pcol + 1
212 endif
213
214 if (my_pcol == npc_0) then
215 np_prev = npc_0+npc_n-1
216 else
217 np_prev = my_pcol - 1
218 endif
219 call check_monotony_&
220 &precision&
221 &(obj, nm,d,'Input1',wantdebug, success)
222 if (.not.(success)) then
223 call obj%timer%stop("merge_systems" // precision_suffix)
224 return
225 endif
226 call check_monotony_&
227 &precision&
228 &(obj,na-nm,d(nm+1),'Input2',wantdebug, success)
229 if (.not.(success)) then
230 call obj%timer%stop("merge_systems" // precision_suffix)
231 return
232 endif
233 ! Get global number of processors and my processor number.
234 ! Please note that my_proc does not need to match any real processor number,
235 ! it is just used for load balancing some loops.
236
237 n_procs = np_rows*npc_n
238 my_proc = my_prow*npc_n + (my_pcol-npc_0) ! Row major
239
240
241 ! Local limits of the rows of Q
242
243 l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1) ! First row of Q
244 l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1) ! Last row <= nm
245 l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1) ! Last row of Q
246
247 l_rnm = l_rqm-l_rqs+1 ! Number of local rows <= nm
248 l_rows = l_rqe-l_rqs+1 ! Total number of local rows
249
250
251 ! My number of local columns
252
253 l_cols = count(p_col(1:na)==my_pcol)
254
255 ! Get max number of local columns
256
257 max_local_cols = 0
258 do np = npc_0, npc_0+npc_n-1
259 max_local_cols = max(max_local_cols,count(p_col(1:na)==np))
260 enddo
261
262 ! Calculations start here
263
264 beta = abs(e)
265 sig = sign(1.0_rk,e)
266
267 ! Calculate rank-1 modifier z
268
269 z(:) = 0
270
271 if (mod((nqoff+nm-1)/nblk,np_rows)==my_prow) then
272 ! nm is local on my row
273 do i = 1, na
274 if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i))
275 enddo
276 endif
277
278 if (mod((nqoff+nm)/nblk,np_rows)==my_prow) then
279 ! nm+1 is local on my row
280 do i = 1, na
281 if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i))
282 enddo
283 endif
284
285 call global_gather_&
286 &precision&
287 &(obj, z, na, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
288 if (.not.(success)) then
289 write(error_unit,*) "Error in global_gather. ABorting"
290 success = .false.
291 return
292 endif
293 ! Normalize z so that norm(z) = 1. Since z is the concatenation of
294 ! two normalized vectors, norm2(z) = sqrt(2).
295 z = z/sqrt(2.0_rk)
296 rho = 2.0_rk*beta
297 ! Calculate index for merging both systems by ascending eigenvalues
298 call obj%timer%start("lapack_lamrg")
299 call precision_lamrg( int(nm,kind=blas_kind), int(na-nm,kind=blas_kind), d, &
300 1_blas_kind, 1_blas_kind, idxblas )
301 idx(:) = int(idxblas(:),kind=ik)
302 call obj%timer%stop("lapack_lamrg")
303
304 ! Calculate the allowable deflation tolerance
305
306 zmax = maxval(abs(z))
307 dmax = maxval(abs(d))
308 eps = precision_lamch( 'E' ) ! return epsilon
309 tol = 8.0_rk*eps*max(dmax,zmax)
310
311 ! If the rank-1 modifier is small enough, no more needs to be done
312 ! except to reorganize D and Q
313
314 IF ( rho*zmax <= tol ) THEN
315
316 ! Rearrange eigenvalues
317
318 tmp = d
319 do i=1,na
320 d(i) = tmp(idx(i))
321 enddo
322
323 ! Rearrange eigenvectors
324 call resort_ev_&
325 &precision &
326 (obj, idx, na, na, p_col_out, q, matrixrows, matrixcols, l_rows, l_rqe, &
327 l_rqs, mpi_comm_cols, p_col, l_col, l_col_out)
328
329 call obj%timer%stop("merge_systems" // precision_suffix)
330
331 return
332 ENDIF
333
334 ! Merge and deflate system
335
336 na1 = 0
337 na2 = 0
338
339 ! COLTYP:
340 ! 1 : non-zero in the upper half only;
341 ! 2 : dense;
342 ! 3 : non-zero in the lower half only;
343 ! 4 : deflated.
344
345 coltyp(1:nm) = 1
346 coltyp(nm+1:na) = 3
347
348 do i=1,na
349
350 if (rho*abs(z(idx(i))) <= tol) then
351
352 ! Deflate due to small z component.
353
354 na2 = na2+1
355 d2(na2) = d(idx(i))
356 idx2(na2) = idx(i)
357 coltyp(idx(i)) = 4
358
359 else if (na1>0) then
360
361 ! Check if eigenvalues are close enough to allow deflation.
362
363 s = z(idx(i))
364 c = z1(na1)
365
366 ! Find sqrt(a**2+b**2) without overflow or
367 ! destructive underflow.
368 tau = precision_lapy2( c, s )
369 t = d1(na1) - d(idx(i))
370 c = c / tau
371 s = -s / tau
372 IF ( abs( t*c*s ) <= tol ) THEN
373
374 ! Deflation is possible.
375
376 na2 = na2+1
377
378 z1(na1) = tau
379
380 d2new = d(idx(i))*c**2 + d1(na1)*s**2
381 d1new = d(idx(i))*s**2 + d1(na1)*c**2
382
383 ! D(idx(i)) >= D1(na1) and C**2 + S**2 == 1.0
384 ! This means that after the above transformation it must be
385 ! D1(na1) <= d1new <= D(idx(i))
386 ! D1(na1) <= d2new <= D(idx(i))
387 !
388 ! D1(na1) may get bigger but it is still smaller than the next D(idx(i+1))
389 ! so there is no problem with sorting here.
390 ! d2new <= D(idx(i)) which means that it might be smaller than D2(na2-1)
391 ! which makes a check (and possibly a resort) necessary.
392 !
393 ! The above relations may not hold exactly due to numeric differences
394 ! so they have to be enforced in order not to get troubles with sorting.
395
396
397 if (d1new<d1(na1) ) d1new = d1(na1)
398 if (d1new>d(idx(i))) d1new = d(idx(i))
399
400 if (d2new<d1(na1) ) d2new = d1(na1)
401 if (d2new>d(idx(i))) d2new = d(idx(i))
402
403 d1(na1) = d1new
404
405 do j=na2-1,1,-1
406 if (d2new<d2(j)) then
407 d2(j+1) = d2(j)
408 idx2(j+1) = idx2(j)
409 else
410 exit ! Loop
411 endif
412 enddo
413
414 d2(j+1) = d2new
415 idx2(j+1) = idx(i)
416
417 qtrans(1,1) = c; qtrans(1,2) =-s
418 qtrans(2,1) = s; qtrans(2,2) = c
419 call transform_columns_&
420 &precision &
421 (obj, idx(i), idx1(na1), na, tmp, l_rqs, l_rqe, &
422 q, matrixrows, matrixcols, l_rows, mpi_comm_cols, &
423 p_col, l_col, qtrans)
424 if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2
425 if (coltyp(idx(i))==3 .and. coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2
426
427 coltyp(idx(i)) = 4
428
429 else
430 na1 = na1+1
431 d1(na1) = d(idx(i))
432 z1(na1) = z(idx(i))
433 idx1(na1) = idx(i)
434 endif
435 else
436 na1 = na1+1
437 d1(na1) = d(idx(i))
438 z1(na1) = z(idx(i))
439 idx1(na1) = idx(i)
440 endif
441
442 enddo ! do i=1,na
443
444 call check_monotony_&
445 &precision&
446 &(obj, na1,d1,'Sorted1', wantdebug, success)
447 if (.not.(success)) then
448 call obj%timer%stop("merge_systems" // precision_suffix)
449 return
450 endif
451 call check_monotony_&
452 &precision&
453 &(obj, na2,d2,'Sorted2', wantdebug, success)
454 if (.not.(success)) then
455 call obj%timer%stop("merge_systems" // precision_suffix)
456 return
457 endif
458
459 if (na1==1 .or. na1==2) then
460 ! if(my_proc==0) print *,'--- Remark solve_tridi: na1==',na1,' proc==',myid
461
462 if (na1==1) then
463 d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation
464 else ! na1==2
465 call obj%timer%start("lapack_laed5_x2")
466 call precision_laed5(1_blas_kind, d1, z1, qtrans(1,1), rho, d(1))
467 call precision_laed5(2_blas_kind, d1, z1, qtrans(1,2), rho, d(2))
468 call obj%timer%stop("lapack_laed5_x2")
469 call transform_columns_&
470 &precision&
471 &(obj, idx1(1), idx1(2), na, tmp, l_rqs, l_rqe, q, &
472 matrixrows, matrixcols, l_rows, mpi_comm_cols, &
473 p_col, l_col, qtrans)
474
475 endif
476
477 ! Add the deflated eigenvalues
478 d(na1+1:na) = d2(1:na2)
479
480 ! Calculate arrangement of all eigenvalues in output
481 call obj%timer%start("lapack_lamrg")
482 call precision_lamrg( int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &
483 1_blas_kind, 1_blas_kind, idxblas )
484 idx(:) = int(idxblas(:),kind=ik)
485 call obj%timer%stop("lapack_lamrg")
486 ! Rearrange eigenvalues
487
488 tmp = d
489 do i=1,na
490 d(i) = tmp(idx(i))
491 enddo
492
493 ! Rearrange eigenvectors
494
495 do i=1,na
496 if (idx(i)<=na1) then
497 idxq1(i) = idx1(idx(i))
498 else
499 idxq1(i) = idx2(idx(i)-na1)
500 endif
501 enddo
502 call resort_ev_&
503 &precision&
504 &(obj, idxq1, na, na, p_col_out, q, matrixrows, matrixcols, l_rows, l_rqe, &
505 l_rqs, mpi_comm_cols, p_col, l_col, l_col_out)
506
507 else if (na1>2) then
508
509 ! Solve secular equation
510
511 z(1:na1) = 1
512#ifdef WITH_OPENMP_TRADITIONAL
513 z_p(1:na1,:) = 1
514#endif
515 dbase(1:na1) = 0
516 ddiff(1:na1) = 0
517
518 info = 0
519 infoblas = int(info,kind=blas_kind)
520!#ifdef WITH_OPENMP_TRADITIONAL
521!
522! call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
523!!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,infoBLAS,j)
524! my_thread = omp_get_thread_num()
525!!$OMP DO
526!#endif
527 DO i = my_proc+1, na1, n_procs ! work distributed over all processors
528 call obj%timer%start("lapack_laed4")
529 call precision_laed4(int(na1,kind=blas_kind), int(i,kind=blas_kind), d1, z1, delta, &
530 rho, s, infoblas) ! s is not used!
531 info = int(infoblas,kind=ik)
532 call obj%timer%stop("lapack_laed4")
533 if (info/=0) then
534 ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2)
535 ! use the more stable bisection algorithm in solve_secular_equation
536 ! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection'
537 call solve_secular_equation_&
538 &precision&
539 &(obj, na1, i, d1, z1, delta, rho, s)
540 endif
541
542 ! Compute updated z
543
544!#ifdef WITH_OPENMP_TRADITIONAL
545! do j=1,na1
546! if (i/=j) z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) )
547! enddo
548! z_p(i,my_thread) = z_p(i,my_thread)*delta(i)
549!#else
550 do j=1,na1
551 if (i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) )
552 enddo
553 z(i) = z(i)*delta(i)
554!#endif
555 ! store dbase/ddiff
556
557 if (i<na1) then
558 if (abs(delta(i+1)) < abs(delta(i))) then
559 dbase(i) = d1(i+1)
560 ddiff(i) = delta(i+1)
561 else
562 dbase(i) = d1(i)
563 ddiff(i) = delta(i)
564 endif
565 else
566 dbase(i) = d1(i)
567 ddiff(i) = delta(i)
568 endif
569 enddo ! i = my_proc+1, na1, n_procs
570!#ifdef WITH_OPENMP_TRADITIONAL
571!!$OMP END PARALLEL
572!
573! call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
574!
575! do i = 0, max_threads-1
576! z(1:na1) = z(1:na1)*z_p(1:na1,i)
577! enddo
578!#endif
579
580 call global_product_&
581 &precision&
582 (obj, z, na1, mpi_comm_rows, mpi_comm_cols, npc_0, npc_n, success)
583 if (.not.(success)) then
584 write(error_unit,*) "Error in global_product. Aborting..."
585 return
586 endif
587 z(1:na1) = sign( sqrt( abs( z(1:na1) ) ), z1(1:na1) )
588
589 call global_gather_&
590 &precision&
591 &(obj, dbase, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
592 if (.not.(success)) then
593 write(error_unit,*) "Error in global_gather. Aborting..."
594 return
595 endif
596 call global_gather_&
597 &precision&
598 &(obj, ddiff, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
599 if (.not.(success)) then
600 write(error_unit,*) "Error in global_gather. Aborting..."
601 return
602 endif
603 d(1:na1) = dbase(1:na1) - ddiff(1:na1)
604
605 ! Calculate scale factors for eigenvectors
606 ev_scale(:) = 0.0_rk
607
608#ifdef WITH_OPENMP_TRADITIONAL
609
610 call obj%timer%start("OpenMP parallel" // precision_suffix)
611
612!$omp PARALLEL DO &
613!$omp default(none) &
614!$omp private(i) &
615!$omp SHARED(na1, my_proc, n_procs, &
616!$OMP d1, dbase, ddiff, z, ev_scale, obj)
617
618#endif
619 DO i = my_proc+1, na1, n_procs ! work distributed over all processors
620
621 ! tmp(1:na1) = z(1:na1) / delta(1:na1,i) ! original code
622 ! tmp(1:na1) = z(1:na1) / (d1(1:na1)-d(i))! bad results
623
624 ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i)
625 ! in exactly this order, but we want to prevent compiler optimization
626! ev_scale_val = ev_scale(i)
627 call add_tmp_&
628 &precision&
629 &(obj, d1, dbase, ddiff, z, ev_scale(i), na1, i)
630! ev_scale(i) = ev_scale_val
631 enddo
632#ifdef WITH_OPENMP_TRADITIONAL
633!$OMP END PARALLEL DO
634
635 call obj%timer%stop("OpenMP parallel" // precision_suffix)
636
637#endif
638
639 call global_gather_&
640 &precision&
641 &(obj, ev_scale, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
642 if (.not.(success)) then
643 write(error_unit,*) "Error in global_gather. Aborting..."
644 return
645 endif
646 ! Add the deflated eigenvalues
647 d(na1+1:na) = d2(1:na2)
648
649 call obj%timer%start("lapack_lamrg")
650 ! Calculate arrangement of all eigenvalues in output
651 call precision_lamrg(int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &
652 1_blas_kind, 1_blas_kind, idxblas )
653 idx(:) = int(idxblas(:),kind=ik)
654 call obj%timer%stop("lapack_lamrg")
655 ! Rearrange eigenvalues
656 tmp = d
657 do i=1,na
658 d(i) = tmp(idx(i))
659 enddo
660 call check_monotony_&
661 &precision&
662 &(obj, na,d,'Output', wantdebug, success)
663
664 if (.not.(success)) then
665 call obj%timer%stop("merge_systems" // precision_suffix)
666 return
667 endif
668 ! Eigenvector calculations
669 if (usegpu) then
670 num = 2 * size_of_int
671 successgpu = gpu_malloc(nnzul_dev, num)
672 check_alloc_gpu("merge_systems: ", successgpu)
673
674 num = na * size_of_int
675 successgpu = gpu_malloc(idxq1_dev, num)
676 check_alloc_gpu("merge_systems: ", successgpu)
677
678 num = na * size_of_int
679 successgpu = gpu_malloc(idx_dev, num)
680 check_alloc_gpu("merge_systems: idx_dev", successgpu)
681
682 num = na * size_of_int
683#ifdef WITH_GPU_STREAMS
684 my_stream = obj%gpu_setup%my_stream
685 successgpu = gpu_memcpy_async(idx_dev, int(loc(idx(1)),kind=c_intptr_t), &
686 num, gpumemcpyhosttodevice, my_stream)
687 check_memcpy_gpu("merge_systems: ", successgpu)
688#else
689 successgpu = gpu_memcpy(idx_dev, int(loc(idx(1)),kind=c_intptr_t), &
690 num, gpumemcpyhosttodevice)
691 check_memcpy_gpu("merge_systems: idx_dev", successgpu)
692#endif
693 endif
694
695 ! Calculate the number of columns in the new local matrix Q
696 ! which are updated from non-deflated/deflated eigenvectors.
697 ! idxq1/2 stores the global column numbers.
698
699 !if (useGPU) then
700
701
702
703 ! !nqcols1 is needed later on host !!
704 ! ! memcopy back needed!!
705 !else
706 nqcols1 = 0 ! number of non-deflated eigenvectors
707 !nqcols2 = 0 ! number of deflated eigenvectors
708 DO i = 1, na
709 if (p_col_out(i)==my_pcol) then
710 if (idx(i)<=na1) then
711 nqcols1 = nqcols1+1
712 idxq1(nqcols1) = i
713 !else
714 !nqcols2 = nqcols2+1
715 !idxq2(nqcols2) = i
716 endif
717 endif
718 enddo
719 !endif
720
721 if (usegpu) then
722 num = na * size_of_int
723#ifdef WITH_GPU_STREAMS
724 my_stream = obj%gpu_setup%my_stream
725 successgpu = gpu_memcpy_async(idxq1_dev, int(loc(idxq1(1)),kind=c_intptr_t), &
726 num, gpumemcpyhosttodevice, my_stream)
727 check_memcpy_gpu("merge_systems: ", successgpu)
728#else
729 successgpu = gpu_memcpy(idxq1_dev, int(loc(idxq1(1)),kind=c_intptr_t), &
730 num, gpumemcpyhosttodevice)
731 check_memcpy_gpu("merge_systems: idxq1_dev", successgpu)
732#endif
733 endif
734
735
736
737 if (usegpu) then
738 allocate(ndef_c(na), stat=istat, errmsg=errormessage)
739 check_allocate("merge_systems: ndef_c",istat, errormessage)
740 endif
741
742
743 gemm_dim_k = max(1,l_rows)
744 gemm_dim_l = max_local_cols
745 gemm_dim_m = min(max_strip,max(1,nqcols1))
746
747 allocate(qtmp1(gemm_dim_k, gemm_dim_l), stat=istat, errmsg=errormessage)
748 check_allocate("merge_systems: qtmp1",istat, errormessage)
749
750 allocate(ev(gemm_dim_l,gemm_dim_m), stat=istat, errmsg=errormessage)
751 check_allocate("merge_systems: ev",istat, errormessage)
752
753 allocate(qtmp2(gemm_dim_k, gemm_dim_m), stat=istat, errmsg=errormessage)
754 check_allocate("merge_systems: qtmp2",istat, errormessage)
755
756 qtmp1 = 0 ! May contain empty (unset) parts
757 qtmp2 = 0 ! Not really needed
758
759
760 ! check memory copies
761
762 if (usegpu) then
763 num = na * size_of_int
764 successgpu = gpu_malloc(ndef_c_dev, num)
765 check_alloc_gpu("merge_systems: ndef_c_dev", successgpu)
766
767 num = na * size_of_int
768 successgpu = gpu_malloc(idx1_dev, num)
769 check_alloc_gpu("merge_systems: idx1_dev", successgpu)
770
771 num = na * size_of_int
772 successgpu = gpu_malloc(p_col_dev, num)
773 check_alloc_gpu("merge_systems: p_col_dev", successgpu)
774
775 num = na * size_of_int
776 successgpu = gpu_malloc(p_col_out_dev, num)
777 check_alloc_gpu("merge_systems: p_col_out_dev", successgpu)
778
779 num = na * size_of_int
780 successgpu = gpu_malloc(coltyp_dev, num)
781 check_alloc_gpu("merge_systems: coltyp_dev", successgpu)
782
783 num = na * size_of_int
784 successgpu = gpu_malloc(idx2_dev, num)
785 check_alloc_gpu("merge_systems: idx2_dev", successgpu)
786
787 num = na * size_of_int
788 successgpu = gpu_malloc(l_col_out_dev, num)
789 check_alloc_gpu("merge_systems: l_col_out_dev", successgpu)
790
791 num = (na) * size_of_datatype
792 successgpu = gpu_malloc(z_dev, num)
793 check_alloc_gpu("merge_systems: z_dev", successgpu)
794
795 num = (na) * size_of_datatype
796 successgpu = gpu_malloc(d1_dev, num)
797 check_alloc_gpu("merge_systems: d1_dev", successgpu)
798
799 num = (na) * size_of_datatype
800 successgpu = gpu_malloc(d1u_dev, num)
801 check_alloc_gpu("merge_systems: d1u_dev", successgpu)
802
803 num = (na) * size_of_datatype
804 successgpu = gpu_malloc(dbase_dev, num)
805 check_alloc_gpu("merge_systems: dbase_dev", successgpu)
806
807 num = (na) * size_of_datatype
808 successgpu = gpu_malloc(ddiff_dev, num)
809 check_alloc_gpu("merge_systems: ddiff_dev", successgpu)
810
811 num = (na) * size_of_datatype
812 successgpu = gpu_malloc(zu_dev, num)
813 check_alloc_gpu("merge_systems: zu_dev", successgpu)
814
815 num = (na) * size_of_datatype
816 successgpu = gpu_malloc(ev_scale_dev, num)
817 check_alloc_gpu("merge_systems: ev_scale_dev", successgpu)
818
819 num = (na) * size_of_datatype
820 successgpu = gpu_malloc(d1l_dev, num)
821 check_alloc_gpu("merge_systems: d1l_dev", successgpu)
822
823 num = (na) * size_of_datatype
824 successgpu = gpu_malloc(zl_dev, num)
825 check_alloc_gpu("merge_systems: zl_dev", successgpu)
826
827 num = (matrixrows*matrixcols) * size_of_datatype
828 successgpu = gpu_malloc(q_dev, num)
829 check_alloc_gpu("merge_systems: q_dev", successgpu)
830
831 num = (gemm_dim_k * gemm_dim_l) * size_of_datatype
832#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
833 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
834 successgpu = gpu_host_register(int(loc(qtmp1),kind=c_intptr_t),num,&
835 gpuhostregisterdefault)
836 check_host_register_gpu("merge_systems: qtmp1", successgpu)
837 endif
838#endif
839 successgpu = gpu_malloc(qtmp1_dev, num)
840 check_alloc_gpu("merge_systems: qtmp1_dev", successgpu)
841
842 successgpu = gpu_malloc(qtmp1_tmp_dev, num)
843 check_alloc_gpu("merge_systems: qtmp1_tmp_dev", successgpu)
844
845 num = (gemm_dim_l * gemm_dim_m) * size_of_datatype
846#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
847 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
848 successgpu = gpu_host_register(int(loc(ev),kind=c_intptr_t),num,&
849 gpuhostregisterdefault)
850 check_host_register_gpu("merge_systems: ev", successgpu)
851 endif
852#endif
853 successgpu = gpu_malloc(ev_dev, num)
854 check_alloc_gpu("merge_systems: ev_dev", successgpu)
855
856
857 num = (gemm_dim_k * gemm_dim_m) * size_of_datatype
858#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
859 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
860 successgpu = gpu_host_register(int(loc(qtmp2),kind=c_intptr_t),num,&
861 gpuhostregisterdefault)
862 check_host_register_gpu("merge_systems: qtmp2", successgpu)
863 endif
864#endif
865 successgpu = gpu_malloc(qtmp2_dev, num)
866 check_alloc_gpu("merge_systems: qtmp2_dev", successgpu)
867 endif !useGPU
868
869 ! Gather nonzero upper/lower components of old matrix Q
870 ! which are needed for multiplication with new eigenvectors
871
872
873 ! kernel compute nnzu on device
874 nnzu = 0
875 nnzl = 0
876 do i = 1, na1
877 l_idx = l_col(idx1(i))
878 if (p_col(idx1(i))==my_pcol) then
879 if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then
880 nnzu = nnzu+1
881 qtmp1(1:l_rnm,nnzu) = q(l_rqs:l_rqm,l_idx)
882 endif
883 if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then
884 nnzl = nnzl+1
885 qtmp1(l_rnm+1:l_rows,nnzl) = q(l_rqm+1:l_rqe,l_idx)
886 endif
887 endif
888 enddo
889
890 if (usegpu) then
891
892 num = gemm_dim_k * gemm_dim_l * size_of_datatype
893#ifdef WITH_GPU_STREAMS
894 my_stream = obj%gpu_setup%my_stream
895 successgpu = gpu_memcpy_async(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
896 num, gpumemcpyhosttodevice, my_stream)
897 check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
898#else
899 successgpu = gpu_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
900 num, gpumemcpyhosttodevice)
901 check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
902#endif
903
904
905 num = matrixrows*matrixcols*size_of_datatype
906#ifdef WITH_GPU_STREAMS
907 successgpu = gpu_memcpy_async(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
908 num, gpumemcpyhosttodevice, my_stream)
909 check_memcpy_gpu("merge_systems: q_dev", successgpu)
910#else
911 successgpu = gpu_memcpy(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
912 num, gpumemcpyhosttodevice)
913 check_memcpy_gpu("merge_systems: q_dev", successgpu)
914#endif
915
916
917
918 num = na * size_of_int
919 successgpu = gpu_malloc(l_col_dev, num)
920 check_alloc_gpu("merge_systems: l_col_dev", successgpu)
921
922 num = na * size_of_int
923#ifdef WITH_GPU_STREAMS
924 my_stream = obj%gpu_setup%my_stream
925 successgpu = gpu_memcpy_async(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), &
926 num, gpumemcpyhosttodevice, my_stream)
927 check_memcpy_gpu("merge_systems: ", successgpu)
928#else
929 successgpu = gpu_memcpy(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), &
930 num, gpumemcpyhosttodevice)
931 check_memcpy_gpu("merge_systems: l_col_dev", successgpu)
932#endif
933 endif
934
935 ! Gather deflated eigenvalues behind nonzero components
936
937 ! compute ndef on device
938 ndef = max(nnzu,nnzl)
939
940 if (usegpu) then
941 num = na * size_of_int
942#ifdef WITH_GPU_STREAMS
943 my_stream = obj%gpu_setup%my_stream
944 successgpu = gpu_memcpy_async(idx2_dev, int(loc(idx2(1)),kind=c_intptr_t), &
945 num, gpumemcpyhosttodevice, my_stream)
946 check_memcpy_gpu("merge_systems: ", successgpu)
947#else
948 successgpu = gpu_memcpy(idx2_dev, int(loc(idx2(1)),kind=c_intptr_t), &
949 num, gpumemcpyhosttodevice)
950 check_memcpy_gpu("merge_systems: idx2_dev", successgpu)
951#endif
952 num = na * size_of_int
953#ifdef WITH_GPU_STREAMS
954 my_stream = obj%gpu_setup%my_stream
955
956 successgpu = gpu_memcpy_async(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
957 num, gpumemcpyhosttodevice, my_stream)
958 check_memcpy_gpu("merge_systems: ", successgpu)
959#else
960 successgpu = gpu_memcpy(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
961 num, gpumemcpyhosttodevice)
962 check_memcpy_gpu("merge_systems: p_col_dev", successgpu)
963#endif
964 endif
965
966
967
968 if (usegpu) then
969 ndef_c(:) = ndef
970
971 num = na * size_of_int
972#ifdef WITH_GPU_STREAMS
973 my_stream = obj%gpu_setup%my_stream
974 successgpu = gpu_memcpy_async(ndef_c_dev, int(loc(ndef_c(1)),kind=c_intptr_t), &
975 num, gpumemcpyhosttodevice, my_stream)
976 check_memcpy_gpu("merge_systems: ndef_c_dev 4", successgpu)
977
978#else
979 successgpu = gpu_memcpy(ndef_c_dev, int(loc(ndef_c(1)),kind=c_intptr_t), &
980 num, gpumemcpyhosttodevice)
981 check_memcpy_gpu("merge_systems: ndef_c_dev", successgpu)
982#endif
983
984 call gpu_copy_q_slice_to_qtmp1_precision (qtmp1_dev, q_dev, ndef_c_dev, l_col_dev, idx2_dev, p_col_dev, na2, na, &
985 my_pcol, l_rows, l_rqs, l_rqe, matrixrows, gemm_dim_k, my_stream)
986 else
987 do i = 1, na2
988 l_idx = l_col(idx2(i))
989 if (p_col(idx2(i))==my_pcol) then
990 ndef = ndef+1
991 qtmp1(1:l_rows,ndef) = q(l_rqs:l_rqe,l_idx)
992 endif
993 enddo
994 endif
995
996 l_cols_qreorg = ndef ! Number of columns in reorganized matrix
997 if (usegpu) then
998 num = na * size_of_int
999#ifdef WITH_GPU_STREAMS
1000 my_stream = obj%gpu_setup%my_stream
1001 successgpu = gpu_memcpy_async(p_col_out_dev, int(loc(p_col_out(1)),kind=c_intptr_t), &
1002 num, gpumemcpyhosttodevice, my_stream)
1003 check_memcpy_gpu("merge_systems: ", successgpu)
1004
1005#else
1006 successgpu = gpu_memcpy(p_col_out_dev, int(loc(p_col_out(1)),kind=c_intptr_t), &
1007 num, gpumemcpyhosttodevice)
1008 check_memcpy_gpu("merge_systems: p_col_out_dev", successgpu)
1009#endif
1010 num = na * size_of_int
1011#ifdef WITH_GPU_STREAMS
1012 my_stream = obj%gpu_setup%my_stream
1013 successgpu = gpu_memcpy_async(l_col_out_dev, int(loc(l_col_out(1)),kind=c_intptr_t), &
1014 num, gpumemcpyhosttodevice, my_stream)
1015 check_memcpy_gpu("merge_systems: l_col_out_dev", successgpu)
1016
1017#else
1018 successgpu = gpu_memcpy(l_col_out_dev, int(loc(l_col_out(1)),kind=c_intptr_t), &
1019 num, gpumemcpyhosttodevice)
1020 check_memcpy_gpu("merge_systems: l_col_out_dev", successgpu)
1021#endif
1022 endif
1023
1024
1025 ! Set (output) Q to 0, it will sum up new Q
1026
1027
1028 if (usegpu) then
1029 call gpu_zero_q_precision (q_dev, p_col_out_dev, l_col_out_dev, na, my_pcol, l_rqs, l_rqe, matrixrows, my_stream)
1030 else
1031 DO i = 1, na
1032 if(p_col_out(i)==my_pcol) q(l_rqs:l_rqe,l_col_out(i)) = 0
1033 enddo
1034 endif
1035
1036 ! check memory copies
1037
1038 if (usegpu) then
1039 num = na * size_of_int
1040#ifdef WITH_GPU_STREAMS
1041 my_stream = obj%gpu_setup%my_stream
1042 successgpu = gpu_memcpy_async(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), &
1043 num, gpumemcpyhosttodevice, my_stream)
1044 check_memcpy_gpu("merge_systems: ", successgpu)
1045#else
1046 successgpu = gpu_memcpy(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), &
1047 num, gpumemcpyhosttodevice)
1048 check_memcpy_gpu("merge_systems: idx1_dev", successgpu)
1049#endif
1050
1051 num = na * size_of_int
1052#ifdef WITH_GPU_STREAMS
1053 my_stream = obj%gpu_setup%my_stream
1054 successgpu = gpu_memcpy_async(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
1055 num, gpumemcpyhosttodevice, my_stream)
1056 check_memcpy_gpu("merge_systems: ", successgpu)
1057#else
1058 successgpu = gpu_memcpy(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
1059 num, gpumemcpyhosttodevice)
1060 check_memcpy_gpu("merge_systems: p_col_dev", successgpu)
1061#endif
1062
1063 num = na * size_of_int
1064#ifdef WITH_GPU_STREAMS
1065 my_stream = obj%gpu_setup%my_stream
1066 successgpu = gpu_memcpy_async(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), &
1067 num, gpumemcpyhosttodevice, my_stream)
1068 check_memcpy_gpu("merge_systems: ", successgpu)
1069#else
1070 successgpu = gpu_memcpy(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), &
1071 num, gpumemcpyhosttodevice)
1072 check_memcpy_gpu("merge_systems: coltyp_dev", successgpu)
1073#endif
1074
1075 num = na*size_of_datatype
1076#ifdef WITH_GPU_STREAMS
1077 my_stream = obj%gpu_setup%my_stream
1078 successgpu = gpu_memcpy_async(ev_scale_dev, int(loc(ev_scale(1)),kind=c_intptr_t), &
1079 num, gpumemcpyhosttodevice, my_stream)
1080 check_memcpy_gpu("merge_systems: ev_scale_dev", successgpu)
1081#else
1082 successgpu = gpu_memcpy(ev_scale_dev, int(loc(ev_scale(1)),kind=c_intptr_t), &
1083 num, gpumemcpyhosttodevice)
1084 check_memcpy_gpu("merge_systems: ev_scale_dev", successgpu)
1085#endif
1086
1087 num = na*size_of_datatype
1088#ifdef WITH_GPU_STREAMS
1089 my_stream = obj%gpu_setup%my_stream
1090 successgpu = gpu_memcpy_async(z_dev, int(loc(z(1)),kind=c_intptr_t), &
1091 num, gpumemcpyhosttodevice, my_stream)
1092 check_memcpy_gpu("merge_systems: z_dev", successgpu)
1093#else
1094 successgpu = gpu_memcpy(z_dev, int(loc(z(1)),kind=c_intptr_t), &
1095 num, gpumemcpyhosttodevice)
1096 check_memcpy_gpu("merge_systems: z_dev", successgpu)
1097#endif
1098
1099 num = na*size_of_datatype
1100#ifdef WITH_GPU_STREAMS
1101 my_stream = obj%gpu_setup%my_stream
1102 successgpu = gpu_memcpy_async(d1_dev, int(loc(d1(1)),kind=c_intptr_t), &
1103 num, gpumemcpyhosttodevice, my_stream)
1104#else
1105 successgpu = gpu_memcpy(d1_dev, int(loc(d1(1)),kind=c_intptr_t), &
1106 num, gpumemcpyhosttodevice)
1107 check_memcpy_gpu("merge_systems: d1_dev", successgpu)
1108#endif
1109
1110 num = gemm_dim_l * gemm_dim_m * size_of_datatype
1111#ifdef WITH_GPU_STREAMS
1112 my_stream = obj%gpu_setup%my_stream
1113 successgpu = gpu_memcpy_async(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
1114 num, gpumemcpyhosttodevice, my_stream)
1115 check_memcpy_gpu("merge_systems: ev_dev", successgpu)
1116#else
1117 !TODO the previous loop could be possible to do on device and thus
1118 !copy less
1119 successgpu = gpu_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
1120 num, gpumemcpyhosttodevice)
1121 check_memcpy_gpu("merge_systems: ev_dev", successgpu)
1122#endif
1123
1124 num = na*size_of_datatype
1125#ifdef WITH_GPU_STREAMS
1126 my_stream = obj%gpu_setup%my_stream
1127 successgpu = gpu_memcpy_async(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), &
1128 num, gpumemcpyhosttodevice, my_stream)
1129 check_memcpy_gpu("merge_systems: dbase_dev", successgpu)
1130#else
1131 successgpu = gpu_memcpy(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), &
1132 num, gpumemcpyhosttodevice)
1133 check_memcpy_gpu("merge_systems: dbase_dev", successgpu)
1134#endif
1135
1136 num = na*size_of_datatype
1137#ifdef WITH_GPU_STREAMS
1138 successgpu = gpu_stream_synchronize(my_stream)
1139 successgpu = gpu_memcpy_async(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), &
1140 num, gpumemcpyhosttodevice, my_stream)
1141 check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)
1142#else
1143 successgpu = gpu_memcpy(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), &
1144 num, gpumemcpyhosttodevice)
1145 check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)
1146#endif
1147 endif
1148
1149 allocate(nnzu_val(na1,npc_n))
1150 allocate(nnzl_val(na1,npc_n))
1151
1152 nnzu_val(:,:) = 0
1153 nnzl_val(:,:) = 0
1154
1155 if (usegpu) then
1156 num = na1 * npc_n* size_of_int
1157 successgpu = gpu_malloc(nnzu_val_dev, num)
1158 check_alloc_gpu("merge_systems: nnzu_val_dev", successgpu)
1159
1160 num = na1 * npc_n* size_of_int
1161 successgpu = gpu_malloc(nnzl_val_dev, num)
1162 check_alloc_gpu("merge_systems: nnzl_val_dev", successgpu)
1163 endif
1164
1165
1166 np_rem = my_pcol
1167 if (usegpu) then
1168 do np = 1, npc_n
1169 if (np > 1) then
1170 if (np_rem == npc_0) then
1171 np_rem = npc_0+npc_n-1
1172 else
1173 np_rem = np_rem-1
1174 endif
1175 endif
1176 nnzu = 0
1177 nnzl = 0
1178
1179 call gpu_compute_nnzl_nnzu_val_part1 (p_col_dev, idx1_dev, coltyp_dev, nnzu_val_dev, nnzl_val_dev, &
1180 na, na1, np_rem, npc_n, nnzu_save, nnzl_save, np, my_stream)
1181
1182 enddo ! np = 1, npc_n
1183
1184
1185 nnzu_start = 0
1186 nnzl_start = 0
1187
1188 call gpu_compute_nnzl_nnzu_val_part2 (nnzu_val_dev, nnzl_val_dev, na, na1, nnzu_start, nnzl_start, npc_n, my_stream)
1189 else
1190 ! precompute nnzu_val, nnzl_val
1191 do np = 1, npc_n
1192 if (np > 1) then
1193 if (np_rem == npc_0) then
1194 np_rem = npc_0+npc_n-1
1195 else
1196 np_rem = np_rem-1
1197 endif
1198 endif
1199 nnzu = 0
1200 nnzl = 0
1201 do i=1,na1
1202 if (p_col(idx1(i)) == np_rem) then
1203 if (coltyp(idx1(i)) == 1 .or. coltyp(idx1(i)) == 2) then
1204 nnzu = nnzu+1
1205 nnzu_val(i,np) = nnzu
1206 endif
1207 if (coltyp(idx1(i)) == 3 .or. coltyp(idx1(i)) == 2) then
1208 nnzl = nnzl+1
1209 nnzl_val(i,np) = nnzl
1210 endif
1211 endif
1212 enddo
1213 enddo ! np = 1, npc_n
1214 endif
1215
1216 np_rem = my_pcol
1217
1218 ! is nnzu updated in main loop
1219 ! main loop
1220
1221 do np = 1, npc_n
1222 ! Do a ring send of qtmp1
1223
1224
1225 if (np > 1) then
1226
1227 if (np_rem == npc_0) then
1228 np_rem = npc_0+npc_n-1
1229 else
1230 np_rem = np_rem-1
1231 endif
1232
1233 if (usegpu) then
1234#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
1235 if (useccl) then
1236 my_stream = obj%gpu_setup%my_stream
1237 call gpu_copy_qtmp1_to_qtmp1_tmp_precision (qtmp1_dev, qtmp1_tmp_dev, gemm_dim_k, gemm_dim_l, my_stream)
1238
1239 call obj%timer%start("ccl_send_recv")
1240 ccl_comm_cols = obj%gpu_setup%ccl_comm_cols
1241 successgpu = ccl_group_start() ! PETERDEBUG: should this be moved outside of the loop or deleted?
1242 if (.not.successgpu) then
1243 print *,"Error in setting up nccl_group_start!"
1244 stop
1245 endif
1246 successgpu = ccl_send(qtmp1_tmp_dev, int(l_rows*max_local_cols,kind=c_size_t), &
1247#ifdef DOUBLE_PRECISION
1248 ccldouble, &
1249#endif
1250#ifdef SINGLE_PRECISION
1251 cclfloat, &
1252#endif
1253 np_next, ccl_comm_cols, my_stream)
1254 if (.not.successgpu) then
1255 print *,"Error in nccl_send"
1256 stop
1257 endif
1258 successgpu = ccl_recv(qtmp1_dev, int(l_rows*max_local_cols,kind=c_size_t), &
1259#ifdef DOUBLE_PRECISION
1260 ccldouble, &
1261#endif
1262#ifdef SINGLE_PRECISION
1263 cclfloat, &
1264#endif
1265 np_prev, ccl_comm_cols, my_stream)
1266
1267
1268 if (.not.successgpu) then
1269 print *,"Error in ccl_send/ccl_recv"
1270 stop
1271 endif
1272 successgpu = ccl_group_end()
1273 if (.not.successgpu) then
1274 print *,"Error in setting up ccl_group_end!"
1275 stop
1276 endif
1277 successgpu = gpu_stream_synchronize(my_stream)
1278 check_stream_synchronize_gpu("trans_ev", successgpu)
1279 call obj%timer%stop("ccl_send_recv")
1280 else ! useCCL
1281#endif /* defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL) */
1282
1283
1284#ifdef WITH_MPI
1285 call obj%timer%start("mpi_communication")
1286#ifdef WITH_GPU_STREAMS
1287 my_stream = obj%gpu_setup%my_stream
1288 successgpu = gpu_stream_synchronize(my_stream)
1289 check_stream_synchronize_gpu("merge_systems qtmp1_dev", successgpu)
1290
1291 successgpu = gpu_memcpy_async(int(loc(qtmp1(1,1)),kind=c_intptr_t), qtmp1_dev, &
1292 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpydevicetohost, my_stream)
1293 check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
1294
1295 my_stream = obj%gpu_setup%my_stream
1296 successgpu = gpu_stream_synchronize(my_stream)
1297 check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)
1298 ! synchronize streamsPerThread; maybe not neccessary
1299 successgpu = gpu_stream_synchronize()
1300 check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)
1301
1302#else
1303 successgpu = gpu_memcpy(int(loc(qtmp1(1,1)),kind=c_intptr_t), qtmp1_dev, &
1304 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpydevicetohost)
1305 check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
1306#endif
1307
1308
1309 call mpi_sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=mpi_kind), mpi_real_precision, &
1310 int(np_next,kind=mpi_kind), 1111_mpi_kind, int(np_prev,kind=mpi_kind), &
1311 1111_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
1312#ifdef WITH_GPU_STREAMS
1313 my_stream = obj%gpu_setup%my_stream
1314 successgpu = gpu_stream_synchronize(my_stream)
1315 check_stream_synchronize_gpu("merge_systems qtmp1_dev", successgpu)
1316
1317 successgpu = gpu_memcpy_async(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
1318 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpyhosttodevice, my_stream)
1319 check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
1320
1321 my_stream = obj%gpu_setup%my_stream
1322 successgpu = gpu_stream_synchronize(my_stream)
1323 check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)
1324 ! synchronize streamsPerThread; maybe not neccessary
1325 successgpu = gpu_stream_synchronize()
1326 check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)
1327
1328#else
1329 successgpu = gpu_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
1330 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpyhosttodevice)
1331 check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
1332#endif
1333 call obj%timer%stop("mpi_communication")
1334#endif /* WITH_MPI */
1335#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
1336 endif ! useCCL
1337#endif /* defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL) */
1338 else ! useGPU
1339#ifdef WITH_MPI
1340 call obj%timer%start("mpi_communication")
1341 call mpi_sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=mpi_kind), mpi_real_precision, &
1342 int(np_next,kind=mpi_kind), 1111_mpi_kind, int(np_prev,kind=mpi_kind), &
1343 1111_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
1344 call obj%timer%stop("mpi_communication")
1345#endif /* WITH_MPI */
1346
1347 endif ! useGPU
1348
1349 endif ! (np > 1) then
1350
1351 ! Gather the parts in d1 and z which are fitting to qtmp1.
1352 ! This also delivers nnzu/nnzl for proc np_rem
1353 nnzu = 0
1354 nnzl = 0
1355 if (usegpu) then
1356
1357 my_stream = obj%gpu_setup%my_stream
1358 call gpu_fill_tmp_arrays_precision (idx1_dev, p_col_dev, coltyp_dev, nnzu_val_dev, nnzl_val_dev, nnzul_dev, &
1359 d1u_dev, d1_dev, zu_dev, z_dev, d1l_dev, zl_dev, na, np, na1, np_rem, my_stream)
1360
1361 num = 2* size_of_int
1362#ifdef WITH_GPU_STREAMS
1363 successgpu = gpu_memcpy_async(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, &
1364 num, gpumemcpydevicetohost, my_stream)
1365 check_memcpy_gpu("merge_systems: nnzul_dev", successgpu)
1366#else
1367 successgpu = gpu_memcpy(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, &
1368 num, gpumemcpydevicetohost)
1369 check_memcpy_gpu("merge_systems: nnzl_val", successgpu)
1370#endif
1371 nnzu = nnzul(1)
1372 nnzl = nnzul(2)
1373
1374 else ! useGPU
1375 do i=1,na1
1376 if (p_col(idx1(i)) == np_rem) then
1377 if (coltyp(idx1(i)) == 1 .or. coltyp(idx1(i)) == 2) then
1378 nnzu = nnzu+1
1379 d1u(nnzu) = d1(i)
1380 zu(nnzu) = z(i)
1381 endif
1382 if (coltyp(idx1(i)) == 3 .or. coltyp(idx1(i)) == 2) then
1383 nnzl = nnzl+1
1384 d1l(nnzl) = d1(i)
1385 zl(nnzl) = z(i)
1386 endif
1387 endif
1388 enddo
1389 endif ! useGPU
1390
1391 ! Set the deflated eigenvectors in Q (comming from proc np_rem)
1392
1393
1394 ndef = max(nnzu,nnzl) ! Remote counter in input matrix
1395 if (usegpu) then
1396 ! PETERDEBUG: idx2_dev, problem with garbage values
1397 call gpu_update_ndef_c(ndef_c_dev, idx_dev, p_col_dev, idx2_dev, na, na1, np_rem, ndef, my_stream)
1398
1399 endif ! useGPU
1400
1401 ndef = max(nnzu,nnzl) ! Remote counter in input matrix
1402 if (usegpu) then
1403 call gpu_copy_qtmp1_slice_to_q_precision (q_dev, qtmp1_dev, l_col_out_dev, p_col_out_dev, ndef_c_dev, p_col_dev, &
1404 idx2_dev, idx_dev, l_rqs, l_rqe, l_rows, matrixrows, &
1405 gemm_dim_k, my_pcol, na1, np_rem, na, my_stream)
1406 else ! ! useGPU
1407 ndef = max(nnzu,nnzl) ! Remote counter in input matrix
1408 do i = 1, na
1409 j = idx(i)
1410 if (j>na1) then
1411 if (p_col(idx2(j-na1)) == np_rem) then
1412 ndef = ndef+1
1413 if (p_col_out(i) == my_pcol) then
1414 q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef)
1415 endif
1416 endif
1417 endif
1418 enddo
1419
1420 endif ! useGPU
1421
1422
1423 do ns = 0, nqcols1-1, max_strip ! strimining loop
1424 ncnt = min(max_strip,nqcols1-ns) ! number of columns in this strip
1425
1426 ! Get partial result from (output) Q
1427 if (usegpu) then
1428 call gpu_copy_q_slice_to_qtmp2_precision (q_dev, qtmp2_dev, idxq1_dev, l_col_out_dev, l_rows, l_rqs, l_rqe, &
1429 matrixrows, matrixcols, gemm_dim_k, gemm_dim_m, ns, &
1430 ncnt, ind_ex, ind_ex2, na, my_stream)
1431 else ! useGPU
1432!$omp PARALLEL DO &
1433!$omp default(none) &
1434!$omp private(i, j, k) &
1435!$omp SHARED(ns, q, l_rqs, l_rqe, l_col_out, idxq1, qtmp2, l_rows, ncnt)
1436 do i = 1, ncnt
1437 j = idxq1(i+ns)
1438 k = l_col_out(j)
1439 qtmp2(1:l_rows,i) = q(l_rqs:l_rqe, k)
1440 enddo
1441!$OMP END PARALLEL DO
1442 endif ! useGPU
1443
1444 ! Compute eigenvectors of the rank-1 modified matrix.
1445 ! Parts for multiplying with upper half of Q:
1446 if (usegpu) then
1447 if (nnzu .ge. 1) then
1448 ! Calculate the j-th eigenvector of the deflated system
1449 ! See above why we are doing it this way!
1450 call gpu_fill_ev_precision (ev_dev, d1u_dev, dbase_dev, ddiff_dev, zu_dev, ev_scale_dev, idxq1_dev, idx_dev, &
1451 na, gemm_dim_l, gemm_dim_m, nnzu, ns, ncnt, my_stream)
1452 endif ! nnzu
1453
1454 else ! useGPU
1455!$omp PARALLEL DO &
1456!$omp default(none) &
1457!$omp private(i, j, k, tmp) &
1458!$omp shared(ncnt, nnzu, idx, idxq1, ns, d1u, dbase, ddiff, zu, ev_scale, ev)
1459 do i = 1, ncnt
1460 do k = 1, nnzu
1461 j = idx(idxq1(i+ns))
1462 ! Calculate the j-th eigenvector of the deflated system
1463 ! See above why we are doing it this way!
1464
1465 ! kernel here
1466 tmp(k) = d1u(k) - dbase(j)
1467 tmp(k) = tmp(k) + ddiff(j)
1468 ev(k,i) = zu(k) / tmp(k) * ev_scale(j)
1469 enddo
1470 enddo
1471!$OMP END PARALLEL DO
1472 endif ! useGPU
1473
1474
1475 ! Multiply old Q with eigenvectors (upper half)
1476
1477 if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) then
1478 if (usegpu) then
1479 call obj%timer%start("gpublas_gemm")
1480 gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
1481 call gpublas_precision_gemm('N', 'N', l_rnm, ncnt, nnzu, &
1482 1.0_rk, qtmp1_dev, ubound(qtmp1,dim=1), &
1483 ev_dev, ubound(ev,dim=1), &
1484 1.0_rk, qtmp2_dev, ubound(qtmp2,dim=1), gpuhandle)
1485 if (wantdebug) successgpu = gpu_devicesynchronize()
1486 call obj%timer%stop("gpublas_gemm")
1487 else ! useGPU
1488 call obj%timer%start("blas_gemm")
1489 call precision_gemm('N', 'N', int(l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &
1490 int(nnzu,kind=blas_kind), &
1491 1.0_rk, qtmp1, int(ubound(qtmp1,dim=1),kind=blas_kind), &
1492 ev, int(ubound(ev,dim=1),kind=blas_kind), &
1493 1.0_rk, qtmp2(1,1), int(ubound(qtmp2,dim=1),kind=blas_kind))
1494 call obj%timer%stop("blas_gemm")
1495 endif ! useGPU
1496 endif ! (l_rnm>0 .and. ncnt>0 .and. nnzu>0) then
1497
1498
1499
1500
1501 ! Compute eigenvectors of the rank-1 modified matrix.
1502 ! Parts for multiplying with lower half of Q:
1503
1504
1505 if (usegpu) then
1506 if (nnzl .ge. 1) then
1507 call gpu_fill_ev_precision (ev_dev, d1l_dev, dbase_dev, ddiff_dev, zl_dev, ev_scale_dev, idxq1_dev, idx_dev, &
1508 na, gemm_dim_l, gemm_dim_m, nnzl, ns, ncnt, my_stream)
1509 endif
1510 else ! useGPU
1511!$omp PARALLEL DO &
1512!$omp private(i, j, k, tmp)
1513 do i = 1, ncnt
1514 do k = 1, nnzl
1515 j = idx(idxq1(i+ns))
1516 ! Calculate the j-th eigenvector of the deflated system
1517 ! See above why we are doing it this way!
1518 tmp(k) = d1l(k) - dbase(j)
1519 tmp(k) = tmp(k) + ddiff(j)
1520 ev(k,i) = zl(k) / tmp(k) * ev_scale(j)
1521 enddo
1522 enddo
1523!$OMP END PARALLEL DO
1524 endif ! useGPU
1525
1526
1527 ! Multiply old Q with eigenvectors (lower half)
1528
1529 if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) then
1530 if (usegpu) then
1531 call obj%timer%start("gpublas_gemm")
1532 gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
1533 call gpublas_precision_gemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, &
1534 1.0_rk, qtmp1_dev + l_rnm * size_of_datatype, ubound(qtmp1,dim=1), &
1535 ev_dev, ubound(ev,dim=1), &
1536 1.0_rk, qtmp2_dev + l_rnm * size_of_datatype, ubound(qtmp2,dim=1), gpuhandle)
1537 if (wantdebug) successgpu = gpu_devicesynchronize()
1538 call obj%timer%stop("gpublas_gemm")
1539 else ! useGPU
1540 call obj%timer%start("blas_gemm")
1541 call precision_gemm('N', 'N', int(l_rows-l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &
1542 int(nnzl,kind=blas_kind), &
1543 1.0_rk, qtmp1(l_rnm+1,1), int(ubound(qtmp1,dim=1),kind=blas_kind), &
1544 ev, int(ubound(ev,dim=1),kind=blas_kind), &
1545 1.0_rk, qtmp2(l_rnm+1,1), int(ubound(qtmp2,dim=1),kind=blas_kind))
1546 call obj%timer%stop("blas_gemm")
1547 endif ! useGPU
1548 endif
1549
1550
1551 ! Put partial result into (output) Q
1552 if (usegpu) then
1553 call gpu_copy_qtmp2_slice_to_q_precision(q_dev, qtmp2_dev, idxq1_dev, l_col_out_dev, l_rqs, l_rqe, l_rows, ncnt, &
1554 gemm_dim_k, matrixrows, ns, my_stream)
1555 else ! useGPU
1556!$omp PARALLEL DO &
1557!$omp default(none) &
1558!$omp private(i) &
1559!$omp SHARED(q, ns, l_rqs, l_rqe, l_col_out, idxq1, qtmp2, l_rows, ncnt)
1560 do i = 1, ncnt
1561 q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i)
1562 enddo
1563!$OMP END PARALLEL DO
1564 endif ! useGPU
1565
1566
1567 enddo !ns = 0, nqcols1-1, max_strip ! strimining loop
1568 enddo !do np = 1, npc_n
1569
1570
1571 deallocate(nnzu_val, nnzl_val)
1572
1573
1574 if (usegpu) then
1575
1576 num = matrixrows*matrixcols*size_of_datatype
1577#ifdef WITH_GPU_STREAMS
1578 successgpu = gpu_memcpy_async(int(loc(q(1,1)),kind=c_intptr_t), q_dev, &
1579 num, gpumemcpydevicetohost, my_stream)
1580 check_memcpy_gpu("merge_systems: q_dev", successgpu)
1581
1582 my_stream = obj%gpu_setup%my_stream
1583 successgpu = gpu_stream_synchronize(my_stream)
1584 check_stream_synchronize_gpu("merge_systems: q_dev", successgpu)
1585#else
1586 successgpu = gpu_memcpy(int(loc(q(1,1)),kind=c_intptr_t), q_dev, &
1587 num, gpumemcpydevicetohost)
1588 check_memcpy_gpu("merge_systems: q_dev", successgpu)
1589#endif
1590#ifdef WITH_GPU_STREAMS
1591 successgpu = gpu_memcpy_async(int(loc(ev(1,1)),kind=c_intptr_t), ev_dev, &
1592 gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpydevicetohost, my_stream)
1593 check_memcpy_gpu("merge_systems: ev_dev", successgpu)
1594
1595 successgpu = gpu_stream_synchronize(my_stream)
1596 check_stream_synchronize_gpu("merge_systems: ev_dev", successgpu)
1597#else
1598 !TODO the previous loop could be possible to do on device and thus
1599 !copy less
1600 successgpu = gpu_memcpy(int(loc(ev(1,1)),kind=c_intptr_t), ev_dev, &
1601 gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpydevicetohost)
1602 check_memcpy_gpu("merge_systems: ev_dev", successgpu)
1603#endif
1604 endif
1605
1606 if (usegpu) then
1607 deallocate(ndef_c, stat=istat, errmsg=errormessage)
1608 check_deallocate("merge_systems: ndef_c",istat, errormessage)
1609 endif
1610
1611 if (usegpu) then
1612 successgpu = gpu_free(nnzul_dev)
1613 check_dealloc_gpu("merge_systems: nnzul_dev", successgpu)
1614
1615 successgpu = gpu_free(l_col_dev)
1616 check_dealloc_gpu("merge_systems: l_col_dev", successgpu)
1617
1618 successgpu = gpu_free(ndef_c_dev)
1619 check_dealloc_gpu("merge_systems: ndef_c_dev", successgpu)
1620
1621 successgpu = gpu_free(nnzu_val_dev)
1622 check_dealloc_gpu("merge_systems: nnzu_val_dev", successgpu)
1623
1624 successgpu = gpu_free(nnzl_val_dev)
1625 check_dealloc_gpu("merge_systems: nnzl_val_dev", successgpu)
1626
1627 successgpu = gpu_free(idx1_dev)
1628 check_dealloc_gpu("merge_systems: idx1_dev", successgpu)
1629
1630 successgpu = gpu_free(idx2_dev)
1631 check_dealloc_gpu("merge_systems: idx2_dev", successgpu)
1632
1633 successgpu = gpu_free(p_col_dev)
1634 check_dealloc_gpu("merge_systems: p_col_dev", successgpu)
1635
1636 successgpu = gpu_free(p_col_out_dev)
1637 check_dealloc_gpu("merge_systems: p_col_out_dev", successgpu)
1638
1639 successgpu = gpu_free(coltyp_dev)
1640 check_dealloc_gpu("merge_systems: coltyp_dev", successgpu)
1641
1642 successgpu = gpu_free(idx_dev)
1643 check_dealloc_gpu("merge_systems: idx_dev", successgpu)
1644
1645 successgpu = gpu_free(l_col_out_dev)
1646 check_dealloc_gpu("merge_systems: l_col_out_dev", successgpu)
1647
1648 successgpu = gpu_free(idxq1_dev)
1649 check_dealloc_gpu("merge_systems: ", successgpu)
1650
1651 successgpu = gpu_free(q_dev)
1652 check_dealloc_gpu("merge_systems: q_dev", successgpu)
1653
1654 successgpu = gpu_free(d1_dev)
1655 check_dealloc_gpu("merge_systems: d1_dev", successgpu)
1656
1657 successgpu = gpu_free(z_dev)
1658 check_dealloc_gpu("merge_systems: z_dev", successgpu)
1659
1660 successgpu = gpu_free(d1u_dev)
1661 check_dealloc_gpu("merge_systems: d1u_dev", successgpu)
1662
1663 successgpu = gpu_free(dbase_dev)
1664 check_dealloc_gpu("merge_systems: dbase_dev", successgpu)
1665
1666 successgpu = gpu_free(ddiff_dev)
1667 check_dealloc_gpu("merge_systems: ddiff_dev", successgpu)
1668
1669 successgpu = gpu_free(zu_dev)
1670 check_dealloc_gpu("merge_systems: zu_dev", successgpu)
1671
1672 successgpu = gpu_free(ev_scale_dev)
1673 check_dealloc_gpu("merge_systems: ev_scale_dev", successgpu)
1674
1675 successgpu = gpu_free(d1l_dev)
1676 check_dealloc_gpu("merge_systems: d1l_dev", successgpu)
1677
1678 successgpu = gpu_free(zl_dev)
1679 check_dealloc_gpu("merge_systems: zl_dev", successgpu)
1680
1681
1682#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
1683 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
1684 successgpu = gpu_host_unregister(int(loc(qtmp1),kind=c_intptr_t))
1685 check_host_unregister_gpu("merge_systems: qtmp1", successgpu)
1686 endif
1687#endif
1688 successgpu = gpu_free(qtmp1_dev)
1689 check_dealloc_gpu("merge_systems: qtmp1_dev", successgpu)
1690
1691 successgpu = gpu_free(qtmp1_tmp_dev)
1692 check_dealloc_gpu("merge_systems: qtmp1_tmp_dev", successgpu)
1693
1694#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
1695 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
1696 successgpu = gpu_host_unregister(int(loc(qtmp2),kind=c_intptr_t))
1697 check_host_unregister_gpu("merge_systems: qtmp2", successgpu)
1698 endif
1699#endif
1700 successgpu = gpu_free(qtmp2_dev)
1701 check_dealloc_gpu("merge_systems: qtmp2_dev", successgpu)
1702
1703#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
1704 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
1705 successgpu = gpu_host_unregister(int(loc(ev),kind=c_intptr_t))
1706 check_host_unregister_gpu("merge_systems: ev", successgpu)
1707 endif
1708#endif
1709 successgpu = gpu_free(ev_dev)
1710 check_dealloc_gpu("merge_systems: ev_dev", successgpu)
1711 endif ! useGPU
1712
1713 deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errormessage)
1714 check_deallocate("merge_systems: ev, qtmp1, qtmp2",istat, errormessage)
1715 endif !very outer test (na1==1 .or. na1==2)
1716#ifdef WITH_OPENMP_TRADITIONAL
1717 deallocate(z_p, stat=istat, errmsg=errormessage)
1718 check_deallocate("merge_systems: z_p",istat, errormessage)
1719#endif
1720
1721 call obj%timer%stop("merge_systems" // precision_suffix)
1722
1723 return
1724
1725 end
Definition mod_add_tmp.F90:3
Definition mod_check_monotony.F90:3
Fortran module to provide an abstract definition of the implementation. Do not use directly....
Definition elpa_abstract_impl.F90:50
Definition mod_global_gather.F90:3
Definition mod_global_product.F90:3
Definition mod_resort_ev.F90:3
Definition mod_solve_secular_equation.F90:55
Definition mod_transform_columns.F90:3
Definition mod_v_add_s.F90:55
Definition elpa_abstract_impl.F90:73