Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2024.05.001
Loading...
Searching...
No Matches
merge_systems_template.F90
Go to the documentation of this file.
1#if 0
2! This file is part of ELPA.
3!
4! The ELPA library was originally created by the ELPA consortium,
5! consisting of the following organizations:
6!
7! - Max Planck Computing and Data Facility (MPCDF), formerly known as
8! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
9! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
10! Informatik,
11! - Technische Universität München, Lehrstuhl für Informatik mit
12! Schwerpunkt Wissenschaftliches Rechnen ,
13! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
15! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
16! and
17! - IBM Deutschland GmbH
18!
19! This particular source code file contains additions, changes and
20! enhancements authored by Intel Corporation which is not part of
21! the ELPA consortium.
22!
23! More information can be found here:
24! http://elpa.mpcdf.mpg.de/
25!
26! ELPA is free software: you can redistribute it and/or modify
27! it under the terms of the version 3 of the license of the
28! GNU Lesser General Public License as published by the Free
29! Software Foundation.
30!
31! ELPA is distributed in the hope that it will be useful,
32! but WITHOUT ANY WARRANTY; without even the implied warranty of
33! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34! GNU Lesser General Public License for more details.
35!
36! You should have received a copy of the GNU Lesser General Public License
37! along with ELPA. If not, see <http://www.gnu.org/licenses/>
38!
39! ELPA reflects a substantial effort on the part of the original
40! ELPA consortium, and we ask you to respect the spirit of the
41! license that we chose: i.e., please contribute any changes you
42! may have back to the original ELPA library distribution, and keep
43! any derivatives of ELPA under the same license that we chose for
44! the original distribution, the GNU Lesser General Public License.
45!
46!
47! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
48!
49! Copyright of the original code rests with the authors inside the ELPA
50! consortium. The copyright of any additional modifications shall rest
51! with their original authors, but shall adhere to the licensing terms
52! distributed along with the original code in the file "COPYING".
53#endif
54
55#include "../general/sanity.F90"
56#include "../general/error_checking.inc"
57
58 subroutine merge_systems_&
59 &precision &
60 (obj, na, nm, d, e, q, ldq, nqoff, nblk, matrixcols, mpi_comm_rows, mpi_comm_cols, &
61 l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, usegpu, wantdebug, success, max_threads)
62 use elpa_gpu
63 use, intrinsic :: iso_c_binding
64 use precision
66 use elpa_blas_interfaces
69 use resort_ev
72 use add_tmp
73 use v_add_s
74 use elpa_utilities
75 use elpa_mpi
77#ifdef WITH_OPENMP_TRADITIONAL
78 use omp_lib
79#endif
80 implicit none
81#include "../general/precision_kinds.F90"
82 class(elpa_abstract_impl_t), intent(inout) :: obj
83 integer(kind=ik), intent(in) :: na, nm, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, &
84 mpi_comm_cols, npc_0, npc_n
85 integer(kind=ik), intent(in) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na)
86 real(kind=real_datatype), intent(inout) :: d(na), e
87#ifdef USE_ASSUMED_SIZE
88 real(kind=real_datatype), intent(inout) :: q(ldq,*)
89#else
90 real(kind=real_datatype), intent(inout) :: q(ldq,matrixcols)
91#endif
92 logical, intent(in) :: useGPU, wantDebug
93
94 logical, intent(out) :: success
95
96 ! TODO: play with max_strip. If it was larger, matrices being multiplied
97 ! might be larger as well!
98 integer(kind=ik), parameter :: max_strip=128
99
100
101 real(kind=real_datatype) :: beta, sig, s, c, t, tau, rho, eps, tol, &
102 qtrans(2,2), dmax, zmax, d1new, d2new
103 real(kind=real_datatype) :: z(na), d1(na), d2(na), z1(na), delta(na), &
104 dbase(na), ddiff(na), ev_scale(na), tmp(na)
105 real(kind=real_datatype) :: d1u(na), zu(na), d1l(na), zl(na)
106 real(kind=real_datatype), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
107#ifdef WITH_OPENMP_TRADITIONAL
108 real(kind=real_datatype), allocatable :: z_p(:,:)
109#endif
110
111 integer(kind=ik) :: i, j, k, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &
112 l_rqm, ns, info
113 integer(kind=BLAS_KIND) :: infoBLAS
114 integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
115 l_cols_qreorg, np, l_idx, nqcols1, nqcols2
116 integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
117 np_cols
118 integer(kind=MPI_KIND) :: mpierr
119 integer(kind=MPI_KIND) :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI
120 integer(kind=ik) :: np_next, np_prev, np_rem
121 integer(kind=ik) :: idx(na), idx1(na), idx2(na)
122 integer(kind=BLAS_KIND) :: idxBLAS(NA)
123 integer(kind=ik) :: coltyp(na), idxq1(na), idxq2(na)
124
125 integer(kind=ik) :: istat
126 character(200) :: errorMessage
127 integer(kind=ik) :: gemm_dim_k, gemm_dim_l, gemm_dim_m
128
129 integer(kind=c_intptr_t) :: num
130 integer(kind=C_intptr_T) :: qtmp1_dev, qtmp2_dev, ev_dev
131 logical :: successGPU
132 integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
133 &precision&
134 &_real
135 integer(kind=c_intptr_t) :: gpuHandle
136 integer(kind=ik), intent(in) :: max_threads
137 integer(kind=c_intptr_t) :: my_stream
138#ifdef WITH_OPENMP_TRADITIONAL
139 integer(kind=ik) :: my_thread
140
141 allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errormessage)
142 check_allocate("merge_systems: z_p",istat, errormessage)
143#endif
144
145 call obj%timer%start("merge_systems" // precision_suffix)
146 success = .true.
147
148 call obj%timer%start("mpi_communication")
149 call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind) ,my_prowmpi, mpierr)
150 call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind) ,np_rowsmpi, mpierr)
151 call mpi_comm_rank(int(mpi_comm_cols,kind=mpi_kind) ,my_pcolmpi, mpierr)
152 call mpi_comm_size(int(mpi_comm_cols,kind=mpi_kind) ,np_colsmpi, mpierr)
153
154 my_prow = int(my_prowmpi,kind=c_int)
155 np_rows = int(np_rowsmpi,kind=c_int)
156 my_pcol = int(my_pcolmpi,kind=c_int)
157 np_cols = int(np_colsmpi,kind=c_int)
158
159 call obj%timer%stop("mpi_communication")
160
161
162 ! If my processor column isn't in the requested set, do nothing
163
164 if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then
165 call obj%timer%stop("merge_systems" // precision_suffix)
166 return
167 endif
168 ! Determine number of "next" and "prev" column for ring sends
169
170 if (my_pcol == npc_0+npc_n-1) then
171 np_next = npc_0
172 else
173 np_next = my_pcol + 1
174 endif
175
176 if (my_pcol == npc_0) then
177 np_prev = npc_0+npc_n-1
178 else
179 np_prev = my_pcol - 1
180 endif
181 call check_monotony_&
182 &precision&
183 &(obj, nm,d,'Input1',wantdebug, success)
184 if (.not.(success)) then
185 call obj%timer%stop("merge_systems" // precision_suffix)
186 return
187 endif
188 call check_monotony_&
189 &precision&
190 &(obj,na-nm,d(nm+1),'Input2',wantdebug, success)
191 if (.not.(success)) then
192 call obj%timer%stop("merge_systems" // precision_suffix)
193 return
194 endif
195 ! Get global number of processors and my processor number.
196 ! Please note that my_proc does not need to match any real processor number,
197 ! it is just used for load balancing some loops.
198
199 n_procs = np_rows*npc_n
200 my_proc = my_prow*npc_n + (my_pcol-npc_0) ! Row major
201
202
203 ! Local limits of the rows of Q
204
205 l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1) ! First row of Q
206 l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1) ! Last row <= nm
207 l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1) ! Last row of Q
208
209 l_rnm = l_rqm-l_rqs+1 ! Number of local rows <= nm
210 l_rows = l_rqe-l_rqs+1 ! Total number of local rows
211
212
213 ! My number of local columns
214
215 l_cols = count(p_col(1:na)==my_pcol)
216
217 ! Get max number of local columns
218
219 max_local_cols = 0
220 do np = npc_0, npc_0+npc_n-1
221 max_local_cols = max(max_local_cols,count(p_col(1:na)==np))
222 enddo
223
224 ! Calculations start here
225
226 beta = abs(e)
227 sig = sign(1.0_rk,e)
228
229 ! Calculate rank-1 modifier z
230
231 z(:) = 0
232
233 if (mod((nqoff+nm-1)/nblk,np_rows)==my_prow) then
234 ! nm is local on my row
235 do i = 1, na
236 if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i))
237 enddo
238 endif
239
240 if (mod((nqoff+nm)/nblk,np_rows)==my_prow) then
241 ! nm+1 is local on my row
242 do i = 1, na
243 if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i))
244 enddo
245 endif
246
247 call global_gather_&
248 &precision&
249 &(obj, z, na, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
250 if (.not.(success)) then
251 write(error_unit,*) "Error in global_gather. ABorting"
252 success = .false.
253 return
254 endif
255 ! Normalize z so that norm(z) = 1. Since z is the concatenation of
256 ! two normalized vectors, norm2(z) = sqrt(2).
257 z = z/sqrt(2.0_rk)
258 rho = 2.0_rk*beta
259 ! Calculate index for merging both systems by ascending eigenvalues
260 call obj%timer%start("lapack")
261 call precision_lamrg( int(nm,kind=blas_kind), int(na-nm,kind=blas_kind), d, &
262 1_blas_kind, 1_blas_kind, idxblas )
263 idx(:) = int(idxblas(:),kind=ik)
264 call obj%timer%stop("lapack")
265
266 ! Calculate the allowable deflation tolerance
267
268 zmax = maxval(abs(z))
269 dmax = maxval(abs(d))
270 eps = precision_lamch( 'E' ) ! return epsilon
271 tol = 8.0_rk*eps*max(dmax,zmax)
272
273 ! If the rank-1 modifier is small enough, no more needs to be done
274 ! except to reorganize D and Q
275
276 IF ( rho*zmax <= tol ) THEN
277
278 ! Rearrange eigenvalues
279
280 tmp = d
281 do i=1,na
282 d(i) = tmp(idx(i))
283 enddo
284
285 ! Rearrange eigenvectors
286 call resort_ev_&
287 &precision &
288 (obj, idx, na, na, p_col_out, q, ldq, matrixcols, l_rows, l_rqe, &
289 l_rqs, mpi_comm_cols, p_col, l_col, l_col_out)
290
291 call obj%timer%stop("merge_systems" // precision_suffix)
292
293 return
294 ENDIF
295
296 ! Merge and deflate system
297
298 na1 = 0
299 na2 = 0
300
301 ! COLTYP:
302 ! 1 : non-zero in the upper half only;
303 ! 2 : dense;
304 ! 3 : non-zero in the lower half only;
305 ! 4 : deflated.
306
307 coltyp(1:nm) = 1
308 coltyp(nm+1:na) = 3
309
310 do i=1,na
311
312 if (rho*abs(z(idx(i))) <= tol) then
313
314 ! Deflate due to small z component.
315
316 na2 = na2+1
317 d2(na2) = d(idx(i))
318 idx2(na2) = idx(i)
319 coltyp(idx(i)) = 4
320
321 else if (na1>0) then
322
323 ! Check if eigenvalues are close enough to allow deflation.
324
325 s = z(idx(i))
326 c = z1(na1)
327
328 ! Find sqrt(a**2+b**2) without overflow or
329 ! destructive underflow.
330 tau = precision_lapy2( c, s )
331 t = d1(na1) - d(idx(i))
332 c = c / tau
333 s = -s / tau
334 IF ( abs( t*c*s ) <= tol ) THEN
335
336 ! Deflation is possible.
337
338 na2 = na2+1
339
340 z1(na1) = tau
341
342 d2new = d(idx(i))*c**2 + d1(na1)*s**2
343 d1new = d(idx(i))*s**2 + d1(na1)*c**2
344
345 ! D(idx(i)) >= D1(na1) and C**2 + S**2 == 1.0
346 ! This means that after the above transformation it must be
347 ! D1(na1) <= d1new <= D(idx(i))
348 ! D1(na1) <= d2new <= D(idx(i))
349 !
350 ! D1(na1) may get bigger but it is still smaller than the next D(idx(i+1))
351 ! so there is no problem with sorting here.
352 ! d2new <= D(idx(i)) which means that it might be smaller than D2(na2-1)
353 ! which makes a check (and possibly a resort) necessary.
354 !
355 ! The above relations may not hold exactly due to numeric differences
356 ! so they have to be enforced in order not to get troubles with sorting.
357
358
359 if (d1new<d1(na1) ) d1new = d1(na1)
360 if (d1new>d(idx(i))) d1new = d(idx(i))
361
362 if (d2new<d1(na1) ) d2new = d1(na1)
363 if (d2new>d(idx(i))) d2new = d(idx(i))
364
365 d1(na1) = d1new
366
367 do j=na2-1,1,-1
368 if (d2new<d2(j)) then
369 d2(j+1) = d2(j)
370 idx2(j+1) = idx2(j)
371 else
372 exit ! Loop
373 endif
374 enddo
375
376 d2(j+1) = d2new
377 idx2(j+1) = idx(i)
378
379 qtrans(1,1) = c; qtrans(1,2) =-s
380 qtrans(2,1) = s; qtrans(2,2) = c
381 call transform_columns_&
382 &precision &
383 (obj, idx(i), idx1(na1), na, tmp, l_rqs, l_rqe, &
384 q, ldq, matrixcols, l_rows, mpi_comm_cols, &
385 p_col, l_col, qtrans)
386 if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2
387 if (coltyp(idx(i))==3 .and. coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2
388
389 coltyp(idx(i)) = 4
390
391 else
392 na1 = na1+1
393 d1(na1) = d(idx(i))
394 z1(na1) = z(idx(i))
395 idx1(na1) = idx(i)
396 endif
397 else
398 na1 = na1+1
399 d1(na1) = d(idx(i))
400 z1(na1) = z(idx(i))
401 idx1(na1) = idx(i)
402 endif
403
404 enddo
405 call check_monotony_&
406 &precision&
407 &(obj, na1,d1,'Sorted1', wantdebug, success)
408 if (.not.(success)) then
409 call obj%timer%stop("merge_systems" // precision_suffix)
410 return
411 endif
412 call check_monotony_&
413 &precision&
414 &(obj, na2,d2,'Sorted2', wantdebug, success)
415 if (.not.(success)) then
416 call obj%timer%stop("merge_systems" // precision_suffix)
417 return
418 endif
419
420 if (na1==1 .or. na1==2) then
421 ! if(my_proc==0) print *,'--- Remark solve_tridi: na1==',na1,' proc==',myid
422
423 if (na1==1) then
424 d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation
425 else ! na1==2
426 call obj%timer%start("lapack")
427 call precision_laed5(1_blas_kind, d1, z1, qtrans(1,1), rho, d(1))
428 call precision_laed5(2_blas_kind, d1, z1, qtrans(1,2), rho, d(2))
429 call obj%timer%stop("lapack")
430 call transform_columns_&
431 &precision&
432 &(obj, idx1(1), idx1(2), na, tmp, l_rqs, l_rqe, q, &
433 ldq, matrixcols, l_rows, mpi_comm_cols, &
434 p_col, l_col, qtrans)
435
436 endif
437
438 ! Add the deflated eigenvalues
439 d(na1+1:na) = d2(1:na2)
440
441 ! Calculate arrangement of all eigenvalues in output
442 call obj%timer%start("lapack")
443 call precision_lamrg( int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &
444 1_blas_kind, 1_blas_kind, idxblas )
445 idx(:) = int(idxblas(:),kind=ik)
446 call obj%timer%stop("lapack")
447 ! Rearrange eigenvalues
448
449 tmp = d
450 do i=1,na
451 d(i) = tmp(idx(i))
452 enddo
453
454 ! Rearrange eigenvectors
455
456 do i=1,na
457 if (idx(i)<=na1) then
458 idxq1(i) = idx1(idx(i))
459 else
460 idxq1(i) = idx2(idx(i)-na1)
461 endif
462 enddo
463 call resort_ev_&
464 &precision&
465 &(obj, idxq1, na, na, p_col_out, q, ldq, matrixcols, l_rows, l_rqe, &
466 l_rqs, mpi_comm_cols, p_col, l_col, l_col_out)
467
468 else if (na1>2) then
469
470 ! Solve secular equation
471
472 z(1:na1) = 1
473#ifdef WITH_OPENMP_TRADITIONAL
474 z_p(1:na1,:) = 1
475#endif
476 dbase(1:na1) = 0
477 ddiff(1:na1) = 0
478
479 info = 0
480 infoblas = int(info,kind=blas_kind)
481!#ifdef WITH_OPENMP_TRADITIONAL
482!
483! call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
484!!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,infoBLAS,j)
485! my_thread = omp_get_thread_num()
486!!$OMP DO
487!#endif
488 DO i = my_proc+1, na1, n_procs ! work distributed over all processors
489 call obj%timer%start("lapack")
490 call precision_laed4(int(na1,kind=blas_kind), int(i,kind=blas_kind), d1, z1, delta, &
491 rho, s, infoblas) ! s is not used!
492 info = int(infoblas,kind=ik)
493 call obj%timer%stop("lapack")
494 if (info/=0) then
495 ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2)
496 ! use the more stable bisection algorithm in solve_secular_equation
497 ! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection'
498 call solve_secular_equation_&
499 &precision&
500 &(obj, na1, i, d1, z1, delta, rho, s)
501 endif
502
503 ! Compute updated z
504
505!#ifdef WITH_OPENMP_TRADITIONAL
506! do j=1,na1
507! if (i/=j) z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) )
508! enddo
509! z_p(i,my_thread) = z_p(i,my_thread)*delta(i)
510!#else
511 do j=1,na1
512 if (i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) )
513 enddo
514 z(i) = z(i)*delta(i)
515!#endif
516 ! store dbase/ddiff
517
518 if (i<na1) then
519 if (abs(delta(i+1)) < abs(delta(i))) then
520 dbase(i) = d1(i+1)
521 ddiff(i) = delta(i+1)
522 else
523 dbase(i) = d1(i)
524 ddiff(i) = delta(i)
525 endif
526 else
527 dbase(i) = d1(i)
528 ddiff(i) = delta(i)
529 endif
530 enddo
531!#ifdef WITH_OPENMP_TRADITIONAL
532!!$OMP END PARALLEL
533!
534! call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
535!
536! do i = 0, max_threads-1
537! z(1:na1) = z(1:na1)*z_p(1:na1,i)
538! enddo
539!#endif
540
541 call global_product_&
542 &precision&
543 (obj, z, na1, mpi_comm_rows, mpi_comm_cols, npc_0, npc_n, success)
544 if (.not.(success)) then
545 write(error_unit,*) "Error in global_product. Aborting..."
546 return
547 endif
548 z(1:na1) = sign( sqrt( abs( z(1:na1) ) ), z1(1:na1) )
549
550 call global_gather_&
551 &precision&
552 &(obj, dbase, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
553 if (.not.(success)) then
554 write(error_unit,*) "Error in global_gather. Aborting..."
555 return
556 endif
557 call global_gather_&
558 &precision&
559 &(obj, ddiff, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
560 if (.not.(success)) then
561 write(error_unit,*) "Error in global_gather. Aborting..."
562 return
563 endif
564 d(1:na1) = dbase(1:na1) - ddiff(1:na1)
565
566 ! Calculate scale factors for eigenvectors
567 ev_scale(:) = 0.0_rk
568
569#ifdef WITH_OPENMP_TRADITIONAL
570
571 call obj%timer%start("OpenMP parallel" // precision_suffix)
572
573!$omp PARALLEL DO &
574!$omp default(none) &
575!$omp private(i) &
576!$omp SHARED(na1, my_proc, n_procs, &
577!$OMP d1, dbase, ddiff, z, ev_scale, obj)
578
579#endif
580 DO i = my_proc+1, na1, n_procs ! work distributed over all processors
581
582 ! tmp(1:na1) = z(1:na1) / delta(1:na1,i) ! original code
583 ! tmp(1:na1) = z(1:na1) / (d1(1:na1)-d(i))! bad results
584
585 ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i)
586 ! in exactly this order, but we want to prevent compiler optimization
587! ev_scale_val = ev_scale(i)
588 call add_tmp_&
589 &precision&
590 &(obj, d1, dbase, ddiff, z, ev_scale(i), na1, i)
591! ev_scale(i) = ev_scale_val
592 enddo
593#ifdef WITH_OPENMP_TRADITIONAL
594!$OMP END PARALLEL DO
595
596 call obj%timer%stop("OpenMP parallel" // precision_suffix)
597
598#endif
599
600 call global_gather_&
601 &precision&
602 &(obj, ev_scale, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
603 if (.not.(success)) then
604 write(error_unit,*) "Error in global_gather. Aborting..."
605 return
606 endif
607 ! Add the deflated eigenvalues
608 d(na1+1:na) = d2(1:na2)
609
610 call obj%timer%start("lapack")
611 ! Calculate arrangement of all eigenvalues in output
612 call precision_lamrg(int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &
613 1_blas_kind, 1_blas_kind, idxblas )
614 idx(:) = int(idxblas(:),kind=ik)
615 call obj%timer%stop("lapack")
616 ! Rearrange eigenvalues
617 tmp = d
618 do i=1,na
619 d(i) = tmp(idx(i))
620 enddo
621 call check_monotony_&
622 &precision&
623 &(obj, na,d,'Output', wantdebug, success)
624
625 if (.not.(success)) then
626 call obj%timer%stop("merge_systems" // precision_suffix)
627 return
628 endif
629 ! Eigenvector calculations
630
631
632 ! Calculate the number of columns in the new local matrix Q
633 ! which are updated from non-deflated/deflated eigenvectors.
634 ! idxq1/2 stores the global column numbers.
635
636 nqcols1 = 0 ! number of non-deflated eigenvectors
637 nqcols2 = 0 ! number of deflated eigenvectors
638 DO i = 1, na
639 if (p_col_out(i)==my_pcol) then
640 if (idx(i)<=na1) then
641 nqcols1 = nqcols1+1
642 idxq1(nqcols1) = i
643 else
644 nqcols2 = nqcols2+1
645 idxq2(nqcols2) = i
646 endif
647 endif
648 enddo
649
650 gemm_dim_k = max(1,l_rows)
651 gemm_dim_l = max_local_cols
652 gemm_dim_m = min(max_strip,max(1,nqcols1))
653
654 allocate(qtmp1(gemm_dim_k, gemm_dim_l), stat=istat, errmsg=errormessage)
655 check_allocate("merge_systems: qtmp1",istat, errormessage)
656
657 allocate(ev(gemm_dim_l,gemm_dim_m), stat=istat, errmsg=errormessage)
658 check_allocate("merge_systems: ev",istat, errormessage)
659
660 allocate(qtmp2(gemm_dim_k, gemm_dim_m), stat=istat, errmsg=errormessage)
661 check_allocate("merge_systems: qtmp2",istat, errormessage)
662
663 qtmp1 = 0 ! May contain empty (unset) parts
664 qtmp2 = 0 ! Not really needed
665
666 if (usegpu) then
667 num = (gemm_dim_k * gemm_dim_l) * size_of_datatype
668#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
669 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
670 successgpu = gpu_host_register(int(loc(qtmp1),kind=c_intptr_t),num,&
671 gpuhostregisterdefault)
672 check_host_register_gpu("merge_systems: qtmp1", successgpu)
673 endif
674#endif
675
676 successgpu = gpu_malloc(qtmp1_dev, num)
677 check_alloc_gpu("merge_systems: qtmp1_dev", successgpu)
678
679 num = (gemm_dim_l * gemm_dim_m) * size_of_datatype
680#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
681 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
682 successgpu = gpu_host_register(int(loc(ev),kind=c_intptr_t),num,&
683 gpuhostregisterdefault)
684 check_host_register_gpu("merge_systems: ev", successgpu)
685 endif
686#endif
687 successgpu = gpu_malloc(ev_dev, num)
688 check_alloc_gpu("merge_systems: ev_dev", successgpu)
689
690
691 num = (gemm_dim_k * gemm_dim_m) * size_of_datatype
692#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
693 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
694 successgpu = gpu_host_register(int(loc(qtmp2),kind=c_intptr_t),num,&
695 gpuhostregisterdefault)
696 check_host_register_gpu("merge_systems: qtmp2", successgpu)
697 endif
698#endif
699 successgpu = gpu_malloc(qtmp2_dev, num)
700 check_alloc_gpu("merge_systems: qtmp2_dev", successgpu)
701 endif !useGPU
702
703 ! Gather nonzero upper/lower components of old matrix Q
704 ! which are needed for multiplication with new eigenvectors
705
706 nnzu = 0
707 nnzl = 0
708 do i = 1, na1
709 l_idx = l_col(idx1(i))
710 if (p_col(idx1(i))==my_pcol) then
711 if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then
712 nnzu = nnzu+1
713 qtmp1(1:l_rnm,nnzu) = q(l_rqs:l_rqm,l_idx)
714 endif
715 if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then
716 nnzl = nnzl+1
717 qtmp1(l_rnm+1:l_rows,nnzl) = q(l_rqm+1:l_rqe,l_idx)
718 endif
719 endif
720 enddo
721
722 ! Gather deflated eigenvalues behind nonzero components
723
724 ndef = max(nnzu,nnzl)
725 do i = 1, na2
726 l_idx = l_col(idx2(i))
727 if (p_col(idx2(i))==my_pcol) then
728 ndef = ndef+1
729 qtmp1(1:l_rows,ndef) = q(l_rqs:l_rqe,l_idx)
730 endif
731 enddo
732
733 l_cols_qreorg = ndef ! Number of columns in reorganized matrix
734
735 ! Set (output) Q to 0, it will sum up new Q
736
737 DO i = 1, na
738 if(p_col_out(i)==my_pcol) q(l_rqs:l_rqe,l_col_out(i)) = 0
739 enddo
740
741 np_rem = my_pcol
742
743 do np = 1, npc_n
744 ! Do a ring send of qtmp1
745
746 if (np > 1) then
747
748 if (np_rem == npc_0) then
749 np_rem = npc_0+npc_n-1
750 else
751 np_rem = np_rem-1
752 endif
753#ifdef WITH_MPI
754 call obj%timer%start("mpi_communication")
755 call mpi_sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=mpi_kind), mpi_real_precision, &
756 int(np_next,kind=mpi_kind), 1111_mpi_kind, int(np_prev,kind=mpi_kind), &
757 1111_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
758 call obj%timer%stop("mpi_communication")
759#endif /* WITH_MPI */
760 endif
761
762 if (usegpu) then
763 ! copy back after sendrecv
764#ifdef WITH_GPU_STREAMS
765 my_stream = obj%gpu_setup%my_stream
766 successgpu = gpu_stream_synchronize(my_stream)
767 check_stream_synchronize_gpu("tridiag qtmp1_dev", successgpu)
768
769 successgpu = gpu_memcpy_async(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
770 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpyhosttodevice, my_stream)
771 check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
772
773 my_stream = obj%gpu_setup%my_stream
774 successgpu = gpu_stream_synchronize(my_stream)
775 check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)
776 ! synchronize streamsPerThread; maybe not neccessary
777 successgpu = gpu_stream_synchronize()
778 check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)
779
780#else
781 successgpu = gpu_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
782 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpyhosttodevice)
783 check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
784#endif
785 endif
786
787 ! Gather the parts in d1 and z which are fitting to qtmp1.
788 ! This also delivers nnzu/nnzl for proc np_rem
789
790 nnzu = 0
791 nnzl = 0
792 do i=1,na1
793 if (p_col(idx1(i)) == np_rem) then
794 if (coltyp(idx1(i)) == 1 .or. coltyp(idx1(i)) == 2) then
795 nnzu = nnzu+1
796 d1u(nnzu) = d1(i)
797 zu(nnzu) = z(i)
798 endif
799 if (coltyp(idx1(i)) == 3 .or. coltyp(idx1(i)) == 2) then
800 nnzl = nnzl+1
801 d1l(nnzl) = d1(i)
802 zl(nnzl) = z(i)
803 endif
804 endif
805 enddo
806
807 ! Set the deflated eigenvectors in Q (comming from proc np_rem)
808
809 ndef = max(nnzu,nnzl) ! Remote counter in input matrix
810 do i = 1, na
811 j = idx(i)
812 if (j>na1) then
813 if (p_col(idx2(j-na1)) == np_rem) then
814 ndef = ndef+1
815 if (p_col_out(i) == my_pcol) then
816 q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef)
817 endif
818 endif
819 endif
820 enddo
821
822 do ns = 0, nqcols1-1, max_strip ! strimining loop
823
824 ncnt = min(max_strip,nqcols1-ns) ! number of columns in this strip
825
826 ! Get partial result from (output) Q
827!$omp PARALLEL DO &
828!$omp default(none) &
829!$omp private(i, j, k) &
830!$omp SHARED(ns, q, l_rqs, l_rqe, l_col_out, idxq1, qtmp2, l_rows, ncnt)
831 do i = 1, ncnt
832 j = idxq1(i+ns)
833 k = l_col_out(j)
834 qtmp2(1:l_rows,i) = q(l_rqs:l_rqe, k)
835 enddo
836!$OMP END PARALLEL DO
837 ! Compute eigenvectors of the rank-1 modified matrix.
838 ! Parts for multiplying with upper half of Q:
839!$omp PARALLEL DO &
840!$omp default(none) &
841!$omp private(i, j, k, tmp) &
842!$omp shared(ncnt, nnzu, idx, idxq1, ns, d1u, dbase, ddiff, zu, ev_scale, ev)
843 do i = 1, ncnt
844 do k = 1, nnzu
845 j = idx(idxq1(i+ns))
846 ! Calculate the j-th eigenvector of the deflated system
847 ! See above why we are doing it this way!
848 tmp(k) = d1u(k) - dbase(j)
849 tmp(k) = tmp(k) + ddiff(j)
850 ev(k,i) = zu(k) / tmp(k) * ev_scale(j)
851 enddo
852 enddo
853!$OMP END PARALLEL DO
854
855 if (usegpu) then
856 !TODO: it should be enough to copy l_rows x ncnt
857 ! copy to device
858#ifdef WITH_GPU_STREAMS
859 successgpu = gpu_stream_synchronize(my_stream)
860 check_stream_synchronize_gpu("tridiag qtmp2_dev", successgpu)
861
862 successgpu = gpu_memcpy_async(qtmp2_dev, int(loc(qtmp2(1,1)),kind=c_intptr_t), &
863 gemm_dim_k * gemm_dim_m * size_of_datatype, gpumemcpyhosttodevice, my_stream)
864 check_memcpy_gpu("merge_systems: qtmp2_dev", successgpu)
865
866 !TODO the previous loop could be possible to do on device and thus
867 !copy less
868 successgpu = gpu_memcpy_async(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
869 gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpyhosttodevice, my_stream)
870 check_memcpy_gpu("merge_systems: ev_dev", successgpu)
871
872 successgpu = gpu_stream_synchronize(my_stream)
873 check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)
874 ! synchronize streamsPerThread; maybe not neccessary
875 successgpu = gpu_stream_synchronize()
876 check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)
877#else
878 successgpu = gpu_memcpy(qtmp2_dev, int(loc(qtmp2(1,1)),kind=c_intptr_t), &
879 gemm_dim_k * gemm_dim_m * size_of_datatype, gpumemcpyhosttodevice)
880 check_memcpy_gpu("merge_systems: qtmp2_dev", successgpu)
881
882 !TODO the previous loop could be possible to do on device and thus
883 !copy less
884 successgpu = gpu_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
885 gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpyhosttodevice)
886 check_memcpy_gpu("merge_systems: ev_dev", successgpu)
887#endif
888 endif
889
890 ! Multiply old Q with eigenvectors (upper half)
891
892 if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) then
893 if (usegpu) then
894 call obj%timer%start("gpublas")
895 gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
896 call gpublas_precision_gemm('N', 'N', l_rnm, ncnt, nnzu, &
897 1.0_rk, qtmp1_dev, ubound(qtmp1,dim=1), &
898 ev_dev, ubound(ev,dim=1), &
899 1.0_rk, qtmp2_dev, ubound(qtmp2,dim=1), gpuhandle)
900 call obj%timer%stop("gpublas")
901 else ! useGPU
902 call obj%timer%start("blas")
903 call obj%timer%start("gemm")
904 call precision_gemm('N', 'N', int(l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &
905 int(nnzu,kind=blas_kind), &
906 1.0_rk, qtmp1, int(ubound(qtmp1,dim=1),kind=blas_kind), &
907 ev, int(ubound(ev,dim=1),kind=blas_kind), &
908 1.0_rk, qtmp2(1,1), int(ubound(qtmp2,dim=1),kind=blas_kind))
909 call obj%timer%stop("gemm")
910 call obj%timer%stop("blas")
911 endif ! useGPU
912 endif
913
914 ! Compute eigenvectors of the rank-1 modified matrix.
915 ! Parts for multiplying with lower half of Q:
916
917!$omp PARALLEL DO &
918!$omp private(i, j, k, tmp)
919 do i = 1, ncnt
920 do k = 1, nnzl
921 j = idx(idxq1(i+ns))
922 ! Calculate the j-th eigenvector of the deflated system
923 ! See above why we are doing it this way!
924 tmp(k) = d1l(k) - dbase(j)
925 tmp(k) = tmp(k) + ddiff(j)
926 ev(k,i) = zl(k) / tmp(k) * ev_scale(j)
927 enddo
928 enddo
929!$OMP END PARALLEL DO
930
931 if (usegpu) then
932 !TODO the previous loop could be possible to do on device and thus
933 !copy less
934#ifdef WITH_GPU_STREAMS
935 successgpu = gpu_stream_synchronize(my_stream)
936 check_stream_synchronize_gpu("tridiag qtmp1_dev", successgpu)
937
938 successgpu = gpu_memcpy_async(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
939 gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpyhosttodevice, &
940 my_stream)
941 check_memcpy_gpu("merge_systems: ev_dev", successgpu)
942
943 successgpu = gpu_stream_synchronize(my_stream)
944 check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)
945 ! synchronize streamsPerThread; maybe not neccessary
946 successgpu = gpu_stream_synchronize()
947 check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)
948#else
949 successgpu = gpu_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
950 gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpyhosttodevice)
951 check_memcpy_gpu("merge_systems: ev_dev", successgpu)
952#endif
953 endif
954
955 ! Multiply old Q with eigenvectors (lower half)
956
957 if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) then
958 if (usegpu) then
959 call obj%timer%start("gpublas")
960 gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
961 call gpublas_precision_gemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, &
962 1.0_rk, qtmp1_dev + l_rnm * size_of_datatype, ubound(qtmp1,dim=1), &
963 ev_dev, ubound(ev,dim=1), &
964 1.0_rk, qtmp2_dev + l_rnm * size_of_datatype, ubound(qtmp2,dim=1), gpuhandle)
965 call obj%timer%stop("gpublas")
966 else ! useGPU
967 call obj%timer%start("blas")
968 call obj%timer%start("gemm")
969 call precision_gemm('N', 'N', int(l_rows-l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &
970 int(nnzl,kind=blas_kind), &
971 1.0_rk, qtmp1(l_rnm+1,1), int(ubound(qtmp1,dim=1),kind=blas_kind), &
972 ev, int(ubound(ev,dim=1),kind=blas_kind), &
973 1.0_rk, qtmp2(l_rnm+1,1), int(ubound(qtmp2,dim=1),kind=blas_kind))
974 call obj%timer%stop("gemm")
975 call obj%timer%stop("blas")
976 endif ! useGPU
977 endif
978
979 if (usegpu) then
980 !TODO either copy only half of the matrix here, and get rid of the
981 !previous copy or copy whole array here
982
983 ! COPY BACK
984#ifdef WITH_GPU_STREAMS
985 successgpu = gpu_stream_synchronize(my_stream)
986 check_stream_synchronize_gpu("tridiag qtmp2_dev", successgpu)
987
988 successgpu = gpu_memcpy_async(int(loc(qtmp2(1,1)),kind=c_intptr_t), qtmp2_dev, &
989 gemm_dim_k * gemm_dim_m * size_of_datatype, gpumemcpydevicetohost, my_stream)
990 check_memcpy_gpu("merge_systems: qtmp2_dev", successgpu)
991
992 successgpu = gpu_stream_synchronize(my_stream)
993 check_stream_synchronize_gpu("merge_systems: qtmp2_dev", successgpu)
994 ! synchronize streamsPerThread; maybe not neccessary
995 successgpu = gpu_stream_synchronize()
996 check_stream_synchronize_gpu("merge_systems: qtmp2_dev", successgpu)
997#else
998 successgpu = gpu_memcpy(int(loc(qtmp2(1,1)),kind=c_intptr_t), qtmp2_dev, &
999 gemm_dim_k * gemm_dim_m * size_of_datatype, gpumemcpydevicetohost)
1000 check_memcpy_gpu("merge_systems: qtmp2_dev", successgpu)
1001#endif
1002 endif
1003
1004
1005 ! Put partial result into (output) Q
1006
1007!$omp PARALLEL DO &
1008!$omp default(none) &
1009!$omp private(i) &
1010!$omp SHARED(q, ns, l_rqs, l_rqe, l_col_out, idxq1, qtmp2, l_rows, ncnt)
1011 do i = 1, ncnt
1012 q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i)
1013 enddo
1014!$OMP END PARALLEL DO
1015
1016 enddo !ns = 0, nqcols1-1, max_strip ! strimining loop
1017 enddo !do np = 1, npc_n
1018
1019 if (usegpu) then
1020
1021#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
1022 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
1023 successgpu = gpu_host_unregister(int(loc(qtmp1),kind=c_intptr_t))
1024 check_host_unregister_gpu("merge_systems: qtmp1", successgpu)
1025 endif
1026#endif
1027 successgpu = gpu_free(qtmp1_dev)
1028 check_dealloc_gpu("merge_systems: qtmp1_dev", successgpu)
1029
1030#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
1031 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
1032 successgpu = gpu_host_unregister(int(loc(qtmp2),kind=c_intptr_t))
1033 check_host_unregister_gpu("merge_systems: qtmp2", successgpu)
1034 endif
1035#endif
1036 successgpu = gpu_free(qtmp2_dev)
1037 check_dealloc_gpu("merge_systems: qtmp2_dev", successgpu)
1038
1039#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
1040 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
1041 successgpu = gpu_host_unregister(int(loc(ev),kind=c_intptr_t))
1042 check_host_unregister_gpu("merge_systems: ev", successgpu)
1043 endif
1044#endif
1045 successgpu = gpu_free(ev_dev)
1046 check_dealloc_gpu("merge_systems: ev_dev", successgpu)
1047 endif ! useGPU
1048
1049 deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errormessage)
1050 check_deallocate("merge_systems: ev, qtmp1, qtmp2",istat, errormessage)
1051 endif !very outer test (na1==1 .or. na1==2)
1052#ifdef WITH_OPENMP_TRADITIONAL
1053 deallocate(z_p, stat=istat, errmsg=errormessage)
1054 check_deallocate("merge_systems: z_p",istat, errormessage)
1055#endif
1056
1057 call obj%timer%stop("merge_systems" // precision_suffix)
1058
1059 return
1060
1061 end subroutine merge_systems_&
1062 &precision
Definition mod_add_tmp.F90:3
Definition mod_check_monotony.F90:3
Fortran module to provide an abstract definition of the implementation. Do not use directly....
Definition elpa_abstract_impl.F90:50
Definition mod_global_gather.F90:3
Definition mod_global_product.F90:3
Definition mod_resort_ev.F90:3
Definition mod_solve_secular_equation.F90:55
Definition mod_transform_columns.F90:3
Definition mod_v_add_s.F90:55
Definition elpa_abstract_impl.F90:73