Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2024.05.001
elpa2_utils_template.F90
Go to the documentation of this file.
#if 0
! Copyright 2024, A. Marek
!
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
!   Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!   Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
!   Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!   Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!   and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation, which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
#endif

subroutine get_hh_vec_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, my_prow, nrow, nblk, np_rows, mpi_comm_rows, lr, allreduce_request1, &
 useNonBlockingCollectivesRows, wantDebug, vec_in, vr, tau, vrl)
  use precision
  use elpa1_compute
  use elpa_abstract_impl
  use elpa_mpi
  use elpa_utilities

  implicit none
#include "../general/precision_kinds.F90"

  class(elpa_abstract_impl_t), intent(inout) :: obj
  integer(kind=ik), intent(in)               :: nrow, my_prow, nblk, np_rows, lr
  integer(kind=ik), intent(in)               :: mpi_comm_rows
  logical, intent(in)                        :: useNonBlockingCollectivesRows, wantDebug
  integer(kind=MPI_KIND)                     :: mpierr
  integer(kind=MPI_KIND), intent(inout)      :: allreduce_request1
  MATH_DATATYPE(kind=rck)                    :: vr(:), vec_in(:), tau, vrl
  MATH_DATATYPE(kind=rck)                    :: aux1(2), xf
  real(kind=rk)                              :: vnorm2

  ! Get the vector to be transformed; distribute the last element and the
  ! norm of the remaining elements to all processes in the current column.
  if (my_prow == prow(nrow, nblk, np_rows)) then
    aux1(1) = dot_product(vec_in(1:lr-1), vec_in(1:lr-1))
    aux1(2) = vec_in(lr)
  else
    aux1(1) = dot_product(vec_in(1:lr), vec_in(1:lr))
    aux1(2) = 0.0_rck
  endif

#ifdef WITH_MPI
  if (useNonBlockingCollectivesRows) then
    if (wantDebug) call obj%timer%start("mpi_nbc_communication")
    call mpi_iallreduce(MPI_IN_PLACE, aux1, 2_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, &
                        MPI_SUM, int(mpi_comm_rows, kind=MPI_KIND), &
                        allreduce_request1, mpierr)

    call mpi_wait(allreduce_request1, MPI_STATUS_IGNORE, mpierr)

    if (wantDebug) call obj%timer%stop("mpi_nbc_communication")
  else
    ! if (wantDebug) call obj%timer%start("mpi_comm")
    call mpi_allreduce(MPI_IN_PLACE, aux1, 2_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, &
                       MPI_SUM, int(mpi_comm_rows, kind=MPI_KIND), &
                       mpierr)

    ! if (wantDebug) call obj%timer%stop("mpi_comm")
  endif

#endif

#if REALCASE == 1
  vnorm2 = aux1(1)
#endif
#if COMPLEXCASE == 1
  vnorm2 = real(aux1(1), kind=rk)
#endif
  vrl = aux1(2)

  ! Householder transformation
  call hh_transform_&
  &MATH_DATATYPE&
  &_&
  &PRECISION &
  (obj, vrl, vnorm2, xf, tau, wantDebug)

  ! Scale vr and store the Householder vector for the back transformation
  vr(1:lr) = vec_in(1:lr) * xf
  if (my_prow == prow(nrow, nblk, np_rows)) vr(lr) = 1.0_rck

end subroutine
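
! For orientation, the following is a minimal serial sketch of the arithmetic
! that get_hh_vec and hh_transform perform, stripped of the MPI distribution:
! real case, double precision, pivot at the last local index (as at index lr on
! the owning process row above). The LAPACK-style sign convention for beta is an
! assumption for illustration, not necessarily the one hh_transform uses
! internally, and the program name is hypothetical.

program hh_sketch
  implicit none
  integer, parameter :: rk = kind(1.0d0)
  integer, parameter :: n = 4
  real(kind=rk) :: x(n), v(n), y(n), alpha, vnorm2, beta, xf, tau

  x = [1.0_rk, 2.0_rk, 3.0_rk, 4.0_rk]

  ! Pivot is the last element; vnorm2 is the squared norm of the rest,
  ! mirroring aux1(1) and aux1(2) in get_hh_vec.
  alpha  = x(n)
  vnorm2 = dot_product(x(1:n-1), x(1:n-1))

  ! Reflector quantities (assumed LAPACK-style sign convention):
  ! beta is the transformed pivot, xf scales the remaining elements,
  ! tau is the reflector coefficient.
  beta = -sign(sqrt(alpha**2 + vnorm2), alpha)
  xf   = 1.0_rk / (alpha - beta)
  tau  = (beta - alpha) / beta

  ! Scale and store the Householder vector; the pivot entry is set to 1,
  ! as done for vr(lr) above.
  v(1:n-1) = x(1:n-1) * xf
  v(n)     = 1.0_rk

  ! Check: (I - tau*v*v^T) maps x onto beta*e_n.
  y = x - tau * dot_product(v, x) * v
  print '(a,4f10.5)', 'H*x  = ', y
  print '(a, f10.5)', 'beta = ', beta
end program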


subroutine apply_ht_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, max_threads, lr, nbw, mpi_comm_rows, useNonBlockingCollectivesRows, allreduce_request3, wantDebug, tau, vr, ex_buff2d)
  use precision
  use elpa_abstract_impl
  use elpa_mpi
  use elpa_utilities

  implicit none
  class(elpa_abstract_impl_t), intent(inout) :: obj
  integer(kind=MPI_KIND)                     :: mpierr
  integer(kind=ik), intent(in)               :: max_threads, lr, nbw
  integer(kind=ik), intent(in)               :: mpi_comm_rows
  logical, intent(in)                        :: useNonBlockingCollectivesRows, wantDebug
  integer(kind=MPI_KIND), intent(inout)      :: allreduce_request3

#include "../general/precision_kinds.F90"
  MATH_DATATYPE(kind=rck)                    :: tau, vr(:), ex_buff2d(:,:)
  MATH_DATATYPE(kind=rck)                    :: tauc
  MATH_DATATYPE(kind=rck)                    :: aux1(nbw)
  integer                                    :: nlc, imax
  logical                                    :: use_blas

  imax = ubound(ex_buff2d, 2)

  if ((imax .lt. 3) .or. (max_threads .gt. 1)) then
    ! Don't use BLAS for very small imax, because the call overhead is too high.
    ! Don't use BLAS with OpenMP, because measurements showed that threading
    ! is not effective for these routines.
    use_blas = .false.
  else
    use_blas = .true.
  end if

  ! Transform the remaining columns of ex_buff2d
  if (lr > 0) then
    if (use_blas) then
      ! Note: in the complex case, aux1 computed here is the complex conjugate
      ! of aux1 from the dot_product branch below; PRECISION_GERC conjugates it
      ! again, so both branches apply the same update.
      call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, int(lr,kind=BLAS_KIND), int(imax,kind=BLAS_KIND), &
                          one, ex_buff2d, size(ex_buff2d,1,kind=BLAS_KIND), vr, 1_BLAS_KIND, zero, aux1, &
                          1_BLAS_KIND)
    else
#ifdef WITH_OPENMP_TRADITIONAL
      !$omp parallel do private(nlc)
#endif
      do nlc = 1, imax
        aux1(nlc) = dot_product(vr(1:lr), ex_buff2d(1:lr,nlc))
      end do
#ifdef WITH_OPENMP_TRADITIONAL
      !$omp end parallel do
#endif
    end if
  else
    aux1(1:imax) = 0.0_rck
  end if

  ! Get global dot products
#ifdef WITH_MPI
  if (useNonBlockingCollectivesRows) then
    if (wantDebug) call obj%timer%start("mpi_nbc_communication")
    if (imax > 0) then
      call mpi_iallreduce(MPI_IN_PLACE, aux1, int(imax,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
                          MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), &
                          allreduce_request3, mpierr)
      call mpi_wait(allreduce_request3, MPI_STATUS_IGNORE, mpierr)
    endif
    if (wantDebug) call obj%timer%stop("mpi_nbc_communication")
  else
    if (wantDebug) call obj%timer%start("mpi_communication")
    if (imax > 0) then
      call mpi_allreduce(MPI_IN_PLACE, aux1, int(imax,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
                         MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), &
                         mpierr)
    endif
    if (wantDebug) call obj%timer%stop("mpi_communication")
  endif
#endif /* WITH_MPI */

  if (lr .le. 0) return ! no data on this processor

  ! Transform
#if REALCASE == 1
  tauc = -tau
#else
  tauc = -conjg(tau)
#endif
  if (use_blas) then
    call PRECISION_GERC(int(lr,kind=BLAS_KIND), int(imax,kind=BLAS_KIND), tauc, vr, 1_BLAS_KIND, &
                        aux1, 1_BLAS_KIND, ex_buff2d, ubound(ex_buff2d,1,kind=BLAS_KIND))
  else
#ifdef WITH_OPENMP_TRADITIONAL
    !$omp parallel do private(nlc)
#endif
    do nlc = 1, imax
      ex_buff2d(1:lr,nlc) = ex_buff2d(1:lr,nlc) + tauc*aux1(nlc)*vr(1:lr)
    end do
#ifdef WITH_OPENMP_TRADITIONAL
    !$omp end parallel do
#endif
  end if

end subroutine
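
! apply_ht applies the reflector H = I - tau*v*v^H to the imax columns of
! ex_buff2d in two steps: local dot products aux1 = ex_buff2d^H * vr, a sum of
! aux1 over the process rows, and a rank-1 update. Below is a minimal serial
! sketch of the same two-step pattern for the real case, with the MPI allreduce
! omitted and hypothetical data standing in for vr and ex_buff2d; the program
! and variable names are assumptions for illustration.

program apply_hh_sketch
  implicit none
  integer, parameter :: rk = kind(1.0d0)
  integer, parameter :: lr = 4, imax = 3
  real(kind=rk) :: a(lr,imax), v(lr), aux(imax), tau
  integer :: j

  ! Hypothetical data standing in for ex_buff2d and vr.
  a = reshape([(real(j,rk), j = 1, lr*imax)], [lr, imax])
  v = [0.5_rk, -0.25_rk, 0.75_rk, 1.0_rk]
  tau = 0.8_rk

  ! Step 1: local dot products aux = A^T v
  ! (the PRECISION_GEMV / dot_product branch; in the distributed code,
  ! aux would now be summed over the process rows via mpi_allreduce).
  do j = 1, imax
    aux(j) = dot_product(v, a(:,j))
  end do

  ! Step 2: rank-1 update A := A - tau * v * aux^T
  ! (the PRECISION_GERC branch; the complex case uses conjg(tau)).
  do j = 1, imax
    a(:,j) = a(:,j) - tau * aux(j) * v
  end do

  print '(a)', 'A after (I - tau*v*v^T) * A:'
  do j = 1, lr
    print '(3f10.4)', a(j,:)
  end do
end program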