Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2025.01.002
solve_tridi_single_problem_template.F90
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
!   Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!   Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
!   Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!   Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!   and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation, which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of version 3 of the GNU Lesser General
! Public License as published by the Free Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

#include "../general/sanity.F90"
#include "../general/error_checking.inc"


#ifdef SOLVE_TRIDI_GPU_BUILD
  subroutine solve_tridi_single_problem_gpu_&
            &precision_and_suffix &
            (obj, nlen, d_dev, e_dev, q_dev, ldq, qtmp_dev, wantDebug, success)
#else
  subroutine solve_tridi_single_problem_cpu_&
            &precision_and_suffix &
            (obj, nlen, d, e, q, ldq, wantDebug, success)
#endif
    ! PETERDEBUG: when called from solve_tridi_col, q_dev=qmat1_dev and ldq=max_size;
    ! qmat1(max_size,max_size) maps to q(ldq,ldq), not q(ldq,nlen)! The same holds for q_dev.
    ! That is fine as long as nlen<=ldq, since then only part of the matrix is used;
    ! otherwise it can lead to errors.
    ! Note: this routine is now called from two places, with np_rows==1 and with
    ! np_rows>1, and the two cases should be treated with care.

    ! Solves the symmetric, tridiagonal eigenvalue problem on a single processor.
    ! Takes precautions if DSTEDC fails or if the eigenvalues are not ordered correctly.
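    ! Note: precision_and_suffix is a preprocessor macro supplied by the file that
    ! includes this template, so a single template generates all precision variants.
    ! As an illustration only (the exact suffix depends on the macro definitions of
    ! the including file), a double-precision CPU build would generate a name such
    ! as solve_tridi_single_problem_cpu_double.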
    use precision
    use elpa_abstract_impl   ! provides elpa_abstract_impl_t, used below
    use elpa_blas_interfaces
    use elpa_utilities
    use elpa_gpu
    use solve_single_problem_gpu
    implicit none
    class(elpa_abstract_impl_t), intent(inout) :: obj
    logical                                    :: useGPU, useGPUsolver
    integer(kind=ik)                           :: nlen, ldq
    real(kind=real_datatype)                   :: d(nlen), e(nlen), q(ldq,nlen)

    real(kind=real_datatype), allocatable      :: work(:), qtmp(:), ds(:), es(:)
    real(kind=real_datatype)                   :: dtmp

    integer(kind=ik)                           :: i, j, lwork, liwork, info
    integer(kind=BLAS_KIND)                    :: infoBLAS
    integer(kind=ik), allocatable              :: iwork(:)
    !real(kind=REAL_DATATYPE), allocatable     :: mat(:,:)

    logical, intent(in)                        :: wantDebug
    logical, intent(out)                       :: success
    integer(kind=ik)                           :: istat
    character(200)                             :: errorMessage
    integer(kind=c_intptr_t)                   :: num, my_stream
    integer(kind=c_intptr_t)                   :: q_dev, d_dev, info_dev, qtmp_dev, e_dev
    logical                                    :: successGPU
    integer(kind=c_intptr_t), parameter        :: size_of_datatype = size_of_&
                                                  &precision&
                                                  &_real

    integer(kind=c_intptr_t)                   :: gpusolverHandle

    ! print *, "solve_tridi_single_problem_gpu: nlen=", nlen, " ldq=", ldq ! PETERDEBUG
    useGPU       = .false.
    useGPUsolver = .false.
#ifdef SOLVE_TRIDI_GPU_BUILD
    useGPU = .true.

#if defined(WITH_NVIDIA_CUSOLVER)
    useGPUsolver = .true.
#endif
#if defined(WITH_AMD_ROCSOLVER)
    ! As of the ELPA 2025.01 release, rocsolver_?stedc/rocsolver_?syevd showed bad
    ! performance (worse than on CPU). Hopefully this will be fixed by AMD, and
    ! then we can enable it.
    useGPUsolver = .false.
#endif
#endif /* SOLVE_TRIDI_GPU_BUILD */
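    ! From here on there are three execution paths:
    !   (1) CPU build: the host solver (included file) does all the work;
    !   (2) GPU build without a GPU solver: d and e are staged to the host, the
    !       problem is solved there, and the results are copied back to the device;
    !   (3) GPU build with a GPU solver: the problem is solved entirely on the device.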

    call obj%timer%start("solve_tridi_single" // precision_suffix)

    success = .true.
    allocate(ds(nlen), es(nlen), stat=istat, errmsg=errorMessage)
    check_allocate("solve_tridi_single: ds, es", istat, errorMessage)

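    ! For the GPU solver path, first assemble the full symmetric tridiagonal matrix
    ! in q_dev from d_dev and e_dev: syevd diagonalizes a dense matrix, unlike the
    ! tridiagonal solvers used on the host.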
    if (useGPUsolver) then
      num = 1 * size_of_int
      successGPU = gpu_malloc(info_dev, num)
      check_alloc_gpu("solve_tridi_single info_dev: ", successGPU)

      my_stream = obj%gpu_setup%my_stream
      call gpu_construct_tridi_matrix_precision (q_dev, d_dev, e_dev, nlen, ldq, my_stream)
    endif

    ! Save d and e for the case that dstedc fails
#if defined(WITH_NVIDIA_CUSOLVER) || defined(WITH_AMD_ROCSOLVER)
    if (.not. useGPU) then
      ds(:) = d(:)
      es(:) = e(:)
    endif
#endif
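    ! In the GPU path the same backup is taken further below, after d and e have
    ! been copied from the device to the host.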

    if (useGPU) then
      if (.not. useGPUsolver) then
        ! use CPU solver

        ! copy to host
        num = nlen * size_of_datatype
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        successGPU = gpu_memcpy_async(int(loc(d(1)),kind=c_intptr_t), d_dev, &
                                      num, gpuMemcpyDeviceToHost, my_stream)
        check_memcpy_gpu("solve_tridi_single: d_dev", successGPU)
#else
        successGPU = gpu_memcpy(int(loc(d(1)),kind=c_intptr_t), d_dev, &
                                num, gpuMemcpyDeviceToHost)
        check_memcpy_gpu("solve_tridi_single: d_dev", successGPU)
#endif
        num = nlen * size_of_datatype
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        successGPU = gpu_memcpy_async(int(loc(e(1)),kind=c_intptr_t), e_dev, &
                                      num, gpuMemcpyDeviceToHost, my_stream)
        check_memcpy_gpu("solve_tridi_single: e_dev", successGPU)
#else
        successGPU = gpu_memcpy(int(loc(e(1)),kind=c_intptr_t), e_dev, &
                                num, gpuMemcpyDeviceToHost)
        check_memcpy_gpu("solve_tridi_single: e_dev", successGPU)
#endif

        ds(:) = d(:)
        es(:) = e(:)

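        ! The include below contains the host solver path: DSTEDC with the fallbacks
        ! described in the header comment, operating on the host copies of d, e and q.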
#include "./solve_tridi_single_problem_include.F90"

        ! copy back
        num = nlen * size_of_datatype
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        successGPU = gpu_memcpy_async(d_dev, int(loc(d(1)),kind=c_intptr_t), &
                                      num, gpuMemcpyHostToDevice, my_stream)
        check_memcpy_gpu("solve_tridi_single: d_dev", successGPU)
#else
        successGPU = gpu_memcpy(d_dev, int(loc(d(1)),kind=c_intptr_t), &
                                num, gpuMemcpyHostToDevice)
        check_memcpy_gpu("solve_tridi_single: d_dev", successGPU)
#endif
        num = nlen * size_of_datatype
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        successGPU = gpu_memcpy_async(e_dev, int(loc(e(1)),kind=c_intptr_t), &
                                      num, gpuMemcpyHostToDevice, my_stream)
        check_memcpy_gpu("solve_tridi_single: e_dev", successGPU)
#else
        successGPU = gpu_memcpy(e_dev, int(loc(e(1)),kind=c_intptr_t), &
                                num, gpuMemcpyHostToDevice)
        check_memcpy_gpu("solve_tridi_single: e_dev", successGPU)
#endif

        ! NOTE: this copy of the full eigenvector matrix was flagged as failing during
        ! development, possibly related to the ldq/nlen caveat at the top of this routine.
        num = ldq*nlen * size_of_datatype
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        successGPU = gpu_memcpy_async(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
                                      num, gpuMemcpyHostToDevice, my_stream)
        check_memcpy_gpu("solve_tridi_single: q_dev1", successGPU)
#else
        successGPU = gpu_memcpy(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
                                num, gpuMemcpyHostToDevice)
        check_memcpy_gpu("solve_tridi_single: q_dev1", successGPU)
#endif

      else ! (.not. useGPUsolver)

        call obj%timer%start("gpusolver_syevd")
        gpusolverHandle = obj%gpu_setup%gpusolverHandleArray(0)
        call gpusolver_precision_syevd (nlen, q_dev, ldq, d_dev, info_dev, gpusolverHandle)
        if (wantDebug) successGPU = gpu_devicesynchronize()
        call obj%timer%stop("gpusolver_syevd")
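        ! syevd overwrites q_dev with the eigenvectors and writes the eigenvalues to
        ! d_dev; only the info flag has to be copied back to the host to be checked.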

        num = 1 * size_of_int
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        successGPU = gpu_memcpy_async(int(loc(info),kind=c_intptr_t), info_dev, &
                                      num, gpuMemcpyDeviceToHost, my_stream)
        check_memcpy_gpu("solve_tridi_single: info_dev", successGPU)

        successGPU = gpu_stream_synchronize(my_stream)
        check_stream_synchronize_gpu("solve_tridi_single: info_dev -> info", successGPU)
#else
        successGPU = gpu_memcpy(int(loc(info),kind=c_intptr_t), info_dev, &
                                num, gpuMemcpyDeviceToHost)
        check_memcpy_gpu("solve_tridi_single: info_dev", successGPU)
#endif

        if (info .ne. 0) then
          write(error_unit,'(a,i8,a)') "Error in gpusolver_PRECISION_syevd, info=", info, ", aborting..."
          stop 1
        endif
      endif ! (.not. useGPUsolver)

    else ! useGPU
#include "./solve_tridi_single_problem_include.F90"
    endif ! useGPU

    if (useGPUsolver) then
      successGPU = gpu_free(info_dev)
      check_dealloc_gpu("solve_tridi_single: info_dev", successGPU)
    endif

    ! Check if the eigenvalues are monotonically increasing.
    ! This seems not always to be the case (observed, e.g., with the IBM
    ! implementation of dstedc).

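    ! If a violation is found, the offending eigenpair is moved to its proper
    ! position by an insertion-sort step (the inner j-loop below); on the GPU the
    ! equivalent check and repair is done by the gpu_check_monotony kernel.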
    if (useGPU) then
      my_stream = obj%gpu_setup%my_stream
      call gpu_check_monotony_precision (d_dev, q_dev, qtmp_dev, nlen, ldq, my_stream)
    else
      do i = 1, nlen-1
        if (d(i+1) < d(i)) then
#ifdef DOUBLE_PRECISION_REAL
          if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk8) then
#else
          if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk4) then
#endif
            write(error_unit,'(a,i8,2g25.16)') '***WARNING: Monotony error dste**:', i+1, d(i), d(i+1)
          else
            write(error_unit,'(a,i8,2g25.16)') 'Info: Monotony error dste{dc,qr}:', i+1, d(i), d(i+1)
            write(error_unit,'(a)') 'The eigenvalues from a LAPACK call are not sorted to machine precision.'
            write(error_unit,'(a)') 'To this extent, the error is completely harmless.'
            write(error_unit,'(a)') 'Still, we keep this info message, just in case.'
          end if
          allocate(qtmp(nlen), stat=istat, errmsg=errorMessage)
          check_allocate("solve_tridi_single: qtmp", istat, errorMessage)

          ! Shift larger eigenvalues (and their eigenvectors) up by one position
          ! until the correct slot for d(i+1) is found, then insert it there.
          dtmp = d(i+1)
          qtmp(1:nlen) = q(1:nlen,i+1)
          do j = i, 1, -1
            if (dtmp < d(j)) then
              d(j+1)        = d(j)
              q(1:nlen,j+1) = q(1:nlen,j)
            else
              exit ! loop
            endif
          enddo
          d(j+1)        = dtmp
          q(1:nlen,j+1) = qtmp(1:nlen)
          deallocate(qtmp, stat=istat, errmsg=errorMessage)
          check_deallocate("solve_tridi_single: qtmp", istat, errorMessage)
        endif
      enddo
    endif ! useGPU

    call obj%timer%stop("solve_tridi_single" // precision_suffix)

#ifdef SOLVE_TRIDI_GPU_BUILD
  end subroutine solve_tridi_single_problem_gpu_&
                &precision_and_suffix
#else
  end subroutine solve_tridi_single_problem_cpu_&
                &precision_and_suffix
#endif