Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2025.06.001.rc1
Loading...
Searching...
No Matches
solve_tridi_single_problem_template.F90
Go to the documentation of this file.
1#if 0
2! This file is part of ELPA.
3!
4! The ELPA library was originally created by the ELPA consortium,
5! consisting of the following organizations:
6!
7! - Max Planck Computing and Data Facility (MPCDF), formerly known as
8! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
9! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
10! Informatik,
11! - Technische Universität München, Lehrstuhl für Informatik mit
12! Schwerpunkt Wissenschaftliches Rechnen ,
13! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
15! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
16! and
17! - IBM Deutschland GmbH
18!
19! This particular source code file contains additions, changes and
20! enhancements authored by Intel Corporation which is not part of
21! the ELPA consortium.
22!
23! More information can be found here:
24! http://elpa.mpcdf.mpg.de/
25!
26! ELPA is free software: you can redistribute it and/or modify
27! it under the terms of the version 3 of the license of the
28! GNU Lesser General Public License as published by the Free
29! Software Foundation.
30!
31! ELPA is distributed in the hope that it will be useful,
32! but WITHOUT ANY WARRANTY; without even the implied warranty of
33! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34! GNU Lesser General Public License for more details.
35!
36! You should have received a copy of the GNU Lesser General Public License
37! along with ELPA. If not, see <http://www.gnu.org/licenses/>
38!
39! ELPA reflects a substantial effort on the part of the original
40! ELPA consortium, and we ask you to respect the spirit of the
41! license that we chose: i.e., please contribute any changes you
42! may have back to the original ELPA library distribution, and keep
43! any derivatives of ELPA under the same license that we chose for
44! the original distribution, the GNU Lesser General Public License.
45!
46!
47! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
48!
49! Copyright of the original code rests with the authors inside the ELPA
50! consortium. The copyright of any additional modifications shall rest
51! with their original authors, but shall adhere to the licensing terms
52! distributed along with the original code in the file "COPYING".
53#endif
54
55#include "../general/sanity.F90"
56#include "../general/error_checking.inc"
57
58
59#ifdef SOLVE_TRIDI_GPU_BUILD
60 subroutine solve_tridi_single_problem_gpu_&
61 &precision_and_suffix &
62 (obj, nlen, d_dev, e_dev, q_dev, ldq, qtmp_dev, wantdebug, success)
63#else
64 subroutine solve_tridi_single_problem_cpu_&
65 &precision_and_suffix &
66 (obj, nlen, d, e, q, ldq, wantdebug, success)
67#endif
68 ! solve_tridi_single_problem is called from solve_trodi_col: with parameters q_dev=qmat1_dev, ldq=max_size
69 ! qmat1(max_size, max_size) -> q(ldq, ldq), and not q(ldq,nlen)!! same for q_dev
70 ! but that's fine if nlen<=ldq, then not the whole matrix is used. otherwise can lead to errors
71 ! Now: called from two places differently: with np_rows==1 and np_rows>1 and should be treated with care
72
73 ! Solves the symmetric, tridiagonal eigenvalue problem on a single processor.
74 ! Takes precautions if DSTEDC fails or if the eigenvalues are not ordered correctly.
75 use precision
77 use elpa_blas_interfaces
78 use elpa_utilities
79 use elpa_gpu
80 use solve_single_problem_gpu
81#if defined(WITH_NVIDIA_GPU_VERSION) && defined(WITH_NVTX)
82 use cuda_functions ! for NVTX labels
83#elif defined(WITH_AMD_GPU_VERSION) && defined(WITH_ROCTX)
84 use hip_functions ! for ROCTX labels
85#endif
86 implicit none
87 class(elpa_abstract_impl_t), intent(inout) :: obj
88 logical :: useGPU, useGPUsolver
89 integer(kind=ik) :: nlen, ldq
90 real(kind=real_datatype) :: d(nlen), e(nlen), q(ldq,nlen)
91
92 real(kind=real_datatype), allocatable :: work(:), qtmp(:), ds(:), es(:)
93 real(kind=real_datatype) :: dtmp
94
95 integer(kind=ik) :: i, j, lwork, liwork, info
96 integer(kind=BLAS_KIND) :: infoBLAS
97 integer(kind=ik), allocatable :: iwork(:)
98 !real(kind=REAL_DATATYPE), allocatable :: mat(:,:)
99
100 logical, intent(in) :: wantDebug
101 logical, intent(out) :: success
102 integer(kind=c_int) :: debug
103 integer(kind=ik) :: istat
104 character(200) :: errorMessage
105 integer(kind=c_intptr_t) :: num, my_stream
106 integer(kind=c_intptr_t) :: q_dev, d_dev, info_dev, qtmp_dev, e_dev
107 logical :: successGPU
108 integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
109 &precision&
110 &_real
111
112 integer(kind=c_intptr_t) :: gpusolverHandle
113
114 debug = 0
115 if (wantdebug) debug = 1
116
117 usegpu =.false.
118 usegpusolver =.false.
119#ifdef SOLVE_TRIDI_GPU_BUILD
120 usegpu =.true.
121
122#if defined(WITH_NVIDIA_CUSOLVER)
123 usegpusolver =.true.
124#endif
125#if defined(WITH_AMD_ROCSOLVER)
126 ! As of ELPA 2025.01 release, rocsolver_?stedc/rocsolver_?syevd showed bad performance (worse than on CPU).
127 ! Hopefully, this will be fixed by AMD and then we can enable it.
128 usegpusolver =.false.
129#endif
130#endif /* SOLVE_TRIDI_GPU_BUILD */
131
132 call obj%timer%start("solve_tridi_single" // precision_suffix)
133
134 success = .true.
135 allocate(ds(nlen), es(nlen), stat=istat, errmsg=errormessage)
136 check_allocate("solve_tridi_single: ds, es", istat, errormessage)
137
138 if (usegpusolver) then
139 num = 1 * size_of_int
140 successgpu = gpu_malloc(info_dev, num)
141 check_alloc_gpu("solve_tridi_single info_dev: ", successgpu)
142
143 my_stream = obj%gpu_setup%my_stream
144 call gpu_construct_tridi_matrix(precision_char, q_dev, d_dev, e_dev, nlen, ldq, debug, my_stream)
145 endif
146
147 ! Save d and e for the case that dstedc fails
148#if defined(WITH_NVIDIA_CUSOLVER) || defined(WITH_AMD_ROCSOLVER)
149 if (.not.usegpu) then
150 ds(:) = d(:)
151 es(:) = e(:)
152 endif
153#endif
154
155 if (usegpu) then
156 if (.not. usegpusolver) then
157 ! use CPU solver
158
159 ! copy to host
160 num = nlen * size_of_datatype
161#ifdef WITH_GPU_STREAMS
162 my_stream = obj%gpu_setup%my_stream
163 successgpu = gpu_memcpy_async(int(loc(d(1)),kind=c_intptr_t), d_dev, &
164 num, gpumemcpydevicetohost, my_stream)
165 check_memcpy_gpu("solve_tridi_single: d_dev ", successgpu)
166#else
167 successgpu = gpu_memcpy(int(loc(d(1)),kind=c_intptr_t), d_dev, &
168 num, gpumemcpydevicetohost)
169 check_memcpy_gpu("solve_tridi_single: d_dev", successgpu)
170#endif
171 num = nlen * size_of_datatype
172#ifdef WITH_GPU_STREAMS
173 my_stream = obj%gpu_setup%my_stream
174 successgpu = gpu_memcpy_async(int(loc(e(1)),kind=c_intptr_t), e_dev, &
175 num, gpumemcpydevicetohost, my_stream)
176 check_memcpy_gpu("solve_tridi_single: e_dev ", successgpu)
177#else
178 successgpu = gpu_memcpy(int(loc(e(1)),kind=c_intptr_t), e_dev, &
179 num, gpumemcpydevicetohost)
180 check_memcpy_gpu("solve_tridi_single: e_dev", successgpu)
181#endif
182
183 ds(:) = d(:)
184 es(:) = e(:)
185
186#include "./solve_tridi_single_problem_include.F90"
187
188 ! copy back
189 num = nlen * size_of_datatype
190#ifdef WITH_GPU_STREAMS
191 my_stream = obj%gpu_setup%my_stream
192 successgpu = gpu_memcpy_async(d_dev, int(loc(d(1)),kind=c_intptr_t), &
193 num, gpumemcpyhosttodevice, my_stream)
194 check_memcpy_gpu("solve_tridi_single: d_dev ", successgpu)
195#else
196 successgpu = gpu_memcpy(d_dev, int(loc(d(1)),kind=c_intptr_t), &
197 num, gpumemcpyhosttodevice)
198 check_memcpy_gpu("solve_tridi_single: d_dev", successgpu)
199#endif
200 num = nlen * size_of_datatype
201#ifdef WITH_GPU_STREAMS
202 my_stream = obj%gpu_setup%my_stream
203 successgpu = gpu_memcpy_async(e_dev, int(loc(e(1)),kind=c_intptr_t), &
204 num, gpumemcpyhosttodevice, my_stream)
205 check_memcpy_gpu("solve_tridi_single: e_dev ", successgpu)
206#else
207 successgpu = gpu_memcpy(e_dev, int(loc(e(1)),kind=c_intptr_t), &
208 num, gpumemcpyhosttodevice)
209 check_memcpy_gpu("solve_tridi_single: e_dev", successgpu)
210#endif
211
212! fails
213 num = ldq*nlen * size_of_datatype
214#ifdef WITH_GPU_STREAMS
215 my_stream = obj%gpu_setup%my_stream
216 successgpu = gpu_memcpy_async(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
217 num, gpumemcpyhosttodevice, my_stream)
218 check_memcpy_gpu("solve_tridi_single: q_dev1 ", successgpu)
219#else
220 successgpu = gpu_memcpy(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
221 num, gpumemcpyhosttodevice)
222 check_memcpy_gpu("solve_tridi_single: q_dev1", successgpu)
223#endif
224
225 else ! (.not. useGPUsolver)
226
227 call obj%timer%start("gpusolver_syevd")
228 nvtx_range_push("gpusolver_syevd")
229 gpusolverhandle = obj%gpu_setup%gpusolverHandleArray(0)
230 call gpusolver_precision_syevd (nlen, q_dev, ldq, d_dev, info_dev, gpusolverhandle)
231 if (wantdebug) successgpu = gpu_devicesynchronize()
232 nvtx_range_pop("gpusolver_syevd")
233 call obj%timer%stop("gpusolver_syevd")
234
235 num = 1 * size_of_int
236#ifdef WITH_GPU_STREAMS
237 my_stream = obj%gpu_setup%my_stream
238 successgpu = gpu_memcpy_async(int(loc(info),kind=c_intptr_t), info_dev, &
239 num, gpumemcpydevicetohost, my_stream)
240 check_memcpy_gpu("solve_tridi_single: ", successgpu)
241
242 successgpu = gpu_stream_synchronize(my_stream)
243 check_stream_synchronize_gpu("solve_tridi_single: info_dev -> info", successgpu)
244#else
245 successgpu = gpu_memcpy(int(loc(info),kind=c_intptr_t), info_dev, &
246 num, gpumemcpydevicetohost)
247 check_memcpy_gpu("solve_tridi_single: info_dev", successgpu)
248#endif
249
250 if (info .ne. 0) then
251 write(error_unit,'(a,i8,a)') "Error in gpusolver_PRECISION_syevd, info=", info, ", aborting..."
252 stop 1
253 endif
254 endif ! (.not. useGPUsolver)
255!!
256!! else
257! !copy to host
258! num = nlen * size_of_datatype
259!#ifdef WITH_GPU_STREAMS
260! my_stream = obj%gpu_setup%my_stream
261! successGPU = gpu_memcpy_async(int(loc(d(1)),kind=c_intptr_t), d_dev, &
262! num, gpuMemcpyDeviceToHost, my_stream)
263! check_memcpy_gpu("solve_tridi_single: d_dev ", successGPU)
264!#else
265! successGPU = gpu_memcpy(int(loc(d(1)),kind=c_intptr_t), d_dev, &
266! num, gpuMemcpyDeviceToHost)
267! check_memcpy_gpu("solve_tridi_single: d_dev", successGPU)
268!#endif
269! num = nlen * size_of_datatype
270!#ifdef WITH_GPU_STREAMS
271! my_stream = obj%gpu_setup%my_stream
272! successGPU = gpu_memcpy_async(int(loc(e(1)),kind=c_intptr_t), e_dev, &
273! num, gpuMemcpyDeviceToHost, my_stream)
274! check_memcpy_gpu("solve_tridi_single: e_dev ", successGPU)
275!#else
276! successGPU = gpu_memcpy(int(loc(e(1)),kind=c_intptr_t), e_dev, &
277! num, gpuMemcpyDeviceToHost)
278! check_memcpy_gpu("solve_tridi_single: e_dev", successGPU)
279!#endif
280!
281! ds(:) = d(:)
282! es(:) = e(:)
283!
284!#endif /* defined(WITH_NVIDIA_CUSOLVER) || defined(WITH_AMD_ROCSOLVER) */
285!
286!#include "./solve_tridi_single_problem_include.F90"
287!
288!
289!#if defined(WITH_NVIDIA_CUSOLVER) || defined(WITH_AMD_ROCSOLVER)
290! !copy back
291! num = nlen * size_of_datatype
292!#ifdef WITH_GPU_STREAMS
293! my_stream = obj%gpu_setup%my_stream
294! successGPU = gpu_memcpy_async(d_dev, int(loc(d(1)),kind=c_intptr_t), &
295! num, gpuMemcpyHostToDevice, my_stream)
296! check_memcpy_gpu("solve_tridi_single: d_dev ", successGPU)
297!#else
298! successGPU = gpu_memcpy(d_dev, int(loc(d(1)),kind=c_intptr_t), &
299! num, gpuMemcpyHostToDevice)
300! check_memcpy_gpu("solve_tridi_single: d_dev", successGPU)
301!#endif
302! num = nlen * size_of_datatype
303!#ifdef WITH_GPU_STREAMS
304! my_stream = obj%gpu_setup%my_stream
305! successGPU = gpu_memcpy_async(e_dev, int(loc(e(1)),kind=c_intptr_t), &
306! num, gpuMemcpyHostToDevice, my_stream)
307! check_memcpy_gpu("solve_tridi_single: e_dev ", successGPU)
308!#else
309! successGPU = gpu_memcpy(e_dev, int(loc(e(1)),kind=c_intptr_t), &
310! num, gpuMemcpyHostToDevice)
311! check_memcpy_gpu("solve_tridi_single: e_dev", successGPU)
312!#endif
313!
314!! fails
315! num = ldq*nlen * size_of_datatype
316!#ifdef WITH_GPU_STREAMS
317! my_stream = obj%gpu_setup%my_stream
318! successGPU = gpu_memcpy_async(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
319! num, gpuMemcpyHostToDevice, my_stream)
320! check_memcpy_gpu("solve_tridi_single: q_dev1 ", successGPU)
321!#else
322! successGPU = gpu_memcpy(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
323! num, gpuMemcpyHostToDevice)
324! check_memcpy_gpu("solve_tridi_single: q_dev1", successGPU)
325!#endif
326!! endif ! nlen
327
328
329 else ! useGPU
330#include "./solve_tridi_single_problem_include.F90"
331 endif ! useGPU
332
333 if (usegpusolver) then
334 successgpu = gpu_free(info_dev)
335 check_dealloc_gpu("solve_tridi_single: info_dev", successgpu)
336 endif
337
338 ! Check if eigenvalues are monotonically increasing
339 ! This seems to be not always the case (in the IBM implementation of dstedc ???)
340
341 if (usegpu) then
342 my_stream = obj%gpu_setup%my_stream
343 call gpu_check_monotony(precision_char, d_dev, q_dev, qtmp_dev, nlen, ldq, debug, my_stream)
344 else
345 do i=1,nlen-1
346 if (d(i+1)<d(i)) then
347#ifdef DOUBLE_PRECISION_REAL
348 if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk8) then
349#else
350 if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk4) then
351#endif
352 write(error_unit,'(a,i8,2g25.16)') '***WARNING: Monotony error dste**:',i+1,d(i),d(i+1)
353 else
354 write(error_unit,'(a,i8,2g25.16)') 'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1)
355 write(error_unit,'(a)') 'The eigenvalues from a lapack call are not sorted to machine precision.'
356 write(error_unit,'(a)') 'In this extent, this is completely harmless.'
357 write(error_unit,'(a)') 'Still, we keep this info message just in case.'
358 end if
359 allocate(qtmp(nlen), stat=istat, errmsg=errormessage)
360 check_allocate("solve_tridi_single: qtmp", istat, errormessage)
361
362 dtmp = d(i+1)
363 qtmp(1:nlen) = q(1:nlen,i+1)
364 do j=i,1,-1
365 if (dtmp<d(j)) then
366 d(j+1) = d(j)
367 q(1:nlen,j+1) = q(1:nlen,j)
368 else
369 exit ! Loop
370 endif
371 enddo
372 d(j+1) = dtmp
373 q(1:nlen,j+1) = qtmp(1:nlen)
374 deallocate(qtmp, stat=istat, errmsg=errormessage)
375 check_deallocate("solve_tridi_single: qtmp", istat, errormessage)
376 endif
377 enddo
378 endif ! useGPU
379
380 call obj%timer%stop("solve_tridi_single" // precision_suffix)
381
382#ifdef SOLVE_TRIDI_GPU_BUILD
383 end subroutine solve_tridi_single_problem_gpu_&
384 &precision_and_suffix
385#else
386 end subroutine solve_tridi_single_problem_cpu_&
387 &precision_and_suffix
388#endif
Fortran module to provide an abstract definition of the implementation. Do not use directly....
Definition elpa_abstract_impl.F90:50