Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2025.06.001.rc1
solve_tridi_template.F90
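Preprocessed template for ELPA's divide-and-conquer solver of the symmetric tridiagonal eigenproblem. Depending on SOLVE_TRIDI_GPU_BUILD and the precision macros it expands into the CPU and GPU variants of solve_tridi for each supported precision.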
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
!   Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!   Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
!   Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!   Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!   and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

#include "../general/sanity.F90"
#include "../general/error_checking.inc"

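! solve_tridi: divide-and-conquer eigensolver for a symmetric tridiagonal matrix
! (diagonal d, subdiagonal e) distributed over a 2D process grid.  Outline:
!   1. partition the columns into one contiguous block per process column (limits),
!   2. decouple neighbouring blocks by a rank-1 tearing at each block boundary,
!   3. solve each block's eigenproblem within its process column (solve_tridi_col),
!   4. recursively merge the sub-solutions (merge_recursive), writing the first nev
!      eigenvectors of the full problem into q in block-cyclic layout.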
#ifdef SOLVE_TRIDI_GPU_BUILD
subroutine solve_tridi_gpu_&
           &precision_and_suffix &
           ( obj, na, nev, &
             d_dev, e_dev, q_dev, &
             ldq, nblk, matrixcols, mpi_comm_all, mpi_comm_rows, &
             mpi_comm_cols, wantdebug, success, max_threads )
#else
subroutine solve_tridi_cpu_&
           &precision_and_suffix &
           ( obj, na, nev, &
             d, e, q, &
             ldq, nblk, matrixcols, mpi_comm_all, mpi_comm_rows, &
             mpi_comm_cols, wantdebug, success, max_threads )
#endif

  use precision
  use elpa_abstract_impl
  use merge_systems
  use merge_recursive
  use elpa_mpi
  use elpa_utilities
  use distribute_global_column
  use elpa_gpu
  use elpa_gpu_util
  use solve_tridi_col_gpu
  implicit none
#include "../../src/general/precision_kinds.F90"
  class(elpa_abstract_impl_t), intent(inout) :: obj
  integer(kind=ik), intent(in)               :: na, nev, ldq, nblk, matrixCols, &
                                                mpi_comm_all, mpi_comm_rows, mpi_comm_cols

  integer(kind=c_intptr_t)                   :: d_dev, e_dev, q_dev
#ifndef SOLVE_TRIDI_GPU_BUILD
  real(kind=real_datatype), intent(inout)    :: d(na), e(na)
#ifdef USE_ASSUMED_SIZE
  real(kind=real_datatype), intent(inout)    :: q(ldq,*)
#else
  real(kind=real_datatype), intent(inout)    :: q(ldq,matrixcols)
#endif
#else /* SOLVE_TRIDI_GPU_BUILD */
  real(kind=real_datatype)                   :: d(na), e(na)
  real(kind=real_datatype)                   :: q(ldq,matrixcols)
#endif /* SOLVE_TRIDI_GPU_BUILD */

  logical, intent(in)                        :: wantDebug
  logical                                    :: success
  integer(kind=c_int)                        :: debug

  integer(kind=ik)                           :: i, j, n, np, nc, nev1, l_cols, l_rows
  integer(kind=ik)                           :: my_prow, my_pcol, np_rows, np_cols
  integer(kind=MPI_KIND)                     :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
  integer(kind=ik), allocatable              :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:)

  integer(kind=ik)                           :: istat
  character(200)                             :: errorMessage
  character(20)                              :: gpuString
  integer(kind=ik), intent(in)               :: max_threads
  logical                                    :: useGPU
  integer(kind=c_intptr_t)                   :: num
  integer(kind=c_intptr_t), parameter        :: size_of_datatype = size_of_&
                                                                   &precision&
                                                                   &_&
                                                                   &math_datatype
  integer(kind=c_intptr_t), parameter        :: size_of_datatype_real = size_of_&
                                                                        &precision&
                                                                        &_real
  integer(kind=c_intptr_t)                   :: gpuHandle, my_stream
  integer(kind=c_intptr_t)                   :: limits_dev
  logical                                    :: successGPU

  debug = 0
  if (wantdebug) debug = 1

  usegpu = .false.
#ifdef SOLVE_TRIDI_GPU_BUILD
  usegpu = .true.
#endif

  if (usegpu) then
    gpustring = "_gpu"
  else
    gpustring = ""
  endif

  call obj%timer%start("solve_tridi" // precision_suffix // gpustring)

  call obj%timer%start("mpi_communication")
  call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind), my_prowmpi, mpierr)
  call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind), np_rowsmpi, mpierr)
  call mpi_comm_rank(int(mpi_comm_cols,kind=mpi_kind), my_pcolmpi, mpierr)
  call mpi_comm_size(int(mpi_comm_cols,kind=mpi_kind), np_colsmpi, mpierr)

  my_prow = int(my_prowmpi,kind=c_int)
  np_rows = int(np_rowsmpi,kind=c_int)
  my_pcol = int(my_pcolmpi,kind=c_int)
  np_cols = int(np_colsmpi,kind=c_int)

  call obj%timer%stop("mpi_communication")

  success = .true.

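  ! local_index(na, ..., -1) returns the number of rows/columns of the block-cyclically
  ! distributed matrix that this process row/column stores locally.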
  l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
  l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q

  if (usegpu) then
#ifdef WITH_GPU_STREAMS
    my_stream = obj%gpu_setup%my_stream
    successgpu = gpu_memset_async(q_dev, 0, l_rows*l_cols*size_of_datatype_real, my_stream)
    check_memset_gpu("solve_tridi: q_dev", successgpu)
#else
    successgpu = gpu_memset(q_dev, 0, l_rows*l_cols*size_of_datatype_real)
    check_memset_gpu("solve_tridi: q_dev", successgpu)
#endif

    if (l_rows .ne. ldq) then
      print *,"solve_tridi: ERROR: l_rows does not match ldq:",l_rows,ldq
      stop
    endif
    if (l_cols .ne. matrixcols) then
      print *,"solve_tridi: ERROR: l_cols does not match matrixCols:",l_cols,matrixcols
      stop
    endif
  else
    !if (.not.(obj%eigenvalues_only)) then
    ! Set Q to 0
    q(1:l_rows, 1:l_cols) = 0.0_rk
    !endif
  endif

  ! Get the limits of the subdivisions; each subdivision has as many columns
  ! as fit on the respective processor column.

  allocate(limits(0:np_cols), stat=istat, errmsg=errormessage)
  check_allocate("solve_tridi: limits", istat, errormessage)

  limits(0) = 0
  do np=0,np_cols-1
    nc = local_index(na, np, np_cols, nblk, -1) ! number of columns on proc column np

    ! Check for the case that a column has zero width.
    ! This is not supported!
    ! ScaLAPACK supports it but delivers no results for these columns,
    ! which is rather annoying
    if (nc==0) then
      call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
      if (wantdebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width'
      success = .false.
      return
    endif
    limits(np+1) = limits(np) + nc
  enddo
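  ! limits(np) is the number of global columns owned by process columns 0..np-1, so
  ! process column np holds global columns limits(np)+1 .. limits(np+1).  For example,
  ! na=10, nblk=2, np_cols=3 gives nc = 4, 4, 2 and limits = (/ 0, 4, 8, 10 /).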

  ! Subdivide matrix by subtracting rank 1 modifications
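  ! At every boundary n = limits(i) the coupling element e(n) ties the two neighbouring
  ! blocks together.  Subtracting |e(n)| from d(n) and d(n+1) makes the remainder a
  ! rank-1 update |e(n)| * v*v^T (v has entries 1 and sign(e(n)) at positions n, n+1),
  ! so the blocks can be solved independently and re-coupled during the merge step.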

  if (usegpu) then

    ! careful: the device copy holds only limits(1:np_cols), not limits(0:np_cols) as on the CPU
    num = (np_cols) * size_of_int
    successgpu = gpu_malloc(limits_dev, num)
    check_alloc_gpu("solve_tridi: limits_dev", successgpu)

    num = (np_cols) * size_of_int
#ifdef WITH_GPU_STREAMS
    my_stream = obj%gpu_setup%my_stream
    successgpu = gpu_memcpy_async(limits_dev, int(loc(limits(1)),kind=c_intptr_t), &
                                  num, gpumemcpyhosttodevice, my_stream)
    check_memcpy_gpu("solve_tridi: limits_dev", successgpu)
#else
    successgpu = gpu_memcpy(limits_dev, int(loc(limits(1)),kind=c_intptr_t), &
                            num, gpumemcpyhosttodevice)
    check_memcpy_gpu("solve_tridi: limits_dev", successgpu)
#endif

    my_stream = obj%gpu_setup%my_stream
    call gpu_update_d(precision_char, d_dev, e_dev, limits_dev, np_cols, na, debug, my_stream)

    successgpu = gpu_free(limits_dev)
    check_dealloc_gpu("solve_tridi: limits_dev", successgpu)

  else
    do i=1,np_cols-1
      n = limits(i)
      d(n)   = d(n)   - abs(e(n))
      d(n+1) = d(n+1) - abs(e(n))
    enddo
  endif

  ! Solve sub problems on processor columns

  nc = limits(my_pcol) ! column after which my problem starts

  if (np_cols>1) then
    nev1 = l_cols ! all eigenvectors are needed
  else
    nev1 = min(nev,l_cols)
  endif
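  ! With more than one process column the subsequent merge steps need the full
  ! eigenvector matrices of the sub-problems; only in the single-column case can the
  ! column solver restrict itself to the nev wanted eigenvectors.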

  if (usegpu) then
    call solve_tridi_col_gpu_&
         &precision_and_suffix &
         (obj, l_cols, nev1, nc, d_dev + (nc+1-1)*size_of_datatype_real, &
          e_dev + (nc+1-1)*size_of_datatype_real, q_dev, ldq, nblk, &
          matrixcols, mpi_comm_rows, wantdebug, success, max_threads)
  else
    call solve_tridi_col_cpu_&
         &precision_and_suffix &
         (obj, l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, &
          matrixcols, mpi_comm_rows, wantdebug, success, max_threads)
  endif

  if (.not.(success)) then
    call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
    return
  endif

  ! If there is only 1 processor column, we are done

  if (np_cols==1) then
    deallocate(limits, stat=istat, errmsg=errormessage)
    check_deallocate("solve_tridi: limits", istat, errormessage)

    call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
    return
  endif

  ! Set index arrays for Q columns

  ! Dense distribution scheme:

  allocate(l_col(na), stat=istat, errmsg=errormessage)
  check_allocate("solve_tridi: l_col", istat, errormessage)

  allocate(p_col(na), stat=istat, errmsg=errormessage)
  check_allocate("solve_tridi: p_col", istat, errormessage)

  n = 0
  do np=0,np_cols-1
    nc = local_index(na, np, np_cols, nblk, -1)
    do i=1,nc
      n = n+1
      l_col(n) = i
      p_col(n) = np
    enddo
  enddo
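  ! Dense map: global column n of the merged problem is stored as local column l_col(n)
  ! on process column p_col(n), i.e. columns are kept contiguously per process column,
  ! following the limits partition above.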

  ! Block cyclic distribution scheme, only nev columns are set:

  allocate(l_col_bc(na), stat=istat, errmsg=errormessage)
  check_allocate("solve_tridi: l_col_bc", istat, errormessage)

  allocate(p_col_bc(na), stat=istat, errmsg=errormessage)
  check_allocate("solve_tridi: p_col_bc", istat, errormessage)

  p_col_bc(:) = -1
  l_col_bc(:) = -1

  do i = 0, na-1, nblk*np_cols
    do j = 0, np_cols-1
      do n = 1, nblk
        if (i+j*nblk+n <= min(nev,na)) then
          p_col_bc(i+j*nblk+n) = j
          l_col_bc(i+j*nblk+n) = i/np_cols + n
        endif
      enddo
    enddo
  enddo
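  ! Standard 1D block-cyclic layout of q with block size nblk, defined only for the
  ! first min(nev,na) columns (all other entries stay -1 and are never referenced).
  ! E.g. nblk=2, np_cols=3: global columns 1-2 -> process column 0 (local 1-2),
  ! 3-4 -> 1, 5-6 -> 2, 7-8 -> 0 (local 3-4), and so on.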

  ! In the GPU build the recursive merge below works on host copies of d and e,
  ! so transfer them from the device first.
  if (usegpu) then
    num = na * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
    my_stream = obj%gpu_setup%my_stream
    call gpu_memcpy_async_and_stream_synchronize &
         ("solve_tridi d_dev -> d", d_dev, 0_c_intptr_t, &
          d(1:na), &
          1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
#else
    successgpu = gpu_memcpy(int(loc(d(1)),kind=c_intptr_t), d_dev, &
                            num, gpumemcpydevicetohost)
    check_memcpy_gpu("solve_tridi: 1: d_dev", successgpu)
#endif

    num = na * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
    my_stream = obj%gpu_setup%my_stream
    call gpu_memcpy_async_and_stream_synchronize &
         ("solve_tridi e_dev -> e", e_dev, 0_c_intptr_t, &
          e(1:na), &
          1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
#else
    successgpu = gpu_memcpy(int(loc(e(1)),kind=c_intptr_t), e_dev, &
                            num, gpumemcpydevicetohost)
    check_memcpy_gpu("solve_tridi: 1: e_dev", successgpu)
#endif
  endif

  ! Recursively merge sub problems
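  ! merge_recursive bisects the range of process columns, merges each half recursively,
  ! and then combines the two half-solutions, re-introducing the rank-1 coupling that
  ! was subtracted above.  l_col/p_col describe where the columns of q currently
  ! reside, l_col_bc/p_col_bc where the first nev eigenvectors have to end up.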
  if (usegpu) then
    call merge_recursive_gpu_&
         &precision &
         (obj, 0, np_cols, ldq, matrixcols, nblk, &
          l_col, p_col, l_col_bc, p_col_bc, limits, &
          np_cols, na, q_dev, d, e, &
          mpi_comm_all, mpi_comm_rows, mpi_comm_cols, &
          usegpu, wantdebug, success, max_threads)
  else
    call merge_recursive_cpu_&
         &precision &
         (obj, 0, np_cols, ldq, matrixcols, nblk, &
          l_col, p_col, l_col_bc, p_col_bc, limits, &
          np_cols, na, q, d, e, &
          mpi_comm_all, mpi_comm_rows, mpi_comm_cols, &
          usegpu, wantdebug, success, max_threads)
  endif

  if (.not.(success)) then
    call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
    return
  endif

  deallocate(limits,l_col,p_col,l_col_bc,p_col_bc, stat=istat, errmsg=errormessage)
  check_deallocate("solve_tridi: limits, l_col, p_col, l_col_bc, p_col_bc", istat, errormessage)

  if (usegpu) then
    ! dirty hack
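    ! (the merge updated d and e on the host; copy them back so that the device arrays
    !  handed in by the caller remain consistent)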
    num = na * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
    my_stream = obj%gpu_setup%my_stream
    call gpu_memcpy_async_and_stream_synchronize &
         ("solve_tridi d -> d_dev", d_dev, 0_c_intptr_t, &
          d(1:na), &
          1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
#else
    successgpu = gpu_memcpy(d_dev, int(loc(d(1)),kind=c_intptr_t), &
                            num, gpumemcpyhosttodevice)
    check_memcpy_gpu("solve_tridi: d_dev", successgpu)
#endif

    num = na * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
    my_stream = obj%gpu_setup%my_stream
    call gpu_memcpy_async_and_stream_synchronize &
         ("solve_tridi e -> e_dev", e_dev, 0_c_intptr_t, &
          e(1:na), &
          1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
#else
    successgpu = gpu_memcpy(e_dev, int(loc(e(1)),kind=c_intptr_t), &
                            num, gpumemcpyhosttodevice)
    check_memcpy_gpu("solve_tridi: e_dev", successgpu)
#endif
  endif

  call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
  return

end