Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2025.01.002
solve_tridi_template.F90
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
!   Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!   Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
!   Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!   Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!   and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation, which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of version 3 of the GNU Lesser General
! Public License as published by the Free Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

#include "../general/sanity.F90"
#include "../general/error_checking.inc"
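
! Note on the template mechanism (as the _template suffix indicates): this
! file is not compiled on its own but #include'd by a wrapper that defines
! the preprocessor macros used below -- precision_and_suffix,
! precision_suffix, precision, math_datatype, real_datatype -- once per
! supported precision, so the subroutine is expanded into separate
! single-/double-precision and CPU/GPU variants.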

#ifdef SOLVE_TRIDI_GPU_BUILD
subroutine solve_tridi_gpu_&
&precision_and_suffix &
#else
subroutine solve_tridi_cpu_&
&precision_and_suffix &
#endif
    ( obj, na, nev, &
#ifdef SOLVE_TRIDI_GPU_BUILD
      d_dev, e_dev, q_dev, &
#else
      d, e, q, &
#endif
      ldq, nblk, matrixcols, mpi_comm_all, mpi_comm_rows, &
      mpi_comm_cols, wantdebug, success, max_threads )

      use precision
      use elpa_abstract_impl   ! provides elpa_abstract_impl_t
      use merge_recursive      ! provides the recursive merge used below
      use elpa_mpi
      use elpa_utilities
      use elpa_gpu
      use elpa_gpu_util
      use tridi_col_gpu
      implicit none
#include "../../src/general/precision_kinds.F90"
      class(elpa_abstract_impl_t), intent(inout) :: obj
      integer(kind=ik), intent(in)        :: na, nev, ldq, nblk, matrixCols, &
                                             mpi_comm_all, mpi_comm_rows, mpi_comm_cols

      integer(kind=c_intptr_t)            :: d_dev, e_dev, q_dev
#ifndef SOLVE_TRIDI_GPU_BUILD
      real(kind=real_datatype), intent(inout) :: d(na), e(na)
#ifdef USE_ASSUMED_SIZE
      real(kind=real_datatype), intent(inout) :: q(ldq,*)
#else
      real(kind=real_datatype), intent(inout) :: q(ldq,matrixcols)
#endif
#else /* SOLVE_TRIDI_GPU_BUILD */
      real(kind=real_datatype)            :: d(na), e(na)
      real(kind=real_datatype)            :: q(ldq,matrixcols)
#endif /* SOLVE_TRIDI_GPU_BUILD */

      logical, intent(in)                 :: wantDebug
      logical                             :: success

      integer(kind=ik)                    :: i, j, n, np, nc, nev1, l_cols, l_rows
      integer(kind=ik)                    :: my_prow, my_pcol, np_rows, np_cols
      integer(kind=MPI_KIND)              :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
      integer(kind=ik), allocatable       :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:)

      integer(kind=ik)                    :: istat
      character(200)                      :: errorMessage
      character(20)                       :: gpuString
      integer(kind=ik), intent(in)        :: max_threads
      logical                             :: useGPU
      integer(kind=c_intptr_t)            :: num
      integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
                                             &precision&
                                             &_&
                                             &math_datatype
      integer(kind=c_intptr_t), parameter :: size_of_datatype_real = size_of_&
                                             &precision&
                                             &_real
      integer(kind=c_intptr_t)            :: gpuHandle, my_stream
      type(c_ptr)                         :: limits_dev
      logical                             :: successGPU

      usegpu = .false.
#ifdef SOLVE_TRIDI_GPU_BUILD
      usegpu = .true.
#endif

      if (usegpu) then
        gpustring = "_gpu"
      else
        gpustring = ""
      endif

      call obj%timer%start("solve_tridi" // precision_suffix // gpustring)

      call obj%timer%start("mpi_communication")
      call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind), my_prowmpi, mpierr)
      call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind), np_rowsmpi, mpierr)
      call mpi_comm_rank(int(mpi_comm_cols,kind=mpi_kind), my_pcolmpi, mpierr)
      call mpi_comm_size(int(mpi_comm_cols,kind=mpi_kind), np_colsmpi, mpierr)

      my_prow = int(my_prowmpi,kind=c_int)
      np_rows = int(np_rowsmpi,kind=c_int)
      my_pcol = int(my_pcolmpi,kind=c_int)
      np_cols = int(np_colsmpi,kind=c_int)

      call obj%timer%stop("mpi_communication")

      success = .true.

      l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
      l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q

      if (usegpu) then
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        successgpu = gpu_memset_async(q_dev, 0, l_rows*l_cols*size_of_datatype_real, my_stream)
        check_memset_gpu("solve_tridi: q_dev", successgpu)
#else
        successgpu = gpu_memset(q_dev, 0, l_rows*l_cols*size_of_datatype_real)
        check_memset_gpu("solve_tridi: q_dev", successgpu)
#endif

        ! Consistency checks: in the GPU path the local sizes must match the
        ! leading dimension and the column count of the device matrix
        if (l_rows .ne. ldq) then
          write(error_unit,*) "solve_tridi: internal error, l_rows /= ldq:", l_rows, ldq
          stop 1
        endif
        if (l_cols .ne. matrixcols) then
          write(error_unit,*) "solve_tridi: internal error, l_cols /= matrixCols:", l_cols, matrixcols
          stop 1
        endif
      else
        !if (.not.(obj%eigenvalues_only)) then
        ! Set Q to 0
        q(1:l_rows, 1:l_cols) = 0.0_rk
        !endif
      endif


      ! Get the limits of the subdivisions; each subdivision has as many
      ! columns as fit on the respective processor column.
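      !
      ! Illustration (hypothetical sizes, not taken from the code): for na=10,
      ! nblk=2 and np_cols=3, the block-cyclic distribution gives processor
      ! columns 0, 1, 2 four, four and two columns respectively, so that
      ! limits = (/ 0, 4, 8, 10 /) and processor column np owns the global
      ! columns limits(np)+1 .. limits(np+1).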

      allocate(limits(0:np_cols), stat=istat, errmsg=errormessage)
      check_allocate("solve_tridi: limits", istat, errormessage)

      limits(0) = 0
      do np=0,np_cols-1
        nc = local_index(na, np, np_cols, nblk, -1) ! number of columns on proc column np

        ! Check for the case that a column has zero width.
        ! This is not supported!
        ! ScaLAPACK supports it but delivers no results for these columns,
        ! which is rather annoying
        if (nc==0) then
          call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
          if (wantdebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width'
          success = .false.
          return
        endif
        limits(np+1) = limits(np) + nc
      enddo

      ! Subdivide matrix by subtracting rank 1 modifications
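      !
      ! Standard divide-and-conquer tearing (Cuppen's method): at a cut point
      ! n = limits(i) the tridiagonal matrix T satisfies
      !     T = diag(T_1, T_2) + |e(n)| * v * v^T,
      ! where v has the entry sign(e(n)) at position n, 1 at position n+1 and
      ! zeros elsewhere. Subtracting |e(n)| from d(n) and d(n+1) therefore
      ! decouples T into the blocks T_1 and T_2, which are solved
      ! independently below and recombined later by rank-1 updates.
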
      if (usegpu) then

        ! careful: only from 1:np_cols and not 0:np_cols as on CPU
        num = (np_cols) * size_of_int
        successgpu = gpu_malloc(limits_dev, num)
        check_alloc_gpu("solve_tridi: limits_dev", successgpu)

        num = (np_cols) * size_of_int
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        successgpu = gpu_memcpy_async(limits_dev, int(loc(limits(1)),kind=c_intptr_t), &
                                      num, gpumemcpyhosttodevice, my_stream)
        check_memcpy_gpu("solve_tridi: limits_dev", successgpu)
#else
        successgpu = gpu_memcpy(limits_dev, int(loc(limits(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice)
        check_memcpy_gpu("solve_tridi: limits_dev", successgpu)
#endif

        my_stream = obj%gpu_setup%my_stream
        call gpu_update_d_precision(limits_dev, d_dev, e_dev, np_cols, na, my_stream)

        successgpu = gpu_free(limits_dev)
        check_dealloc_gpu("solve_tridi: limits_dev", successgpu)

      else
        do i=1,np_cols-1
          n = limits(i)
          d(n) = d(n)-abs(e(n))
          d(n+1) = d(n+1)-abs(e(n))
        enddo
      endif

      ! Solve sub problems on processor columns

      nc = limits(my_pcol) ! column after which my problem starts

      if (np_cols>1) then
        nev1 = l_cols ! all eigenvectors are needed
      else
        nev1 = min(nev,l_cols)
      endif
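
      ! Rationale: with more than one processor column, the per-column
      ! sub-solutions are merged below, and each rank-1 merge step needs all
      ! eigenvectors of its sub-problems; only in the single-column case can
      ! the work be restricted to the nev requested eigenvectors.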

      if (usegpu) then
        call solve_tridi_col_gpu_&
        &precision_and_suffix &
             (obj, l_cols, nev1, nc, d_dev + (nc+1-1)*size_of_datatype_real, &
              e_dev + (nc+1-1)*size_of_datatype_real, q_dev, ldq, nblk, &
              matrixcols, mpi_comm_rows, wantdebug, success, max_threads)

        num = ldq*matrixcols * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        call gpu_memcpy_async_and_stream_synchronize &
             ("solve_tridi: q_dev -> q", q_dev, 0_c_intptr_t, &
              q(1:ldq,1:matrixcols), &
              1, 1, num, gpumemcpydevicetohost, my_stream, .false., .false., .false.)
#else
        successgpu = gpu_memcpy(int(loc(q(1,1)),kind=c_intptr_t), q_dev, &
                                num, gpumemcpydevicetohost)
        check_memcpy_gpu("solve_tridi: q_dev", successgpu)
#endif
      else ! useGPU
        call solve_tridi_col_cpu_&
        &precision_and_suffix &
             (obj, l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, &
              matrixcols, mpi_comm_rows, wantdebug, success, max_threads)
      endif ! useGPU
      if (.not.(success)) then
        call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
        return
      endif

      ! If there is only 1 processor column, we are done

      if (np_cols==1) then
        deallocate(limits, stat=istat, errmsg=errormessage)
        check_deallocate("solve_tridi: limits", istat, errormessage)

        call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
        return
      endif

      ! Set index arrays for Q columns

      ! Dense distribution scheme:

      allocate(l_col(na), stat=istat, errmsg=errormessage)
      check_allocate("solve_tridi: l_col", istat, errormessage)

      allocate(p_col(na), stat=istat, errmsg=errormessage)
      check_allocate("solve_tridi: p_col", istat, errormessage)

      n = 0
      do np=0,np_cols-1
        nc = local_index(na, np, np_cols, nblk, -1)
        do i=1,nc
          n = n+1
          l_col(n) = i
          p_col(n) = np
        enddo
      enddo
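
      ! Illustration (same hypothetical sizes: na=10, nblk=2, np_cols=3):
      !   p_col = (/ 0,0,0,0, 1,1,1,1, 2,2 /)
      !   l_col = (/ 1,2,3,4, 1,2,3,4, 1,2 /)
      ! i.e. global column n is stored as local column l_col(n) on processor
      ! column p_col(n), with each processor column's share stored densely.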

      ! Block cyclic distribution scheme, only nev columns are set:

      allocate(l_col_bc(na), stat=istat, errmsg=errormessage)
      check_allocate("solve_tridi: l_col_bc", istat, errormessage)

      allocate(p_col_bc(na), stat=istat, errmsg=errormessage)
      check_allocate("solve_tridi: p_col_bc", istat, errormessage)

      p_col_bc(:) = -1
      l_col_bc(:) = -1

      do i = 0, na-1, nblk*np_cols
        do j = 0, np_cols-1
          do n = 1, nblk
            if (i+j*nblk+n <= min(nev,na)) then
              p_col_bc(i+j*nblk+n) = j
              l_col_bc(i+j*nblk+n) = i/np_cols + n
            endif
          enddo
        enddo
      enddo
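
      ! Illustration (same hypothetical sizes, with nev=na=10): the
      ! block-cyclic scheme yields
      !   p_col_bc = (/ 0,0, 1,1, 2,2, 0,0, 1,1 /)
      !   l_col_bc = (/ 1,2, 1,2, 1,2, 3,4, 3,4 /)
      ! matching the usual ScaLAPACK layout of Q; entries beyond nev keep the
      ! marker -1, i.e. those columns are not assigned to any processor column.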

      ! In the GPU build, d and e so far live on the device: copy them to the
      ! host, since the recursive merge below works on host arrays.
      if (usegpu) then
        num = na * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        call gpu_memcpy_async_and_stream_synchronize &
             ("solve_tridi d_dev -> d", d_dev, 0_c_intptr_t, &
              d(1:na), &
              1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
#else
        successgpu = gpu_memcpy(int(loc(d(1)),kind=c_intptr_t), d_dev, &
                                num, gpumemcpydevicetohost)
        check_memcpy_gpu("solve_tridi: 1: d_dev", successgpu)
#endif
        num = na * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        call gpu_memcpy_async_and_stream_synchronize &
             ("solve_tridi e_dev -> e", e_dev, 0_c_intptr_t, &
              e(1:na), &
              1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
#else
        successgpu = gpu_memcpy(int(loc(e(1)),kind=c_intptr_t), e_dev, &
                                num, gpumemcpydevicetohost)
        check_memcpy_gpu("solve_tridi: 1: e_dev", successgpu)
#endif
      endif


      ! Recursively merge sub problems
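      ! (merge_recursive bisects the range of processor columns, merges the
      ! eigensystems of the two halves via rank-1 updates in merge_systems,
      ! and recurses until the full problem over all np_cols processor
      ! columns is assembled.)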
      call merge_recursive_&
      &precision &
           (obj, 0, np_cols, ldq, matrixcols, nblk, &
            l_col, p_col, l_col_bc, p_col_bc, limits, &
            np_cols, na, q, d, e, &
            mpi_comm_all, mpi_comm_rows, mpi_comm_cols, &
            usegpu, wantdebug, success, max_threads)

      if (.not.(success)) then
        call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
        return
      endif

      deallocate(limits,l_col,p_col,l_col_bc,p_col_bc, stat=istat, errmsg=errormessage)
      check_deallocate("solve_tridi: limits, l_col, p_col, l_col_bc, p_col_bc", istat, errormessage)

      if (usegpu) then
        ! dirty hack: the recursive merge updated d, e and q on the host;
        ! copy the results back so the device buffers stay consistent
        num = na * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        call gpu_memcpy_async_and_stream_synchronize &
             ("solve_tridi d -> d_dev", d_dev, 0_c_intptr_t, &
              d(1:na), &
              1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
#else
        successgpu = gpu_memcpy(d_dev, int(loc(d(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice)
        check_memcpy_gpu("solve_tridi: d_dev", successgpu)
#endif
        num = na * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        call gpu_memcpy_async_and_stream_synchronize &
             ("solve_tridi e -> e_dev", e_dev, 0_c_intptr_t, &
              e(1:na), &
              1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
#else
        successgpu = gpu_memcpy(e_dev, int(loc(e(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice)
        check_memcpy_gpu("solve_tridi: e_dev", successgpu)
#endif
        if (.not.(obj%eigenvalues_only)) then
          num = ldq*matrixcols * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
          my_stream = obj%gpu_setup%my_stream
          call gpu_memcpy_async_and_stream_synchronize &
               ("solve_tridi q -> q_dev", q_dev, 0_c_intptr_t, &
                q(1:ldq,1:matrixcols), &
                1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
#else
          successgpu = gpu_memcpy(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
                                  num, gpumemcpyhosttodevice)
          check_memcpy_gpu("solve_tridi: q_dev", successgpu)
#endif
        endif ! eigenvalues_only
      endif

      call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
      return

      end