Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2024.05.001
solve_tridi_template.F90
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
!   Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!   Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
!   Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!   Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!   and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

#include "../general/sanity.F90"
#include "../general/error_checking.inc"

#ifdef SOLVE_TRIDI_GPU_BUILD
subroutine solve_tridi_gpu_&
&precision_and_suffix &
#else
subroutine solve_tridi_cpu_&
&precision_and_suffix &
#endif
  ( obj, na, nev, &
#ifdef SOLVE_TRIDI_GPU_BUILD
    d_dev, e_dev, q_dev, &
#else
    d, e, q, &
#endif
    ldq, nblk, matrixcols, mpi_comm_all, mpi_comm_rows, &
    mpi_comm_cols, wantdebug, success, max_threads )
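
  ! Divide-and-conquer solver for the symmetric tridiagonal eigenproblem.
  ! On entry d holds the diagonal and e the subdiagonal of the tridiagonal
  ! matrix of order na; on exit d holds the eigenvalues and the first nev
  ! columns of q the eigenvectors. In the GPU build the data is passed via
  ! the device pointers d_dev, e_dev, q_dev instead, and the host arrays
  ! serve as staging buffers. nblk is the block size of the block-cyclic
  ! distribution, ldq and matrixcols give the local shape of q, and the
  ! mpi_comm_* arguments are the communicators of the 2D process grid.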

  use precision
  use elpa_abstract_impl
  use elpa_mpi
  use elpa_utilities
  use elpa_gpu
  use elpa_gpu_util
  implicit none
#include "../../src/general/precision_kinds.F90"
  class(elpa_abstract_impl_t), intent(inout) :: obj
  integer(kind=ik), intent(in)          :: na, nev, ldq, nblk, matrixCols, &
                                           mpi_comm_all, mpi_comm_rows, mpi_comm_cols

  integer(kind=c_intptr_t)              :: d_dev, e_dev, q_dev
#ifndef SOLVE_TRIDI_GPU_BUILD
  real(kind=real_datatype), intent(inout) :: d(na), e(na)
#ifdef USE_ASSUMED_SIZE
  real(kind=real_datatype), intent(inout) :: q(ldq,*)
#else
  real(kind=real_datatype), intent(inout) :: q(ldq,matrixcols)
#endif
#else /* SOLVE_TRIDI_GPU_BUILD */
  real(kind=real_datatype)              :: d(na), e(na)
  real(kind=real_datatype)              :: q(ldq,matrixcols)
#endif /* SOLVE_TRIDI_GPU_BUILD */

  logical, intent(in)                   :: wantDebug
  logical, intent(out)                  :: success

  integer(kind=ik)                      :: i, j, n, np, nc, nev1, l_cols, l_rows
  integer(kind=ik)                      :: my_prow, my_pcol, np_rows, np_cols
  integer(kind=MPI_KIND)                :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
  integer(kind=ik), allocatable         :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:)

  integer(kind=ik)                      :: istat
  character(200)                        :: errorMessage
  character(20)                         :: gpuString
  integer(kind=ik), intent(in)          :: max_threads
  logical                               :: useGPU
  integer(kind=c_intptr_t)              :: num
  integer(kind=c_intptr_t), parameter   :: size_of_datatype = size_of_&
                                           &precision&
                                           &_&
                                           &math_datatype
  integer(kind=c_intptr_t), parameter   :: size_of_datatype_real = size_of_&
                                           &precision&
                                           &_real
  integer(kind=c_intptr_t)              :: gpuHandle, my_stream
  logical                               :: successGPU
  usegpu = .false.
#ifdef SOLVE_TRIDI_GPU_BUILD
  usegpu = .true.
#endif

  if (usegpu) then
    gpustring = "_gpu"
  else
    gpustring = ""
  endif

  call obj%timer%start("solve_tridi" // precision_suffix // gpustring)

  call obj%timer%start("mpi_communication")
  call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind), my_prowmpi, mpierr)
  call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind), np_rowsmpi, mpierr)
  call mpi_comm_rank(int(mpi_comm_cols,kind=mpi_kind), my_pcolmpi, mpierr)
  call mpi_comm_size(int(mpi_comm_cols,kind=mpi_kind), np_colsmpi, mpierr)

  my_prow = int(my_prowmpi,kind=c_int)
  np_rows = int(np_rowsmpi,kind=c_int)
  my_pcol = int(my_pcolmpi,kind=c_int)
  np_cols = int(np_colsmpi,kind=c_int)

  call obj%timer%stop("mpi_communication")

  if (usegpu) then
    ! dirty hack
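    ! The divide-and-conquer solver below runs on the host, so in the GPU
    ! build the inputs are first staged from device to host: d and e always,
    ! q only if eigenvectors are requested. The results are copied back to
    ! the device before returning (see the mirrored host-to-device copies
    ! further down).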
    num = na * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
    my_stream = obj%gpu_setup%my_stream
    call gpu_memcpy_async_and_stream_synchronize &
         ("solve_tridi d_dev -> d", d_dev, 0_c_intptr_t, &
          d(1:na), &
          1, num, gpumemcpydevicetohost, my_stream, .false., .false., .false.)
#else
    successgpu = gpu_memcpy(int(loc(d(1)),kind=c_intptr_t), d_dev, &
                            num, gpumemcpydevicetohost)
    check_memcpy_gpu("solve_tridi: 1: d_dev", successgpu)
#endif
    num = na * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
    my_stream = obj%gpu_setup%my_stream
    call gpu_memcpy_async_and_stream_synchronize &
         ("solve_tridi e_dev -> e", e_dev, 0_c_intptr_t, &
          e(1:na), &
          1, num, gpumemcpydevicetohost, my_stream, .false., .false., .false.)
#else
    successgpu = gpu_memcpy(int(loc(e(1)),kind=c_intptr_t), e_dev, &
                            num, gpumemcpydevicetohost)
    check_memcpy_gpu("solve_tridi: e_dev", successgpu)
#endif
    if (.not.(obj%eigenvalues_only)) then
      num = ldq*matrixcols * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
      my_stream = obj%gpu_setup%my_stream
      call gpu_memcpy_async_and_stream_synchronize &
           ("solve_tridi q_dev -> q_vec", q_dev, 0_c_intptr_t, &
            q(1:ldq,1:matrixcols), &
            1, 1, num, gpumemcpydevicetohost, my_stream, .false., .false., .false.)
#else
      successgpu = gpu_memcpy(int(loc(q(1,1)),kind=c_intptr_t), q_dev, &
                              num, gpumemcpydevicetohost)
      check_memcpy_gpu("solve_tridi: q_dev", successgpu)
#endif
    endif ! eigenvalues_only
  endif

  success = .true.

  l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
  l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q

  !if (.not.(obj%eigenvalues_only)) then
  ! Set Q to 0
  q(1:l_rows, 1:l_cols) = 0.0_rk
  !endif
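
  ! Note: the per-column subproblem solves below fill disjoint diagonal
  ! blocks of q, and the merge phase accumulates into it, so the local
  ! part of q must start out as zero.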

  ! Get the limits of the subdivisions; each subdivision has as many
  ! columns as fit on the respective processor column.

  allocate(limits(0:np_cols), stat=istat, errmsg=errormessage)
  check_allocate("solve_tridi: limits", istat, errormessage)

  limits(0) = 0
  do np=0,np_cols-1
    nc = local_index(na, np, np_cols, nblk, -1) ! number of columns on proc column np

    ! Check for the case that a column has zero width.
    ! This is not supported!
    ! ScaLAPACK supports it but delivers no results for these columns,
    ! which is rather annoying.
    if (nc==0) then
      call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
      if (wantdebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width'
      success = .false.
      return
    endif
    limits(np+1) = limits(np) + nc
  enddo
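
  ! limits(np) is the global column after which processor column np's
  ! subproblem starts; e.g. for na=10, np_cols=2 and equally sized local
  ! parts, limits = (/ 0, 5, 10 /), i.e. subproblems over columns 1..5
  ! and 6..10.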

  ! Subdivide matrix by subtracting rank 1 modifications

  do i=1,np_cols-1
    n = limits(i)
    d(n) = d(n)-abs(e(n))
    d(n+1) = d(n+1)-abs(e(n))
  enddo
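
  ! At each split point n this is the standard divide-and-conquer
  ! decomposition of a symmetric tridiagonal matrix T:
  !   T = | T1  0  | + rho * v * v^T,   rho = |e(n)|,
  !       | 0   T2 |
  ! where v has entry 1 at position n, sign(e(n)) at position n+1 and
  ! zeros elsewhere, and T1, T2 carry the modified diagonal entries
  ! d(n)-|e(n)| and d(n+1)-|e(n)| computed above. T1 and T2 can then be
  ! solved independently; the rank-1 coupling is restored in the merge
  ! phase.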

  ! Solve the subproblems on the processor columns

  nc = limits(my_pcol) ! column after which my problem starts

  if (np_cols>1) then
    nev1 = l_cols ! all eigenvectors are needed
  else
    nev1 = min(nev,l_cols)
  endif
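
  ! With more than one processor column the merge steps still have to
  ! combine the subproblem solutions, which requires the complete
  ! eigenvector basis of each subproblem; only in the single-column case
  ! can the solve be restricted to the nev requested eigenvectors.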
  call solve_tridi_col_&
       &precision_and_suffix &
       (obj, l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, &
        matrixcols, mpi_comm_rows, usegpu, wantdebug, success, max_threads)
  if (.not.(success)) then
    call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
    return
  endif
  ! If there is only 1 processor column, we are done

  if (np_cols==1) then
    deallocate(limits, stat=istat, errmsg=errormessage)
    check_deallocate("solve_tridi: limits", istat, errormessage)
    if (usegpu) then
      ! dirty hack
      num = na * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
      my_stream = obj%gpu_setup%my_stream
      call gpu_memcpy_async_and_stream_synchronize &
           ("solve_tridi d -> d_dev", d_dev, 0_c_intptr_t, &
            d(1:na), &
            1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
#else
      successgpu = gpu_memcpy(d_dev, int(loc(d(1)),kind=c_intptr_t), &
                              num, gpumemcpyhosttodevice)
      check_memcpy_gpu("solve_tridi: d_dev", successgpu)
#endif
      num = na * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
      my_stream = obj%gpu_setup%my_stream
      call gpu_memcpy_async_and_stream_synchronize &
           ("solve_tridi e -> e_dev", e_dev, 0_c_intptr_t, &
            e(1:na), &
            1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
#else
      successgpu = gpu_memcpy(e_dev, int(loc(e(1)),kind=c_intptr_t), &
                              num, gpumemcpyhosttodevice)
      check_memcpy_gpu("solve_tridi: e_dev", successgpu)
#endif
      if (.not.(obj%eigenvalues_only)) then
        num = ldq*matrixcols * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
        my_stream = obj%gpu_setup%my_stream
        call gpu_memcpy_async_and_stream_synchronize &
             ("solve_tridi q -> q_dev", q_dev, 0_c_intptr_t, &
              q(1:ldq,1:matrixcols), &
              1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
#else
        successgpu = gpu_memcpy(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice)
        check_memcpy_gpu("solve_tridi: q_dev", successgpu)
#endif
      endif ! eigenvalues_only
    endif

    call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
    return
  endif

  ! Set index arrays for Q columns

  ! Dense distribution scheme:

  allocate(l_col(na), stat=istat, errmsg=errormessage)
  check_allocate("solve_tridi: l_col", istat, errormessage)

  allocate(p_col(na), stat=istat, errmsg=errormessage)
  check_allocate("solve_tridi: p_col", istat, errormessage)

  n = 0
  do np=0,np_cols-1
    nc = local_index(na, np, np_cols, nblk, -1)
    do i=1,nc
      n = n+1
      l_col(n) = i
      p_col(n) = np
    enddo
  enddo
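
  ! After this loop, global column n lives on processor column p_col(n)
  ! as local column l_col(n): the columns of each subproblem are stored
  ! contiguously on their processor column, matching the subdivision in
  ! limits(:).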

  ! Block cyclic distribution scheme, only nev columns are set:

  allocate(l_col_bc(na), stat=istat, errmsg=errormessage)
  check_allocate("solve_tridi: l_col_bc", istat, errormessage)

  allocate(p_col_bc(na), stat=istat, errmsg=errormessage)
  check_allocate("solve_tridi: p_col_bc", istat, errormessage)

  p_col_bc(:) = -1
  l_col_bc(:) = -1

  do i = 0, na-1, nblk*np_cols
    do j = 0, np_cols-1
      do n = 1, nblk
        if (i+j*nblk+n <= min(nev,na)) then
          p_col_bc(i+j*nblk+n) = j
          l_col_bc(i+j*nblk+n) = i/np_cols + n
        endif
      enddo
    enddo
  enddo
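
  ! This is the usual 1D block-cyclic layout: global column i+j*nblk+n
  ! belongs to processor column j as local column i/np_cols + n. Only the
  ! first min(nev,na) columns (the requested eigenvectors) are mapped;
  ! the remaining entries stay -1, i.e. those columns are not distributed.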

  ! Recursively merge subproblems
  call merge_recursive_&
       &precision &
       (obj, 0, np_cols, ldq, matrixcols, nblk, &
        l_col, p_col, l_col_bc, p_col_bc, limits, &
        np_cols, na, q, d, e, &
        mpi_comm_all, mpi_comm_rows, mpi_comm_cols, &
        usegpu, wantdebug, success, max_threads)
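
  ! merge_recursive bisects the range of processor columns and, on the way
  ! back up, combines pairs of adjacent subproblems via the rank-1 update
  ! (the secular equation solved in merge_systems). The index arrays steer
  ! where the eigenvector columns end up: the dense layout (l_col/p_col)
  ! during intermediate merges, the block-cyclic layout
  ! (l_col_bc/p_col_bc) for the final result.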

  if (.not.(success)) then
    call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
    return
  endif

  deallocate(limits,l_col,p_col,l_col_bc,p_col_bc, stat=istat, errmsg=errormessage)
  check_deallocate("solve_tridi: limits, l_col, p_col, l_col_bc, p_col_bc", istat, errormessage)

  if (usegpu) then
    ! dirty hack
    num = na * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
    my_stream = obj%gpu_setup%my_stream
    call gpu_memcpy_async_and_stream_synchronize &
         ("solve_tridi d -> d_dev", d_dev, 0_c_intptr_t, &
          d(1:na), &
          1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
#else
    successgpu = gpu_memcpy(d_dev, int(loc(d(1)),kind=c_intptr_t), &
                            num, gpumemcpyhosttodevice)
    check_memcpy_gpu("solve_tridi: d_dev", successgpu)
#endif
    num = na * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
    my_stream = obj%gpu_setup%my_stream
    call gpu_memcpy_async_and_stream_synchronize &
         ("solve_tridi e -> e_dev", e_dev, 0_c_intptr_t, &
          e(1:na), &
          1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
#else
    successgpu = gpu_memcpy(e_dev, int(loc(e(1)),kind=c_intptr_t), &
                            num, gpumemcpyhosttodevice)
    check_memcpy_gpu("solve_tridi: e_dev", successgpu)
#endif
    if (.not.(obj%eigenvalues_only)) then
      num = ldq*matrixcols * size_of_datatype_real
#ifdef WITH_GPU_STREAMS
      my_stream = obj%gpu_setup%my_stream
      call gpu_memcpy_async_and_stream_synchronize &
           ("solve_tridi q -> q_dev", q_dev, 0_c_intptr_t, &
            q(1:ldq,1:matrixcols), &
            1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
#else
      successgpu = gpu_memcpy(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
                              num, gpumemcpyhosttodevice)
      check_memcpy_gpu("solve_tridi: q_dev", successgpu)
#endif
    endif ! eigenvalues_only
  endif

  call obj%timer%stop("solve_tridi" // precision_suffix // gpustring)
  return

end