62subroutine solve_tridi_cpu_&
63&precision_and_suffix &
66#ifdef SOLVE_TRIDI_GPU_BUILD
67 d_dev, e_dev, q_dev, &
71 ldq, nblk, matrixcols, mpi_comm_all, mpi_comm_rows, &
72 mpi_comm_cols, wantdebug, success, max_threads )
85#include "../../src/general/precision_kinds.F90"
87 integer(kind=ik),
intent(in) :: na, nev, ldq, nblk, matrixCols, &
88 mpi_comm_all, mpi_comm_rows, mpi_comm_cols
90 integer(kind=c_intptr_t) :: d_dev, e_dev, q_dev
91#ifndef SOLVE_TRIDI_GPU_BUILD
92 real(kind=real_datatype),
intent(inout) :: d(na), e(na)
93#ifdef USE_ASSUMED_SIZE
94 real(kind=real_datatype),
intent(inout) :: q(ldq,*)
96 real(kind=real_datatype),
intent(inout) :: q(ldq,matrixcols)
98#else /* SOLVE_TRIDI_GPU_BUILD */
99 real(kind=real_datatype) :: d(na), e(na)
100 real(kind=real_datatype) :: q(ldq,matrixcols)
101#endif /* SOLVE_TRIDI_GPU_BUILD */
103 logical,
intent(in) :: wantDebug
104 logical,
intent(out) :: success
106 integer(kind=ik) :: i, j, n, np, nc, nev1, l_cols, l_rows
107 integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols
108 integer(kind=MPI_KIND) :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
109 integer(kind=ik),
allocatable :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:)
111 integer(kind=ik) :: istat
112 character(200) :: errorMessage
113 character(20) :: gpuString
114 integer(kind=ik),
intent(in) :: max_threads
116 integer(kind=c_intptr_t) :: num
117 integer(kind=c_intptr_t),
parameter :: size_of_datatype = size_of_&
121 integer(kind=c_intptr_t),
parameter :: size_of_datatype_real = size_of_&
124 integer(kind=c_intptr_t) :: gpuHandle, my_stream
125 logical :: successGPU
128#ifdef SOLVE_TRIDI_GPU_BUILD
138 call obj%timer%start(
"solve_tridi" // precision_suffix // gpustring)
140 call obj%timer%start(
"mpi_communication")
141 call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind) ,my_prowmpi, mpierr)
142 call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind) ,np_rowsmpi, mpierr)
143 call mpi_comm_rank(int(mpi_comm_cols,kind=mpi_kind) ,my_pcolmpi, mpierr)
144 call mpi_comm_size(int(mpi_comm_cols,kind=mpi_kind) ,np_colsmpi, mpierr)
146 my_prow = int(my_prowmpi,kind=c_int)
147 np_rows = int(np_rowsmpi,kind=c_int)
148 my_pcol = int(my_pcolmpi,kind=c_int)
149 np_cols = int(np_colsmpi,kind=c_int)
152 call obj%timer%stop(
"mpi_communication")
155 num = na * size_of_datatype_real
156#ifdef WITH_GPU_STREAMS
157 my_stream = obj%gpu_setup%my_stream
158 call gpu_memcpy_async_and_stream_synchronize &
159 (
"solve_tridi d_dev -> d", d_dev, 0_c_intptr_t, &
161 1, num, gpumemcpydevicetohost, my_stream, .false., .false., .false.)
163 successgpu = gpu_memcpy(int(loc(d(1)),kind=c_intptr_t), d_dev, &
164 num, gpumemcpydevicetohost)
165 check_memcpy_gpu(
"solve_tridi: 1: d_dev", successgpu)
167 num = na * size_of_datatype_real
168#ifdef WITH_GPU_STREAMS
169 my_stream = obj%gpu_setup%my_stream
170 call gpu_memcpy_async_and_stream_synchronize &
171 (
"solve_tridi e_dev -> e", e_dev, 0_c_intptr_t, &
173 1, num, gpumemcpydevicetohost, my_stream, .false., .false., .false.)
175 successgpu = gpu_memcpy(int(loc(e(1)),kind=c_intptr_t), e_dev, &
176 num, gpumemcpydevicetohost)
177 check_memcpy_gpu(
"solve_tridi: e_dev", successgpu)
179 if (.not.(obj%eigenvalues_only))
then
180 num = ldq*matrixcols * size_of_datatype_real
181#ifdef WITH_GPU_STREAMS
182 my_stream = obj%gpu_setup%my_stream
183 call gpu_memcpy_async_and_stream_synchronize &
184 (
"solve_tridi q_dev -> q_vec", q_dev, 0_c_intptr_t, &
185 q(1:ldq,1:matrixcols), &
186 1, 1, num, gpumemcpydevicetohost, my_stream, .false., .false., .false.)
188 successgpu = gpu_memcpy(int(loc(q(1,1)),kind=c_intptr_t), q_dev, &
189 num, gpumemcpydevicetohost)
190 check_memcpy_gpu(
"solve_tridi: q_dev", successgpu)
199 l_rows = local_index(na, my_prow, np_rows, nblk, -1)
200 l_cols = local_index(na, my_pcol, np_cols, nblk, -1)
204 q(1:l_rows, 1:l_cols) = 0.0_rk
210 allocate(limits(0:np_cols), stat=istat, errmsg=errormessage)
211 check_allocate(
"solve_tridi: limits", istat, errormessage)
215 nc = local_index(na, np, np_cols, nblk, -1)
222 call obj%timer%stop(
"solve_tridi" // precision_suffix)
223 if (wantdebug)
write(error_unit,*)
'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width'
227 limits(np+1) = limits(np) + nc
234 d(n) = d(n)-abs(e(n))
235 d(n+1) = d(n+1)-abs(e(n))
245 nev1 = min(nev,l_cols)
247 call solve_tridi_col_&
248 &precision_and_suffix &
249 (obj, l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, &
250 matrixcols, mpi_comm_rows, usegpu, wantdebug, success, max_threads)
251 if (.not.(success))
then
252 call obj%timer%stop(
"solve_tridi" // precision_suffix // gpustring)
258 deallocate(limits, stat=istat, errmsg=errormessage)
259 check_deallocate(
"solve_tridi: limits", istat, errormessage)
262 num = na * size_of_datatype_real
263#ifdef WITH_GPU_STREAMS
264 my_stream = obj%gpu_setup%my_stream
265 call gpu_memcpy_async_and_stream_synchronize &
266 (
"solve_trid d -> d_dev", d_dev, 0_c_intptr_t, &
268 1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
270 successgpu = gpu_memcpy(d_dev, int(loc(d(1)),kind=c_intptr_t), &
271 num, gpumemcpyhosttodevice)
272 check_memcpy_gpu(
"solve_tridi: d_dev", successgpu)
274 num = na * size_of_datatype_real
275#ifdef WITH_GPU_STREAMS
276 my_stream = obj%gpu_setup%my_stream
277 call gpu_memcpy_async_and_stream_synchronize &
278 (
"solve_tridi e_dev -> e", e_dev, 0_c_intptr_t, &
280 1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
282 successgpu = gpu_memcpy(e_dev, int(loc(e(1)),kind=c_intptr_t), &
283 num, gpumemcpyhosttodevice)
284 check_memcpy_gpu(
"solve_tridi: e_dev", successgpu)
286 if (.not.(obj%eigenvalues_only))
then
287 num = ldq*matrixcols * size_of_datatype_real
288#ifdef WITH_GPU_STREAMS
289 my_stream = obj%gpu_setup%my_stream
290 call gpu_memcpy_async_and_stream_synchronize &
291 (
"solve_tride q_dev -> q_vec", q_dev, 0_c_intptr_t, &
292 q(1:ldq,1:matrixcols), &
293 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
295 successgpu = gpu_memcpy(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
296 num, gpumemcpyhosttodevice)
297 check_memcpy_gpu(
"solve_tridi: q_dev", successgpu)
302 call obj%timer%stop(
"solve_tridi" // precision_suffix // gpustring)
310 allocate(l_col(na), stat=istat, errmsg=errormessage)
311 check_allocate(
"solve_tridi: l_col", istat, errormessage)
313 allocate(p_col(na), stat=istat, errmsg=errormessage)
314 check_allocate(
"solve_tridi: p_col", istat, errormessage)
318 nc = local_index(na, np, np_cols, nblk, -1)
328 allocate(l_col_bc(na), stat=istat, errmsg=errormessage)
329 check_allocate(
"solve_tridi: l_col_bc", istat, errormessage)
331 allocate(p_col_bc(na), stat=istat, errmsg=errormessage)
332 check_allocate(
"solve_tridi: p_col_bc", istat, errormessage)
337 do i = 0, na-1, nblk*np_cols
340 if (i+j*nblk+n <= min(nev,na))
then
341 p_col_bc(i+j*nblk+n) = j
342 l_col_bc(i+j*nblk+n) = i/np_cols + n
349 call merge_recursive_&
351 (obj, 0, np_cols, ldq, matrixcols, nblk, &
352 l_col, p_col, l_col_bc, p_col_bc, limits, &
353 np_cols, na, q, d, e, &
354 mpi_comm_all, mpi_comm_rows, mpi_comm_cols,&
355 usegpu, wantdebug, success, max_threads)
357 if (.not.(success))
then
358 call obj%timer%stop(
"solve_tridi" // precision_suffix // gpustring)
362 deallocate(limits,l_col,p_col,l_col_bc,p_col_bc, stat=istat, errmsg=errormessage)
363 check_deallocate(
"solve_tridi: limits, l_col, p_col, l_col_bc, p_col_bc", istat, errormessage)
368 num = na * size_of_datatype_real
369#ifdef WITH_GPU_STREAMS
370 my_stream = obj%gpu_setup%my_stream
371 call gpu_memcpy_async_and_stream_synchronize &
372 (
"solve_trid d -> d_dev", d_dev, 0_c_intptr_t, &
374 1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
376 successgpu = gpu_memcpy(d_dev, int(loc(d(1)),kind=c_intptr_t), &
377 num, gpumemcpyhosttodevice)
378 check_memcpy_gpu(
"solve_tridi: d_dev", successgpu)
380 num = na * size_of_datatype_real
381#ifdef WITH_GPU_STREAMS
382 my_stream = obj%gpu_setup%my_stream
383 call gpu_memcpy_async_and_stream_synchronize &
384 (
"solve_tridi e_dev -> e", e_dev, 0_c_intptr_t, &
386 1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
388 successgpu = gpu_memcpy(e_dev, int(loc(e(1)),kind=c_intptr_t), &
389 num, gpumemcpyhosttodevice)
390 check_memcpy_gpu(
"solve_tridi: e_dev", successgpu)
392 if (.not.(obj%eigenvalues_only))
then
393 num = ldq*matrixcols * size_of_datatype_real
394#ifdef WITH_GPU_STREAMS
395 my_stream = obj%gpu_setup%my_stream
396 call gpu_memcpy_async_and_stream_synchronize &
397 (
"solve_tride q_dev -> q_vec", q_dev, 0_c_intptr_t, &
398 q(1:ldq,1:matrixcols), &
399 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
401 successgpu = gpu_memcpy(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
402 num, gpumemcpyhosttodevice)
403 check_memcpy_gpu(
"solve_tridi: q_dev", successgpu)
408 call obj%timer%stop(
"solve_tridi" // precision_suffix // gpustring)