62subroutine solve_tridi_cpu_&
63&precision_and_suffix &
66#ifdef SOLVE_TRIDI_GPU_BUILD
67 d_dev, e_dev, q_dev, &
71 ldq, nblk, matrixcols, mpi_comm_all, mpi_comm_rows, &
72 mpi_comm_cols, wantdebug, success, max_threads )
86#include "../../src/general/precision_kinds.F90"
88 integer(kind=ik),
intent(in) :: na, nev, ldq, nblk, matrixCols, &
89 mpi_comm_all, mpi_comm_rows, mpi_comm_cols
91 integer(kind=c_intptr_t) :: d_dev, e_dev, q_dev
92#ifndef SOLVE_TRIDI_GPU_BUILD
93 real(kind=real_datatype),
intent(inout) :: d(na), e(na)
94#ifdef USE_ASSUMED_SIZE
95 real(kind=real_datatype),
intent(inout) :: q(ldq,*)
97 real(kind=real_datatype),
intent(inout) :: q(ldq,matrixcols)
99#else /* SOLVE_TRIDI_GPU_BUILD */
100 real(kind=real_datatype) :: d(na), e(na)
101 real(kind=real_datatype) :: q(ldq,matrixcols)
102#endif /* SOLVE_TRIDI_GPU_BUILD */
104 logical,
intent(in) :: wantDebug
107 integer(kind=ik) :: i, j, n, np, nc, nev1, l_cols, l_rows
108 integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols
109 integer(kind=MPI_KIND) :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
110 integer(kind=ik),
allocatable :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:)
112 integer(kind=ik) :: istat
113 character(200) :: errorMessage
114 character(20) :: gpuString
115 integer(kind=ik),
intent(in) :: max_threads
117 integer(kind=c_intptr_t) :: num
118 integer(kind=c_intptr_t),
parameter :: size_of_datatype = size_of_&
122 integer(kind=c_intptr_t),
parameter :: size_of_datatype_real = size_of_&
125 integer(kind=c_intptr_t) :: gpuHandle, my_stream
126 type(c_ptr) :: limits_dev
127 logical :: successGPU
130#ifdef SOLVE_TRIDI_GPU_BUILD
140 call obj%timer%start(
"solve_tridi" // precision_suffix // gpustring)
142 call obj%timer%start(
"mpi_communication")
143 call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind) ,my_prowmpi, mpierr)
144 call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind) ,np_rowsmpi, mpierr)
145 call mpi_comm_rank(int(mpi_comm_cols,kind=mpi_kind) ,my_pcolmpi, mpierr)
146 call mpi_comm_size(int(mpi_comm_cols,kind=mpi_kind) ,np_colsmpi, mpierr)
148 my_prow = int(my_prowmpi,kind=c_int)
149 np_rows = int(np_rowsmpi,kind=c_int)
150 my_pcol = int(my_pcolmpi,kind=c_int)
151 np_cols = int(np_colsmpi,kind=c_int)
154 call obj%timer%stop(
"mpi_communication")
159 l_rows = local_index(na, my_prow, np_rows, nblk, -1)
160 l_cols = local_index(na, my_pcol, np_cols, nblk, -1)
164#ifdef WITH_GPU_STREAMS
165 my_stream = obj%gpu_setup%my_stream
166 successgpu = gpu_memset_async(q_dev, 0, l_rows*l_cols*size_of_datatype_real, my_stream)
167 check_memset_gpu(
"solve_tridi: tmp_dev", successgpu)
169 successgpu = gpu_memset(q_dev, 0, l_rows*l_cols*size_of_datatype_real)
170 check_memset_gpu(
"solve_tridi: tmp_dev", successgpu)
173 if (l_rows .ne. ldq)
then
174 print *,
"oh shit ldq:",l_rows,ldq
177 if (l_cols .ne. matrixcols)
then
178 print *,
"oh shit matrixCols:",l_cols,matrixcols
184 q(1:l_rows, 1:l_cols) = 0.0_rk
192 allocate(limits(0:np_cols), stat=istat, errmsg=errormessage)
193 check_allocate(
"solve_tridi: limits", istat, errormessage)
197 nc = local_index(na, np, np_cols, nblk, -1)
204 call obj%timer%stop(
"solve_tridi" // precision_suffix)
205 if (wantdebug)
write(error_unit,*)
'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width'
209 limits(np+1) = limits(np) + nc
217 num = (np_cols) * size_of_int
218 successgpu = gpu_malloc(limits_dev, num)
219 check_alloc_gpu(
"solve_tridi limits_dev: ", successgpu)
221 num = (np_cols) * size_of_int
222#ifdef WITH_GPU_STREAMS
223 my_stream = obj%gpu_setup%my_stream
224 successgpu = gpu_memcpy_async(limits_dev, int(loc(limits(1)),kind=c_intptr_t), &
225 num, gpumemcpyhosttodevice, my_stream)
226 check_memcpy_gpu(
"solve_tridi limits_dev: ", successgpu)
228 successgpu = gpu_memcpy(limits_dev, int(loc(limits(1)),kind=c_intptr_t), &
229 num, gpumemcpyhosttodevice)
230 check_memcpy_gpu(
"solve_tridi: limits_dev", successgpu)
234 my_stream = obj%gpu_setup%my_stream
235 call gpu_update_d_precision (limits_dev, d_dev, e_dev, np_cols, na, my_stream)
237 successgpu = gpu_free(limits_dev)
238 check_dealloc_gpu(
"solve_tridi: limits_dev", successgpu)
243 d(n) = d(n)-abs(e(n))
244 d(n+1) = d(n+1)-abs(e(n))
256 nev1 = min(nev,l_cols)
261 call solve_tridi_col_gpu_&
262 &precision_and_suffix &
263 (obj, l_cols, nev1, nc, d_dev +(nc+1-1)*size_of_datatype_real, &
264 e_dev + (nc+1-1)*size_of_datatype_real, q_dev, ldq, nblk, &
265 matrixcols, mpi_comm_rows, wantdebug, success, max_threads)
267 num = ldq*matrixcols * size_of_datatype_real
268#ifdef WITH_GPU_STREAMS
269 my_stream = obj%gpu_setup%my_stream
270 call gpu_memcpy_async_and_stream_synchronize &
271 (
"solve_tridi q_dev -> q_vec aaa", q_dev, 0_c_intptr_t, &
272 q(1:ldq,1:matrixcols), &
273 1, 1, num, gpumemcpydevicetohost, my_stream, .false., .false., .false.)
275 successgpu = gpu_memcpy(int(loc(q(1,1)),kind=c_intptr_t), q_dev, &
276 num, gpumemcpydevicetohost)
277 check_memcpy_gpu(
"solve_tridi aaa: q_dev", successgpu)
280 call solve_tridi_col_cpu_&
281 &precision_and_suffix &
282 (obj, l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, &
283 matrixcols, mpi_comm_rows, wantdebug, success, max_threads)
285 if (.not.(success))
then
286 call obj%timer%stop(
"solve_tridi" // precision_suffix // gpustring)
295 deallocate(limits, stat=istat, errmsg=errormessage)
296 check_deallocate(
"solve_tridi: limits", istat, errormessage)
298 call obj%timer%stop(
"solve_tridi" // precision_suffix // gpustring)
306 allocate(l_col(na), stat=istat, errmsg=errormessage)
307 check_allocate(
"solve_tridi: l_col", istat, errormessage)
309 allocate(p_col(na), stat=istat, errmsg=errormessage)
310 check_allocate(
"solve_tridi: p_col", istat, errormessage)
314 nc = local_index(na, np, np_cols, nblk, -1)
324 allocate(l_col_bc(na), stat=istat, errmsg=errormessage)
325 check_allocate(
"solve_tridi: l_col_bc", istat, errormessage)
327 allocate(p_col_bc(na), stat=istat, errmsg=errormessage)
328 check_allocate(
"solve_tridi: p_col_bc", istat, errormessage)
333 do i = 0, na-1, nblk*np_cols
336 if (i+j*nblk+n <= min(nev,na))
then
337 p_col_bc(i+j*nblk+n) = j
338 l_col_bc(i+j*nblk+n) = i/np_cols + n
347 num = na * size_of_datatype_real
348#ifdef WITH_GPU_STREAMS
349 my_stream = obj%gpu_setup%my_stream
350 call gpu_memcpy_async_and_stream_synchronize &
351 (
"solve_tridi d_dev -> d", d_dev, 0_c_intptr_t, &
353 1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
355 successgpu = gpu_memcpy(int(loc(d(1)),kind=c_intptr_t), d_dev, &
356 num, gpumemcpydevicetohost)
357 check_memcpy_gpu(
"solve_tridi: 1: d_dev", successgpu)
359 num = na * size_of_datatype_real
360#ifdef WITH_GPU_STREAMS
361 my_stream = obj%gpu_setup%my_stream
362 call gpu_memcpy_async_and_stream_synchronize &
363 (
"solve_tridi e_dev -> e", e_dev, 0_c_intptr_t, &
365 1, num, gpumemcpydevicetohost, my_stream, .false., .true., .false.)
367 successgpu = gpu_memcpy(int(loc(e(1)),kind=c_intptr_t), e_dev, &
368 num, gpumemcpydevicetohost)
369 check_memcpy_gpu(
"solve_tridi: 1: d_dev", successgpu)
376 call merge_recursive_&
378 (obj, 0, np_cols, ldq, matrixcols, nblk, &
379 l_col, p_col, l_col_bc, p_col_bc, limits, &
380 np_cols, na, q, d, e, &
381 mpi_comm_all, mpi_comm_rows, mpi_comm_cols,&
382 usegpu, wantdebug, success, max_threads)
384 if (.not.(success))
then
385 call obj%timer%stop(
"solve_tridi" // precision_suffix // gpustring)
389 deallocate(limits,l_col,p_col,l_col_bc,p_col_bc, stat=istat, errmsg=errormessage)
390 check_deallocate(
"solve_tridi: limits, l_col, p_col, l_col_bc, p_col_bc", istat, errormessage)
395 num = na * size_of_datatype_real
396#ifdef WITH_GPU_STREAMS
397 my_stream = obj%gpu_setup%my_stream
398 call gpu_memcpy_async_and_stream_synchronize &
399 (
"solve_trid d -> d_dev", d_dev, 0_c_intptr_t, &
401 1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
403 successgpu = gpu_memcpy(d_dev, int(loc(d(1)),kind=c_intptr_t), &
404 num, gpumemcpyhosttodevice)
405 check_memcpy_gpu(
"solve_tridi: d_dev", successgpu)
407 num = na * size_of_datatype_real
408#ifdef WITH_GPU_STREAMS
409 my_stream = obj%gpu_setup%my_stream
410 call gpu_memcpy_async_and_stream_synchronize &
411 (
"solve_tridi e_dev -> e", e_dev, 0_c_intptr_t, &
413 1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
415 successgpu = gpu_memcpy(e_dev, int(loc(e(1)),kind=c_intptr_t), &
416 num, gpumemcpyhosttodevice)
417 check_memcpy_gpu(
"solve_tridi: e_dev", successgpu)
419 if (.not.(obj%eigenvalues_only))
then
420 num = ldq*matrixcols * size_of_datatype_real
421#ifdef WITH_GPU_STREAMS
422 my_stream = obj%gpu_setup%my_stream
423 call gpu_memcpy_async_and_stream_synchronize &
424 (
"solve_tride q_dev -> q_vec", q_dev, 0_c_intptr_t, &
425 q(1:ldq,1:matrixcols), &
426 1, 1, num, gpumemcpyhosttodevice, my_stream, .false., .false., .false.)
428 successgpu = gpu_memcpy(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
429 num, gpumemcpyhosttodevice)
430 check_memcpy_gpu(
"solve_tridi: q_dev", successgpu)
435 call obj%timer%stop(
"solve_tridi" // precision_suffix // gpustring)