55#include "../general/sanity.F90"
56#include "../general/error_checking.inc"
59#ifdef SOLVE_TRIDI_GPU_BUILD
60 subroutine solve_tridi_single_problem_gpu_&
61 &precision_and_suffix &
62 (obj, nlen, d_dev, e_dev, q_dev, ldq, qtmp_dev, wantdebug, success)
64 subroutine solve_tridi_single_problem_cpu_&
65 &precision_and_suffix &
66 (obj, nlen, d, e, q, ldq, wantdebug, success)
77 use elpa_blas_interfaces
80 use solve_single_problem_gpu
82 class(elpa_abstract_impl_t),
intent(inout) :: obj
83 logical :: useGPU, useGPUsolver
84 integer(kind=ik) :: nlen, ldq
85 real(kind=real_datatype) :: d(nlen), e(nlen), q(ldq,nlen)
87 real(kind=real_datatype),
allocatable :: work(:), qtmp(:), ds(:), es(:)
88 real(kind=real_datatype) :: dtmp
90 integer(kind=ik) :: i, j, lwork, liwork, info
91 integer(kind=BLAS_KIND) :: infoBLAS
92 integer(kind=ik),
allocatable :: iwork(:)
95 logical,
intent(in) :: wantDebug
96 logical,
intent(out) :: success
97 integer(kind=ik) :: istat
98 character(200) :: errorMessage
99 integer(kind=c_intptr_t) :: num, my_stream
100 integer(kind=c_intptr_t) :: q_dev, d_dev, info_dev, qtmp_dev, e_dev
101 logical :: successGPU
102 integer(kind=c_intptr_t),
parameter :: size_of_datatype = size_of_&
106 integer(kind=c_intptr_t) :: gpusolverHandle
111 usegpusolver =.false.
112#ifdef SOLVE_TRIDI_GPU_BUILD
115#if defined(WITH_NVIDIA_CUSOLVER)
118#if defined(WITH_AMD_ROCSOLVER)
119 usegpusolver =.false.
121#endif /* SOLVE_TRIDI_GPU_BUILD */
123 call obj%timer%start(
"solve_tridi_single" // precision_suffix)
126 allocate(ds(nlen), es(nlen), stat=istat, errmsg=errormessage)
127 check_allocate(
"solve_tridi_single: ds, es", istat, errormessage)
129 if (usegpusolver)
then
130 num = 1 * size_of_int
131 successgpu = gpu_malloc(info_dev, num)
132 check_alloc_gpu(
"solve_tridi_single info_dev: ", successgpu)
134 my_stream = obj%gpu_setup%my_stream
135 call gpu_construct_tridi_matrix_precision (q_dev, d_dev, e_dev, nlen, ldq, my_stream)
139#if defined(WITH_NVIDIA_CUSOLVER) || defined(WITH_AMD_ROCSOLVER)
140 if (.not.usegpu)
then
147 if (.not. usegpusolver)
then
151 num = nlen * size_of_datatype
152#ifdef WITH_GPU_STREAMS
153 my_stream = obj%gpu_setup%my_stream
154 successgpu = gpu_memcpy_async(int(loc(d(1)),kind=c_intptr_t), d_dev, &
155 num, gpumemcpydevicetohost, my_stream)
156 check_memcpy_gpu(
"solve_tridi_single: d_dev ", successgpu)
158 successgpu = gpu_memcpy(int(loc(d(1)),kind=c_intptr_t), d_dev, &
159 num, gpumemcpydevicetohost)
160 check_memcpy_gpu(
"solve_tridi_single: d_dev", successgpu)
162 num = nlen * size_of_datatype
163#ifdef WITH_GPU_STREAMS
164 my_stream = obj%gpu_setup%my_stream
165 successgpu = gpu_memcpy_async(int(loc(e(1)),kind=c_intptr_t), e_dev, &
166 num, gpumemcpydevicetohost, my_stream)
167 check_memcpy_gpu(
"solve_tridi_single: e_dev ", successgpu)
169 successgpu = gpu_memcpy(int(loc(e(1)),kind=c_intptr_t), e_dev, &
170 num, gpumemcpydevicetohost)
171 check_memcpy_gpu(
"solve_tridi_single: e_dev", successgpu)
177#include "./solve_tridi_single_problem_include.F90"
180 num = nlen * size_of_datatype
181#ifdef WITH_GPU_STREAMS
182 my_stream = obj%gpu_setup%my_stream
183 successgpu = gpu_memcpy_async(d_dev, int(loc(d(1)),kind=c_intptr_t), &
184 num, gpumemcpyhosttodevice, my_stream)
185 check_memcpy_gpu(
"solve_tridi_single: d_dev ", successgpu)
187 successgpu = gpu_memcpy(d_dev, int(loc(d(1)),kind=c_intptr_t), &
188 num, gpumemcpyhosttodevice)
189 check_memcpy_gpu(
"solve_tridi_single: d_dev", successgpu)
191 num = nlen * size_of_datatype
192#ifdef WITH_GPU_STREAMS
193 my_stream = obj%gpu_setup%my_stream
194 successgpu = gpu_memcpy_async(e_dev, int(loc(e(1)),kind=c_intptr_t), &
195 num, gpumemcpyhosttodevice, my_stream)
196 check_memcpy_gpu(
"solve_tridi_single: e_dev ", successgpu)
198 successgpu = gpu_memcpy(e_dev, int(loc(e(1)),kind=c_intptr_t), &
199 num, gpumemcpyhosttodevice)
200 check_memcpy_gpu(
"solve_tridi_single: e_dev", successgpu)
204 num = ldq*nlen * size_of_datatype
205#ifdef WITH_GPU_STREAMS
206 my_stream = obj%gpu_setup%my_stream
207 successgpu = gpu_memcpy_async(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
208 num, gpumemcpyhosttodevice, my_stream)
209 check_memcpy_gpu(
"solve_tridi_single: q_dev1 ", successgpu)
211 successgpu = gpu_memcpy(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
212 num, gpumemcpyhosttodevice)
213 check_memcpy_gpu(
"solve_tridi_single: q_dev1", successgpu)
218 call obj%timer%start(
"gpusolver_syevd")
219 gpusolverhandle = obj%gpu_setup%gpusolverHandleArray(0)
220 call gpusolver_precision_syevd (nlen, q_dev, ldq, d_dev, info_dev, gpusolverhandle)
221 if (wantdebug) successgpu = gpu_devicesynchronize()
222 call obj%timer%stop(
"gpusolver_syevd")
224 num = 1 * size_of_int
225#ifdef WITH_GPU_STREAMS
226 my_stream = obj%gpu_setup%my_stream
227 successgpu = gpu_memcpy_async(int(loc(info),kind=c_intptr_t), info_dev, &
228 num, gpumemcpydevicetohost, my_stream)
229 check_memcpy_gpu(
"solve_tridi_single: ", successgpu)
231 successgpu = gpu_stream_synchronize(my_stream)
232 check_stream_synchronize_gpu(
"solve_tridi_single: info_dev -> info", successgpu)
234 successgpu = gpu_memcpy(int(loc(info),kind=c_intptr_t), info_dev, &
235 num, gpumemcpydevicetohost)
236 check_memcpy_gpu(
"solve_tridi_single: info_dev", successgpu)
239 if (info .ne. 0)
then
240 write(error_unit,
'(a,i8,a)')
"Error in gpusolver_PRECISION_syevd, info=", info,
", aborting..."
319#include "./solve_tridi_single_problem_include.F90"
322 if (usegpusolver)
then
323 successgpu = gpu_free(info_dev)
324 check_dealloc_gpu(
"solve_tridi_single: info_dev", successgpu)
331 my_stream = obj%gpu_setup%my_stream
332 call gpu_check_monotony_precision (d_dev, q_dev, qtmp_dev, nlen, ldq, my_stream)
335 if (d(i+1)<d(i))
then
336#ifdef DOUBLE_PRECISION_REAL
337 if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk8)
then
339 if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk4)
then
341 write(error_unit,
'(a,i8,2g25.16)')
'***WARNING: Monotony error dste**:',i+1,d(i),d(i+1)
343 write(error_unit,
'(a,i8,2g25.16)')
'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1)
344 write(error_unit,
'(a)')
'The eigenvalues from a lapack call are not sorted to machine precision.'
345 write(error_unit,
'(a)')
'In this extent, this is completely harmless.'
346 write(error_unit,
'(a)')
'Still, we keep this info message just in case.'
348 allocate(qtmp(nlen), stat=istat, errmsg=errormessage)
349 check_allocate(
"solve_tridi_single: qtmp", istat, errormessage)
352 qtmp(1:nlen) = q(1:nlen,i+1)
356 q(1:nlen,j+1) = q(1:nlen,j)
362 q(1:nlen,j+1) = qtmp(1:nlen)
363 deallocate(qtmp, stat=istat, errmsg=errormessage)
364 check_deallocate(
"solve_tridi_single: qtmp", istat, errormessage)
369 call obj%timer%stop(
"solve_tridi_single" // precision_suffix)
371#ifdef SOLVE_TRIDI_GPU_BUILD
372 end subroutine solve_tridi_single_problem_gpu_&
373 &precision_and_suffix
375 end subroutine solve_tridi_single_problem_cpu_&
376 &precision_and_suffix
Fortran module to provide an abstract definition of the implementation. Do not use directly....
Definition elpa_abstract_impl.F90:50