55#include "../general/sanity.F90"
56#include "../general/error_checking.inc"
59#ifdef SOLVE_TRIDI_GPU_BUILD
60 subroutine solve_tridi_single_problem_gpu_&
61 &precision_and_suffix &
62 (obj, nlen, d_dev, e_dev, q_dev, ldq, qtmp_dev, wantdebug, success)
64 subroutine solve_tridi_single_problem_cpu_&
65 &precision_and_suffix &
66 (obj, nlen, d, e, q, ldq, wantdebug, success)
77 use elpa_blas_interfaces
80 use solve_single_problem_gpu
81#if defined(WITH_NVIDIA_GPU_VERSION) && defined(WITH_NVTX)
83#elif defined(WITH_AMD_GPU_VERSION) && defined(WITH_ROCTX)
87 class(elpa_abstract_impl_t),
intent(inout) :: obj
88 logical :: useGPU, useGPUsolver
89 integer(kind=ik) :: nlen, ldq
90 real(kind=real_datatype) :: d(nlen), e(nlen), q(ldq,nlen)
92 real(kind=real_datatype),
allocatable :: work(:), qtmp(:), ds(:), es(:)
93 real(kind=real_datatype) :: dtmp
95 integer(kind=ik) :: i, j, lwork, liwork, info
96 integer(kind=BLAS_KIND) :: infoBLAS
97 integer(kind=ik),
allocatable :: iwork(:)
100 logical,
intent(in) :: wantDebug
101 logical,
intent(out) :: success
102 integer(kind=c_int) :: debug
103 integer(kind=ik) :: istat
104 character(200) :: errorMessage
105 integer(kind=c_intptr_t) :: num, my_stream
106 integer(kind=c_intptr_t) :: q_dev, d_dev, info_dev, qtmp_dev, e_dev
107 logical :: successGPU
108 integer(kind=c_intptr_t),
parameter :: size_of_datatype = size_of_&
112 integer(kind=c_intptr_t) :: gpusolverHandle
115 if (wantdebug) debug = 1
118 usegpusolver =.false.
119#ifdef SOLVE_TRIDI_GPU_BUILD
122#if defined(WITH_NVIDIA_CUSOLVER)
125#if defined(WITH_AMD_ROCSOLVER)
128 usegpusolver =.false.
130#endif /* SOLVE_TRIDI_GPU_BUILD */
132 call obj%timer%start(
"solve_tridi_single" // precision_suffix)
135 allocate(ds(nlen), es(nlen), stat=istat, errmsg=errormessage)
136 check_allocate(
"solve_tridi_single: ds, es", istat, errormessage)
138 if (usegpusolver)
then
139 num = 1 * size_of_int
140 successgpu = gpu_malloc(info_dev, num)
141 check_alloc_gpu(
"solve_tridi_single info_dev: ", successgpu)
143 my_stream = obj%gpu_setup%my_stream
144 call gpu_construct_tridi_matrix(precision_char, q_dev, d_dev, e_dev, nlen, ldq, debug, my_stream)
148#if defined(WITH_NVIDIA_CUSOLVER) || defined(WITH_AMD_ROCSOLVER)
149 if (.not.usegpu)
then
156 if (.not. usegpusolver)
then
160 num = nlen * size_of_datatype
161#ifdef WITH_GPU_STREAMS
162 my_stream = obj%gpu_setup%my_stream
163 successgpu = gpu_memcpy_async(int(loc(d(1)),kind=c_intptr_t), d_dev, &
164 num, gpumemcpydevicetohost, my_stream)
165 check_memcpy_gpu(
"solve_tridi_single: d_dev ", successgpu)
167 successgpu = gpu_memcpy(int(loc(d(1)),kind=c_intptr_t), d_dev, &
168 num, gpumemcpydevicetohost)
169 check_memcpy_gpu(
"solve_tridi_single: d_dev", successgpu)
171 num = nlen * size_of_datatype
172#ifdef WITH_GPU_STREAMS
173 my_stream = obj%gpu_setup%my_stream
174 successgpu = gpu_memcpy_async(int(loc(e(1)),kind=c_intptr_t), e_dev, &
175 num, gpumemcpydevicetohost, my_stream)
176 check_memcpy_gpu(
"solve_tridi_single: e_dev ", successgpu)
178 successgpu = gpu_memcpy(int(loc(e(1)),kind=c_intptr_t), e_dev, &
179 num, gpumemcpydevicetohost)
180 check_memcpy_gpu(
"solve_tridi_single: e_dev", successgpu)
186#include "./solve_tridi_single_problem_include.F90"
189 num = nlen * size_of_datatype
190#ifdef WITH_GPU_STREAMS
191 my_stream = obj%gpu_setup%my_stream
192 successgpu = gpu_memcpy_async(d_dev, int(loc(d(1)),kind=c_intptr_t), &
193 num, gpumemcpyhosttodevice, my_stream)
194 check_memcpy_gpu(
"solve_tridi_single: d_dev ", successgpu)
196 successgpu = gpu_memcpy(d_dev, int(loc(d(1)),kind=c_intptr_t), &
197 num, gpumemcpyhosttodevice)
198 check_memcpy_gpu(
"solve_tridi_single: d_dev", successgpu)
200 num = nlen * size_of_datatype
201#ifdef WITH_GPU_STREAMS
202 my_stream = obj%gpu_setup%my_stream
203 successgpu = gpu_memcpy_async(e_dev, int(loc(e(1)),kind=c_intptr_t), &
204 num, gpumemcpyhosttodevice, my_stream)
205 check_memcpy_gpu(
"solve_tridi_single: e_dev ", successgpu)
207 successgpu = gpu_memcpy(e_dev, int(loc(e(1)),kind=c_intptr_t), &
208 num, gpumemcpyhosttodevice)
209 check_memcpy_gpu(
"solve_tridi_single: e_dev", successgpu)
213 num = ldq*nlen * size_of_datatype
214#ifdef WITH_GPU_STREAMS
215 my_stream = obj%gpu_setup%my_stream
216 successgpu = gpu_memcpy_async(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
217 num, gpumemcpyhosttodevice, my_stream)
218 check_memcpy_gpu(
"solve_tridi_single: q_dev1 ", successgpu)
220 successgpu = gpu_memcpy(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
221 num, gpumemcpyhosttodevice)
222 check_memcpy_gpu(
"solve_tridi_single: q_dev1", successgpu)
227 call obj%timer%start(
"gpusolver_syevd")
228 nvtx_range_push(
"gpusolver_syevd")
229 gpusolverhandle = obj%gpu_setup%gpusolverHandleArray(0)
230 call gpusolver_precision_syevd (nlen, q_dev, ldq, d_dev, info_dev, gpusolverhandle)
231 if (wantdebug) successgpu = gpu_devicesynchronize()
232 nvtx_range_pop(
"gpusolver_syevd")
233 call obj%timer%stop(
"gpusolver_syevd")
235 num = 1 * size_of_int
236#ifdef WITH_GPU_STREAMS
237 my_stream = obj%gpu_setup%my_stream
238 successgpu = gpu_memcpy_async(int(loc(info),kind=c_intptr_t), info_dev, &
239 num, gpumemcpydevicetohost, my_stream)
240 check_memcpy_gpu(
"solve_tridi_single: ", successgpu)
242 successgpu = gpu_stream_synchronize(my_stream)
243 check_stream_synchronize_gpu(
"solve_tridi_single: info_dev -> info", successgpu)
245 successgpu = gpu_memcpy(int(loc(info),kind=c_intptr_t), info_dev, &
246 num, gpumemcpydevicetohost)
247 check_memcpy_gpu(
"solve_tridi_single: info_dev", successgpu)
250 if (info .ne. 0)
then
251 write(error_unit,
'(a,i8,a)')
"Error in gpusolver_PRECISION_syevd, info=", info,
", aborting..."
330#include "./solve_tridi_single_problem_include.F90"
333 if (usegpusolver)
then
334 successgpu = gpu_free(info_dev)
335 check_dealloc_gpu(
"solve_tridi_single: info_dev", successgpu)
342 my_stream = obj%gpu_setup%my_stream
343 call gpu_check_monotony(precision_char, d_dev, q_dev, qtmp_dev, nlen, ldq, debug, my_stream)
346 if (d(i+1)<d(i))
then
347#ifdef DOUBLE_PRECISION_REAL
348 if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk8)
then
350 if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk4)
then
352 write(error_unit,
'(a,i8,2g25.16)')
'***WARNING: Monotony error dste**:',i+1,d(i),d(i+1)
354 write(error_unit,
'(a,i8,2g25.16)')
'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1)
355 write(error_unit,
'(a)')
'The eigenvalues from a lapack call are not sorted to machine precision.'
356 write(error_unit,
'(a)')
'In this extent, this is completely harmless.'
357 write(error_unit,
'(a)')
'Still, we keep this info message just in case.'
359 allocate(qtmp(nlen), stat=istat, errmsg=errormessage)
360 check_allocate(
"solve_tridi_single: qtmp", istat, errormessage)
363 qtmp(1:nlen) = q(1:nlen,i+1)
367 q(1:nlen,j+1) = q(1:nlen,j)
373 q(1:nlen,j+1) = qtmp(1:nlen)
374 deallocate(qtmp, stat=istat, errmsg=errormessage)
375 check_deallocate(
"solve_tridi_single: qtmp", istat, errormessage)
380 call obj%timer%stop(
"solve_tridi_single" // precision_suffix)
382#ifdef SOLVE_TRIDI_GPU_BUILD
383 end subroutine solve_tridi_single_problem_gpu_&
384 &precision_and_suffix
386 end subroutine solve_tridi_single_problem_cpu_&
387 &precision_and_suffix
Fortran module to provide an abstract definition of the implementation. Do not use directly....
Definition elpa_abstract_impl.F90:50