65 subroutine merge_systems_cpu_&
67 (obj, na, nm, d, e, q, &
68 matrixrows, nqoff, nblk, matrixcols, mpi_comm_rows, mpi_comm_cols_self, &
69 l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, usegpu, wantdebug, success, max_threads)
74 use,
intrinsic :: iso_c_binding
77 use elpa_blas_interfaces
89 use merge_systems_gpu_new
90#if defined(WITH_NVIDIA_GPU_VERSION) && defined(WITH_NVTX)
92#elif defined(WITH_AMD_GPU_VERSION) && defined(WITH_ROCTX)
96#ifdef WITH_OPENMP_TRADITIONAL
100#include "../general/precision_kinds.F90"
102 integer(kind=ik),
intent(in) :: na
103 integer(kind=ik),
intent(in) :: nm, matrixRows, nqoff, nblk, matrixCols, mpi_comm_rows, &
104 mpi_comm_cols_self, npc_0, npc_n
105 integer(kind=ik),
intent(in) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na)
107#ifdef SOLVE_TRIDI_GPU_BUILD
108 integer(kind=c_intptr_t) :: d_dev, e_dev
109 integer(kind=c_intptr_t) :: q_dev
111 integer(kind=c_intptr_t) :: d_dev, e_dev
112 integer(kind=c_intptr_t) :: q_dev
115 real(kind=real_datatype),
intent(inout) :: d(na), e
116#if defined(USE_ASSUMED_SIZE) && !defined(SOLVE_TRIDI_GPU_BUILD)
117 real(kind=real_datatype) :: q(matrixrows,*)
119 real(kind=real_datatype) :: q(matrixrows,matrixcols)
121 logical,
intent(in) :: useGPU, wantDebug
122 integer(kind=c_int) :: SM_count
124 logical,
intent(out) :: success
130 integer(kind=ik) :: max_strip
133 real(kind=real_datatype) :: beta, sig, s, c, t, tau, rho, eps, tol, &
134 qtrans(2,2), dmax, zmax, d1new, d2new
135 real(kind=real_datatype) :: z(na), d1(na), d2(na), z1(na), delta(na), &
136 dbase(na), ddiff(na), ev_scale(na), tmp(na)
137 real(kind=real_datatype) :: d1u(na), zu(na), d1l(na), zl(na)
138 real(kind=real_datatype),
allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
139#ifdef WITH_OPENMP_TRADITIONAL
140 real(kind=real_datatype),
allocatable :: z_p(:,:)
141 integer(kind=ik) :: my_thread
144 integer(kind=ik) :: i, j, k, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &
145 l_rqm, ns, lc1, lc2, info
146 integer(kind=BLAS_KIND) :: infoBLAS
147 integer(kind=ik) :: sig_int
148 integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
149 l_cols_qreorg, np, l_idx, nqcols1
150 integer(kind=ik) :: nnzu_save, nnzl_save
151 integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
153 integer(kind=MPI_KIND) :: mpierr
154 integer(kind=MPI_KIND) :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI
155 integer(kind=ik) :: np_next, np_prev, np_rem
156 integer(kind=ik) :: idx(na), idx1(na), idx2(na)
157 integer(kind=BLAS_KIND) :: idxBLAS(NA)
158 integer(kind=ik) :: coltyp(na), idxq1(na)
160 integer(kind=ik) :: istat, debug
161 character(200) :: errorMessage
162 integer(kind=ik) :: gemm_dim_k, gemm_dim_l, gemm_dim_m
164 integer(kind=c_intptr_t) :: num
165 integer(kind=C_intptr_t) :: qtmp1_dev, qtmp1_tmp_dev, qtmp2_dev, ev_dev
166 integer(kind=c_intptr_t) :: z1_dev, delta_dev, rho_dev
167 integer(kind=c_intptr_t) :: d1u_dev, dbase_dev, ddiff_dev, zu_dev, ev_scale_dev
168 integer(kind=c_intptr_t) :: d1l_dev, zl_dev, z_dev, d1_dev, ztmp_extended_dev
169 integer(kind=c_intptr_t) :: idx1_dev, p_col_dev, coltyp_dev, p_col_out_dev, ndef_c_dev
170 integer(kind=c_intptr_t) :: idxq1_dev, l_col_out_dev, idx_dev, idx2_dev, l_col_dev
171 integer(kind=c_intptr_t) :: nnzul_dev
172 integer(kind=c_intptr_t) :: tmp_dev, zero_dev, one_dev, qtrans_dev
174 integer(kind=c_intptr_t) :: nnzu_val_dev, nnzl_val_dev
175 logical :: successGPU
176 integer(kind=c_intptr_t),
parameter :: size_of_datatype = size_of_&
179 integer(kind=c_intptr_t) :: gpuHandle
180 integer(kind=ik),
intent(in) :: max_threads
181 integer(kind=c_intptr_t) :: my_stream
182 integer(kind=ik) :: l_col_out_tmp
183 integer(kind=ik),
allocatable :: nnzu_val(:,:), nnzl_val(:,:)
184 integer(kind=ik) :: nnzul(2)
186 integer(kind=ik) :: nnzu_start, nnzl_start
188 integer(kind=ik),
allocatable :: ndef_c(:)
190 integer(kind=ik) :: ii,jj, indx, ind_ex, ind_ex2, p_col_tmp, index2, counter1, counter2
193 integer(kind=c_intptr_t) :: ccl_comm_rows, ccl_comm_cols
194 integer(kind=c_int) :: cclDataType
196 call obj%timer%start(
"merge_systems" // precision_suffix)
199 call obj%timer%start(
"mpi_communication")
200 call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind) ,my_prowmpi, mpierr)
201 call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind) ,np_rowsmpi, mpierr)
202 call mpi_comm_rank(int(mpi_comm_cols_self,kind=mpi_kind) ,my_pcolmpi, mpierr)
203 call mpi_comm_size(int(mpi_comm_cols_self,kind=mpi_kind) ,np_colsmpi, mpierr)
205 my_prow = int(my_prowmpi,kind=c_int)
206 np_rows = int(np_rowsmpi,kind=c_int)
207 my_pcol = int(my_pcolmpi,kind=c_int)
208 np_cols = int(np_colsmpi,kind=c_int)
210 call obj%timer%stop(
"mpi_communication")
223 if (wantdebug) print *,
"max_strip = ", max_strip
225 useccl = obj%gpu_setup%useCCL
228#ifdef WITH_GPU_STREAMS
229 my_stream = obj%gpu_setup%my_stream
231 sm_count = obj%gpu_setup%gpuSMcount
234 my_stream = obj%gpu_setup%my_stream
235 ccl_comm_rows = obj%gpu_setup%ccl_comm_rows
236 ccl_comm_cols = obj%gpu_setup%ccl_comm_cols
237#if defined(DOUBLE_PRECISION)
238 ccldatatype = ccldouble
240#if defined(SINGLE_PRECISION)
241 ccldatatype = cclfloat
252 if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n)
then
253 call obj%timer%stop(
"merge_systems" // precision_suffix)
258 if (my_pcol == npc_0+npc_n-1)
then
261 np_next = my_pcol + 1
264 if (my_pcol == npc_0)
then
265 np_prev = npc_0+npc_n-1
267 np_prev = my_pcol - 1
270 call check_monotony_&
272 &(obj, nm,d,
'Input1',wantdebug, success)
273 if (.not.(success))
then
274 call obj%timer%stop(
"merge_systems" // precision_suffix)
278 call check_monotony_&
280 &(obj,na-nm,d(nm+1),
'Input2',wantdebug, success)
281 if (.not.(success))
then
282 call obj%timer%stop(
"merge_systems" // precision_suffix)
289 n_procs = np_rows*npc_n
290 my_proc = my_prow*npc_n + (my_pcol-npc_0)
295 l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1)
296 l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1)
297 l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1)
299 l_rnm = l_rqm-l_rqs+1
300 l_rows = l_rqe-l_rqs+1
305 l_cols = count(p_col(1:na)==my_pcol)
310 do np = npc_0, npc_0+npc_n-1
311 max_local_cols = max(max_local_cols,count(p_col(1:na)==np))
316 num = na * size_of_int
317 successgpu = gpu_malloc(ndef_c_dev, num)
318 check_alloc_gpu(
"merge_systems: ndef_c_dev", successgpu)
320 num = na * size_of_int
321 successgpu = gpu_malloc(idx1_dev, num)
322 check_alloc_gpu(
"merge_systems: idx1_dev", successgpu)
324 num = na * size_of_int
325 successgpu = gpu_malloc(p_col_dev, num)
326 check_alloc_gpu(
"merge_systems: p_col_dev", successgpu)
328 num = na * size_of_int
329 successgpu = gpu_malloc(p_col_out_dev, num)
330 check_alloc_gpu(
"merge_systems: p_col_out_dev", successgpu)
332 num = na * size_of_int
333 successgpu = gpu_malloc(coltyp_dev, num)
334 check_alloc_gpu(
"merge_systems: coltyp_dev", successgpu)
336 num = na * size_of_int
337 successgpu = gpu_malloc(idx2_dev, num)
338 check_alloc_gpu(
"merge_systems: idx2_dev", successgpu)
340 num = na * size_of_int
341 successgpu = gpu_malloc(l_col_dev, num)
342 check_alloc_gpu(
"merge_systems: l_col_dev", successgpu)
344 num = na * size_of_int
345 successgpu = gpu_malloc(l_col_out_dev, num)
346 check_alloc_gpu(
"merge_systems: l_col_out_dev", successgpu)
348 num = (na) * size_of_datatype
349 successgpu = gpu_malloc(z_dev, num)
350 check_alloc_gpu(
"merge_systems: z_dev", successgpu)
352 num = (na) * size_of_datatype
353 successgpu = gpu_malloc(z1_dev, num)
354 check_alloc_gpu(
"merge_systems: z1_dev", successgpu)
356 num = (na) * size_of_datatype
357 successgpu = gpu_malloc(d1_dev, num)
358 check_alloc_gpu(
"merge_systems: d1_dev", successgpu)
360 num = 1 * size_of_datatype
361 successgpu = gpu_malloc(rho_dev, num)
362 check_alloc_gpu(
"merge_systems: rho_dev", successgpu)
364 num = (na) * size_of_datatype
365 successgpu = gpu_malloc(d1u_dev, num)
366 check_alloc_gpu(
"merge_systems: d1u_dev", successgpu)
368 num = (na) * size_of_datatype
369 successgpu = gpu_malloc(dbase_dev, num)
370 check_alloc_gpu(
"merge_systems: dbase_dev", successgpu)
372 num = (na) * size_of_datatype
373 successgpu = gpu_malloc(ddiff_dev, num)
374 check_alloc_gpu(
"merge_systems: ddiff_dev", successgpu)
376 num = (na) * size_of_datatype
377 successgpu = gpu_malloc(zu_dev, num)
378 check_alloc_gpu(
"merge_systems: zu_dev", successgpu)
380 num = (na) * size_of_datatype
381 successgpu = gpu_malloc(ev_scale_dev, num)
382 check_alloc_gpu(
"merge_systems: ev_scale_dev", successgpu)
384 num = (na) * size_of_datatype
385 successgpu = gpu_malloc(d1l_dev, num)
386 check_alloc_gpu(
"merge_systems: d1l_dev", successgpu)
388 num = (na) * size_of_datatype
389 successgpu = gpu_malloc(zl_dev, num)
390 check_alloc_gpu(
"merge_systems: zl_dev", successgpu)
392 num = (l_rows) * size_of_datatype
393 successgpu = gpu_malloc(tmp_dev, num)
394 check_alloc_gpu(
"merge_systems: tmp_dev", successgpu)
396 num = 1 * size_of_datatype
397 successgpu = gpu_malloc(zero_dev, num)
398 check_alloc_gpu(
"merge_systems: zero_dev", successgpu)
400 num = 1 * size_of_datatype
401 successgpu = gpu_malloc(one_dev, num)
402 check_alloc_gpu(
"merge_systems: one_dev", successgpu)
404 num = 4 * size_of_datatype
405 successgpu = gpu_malloc(qtrans_dev, num)
406 check_alloc_gpu(
"merge_systems: qtrans_dev", successgpu)
408 num = na * size_of_int
409#ifdef WITH_GPU_STREAMS
410 successgpu = gpu_memcpy_async(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
412 successgpu = gpu_memcpy(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
414 check_memcpy_gpu(
"merge_systems: p_col_dev", successgpu)
416 num = na * size_of_int
417#ifdef WITH_GPU_STREAMS
418 successgpu = gpu_memcpy_async(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
420 successgpu = gpu_memcpy(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
422 check_memcpy_gpu(
"merge_systems: l_col_dev", successgpu)
424 num = 1 * size_of_datatype
426#ifdef WITH_GPU_STREAMS
427 successgpu = gpu_memcpy_async(zero_dev, int(loc(beta),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
429 successgpu = gpu_memcpy(zero_dev, int(loc(beta),kind=c_intptr_t), num, gpumemcpyhosttodevice)
431 check_memcpy_gpu(
"merge_systems: zero_dev", successgpu)
433 num = 1 * size_of_datatype
435#ifdef WITH_GPU_STREAMS
436 successgpu = gpu_memcpy_async(one_dev, int(loc(beta),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
438 successgpu = gpu_memcpy(one_dev, int(loc(beta),kind=c_intptr_t), num, gpumemcpyhosttodevice)
440 check_memcpy_gpu(
"merge_systems: one_dev", successgpu)
451 num = na * size_of_datatype
452#ifdef WITH_GPU_STREAMS
453 successgpu = gpu_memset_async(z_dev, 0, num, my_stream)
455 successgpu = gpu_memset(z_dev, 0, num)
457 check_memcpy_gpu(
"merge_systems: memset z_dev", successgpu)
463 if (mod((nqoff+nm-1)/nblk,np_rows)==my_prow)
then
467 nvtx_range_push(
"gpu_fill_z_kernel")
468 call gpu_fill_z(precision_char, z_dev, q_dev, p_col_dev, l_col_dev, &
469 sig_int, na, my_pcol, l_rqm, matrixrows, sm_count, debug, my_stream)
470 nvtx_range_pop(
"gpu_fill_z_kernel")
473 if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i))
478 if (mod((nqoff+nm)/nblk,np_rows)==my_prow)
then
488 nvtx_range_push(
"gpu_fill_z_kernel")
489 call gpu_fill_z(precision_char, z_dev, q_dev, p_col_dev, l_col_dev, &
490 sig_int, na, my_pcol, l_rqm+1, matrixrows, sm_count, debug, my_stream)
491 nvtx_range_pop(
"gpu_fill_z_kernel")
494 if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i))
500 num = na * size_of_datatype
501#ifdef WITH_GPU_STREAMS
502 successgpu = gpu_memcpy_async(int(loc(z(1)),kind=c_intptr_t), z_dev, num, gpumemcpydevicetohost, my_stream)
504 successgpu = gpu_memcpy(int(loc(z(1)),kind=c_intptr_t), z_dev, num, gpumemcpydevicetohost)
506 check_memcpy_gpu(
"merge_systems: z_dev", successgpu)
512 &(obj, z, na, mpi_comm_rows, mpi_comm_cols_self, npc_n, np_prev, np_next, success)
513 if (.not.(success))
then
514 write(error_unit,*)
"Error in global_gather. Aborting"
524 call obj%timer%start(
"lapack_lamrg")
525 nvtx_range_push(
"lapack_lamrg_1")
526 call precision_lamrg( int(nm,kind=blas_kind), int(na-nm,kind=blas_kind), d, &
527 1_blas_kind, 1_blas_kind, idxblas )
528 nvtx_range_pop(
"lapack_lamrg_1")
529 idx(:) = int(idxblas(:),kind=ik)
530 call obj%timer%stop(
"lapack_lamrg")
534 zmax = maxval(abs(z))
535 dmax = maxval(abs(d))
536 eps = precision_lamch(
'E' )
537 tol = 8.0_rk*eps*max(dmax,zmax)
542 IF ( rho*zmax <= tol )
THEN
551 nvtx_range_push(
"resort_ev_gpu")
552 call obj%timer%start(
"resort_ev_gpu")
555 (obj, idx, na, na, p_col_out, q_dev, matrixrows, matrixcols, l_rows, l_rqe, &
556 l_rqs, mpi_comm_cols_self, p_col, l_col, l_col_out)
557 call obj%timer%stop(
"resort_ev_gpu")
558 nvtx_range_pop(
"resort_ev_gpu")
562 (obj, idx, na, na, p_col_out, q , matrixrows, matrixcols, l_rows, l_rqe, &
563 l_rqs, mpi_comm_cols_self, p_col, l_col, l_col_out)
566 call obj%timer%stop(
"merge_systems" // precision_suffix)
568 if (wantdebug)
write(error_unit,*)
"Returing early from merge_systems (RHO*zmax <= TOL): matrix is block-diagonal"
588 nvtx_range_push(
"deflation_loop")
591 if (rho*abs(z(idx(i))) <= tol)
then
609 tau = precision_lapy2( c, s )
610 t = d1(na1) - d(idx(i))
613 IF ( abs( t*c*s ) <= tol )
THEN
621 d2new = d(idx(i))*c**2 + d1(na1)*s**2
622 d1new = d(idx(i))*s**2 + d1(na1)*c**2
638 if (d1new<d1(na1) ) d1new = d1(na1)
639 if (d1new>d(idx(i))) d1new = d(idx(i))
641 if (d2new<d1(na1) ) d2new = d1(na1)
642 if (d2new>d(idx(i))) d2new = d(idx(i))
647 if (d2new<d2(j))
then
658 qtrans(1,1) = c; qtrans(1,2) =-s
659 qtrans(2,1) = s; qtrans(2,2) = c
661 nvtx_range_push(
"transform_columns")
663#ifdef WITH_GPU_STREAMS
664 successgpu = gpu_memcpy_async(qtrans_dev, int(loc(qtrans(1,1)),kind=c_intptr_t), &
665 4*size_of_datatype, gpumemcpyhosttodevice, my_stream)
666 if (wantdebug) successgpu = gpu_devicesynchronize()
668 successgpu = gpu_memcpy(qtrans_dev, int(loc(qtrans(1,1)),kind=c_intptr_t), &
669 4*size_of_datatype, gpumemcpyhosttodevice)
671 check_memcpy_gpu(
"transform_columns: q_dev", successgpu)
673 call transform_columns_gpu_&
675 (obj, idx(i), idx1(na1), na, tmp, l_rqs, l_rqe, &
676 q_dev, matrixrows, matrixcols, l_rows, mpi_comm_cols_self, &
677 p_col, l_col, qtrans_dev, &
678 tmp_dev, zero_dev, one_dev, debug, my_stream)
680 call transform_columns_cpu_&
682 (obj, idx(i), idx1(na1), na, tmp, l_rqs, l_rqe, &
683 q , matrixrows, matrixcols, l_rows, mpi_comm_cols_self, &
684 p_col, l_col, qtrans)
686 nvtx_range_pop(
"transform_columns")
688 if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2
689 if (coltyp(idx(i))==3 .and. coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2
707 nvtx_range_pop(
"deflation_loop")
709 call check_monotony_&
711 &(obj, na1,d1,
'Sorted1', wantdebug, success)
712 if (.not.(success))
then
713 call obj%timer%stop(
"merge_systems" // precision_suffix)
716 call check_monotony_&
718 &(obj, na2,d2,
'Sorted2', wantdebug, success)
719 if (.not.(success))
then
720 call obj%timer%stop(
"merge_systems" // precision_suffix)
724 if (na1==1 .or. na1==2)
then
728 d(1) = d1(1) + rho*z1(1)**2
730 call obj%timer%start(
"lapack_laed5_x2")
731 nvtx_range_push(
"lapack_laed5_x2")
732 call precision_laed5(1_blas_kind, d1, z1, qtrans(1,1), rho, d(1))
733 call precision_laed5(2_blas_kind, d1, z1, qtrans(1,2), rho, d(2))
734 nvtx_range_pop(
"lapack_laed5_x2")
735 call obj%timer%stop(
"lapack_laed5_x2")
738#ifdef WITH_GPU_STREAMS
739 successgpu = gpu_memcpy_async(qtrans_dev, int(loc(qtrans(1,1)),kind=c_intptr_t), &
740 4*size_of_datatype, gpumemcpyhosttodevice, my_stream)
741 if (wantdebug) successgpu = gpu_devicesynchronize()
743 successgpu = gpu_memcpy(qtrans_dev, int(loc(qtrans(1,1)),kind=c_intptr_t), &
744 4*size_of_datatype, gpumemcpyhosttodevice)
746 check_memcpy_gpu(
"transform_columns: q_dev", successgpu)
748 call transform_columns_gpu_&
750 (obj, idx1(1), idx1(2), na, tmp, l_rqs, l_rqe, &
751 q_dev, matrixrows, matrixcols, l_rows, mpi_comm_cols_self, &
752 p_col, l_col, qtrans_dev, &
753 tmp_dev, zero_dev, one_dev, debug, my_stream)
755 call transform_columns_cpu_&
757 & (obj, idx1(1), idx1(2), na, tmp, l_rqs, l_rqe, q, &
758 matrixrows, matrixcols, l_rows, mpi_comm_cols_self, &
759 p_col, l_col, qtrans)
764 d(na1+1:na) = d2(1:na2)
767 call obj%timer%start(
"lapack_lamrg")
768 nvtx_range_push(
"lapack_lamrg_2")
769 call precision_lamrg( int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &
770 1_blas_kind, 1_blas_kind, idxblas )
771 nvtx_range_pop(
"lapack_lamrg_2")
772 idx(:) = int(idxblas(:),kind=ik)
773 call obj%timer%stop(
"lapack_lamrg")
784 if (idx(i)<=na1)
then
785 idxq1(i) = idx1(idx(i))
787 idxq1(i) = idx2(idx(i)-na1)
794 &(obj, idxq1, na, na, p_col_out, q_dev, matrixrows, matrixcols, l_rows, l_rqe, &
795 l_rqs, mpi_comm_cols_self, p_col, l_col, l_col_out)
799 &(obj, idxq1, na, na, p_col_out, q , matrixrows, matrixcols, l_rows, l_rqe, &
800 l_rqs, mpi_comm_cols_self, p_col, l_col, l_col_out)
803 write(error_unit,*) .or.
"Returing early from merge_systems (na1==1 na1==2)"
811 num = (na1*sm_count) * size_of_datatype
812 successgpu = gpu_malloc(ztmp_extended_dev, num)
813 check_alloc_gpu(
"merge_systems: delta_dev", successgpu)
815 call gpu_fill_array(precision_char, ztmp_extended_dev, one_dev, na1*sm_count, sm_count, debug, my_stream)
817 call gpu_fill_array(precision_char, z_dev, one_dev, na1, sm_count, debug, my_stream)
819 num = na1 * size_of_datatype
820#ifdef WITH_GPU_STREAMS
821 successgpu = gpu_memset_async(dbase_dev, 0, num, my_stream)
823 successgpu = gpu_memset(dbase_dev, 0, num)
825 check_memcpy_gpu(
"merge_systems: memset dbase_dev", successgpu)
827 num = na1 * size_of_datatype
828#ifdef WITH_GPU_STREAMS
829 successgpu = gpu_memset_async(ddiff_dev, 0, num, my_stream)
831 successgpu = gpu_memset(ddiff_dev, 0, num)
833 check_memcpy_gpu(
"merge_systems: memset ddiff_dev", successgpu)
843 nvtx_range_push(
"lapack_laed4_loop")
847#ifdef WITH_GPU_STREAMS
848 num = na * size_of_datatype
849 successgpu = gpu_memcpy_async(d1_dev, int(loc(d1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
850 check_memcpy_gpu(
"merge_systems: d1_dev", successgpu)
852 successgpu = gpu_memcpy_async(z1_dev, int(loc(z1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
853 check_memcpy_gpu(
"merge_systems: z1_dev", successgpu)
855 num = 1 * size_of_datatype
856 successgpu = gpu_memcpy_async(rho_dev, int(loc(rho),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
857 check_memcpy_gpu(
"merge_systems: rho_dev", successgpu)
860 num = na * size_of_datatype
861 successgpu = gpu_memcpy(d1_dev, int(loc(d1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
862 check_memcpy_gpu(
"merge_systems: d1_dev", successgpu)
864 successgpu = gpu_memcpy(z1_dev, int(loc(z1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
865 check_memcpy_gpu(
"merge_systems: z1_dev", successgpu)
867 num = 1 * size_of_datatype
868 successgpu = gpu_memcpy(rho_dev, int(loc(rho),kind=c_intptr_t), num, gpumemcpyhosttodevice)
869 check_memcpy_gpu(
"merge_systems: rho_dev", successgpu)
873 num = (na1*sm_count) * size_of_datatype
874 successgpu = gpu_malloc(delta_dev, num)
875 check_alloc_gpu(
"merge_systems: delta_dev", successgpu)
877 call gpu_solve_secular_equation_loop (precision_char, d1_dev, z1_dev, delta_dev, rho_dev, &
878 ztmp_extended_dev, dbase_dev, ddiff_dev, my_proc, na1, n_procs, sm_count, debug, my_stream)
880 call gpu_local_product(precision_char, z_dev, ztmp_extended_dev, na1, sm_count, debug, my_stream)
882 successgpu = gpu_free(delta_dev)
883 check_dealloc_gpu(
"merge_systems: delta_dev", successgpu)
886#ifdef WITH_GPU_STREAMS
887 num = na * size_of_datatype
888 successgpu = gpu_memcpy_async(int(loc(d1(1)),kind=c_intptr_t), d1_dev, num, gpumemcpydevicetohost, my_stream)
889 check_memcpy_gpu(
"merge_systems: d1_dev", successgpu)
891 successgpu = gpu_memcpy_async(int(loc(z1(1)),kind=c_intptr_t), z1_dev, num, gpumemcpydevicetohost, my_stream)
892 check_memcpy_gpu(
"merge_systems: z1_dev", successgpu)
894 successgpu = gpu_memcpy_async(int(loc(dbase(1)),kind=c_intptr_t), dbase_dev, num, gpumemcpydevicetohost, my_stream)
895 check_memcpy_gpu(
"merge_systems: dbase_dev", successgpu)
897 successgpu = gpu_memcpy_async(int(loc(ddiff(1)),kind=c_intptr_t), ddiff_dev, num, gpumemcpydevicetohost, my_stream)
898 check_memcpy_gpu(
"merge_systems: ddiff_dev", successgpu)
900 successgpu = gpu_memcpy_async(int(loc(z(1)),kind=c_intptr_t), z_dev, num, gpumemcpydevicetohost, my_stream)
901 check_memcpy_gpu(
"merge_systems: z_dev", successgpu)
903 num = 1 * size_of_datatype
904 successgpu = gpu_memcpy_async(int(loc(rho),kind=c_intptr_t), rho_dev, num, gpumemcpydevicetohost, my_stream)
905 check_memcpy_gpu(
"merge_systems: rho_dev", successgpu)
907 num = na * size_of_datatype
908 successgpu = gpu_memcpy(int(loc(d1(1)),kind=c_intptr_t), d1_dev, num, gpumemcpydevicetohost)
909 check_memcpy_gpu(
"merge_systems: d1_dev", successgpu)
911 successgpu = gpu_memcpy(int(loc(z1(1)),kind=c_intptr_t), z1_dev, num, gpumemcpydevicetohost)
912 check_memcpy_gpu(
"merge_systems: z1_dev", successgpu)
914 successgpu = gpu_memcpy(int(loc(dbase(1)),kind=c_intptr_t), dbase_dev, num, gpumemcpydevicetohost)
915 check_memcpy_gpu(
"merge_systems: dbase_dev", successgpu)
917 successgpu = gpu_memcpy(int(loc(ddiff(1)),kind=c_intptr_t), ddiff_dev, num, gpumemcpydevicetohost)
918 check_memcpy_gpu(
"merge_systems: ddiff_dev", successgpu)
920 successgpu = gpu_memcpy(int(loc(z(1)),kind=c_intptr_t), z_dev, num, gpumemcpydevicetohost)
921 check_memcpy_gpu(
"merge_systems: z_dev", successgpu)
923 num = 1 * size_of_datatype
924 successgpu = gpu_memcpy(int(loc(rho),kind=c_intptr_t), rho_dev, num, gpumemcpydevicetohost)
925 check_memcpy_gpu(
"merge_systems: rho_dev", successgpu)
937 do i = my_proc+1, na1, n_procs
938 call obj%timer%start(
"lapack_laed4")
939 nvtx_range_push(
"lapack_laed4")
940 call precision_laed4(int(na1,kind=blas_kind), int(i,kind=blas_kind), d1, z1, delta, &
942 info = int(infoblas,kind=ik)
943 nvtx_range_pop(
"lapack_laed4")
944 call obj%timer%stop(
"lapack_laed4")
949 call solve_secular_equation_&
951 &(obj, na1, i, d1, z1, delta, rho, s)
963 if (i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) )
970 if (abs(delta(i+1)) < abs(delta(i)))
then
972 ddiff(i) = delta(i+1)
983 nvtx_range_pop(
"lapack_laed4_loop")
995 nvtx_range_push(
"global_product")
996 call global_product_&
998 (obj, z, na1, mpi_comm_rows, mpi_comm_cols_self, npc_0, npc_n, success)
999 if (.not.(success))
then
1000 write(error_unit,*)
"Error in global_product. Aborting..."
1003 nvtx_range_pop(
"global_product")
1005 z(1:na1) = sign( sqrt( abs( z(1:na1) ) ), z1(1:na1) )
1007 nvtx_range_push(
"global_gather_x2")
1008 call global_gather_&
1010 &(obj, dbase, na1, mpi_comm_rows, mpi_comm_cols_self, npc_n, np_prev, np_next, success)
1011 if (.not.(success))
then
1012 write(error_unit,*)
"Error in global_gather. Aborting..."
1015 call global_gather_&
1017 &(obj, ddiff, na1, mpi_comm_rows, mpi_comm_cols_self, npc_n, np_prev, np_next, success)
1018 if (.not.(success))
then
1019 write(error_unit,*)
"Error in global_gather. Aborting..."
1022 nvtx_range_pop(
"global_gather_x2")
1024 d(1:na1) = dbase(1:na1) - ddiff(1:na1)
1028 num = na * size_of_datatype
1029#ifdef WITH_GPU_STREAMS
1030 successgpu = gpu_memset_async(ev_scale_dev, 0, num, my_stream)
1032 successgpu = gpu_memset(ev_scale_dev, 0, num)
1034 check_memcpy_gpu(
"merge_systems: memset ev_scale_dev", successgpu)
1036 ev_scale(:) = 0.0_rk
1040 nvtx_range_push(
"add_tmp_loop")
1041 if (wantdebug)
call obj%timer%start(
"add_tmp_loop")
1045 num = na * size_of_datatype
1046#ifdef WITH_GPU_STREAMS
1047 successgpu = gpu_memcpy_async(d1_dev, int(loc(d1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1048 check_memcpy_gpu(
"merge_systems: d1_dev", successgpu)
1050 successgpu = gpu_memcpy_async(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1051 check_memcpy_gpu(
"merge_systems: dbase_dev", successgpu)
1053 successgpu = gpu_memcpy_async(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1054 check_memcpy_gpu(
"merge_systems: ddiff_dev", successgpu)
1056 successgpu = gpu_memcpy_async(z_dev, int(loc(z(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1057 check_memcpy_gpu(
"merge_systems: z_dev", successgpu)
1059 successgpu = gpu_memcpy(d1_dev, int(loc(d1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1060 check_memcpy_gpu(
"merge_systems: d1_dev", successgpu)
1062 successgpu = gpu_memcpy(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1063 check_memcpy_gpu(
"merge_systems: dbase_dev", successgpu)
1065 successgpu = gpu_memcpy(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1066 check_memcpy_gpu(
"merge_systems: ddiff_dev", successgpu)
1068 successgpu = gpu_memcpy(z_dev, int(loc(z(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1069 check_memcpy_gpu(
"merge_systems: z_dev", successgpu)
1072 call gpu_add_tmp_loop(precision_char, d1_dev, dbase_dev, ddiff_dev, z_dev, ev_scale_dev, ztmp_extended_dev, &
1073 na1, my_proc, n_procs, sm_count, debug, my_stream)
1075 successgpu = gpu_free(ztmp_extended_dev)
1076 check_dealloc_gpu(
"merge_systems: ztmp_extended_dev", successgpu)
1079 num = na * size_of_datatype
1080#ifdef WITH_GPU_STREAMS
1081 successgpu = gpu_memcpy_async(int(loc(d1(1)),kind=c_intptr_t), d1_dev, num, gpumemcpydevicetohost, my_stream)
1082 check_memcpy_gpu(
"merge_systems: d1_dev", successgpu)
1084 successgpu = gpu_memcpy_async(int(loc(dbase(1)),kind=c_intptr_t), dbase_dev, num, gpumemcpydevicetohost, my_stream)
1085 check_memcpy_gpu(
"merge_systems: dbase_dev", successgpu)
1087 successgpu = gpu_memcpy_async(int(loc(ddiff(1)),kind=c_intptr_t), ddiff_dev, num, gpumemcpydevicetohost, my_stream)
1088 check_memcpy_gpu(
"merge_systems: ddiff_dev", successgpu)
1090 successgpu = gpu_memcpy_async(int(loc(z1(1)),kind=c_intptr_t), z1_dev, num, gpumemcpydevicetohost, my_stream)
1091 check_memcpy_gpu(
"merge_systems: z1_dev", successgpu)
1093 successgpu = gpu_memcpy_async(int(loc(ev_scale(1)),kind=c_intptr_t), ev_scale_dev, num, gpumemcpydevicetohost, my_stream)
1094 check_memcpy_gpu(
"merge_systems: ev_scale_dev", successgpu)
1096 successgpu = gpu_memcpy(int(loc(d1(1)),kind=c_intptr_t), d1_dev, num, gpumemcpydevicetohost)
1097 check_memcpy_gpu(
"merge_systems: d1_dev", successgpu)
1099 successgpu = gpu_memcpy(int(loc(dbase(1)),kind=c_intptr_t), dbase_dev, num, gpumemcpydevicetohost)
1100 check_memcpy_gpu(
"merge_systems: dbase_dev", successgpu)
1102 successgpu = gpu_memcpy(int(loc(ddiff(1)),kind=c_intptr_t), ddiff_dev, num, gpumemcpydevicetohost)
1103 check_memcpy_gpu(
"merge_systems: ddiff_dev", successgpu)
1105 successgpu = gpu_memcpy(int(loc(z1(1)),kind=c_intptr_t), z1_dev, num, gpumemcpydevicetohost)
1106 check_memcpy_gpu(
"merge_systems: z1_dev", successgpu)
1108 successgpu = gpu_memcpy(int(loc(ev_scale(1)),kind=c_intptr_t), ev_scale_dev, num, gpumemcpydevicetohost)
1109 check_memcpy_gpu(
"merge_systems: ev_scale_dev", successgpu)
1112#ifdef WITH_OPENMP_TRADITIONAL
1113 call obj%timer%start(
"OpenMP parallel" // precision_suffix)
1121 do i = my_proc+1, na1, n_procs
1131 &(obj, d1, dbase, ddiff, z, ev_scale(i), na1, i)
1134#ifdef WITH_OPENMP_TRADITIONAL
1137 call obj%timer%stop(
"OpenMP parallel" // precision_suffix)
1141 if (wantdebug)
call obj%timer%stop(
"add_tmp_loop")
1142 nvtx_range_pop(
"add_tmp_loop")
1145 nvtx_range_push(
"global_gather")
1146 call global_gather_&
1148 &(obj, ev_scale, na1, mpi_comm_rows, mpi_comm_cols_self, npc_n, np_prev, np_next, success)
1149 if (.not.(success))
then
1150 write(error_unit,*)
"Error in global_gather. Aborting..."
1153 nvtx_range_pop(
"global_gather")
1156 d(na1+1:na) = d2(1:na2)
1158 call obj%timer%start(
"lapack_lamrg")
1159 nvtx_range_push(
"lapack_lamrg_3")
1161 call precision_lamrg(int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &
1162 1_blas_kind, 1_blas_kind, idxblas )
1163 nvtx_range_pop(
"lapack_lamrg_3")
1164 idx(:) = int(idxblas(:),kind=ik)
1165 call obj%timer%stop(
"lapack_lamrg")
1172 call check_monotony_&
1174 &(obj, na,d,
'Output', wantdebug, success)
1176 if (.not.(success))
then
1177 call obj%timer%stop(
"merge_systems" // precision_suffix)
1178 write(error_unit,*)
"Error in check_monotony. Aborting..."
1183 num = 2 * size_of_int
1184 successgpu = gpu_malloc(nnzul_dev, num)
1185 check_alloc_gpu(
"merge_systems: ", successgpu)
1187 num = na * size_of_int
1188 successgpu = gpu_malloc(idxq1_dev, num)
1189 check_alloc_gpu(
"merge_systems: ", successgpu)
1191 num = na * size_of_int
1192 successgpu = gpu_malloc(idx_dev, num)
1193 check_alloc_gpu(
"merge_systems: idx_dev", successgpu)
1195 num = na * size_of_int
1196#ifdef WITH_GPU_STREAMS
1197 my_stream = obj%gpu_setup%my_stream
1198 successgpu = gpu_memcpy_async(idx_dev, int(loc(idx(1)),kind=c_intptr_t), &
1199 num, gpumemcpyhosttodevice, my_stream)
1200 check_memcpy_gpu(
"merge_systems: ", successgpu)
1202 successgpu = gpu_memcpy(idx_dev, int(loc(idx(1)),kind=c_intptr_t), &
1203 num, gpumemcpyhosttodevice)
1204 check_memcpy_gpu(
"merge_systems: idx_dev", successgpu)
1221 nvtx_range_push(
"loop_idxq1")
1223 if (p_col_out(i)==my_pcol)
then
1224 if (idx(i)<=na1)
then
1233 nvtx_range_pop(
"loop_idxq1")
1237 num = na * size_of_int
1238#ifdef WITH_GPU_STREAMS
1239 my_stream = obj%gpu_setup%my_stream
1240 successgpu = gpu_memcpy_async(idxq1_dev, int(loc(idxq1(1)),kind=c_intptr_t), &
1241 num, gpumemcpyhosttodevice, my_stream)
1242 check_memcpy_gpu(
"merge_systems: ", successgpu)
1244 successgpu = gpu_memcpy(idxq1_dev, int(loc(idxq1(1)),kind=c_intptr_t), &
1245 num, gpumemcpyhosttodevice)
1246 check_memcpy_gpu(
"merge_systems: idxq1_dev", successgpu)
1252 allocate(ndef_c(na), stat=istat, errmsg=errormessage)
1253 check_allocate(
"merge_systems: ndef_c",istat, errormessage)
1257 gemm_dim_k = max(1,l_rows)
1258 gemm_dim_l = max_local_cols
1259 gemm_dim_m = min(max_strip, max(1,nqcols1))
1261 if (.not. useccl)
then
1262 allocate(qtmp1(gemm_dim_k, gemm_dim_l), stat=istat, errmsg=errormessage)
1263 check_allocate(
"merge_systems: qtmp1",istat, errormessage)
1265 allocate(ev(gemm_dim_l,gemm_dim_m), stat=istat, errmsg=errormessage)
1266 check_allocate(
"merge_systems: ev",istat, errormessage)
1268 allocate(qtmp2(gemm_dim_k, gemm_dim_m), stat=istat, errmsg=errormessage)
1269 check_allocate(
"merge_systems: qtmp2",istat, errormessage)
1273 num = (gemm_dim_k * gemm_dim_l) * size_of_datatype
1274 successgpu = gpu_malloc(qtmp1_dev, num)
1275 check_alloc_gpu(
"merge_systems: qtmp1_dev", successgpu)
1277 num = (gemm_dim_k * gemm_dim_l) * size_of_datatype
1278 successgpu = gpu_malloc(qtmp1_tmp_dev, num)
1279 check_alloc_gpu(
"merge_systems: qtmp1_tmp_dev", successgpu)
1281 num = (gemm_dim_l * gemm_dim_m) * size_of_datatype
1282 successgpu = gpu_malloc(ev_dev, num)
1283 check_alloc_gpu(
"merge_systems: ev_dev", successgpu)
1285 num = (gemm_dim_k * gemm_dim_m) * size_of_datatype
1286 successgpu = gpu_malloc(qtmp2_dev, num)
1287 check_alloc_gpu(
"merge_systems: qtmp2_dev", successgpu)
1289 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu)
then
1290 if (wantdebug)
call obj%timer%start(
"gpu_host_register")
1292 if (.not. useccl)
then
1293 num = (gemm_dim_k * gemm_dim_l) * size_of_datatype
1294 nvtx_range_push(
"gpu_host_register_qtmp1")
1295 successgpu = gpu_host_register(int(loc(qtmp1),kind=c_intptr_t), num, gpuhostregisterdefault)
1296 check_host_register_gpu(
"merge_systems: qtmp1", successgpu)
1297 nvtx_range_pop(
"gpu_host_register_qtmp1")
1299 num = (gemm_dim_l * gemm_dim_m) * size_of_datatype
1300 successgpu = gpu_host_register(int(loc(ev),kind=c_intptr_t), num, gpuhostregisterdefault)
1301 check_host_register_gpu(
"merge_systems: ev", successgpu)
1303 num = (gemm_dim_k * gemm_dim_m) * size_of_datatype
1304 successgpu = gpu_host_register(int(loc(qtmp2),kind=c_intptr_t), num, gpuhostregisterdefault)
1305 check_host_register_gpu(
"merge_systems: qtmp2", successgpu)
1309 successgpu = gpu_devicesynchronize()
1310 call obj%timer%stop(
"gpu_host_register")
1316 num = gemm_dim_k * gemm_dim_l * size_of_datatype
1317#ifdef WITH_GPU_STREAMS
1318 successgpu = gpu_memset_async(qtmp1_dev, 0, num, my_stream)
1320 successgpu = gpu_memset(qtmp1_dev, 0, num)
1322 check_memcpy_gpu(
"merge_systems: memset qtmp1_dev", successgpu)
1324 nvtx_range_push(
"set_qtmp1_qtmp2_0")
1325 call obj%timer%start(
"set_qtmp1_qtmp2_0")
1328 call obj%timer%stop(
"set_qtmp1_qtmp2_0")
1329 nvtx_range_pop(
"set_qtmp1_qtmp2_0")
1338 num = na * size_of_int
1339#ifdef WITH_GPU_STREAMS
1340 successgpu = gpu_memcpy_async(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1341 check_memcpy_gpu(
"merge_systems: idx1_dev", successgpu)
1343 successgpu = gpu_memcpy_async(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1344 check_memcpy_gpu(
"merge_systems: coltyp_dev", successgpu)
1346 successgpu = gpu_memcpy(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1347 check_memcpy_gpu(
"merge_systems: idx1_dev", successgpu)
1349 successgpu = gpu_memcpy(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1350 check_memcpy_gpu(
"merge_systems: coltyp_dev", successgpu)
1353 nvtx_range_push(
"gpu_copy_qtmp1_q_compute_nnzu_nnzl_kernel")
1354 if (wantdebug)
call obj%timer%start(
"gpu_copy_qtmp1_q_compute_nnzu_nnzl_kernel")
1356 call gpu_copy_qtmp1_q_compute_nnzu_nnzl(precision_char, qtmp1_dev, q_dev, &
1357 p_col_dev, l_col_dev, idx1_dev, coltyp_dev, nnzul_dev, &
1358 na1, l_rnm, l_rqs, l_rqm, l_rows, my_pcol, gemm_dim_k, matrixrows, &
1359 sm_count, debug, my_stream)
1361 if (wantdebug)
call obj%timer%stop(
"gpu_copy_qtmp1_q_compute_nnzu_nnzl_kernel")
1362 nvtx_range_pop(
"gpu_copy_qtmp1_q_compute_nnzu_nnzl_kernel")
1368 num = 2 * size_of_int
1369#ifdef WITH_GPU_STREAMS
1370 successgpu = gpu_memcpy_async(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, num, gpumemcpydevicetohost, my_stream)
1372 successgpu = gpu_memcpy(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, num, gpumemcpydevicetohost)
1374 check_memcpy_gpu(
"merge_systems: nnzul_dev", successgpu)
1379 nvtx_range_push(
"loop_compute_nnzu")
1380 if (wantdebug)
call obj%timer%start(
"loop_compute_nnzu")
1384 if (p_col(idx1(i))==my_pcol)
then
1385 l_idx = l_col(idx1(i))
1387 if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2)
then
1389 qtmp1(1:l_rnm,nnzu) = q(l_rqs:l_rqm,l_idx)
1392 if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2)
then
1394 qtmp1(l_rnm+1:l_rows,nnzl) = q(l_rqm+1:l_rqe,l_idx)
1398 if (wantdebug)
call obj%timer%stop(
"loop_compute_nnzu")
1399 nvtx_range_pop(
"loop_compute_nnzu")
1403 call obj%timer%start(
"gpu_memcpy")
1404 num = na * size_of_int
1405#ifdef WITH_GPU_STREAMS
1406 my_stream = obj%gpu_setup%my_stream
1407 successgpu = gpu_memcpy_async(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1408 check_memcpy_gpu(
"merge_systems: ", successgpu)
1410 successgpu = gpu_memcpy(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1411 check_memcpy_gpu(
"merge_systems: l_col_dev", successgpu)
1413 call obj%timer%stop(
"gpu_memcpy")
1419 ndef = max(nnzu,nnzl)
1422 num = na * size_of_int
1423#ifdef WITH_GPU_STREAMS
1424 successgpu = gpu_memcpy_async(idx2_dev, int(loc(idx2(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1425 check_memcpy_gpu(
"merge_systems: ", successgpu)
1427 successgpu = gpu_memcpy_async(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1428 check_memcpy_gpu(
"merge_systems: ", successgpu)
1430 successgpu = gpu_memcpy(idx2_dev, int(loc(idx2(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1431 check_memcpy_gpu(
"merge_systems: idx2_dev", successgpu)
1433 successgpu = gpu_memcpy(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1434 check_memcpy_gpu(
"merge_systems: p_col_dev", successgpu)
1442 num = na * size_of_int
1443#ifdef WITH_GPU_STREAMS
1444 successgpu = gpu_memcpy_async(ndef_c_dev, int(loc(ndef_c(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1445 check_memcpy_gpu(
"merge_systems: ndef_c_dev 4", successgpu)
1447 successgpu = gpu_memcpy(ndef_c_dev, int(loc(ndef_c(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1448 check_memcpy_gpu(
"merge_systems: ndef_c_dev", successgpu)
1451 call gpu_copy_q_slice_to_qtmp1 (precision_char, qtmp1_dev, q_dev, ndef_c_dev, l_col_dev, idx2_dev, p_col_dev, &
1452 na2, na, my_pcol, l_rows, l_rqs, l_rqe, matrixrows, gemm_dim_k, debug, my_stream)
1455 l_idx = l_col(idx2(i))
1456 if (p_col(idx2(i))==my_pcol)
then
1458 qtmp1(1:l_rows,ndef) = q(l_rqs:l_rqe,l_idx)
1463 l_cols_qreorg = ndef
1465 num = na * size_of_int
1466#ifdef WITH_GPU_STREAMS
1467 successgpu = gpu_memcpy_async(p_col_out_dev, int(loc(p_col_out(1)),kind=c_intptr_t), &
1468 num, gpumemcpyhosttodevice, my_stream)
1469 check_memcpy_gpu(
"merge_systems: ", successgpu)
1471 successgpu = gpu_memcpy_async(l_col_out_dev, int(loc(l_col_out(1)),kind=c_intptr_t), &
1472 num, gpumemcpyhosttodevice, my_stream)
1473 check_memcpy_gpu(
"merge_systems: l_col_out_dev", successgpu)
1475 successgpu = gpu_memcpy(p_col_out_dev, int(loc(p_col_out(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1476 check_memcpy_gpu(
"merge_systems: p_col_out_dev", successgpu)
1478 successgpu = gpu_memcpy(l_col_out_dev, int(loc(l_col_out(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1479 check_memcpy_gpu(
"merge_systems: l_col_out_dev", successgpu)
1488 call gpu_zero_q(precision_char, q_dev, p_col_out_dev, l_col_out_dev, &
1489 na, my_pcol, l_rqs, l_rqe, matrixrows, debug, my_stream)
1492 if(p_col_out(i)==my_pcol) q(l_rqs:l_rqe,l_col_out(i)) = 0
1499#ifdef WITH_GPU_STREAMS
1500 num = na * size_of_int
1501 successgpu = gpu_memcpy_async(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1502 check_memcpy_gpu(
"merge_systems: idx1_dev", successgpu)
1504 successgpu = gpu_memcpy_async(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1505 check_memcpy_gpu(
"merge_systems: p_col_dev", successgpu)
1507 successgpu = gpu_memcpy_async(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1508 check_memcpy_gpu(
"merge_systems: coltyp_dev", successgpu)
1510 num = na * size_of_datatype
1511 successgpu = gpu_memcpy_async(ev_scale_dev, int(loc(ev_scale(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1512 check_memcpy_gpu(
"merge_systems: ev_scale_dev", successgpu)
1514 successgpu = gpu_memcpy_async(z_dev, int(loc(z(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1515 check_memcpy_gpu(
"merge_systems: z_dev", successgpu)
1517 successgpu = gpu_memcpy_async(d1_dev, int(loc(d1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1518 check_memcpy_gpu(
"merge_systems: d1_dev", successgpu)
1520 successgpu = gpu_memcpy_async(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1521 check_memcpy_gpu(
"merge_systems: dbase_dev", successgpu)
1523 successgpu = gpu_memcpy_async(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice, my_stream)
1524 check_memcpy_gpu(
"merge_systems: ddiff_dev", successgpu)
1526 num = na * size_of_int
1527 successgpu = gpu_memcpy(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1528 check_memcpy_gpu(
"merge_systems: idx1_dev", successgpu)
1530 successgpu = gpu_memcpy(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1531 check_memcpy_gpu(
"merge_systems: p_col_dev", successgpu)
1533 successgpu = gpu_memcpy(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1534 check_memcpy_gpu(
"merge_systems: coltyp_dev", successgpu)
1536 num = na * size_of_datatype
1537 successgpu = gpu_memcpy(ev_scale_dev, int(loc(ev_scale(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1538 check_memcpy_gpu(
"merge_systems: ev_scale_dev", successgpu)
1540 successgpu = gpu_memcpy(z_dev, int(loc(z(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1541 check_memcpy_gpu(
"merge_systems: z_dev", successgpu)
1543 successgpu = gpu_memcpy(d1_dev, int(loc(d1(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1544 check_memcpy_gpu(
"merge_systems: d1_dev", successgpu)
1546 successgpu = gpu_memcpy(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1547 check_memcpy_gpu(
"merge_systems: dbase_dev", successgpu)
1549 successgpu = gpu_memcpy(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), num, gpumemcpyhosttodevice)
1550 check_memcpy_gpu(
"merge_systems: ddiff_dev", successgpu)
1554 allocate(nnzu_val(na1,npc_n))
1555 allocate(nnzl_val(na1,npc_n))
1561 num = na1 * npc_n* size_of_int
1562 successgpu = gpu_malloc(nnzu_val_dev, num)
1563 check_alloc_gpu(
"merge_systems: nnzu_val_dev", successgpu)
1565 num = na1 * npc_n* size_of_int
1566 successgpu = gpu_malloc(nnzl_val_dev, num)
1567 check_alloc_gpu(
"merge_systems: nnzl_val_dev", successgpu)
1575 if (np_rem == npc_0)
then
1576 np_rem = npc_0+npc_n-1
1584 call gpu_compute_nnzl_nnzu_val_part1 (p_col_dev, idx1_dev, coltyp_dev, nnzu_val_dev, nnzl_val_dev, &
1585 na, na1, np_rem, npc_n, nnzu_save, nnzl_save, np, debug, my_stream)
1592 call gpu_compute_nnzl_nnzu_val_part2 (nnzu_val_dev, nnzl_val_dev, na, na1, nnzu_start, nnzl_start, npc_n, &
1598 if (np_rem == npc_0)
then
1599 np_rem = npc_0+npc_n-1
1607 if (p_col(idx1(i)) == np_rem)
then
1608 if (coltyp(idx1(i)) == 1 .or. coltyp(idx1(i)) == 2)
then
1610 nnzu_val(i,np) = nnzu
1612 if (coltyp(idx1(i)) == 3 .or. coltyp(idx1(i)) == 2)
then
1614 nnzl_val(i,np) = nnzl
1626 nvtx_range_push(
"main_loop")
1628 nvtx_range_push(
"np=1,npc_n")
1632 if (np_rem == npc_0)
then
1633 np_rem = npc_0+npc_n-1
1640 my_stream = obj%gpu_setup%my_stream
1641 call gpu_copy_qtmp1_to_qtmp1_tmp (precision_char, qtmp1_dev, qtmp1_tmp_dev, gemm_dim_k, gemm_dim_l, &
1644 call obj%timer%start(
"ccl_send_recv")
1645 successgpu = ccl_group_start()
1646 if (.not.successgpu)
then
1647 print *,
"Error in setting up ccl_group_start!"
1651 successgpu = ccl_send(qtmp1_tmp_dev, int(l_rows*max_local_cols,kind=c_size_t), &
1652 ccldatatype, np_next, ccl_comm_cols, my_stream)
1654 if (.not.successgpu)
then
1655 print *,
"Error in ccl_send"
1659 successgpu = ccl_recv(qtmp1_dev, int(l_rows*max_local_cols,kind=c_size_t), &
1660 ccldatatype, np_prev, ccl_comm_cols, my_stream)
1663 if (.not.successgpu)
then
1664 print *,
"Error in ccl_recv"
1668 successgpu = ccl_group_end()
1670 if (.not.successgpu)
then
1671 print *,
"Error in setting up ccl_group_end!"
1674 successgpu = gpu_stream_synchronize(my_stream)
1675 check_stream_synchronize_gpu(
"trans_ev", successgpu)
1676 call obj%timer%stop(
"ccl_send_recv")
1679 call obj%timer%start(
"mpi_communication")
1680#ifdef WITH_GPU_STREAMS
1681 my_stream = obj%gpu_setup%my_stream
1682 successgpu = gpu_stream_synchronize(my_stream)
1683 check_stream_synchronize_gpu(
"merge_systems qtmp1_dev", successgpu)
1685 successgpu = gpu_memcpy_async(int(loc(qtmp1(1,1)),kind=c_intptr_t), qtmp1_dev, &
1686 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpydevicetohost, my_stream)
1687 check_memcpy_gpu(
"merge_systems: qtmp1_dev", successgpu)
1689 my_stream = obj%gpu_setup%my_stream
1690 successgpu = gpu_stream_synchronize(my_stream)
1691 check_stream_synchronize_gpu(
"merge_systems: qtmp1_dev", successgpu)
1693 successgpu = gpu_stream_synchronize()
1694 check_stream_synchronize_gpu(
"merge_systems: qtmp1_dev", successgpu)
1697 successgpu = gpu_memcpy(int(loc(qtmp1(1,1)),kind=c_intptr_t), qtmp1_dev, &
1698 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpydevicetohost)
1699 check_memcpy_gpu(
"merge_systems: qtmp1_dev", successgpu)
1703 call mpi_sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=mpi_kind), mpi_real_precision, &
1704 int(np_next,kind=mpi_kind), 1111_mpi_kind, int(np_prev,kind=mpi_kind), &
1705 1111_mpi_kind, int(mpi_comm_cols_self,kind=mpi_kind), mpi_status_ignore, mpierr)
1706#ifdef WITH_GPU_STREAMS
1707 my_stream = obj%gpu_setup%my_stream
1708 successgpu = gpu_stream_synchronize(my_stream)
1709 check_stream_synchronize_gpu(
"merge_systems qtmp1_dev", successgpu)
1711 successgpu = gpu_memcpy_async(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
1712 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpyhosttodevice, my_stream)
1713 check_memcpy_gpu(
"merge_systems: qtmp1_dev", successgpu)
1715 my_stream = obj%gpu_setup%my_stream
1716 successgpu = gpu_stream_synchronize(my_stream)
1717 check_stream_synchronize_gpu(
"merge_systems: qtmp1_dev", successgpu)
1719 successgpu = gpu_stream_synchronize()
1720 check_stream_synchronize_gpu(
"merge_systems: qtmp1_dev", successgpu)
1723 successgpu = gpu_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
1724 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpyhosttodevice)
1725 check_memcpy_gpu(
"merge_systems: qtmp1_dev", successgpu)
1727 call obj%timer%stop(
"mpi_communication")
1728#endif /* WITH_MPI */
1734 call obj%timer%start(
"mpi_communication")
1735 call mpi_sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=mpi_kind), mpi_real_precision, &
1736 int(np_next,kind=mpi_kind), 1111_mpi_kind, int(np_prev,kind=mpi_kind), &
1737 1111_mpi_kind, int(mpi_comm_cols_self,kind=mpi_kind), mpi_status_ignore, mpierr)
1738 call obj%timer%stop(
"mpi_communication")
1739#endif /* WITH_MPI */
1751 nvtx_range_push(
"gpu_fill_tmp_arrays")
1752 call gpu_fill_tmp_arrays (precision_char, d1u_dev, d1_dev, zu_dev, z_dev, d1l_dev, zl_dev, &
1753 idx1_dev, p_col_dev, coltyp_dev, nnzu_val_dev, nnzl_val_dev, nnzul_dev, &
1754 na, np, na1, np_rem, debug, my_stream)
1755 if (wantdebug) successgpu = gpu_devicesynchronize()
1756 nvtx_range_pop(
"gpu_fill_tmp_arrays")
1758 num = 2* size_of_int
1759#ifdef WITH_GPU_STREAMS
1760 successgpu = gpu_memcpy_async(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, num, gpumemcpydevicetohost, my_stream)
1761 check_memcpy_gpu(
"merge_systems: nnzul_dev", successgpu)
1763 successgpu = gpu_memcpy(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, num, gpumemcpydevicetohost)
1764 check_memcpy_gpu(
"merge_systems: nnzl_val", successgpu)
1771 if (p_col(idx1(i)) == np_rem)
then
1772 if (coltyp(idx1(i)) == 1 .or. coltyp(idx1(i)) == 2)
then
1777 if (coltyp(idx1(i)) == 3 .or. coltyp(idx1(i)) == 2)
then
1789 ndef = max(nnzu,nnzl)
1792 call gpu_update_ndef_c(ndef_c_dev, idx_dev, p_col_dev, idx2_dev, na, na1, np_rem, ndef, debug, my_stream)
1796 ndef = max(nnzu,nnzl)
1798 call gpu_copy_qtmp1_slice_to_q (precision_char, q_dev, qtmp1_dev, &
1799 l_col_out_dev, p_col_out_dev, ndef_c_dev, p_col_dev, idx2_dev, idx_dev, &
1800 l_rqs, l_rqe, l_rows, matrixrows, gemm_dim_k, my_pcol, na1, np_rem, na, &
1803 ndef = max(nnzu,nnzl)
1807 if (p_col(idx2(j-na1)) == np_rem)
then
1809 if (p_col_out(i) == my_pcol)
then
1810 q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef)
1819 do ns = 0, nqcols1-1, max_strip
1820 nvtx_range_push(
"ns=0,nqcols1-1,max_strip")
1821 ncnt = min(max_strip,nqcols1-ns)
1825 call gpu_copy_q_slice_to_qtmp2 (precision_char, q_dev, qtmp2_dev, idxq1_dev, l_col_out_dev, &
1826 l_rows, l_rqs, l_rqe, matrixrows, matrixcols, &
1827 gemm_dim_k, gemm_dim_m, ns, ncnt, ind_ex, ind_ex2, na, debug, my_stream)
1836 qtmp2(1:l_rows,i) = q(l_rqs:l_rqe, k)
1844 if (nnzu .ge. 1)
then
1847 call gpu_fill_ev (precision_char, ev_dev, d1u_dev, dbase_dev, ddiff_dev, zu_dev, ev_scale_dev, idxq1_dev, idx_dev,&
1848 na, gemm_dim_l, gemm_dim_m, nnzu, ns, ncnt, debug, my_stream)
1858 j = idx(idxq1(i+ns))
1863 tmp(k) = d1u(k) - dbase(j)
1864 tmp(k) = tmp(k) + ddiff(j)
1865 ev(k,i) = zu(k) / tmp(k) * ev_scale(j)
1874 if (l_rnm>0 .and. ncnt>0 .and. nnzu>0)
then
1876 call obj%timer%start(
"gpublas_gemm")
1877 nvtx_range_push(
"gpublas_gemm_upper")
1878 gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
1879 call gpublas_precision_gemm(
'N',
'N', l_rnm, ncnt, nnzu, 1.0_rk, &
1880 qtmp1_dev, gemm_dim_k, &
1881 ev_dev, gemm_dim_l, 1.0_rk, &
1882 qtmp2_dev, gemm_dim_k, gpuhandle)
1883 if (wantdebug) successgpu = gpu_devicesynchronize()
1884 nvtx_range_pop(
"gpublas_gemm_upper")
1885 call obj%timer%stop(
"gpublas_gemm")
1887 call obj%timer%start(
"blas_gemm")
1888 call precision_gemm(
'N',
'N', int(l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &
1889 int(nnzu,kind=blas_kind), 1.0_rk, &
1890 qtmp1, int(gemm_dim_k,kind=blas_kind), &
1891 ev, int(gemm_dim_l,kind=blas_kind), 1.0_rk, &
1892 qtmp2(1,1), int(gemm_dim_k,kind=blas_kind))
1893 call obj%timer%stop(
"blas_gemm")
1905 if (nnzl .ge. 1)
then
1906 call gpu_fill_ev (precision_char, ev_dev, d1l_dev, dbase_dev, ddiff_dev, zl_dev, ev_scale_dev, idxq1_dev, idx_dev, &
1907 na, gemm_dim_l, gemm_dim_m, nnzl, ns, ncnt, debug, my_stream)
1914 j = idx(idxq1(i+ns))
1917 tmp(k) = d1l(k) - dbase(j)
1918 tmp(k) = tmp(k) + ddiff(j)
1919 ev(k,i) = zl(k) / tmp(k) * ev_scale(j)
1928 if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0)
then
1930 call obj%timer%start(
"gpublas_gemm")
1931 nvtx_range_push(
"gpublas_gemm_lower")
1932 gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
1933 call gpublas_precision_gemm(
'N',
'N', l_rows-l_rnm, ncnt, nnzl, 1.0_rk, &
1934 qtmp1_dev + l_rnm*size_of_datatype, gemm_dim_k, &
1935 ev_dev, gemm_dim_l, 1.0_rk, &
1936 qtmp2_dev + l_rnm*size_of_datatype, gemm_dim_k, gpuhandle)
1937 if (wantdebug) successgpu = gpu_devicesynchronize()
1938 nvtx_range_pop(
"gpublas_gemm_lower")
1939 call obj%timer%stop(
"gpublas_gemm")
1941 call obj%timer%start(
"blas_gemm")
1942 call precision_gemm(
'N',
'N', int(l_rows-l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &
1943 int(nnzl,kind=blas_kind), 1.0_rk, &
1944 qtmp1(l_rnm+1,1), int(gemm_dim_k,kind=blas_kind), &
1945 ev, int(gemm_dim_l,kind=blas_kind), 1.0_rk, &
1946 qtmp2(l_rnm+1,1), int(gemm_dim_k,kind=blas_kind))
1947 call obj%timer%stop(
"blas_gemm")
1954 call gpu_copy_qtmp2_slice_to_q (precision_char, q_dev, qtmp2_dev, idxq1_dev, l_col_out_dev, &
1955 l_rqs, l_rqe, l_rows, ncnt, gemm_dim_k, matrixrows, ns, debug, my_stream)
1962 q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i)
1967 nvtx_range_pop(
"ns=0,nqcols1-1,max_strip")
1970 nvtx_range_pop(
"np=1,npc_n")
1972 nvtx_range_pop(
"main_loop")
1975 deallocate(nnzu_val, nnzl_val)
1979 deallocate(ndef_c, stat=istat, errmsg=errormessage)
1980 check_deallocate(
"merge_systems: ndef_c",istat, errormessage)
1984 successgpu = gpu_free(nnzul_dev)
1985 check_dealloc_gpu(
"merge_systems: nnzul_dev", successgpu)
1987 successgpu = gpu_free(l_col_dev)
1988 check_dealloc_gpu(
"merge_systems: l_col_dev", successgpu)
1990 successgpu = gpu_free(ndef_c_dev)
1991 check_dealloc_gpu(
"merge_systems: ndef_c_dev", successgpu)
1993 successgpu = gpu_free(nnzu_val_dev)
1994 check_dealloc_gpu(
"merge_systems: nnzu_val_dev", successgpu)
1996 successgpu = gpu_free(nnzl_val_dev)
1997 check_dealloc_gpu(
"merge_systems: nnzl_val_dev", successgpu)
1999 successgpu = gpu_free(idx1_dev)
2000 check_dealloc_gpu(
"merge_systems: idx1_dev", successgpu)
2002 successgpu = gpu_free(idx2_dev)
2003 check_dealloc_gpu(
"merge_systems: idx2_dev", successgpu)
2005 successgpu = gpu_free(p_col_dev)
2006 check_dealloc_gpu(
"merge_systems: p_col_dev", successgpu)
2008 successgpu = gpu_free(p_col_out_dev)
2009 check_dealloc_gpu(
"merge_systems: p_col_out_dev", successgpu)
2011 successgpu = gpu_free(coltyp_dev)
2012 check_dealloc_gpu(
"merge_systems: coltyp_dev", successgpu)
2014 successgpu = gpu_free(idx_dev)
2015 check_dealloc_gpu(
"merge_systems: idx_dev", successgpu)
2017 successgpu = gpu_free(l_col_out_dev)
2018 check_dealloc_gpu(
"merge_systems: l_col_out_dev", successgpu)
2020 successgpu = gpu_free(idxq1_dev)
2021 check_dealloc_gpu(
"merge_systems: ", successgpu)
2023 successgpu = gpu_free(d1_dev)
2024 check_dealloc_gpu(
"merge_systems: d1_dev", successgpu)
2026 successgpu = gpu_free(z_dev)
2027 check_dealloc_gpu(
"merge_systems: z_dev", successgpu)
2029 successgpu = gpu_free(z1_dev)
2030 check_dealloc_gpu(
"merge_systems: z1_dev", successgpu)
2032 successgpu = gpu_free(rho_dev)
2033 check_dealloc_gpu(
"merge_systems: rho_dev", successgpu)
2035 successgpu = gpu_free(d1u_dev)
2036 check_dealloc_gpu(
"merge_systems: d1u_dev", successgpu)
2038 successgpu = gpu_free(dbase_dev)
2039 check_dealloc_gpu(
"merge_systems: dbase_dev", successgpu)
2041 successgpu = gpu_free(ddiff_dev)
2042 check_dealloc_gpu(
"merge_systems: ddiff_dev", successgpu)
2044 successgpu = gpu_free(zu_dev)
2045 check_dealloc_gpu(
"merge_systems: zu_dev", successgpu)
2047 successgpu = gpu_free(ev_scale_dev)
2048 check_dealloc_gpu(
"merge_systems: ev_scale_dev", successgpu)
2050 successgpu = gpu_free(d1l_dev)
2051 check_dealloc_gpu(
"merge_systems: d1l_dev", successgpu)
2053 successgpu = gpu_free(zl_dev)
2054 check_dealloc_gpu(
"merge_systems: zl_dev", successgpu)
2056 successgpu = gpu_free(qtmp1_dev)
2057 check_dealloc_gpu(
"merge_systems: qtmp1_dev", successgpu)
2059 successgpu = gpu_free(qtmp1_tmp_dev)
2060 check_dealloc_gpu(
"merge_systems: qtmp1_tmp_dev", successgpu)
2062 successgpu = gpu_free(qtmp2_dev)
2063 check_dealloc_gpu(
"merge_systems: qtmp2_dev", successgpu)
2065 successgpu = gpu_free(ev_dev)
2066 check_dealloc_gpu(
"merge_systems: ev_dev", successgpu)
2068 successgpu = gpu_free(tmp_dev)
2069 check_dealloc_gpu(
"merge_systems: tmp_dev", successgpu)
2071 successgpu = gpu_free(zero_dev)
2072 check_dealloc_gpu(
"merge_systems: zero_dev", successgpu)
2074 successgpu = gpu_free(one_dev)
2075 check_dealloc_gpu(
"merge_systems: one_dev", successgpu)
2077 successgpu = gpu_free(qtrans_dev)
2078 check_dealloc_gpu(
"merge_systems: qtrans_dev", successgpu)
2080 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu)
then
2081 if (wantdebug)
call obj%timer%start(
"gpu_host_register")
2082 if (.not. useccl)
then
2083 successgpu = gpu_host_unregister(int(loc(qtmp1),kind=c_intptr_t))
2084 check_host_unregister_gpu(
"merge_systems: qtmp1", successgpu)
2086 successgpu = gpu_host_unregister(int(loc(qtmp2),kind=c_intptr_t))
2087 check_host_unregister_gpu(
"merge_systems: qtmp2", successgpu)
2089 successgpu = gpu_host_unregister(int(loc(ev),kind=c_intptr_t))
2090 check_host_unregister_gpu(
"merge_systems: ev", successgpu)
2093 if (wantdebug) successgpu = gpu_devicesynchronize()
2094 if (wantdebug)
call obj%timer%stop(
"gpu_host_register")
2098 if (.not. useccl)
then
2099 deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errormessage)
2100 check_deallocate(
"merge_systems: ev, qtmp1, qtmp2",istat, errormessage)
2109 call obj%timer%stop(
"merge_systems" // precision_suffix)