58 subroutine merge_systems_&
60 (obj, na, nm, d, e, q, ldq, nqoff, nblk, matrixcols, mpi_comm_rows, mpi_comm_cols, &
61 l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, usegpu, wantdebug, success, max_threads)
63 use,
intrinsic :: iso_c_binding
66 use elpa_blas_interfaces
77#ifdef WITH_OPENMP_TRADITIONAL
81#include "../general/precision_kinds.F90"
83 integer(kind=ik),
intent(in) :: na, nm, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, &
84 mpi_comm_cols, npc_0, npc_n
85 integer(kind=ik),
intent(in) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na)
86 real(kind=real_datatype),
intent(inout) :: d(na), e
87#ifdef USE_ASSUMED_SIZE
88 real(kind=real_datatype),
intent(inout) :: q(ldq,*)
90 real(kind=real_datatype),
intent(inout) :: q(ldq,matrixcols)
92 logical,
intent(in) :: useGPU, wantDebug
94 logical,
intent(out) :: success
98 integer(kind=ik),
parameter :: max_strip=128
101 real(kind=real_datatype) :: beta, sig, s, c, t, tau, rho, eps, tol, &
102 qtrans(2,2), dmax, zmax, d1new, d2new
103 real(kind=real_datatype) :: z(na), d1(na), d2(na), z1(na), delta(na), &
104 dbase(na), ddiff(na), ev_scale(na), tmp(na)
105 real(kind=real_datatype) :: d1u(na), zu(na), d1l(na), zl(na)
106 real(kind=real_datatype),
allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
107#ifdef WITH_OPENMP_TRADITIONAL
108 real(kind=real_datatype),
allocatable :: z_p(:,:)
111 integer(kind=ik) :: i, j, k, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &
113 integer(kind=BLAS_KIND) :: infoBLAS
114 integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
115 l_cols_qreorg, np, l_idx, nqcols1, nqcols2
116 integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
118 integer(kind=MPI_KIND) :: mpierr
119 integer(kind=MPI_KIND) :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI
120 integer(kind=ik) :: np_next, np_prev, np_rem
121 integer(kind=ik) :: idx(na), idx1(na), idx2(na)
122 integer(kind=BLAS_KIND) :: idxBLAS(NA)
123 integer(kind=ik) :: coltyp(na), idxq1(na), idxq2(na)
125 integer(kind=ik) :: istat
126 character(200) :: errorMessage
127 integer(kind=ik) :: gemm_dim_k, gemm_dim_l, gemm_dim_m
129 integer(kind=c_intptr_t) :: num
130 integer(kind=C_intptr_T) :: qtmp1_dev, qtmp2_dev, ev_dev
131 logical :: successGPU
132 integer(kind=c_intptr_t),
parameter :: size_of_datatype = size_of_&
135 integer(kind=c_intptr_t) :: gpuHandle
136 integer(kind=ik),
intent(in) :: max_threads
137 integer(kind=c_intptr_t) :: my_stream
138#ifdef WITH_OPENMP_TRADITIONAL
139 integer(kind=ik) :: my_thread
141 allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errormessage)
142 check_allocate(
"merge_systems: z_p",istat, errormessage)
145 call obj%timer%start(
"merge_systems" // precision_suffix)
148 call obj%timer%start(
"mpi_communication")
149 call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind) ,my_prowmpi, mpierr)
150 call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind) ,np_rowsmpi, mpierr)
151 call mpi_comm_rank(int(mpi_comm_cols,kind=mpi_kind) ,my_pcolmpi, mpierr)
152 call mpi_comm_size(int(mpi_comm_cols,kind=mpi_kind) ,np_colsmpi, mpierr)
154 my_prow = int(my_prowmpi,kind=c_int)
155 np_rows = int(np_rowsmpi,kind=c_int)
156 my_pcol = int(my_pcolmpi,kind=c_int)
157 np_cols = int(np_colsmpi,kind=c_int)
159 call obj%timer%stop(
"mpi_communication")
164 if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n)
then
165 call obj%timer%stop(
"merge_systems" // precision_suffix)
170 if (my_pcol == npc_0+npc_n-1)
then
173 np_next = my_pcol + 1
176 if (my_pcol == npc_0)
then
177 np_prev = npc_0+npc_n-1
179 np_prev = my_pcol - 1
181 call check_monotony_&
183 &(obj, nm,d,
'Input1',wantdebug, success)
184 if (.not.(success))
then
185 call obj%timer%stop(
"merge_systems" // precision_suffix)
188 call check_monotony_&
190 &(obj,na-nm,d(nm+1),
'Input2',wantdebug, success)
191 if (.not.(success))
then
192 call obj%timer%stop(
"merge_systems" // precision_suffix)
199 n_procs = np_rows*npc_n
200 my_proc = my_prow*npc_n + (my_pcol-npc_0)
205 l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1)
206 l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1)
207 l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1)
209 l_rnm = l_rqm-l_rqs+1
210 l_rows = l_rqe-l_rqs+1
215 l_cols = count(p_col(1:na)==my_pcol)
220 do np = npc_0, npc_0+npc_n-1
221 max_local_cols = max(max_local_cols,count(p_col(1:na)==np))
233 if (mod((nqoff+nm-1)/nblk,np_rows)==my_prow)
then
236 if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i))
240 if (mod((nqoff+nm)/nblk,np_rows)==my_prow)
then
243 if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i))
249 &(obj, z, na, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
250 if (.not.(success))
then
251 write(error_unit,*)
"Error in global_gather. ABorting"
260 call obj%timer%start(
"lapack")
261 call precision_lamrg( int(nm,kind=blas_kind), int(na-nm,kind=blas_kind), d, &
262 1_blas_kind, 1_blas_kind, idxblas )
263 idx(:) = int(idxblas(:),kind=ik)
264 call obj%timer%stop(
"lapack")
268 zmax = maxval(abs(z))
269 dmax = maxval(abs(d))
270 eps = precision_lamch(
'E' )
271 tol = 8.0_rk*eps*max(dmax,zmax)
276 IF ( rho*zmax <= tol )
THEN
288 (obj, idx, na, na, p_col_out, q, ldq, matrixcols, l_rows, l_rqe, &
289 l_rqs, mpi_comm_cols, p_col, l_col, l_col_out)
291 call obj%timer%stop(
"merge_systems" // precision_suffix)
312 if (rho*abs(z(idx(i))) <= tol)
then
330 tau = precision_lapy2( c, s )
331 t = d1(na1) - d(idx(i))
334 IF ( abs( t*c*s ) <= tol )
THEN
342 d2new = d(idx(i))*c**2 + d1(na1)*s**2
343 d1new = d(idx(i))*s**2 + d1(na1)*c**2
359 if (d1new<d1(na1) ) d1new = d1(na1)
360 if (d1new>d(idx(i))) d1new = d(idx(i))
362 if (d2new<d1(na1) ) d2new = d1(na1)
363 if (d2new>d(idx(i))) d2new = d(idx(i))
368 if (d2new<d2(j))
then
379 qtrans(1,1) = c; qtrans(1,2) =-s
380 qtrans(2,1) = s; qtrans(2,2) = c
381 call transform_columns_&
383 (obj, idx(i), idx1(na1), na, tmp, l_rqs, l_rqe, &
384 q, ldq, matrixcols, l_rows, mpi_comm_cols, &
385 p_col, l_col, qtrans)
386 if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2
387 if (coltyp(idx(i))==3 .and. coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2
405 call check_monotony_&
407 &(obj, na1,d1,
'Sorted1', wantdebug, success)
408 if (.not.(success))
then
409 call obj%timer%stop(
"merge_systems" // precision_suffix)
412 call check_monotony_&
414 &(obj, na2,d2,
'Sorted2', wantdebug, success)
415 if (.not.(success))
then
416 call obj%timer%stop(
"merge_systems" // precision_suffix)
420 if (na1==1 .or. na1==2)
then
424 d(1) = d1(1) + rho*z1(1)**2
426 call obj%timer%start(
"lapack")
427 call precision_laed5(1_blas_kind, d1, z1, qtrans(1,1), rho, d(1))
428 call precision_laed5(2_blas_kind, d1, z1, qtrans(1,2), rho, d(2))
429 call obj%timer%stop(
"lapack")
430 call transform_columns_&
432 &(obj, idx1(1), idx1(2), na, tmp, l_rqs, l_rqe, q, &
433 ldq, matrixcols, l_rows, mpi_comm_cols, &
434 p_col, l_col, qtrans)
439 d(na1+1:na) = d2(1:na2)
442 call obj%timer%start(
"lapack")
443 call precision_lamrg( int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &
444 1_blas_kind, 1_blas_kind, idxblas )
445 idx(:) = int(idxblas(:),kind=ik)
446 call obj%timer%stop(
"lapack")
457 if (idx(i)<=na1)
then
458 idxq1(i) = idx1(idx(i))
460 idxq1(i) = idx2(idx(i)-na1)
465 &(obj, idxq1, na, na, p_col_out, q, ldq, matrixcols, l_rows, l_rqe, &
466 l_rqs, mpi_comm_cols, p_col, l_col, l_col_out)
473#ifdef WITH_OPENMP_TRADITIONAL
480 infoblas = int(info,kind=blas_kind)
488 DO i = my_proc+1, na1, n_procs
489 call obj%timer%start(
"lapack")
490 call precision_laed4(int(na1,kind=blas_kind), int(i,kind=blas_kind), d1, z1, delta, &
492 info = int(infoblas,kind=ik)
493 call obj%timer%stop(
"lapack")
498 call solve_secular_equation_&
500 &(obj, na1, i, d1, z1, delta, rho, s)
512 if (i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) )
519 if (abs(delta(i+1)) < abs(delta(i)))
then
521 ddiff(i) = delta(i+1)
541 call global_product_&
543 (obj, z, na1, mpi_comm_rows, mpi_comm_cols, npc_0, npc_n, success)
544 if (.not.(success))
then
545 write(error_unit,*)
"Error in global_product. Aborting..."
548 z(1:na1) = sign( sqrt( abs( z(1:na1) ) ), z1(1:na1) )
552 &(obj, dbase, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
553 if (.not.(success))
then
554 write(error_unit,*)
"Error in global_gather. Aborting..."
559 &(obj, ddiff, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
560 if (.not.(success))
then
561 write(error_unit,*)
"Error in global_gather. Aborting..."
564 d(1:na1) = dbase(1:na1) - ddiff(1:na1)
569#ifdef WITH_OPENMP_TRADITIONAL
571 call obj%timer%start(
"OpenMP parallel" // precision_suffix)
580 DO i = my_proc+1, na1, n_procs
590 &(obj, d1, dbase, ddiff, z, ev_scale(i), na1, i)
593#ifdef WITH_OPENMP_TRADITIONAL
596 call obj%timer%stop(
"OpenMP parallel" // precision_suffix)
602 &(obj, ev_scale, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
603 if (.not.(success))
then
604 write(error_unit,*)
"Error in global_gather. Aborting..."
608 d(na1+1:na) = d2(1:na2)
610 call obj%timer%start(
"lapack")
612 call precision_lamrg(int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &
613 1_blas_kind, 1_blas_kind, idxblas )
614 idx(:) = int(idxblas(:),kind=ik)
615 call obj%timer%stop(
"lapack")
621 call check_monotony_&
623 &(obj, na,d,
'Output', wantdebug, success)
625 if (.not.(success))
then
626 call obj%timer%stop(
"merge_systems" // precision_suffix)
639 if (p_col_out(i)==my_pcol)
then
640 if (idx(i)<=na1)
then
650 gemm_dim_k = max(1,l_rows)
651 gemm_dim_l = max_local_cols
652 gemm_dim_m = min(max_strip,max(1,nqcols1))
654 allocate(qtmp1(gemm_dim_k, gemm_dim_l), stat=istat, errmsg=errormessage)
655 check_allocate(
"merge_systems: qtmp1",istat, errormessage)
657 allocate(ev(gemm_dim_l,gemm_dim_m), stat=istat, errmsg=errormessage)
658 check_allocate(
"merge_systems: ev",istat, errormessage)
660 allocate(qtmp2(gemm_dim_k, gemm_dim_m), stat=istat, errmsg=errormessage)
661 check_allocate(
"merge_systems: qtmp2",istat, errormessage)
667 num = (gemm_dim_k * gemm_dim_l) * size_of_datatype
668#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
669 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu)
then
670 successgpu = gpu_host_register(int(loc(qtmp1),kind=c_intptr_t),num,&
671 gpuhostregisterdefault)
672 check_host_register_gpu(
"merge_systems: qtmp1", successgpu)
676 successgpu = gpu_malloc(qtmp1_dev, num)
677 check_alloc_gpu(
"merge_systems: qtmp1_dev", successgpu)
679 num = (gemm_dim_l * gemm_dim_m) * size_of_datatype
680#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
681 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu)
then
682 successgpu = gpu_host_register(int(loc(ev),kind=c_intptr_t),num,&
683 gpuhostregisterdefault)
684 check_host_register_gpu(
"merge_systems: ev", successgpu)
687 successgpu = gpu_malloc(ev_dev, num)
688 check_alloc_gpu(
"merge_systems: ev_dev", successgpu)
691 num = (gemm_dim_k * gemm_dim_m) * size_of_datatype
692#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
693 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu)
then
694 successgpu = gpu_host_register(int(loc(qtmp2),kind=c_intptr_t),num,&
695 gpuhostregisterdefault)
696 check_host_register_gpu(
"merge_systems: qtmp2", successgpu)
699 successgpu = gpu_malloc(qtmp2_dev, num)
700 check_alloc_gpu(
"merge_systems: qtmp2_dev", successgpu)
709 l_idx = l_col(idx1(i))
710 if (p_col(idx1(i))==my_pcol)
then
711 if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2)
then
713 qtmp1(1:l_rnm,nnzu) = q(l_rqs:l_rqm,l_idx)
715 if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2)
then
717 qtmp1(l_rnm+1:l_rows,nnzl) = q(l_rqm+1:l_rqe,l_idx)
724 ndef = max(nnzu,nnzl)
726 l_idx = l_col(idx2(i))
727 if (p_col(idx2(i))==my_pcol)
then
729 qtmp1(1:l_rows,ndef) = q(l_rqs:l_rqe,l_idx)
738 if(p_col_out(i)==my_pcol) q(l_rqs:l_rqe,l_col_out(i)) = 0
748 if (np_rem == npc_0)
then
749 np_rem = npc_0+npc_n-1
754 call obj%timer%start(
"mpi_communication")
755 call mpi_sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=mpi_kind), mpi_real_precision, &
756 int(np_next,kind=mpi_kind), 1111_mpi_kind, int(np_prev,kind=mpi_kind), &
757 1111_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
758 call obj%timer%stop(
"mpi_communication")
764#ifdef WITH_GPU_STREAMS
765 my_stream = obj%gpu_setup%my_stream
766 successgpu = gpu_stream_synchronize(my_stream)
767 check_stream_synchronize_gpu(
"tridiag qtmp1_dev", successgpu)
769 successgpu = gpu_memcpy_async(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
770 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpyhosttodevice, my_stream)
771 check_memcpy_gpu(
"merge_systems: qtmp1_dev", successgpu)
773 my_stream = obj%gpu_setup%my_stream
774 successgpu = gpu_stream_synchronize(my_stream)
775 check_stream_synchronize_gpu(
"merge_systems: qtmp1_dev", successgpu)
777 successgpu = gpu_stream_synchronize()
778 check_stream_synchronize_gpu(
"merge_systems: qtmp1_dev", successgpu)
781 successgpu = gpu_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
782 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpyhosttodevice)
783 check_memcpy_gpu(
"merge_systems: qtmp1_dev", successgpu)
793 if (p_col(idx1(i)) == np_rem)
then
794 if (coltyp(idx1(i)) == 1 .or. coltyp(idx1(i)) == 2)
then
799 if (coltyp(idx1(i)) == 3 .or. coltyp(idx1(i)) == 2)
then
809 ndef = max(nnzu,nnzl)
813 if (p_col(idx2(j-na1)) == np_rem)
then
815 if (p_col_out(i) == my_pcol)
then
816 q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef)
822 do ns = 0, nqcols1-1, max_strip
824 ncnt = min(max_strip,nqcols1-ns)
834 qtmp2(1:l_rows,i) = q(l_rqs:l_rqe, k)
848 tmp(k) = d1u(k) - dbase(j)
849 tmp(k) = tmp(k) + ddiff(j)
850 ev(k,i) = zu(k) / tmp(k) * ev_scale(j)
858#ifdef WITH_GPU_STREAMS
859 successgpu = gpu_stream_synchronize(my_stream)
860 check_stream_synchronize_gpu(
"tridiag qtmp2_dev", successgpu)
862 successgpu = gpu_memcpy_async(qtmp2_dev, int(loc(qtmp2(1,1)),kind=c_intptr_t), &
863 gemm_dim_k * gemm_dim_m * size_of_datatype, gpumemcpyhosttodevice, my_stream)
864 check_memcpy_gpu(
"merge_systems: qtmp2_dev", successgpu)
868 successgpu = gpu_memcpy_async(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
869 gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpyhosttodevice, my_stream)
870 check_memcpy_gpu(
"merge_systems: ev_dev", successgpu)
872 successgpu = gpu_stream_synchronize(my_stream)
873 check_stream_synchronize_gpu(
"merge_systems: qtmp1_dev", successgpu)
875 successgpu = gpu_stream_synchronize()
876 check_stream_synchronize_gpu(
"merge_systems: qtmp1_dev", successgpu)
878 successgpu = gpu_memcpy(qtmp2_dev, int(loc(qtmp2(1,1)),kind=c_intptr_t), &
879 gemm_dim_k * gemm_dim_m * size_of_datatype, gpumemcpyhosttodevice)
880 check_memcpy_gpu(
"merge_systems: qtmp2_dev", successgpu)
884 successgpu = gpu_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
885 gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpyhosttodevice)
886 check_memcpy_gpu(
"merge_systems: ev_dev", successgpu)
892 if (l_rnm>0 .and. ncnt>0 .and. nnzu>0)
then
894 call obj%timer%start(
"gpublas")
895 gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
896 call gpublas_precision_gemm(
'N',
'N', l_rnm, ncnt, nnzu, &
897 1.0_rk, qtmp1_dev, ubound(qtmp1,dim=1), &
898 ev_dev, ubound(ev,dim=1), &
899 1.0_rk, qtmp2_dev, ubound(qtmp2,dim=1), gpuhandle)
900 call obj%timer%stop(
"gpublas")
902 call obj%timer%start(
"blas")
903 call obj%timer%start(
"gemm")
904 call precision_gemm(
'N',
'N', int(l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &
905 int(nnzu,kind=blas_kind), &
906 1.0_rk, qtmp1, int(ubound(qtmp1,dim=1),kind=blas_kind), &
907 ev, int(ubound(ev,dim=1),kind=blas_kind), &
908 1.0_rk, qtmp2(1,1), int(ubound(qtmp2,dim=1),kind=blas_kind))
909 call obj%timer%stop(
"gemm")
910 call obj%timer%stop(
"blas")
924 tmp(k) = d1l(k) - dbase(j)
925 tmp(k) = tmp(k) + ddiff(j)
926 ev(k,i) = zl(k) / tmp(k) * ev_scale(j)
934#ifdef WITH_GPU_STREAMS
935 successgpu = gpu_stream_synchronize(my_stream)
936 check_stream_synchronize_gpu(
"tridiag qtmp1_dev", successgpu)
938 successgpu = gpu_memcpy_async(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
939 gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpyhosttodevice, &
941 check_memcpy_gpu(
"merge_systems: ev_dev", successgpu)
943 successgpu = gpu_stream_synchronize(my_stream)
944 check_stream_synchronize_gpu(
"merge_systems: qtmp1_dev", successgpu)
946 successgpu = gpu_stream_synchronize()
947 check_stream_synchronize_gpu(
"merge_systems: qtmp1_dev", successgpu)
949 successgpu = gpu_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
950 gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpyhosttodevice)
951 check_memcpy_gpu(
"merge_systems: ev_dev", successgpu)
957 if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0)
then
959 call obj%timer%start(
"gpublas")
960 gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
961 call gpublas_precision_gemm(
'N',
'N', l_rows-l_rnm, ncnt, nnzl, &
962 1.0_rk, qtmp1_dev + l_rnm * size_of_datatype, ubound(qtmp1,dim=1), &
963 ev_dev, ubound(ev,dim=1), &
964 1.0_rk, qtmp2_dev + l_rnm * size_of_datatype, ubound(qtmp2,dim=1), gpuhandle)
965 call obj%timer%stop(
"gpublas")
967 call obj%timer%start(
"blas")
968 call obj%timer%start(
"gemm")
969 call precision_gemm(
'N',
'N', int(l_rows-l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &
970 int(nnzl,kind=blas_kind), &
971 1.0_rk, qtmp1(l_rnm+1,1), int(ubound(qtmp1,dim=1),kind=blas_kind), &
972 ev, int(ubound(ev,dim=1),kind=blas_kind), &
973 1.0_rk, qtmp2(l_rnm+1,1), int(ubound(qtmp2,dim=1),kind=blas_kind))
974 call obj%timer%stop(
"gemm")
975 call obj%timer%stop(
"blas")
984#ifdef WITH_GPU_STREAMS
985 successgpu = gpu_stream_synchronize(my_stream)
986 check_stream_synchronize_gpu(
"tridiag qtmp2_dev", successgpu)
988 successgpu = gpu_memcpy_async(int(loc(qtmp2(1,1)),kind=c_intptr_t), qtmp2_dev, &
989 gemm_dim_k * gemm_dim_m * size_of_datatype, gpumemcpydevicetohost, my_stream)
990 check_memcpy_gpu(
"merge_systems: qtmp2_dev", successgpu)
992 successgpu = gpu_stream_synchronize(my_stream)
993 check_stream_synchronize_gpu(
"merge_systems: qtmp2_dev", successgpu)
995 successgpu = gpu_stream_synchronize()
996 check_stream_synchronize_gpu(
"merge_systems: qtmp2_dev", successgpu)
998 successgpu = gpu_memcpy(int(loc(qtmp2(1,1)),kind=c_intptr_t), qtmp2_dev, &
999 gemm_dim_k * gemm_dim_m * size_of_datatype, gpumemcpydevicetohost)
1000 check_memcpy_gpu(
"merge_systems: qtmp2_dev", successgpu)
1012 q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i)
1021#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
1022 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu)
then
1023 successgpu = gpu_host_unregister(int(loc(qtmp1),kind=c_intptr_t))
1024 check_host_unregister_gpu(
"merge_systems: qtmp1", successgpu)
1027 successgpu = gpu_free(qtmp1_dev)
1028 check_dealloc_gpu(
"merge_systems: qtmp1_dev", successgpu)
1030#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
1031 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu)
then
1032 successgpu = gpu_host_unregister(int(loc(qtmp2),kind=c_intptr_t))
1033 check_host_unregister_gpu(
"merge_systems: qtmp2", successgpu)
1036 successgpu = gpu_free(qtmp2_dev)
1037 check_dealloc_gpu(
"merge_systems: qtmp2_dev", successgpu)
1039#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
1040 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu)
then
1041 successgpu = gpu_host_unregister(int(loc(ev),kind=c_intptr_t))
1042 check_host_unregister_gpu(
"merge_systems: ev", successgpu)
1045 successgpu = gpu_free(ev_dev)
1046 check_dealloc_gpu(
"merge_systems: ev_dev", successgpu)
1049 deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errormessage)
1050 check_deallocate(
"merge_systems: ev, qtmp1, qtmp2",istat, errormessage)
1052#ifdef WITH_OPENMP_TRADITIONAL
1053 deallocate(z_p, stat=istat, errmsg=errormessage)
1054 check_deallocate(
"merge_systems: z_p",istat, errormessage)
1057 call obj%timer%stop(
"merge_systems" // precision_suffix)