subroutine merge_systems_cpu_&
  (obj, na, nm, d, e, q, matrixrows, nqoff, nblk, matrixcols, mpi_comm_rows, mpi_comm_cols, &
   l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, usegpu, wantdebug, success, max_threads)
  use, intrinsic :: iso_c_binding
  use elpa_blas_interfaces

#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
#ifdef WITH_OPENMP_TRADITIONAL
#include "../general/precision_kinds.F90"
  integer(kind=ik), intent(in) :: na, nm, matrixRows, nqoff, nblk, matrixCols, mpi_comm_rows, &
                                  mpi_comm_cols, npc_0, npc_n
  integer(kind=ik), intent(in) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na)
  real(kind=real_datatype), intent(inout) :: d(na), e
#ifdef USE_ASSUMED_SIZE
  real(kind=real_datatype), intent(inout) :: q(matrixrows,*)
  real(kind=real_datatype), intent(inout) :: q(matrixrows,matrixcols)
  logical, intent(in) :: useGPU, wantDebug
  logical, intent(out) :: success

  integer(kind=ik), parameter :: max_strip=128

  real(kind=real_datatype) :: beta, sig, s, c, t, tau, rho, eps, tol, &
                              qtrans(2,2), dmax, zmax, d1new, d2new
  real(kind=real_datatype) :: z(na), d1(na), d2(na), z1(na), delta(na), &
                              dbase(na), ddiff(na), ev_scale(na), tmp(na)
  real(kind=real_datatype) :: d1u(na), zu(na), d1l(na), zl(na)
  real(kind=real_datatype), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
#ifdef WITH_OPENMP_TRADITIONAL
  real(kind=real_datatype), allocatable :: z_p(:,:)
  integer(kind=ik) :: i, j, k, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &
  integer(kind=BLAS_KIND) :: infoBLAS
  integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
                      l_cols_qreorg, np, l_idx, nqcols1
  integer(kind=ik) :: nnzu_save, nnzl_save
  integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
  integer(kind=MPI_KIND) :: mpierr
  integer(kind=MPI_KIND) :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI
  integer(kind=ik) :: np_next, np_prev, np_rem
  integer(kind=ik) :: idx(na), idx1(na), idx2(na)
  integer(kind=BLAS_KIND) :: idxBLAS(NA)
  integer(kind=ik) :: coltyp(na), idxq1(na)
  integer(kind=ik) :: istat
  character(200) :: errorMessage
  integer(kind=ik) :: gemm_dim_k, gemm_dim_l, gemm_dim_m

  integer(kind=c_intptr_t) :: num
  integer(kind=C_intptr_T) :: qtmp1_dev, qtmp1_tmp_dev, qtmp2_dev, ev_dev, q_dev
  integer(kind=c_intptr_t) :: d1u_dev, dbase_dev, ddiff_dev, zu_dev, ev_scale_dev
  integer(kind=c_intptr_t) :: d1l_dev, zl_dev, z_dev, d1_dev
  integer(kind=c_intptr_t) :: idx1_dev, p_col_dev, coltyp_dev, p_col_out_dev, ndef_c_dev
  integer(kind=c_intptr_t) :: idxq1_dev, l_col_out_dev, idx_dev, idx2_dev, l_col_dev
  integer(kind=c_intptr_t) :: nnzul_dev
  integer(kind=c_intptr_t) :: nnzu_val_dev, nnzl_val_dev
  logical :: successGPU
  integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
  integer(kind=c_intptr_t) :: gpuHandle
  integer(kind=ik), intent(in) :: max_threads
  integer(kind=c_intptr_t) :: my_stream
  integer(kind=ik) :: l_col_out_tmp
  integer(kind=ik), allocatable :: nnzu_val(:,:), nnzl_val(:,:)
  integer(kind=ik) :: nnzul(2)
  integer(kind=ik) :: nnzu_start, nnzl_start
  integer(kind=ik), allocatable :: ndef_c(:)
  integer(kind=ik) :: ii, jj, indx, ind_ex, ind_ex2, p_col_tmp, index2, counter1, counter2
#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
  integer(kind=c_intptr_t) :: ccl_comm_rows, ccl_comm_cols
#ifdef WITH_OPENMP_TRADITIONAL
  integer(kind=ik) :: my_thread
  allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errorMessage)
  check_allocate("merge_systems: z_p", istat, errorMessage)

  call obj%timer%start("merge_systems" // precision_suffix)
  call obj%timer%start("mpi_communication")
  call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind), my_prowmpi, mpierr)
  call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind), np_rowsmpi, mpierr)
  call mpi_comm_rank(int(mpi_comm_cols,kind=mpi_kind), my_pcolmpi, mpierr)
  call mpi_comm_size(int(mpi_comm_cols,kind=mpi_kind), np_colsmpi, mpierr)

  my_prow = int(my_prowmpi,kind=c_int)
  np_rows = int(np_rowsmpi,kind=c_int)
  my_pcol = int(my_pcolmpi,kind=c_int)
  np_cols = int(np_colsmpi,kind=c_int)

  call obj%timer%stop("mpi_communication")

  useccl = obj%gpu_setup%useCCL
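  ! Process columns outside the participating range npc_0 .. npc_0+npc_n-1 take no part in this merge.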
  if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then
    call obj%timer%stop("merge_systems" // precision_suffix)

  if (my_pcol == npc_0+npc_n-1) then
    np_next = my_pcol + 1

  if (my_pcol == npc_0) then
    np_prev = npc_0+npc_n-1
    np_prev = my_pcol - 1

  call check_monotony_&
       &(obj, nm, d, 'Input1', wantdebug, success)
  if (.not.(success)) then
    call obj%timer%stop("merge_systems" // precision_suffix)

  call check_monotony_&
       &(obj, na-nm, d(nm+1), 'Input2', wantdebug, success)
  if (.not.(success)) then
    call obj%timer%stop("merge_systems" // precision_suffix)
  n_procs = np_rows*npc_n
  my_proc = my_prow*npc_n + (my_pcol-npc_0)
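  ! Local row index range of the merged block: l_rqs..l_rqm covers the upper sub-block
  ! (global rows nqoff+1..nqoff+nm), l_rqm+1..l_rqe the lower one.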
  l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1)
  l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1)
  l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1)

  l_rnm  = l_rqm-l_rqs+1
  l_rows = l_rqe-l_rqs+1

  l_cols = count(p_col(1:na)==my_pcol)

  do np = npc_0, npc_0+npc_n-1
    max_local_cols = max(max_local_cols, count(p_col(1:na)==np))
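  ! Assemble the rank-1 modification vector z: the last local row of the upper block
  ! contributes directly, the first row of the lower block enters scaled by sig.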
  if (mod((nqoff+nm-1)/nblk,np_rows)==my_prow) then
      if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i))

  if (mod((nqoff+nm)/nblk,np_rows)==my_prow) then
      if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i))

  &(obj, z, na, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
  if (.not.(success)) then
    write(error_unit,*) "Error in global_gather. Aborting..."
  call obj%timer%start("lapack_lamrg")
  call precision_lamrg( int(nm,kind=blas_kind), int(na-nm,kind=blas_kind), d, &
                        1_blas_kind, 1_blas_kind, idxblas )
  idx(:) = int(idxblas(:),kind=ik)
  call obj%timer%stop("lapack_lamrg")
  zmax = maxval(abs(z))
  dmax = maxval(abs(d))
  eps  = precision_lamch( 'E' )
  tol  = 8.0_rk*eps*max(dmax,zmax)
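  ! If the rank-1 perturbation rho*z*z**T is negligible relative to the eigenvalue spread,
  ! nothing has to be recomputed; the merged spectrum is just the sorted union of the two halves.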
  IF ( rho*zmax <= tol ) THEN
       (obj, idx, na, na, p_col_out, q, matrixrows, matrixcols, l_rows, l_rqe, &
        l_rqs, mpi_comm_cols, p_col, l_col, l_col_out)
    call obj%timer%stop("merge_systems" // precision_suffix)
  if (rho*abs(z(idx(i))) <= tol) then
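    ! Deflation: this eigenvalue couples only negligibly to the rank-1 update and is kept as is.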
  tau = precision_lapy2( c, s )
  t   = d1(na1) - d(idx(i))

  IF ( abs( t*c*s ) <= tol ) THEN
    d2new = d(idx(i))*c**2 + d1(na1)*s**2
    d1new = d(idx(i))*s**2 + d1(na1)*c**2

    if (d1new<d1(na1) )  d1new = d1(na1)
    if (d1new>d(idx(i))) d1new = d(idx(i))

    if (d2new<d1(na1) )  d2new = d1(na1)
    if (d2new>d(idx(i))) d2new = d(idx(i))

    if (d2new<d2(j)) then

    qtrans(1,1) = c; qtrans(1,2) = -s
    qtrans(2,1) = s; qtrans(2,2) =  c
    call transform_columns_&
         (obj, idx(i), idx1(na1), na, tmp, l_rqs, l_rqe, &
          q, matrixrows, matrixcols, l_rows, mpi_comm_cols, &
          p_col, l_col, qtrans)
    if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2
    if (coltyp(idx(i))==3 .and. coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2
  call check_monotony_&
       &(obj, na1, d1, 'Sorted1', wantdebug, success)
  if (.not.(success)) then
    call obj%timer%stop("merge_systems" // precision_suffix)

  call check_monotony_&
       &(obj, na2, d2, 'Sorted2', wantdebug, success)
  if (.not.(success)) then
    call obj%timer%stop("merge_systems" // precision_suffix)
  if (na1==1 .or. na1==2) then
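    ! Deflated problem of size 1 or 2: the secular equation is solved directly with LAPACK xLAED5.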
    d(1) = d1(1) + rho*z1(1)**2

    call obj%timer%start("lapack_laed5_x2")
    call precision_laed5(1_blas_kind, d1, z1, qtrans(1,1), rho, d(1))
    call precision_laed5(2_blas_kind, d1, z1, qtrans(1,2), rho, d(2))
    call obj%timer%stop("lapack_laed5_x2")

    call transform_columns_&
         &(obj, idx1(1), idx1(2), na, tmp, l_rqs, l_rqe, q, &
          matrixrows, matrixcols, l_rows, mpi_comm_cols, &
          p_col, l_col, qtrans)

    d(na1+1:na) = d2(1:na2)

    call obj%timer%start("lapack_lamrg")
    call precision_lamrg( int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &
                          1_blas_kind, 1_blas_kind, idxblas )
    idx(:) = int(idxblas(:),kind=ik)
    call obj%timer%stop("lapack_lamrg")

    if (idx(i)<=na1) then
      idxq1(i) = idx1(idx(i))
      idxq1(i) = idx2(idx(i)-na1)

    &(obj, idxq1, na, na, p_col_out, q, matrixrows, matrixcols, l_rows, l_rqe, &
     l_rqs, mpi_comm_cols, p_col, l_col, l_col_out)
#ifdef WITH_OPENMP_TRADITIONAL
  infoblas = int(info,kind=blas_kind)
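  ! Each process solves the secular equation only for the eigenvalues it owns
  ! (round-robin distribution over the n_procs participating processes).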
  DO i = my_proc+1, na1, n_procs
    call obj%timer%start("lapack_laed4")
    call precision_laed4(int(na1,kind=blas_kind), int(i,kind=blas_kind), d1, z1, delta, &
    info = int(infoblas,kind=ik)
    call obj%timer%stop("lapack_laed4")

    call solve_secular_equation_&
         &(obj, na1, i, d1, z1, delta, rho, s)

    if (i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) )

    if (abs(delta(i+1)) < abs(delta(i))) then
      ddiff(i) = delta(i+1)
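  ! The loop above accumulated only this process' factors of the product defining z;
  ! combine them across all processes before taking the signed square root.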
  call global_product_&
       (obj, z, na1, mpi_comm_rows, mpi_comm_cols, npc_0, npc_n, success)
  if (.not.(success)) then
    write(error_unit,*) "Error in global_product. Aborting..."

  z(1:na1) = sign( sqrt( abs( z(1:na1) ) ), z1(1:na1) )

  &(obj, dbase, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
  if (.not.(success)) then
    write(error_unit,*) "Error in global_gather. Aborting..."

  &(obj, ddiff, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
  if (.not.(success)) then
    write(error_unit,*) "Error in global_gather. Aborting..."

  d(1:na1) = dbase(1:na1) - ddiff(1:na1)
#ifdef WITH_OPENMP_TRADITIONAL
  call obj%timer%start("OpenMP parallel" // precision_suffix)

  DO i = my_proc+1, na1, n_procs
    &(obj, d1, dbase, ddiff, z, ev_scale(i), na1, i)

#ifdef WITH_OPENMP_TRADITIONAL
  call obj%timer%stop("OpenMP parallel" // precision_suffix)

  &(obj, ev_scale, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
  if (.not.(success)) then
    write(error_unit,*) "Error in global_gather. Aborting..."
  d(na1+1:na) = d2(1:na2)

  call obj%timer%start("lapack_lamrg")
  call precision_lamrg(int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &
                       1_blas_kind, 1_blas_kind, idxblas )
  idx(:) = int(idxblas(:),kind=ik)
  call obj%timer%stop("lapack_lamrg")
  call check_monotony_&
       &(obj, na, d, 'Output', wantdebug, success)
  if (.not.(success)) then
    call obj%timer%stop("merge_systems" // precision_suffix)
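  ! GPU path: allocate device buffers for the index arrays and for the vectors
  ! entering the eigenvector back-transformation.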
  num = 2 * size_of_int
  successgpu = gpu_malloc(nnzul_dev, num)
  check_alloc_gpu("merge_systems: ", successgpu)

  num = na * size_of_int
  successgpu = gpu_malloc(idxq1_dev, num)
  check_alloc_gpu("merge_systems: ", successgpu)

  num = na * size_of_int
  successgpu = gpu_malloc(idx_dev, num)
  check_alloc_gpu("merge_systems: idx_dev", successgpu)
  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(idx_dev, int(loc(idx(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ", successgpu)
  successgpu = gpu_memcpy(idx_dev, int(loc(idx(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: idx_dev", successgpu)

  if (p_col_out(i)==my_pcol) then
    if (idx(i)<=na1) then
  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(idxq1_dev, int(loc(idxq1(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ", successgpu)
  successgpu = gpu_memcpy(idxq1_dev, int(loc(idxq1(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: idxq1_dev", successgpu)

  allocate(ndef_c(na), stat=istat, errmsg=errormessage)
  check_allocate("merge_systems: ndef_c", istat, errormessage)
  gemm_dim_k = max(1,l_rows)
  gemm_dim_l = max_local_cols
  gemm_dim_m = min(max_strip,max(1,nqcols1))
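  ! Host workspaces for the back-transformation: qtmp1 collects the non-deflated local
  ! eigenvector columns, ev holds eigenvectors of the deflated secular problem, and qtmp2
  ! accumulates the GEMM results for up to max_strip output columns at a time.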
  allocate(qtmp1(gemm_dim_k, gemm_dim_l), stat=istat, errmsg=errormessage)
  check_allocate("merge_systems: qtmp1", istat, errormessage)

  allocate(ev(gemm_dim_l, gemm_dim_m), stat=istat, errmsg=errormessage)
  check_allocate("merge_systems: ev", istat, errormessage)

  allocate(qtmp2(gemm_dim_k, gemm_dim_m), stat=istat, errmsg=errormessage)
  check_allocate("merge_systems: qtmp2", istat, errormessage)
  num = na * size_of_int
  successgpu = gpu_malloc(ndef_c_dev, num)
  check_alloc_gpu("merge_systems: ndef_c_dev", successgpu)

  num = na * size_of_int
  successgpu = gpu_malloc(idx1_dev, num)
  check_alloc_gpu("merge_systems: idx1_dev", successgpu)

  num = na * size_of_int
  successgpu = gpu_malloc(p_col_dev, num)
  check_alloc_gpu("merge_systems: p_col_dev", successgpu)

  num = na * size_of_int
  successgpu = gpu_malloc(p_col_out_dev, num)
  check_alloc_gpu("merge_systems: p_col_out_dev", successgpu)

  num = na * size_of_int
  successgpu = gpu_malloc(coltyp_dev, num)
  check_alloc_gpu("merge_systems: coltyp_dev", successgpu)

  num = na * size_of_int
  successgpu = gpu_malloc(idx2_dev, num)
  check_alloc_gpu("merge_systems: idx2_dev", successgpu)

  num = na * size_of_int
  successgpu = gpu_malloc(l_col_out_dev, num)
  check_alloc_gpu("merge_systems: l_col_out_dev", successgpu)

  num = (na) * size_of_datatype
  successgpu = gpu_malloc(z_dev, num)
  check_alloc_gpu("merge_systems: z_dev", successgpu)

  num = (na) * size_of_datatype
  successgpu = gpu_malloc(d1_dev, num)
  check_alloc_gpu("merge_systems: d1_dev", successgpu)

  num = (na) * size_of_datatype
  successgpu = gpu_malloc(d1u_dev, num)
  check_alloc_gpu("merge_systems: d1u_dev", successgpu)

  num = (na) * size_of_datatype
  successgpu = gpu_malloc(dbase_dev, num)
  check_alloc_gpu("merge_systems: dbase_dev", successgpu)

  num = (na) * size_of_datatype
  successgpu = gpu_malloc(ddiff_dev, num)
  check_alloc_gpu("merge_systems: ddiff_dev", successgpu)

  num = (na) * size_of_datatype
  successgpu = gpu_malloc(zu_dev, num)
  check_alloc_gpu("merge_systems: zu_dev", successgpu)

  num = (na) * size_of_datatype
  successgpu = gpu_malloc(ev_scale_dev, num)
  check_alloc_gpu("merge_systems: ev_scale_dev", successgpu)

  num = (na) * size_of_datatype
  successgpu = gpu_malloc(d1l_dev, num)
  check_alloc_gpu("merge_systems: d1l_dev", successgpu)

  num = (na) * size_of_datatype
  successgpu = gpu_malloc(zl_dev, num)
  check_alloc_gpu("merge_systems: zl_dev", successgpu)

  num = (matrixrows*matrixcols) * size_of_datatype
  successgpu = gpu_malloc(q_dev, num)
  check_alloc_gpu("merge_systems: q_dev", successgpu)
  num = (gemm_dim_k * gemm_dim_l) * size_of_datatype
#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
  if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
    successgpu = gpu_host_register(int(loc(qtmp1),kind=c_intptr_t), num, &
                                   gpuhostregisterdefault)
    check_host_register_gpu("merge_systems: qtmp1", successgpu)

  successgpu = gpu_malloc(qtmp1_dev, num)
  check_alloc_gpu("merge_systems: qtmp1_dev", successgpu)

  successgpu = gpu_malloc(qtmp1_tmp_dev, num)
  check_alloc_gpu("merge_systems: qtmp1_tmp_dev", successgpu)

  num = (gemm_dim_l * gemm_dim_m) * size_of_datatype
#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
  if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
    successgpu = gpu_host_register(int(loc(ev),kind=c_intptr_t), num, &
                                   gpuhostregisterdefault)
    check_host_register_gpu("merge_systems: ev", successgpu)

  successgpu = gpu_malloc(ev_dev, num)
  check_alloc_gpu("merge_systems: ev_dev", successgpu)

  num = (gemm_dim_k * gemm_dim_m) * size_of_datatype
#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
  if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
    successgpu = gpu_host_register(int(loc(qtmp2),kind=c_intptr_t), num, &
                                   gpuhostregisterdefault)
    check_host_register_gpu("merge_systems: qtmp2", successgpu)

  successgpu = gpu_malloc(qtmp2_dev, num)
  check_alloc_gpu("merge_systems: qtmp2_dev", successgpu)
  l_idx = l_col(idx1(i))
  if (p_col(idx1(i))==my_pcol) then
    if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then
      qtmp1(1:l_rnm,nnzu) = q(l_rqs:l_rqm,l_idx)

    if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then
      qtmp1(l_rnm+1:l_rows,nnzl) = q(l_rqm+1:l_rqe,l_idx)
  num = gemm_dim_k * gemm_dim_l * size_of_datatype
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)
  successgpu = gpu_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)

  num = matrixrows*matrixcols*size_of_datatype
#ifdef WITH_GPU_STREAMS
  successgpu = gpu_memcpy_async(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: q_dev", successgpu)
  successgpu = gpu_memcpy(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: q_dev", successgpu)
  num = na * size_of_int
  successgpu = gpu_malloc(l_col_dev, num)
  check_alloc_gpu("merge_systems: l_col_dev", successgpu)

  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ", successgpu)
  successgpu = gpu_memcpy(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: l_col_dev", successgpu)
  ndef = max(nnzu,nnzl)

  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(idx2_dev, int(loc(idx2(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ", successgpu)
  successgpu = gpu_memcpy(idx2_dev, int(loc(idx2(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: idx2_dev", successgpu)

  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ", successgpu)
  successgpu = gpu_memcpy(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: p_col_dev", successgpu)
  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(ndef_c_dev, int(loc(ndef_c(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ndef_c_dev 4", successgpu)
  successgpu = gpu_memcpy(ndef_c_dev, int(loc(ndef_c(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: ndef_c_dev", successgpu)

  call gpu_copy_q_slice_to_qtmp1_precision (qtmp1_dev, q_dev, ndef_c_dev, l_col_dev, idx2_dev, p_col_dev, na2, na, &
                                            my_pcol, l_rows, l_rqs, l_rqe, matrixrows, gemm_dim_k, my_stream)

  l_idx = l_col(idx2(i))
  if (p_col(idx2(i))==my_pcol) then
    qtmp1(1:l_rows,ndef) = q(l_rqs:l_rqe,l_idx)
  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(p_col_out_dev, int(loc(p_col_out(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ", successgpu)
  successgpu = gpu_memcpy(p_col_out_dev, int(loc(p_col_out(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: p_col_out_dev", successgpu)

  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(l_col_out_dev, int(loc(l_col_out(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: l_col_out_dev", successgpu)
  successgpu = gpu_memcpy(l_col_out_dev, int(loc(l_col_out(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: l_col_out_dev", successgpu)

  call gpu_zero_q_precision (q_dev, p_col_out_dev, l_col_out_dev, na, my_pcol, l_rqs, l_rqe, matrixrows, my_stream)

  if (p_col_out(i)==my_pcol) q(l_rqs:l_rqe,l_col_out(i)) = 0
  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ", successgpu)
  successgpu = gpu_memcpy(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: idx1_dev", successgpu)

  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ", successgpu)
  successgpu = gpu_memcpy(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: p_col_dev", successgpu)

  num = na * size_of_int
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ", successgpu)
  successgpu = gpu_memcpy(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: coltyp_dev", successgpu)
  num = na*size_of_datatype
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(ev_scale_dev, int(loc(ev_scale(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ev_scale_dev", successgpu)
  successgpu = gpu_memcpy(ev_scale_dev, int(loc(ev_scale(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: ev_scale_dev", successgpu)

  num = na*size_of_datatype
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(z_dev, int(loc(z(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: z_dev", successgpu)
  successgpu = gpu_memcpy(z_dev, int(loc(z(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: z_dev", successgpu)

  num = na*size_of_datatype
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(d1_dev, int(loc(d1(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  successgpu = gpu_memcpy(d1_dev, int(loc(d1(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: d1_dev", successgpu)

  num = gemm_dim_l * gemm_dim_m * size_of_datatype
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ev_dev", successgpu)
  successgpu = gpu_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: ev_dev", successgpu)

  num = na*size_of_datatype
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_memcpy_async(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: dbase_dev", successgpu)
  successgpu = gpu_memcpy(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: dbase_dev", successgpu)

  num = na*size_of_datatype
#ifdef WITH_GPU_STREAMS
  successgpu = gpu_stream_synchronize(my_stream)
  successgpu = gpu_memcpy_async(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), &
                                num, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)
  successgpu = gpu_memcpy(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), &
                          num, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: ddiff_dev", successgpu)
  allocate(nnzu_val(na1,npc_n))
  allocate(nnzl_val(na1,npc_n))
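  ! nnzu_val/nnzl_val record, per process column, how many non-deflated upper/lower
  ! eigenvector components each index contributes; the qtmp1 panel is then passed around
  ! the process-column ring so that every process sees every column block.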
  num = na1 * npc_n * size_of_int
  successgpu = gpu_malloc(nnzu_val_dev, num)
  check_alloc_gpu("merge_systems: nnzu_val_dev", successgpu)

  num = na1 * npc_n * size_of_int
  successgpu = gpu_malloc(nnzl_val_dev, num)
  check_alloc_gpu("merge_systems: nnzl_val_dev", successgpu)

  if (np_rem == npc_0) then
    np_rem = npc_0+npc_n-1

  call gpu_compute_nnzl_nnzu_val_part1 (p_col_dev, idx1_dev, coltyp_dev, nnzu_val_dev, nnzl_val_dev, &
                                        na, na1, np_rem, npc_n, nnzu_save, nnzl_save, np, my_stream)
  call gpu_compute_nnzl_nnzu_val_part2 (nnzu_val_dev, nnzl_val_dev, na, na1, nnzu_start, nnzl_start, npc_n, my_stream)

  if (np_rem == npc_0) then
    np_rem = npc_0+npc_n-1

  if (p_col(idx1(i)) == np_rem) then
    if (coltyp(idx1(i)) == 1 .or. coltyp(idx1(i)) == 2) then
      nnzu_val(i,np) = nnzu

    if (coltyp(idx1(i)) == 3 .or. coltyp(idx1(i)) == 2) then
      nnzl_val(i,np) = nnzl

  if (np_rem == npc_0) then
    np_rem = npc_0+npc_n-1
#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
  my_stream = obj%gpu_setup%my_stream
  call gpu_copy_qtmp1_to_qtmp1_tmp_precision (qtmp1_dev, qtmp1_tmp_dev, gemm_dim_k, gemm_dim_l, my_stream)

  call obj%timer%start("ccl_send_recv")
  ccl_comm_cols = obj%gpu_setup%ccl_comm_cols
  successgpu = ccl_group_start()
  if (.not.successgpu) then
    print *, "Error in setting up nccl_group_start!"

  successgpu = ccl_send(qtmp1_tmp_dev, int(l_rows*max_local_cols,kind=c_size_t), &
#ifdef DOUBLE_PRECISION
#ifdef SINGLE_PRECISION
                        np_next, ccl_comm_cols, my_stream)
  if (.not.successgpu) then
    print *, "Error in nccl_send"

  successgpu = ccl_recv(qtmp1_dev, int(l_rows*max_local_cols,kind=c_size_t), &
#ifdef DOUBLE_PRECISION
#ifdef SINGLE_PRECISION
                        np_prev, ccl_comm_cols, my_stream)

  if (.not.successgpu) then
    print *, "Error in ccl_send/ccl_recv"

  successgpu = ccl_group_end()
  if (.not.successgpu) then
    print *, "Error in setting up ccl_group_end!"

  successgpu = gpu_stream_synchronize(my_stream)
  check_stream_synchronize_gpu("trans_ev", successgpu)
  call obj%timer%stop("ccl_send_recv")
#endif /* defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL) */
  call obj%timer%start("mpi_communication")
#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_stream_synchronize(my_stream)
  check_stream_synchronize_gpu("merge_systems qtmp1_dev", successgpu)

  successgpu = gpu_memcpy_async(int(loc(qtmp1(1,1)),kind=c_intptr_t), qtmp1_dev, &
                                gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpydevicetohost, my_stream)
  check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)

  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_stream_synchronize(my_stream)
  check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)

  successgpu = gpu_stream_synchronize()
  check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)

  successgpu = gpu_memcpy(int(loc(qtmp1(1,1)),kind=c_intptr_t), qtmp1_dev, &
                          gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpydevicetohost)
  check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)

  call mpi_sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=mpi_kind), mpi_real_precision, &
                            int(np_next,kind=mpi_kind), 1111_mpi_kind, int(np_prev,kind=mpi_kind), &
                            1111_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)

#ifdef WITH_GPU_STREAMS
  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_stream_synchronize(my_stream)
  check_stream_synchronize_gpu("merge_systems qtmp1_dev", successgpu)

  successgpu = gpu_memcpy_async(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
                                gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpyhosttodevice, my_stream)
  check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)

  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_stream_synchronize(my_stream)
  check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)

  successgpu = gpu_stream_synchronize()
  check_stream_synchronize_gpu("merge_systems: qtmp1_dev", successgpu)

  successgpu = gpu_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
                          gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpyhosttodevice)
  check_memcpy_gpu("merge_systems: qtmp1_dev", successgpu)

  call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
#endif /* defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL) */

  call obj%timer%start("mpi_communication")
  call mpi_sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=mpi_kind), mpi_real_precision, &
                            int(np_next,kind=mpi_kind), 1111_mpi_kind, int(np_prev,kind=mpi_kind), &
                            1111_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
  call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
  my_stream = obj%gpu_setup%my_stream
  call gpu_fill_tmp_arrays_precision (idx1_dev, p_col_dev, coltyp_dev, nnzu_val_dev, nnzl_val_dev, nnzul_dev, &
                                      d1u_dev, d1_dev, zu_dev, z_dev, d1l_dev, zl_dev, na, np, na1, np_rem, my_stream)

  num = 2 * size_of_int
#ifdef WITH_GPU_STREAMS
  successgpu = gpu_memcpy_async(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, &
                                num, gpumemcpydevicetohost, my_stream)
  check_memcpy_gpu("merge_systems: nnzul_dev", successgpu)
  successgpu = gpu_memcpy(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, &
                          num, gpumemcpydevicetohost)
  check_memcpy_gpu("merge_systems: nnzl_val", successgpu)
  if (p_col(idx1(i)) == np_rem) then
    if (coltyp(idx1(i)) == 1 .or. coltyp(idx1(i)) == 2) then

    if (coltyp(idx1(i)) == 3 .or. coltyp(idx1(i)) == 2) then

  ndef = max(nnzu,nnzl)

  call gpu_update_ndef_c(ndef_c_dev, idx_dev, p_col_dev, idx2_dev, na, na1, np_rem, ndef, my_stream)

  ndef = max(nnzu,nnzl)

  call gpu_copy_qtmp1_slice_to_q_precision (q_dev, qtmp1_dev, l_col_out_dev, p_col_out_dev, ndef_c_dev, p_col_dev, &
                                            idx2_dev, idx_dev, l_rqs, l_rqe, l_rows, matrixrows, &
                                            gemm_dim_k, my_pcol, na1, np_rem, na, my_stream)

  ndef = max(nnzu,nnzl)

  if (p_col(idx2(j-na1)) == np_rem) then
    if (p_col_out(i) == my_pcol) then
      q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef)
  do ns = 0, nqcols1-1, max_strip
    ncnt = min(max_strip, nqcols1-ns)

    call gpu_copy_q_slice_to_qtmp2_precision (q_dev, qtmp2_dev, idxq1_dev, l_col_out_dev, l_rows, l_rqs, l_rqe, &
                                              matrixrows, matrixcols, gemm_dim_k, gemm_dim_m, ns, &
                                              ncnt, ind_ex, ind_ex2, na, my_stream)
    qtmp2(1:l_rows,i) = q(l_rqs:l_rqe, k)
    if (nnzu .ge. 1) then
      call gpu_fill_ev_precision (ev_dev, d1u_dev, dbase_dev, ddiff_dev, zu_dev, ev_scale_dev, idxq1_dev, idx_dev, &
                                  na, gemm_dim_l, gemm_dim_m, nnzu, ns, ncnt, my_stream)

    j = idx(idxq1(i+ns))
    tmp(k)  = d1u(k) - dbase(j)
    tmp(k)  = tmp(k) + ddiff(j)
    ev(k,i) = zu(k) / tmp(k) * ev_scale(j)
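    ! Multiply the upper halves (local rows 1..l_rnm) of the collected eigenvector columns
    ! with the eigenvectors of the secular problem and accumulate the result in qtmp2.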
    if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) then
      call obj%timer%start("gpublas_gemm")
      gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
      call gpublas_precision_gemm('N', 'N', l_rnm, ncnt, nnzu, &
                                  1.0_rk, qtmp1_dev, ubound(qtmp1,dim=1), &
                                          ev_dev, ubound(ev,dim=1), &
                                  1.0_rk, qtmp2_dev, ubound(qtmp2,dim=1), gpuhandle)
      if (wantdebug) successgpu = gpu_devicesynchronize()
      call obj%timer%stop("gpublas_gemm")

      call obj%timer%start("blas_gemm")
      call precision_gemm('N', 'N', int(l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &
                          int(nnzu,kind=blas_kind), &
                          1.0_rk, qtmp1, int(ubound(qtmp1,dim=1),kind=blas_kind), &
                                  ev, int(ubound(ev,dim=1),kind=blas_kind), &
                          1.0_rk, qtmp2(1,1), int(ubound(qtmp2,dim=1),kind=blas_kind))
      call obj%timer%stop("blas_gemm")
    if (nnzl .ge. 1) then
      call gpu_fill_ev_precision (ev_dev, d1l_dev, dbase_dev, ddiff_dev, zl_dev, ev_scale_dev, idxq1_dev, idx_dev, &
                                  na, gemm_dim_l, gemm_dim_m, nnzl, ns, ncnt, my_stream)

    j = idx(idxq1(i+ns))
    tmp(k)  = d1l(k) - dbase(j)
    tmp(k)  = tmp(k) + ddiff(j)
    ev(k,i) = zl(k) / tmp(k) * ev_scale(j)
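    ! Same multiplication for the lower halves (local rows l_rnm+1..l_rows), using the
    ! d1l/zl data of the lower sub-block.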
    if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) then
      call obj%timer%start("gpublas_gemm")
      gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
      call gpublas_precision_gemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, &
                                  1.0_rk, qtmp1_dev + l_rnm * size_of_datatype, ubound(qtmp1,dim=1), &
                                          ev_dev, ubound(ev,dim=1), &
                                  1.0_rk, qtmp2_dev + l_rnm * size_of_datatype, ubound(qtmp2,dim=1), gpuhandle)
      if (wantdebug) successgpu = gpu_devicesynchronize()
      call obj%timer%stop("gpublas_gemm")

      call obj%timer%start("blas_gemm")
      call precision_gemm('N', 'N', int(l_rows-l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &
                          int(nnzl,kind=blas_kind), &
                          1.0_rk, qtmp1(l_rnm+1,1), int(ubound(qtmp1,dim=1),kind=blas_kind), &
                                  ev, int(ubound(ev,dim=1),kind=blas_kind), &
                          1.0_rk, qtmp2(l_rnm+1,1), int(ubound(qtmp2,dim=1),kind=blas_kind))
      call obj%timer%stop("blas_gemm")
    call gpu_copy_qtmp2_slice_to_q_precision(q_dev, qtmp2_dev, idxq1_dev, l_col_out_dev, l_rqs, l_rqe, l_rows, ncnt, &
                                             gemm_dim_k, matrixrows, ns, my_stream)
    q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i)

  deallocate(nnzu_val, nnzl_val)
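  ! Copy the updated eigenvector matrix q back to the host and release all device buffers,
  ! pinned-host registrations and host workspaces.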
  num = matrixrows*matrixcols*size_of_datatype
#ifdef WITH_GPU_STREAMS
  successgpu = gpu_memcpy_async(int(loc(q(1,1)),kind=c_intptr_t), q_dev, &
                                num, gpumemcpydevicetohost, my_stream)
  check_memcpy_gpu("merge_systems: q_dev", successgpu)

  my_stream = obj%gpu_setup%my_stream
  successgpu = gpu_stream_synchronize(my_stream)
  check_stream_synchronize_gpu("merge_systems: q_dev", successgpu)
  successgpu = gpu_memcpy(int(loc(q(1,1)),kind=c_intptr_t), q_dev, &
                          num, gpumemcpydevicetohost)
  check_memcpy_gpu("merge_systems: q_dev", successgpu)

#ifdef WITH_GPU_STREAMS
  successgpu = gpu_memcpy_async(int(loc(ev(1,1)),kind=c_intptr_t), ev_dev, &
                                gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpydevicetohost, my_stream)
  check_memcpy_gpu("merge_systems: ev_dev", successgpu)

  successgpu = gpu_stream_synchronize(my_stream)
  check_stream_synchronize_gpu("merge_systems: ev_dev", successgpu)
  successgpu = gpu_memcpy(int(loc(ev(1,1)),kind=c_intptr_t), ev_dev, &
                          gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpydevicetohost)
  check_memcpy_gpu("merge_systems: ev_dev", successgpu)

  deallocate(ndef_c, stat=istat, errmsg=errormessage)
  check_deallocate("merge_systems: ndef_c", istat, errormessage)
  successgpu = gpu_free(nnzul_dev)
  check_dealloc_gpu("merge_systems: nnzul_dev", successgpu)

  successgpu = gpu_free(l_col_dev)
  check_dealloc_gpu("merge_systems: l_col_dev", successgpu)

  successgpu = gpu_free(ndef_c_dev)
  check_dealloc_gpu("merge_systems: ndef_c_dev", successgpu)

  successgpu = gpu_free(nnzu_val_dev)
  check_dealloc_gpu("merge_systems: nnzu_val_dev", successgpu)

  successgpu = gpu_free(nnzl_val_dev)
  check_dealloc_gpu("merge_systems: nnzl_val_dev", successgpu)

  successgpu = gpu_free(idx1_dev)
  check_dealloc_gpu("merge_systems: idx1_dev", successgpu)

  successgpu = gpu_free(idx2_dev)
  check_dealloc_gpu("merge_systems: idx2_dev", successgpu)

  successgpu = gpu_free(p_col_dev)
  check_dealloc_gpu("merge_systems: p_col_dev", successgpu)

  successgpu = gpu_free(p_col_out_dev)
  check_dealloc_gpu("merge_systems: p_col_out_dev", successgpu)

  successgpu = gpu_free(coltyp_dev)
  check_dealloc_gpu("merge_systems: coltyp_dev", successgpu)

  successgpu = gpu_free(idx_dev)
  check_dealloc_gpu("merge_systems: idx_dev", successgpu)

  successgpu = gpu_free(l_col_out_dev)
  check_dealloc_gpu("merge_systems: l_col_out_dev", successgpu)

  successgpu = gpu_free(idxq1_dev)
  check_dealloc_gpu("merge_systems: ", successgpu)

  successgpu = gpu_free(q_dev)
  check_dealloc_gpu("merge_systems: q_dev", successgpu)

  successgpu = gpu_free(d1_dev)
  check_dealloc_gpu("merge_systems: d1_dev", successgpu)

  successgpu = gpu_free(z_dev)
  check_dealloc_gpu("merge_systems: z_dev", successgpu)

  successgpu = gpu_free(d1u_dev)
  check_dealloc_gpu("merge_systems: d1u_dev", successgpu)

  successgpu = gpu_free(dbase_dev)
  check_dealloc_gpu("merge_systems: dbase_dev", successgpu)

  successgpu = gpu_free(ddiff_dev)
  check_dealloc_gpu("merge_systems: ddiff_dev", successgpu)

  successgpu = gpu_free(zu_dev)
  check_dealloc_gpu("merge_systems: zu_dev", successgpu)

  successgpu = gpu_free(ev_scale_dev)
  check_dealloc_gpu("merge_systems: ev_scale_dev", successgpu)

  successgpu = gpu_free(d1l_dev)
  check_dealloc_gpu("merge_systems: d1l_dev", successgpu)

  successgpu = gpu_free(zl_dev)
  check_dealloc_gpu("merge_systems: zl_dev", successgpu)
#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
  if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
    successgpu = gpu_host_unregister(int(loc(qtmp1),kind=c_intptr_t))
    check_host_unregister_gpu("merge_systems: qtmp1", successgpu)

  successgpu = gpu_free(qtmp1_dev)
  check_dealloc_gpu("merge_systems: qtmp1_dev", successgpu)

  successgpu = gpu_free(qtmp1_tmp_dev)
  check_dealloc_gpu("merge_systems: qtmp1_tmp_dev", successgpu)

#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
  if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
    successgpu = gpu_host_unregister(int(loc(qtmp2),kind=c_intptr_t))
    check_host_unregister_gpu("merge_systems: qtmp2", successgpu)

  successgpu = gpu_free(qtmp2_dev)
  check_dealloc_gpu("merge_systems: qtmp2_dev", successgpu)

#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
  if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu) then
    successgpu = gpu_host_unregister(int(loc(ev),kind=c_intptr_t))
    check_host_unregister_gpu("merge_systems: ev", successgpu)

  successgpu = gpu_free(ev_dev)
  check_dealloc_gpu("merge_systems: ev_dev", successgpu)

  deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errormessage)
  check_deallocate("merge_systems: ev, qtmp1, qtmp2", istat, errormessage)

#ifdef WITH_OPENMP_TRADITIONAL
  deallocate(z_p, stat=istat, errmsg=errormessage)
  check_deallocate("merge_systems: z_p", istat, errormessage)

  call obj%timer%stop("merge_systems" // precision_suffix)