64 subroutine merge_systems_cpu_&
66 (obj, na, nm, d, e, q, matrixrows, nqoff, nblk, matrixcols, mpi_comm_rows, mpi_comm_cols, &
67 l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, usegpu, wantdebug, success, max_threads)
72 use,
intrinsic :: iso_c_binding
75 use elpa_blas_interfaces
86#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
92#ifdef WITH_OPENMP_TRADITIONAL
96#include "../general/precision_kinds.F90"
98 integer(kind=ik),
intent(in) :: na, nm, matrixRows, nqoff, nblk, matrixCols, mpi_comm_rows, &
99 mpi_comm_cols, npc_0, npc_n
100 integer(kind=ik),
intent(in) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na)
101 real(kind=real_datatype),
intent(inout) :: d(na), e
102#ifdef USE_ASSUMED_SIZE
103 real(kind=real_datatype),
intent(inout) :: q(matrixrows,*)
105 real(kind=real_datatype),
intent(inout) :: q(matrixrows,matrixcols)
107 logical,
intent(in) :: useGPU, wantDebug
109 logical,
intent(out) :: success
113 integer(kind=ik),
parameter :: max_strip=128
116 real(kind=real_datatype) :: beta, sig, s, c, t, tau, rho, eps, tol, &
117 qtrans(2,2), dmax, zmax, d1new, d2new
118 real(kind=real_datatype) :: z(na), d1(na), d2(na), z1(na), delta(na), &
119 dbase(na), ddiff(na), ev_scale(na), tmp(na)
120 real(kind=real_datatype) :: d1u(na), zu(na), d1l(na), zl(na)
121 real(kind=real_datatype),
allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
122#ifdef WITH_OPENMP_TRADITIONAL
123 real(kind=real_datatype),
allocatable :: z_p(:,:)
126 integer(kind=ik) :: i, j, k, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &
128 integer(kind=BLAS_KIND) :: infoBLAS
129 integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
130 l_cols_qreorg, np, l_idx, nqcols1
131 integer(kind=ik) :: nnzu_save, nnzl_save
132 integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
134 integer(kind=MPI_KIND) :: mpierr
135 integer(kind=MPI_KIND) :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI
136 integer(kind=ik) :: np_next, np_prev, np_rem
137 integer(kind=ik) :: idx(na), idx1(na), idx2(na)
138 integer(kind=BLAS_KIND) :: idxBLAS(NA)
139 integer(kind=ik) :: coltyp(na), idxq1(na)
141 integer(kind=ik) :: istat
142 character(200) :: errorMessage
143 integer(kind=ik) :: gemm_dim_k, gemm_dim_l, gemm_dim_m
145 integer(kind=c_intptr_t) :: num
146 integer(kind=C_intptr_T) :: qtmp1_dev, qtmp1_tmp_dev, qtmp2_dev, ev_dev, q_dev
147 integer(kind=c_intptr_t) :: d1u_dev, dbase_dev, ddiff_dev, zu_dev, ev_scale_dev
148 integer(kind=c_intptr_t) :: d1l_dev, zl_dev, z_dev, d1_dev
149 integer(kind=c_intptr_t) :: idx1_dev, p_col_dev, coltyp_dev, p_col_out_dev, ndef_c_dev
150 integer(kind=c_intptr_t) :: idxq1_dev, l_col_out_dev, idx_dev, idx2_dev, l_col_dev
151 integer(kind=c_intptr_t) :: nnzul_dev
153 integer(kind=c_intptr_t) :: nnzu_val_dev, nnzl_val_dev
154 logical :: successGPU
155 integer(kind=c_intptr_t),
parameter :: size_of_datatype = size_of_&
158 integer(kind=c_intptr_t) :: gpuHandle
159 integer(kind=ik),
intent(in) :: max_threads
160 integer(kind=c_intptr_t) :: my_stream
161 integer(kind=ik) :: l_col_out_tmp
162 integer(kind=ik),
allocatable :: nnzu_val(:,:), nnzl_val(:,:)
163 integer(kind=ik) :: nnzul(2)
165 integer(kind=ik) :: nnzu_start, nnzl_start
167 integer(kind=ik),
allocatable :: ndef_c(:)
170 integer(kind=ik) :: ii,jj, indx, ind_ex, ind_ex2, p_col_tmp, index2, counter1, counter2
171#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
172 integer(kind=c_intptr_t) :: ccl_comm_rows, ccl_comm_cols
174#ifdef WITH_OPENMP_TRADITIONAL
175 integer(kind=ik) :: my_thread
177 allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errormessage)
178 check_allocate(
"merge_systems: z_p",istat, errormessage)
181 call obj%timer%start(
"merge_systems" // precision_suffix)
184 call obj%timer%start(
"mpi_communication")
185 call mpi_comm_rank(int(mpi_comm_rows,kind=mpi_kind) ,my_prowmpi, mpierr)
186 call mpi_comm_size(int(mpi_comm_rows,kind=mpi_kind) ,np_rowsmpi, mpierr)
187 call mpi_comm_rank(int(mpi_comm_cols,kind=mpi_kind) ,my_pcolmpi, mpierr)
188 call mpi_comm_size(int(mpi_comm_cols,kind=mpi_kind) ,np_colsmpi, mpierr)
190 my_prow = int(my_prowmpi,kind=c_int)
191 np_rows = int(np_rowsmpi,kind=c_int)
192 my_pcol = int(my_pcolmpi,kind=c_int)
193 np_cols = int(np_colsmpi,kind=c_int)
195 call obj%timer%stop(
"mpi_communication")
200 if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n)
then
201 call obj%timer%stop(
"merge_systems" // precision_suffix)
206 if (my_pcol == npc_0+npc_n-1)
then
209 np_next = my_pcol + 1
212 if (my_pcol == npc_0)
then
213 np_prev = npc_0+npc_n-1
215 np_prev = my_pcol - 1
217 call check_monotony_&
219 &(obj, nm,d,
'Input1',wantdebug, success)
220 if (.not.(success))
then
221 call obj%timer%stop(
"merge_systems" // precision_suffix)
224 call check_monotony_&
226 &(obj,na-nm,d(nm+1),
'Input2',wantdebug, success)
227 if (.not.(success))
then
228 call obj%timer%stop(
"merge_systems" // precision_suffix)
235 n_procs = np_rows*npc_n
236 my_proc = my_prow*npc_n + (my_pcol-npc_0)
241 l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1)
242 l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1)
243 l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1)
245 l_rnm = l_rqm-l_rqs+1
246 l_rows = l_rqe-l_rqs+1
251 l_cols = count(p_col(1:na)==my_pcol)
256 do np = npc_0, npc_0+npc_n-1
257 max_local_cols = max(max_local_cols,count(p_col(1:na)==np))
269 if (mod((nqoff+nm-1)/nblk,np_rows)==my_prow)
then
272 if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i))
276 if (mod((nqoff+nm)/nblk,np_rows)==my_prow)
then
279 if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i))
285 &(obj, z, na, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
286 if (.not.(success))
then
287 write(error_unit,*)
"Error in global_gather. ABorting"
296 call obj%timer%start(
"lapack_lamrg")
297 call precision_lamrg( int(nm,kind=blas_kind), int(na-nm,kind=blas_kind), d, &
298 1_blas_kind, 1_blas_kind, idxblas )
299 idx(:) = int(idxblas(:),kind=ik)
300 call obj%timer%stop(
"lapack_lamrg")
304 zmax = maxval(abs(z))
305 dmax = maxval(abs(d))
306 eps = precision_lamch(
'E' )
307 tol = 8.0_rk*eps*max(dmax,zmax)
312 IF ( rho*zmax <= tol )
THEN
324 (obj, idx, na, na, p_col_out, q, matrixrows, matrixcols, l_rows, l_rqe, &
325 l_rqs, mpi_comm_cols, p_col, l_col, l_col_out)
327 call obj%timer%stop(
"merge_systems" // precision_suffix)
348 if (rho*abs(z(idx(i))) <= tol)
then
366 tau = precision_lapy2( c, s )
367 t = d1(na1) - d(idx(i))
370 IF ( abs( t*c*s ) <= tol )
THEN
378 d2new = d(idx(i))*c**2 + d1(na1)*s**2
379 d1new = d(idx(i))*s**2 + d1(na1)*c**2
395 if (d1new<d1(na1) ) d1new = d1(na1)
396 if (d1new>d(idx(i))) d1new = d(idx(i))
398 if (d2new<d1(na1) ) d2new = d1(na1)
399 if (d2new>d(idx(i))) d2new = d(idx(i))
404 if (d2new<d2(j))
then
415 qtrans(1,1) = c; qtrans(1,2) =-s
416 qtrans(2,1) = s; qtrans(2,2) = c
417 call transform_columns_&
419 (obj, idx(i), idx1(na1), na, tmp, l_rqs, l_rqe, &
420 q, matrixrows, matrixcols, l_rows, mpi_comm_cols, &
421 p_col, l_col, qtrans)
422 if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2
423 if (coltyp(idx(i))==3 .and. coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2
442 call check_monotony_&
444 &(obj, na1,d1,
'Sorted1', wantdebug, success)
445 if (.not.(success))
then
446 call obj%timer%stop(
"merge_systems" // precision_suffix)
449 call check_monotony_&
451 &(obj, na2,d2,
'Sorted2', wantdebug, success)
452 if (.not.(success))
then
453 call obj%timer%stop(
"merge_systems" // precision_suffix)
457 if (na1==1 .or. na1==2)
then
461 d(1) = d1(1) + rho*z1(1)**2
463 call obj%timer%start(
"lapack_laed5_x2")
464 call precision_laed5(1_blas_kind, d1, z1, qtrans(1,1), rho, d(1))
465 call precision_laed5(2_blas_kind, d1, z1, qtrans(1,2), rho, d(2))
466 call obj%timer%stop(
"lapack_laed5_x2")
467 call transform_columns_&
469 &(obj, idx1(1), idx1(2), na, tmp, l_rqs, l_rqe, q, &
470 matrixrows, matrixcols, l_rows, mpi_comm_cols, &
471 p_col, l_col, qtrans)
476 d(na1+1:na) = d2(1:na2)
479 call obj%timer%start(
"lapack_lamrg")
480 call precision_lamrg( int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &
481 1_blas_kind, 1_blas_kind, idxblas )
482 idx(:) = int(idxblas(:),kind=ik)
483 call obj%timer%stop(
"lapack_lamrg")
494 if (idx(i)<=na1)
then
495 idxq1(i) = idx1(idx(i))
497 idxq1(i) = idx2(idx(i)-na1)
502 &(obj, idxq1, na, na, p_col_out, q, matrixrows, matrixcols, l_rows, l_rqe, &
503 l_rqs, mpi_comm_cols, p_col, l_col, l_col_out)
510#ifdef WITH_OPENMP_TRADITIONAL
517 infoblas = int(info,kind=blas_kind)
525 DO i = my_proc+1, na1, n_procs
526 call obj%timer%start(
"lapack_laed4")
527 call precision_laed4(int(na1,kind=blas_kind), int(i,kind=blas_kind), d1, z1, delta, &
529 info = int(infoblas,kind=ik)
530 call obj%timer%stop(
"lapack_laed4")
535 call solve_secular_equation_&
537 &(obj, na1, i, d1, z1, delta, rho, s)
549 if (i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) )
556 if (abs(delta(i+1)) < abs(delta(i)))
then
558 ddiff(i) = delta(i+1)
578 call global_product_&
580 (obj, z, na1, mpi_comm_rows, mpi_comm_cols, npc_0, npc_n, success)
581 if (.not.(success))
then
582 write(error_unit,*)
"Error in global_product. Aborting..."
585 z(1:na1) = sign( sqrt( abs( z(1:na1) ) ), z1(1:na1) )
589 &(obj, dbase, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
590 if (.not.(success))
then
591 write(error_unit,*)
"Error in global_gather. Aborting..."
596 &(obj, ddiff, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
597 if (.not.(success))
then
598 write(error_unit,*)
"Error in global_gather. Aborting..."
601 d(1:na1) = dbase(1:na1) - ddiff(1:na1)
606#ifdef WITH_OPENMP_TRADITIONAL
608 call obj%timer%start(
"OpenMP parallel" // precision_suffix)
617 DO i = my_proc+1, na1, n_procs
627 &(obj, d1, dbase, ddiff, z, ev_scale(i), na1, i)
630#ifdef WITH_OPENMP_TRADITIONAL
633 call obj%timer%stop(
"OpenMP parallel" // precision_suffix)
639 &(obj, ev_scale, na1, mpi_comm_rows, mpi_comm_cols, npc_n, np_prev, np_next, success)
640 if (.not.(success))
then
641 write(error_unit,*)
"Error in global_gather. Aborting..."
645 d(na1+1:na) = d2(1:na2)
647 call obj%timer%start(
"lapack_lamrg")
649 call precision_lamrg(int(na1,kind=blas_kind), int(na-na1,kind=blas_kind), d, &
650 1_blas_kind, 1_blas_kind, idxblas )
651 idx(:) = int(idxblas(:),kind=ik)
652 call obj%timer%stop(
"lapack_lamrg")
658 call check_monotony_&
660 &(obj, na,d,
'Output', wantdebug, success)
662 if (.not.(success))
then
663 call obj%timer%stop(
"merge_systems" // precision_suffix)
668 num = 2 * size_of_int
669 successgpu = gpu_malloc(nnzul_dev, num)
670 check_alloc_gpu(
"merge_systems: ", successgpu)
672 num = na * size_of_int
673 successgpu = gpu_malloc(idxq1_dev, num)
674 check_alloc_gpu(
"merge_systems: ", successgpu)
676 num = na * size_of_int
677 successgpu = gpu_malloc(idx_dev, num)
678 check_alloc_gpu(
"merge_systems: idx_dev", successgpu)
680 num = na * size_of_int
681#ifdef WITH_GPU_STREAMS
682 my_stream = obj%gpu_setup%my_stream
683 successgpu = gpu_memcpy_async(idx_dev, int(loc(idx(1)),kind=c_intptr_t), &
684 num, gpumemcpyhosttodevice, my_stream)
685 check_memcpy_gpu(
"merge_systems: ", successgpu)
687 successgpu = gpu_memcpy(idx_dev, int(loc(idx(1)),kind=c_intptr_t), &
688 num, gpumemcpyhosttodevice)
689 check_memcpy_gpu(
"merge_systems: idx_dev", successgpu)
707 if (p_col_out(i)==my_pcol)
then
708 if (idx(i)<=na1)
then
720 num = na * size_of_int
721#ifdef WITH_GPU_STREAMS
722 my_stream = obj%gpu_setup%my_stream
723 successgpu = gpu_memcpy_async(idxq1_dev, int(loc(idxq1(1)),kind=c_intptr_t), &
724 num, gpumemcpyhosttodevice, my_stream)
725 check_memcpy_gpu(
"merge_systems: ", successgpu)
727 successgpu = gpu_memcpy(idxq1_dev, int(loc(idxq1(1)),kind=c_intptr_t), &
728 num, gpumemcpyhosttodevice)
729 check_memcpy_gpu(
"merge_systems: idxq1_dev", successgpu)
736 allocate(ndef_c(na), stat=istat, errmsg=errormessage)
737 check_allocate(
"merge_systems: ndef_c",istat, errormessage)
741 gemm_dim_k = max(1,l_rows)
742 gemm_dim_l = max_local_cols
743 gemm_dim_m = min(max_strip,max(1,nqcols1))
745 allocate(qtmp1(gemm_dim_k, gemm_dim_l), stat=istat, errmsg=errormessage)
746 check_allocate(
"merge_systems: qtmp1",istat, errormessage)
748 allocate(ev(gemm_dim_l,gemm_dim_m), stat=istat, errmsg=errormessage)
749 check_allocate(
"merge_systems: ev",istat, errormessage)
751 allocate(qtmp2(gemm_dim_k, gemm_dim_m), stat=istat, errmsg=errormessage)
752 check_allocate(
"merge_systems: qtmp2",istat, errormessage)
761 num = na * size_of_int
762 successgpu = gpu_malloc(ndef_c_dev, num)
763 check_alloc_gpu(
"merge_systems: ndef_c_dev", successgpu)
765 num = na * size_of_int
766 successgpu = gpu_malloc(idx1_dev, num)
767 check_alloc_gpu(
"merge_systems: idx1_dev", successgpu)
769 num = na * size_of_int
770 successgpu = gpu_malloc(p_col_dev, num)
771 check_alloc_gpu(
"merge_systems: p_col_dev", successgpu)
773 num = na * size_of_int
774 successgpu = gpu_malloc(p_col_out_dev, num)
775 check_alloc_gpu(
"merge_systems: p_col_out_dev", successgpu)
777 num = na * size_of_int
778 successgpu = gpu_malloc(coltyp_dev, num)
779 check_alloc_gpu(
"merge_systems: coltyp_dev", successgpu)
781 num = na * size_of_int
782 successgpu = gpu_malloc(idx2_dev, num)
783 check_alloc_gpu(
"merge_systems: idx2_dev", successgpu)
785 num = na * size_of_int
786 successgpu = gpu_malloc(l_col_out_dev, num)
787 check_alloc_gpu(
"merge_systems: l_col_out_dev", successgpu)
789 num = (na) * size_of_datatype
790 successgpu = gpu_malloc(z_dev, num)
791 check_alloc_gpu(
"merge_systems: z_dev", successgpu)
793 num = (na) * size_of_datatype
794 successgpu = gpu_malloc(d1_dev, num)
795 check_alloc_gpu(
"merge_systems: d1_dev", successgpu)
797 num = (na) * size_of_datatype
798 successgpu = gpu_malloc(d1u_dev, num)
799 check_alloc_gpu(
"merge_systems: d1u_dev", successgpu)
801 num = (na) * size_of_datatype
802 successgpu = gpu_malloc(dbase_dev, num)
803 check_alloc_gpu(
"merge_systems: dbase_dev", successgpu)
805 num = (na) * size_of_datatype
806 successgpu = gpu_malloc(ddiff_dev, num)
807 check_alloc_gpu(
"merge_systems: ddiff_dev", successgpu)
809 num = (na) * size_of_datatype
810 successgpu = gpu_malloc(zu_dev, num)
811 check_alloc_gpu(
"merge_systems: zu_dev", successgpu)
813 num = (na) * size_of_datatype
814 successgpu = gpu_malloc(ev_scale_dev, num)
815 check_alloc_gpu(
"merge_systems: ev_scale_dev", successgpu)
817 num = (na) * size_of_datatype
818 successgpu = gpu_malloc(d1l_dev, num)
819 check_alloc_gpu(
"merge_systems: d1l_dev", successgpu)
821 num = (na) * size_of_datatype
822 successgpu = gpu_malloc(zl_dev, num)
823 check_alloc_gpu(
"merge_systems: zl_dev", successgpu)
825 num = (matrixrows*matrixcols) * size_of_datatype
826 successgpu = gpu_malloc(q_dev, num)
827 check_alloc_gpu(
"merge_systems: q_dev", successgpu)
829 num = (gemm_dim_k * gemm_dim_l) * size_of_datatype
830#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
831 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu)
then
832 successgpu = gpu_host_register(int(loc(qtmp1),kind=c_intptr_t),num,&
833 gpuhostregisterdefault)
834 check_host_register_gpu(
"merge_systems: qtmp1", successgpu)
837 successgpu = gpu_malloc(qtmp1_dev, num)
838 check_alloc_gpu(
"merge_systems: qtmp1_dev", successgpu)
840 successgpu = gpu_malloc(qtmp1_tmp_dev, num)
841 check_alloc_gpu(
"merge_systems: qtmp1_tmp_dev", successgpu)
843 num = (gemm_dim_l * gemm_dim_m) * size_of_datatype
844#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
845 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu)
then
846 successgpu = gpu_host_register(int(loc(ev),kind=c_intptr_t),num,&
847 gpuhostregisterdefault)
848 check_host_register_gpu(
"merge_systems: ev", successgpu)
851 successgpu = gpu_malloc(ev_dev, num)
852 check_alloc_gpu(
"merge_systems: ev_dev", successgpu)
855 num = (gemm_dim_k * gemm_dim_m) * size_of_datatype
856#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
857 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu)
then
858 successgpu = gpu_host_register(int(loc(qtmp2),kind=c_intptr_t),num,&
859 gpuhostregisterdefault)
860 check_host_register_gpu(
"merge_systems: qtmp2", successgpu)
863 successgpu = gpu_malloc(qtmp2_dev, num)
864 check_alloc_gpu(
"merge_systems: qtmp2_dev", successgpu)
875 l_idx = l_col(idx1(i))
876 if (p_col(idx1(i))==my_pcol)
then
877 if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2)
then
879 qtmp1(1:l_rnm,nnzu) = q(l_rqs:l_rqm,l_idx)
881 if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2)
then
883 qtmp1(l_rnm+1:l_rows,nnzl) = q(l_rqm+1:l_rqe,l_idx)
890 num = gemm_dim_k * gemm_dim_l * size_of_datatype
891#ifdef WITH_GPU_STREAMS
892 my_stream = obj%gpu_setup%my_stream
893 successgpu = gpu_memcpy_async(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
894 num, gpumemcpyhosttodevice, my_stream)
895 check_memcpy_gpu(
"merge_systems: qtmp1_dev", successgpu)
897 successgpu = gpu_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
898 num, gpumemcpyhosttodevice)
899 check_memcpy_gpu(
"merge_systems: qtmp1_dev", successgpu)
903 num = matrixrows*matrixcols*size_of_datatype
904#ifdef WITH_GPU_STREAMS
905 successgpu = gpu_memcpy_async(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
906 num, gpumemcpyhosttodevice, my_stream)
907 check_memcpy_gpu(
"merge_systems: q_dev", successgpu)
909 successgpu = gpu_memcpy(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
910 num, gpumemcpyhosttodevice)
911 check_memcpy_gpu(
"merge_systems: q_dev", successgpu)
916 num = na * size_of_int
917 successgpu = gpu_malloc(l_col_dev, num)
918 check_alloc_gpu(
"merge_systems: l_col_dev", successgpu)
920 num = na * size_of_int
921#ifdef WITH_GPU_STREAMS
922 my_stream = obj%gpu_setup%my_stream
923 successgpu = gpu_memcpy_async(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), &
924 num, gpumemcpyhosttodevice, my_stream)
925 check_memcpy_gpu(
"merge_systems: ", successgpu)
927 successgpu = gpu_memcpy(l_col_dev, int(loc(l_col(1)),kind=c_intptr_t), &
928 num, gpumemcpyhosttodevice)
929 check_memcpy_gpu(
"merge_systems: l_col_dev", successgpu)
936 ndef = max(nnzu,nnzl)
939 num = na * size_of_int
940#ifdef WITH_GPU_STREAMS
941 my_stream = obj%gpu_setup%my_stream
942 successgpu = gpu_memcpy_async(idx2_dev, int(loc(idx2(1)),kind=c_intptr_t), &
943 num, gpumemcpyhosttodevice, my_stream)
944 check_memcpy_gpu(
"merge_systems: ", successgpu)
946 successgpu = gpu_memcpy(idx2_dev, int(loc(idx2(1)),kind=c_intptr_t), &
947 num, gpumemcpyhosttodevice)
948 check_memcpy_gpu(
"merge_systems: idx2_dev", successgpu)
950 num = na * size_of_int
951#ifdef WITH_GPU_STREAMS
952 my_stream = obj%gpu_setup%my_stream
954 successgpu = gpu_memcpy_async(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
955 num, gpumemcpyhosttodevice, my_stream)
956 check_memcpy_gpu(
"merge_systems: ", successgpu)
958 successgpu = gpu_memcpy(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
959 num, gpumemcpyhosttodevice)
960 check_memcpy_gpu(
"merge_systems: p_col_dev", successgpu)
969 num = na * size_of_int
970#ifdef WITH_GPU_STREAMS
971 my_stream = obj%gpu_setup%my_stream
972 successgpu = gpu_memcpy_async(ndef_c_dev, int(loc(ndef_c(1)),kind=c_intptr_t), &
973 num, gpumemcpyhosttodevice, my_stream)
974 check_memcpy_gpu(
"merge_systems: ndef_c_dev 4", successgpu)
977 successgpu = gpu_memcpy(ndef_c_dev, int(loc(ndef_c(1)),kind=c_intptr_t), &
978 num, gpumemcpyhosttodevice)
979 check_memcpy_gpu(
"merge_systems: ndef_c_dev", successgpu)
982 call gpu_copy_q_slice_to_qtmp1_precision (qtmp1_dev, q_dev, ndef_c_dev, l_col_dev, idx2_dev, p_col_dev, na2, na, &
983 my_pcol, l_rows, l_rqs, l_rqe, matrixrows, gemm_dim_k, my_stream)
986 l_idx = l_col(idx2(i))
987 if (p_col(idx2(i))==my_pcol)
then
989 qtmp1(1:l_rows,ndef) = q(l_rqs:l_rqe,l_idx)
996 num = na * size_of_int
997#ifdef WITH_GPU_STREAMS
998 my_stream = obj%gpu_setup%my_stream
999 successgpu = gpu_memcpy_async(p_col_out_dev, int(loc(p_col_out(1)),kind=c_intptr_t), &
1000 num, gpumemcpyhosttodevice, my_stream)
1001 check_memcpy_gpu(
"merge_systems: ", successgpu)
1004 successgpu = gpu_memcpy(p_col_out_dev, int(loc(p_col_out(1)),kind=c_intptr_t), &
1005 num, gpumemcpyhosttodevice)
1006 check_memcpy_gpu(
"merge_systems: p_col_out_dev", successgpu)
1008 num = na * size_of_int
1009#ifdef WITH_GPU_STREAMS
1010 my_stream = obj%gpu_setup%my_stream
1011 successgpu = gpu_memcpy_async(l_col_out_dev, int(loc(l_col_out(1)),kind=c_intptr_t), &
1012 num, gpumemcpyhosttodevice, my_stream)
1013 check_memcpy_gpu(
"merge_systems: l_col_out_dev", successgpu)
1016 successgpu = gpu_memcpy(l_col_out_dev, int(loc(l_col_out(1)),kind=c_intptr_t), &
1017 num, gpumemcpyhosttodevice)
1018 check_memcpy_gpu(
"merge_systems: l_col_out_dev", successgpu)
1027 call gpu_zero_q_precision (q_dev, p_col_out_dev, l_col_out_dev, na, my_pcol, l_rqs, l_rqe, matrixrows, my_stream)
1030 if(p_col_out(i)==my_pcol) q(l_rqs:l_rqe,l_col_out(i)) = 0
1037 num = na * size_of_int
1038#ifdef WITH_GPU_STREAMS
1039 my_stream = obj%gpu_setup%my_stream
1040 successgpu = gpu_memcpy_async(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), &
1041 num, gpumemcpyhosttodevice, my_stream)
1042 check_memcpy_gpu(
"merge_systems: ", successgpu)
1044 successgpu = gpu_memcpy(idx1_dev, int(loc(idx1(1)),kind=c_intptr_t), &
1045 num, gpumemcpyhosttodevice)
1046 check_memcpy_gpu(
"merge_systems: idx1_dev", successgpu)
1049 num = na * size_of_int
1050#ifdef WITH_GPU_STREAMS
1051 my_stream = obj%gpu_setup%my_stream
1052 successgpu = gpu_memcpy_async(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
1053 num, gpumemcpyhosttodevice, my_stream)
1054 check_memcpy_gpu(
"merge_systems: ", successgpu)
1056 successgpu = gpu_memcpy(p_col_dev, int(loc(p_col(1)),kind=c_intptr_t), &
1057 num, gpumemcpyhosttodevice)
1058 check_memcpy_gpu(
"merge_systems: p_col_dev", successgpu)
1061 num = na * size_of_int
1062#ifdef WITH_GPU_STREAMS
1063 my_stream = obj%gpu_setup%my_stream
1064 successgpu = gpu_memcpy_async(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), &
1065 num, gpumemcpyhosttodevice, my_stream)
1066 check_memcpy_gpu(
"merge_systems: ", successgpu)
1068 successgpu = gpu_memcpy(coltyp_dev, int(loc(coltyp(1)),kind=c_intptr_t), &
1069 num, gpumemcpyhosttodevice)
1070 check_memcpy_gpu(
"merge_systems: coltyp_dev", successgpu)
1073 num = na*size_of_datatype
1074#ifdef WITH_GPU_STREAMS
1075 my_stream = obj%gpu_setup%my_stream
1076 successgpu = gpu_memcpy_async(ev_scale_dev, int(loc(ev_scale(1)),kind=c_intptr_t), &
1077 num, gpumemcpyhosttodevice, my_stream)
1078 check_memcpy_gpu(
"merge_systems: ev_scale_dev", successgpu)
1080 successgpu = gpu_memcpy(ev_scale_dev, int(loc(ev_scale(1)),kind=c_intptr_t), &
1081 num, gpumemcpyhosttodevice)
1082 check_memcpy_gpu(
"merge_systems: ev_scale_dev", successgpu)
1085 num = na*size_of_datatype
1086#ifdef WITH_GPU_STREAMS
1087 my_stream = obj%gpu_setup%my_stream
1088 successgpu = gpu_memcpy_async(z_dev, int(loc(z(1)),kind=c_intptr_t), &
1089 num, gpumemcpyhosttodevice, my_stream)
1090 check_memcpy_gpu(
"merge_systems: z_dev", successgpu)
1092 successgpu = gpu_memcpy(z_dev, int(loc(z(1)),kind=c_intptr_t), &
1093 num, gpumemcpyhosttodevice)
1094 check_memcpy_gpu(
"merge_systems: z_dev", successgpu)
1097 num = na*size_of_datatype
1098#ifdef WITH_GPU_STREAMS
1099 my_stream = obj%gpu_setup%my_stream
1100 successgpu = gpu_memcpy_async(d1_dev, int(loc(d1(1)),kind=c_intptr_t), &
1101 num, gpumemcpyhosttodevice, my_stream)
1103 successgpu = gpu_memcpy(d1_dev, int(loc(d1(1)),kind=c_intptr_t), &
1104 num, gpumemcpyhosttodevice)
1105 check_memcpy_gpu(
"merge_systems: d1_dev", successgpu)
1108 num = gemm_dim_l * gemm_dim_m * size_of_datatype
1109#ifdef WITH_GPU_STREAMS
1110 my_stream = obj%gpu_setup%my_stream
1111 successgpu = gpu_memcpy_async(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
1112 num, gpumemcpyhosttodevice, my_stream)
1113 check_memcpy_gpu(
"merge_systems: ev_dev", successgpu)
1117 successgpu = gpu_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
1118 num, gpumemcpyhosttodevice)
1119 check_memcpy_gpu(
"merge_systems: ev_dev", successgpu)
1122 num = na*size_of_datatype
1123#ifdef WITH_GPU_STREAMS
1124 my_stream = obj%gpu_setup%my_stream
1125 successgpu = gpu_memcpy_async(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), &
1126 num, gpumemcpyhosttodevice, my_stream)
1127 check_memcpy_gpu(
"merge_systems: dbase_dev", successgpu)
1129 successgpu = gpu_memcpy(dbase_dev, int(loc(dbase(1)),kind=c_intptr_t), &
1130 num, gpumemcpyhosttodevice)
1131 check_memcpy_gpu(
"merge_systems: dbase_dev", successgpu)
1134 num = na*size_of_datatype
1135#ifdef WITH_GPU_STREAMS
1136 successgpu = gpu_stream_synchronize(my_stream)
1137 successgpu = gpu_memcpy_async(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), &
1138 num, gpumemcpyhosttodevice, my_stream)
1139 check_memcpy_gpu(
"merge_systems: ddiff_dev", successgpu)
1141 successgpu = gpu_memcpy(ddiff_dev, int(loc(ddiff(1)),kind=c_intptr_t), &
1142 num, gpumemcpyhosttodevice)
1143 check_memcpy_gpu(
"merge_systems: ddiff_dev", successgpu)
1147 allocate(nnzu_val(na1,npc_n))
1148 allocate(nnzl_val(na1,npc_n))
1154 num = na1 * npc_n* size_of_int
1155 successgpu = gpu_malloc(nnzu_val_dev, num)
1156 check_alloc_gpu(
"merge_systems: nnzu_val_dev", successgpu)
1158 num = na1 * npc_n* size_of_int
1159 successgpu = gpu_malloc(nnzl_val_dev, num)
1160 check_alloc_gpu(
"merge_systems: nnzl_val_dev", successgpu)
1168 if (np_rem == npc_0)
then
1169 np_rem = npc_0+npc_n-1
1177 call gpu_compute_nnzl_nnzu_val_part1 (p_col_dev, idx1_dev, coltyp_dev, nnzu_val_dev, nnzl_val_dev, &
1178 na, na1, np_rem, npc_n, nnzu_save, nnzl_save, np, my_stream)
1186 call gpu_compute_nnzl_nnzu_val_part2 (nnzu_val_dev, nnzl_val_dev, na, na1, nnzu_start, nnzl_start, npc_n, my_stream)
1191 if (np_rem == npc_0)
then
1192 np_rem = npc_0+npc_n-1
1200 if (p_col(idx1(i)) == np_rem)
then
1201 if (coltyp(idx1(i)) == 1 .or. coltyp(idx1(i)) == 2)
then
1203 nnzu_val(i,np) = nnzu
1205 if (coltyp(idx1(i)) == 3 .or. coltyp(idx1(i)) == 2)
then
1207 nnzl_val(i,np) = nnzl
1225 if (np_rem == npc_0)
then
1226 np_rem = npc_0+npc_n-1
1232#if defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL)
1233 my_stream = obj%gpu_setup%my_stream
1234 call gpu_copy_qtmp1_to_qtmp1_tmp_precision (qtmp1_dev, qtmp1_tmp_dev, gemm_dim_k, gemm_dim_l, my_stream)
1236 call obj%timer%start(
"ccl_send_recv")
1237 ccl_comm_cols = obj%gpu_setup%ccl_comm_cols
1238 successgpu = ccl_group_start()
1239 if (.not.successgpu)
then
1240 print *,
"Error in setting up nccl_group_start!"
1243 successgpu = ccl_send(qtmp1_tmp_dev, int(l_rows*max_local_cols,kind=c_size_t), &
1244#ifdef DOUBLE_PRECISION
1247#ifdef SINGLE_PRECISION
1250 np_next, ccl_comm_cols, my_stream)
1251 if (.not.successgpu)
then
1252 print *,
"Error in nccl_send"
1255 successgpu = ccl_recv(qtmp1_dev, int(l_rows*max_local_cols,kind=c_size_t), &
1256#ifdef DOUBLE_PRECISION
1259#ifdef SINGLE_PRECISION
1262 np_prev, ccl_comm_cols, my_stream)
1265 if (.not.successgpu)
then
1266 print *,
"Error in ccl_send/ccl_recv"
1269 successgpu = ccl_group_end()
1270 if (.not.successgpu)
then
1271 print *,
"Error in setting up ccl_group_end!"
1274 successgpu = gpu_stream_synchronize(my_stream)
1275 check_stream_synchronize_gpu(
"trans_ev", successgpu)
1276 call obj%timer%stop(
"ccl_send_recv")
1278#else /* defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL) */
1281 call obj%timer%start(
"mpi_communication")
1282#ifdef WITH_GPU_STREAMS
1283 my_stream = obj%gpu_setup%my_stream
1284 successgpu = gpu_stream_synchronize(my_stream)
1285 check_stream_synchronize_gpu(
"merge_systems qtmp1_dev", successgpu)
1287 successgpu = gpu_memcpy_async(int(loc(qtmp1(1,1)),kind=c_intptr_t), qtmp1_dev, &
1288 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpydevicetohost, my_stream)
1289 check_memcpy_gpu(
"merge_systems: qtmp1_dev", successgpu)
1291 my_stream = obj%gpu_setup%my_stream
1292 successgpu = gpu_stream_synchronize(my_stream)
1293 check_stream_synchronize_gpu(
"merge_systems: qtmp1_dev", successgpu)
1295 successgpu = gpu_stream_synchronize()
1296 check_stream_synchronize_gpu(
"merge_systems: qtmp1_dev", successgpu)
1299 successgpu = gpu_memcpy(int(loc(qtmp1(1,1)),kind=c_intptr_t), qtmp1_dev, &
1300 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpydevicetohost)
1301 check_memcpy_gpu(
"merge_systems: qtmp1_dev", successgpu)
1305 call mpi_sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=mpi_kind), mpi_real_precision, &
1306 int(np_next,kind=mpi_kind), 1111_mpi_kind, int(np_prev,kind=mpi_kind), &
1307 1111_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
1308#ifdef WITH_GPU_STREAMS
1309 my_stream = obj%gpu_setup%my_stream
1310 successgpu = gpu_stream_synchronize(my_stream)
1311 check_stream_synchronize_gpu(
"merge_systems qtmp1_dev", successgpu)
1313 successgpu = gpu_memcpy_async(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
1314 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpyhosttodevice, my_stream)
1315 check_memcpy_gpu(
"merge_systems: qtmp1_dev", successgpu)
1317 my_stream = obj%gpu_setup%my_stream
1318 successgpu = gpu_stream_synchronize(my_stream)
1319 check_stream_synchronize_gpu(
"merge_systems: qtmp1_dev", successgpu)
1321 successgpu = gpu_stream_synchronize()
1322 check_stream_synchronize_gpu(
"merge_systems: qtmp1_dev", successgpu)
1325 successgpu = gpu_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
1326 gemm_dim_k * gemm_dim_l * size_of_datatype, gpumemcpyhosttodevice)
1327 check_memcpy_gpu(
"merge_systems: qtmp1_dev", successgpu)
1329 call obj%timer%stop(
"mpi_communication")
1330#endif /* WITH_MPI */
1332#endif /* defined(WITH_NVIDIA_NCCL) || defined(WITH_AMD_RCCL) */
1335 call obj%timer%start(
"mpi_communication")
1336 call mpi_sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=mpi_kind), mpi_real_precision, &
1337 int(np_next,kind=mpi_kind), 1111_mpi_kind, int(np_prev,kind=mpi_kind), &
1338 1111_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
1339 call obj%timer%stop(
"mpi_communication")
1340#endif /* WITH_MPI */
1352 my_stream = obj%gpu_setup%my_stream
1353 call gpu_fill_tmp_arrays_precision (idx1_dev, p_col_dev, coltyp_dev, nnzu_val_dev, nnzl_val_dev, nnzul_dev, &
1354 d1u_dev, d1_dev, zu_dev, z_dev, d1l_dev, zl_dev, na, np, na1, np_rem, my_stream)
1356 num = 2* size_of_int
1357#ifdef WITH_GPU_STREAMS
1358 successgpu = gpu_memcpy_async(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, &
1359 num, gpumemcpydevicetohost, my_stream)
1360 check_memcpy_gpu(
"merge_systems: nnzul_dev", successgpu)
1362 successgpu = gpu_memcpy(int(loc(nnzul(1)),kind=c_intptr_t), nnzul_dev, &
1363 num, gpumemcpydevicetohost)
1364 check_memcpy_gpu(
"merge_systems: nnzl_val", successgpu)
1371 if (p_col(idx1(i)) == np_rem)
then
1372 if (coltyp(idx1(i)) == 1 .or. coltyp(idx1(i)) == 2)
then
1377 if (coltyp(idx1(i)) == 3 .or. coltyp(idx1(i)) == 2)
then
1389 ndef = max(nnzu,nnzl)
1392 call gpu_update_ndef_c(ndef_c_dev, idx_dev, p_col_dev, idx2_dev, na, na1, np_rem, ndef, my_stream)
1396 ndef = max(nnzu,nnzl)
1398 call gpu_copy_qtmp1_slice_to_q_precision (q_dev, qtmp1_dev, l_col_out_dev, p_col_out_dev, ndef_c_dev, p_col_dev, &
1399 idx2_dev, idx_dev, l_rqs, l_rqe, l_rows, matrixrows, &
1400 gemm_dim_k, my_pcol, na1, np_rem, na, my_stream)
1402 ndef = max(nnzu,nnzl)
1406 if (p_col(idx2(j-na1)) == np_rem)
then
1408 if (p_col_out(i) == my_pcol)
then
1409 q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef)
1418 do ns = 0, nqcols1-1, max_strip
1419 ncnt = min(max_strip,nqcols1-ns)
1423 call gpu_copy_q_slice_to_qtmp2_precision (q_dev, qtmp2_dev, idxq1_dev, l_col_out_dev, l_rows, l_rqs, l_rqe, &
1424 matrixrows, matrixcols, gemm_dim_k, gemm_dim_m, ns, &
1425 ncnt, ind_ex, ind_ex2, na, my_stream)
1434 qtmp2(1:l_rows,i) = q(l_rqs:l_rqe, k)
1442 if (nnzu .ge. 1)
then
1445 call gpu_fill_ev_precision (ev_dev, d1u_dev, dbase_dev, ddiff_dev, zu_dev, ev_scale_dev, idxq1_dev, idx_dev, &
1446 na, gemm_dim_l, gemm_dim_m, nnzu, ns, ncnt, my_stream)
1456 j = idx(idxq1(i+ns))
1461 tmp(k) = d1u(k) - dbase(j)
1462 tmp(k) = tmp(k) + ddiff(j)
1463 ev(k,i) = zu(k) / tmp(k) * ev_scale(j)
1472 if (l_rnm>0 .and. ncnt>0 .and. nnzu>0)
then
1474 call obj%timer%start(
"gpublas_gemm")
1475 gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
1476 call gpublas_precision_gemm(
'N',
'N', l_rnm, ncnt, nnzu, &
1477 1.0_rk, qtmp1_dev, ubound(qtmp1,dim=1), &
1478 ev_dev, ubound(ev,dim=1), &
1479 1.0_rk, qtmp2_dev, ubound(qtmp2,dim=1), gpuhandle)
1480 if (wantdebug) successgpu = gpu_devicesynchronize()
1481 call obj%timer%stop(
"gpublas_gemm")
1483 call obj%timer%start(
"blas_gemm")
1484 call precision_gemm(
'N',
'N', int(l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &
1485 int(nnzu,kind=blas_kind), &
1486 1.0_rk, qtmp1, int(ubound(qtmp1,dim=1),kind=blas_kind), &
1487 ev, int(ubound(ev,dim=1),kind=blas_kind), &
1488 1.0_rk, qtmp2(1,1), int(ubound(qtmp2,dim=1),kind=blas_kind))
1489 call obj%timer%stop(
"blas_gemm")
1501 if (nnzl .ge. 1)
then
1502 call gpu_fill_ev_precision (ev_dev, d1l_dev, dbase_dev, ddiff_dev, zl_dev, ev_scale_dev, idxq1_dev, idx_dev, &
1503 na, gemm_dim_l, gemm_dim_m, nnzl, ns, ncnt, my_stream)
1510 j = idx(idxq1(i+ns))
1513 tmp(k) = d1l(k) - dbase(j)
1514 tmp(k) = tmp(k) + ddiff(j)
1515 ev(k,i) = zl(k) / tmp(k) * ev_scale(j)
1524 if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0)
then
1526 call obj%timer%start(
"gpublas_gemm")
1527 gpuhandle = obj%gpu_setup%gpublasHandleArray(0)
1528 call gpublas_precision_gemm(
'N',
'N', l_rows-l_rnm, ncnt, nnzl, &
1529 1.0_rk, qtmp1_dev + l_rnm * size_of_datatype, ubound(qtmp1,dim=1), &
1530 ev_dev, ubound(ev,dim=1), &
1531 1.0_rk, qtmp2_dev + l_rnm * size_of_datatype, ubound(qtmp2,dim=1), gpuhandle)
1532 if (wantdebug) successgpu = gpu_devicesynchronize()
1533 call obj%timer%stop(
"gpublas_gemm")
1535 call obj%timer%start(
"blas_gemm")
1536 call precision_gemm(
'N',
'N', int(l_rows-l_rnm,kind=blas_kind), int(ncnt,kind=blas_kind), &
1537 int(nnzl,kind=blas_kind), &
1538 1.0_rk, qtmp1(l_rnm+1,1), int(ubound(qtmp1,dim=1),kind=blas_kind), &
1539 ev, int(ubound(ev,dim=1),kind=blas_kind), &
1540 1.0_rk, qtmp2(l_rnm+1,1), int(ubound(qtmp2,dim=1),kind=blas_kind))
1541 call obj%timer%stop(
"blas_gemm")
1548 call gpu_copy_qtmp2_slice_to_q_precision(q_dev, qtmp2_dev, idxq1_dev, l_col_out_dev, l_rqs, l_rqe, l_rows, ncnt, &
1549 gemm_dim_k, matrixrows, ns, my_stream)
1556 q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i)
1566 deallocate(nnzu_val, nnzl_val)
1571 num = matrixrows*matrixcols*size_of_datatype
1572#ifdef WITH_GPU_STREAMS
1573 successgpu = gpu_memcpy_async(int(loc(q(1,1)),kind=c_intptr_t), q_dev, &
1574 num, gpumemcpydevicetohost, my_stream)
1575 check_memcpy_gpu(
"merge_systems: q_dev", successgpu)
1577 my_stream = obj%gpu_setup%my_stream
1578 successgpu = gpu_stream_synchronize(my_stream)
1579 check_stream_synchronize_gpu(
"merge_systems: q_dev", successgpu)
1581 successgpu = gpu_memcpy(int(loc(q(1,1)),kind=c_intptr_t), q_dev, &
1582 num, gpumemcpydevicetohost)
1583 check_memcpy_gpu(
"merge_systems: q_dev", successgpu)
1585#ifdef WITH_GPU_STREAMS
1586 successgpu = gpu_memcpy_async(int(loc(ev(1,1)),kind=c_intptr_t), ev_dev, &
1587 gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpydevicetohost, my_stream)
1588 check_memcpy_gpu(
"merge_systems: ev_dev", successgpu)
1590 successgpu = gpu_stream_synchronize(my_stream)
1591 check_stream_synchronize_gpu(
"merge_systems: ev_dev", successgpu)
1595 successgpu = gpu_memcpy(int(loc(ev(1,1)),kind=c_intptr_t), ev_dev, &
1596 gemm_dim_l * gemm_dim_m * size_of_datatype, gpumemcpydevicetohost)
1597 check_memcpy_gpu(
"merge_systems: ev_dev", successgpu)
1602 deallocate(ndef_c, stat=istat, errmsg=errormessage)
1603 check_deallocate(
"merge_systems: ndef_c",istat, errormessage)
1607 successgpu = gpu_free(nnzul_dev)
1608 check_dealloc_gpu(
"merge_systems: nnzul_dev", successgpu)
1610 successgpu = gpu_free(l_col_dev)
1611 check_dealloc_gpu(
"merge_systems: l_col_dev", successgpu)
1613 successgpu = gpu_free(ndef_c_dev)
1614 check_dealloc_gpu(
"merge_systems: ndef_c_dev", successgpu)
1616 successgpu = gpu_free(nnzu_val_dev)
1617 check_dealloc_gpu(
"merge_systems: nnzu_val_dev", successgpu)
1619 successgpu = gpu_free(nnzl_val_dev)
1620 check_dealloc_gpu(
"merge_systems: nnzl_val_dev", successgpu)
1622 successgpu = gpu_free(idx1_dev)
1623 check_dealloc_gpu(
"merge_systems: idx1_dev", successgpu)
1625 successgpu = gpu_free(idx2_dev)
1626 check_dealloc_gpu(
"merge_systems: idx2_dev", successgpu)
1628 successgpu = gpu_free(p_col_dev)
1629 check_dealloc_gpu(
"merge_systems: p_col_dev", successgpu)
1631 successgpu = gpu_free(p_col_out_dev)
1632 check_dealloc_gpu(
"merge_systems: p_col_out_dev", successgpu)
1634 successgpu = gpu_free(coltyp_dev)
1635 check_dealloc_gpu(
"merge_systems: coltyp_dev", successgpu)
1637 successgpu = gpu_free(idx_dev)
1638 check_dealloc_gpu(
"merge_systems: idx_dev", successgpu)
1640 successgpu = gpu_free(l_col_out_dev)
1641 check_dealloc_gpu(
"merge_systems: l_col_out_dev", successgpu)
1643 successgpu = gpu_free(idxq1_dev)
1644 check_dealloc_gpu(
"merge_systems: ", successgpu)
1646 successgpu = gpu_free(q_dev)
1647 check_dealloc_gpu(
"merge_systems: q_dev", successgpu)
1649 successgpu = gpu_free(d1_dev)
1650 check_dealloc_gpu(
"merge_systems: d1_dev", successgpu)
1652 successgpu = gpu_free(z_dev)
1653 check_dealloc_gpu(
"merge_systems: z_dev", successgpu)
1655 successgpu = gpu_free(d1u_dev)
1656 check_dealloc_gpu(
"merge_systems: d1u_dev", successgpu)
1658 successgpu = gpu_free(dbase_dev)
1659 check_dealloc_gpu(
"merge_systems: dbase_dev", successgpu)
1661 successgpu = gpu_free(ddiff_dev)
1662 check_dealloc_gpu(
"merge_systems: ddiff_dev", successgpu)
1664 successgpu = gpu_free(zu_dev)
1665 check_dealloc_gpu(
"merge_systems: zu_dev", successgpu)
1667 successgpu = gpu_free(ev_scale_dev)
1668 check_dealloc_gpu(
"merge_systems: ev_scale_dev", successgpu)
1670 successgpu = gpu_free(d1l_dev)
1671 check_dealloc_gpu(
"merge_systems: d1l_dev", successgpu)
1673 successgpu = gpu_free(zl_dev)
1674 check_dealloc_gpu(
"merge_systems: zl_dev", successgpu)
1677#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
1678 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu)
then
1679 successgpu = gpu_host_unregister(int(loc(qtmp1),kind=c_intptr_t))
1680 check_host_unregister_gpu(
"merge_systems: qtmp1", successgpu)
1683 successgpu = gpu_free(qtmp1_dev)
1684 check_dealloc_gpu(
"merge_systems: qtmp1_dev", successgpu)
1686 successgpu = gpu_free(qtmp1_tmp_dev)
1687 check_dealloc_gpu(
"merge_systems: qtmp1_tmp_dev", successgpu)
1689#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
1690 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu)
then
1691 successgpu = gpu_host_unregister(int(loc(qtmp2),kind=c_intptr_t))
1692 check_host_unregister_gpu(
"merge_systems: qtmp2", successgpu)
1695 successgpu = gpu_free(qtmp2_dev)
1696 check_dealloc_gpu(
"merge_systems: qtmp2_dev", successgpu)
1698#if defined(WITH_NVIDIA_GPU_VERSION) || defined(WITH_AMD_GPU_VERSION) || defined(WITH_OPENMP_OFFLOAD_GPU_VERSION) || defined(WITH_SYCL_GPU_VERSION)
1699 if (gpu_vendor() /= openmp_offload_gpu .and. gpu_vendor() /= sycl_gpu)
then
1700 successgpu = gpu_host_unregister(int(loc(ev),kind=c_intptr_t))
1701 check_host_unregister_gpu(
"merge_systems: ev", successgpu)
1704 successgpu = gpu_free(ev_dev)
1705 check_dealloc_gpu(
"merge_systems: ev_dev", successgpu)
1708 deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errormessage)
1709 check_deallocate(
"merge_systems: ev, qtmp1, qtmp2",istat, errormessage)
1711#ifdef WITH_OPENMP_TRADITIONAL
1712 deallocate(z_p, stat=istat, errmsg=errormessage)
1713 check_deallocate(
"merge_systems: z_p",istat, errormessage)
1716 call obj%timer%stop(
"merge_systems" // precision_suffix)