subroutine elpa_transpose_row_or_col&
#if defined(USE_CCL_PXGEMM)
              a_dev, at_dev, buf_send_dev, buf_recv_dev, buf_self_dev, &
#else /* USE_CCL_PXGEMM */
              a, at, buf_send, buf_recv, buf_self, &
#endif /* USE_CCL_PXGEMM */
              np_fine, l_rows, l_cols, nblk_mult_rows_max, nblk_mult_cols_max, debug)
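  ! Transposes the locally held part of the distributed matrix a into at, either along the
  ! process rows ('R'/'r') or along the process columns ('C'/'c'): local blocks are packed into
  ! buf_send, exchanged with the rank holding the transposed position (via CCL on the GPU path,
  ! via MPI otherwise, or via buf_self for the rank's own data), and unpacked transposed
  ! (conjugate-transposed in the complex case) into at.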
  use, intrinsic :: iso_c_binding
  use elpa_utilities, only : least_common_multiple, check_memcpy_gpu_f
#if defined(WITH_NVIDIA_GPU_VERSION) && defined(WITH_NVTX)
#elif defined(WITH_AMD_GPU_VERSION) && defined(WITH_ROCTX)
#include "../../src/general/precision_kinds.F90"
  character(len=1), intent(in)           :: row_col_char
  integer(kind=ik), intent(in)           :: np_fine, l_rows, l_cols, nblk_mult_rows_max, nblk_mult_cols_max
#if defined(USE_CCL_PXGEMM)
  MATH_DATATYPE(kind=rck), allocatable   :: a(:,:), at(:,:), buf_send(:,:), buf_recv(:,:), buf_self(:,:)
  integer(kind=c_intptr_t)               :: a_dev, at_dev, buf_send_dev, buf_recv_dev, buf_self_dev
#else /* USE_CCL_PXGEMM */
  MATH_DATATYPE(kind=rck)                :: a(l_rows,l_cols), at(l_rows,l_cols), &
                                            buf_send(nblk_mult_rows_max, nblk_mult_cols_max), &
                                            buf_recv(nblk_mult_rows_max, nblk_mult_cols_max), &
                                            buf_self(nblk_mult_rows_max, nblk_mult_cols_max)
  integer(kind=c_intptr_t)               :: a_dev, at_dev, buf_send_dev, buf_recv_dev, buf_self_dev
#endif /* USE_CCL_PXGEMM */
  logical                                :: row_transposed, col_transposed
  integer(kind=ik)                       :: l_dirs, l_dirs_t
  integer(kind=ik)                       :: nblk, debug
  integer(kind=MPI_KIND)                 :: mpierr
  integer(kind=ik)                       :: my_prow, my_pcol, np_rows, np_cols, myid
  integer(kind=ik)                       :: my_pdir, my_pdir_t, np_dirs, np_dirs_t
  integer(kind=ik)                       :: mpi_comm_all
  integer(kind=ik)                       :: matrix_order
  integer(kind=ik)                       :: my_mpi_rank, mpi_rank_target, mpi_rank_source, &
                                            my_pdir_target, my_pdir_t_target, &
                                            my_pdir_source, my_pdir_t_source, &
                                            my_prow_target, my_pcol_target, &
                                            my_prow_source, my_pcol_source, &
                                            my_pdir_target_deadlock
  integer(kind=ik)                       :: np_rows_fine, np_cols_fine, np_dirs_fine, np_dirs_t_fine, np_t_fine, np_bc_fine, &
                                            np_t_fine_1, nblk_mult_dirs_1, np_fine_1, nblk_mult_dirs_t_1, &
                                            m_blocks_loc_fine, mt_blocks_loc_fine, &
                                            m_blocks_loc_fine_1, mt_blocks_loc_fine_1, &
                                            np_t_fine_1_start
  integer(kind=ik)                       :: LCM, nblk_mult_rows, nblk_mult_cols, &
                                            nblk_mult_dirs, nblk_mult_dirs_t, &
                                            i_block_loc_fine, j_block_loc_fine, it_block_loc_fine, &
                                            i_block_loc, j_block_loc, it_block_loc
  integer(kind=ik)                       :: nblk_cut_row, nblk_cut_col
  integer(kind=ik)                       :: nblk_cut_dir, nblk_cut_dir_t
  integer(kind=ik)                       :: lld_buf
  integer(kind=ik)                       :: i_block_loc_fine_max, j_block_loc_fine_max
  integer(kind=ik)                       :: np, np_t
  integer(kind=ik)                       :: error
  integer(kind=c_intptr_t), parameter    :: size_of_datatype = size_of_&
  logical                                :: successGPU, useCCL
  integer(kind=c_intptr_t)               :: my_stream
  integer(kind=ik)                       :: SM_count
  integer(kind=c_intptr_t)               :: ccl_comm_all
  integer(kind=c_int)                    :: cclDataType
  integer(kind=ik)                       :: k_datatype
  call obj%timer%start("elpa_transpose_row")
  if (row_col_char == 'R' .or. row_col_char == 'r') then
    row_transposed = .true.
    col_transposed = .false.
  else if (row_col_char == 'C' .or. row_col_char == 'c') then
    row_transposed = .false.
    col_transposed = .true.
  else
    print *, "elpa_transpose_row_or_col: row_col_char must be 'R'/'r' or 'C'/'c'. Aborting..."
  call obj%get("matrix_order", matrix_order, error)
  if (error .ne. ELPA_OK) then
    print *, "elpa_transpose_row_or_col: Problem getting option matrix_order. Aborting..."
  mpi_comm_all = obj%mpi_setup%mpi_comm_parent

  myid    = obj%mpi_setup%myRank_comm_parent
  my_prow = obj%mpi_setup%myRank_comm_rows
  my_pcol = obj%mpi_setup%myRank_comm_cols

  np_rows = obj%mpi_setup%nRanks_comm_rows
  np_cols = obj%mpi_setup%nRanks_comm_cols

  LCM = least_common_multiple(np_rows, np_cols)*nblk
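  ! (LCM is the period of the block-cyclic distribution: the least common multiple of the
  !  process-grid extents, expressed in matrix elements, i.e. in units of the block size nblk.)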
#if defined(USE_CCL_PXGEMM)
  my_stream    = obj%gpu_setup%my_stream
  ccl_comm_all = obj%gpu_setup%ccl_comm_all
  SM_count     = obj%gpu_setup%gpuSMcount

#if   REALCASE == 1 && defined(DOUBLE_PRECISION)
  cclDataType = cclDouble
#elif REALCASE == 1 && defined(SINGLE_PRECISION)
  cclDataType = cclFloat
#elif COMPLEXCASE == 1 && defined(DOUBLE_PRECISION)
  cclDataType = cclDouble
#elif COMPLEXCASE == 1 && defined(SINGLE_PRECISION)
  cclDataType = cclFloat
#endif
#endif /* defined(USE_CCL_PXGEMM) */
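  ! Express the transpose in terms of a generic "direction": for a row transpose the direction
  ! runs along the process rows (the transposed direction along the columns); for a column
  ! transpose the roles are swapped.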
  if (row_transposed) then
    nblk_mult_dirs   = nblk_mult_rows
    nblk_mult_dirs_t = nblk_mult_cols
  else
    nblk_mult_dirs   = nblk_mult_cols
    nblk_mult_dirs_t = nblk_mult_rows
  np_rows_fine   = least_common_multiple(np_rows, np_cols)
  np_cols_fine   = np_rows_fine
  np_dirs_fine   = np_rows_fine
  np_dirs_t_fine = np_rows_fine
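  ! All "fine" grid extents coincide with LCM(np_rows, np_cols), which makes block ownership in
  ! the original and in the transposed layout comparable on one common virtual process grid.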
  call nvtxRangePush("transpose row/col")
  if (mod(np_fine,np_dirs) == my_pdir) then
    my_pdir_t_target = mod(np_t_fine,np_dirs_t)

    my_pdir_target_deadlock = mod(np_fine,np_dirs)

    np_t_fine_1 = my_pdir_t
    np_t_fine_1_start = mod(np_t_fine_1, np_dirs_t_fine)
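    ! Choose the start of the cyclic loop over fine transposed indices so that the exchange with
    ! the partner flagged via my_pdir_target_deadlock is handled last (deadlock avoidance).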
    do np_t_fine_1 = my_pdir_t, np_dirs_t_fine-1, np_dirs_t
      np_fine_1 = np_t_fine_1
      my_pdir_target = mod(np_fine_1, np_dirs)
      if (my_pdir_target == my_pdir_target_deadlock) then
        np_t_fine_1_start = mod(np_t_fine_1+np_dirs_t, np_dirs_fine)
      endif
    enddo

    np_t_fine_1 = np_t_fine_1_start
    do ! np_t_fine_1
      np_fine_1 = np_t_fine_1
      my_pdir_target = mod(np_fine_1, np_dirs)
      if (row_transposed) then
        my_prow_target = my_pdir_target
        my_pcol_target = my_pdir_t_target
      else
        my_prow_target = my_pdir_t_target
        my_pcol_target = my_pdir_target
      endif

      if (matrix_order == COLUMN_MAJOR_ORDER) then
        mpi_rank_target = my_prow_target + np_rows*my_pcol_target
      else
        mpi_rank_target = my_pcol_target + np_cols*my_prow_target
      endif
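      ! Number of nblk-sized blocks held locally in each direction, and the corresponding
      ! maximal fine block indices used when packing buf_send.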
      mt_blocks_loc_fine_1 = (nblk_mult_dirs_t_1+nblk-1)/nblk
      m_blocks_loc_fine    = (nblk_mult_dirs    +nblk-1)/nblk

      if (row_transposed) then
        j_block_loc_fine_max = mt_blocks_loc_fine_1 - 1
        i_block_loc_fine_max = m_blocks_loc_fine    - 1
      else
        j_block_loc_fine_max = m_blocks_loc_fine    - 1
        i_block_loc_fine_max = mt_blocks_loc_fine_1 - 1
      endif
      lld_buf = nblk_mult_rows_max
      call gpu_ccl_copy_buf_send(PRECISION_CHAR, a_dev, buf_send_dev, l_rows, l_cols, lld_buf, &
                                 nblk, i_block_loc_fine_max, j_block_loc_fine_max, np, np_t, &
                                 np_rows_fine, np_cols_fine, np_rows, np_cols, SM_count, debug, my_stream)
      do j_block_loc_fine = 0, j_block_loc_fine_max
        j_block_loc = (np_t + j_block_loc_fine*np_cols_fine)/np_cols
        nblk_cut_col = min(nblk, l_cols-j_block_loc*nblk)

        do i_block_loc_fine = 0, i_block_loc_fine_max
          i_block_loc = (np + i_block_loc_fine*np_rows_fine)/np_rows
          nblk_cut_row = min(nblk, l_rows-i_block_loc*nblk)

          buf_send(1+ i_block_loc_fine*nblk : nblk_cut_row + i_block_loc_fine*nblk, &
                   1+ j_block_loc_fine*nblk : nblk_cut_col + j_block_loc_fine*nblk) = &
                 a(1+ i_block_loc     *nblk : nblk_cut_row + i_block_loc     *nblk, &
                   1+ j_block_loc     *nblk : nblk_cut_col + j_block_loc     *nblk)
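      ! Ship the packed buffer: on the CCL path directly from device memory, otherwise with
      ! MPI_Send from the host buffer; if the target rank is this rank itself, the data is kept
      ! in buf_self / buf_self_dev instead (device-to-device copy on the GPU path).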
      if (mpi_rank_target /= myid) then
        successGPU = gpu_stream_synchronize(my_stream)
        check_stream_synchronize_gpu("elpa_pxgemm: ccl_send", successGPU)

        successGPU = ccl_send(buf_send_dev, int(k_datatype*nblk_mult_rows_max*nblk_mult_cols_max,kind=c_size_t), &
                              cclDataType, mpi_rank_target, ccl_comm_all, my_stream)

        if (.not. successGPU) then
          print *, "Error in ccl_send"
        endif

        successGPU = gpu_stream_synchronize(my_stream)
        check_stream_synchronize_gpu("elpa_pxgemm: ccl_send", successGPU)
      else
        successGPU = gpu_memcpy(buf_self_dev, buf_send_dev, nblk_mult_rows_max*nblk_mult_cols_max*size_of_datatype, &
                                gpuMemcpyDeviceToDevice)
        check_memcpy_gpu("elpa_pxgemm: buf_self_dev <- buf_send_dev", successGPU)
      endif
      if (mpi_rank_target /= myid) then
        call MPI_Send(buf_send, int(nblk_mult_rows_max*nblk_mult_cols_max, kind=MPI_KIND), &
                      MPI_MATH_DATATYPE_PRECISION, int(mpi_rank_target, kind=MPI_KIND), 0, &
                      int(mpi_comm_all, kind=MPI_KIND), mpierr)
      np_t_fine_1 = mod(np_t_fine_1+np_dirs_t, np_dirs_t_fine)
      if (np_t_fine_1 == np_t_fine_1_start) exit
    enddo ! np_t_fine_1
  endif ! sender: mod(np_fine,np_dirs) == my_pdir
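  ! Receiver side: ranks sitting at the transposed position of np_fine receive the buffers sent
  ! above (or take them from buf_self when source and target coincide).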
  if (mod(np_t_fine,np_dirs_t) == my_pdir_t) then
    my_pdir_source = mod(np_fine, np_dirs)

    do np_fine_1 = my_pdir, np_dirs_fine-1, np_dirs
      np_t_fine_1 = np_fine_1
      my_pdir_t_source = mod(np_t_fine_1, np_dirs_t)
      if (row_transposed) then
        my_prow_source = my_pdir_source
        my_pcol_source = my_pdir_t_source
      else
        my_prow_source = my_pdir_t_source
        my_pcol_source = my_pdir_source
      endif

      if (matrix_order == COLUMN_MAJOR_ORDER) then
        mpi_rank_source = my_prow_source + np_rows*my_pcol_source
      else
        mpi_rank_source = my_pcol_source + np_cols*my_prow_source
      endif
      m_blocks_loc_fine_1 = (nblk_mult_dirs_1 +nblk-1)/nblk
      mt_blocks_loc_fine  = (nblk_mult_dirs_t +nblk-1)/nblk

      if (row_transposed) then
        j_block_loc_fine_max = mt_blocks_loc_fine  - 1
        i_block_loc_fine_max = m_blocks_loc_fine_1 - 1
      else
        j_block_loc_fine_max = m_blocks_loc_fine_1 - 1
        i_block_loc_fine_max = mt_blocks_loc_fine  - 1
      endif
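      ! Receive the buffer: ccl_recv into device memory on the CCL path, MPI_Recv into the host
      ! buffer otherwise; when the source is this rank itself, copy from buf_self instead.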
      if (mpi_rank_source /= myid) then
        successGPU = ccl_recv(buf_recv_dev, int(k_datatype*nblk_mult_rows_max*nblk_mult_cols_max,kind=c_size_t), &
                              cclDataType, mpi_rank_source, ccl_comm_all, my_stream)

        if (.not. successGPU) then
          print *, "Error in ccl_recv"
        endif

        successGPU = gpu_stream_synchronize(my_stream)
        check_stream_synchronize_gpu("elpa_pxgemm: ccl_recv", successGPU)
      else
        successGPU = gpu_memcpy(buf_recv_dev, buf_self_dev, nblk_mult_rows_max*nblk_mult_cols_max*size_of_datatype, &
                                gpuMemcpyDeviceToDevice)
        check_memcpy_gpu("elpa_pxgemm: buf_recv_dev <- buf_self_dev", successGPU)
      endif
      if (mpi_rank_source /= myid) then
        call MPI_Recv(buf_recv, int(nblk_mult_rows_max*nblk_mult_cols_max, kind=MPI_KIND), &
                      MPI_MATH_DATATYPE_PRECISION, int(mpi_rank_source, kind=MPI_KIND), 0, &
                      int(mpi_comm_all, kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
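      ! Unpack into at, transposing each block (and conjugating in the complex case); on the
      ! CCL/GPU path this step is handled by the gpu_ccl_copy_buf_recv kernel below.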
      lld_buf = nblk_mult_rows_max
      call gpu_ccl_copy_buf_recv(PRECISION_CHAR, at_dev, buf_recv_dev, l_rows, l_cols, &
                                 lld_buf, nblk, i_block_loc_fine_max, j_block_loc_fine_max, np, np_t, &
                                 np_rows_fine, np_cols_fine, np_rows, np_cols, SM_count, debug, my_stream)
      do i_block_loc_fine = 0, i_block_loc_fine_max
        i_block_loc = (np + i_block_loc_fine*np_rows_fine)/np_rows
        nblk_cut_row = min(nblk, l_rows-i_block_loc*nblk)

        do j_block_loc_fine = 0, j_block_loc_fine_max
          j_block_loc = (np_t + j_block_loc_fine*np_cols_fine)/np_cols
          nblk_cut_col = min(nblk, l_cols-j_block_loc*nblk)

          at(1+ i_block_loc*nblk : nblk_cut_row + i_block_loc*nblk, &
             1+ j_block_loc*nblk : nblk_cut_col + j_block_loc*nblk) = &
#if defined(REALCASE)
                transpose(buf_recv(1+ j_block_loc_fine*nblk : nblk_cut_col + j_block_loc_fine*nblk, &
                                   1+ i_block_loc_fine*nblk : nblk_cut_row + i_block_loc_fine*nblk))
#else /* REALCASE */
          conjg(transpose(buf_recv(1+ j_block_loc_fine*nblk : nblk_cut_col + j_block_loc_fine*nblk, &
                                   1+ i_block_loc_fine*nblk : nblk_cut_row + i_block_loc_fine*nblk)))
  if (debug == 1) successGPU = gpu_devicesynchronize()
  call obj%timer%stop("elpa_transpose_row")