Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2025.01.002
Loading...
Searching...
No Matches
elpa_pxgemm_transpose_template.F90
Go to the documentation of this file.
1#if 0
2! This file is part of ELPA.
3!
4! The ELPA library was originally created by the ELPA consortium,
5! consisting of the following organizations:
6!
7! - Max Planck Computing and Data Facility (MPCDF), formerly known as
8! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
9! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
10! Informatik,
11! - Technische Universität München, Lehrstuhl für Informatik mit
12! Schwerpunkt Wissenschaftliches Rechnen ,
13! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
15! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
16! and
17! - IBM Deutschland GmbH
18!
19!
20! More information can be found here:
21! http://elpa.mpcdf.mpg.de/
22!
23! ELPA is free software: you can redistribute it and/or modify
24! it under the terms of the version 3 of the license of the
25! GNU Lesser General Public License as published by the Free
26! Software Foundation.
27!
28! ELPA is distributed in the hope that it will be useful,
29! but WITHOUT ANY WARRANTY; without even the implied warranty of
30! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31! GNU Lesser General Public License for more details.
32!
33! You should have received a copy of the GNU Lesser General Public License
34! along with ELPA. If not, see <http://www.gnu.org/licenses/>
35!
36! ELPA reflects a substantial effort on the part of the original
37! ELPA consortium, and we ask you to respect the spirit of the
38! license that we chose: i.e., please contribute any changes you
39! may have back to the original ELPA library distribution, and keep
40! any derivatives of ELPA under the same license that we chose for
41! the original distribution, the GNU Lesser General Public License.
42!
43! Author: Peter Karpov, MPCDF
44#endif
45
46#include "config-f90.h"
47#include "../general/error_checking.inc"
48
49
50! transposes a row (col) of nblk-blocks of a matrix, along with the LCM-copies of the row
51! there are two versions: 1) with CCL; 2) without CCL (CPU or GPU+MPI)
52! row_col_char = R/r or C/c if we transpose a row or a column, respectively
53subroutine elpa_transpose_row_or_col&
54 &ccl&
55 &math_datatype&
56 &_&
57 &precision &
58 (obj, row_col_char, &
59#ifdef USE_CCL_PXGEMM
60 a_dev, at_dev, buf_send_dev, buf_recv_dev, buf_self_dev, &
61#else /* USE_CCL_PXGEMM */
62 a, at, buf_send, buf_recv, buf_self, &
63#endif /* USE_CCL_PXGEMM */
64 np_fine, l_rows, l_cols, nblk_mult_rows_max, nblk_mult_cols_max, debug)
65
66 use, intrinsic :: iso_c_binding
67 use precision
68 use elpa_mpi
70 use elpa_utilities, only : least_common_multiple, check_memcpy_gpu_f
72 use elpa_gpu
73 use elpa_ccl_gpu
74 use multiply_a_b_gpu
75#if defined(WITH_NVIDIA_GPU_VERSION) && defined(WITH_NVTX)
76 use cuda_functions ! for NVTX labels
77#elif defined(WITH_AMD_GPU_VERSION) && defined(WITH_ROCTX)
78 use hip_functions ! for ROCTX labels
79#endif
80 implicit none
81
82#include "../../src/general/precision_kinds.F90"
83 class(elpa_abstract_impl_t), intent(inout) :: obj
84 character(len=1), intent(in) :: row_col_char
85 integer(kind=ik), intent(in) :: np_fine, l_rows, l_cols, nblk_mult_rows_max, nblk_mult_cols_max
86#ifdef USE_CCL_PXGEMM
87 math_datatype(kind=rck), allocatable :: a(:,:), at(:,:), buf_send(:,:), buf_recv(:,:), buf_self(:,:) ! dummy variables
88 integer(kind=c_intptr_t) :: a_dev, at_dev, buf_send_dev, buf_recv_dev, buf_self_dev
89#else /* USE_CCL_PXGEMM */
90 math_datatype(kind=rck) :: a(l_rows,l_cols), at(l_rows,l_cols), &
91 buf_send(nblk_mult_rows_max, nblk_mult_cols_max), &
92 buf_recv(nblk_mult_rows_max, nblk_mult_cols_max), &
93 buf_self(nblk_mult_rows_max, nblk_mult_cols_max)
94 integer(kind=c_intptr_t) :: a_dev, at_dev, buf_send_dev, buf_recv_dev, buf_self_dev
95#endif /* USE_CCL_PXGEMM */
96
97 logical :: row_transposed, col_transposed
98 integer(kind=ik) :: l_dirs, l_dirs_t
99 integer(kind=ik) :: nblk, debug
100
101 ! MPI-related
102 integer(kind=MPI_KIND) :: mpierr
103 integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, myid
104 integer(kind=ik) :: my_pdir, my_pdir_t, np_dirs, np_dirs_t
105 integer(kind=ik) :: mpi_comm_all
106 integer(kind=ik) :: matrix_order
107 integer(kind=ik) :: my_mpi_rank, mpi_rank_target, mpi_rank_source, &
108 my_pdir_target, my_pdir_t_target, &
109 my_pdir_source, my_pdir_t_source, &
110 my_prow_target, my_pcol_target, &
111 my_prow_source, my_pcol_source, &
112 my_pdir_target_deadlock
113
114 integer(kind=ik) :: np_rows_fine, np_cols_fine, np_dirs_fine, np_dirs_t_fine, np_t_fine, np_bc_fine, &
115 np_t_fine_1, nblk_mult_dirs_1, np_fine_1, nblk_mult_dirs_t_1, &
116 m_blocks_loc_fine, mt_blocks_loc_fine, &
117 m_blocks_loc_fine_1, mt_blocks_loc_fine_1, &
118 np_t_fine_1_start
119 integer(kind=ik) :: LCM, nblk_mult_rows, nblk_mult_cols, &
120 nblk_mult_dirs, nblk_mult_dirs_t, &
121 i_block_loc_fine, j_block_loc_fine, it_block_loc_fine, &
122 i_block_loc, j_block_loc, it_block_loc
123 integer(kind=ik) :: nblk_cut_row, nblk_cut_col
124 integer(kind=ik) :: nblk_cut_dir, nblk_cut_dir_t
125 integer(kind=ik) :: lld_buf
126 integer(kind=ik) :: i_block_loc_fine_max, j_block_loc_fine_max
127 integer(kind=ik) :: np, np_t
128 integer(kind=ik) :: error
129 integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
130 &precision&
131 &_&
132 &math_datatype
133
134 ! GPU-related
135 logical :: successGPU, useCCL
136 integer(kind=c_intptr_t) :: my_stream
137 integer(kind=ik) :: SM_count
138
139 integer(kind=c_intptr_t) :: ccl_comm_all
140 integer(kind=c_int) :: cclDataType
141 integer(kind=ik) :: k_datatype
142
143 call obj%timer%start("elpa_transpose_row")
144
145 if (row_col_char == 'R' .or. row_col_char == 'r') then
146 row_transposed = .true.
147 col_transposed = .false.
148 else if (row_col_char == 'C' .or. row_col_char == 'c') then
149 row_transposed = .false.
150 col_transposed = .true.
151 else
152 print *, "elpa_transpose_row_or_col: row_col_char must be 'R'/'r' or 'C'/'c'. Aborting..."
153 stop 1
154 endif
155
156 ! success = .true.
157 useccl = .false.
158
159 call obj%get("matrix_order", matrix_order, error)
160 if (error .ne. elpa_ok) then
161 print *, "elpa_pxgemm_multiply_transpose: Problem getting option matrix_order. Aborting..."
162 stop 1
163 endif
164
165 !na = obj%na
166 nblk = obj%nblk
167 !lda = obj%local_nrows
168 !ldaCols = obj%local_ncols
169
170 mpi_comm_all = obj%mpi_setup%mpi_comm_parent
171
172 myid = obj%mpi_setup%myRank_comm_parent
173 my_prow = obj%mpi_setup%myRank_comm_rows
174 my_pcol = obj%mpi_setup%myRank_comm_cols
175
176 np_rows = obj%mpi_setup%nRanks_comm_rows
177 np_cols = obj%mpi_setup%nRanks_comm_cols
178
179 lcm = least_common_multiple(np_rows, np_cols)*nblk
180
181#if defined(USE_CCL_PXGEMM)
182 useccl = .true.
183
184 my_stream = obj%gpu_setup%my_stream
185 ccl_comm_all = obj%gpu_setup%ccl_comm_all
186 sm_count = obj%gpu_setup%gpuSMcount
187
188#if REALCASE == 1 && defined(DOUBLE_PRECISION)
189 ccldatatype = ccldouble
190 k_datatype = 1
191#elif REALCASE == 1 && defined(SINGLE_PRECISION)
192 ccldatatype = cclfloat
193 k_datatype = 1
194#elif COMPLEXCASE == 1 && defined(DOUBLE_PRECISION)
195 ccldatatype = ccldouble
196 k_datatype = 2
197#elif COMPLEXCASE == 1 && defined(SINGLE_PRECISION)
198 ccldatatype = cclfloat
199 k_datatype = 2
200#endif
201#endif /* defined(USE_CCL_PXGEMM) */
202
203 np_bc_fine = np_fine
204 nblk_mult_rows = find_nblk_mult_dirs(l_rows, nblk, np_rows, np_fine , lcm)
205 nblk_mult_cols = find_nblk_mult_dirs(l_cols, nblk, np_cols, np_bc_fine, lcm)
206
207 if (row_transposed) then
208 ! dir=row
209 my_pdir = my_prow
210 my_pdir_t = my_pcol
211 np_dirs = np_rows
212 np_dirs_t = np_cols
213 l_dirs = l_rows
214 l_dirs_t = l_cols
215 nblk_mult_dirs = nblk_mult_rows
216 nblk_mult_dirs_t = nblk_mult_cols
217 else
218 ! dir=col
219 my_pdir = my_pcol
220 my_pdir_t = my_prow
221 np_dirs = np_cols
222 np_dirs_t = np_rows
223 l_dirs = l_cols
224 l_dirs_t = l_rows
225 nblk_mult_dirs = nblk_mult_cols
226 nblk_mult_dirs_t = nblk_mult_rows
227 endif
228
229 np_bc_fine = np_fine
230 np_t_fine = np_fine
231 np_rows_fine = least_common_multiple(np_rows, np_cols)
232 np_cols_fine = np_rows_fine
233 np_dirs_fine = np_rows_fine
234 np_dirs_t_fine = np_rows_fine
235
236#ifdef WITH_NVTX
237 call nvtxrangepush("transpose row/col")
238#endif
239 ! a -> at: transpose block-row (block-col) #np_fine of a
240 ! Send
241 if (mod(np_fine,np_dirs) == my_pdir) then
242
243 my_pdir_t_target = mod(np_t_fine,np_dirs_t)
244
245 ! we send to the process (mod(np_fine,np_rows), mod(np_bc_fine,np_cols)) in last turn
246 ! to avoid the deadlock
247 my_pdir_target_deadlock = mod(np_fine,np_dirs)
248
249 np_t_fine_1 = my_pdir_t
250 np_t_fine_1_start = mod(np_t_fine_1, np_dirs_t_fine)
251 ! dry run: to find, whether there is a potential deadlock
252 do np_t_fine_1 = my_pdir_t, np_dirs_t_fine-1, np_dirs_t
253 np_fine_1 = np_t_fine_1
254 my_pdir_target = mod(np_fine_1, np_dirs)
255 if (my_pdir_target==my_pdir_target_deadlock) then
256 np_t_fine_1_start = mod(np_t_fine_1+np_dirs_t, np_dirs_fine)
257 exit
258 endif
259 enddo
260
261 np_t_fine_1 = np_t_fine_1_start
262 do ! np_t_fine_1 periodic loop
263 np_fine_1 = np_t_fine_1
264 my_pdir_target = mod(np_fine_1, np_dirs)
265
266 if (row_transposed) then
267 my_prow_target = my_pdir_target
268 my_pcol_target = my_pdir_t_target
269 else
270 my_prow_target = my_pdir_t_target
271 my_pcol_target = my_pdir_target
272 endif
273
274 if (matrix_order==column_major_order) then
275 mpi_rank_target = my_prow_target + np_rows*my_pcol_target
276 else
277 mpi_rank_target = my_pcol_target + np_cols*my_prow_target
278 endif
279
280 nblk_mult_dirs_t_1 = find_nblk_mult_dirs(l_dirs_t, nblk, np_dirs_t, np_t_fine_1, lcm)
281
282 mt_blocks_loc_fine_1 = (nblk_mult_dirs_t_1+nblk-1)/nblk ! number of complete and incomplete blocks that with fine-grained process np_t_fine_1
283 m_blocks_loc_fine = (nblk_mult_dirs +nblk-1)/nblk
284
285 if (row_transposed) then
286 j_block_loc_fine_max = mt_blocks_loc_fine_1 - 1
287 i_block_loc_fine_max = m_blocks_loc_fine - 1
288 np_t = np_t_fine_1
289 np = np_fine
290 else
291 j_block_loc_fine_max = m_blocks_loc_fine - 1
292 i_block_loc_fine_max = mt_blocks_loc_fine_1 - 1
293 np_t = np_t_fine
294 np = np_fine_1
295 endif
296
297 if (useccl) then
298 ! PETERDEBUG: changes needed here? Modification of kernel?
299 lld_buf = nblk_mult_rows_max
300 call gpu_ccl_copy_buf_send(precision_char, a_dev, buf_send_dev, l_rows, l_cols, lld_buf, &
301 nblk, i_block_loc_fine_max, j_block_loc_fine_max, np, np_t, &
302 np_rows_fine, np_cols_fine, np_rows, np_cols, sm_count, debug, my_stream)
303 else ! useCCL
304 ! The nested loop is symmetric wrt to i,j, so we use the rigid order of indices for convenience of copying
305 do j_block_loc_fine = 0, j_block_loc_fine_max
306 j_block_loc = (np_t + j_block_loc_fine*np_cols_fine)/np_cols
307 nblk_cut_col = min(nblk, l_cols-j_block_loc*nblk)
308
309 do i_block_loc_fine = 0, i_block_loc_fine_max
310 i_block_loc = (np + i_block_loc_fine*np_rows_fine)/np_rows
311 nblk_cut_row = min(nblk, l_rows-i_block_loc*nblk)
312
313 buf_send(1+ i_block_loc_fine*nblk: nblk_cut_row + i_block_loc_fine*nblk, &
314 1+ j_block_loc_fine*nblk: nblk_cut_col + j_block_loc_fine*nblk) = &
315 a(1+ i_block_loc *nblk: nblk_cut_row + i_block_loc *nblk, &
316 1+ j_block_loc *nblk: nblk_cut_col + j_block_loc *nblk)
317 enddo ! i_block_loc_fine
318 enddo ! j_block_loc_fine
319
320 endif ! useCCL
321
322 ! PETERDEBUG: we send extra data to resolve the problem of continuity of the data.
323 ! Alternatively, we could make buf_send and buf_recv to be 1D arrays of blocks (still 2D array of elements, so convenient to copy)
324 if (useccl) then
325 if (mpi_rank_target/=myid) then
326 successgpu = gpu_stream_synchronize(my_stream)
327 check_stream_synchronize_gpu("elpa_pxgemm: ccl_send", successgpu)
328
329 successgpu = ccl_send(buf_send_dev, int(k_datatype*nblk_mult_rows_max*nblk_mult_cols_max,kind=c_size_t), &
330 ccldatatype, mpi_rank_target, ccl_comm_all, my_stream)
331
332 if (.not. successgpu) then
333 print *,"Error in ccl_send"
334 stop 1
335 endif
336
337 successgpu = gpu_stream_synchronize(my_stream)
338 check_stream_synchronize_gpu("elpa_pxgemm: ccl_send", successgpu)
339 else
340 ! PETERDEBUG: optimize memory usage - copy directly to at_dev (kernel needed or use gpu_ccl_copy_buf_recv)
341 ! buf_self_dev = buf_send_dev
342 successgpu = gpu_memcpy(buf_self_dev, buf_send_dev, nblk_mult_rows_max*nblk_mult_cols_max*size_of_datatype, &
343 gpumemcpydevicetodevice)
344 check_memcpy_gpu("elpa_pxgemm: buf_self_dev <- buf_send_dev", successgpu)
345 endif
346 else ! useCCL
347 if (mpi_rank_target/=myid) then
348#ifdef WITH_MPI
349 call mpi_send(buf_send, int(nblk_mult_rows_max*nblk_mult_cols_max, kind=mpi_kind), &
350 mpi_math_datatype_precision, int(mpi_rank_target, kind=mpi_kind), 0, &
351 int(mpi_comm_all, kind=mpi_kind), mpierr)
352#endif
353 else
354 buf_self = buf_send
355 endif
356 endif ! useCCL
357 np_t_fine_1 = mod(np_t_fine_1+np_dirs_t, np_dirs_t_fine)
358 if (np_t_fine_1 == np_t_fine_1_start) exit
359 enddo ! np_t_fine_1 periodic loop
360 endif ! (mod(np_fine,np_cols) == my_pcol)
361
362 ! Recv
363 if (mod(np_t_fine,np_dirs_t) == my_pdir_t) then
364 my_pdir_source = mod(np_fine, np_dirs)
365
366 do np_fine_1 = my_pdir, np_dirs_fine-1, np_dirs
367 np_t_fine_1 = np_fine_1
368 my_pdir_t_source = mod(np_t_fine_1, np_dirs_t)
369
370 if (row_transposed) then
371 my_prow_source = my_pdir_source
372 my_pcol_source = my_pdir_t_source
373 else
374 my_prow_source = my_pdir_t_source
375 my_pcol_source = my_pdir_source
376 endif
377
378 if (matrix_order==column_major_order) then
379 mpi_rank_source = my_prow_source + np_rows*my_pcol_source
380 else
381 mpi_rank_source = my_pcol_source + np_cols*my_prow_source
382 endif
383
384 nblk_mult_dirs_1 = find_nblk_mult_dirs(l_dirs, nblk, np_dirs, np_fine_1, lcm)
385
386 m_blocks_loc_fine_1 = (nblk_mult_dirs_1+nblk-1)/nblk
387 mt_blocks_loc_fine = (nblk_mult_dirs_t+nblk-1)/nblk
388
389 if (row_transposed) then
390 j_block_loc_fine_max = mt_blocks_loc_fine - 1
391 i_block_loc_fine_max = m_blocks_loc_fine_1 - 1
392 np = np_fine_1
393 np_t = np_t_fine
394 else
395 j_block_loc_fine_max = m_blocks_loc_fine_1 - 1
396 i_block_loc_fine_max = mt_blocks_loc_fine - 1
397 np = np_fine
398 np_t = np_t_fine_1
399 endif
400
401 if (useccl) then
402 if (mpi_rank_source/=myid) then
403 successgpu = ccl_recv(buf_recv_dev, int(k_datatype*nblk_mult_rows_max*nblk_mult_cols_max,kind=c_size_t), &
404 ccldatatype, mpi_rank_source, ccl_comm_all, my_stream)
405
406 if (.not. successgpu) then
407 print *,"Error in ccl_recv"
408 stop 1
409 endif
410
411 successgpu = gpu_stream_synchronize(my_stream)
412 check_stream_synchronize_gpu("elpa_pxgemm: ccl_recv", successgpu)
413 else
414 ! buf_recv_dev = buf_self_dev
415 successgpu = gpu_memcpy(buf_recv_dev, buf_self_dev, nblk_mult_rows_max*nblk_mult_cols_max*size_of_datatype, &
416 gpumemcpydevicetodevice)
417 check_memcpy_gpu("elpa_pxgemm: buf_recv_dev <- buf_self_dev", successgpu)
418 endif
419 else ! useCCL
420 if (mpi_rank_source/=myid) then
421#ifdef WITH_MPI
422 call mpi_recv(buf_recv, int(nblk_mult_rows_max*nblk_mult_cols_max, kind=mpi_kind), &
423 mpi_math_datatype_precision, int(mpi_rank_source, kind=mpi_kind), 0, &
424 int(mpi_comm_all, kind=mpi_kind), mpi_status_ignore, mpierr)
425#endif
426 else
427 buf_recv = buf_self
428 endif
429 endif ! useCCL
430
431 if (useccl) then
432 lld_buf = nblk_mult_rows_max
433 call gpu_ccl_copy_buf_recv(precision_char, at_dev, buf_recv_dev, l_rows, l_cols, &
434 lld_buf, nblk, i_block_loc_fine_max, j_block_loc_fine_max, np, np_t, &
435 np_rows_fine, np_cols_fine, np_rows, np_cols, sm_count, debug, my_stream)
436 else ! useCCL
437 do i_block_loc_fine = 0, i_block_loc_fine_max
438 i_block_loc = (np + i_block_loc_fine*np_rows_fine)/np_rows
439 nblk_cut_row = min(nblk, l_rows-i_block_loc*nblk)
440
441 do j_block_loc_fine = 0, j_block_loc_fine_max
442 j_block_loc = (np_t + j_block_loc_fine*np_cols_fine)/np_cols
443 nblk_cut_col = min(nblk, l_cols-j_block_loc*nblk)
444
445 at(1+ i_block_loc *nblk: nblk_cut_row + i_block_loc *nblk, &
446 1+ j_block_loc *nblk: nblk_cut_col + j_block_loc *nblk) = &
447#if defined (REALCASE)
448 transpose(buf_recv(1+ j_block_loc_fine*nblk: nblk_cut_col + j_block_loc_fine*nblk, &
449 1+ i_block_loc_fine*nblk: nblk_cut_row + i_block_loc_fine*nblk))
450#else
451 conjg(transpose(buf_recv(1+ j_block_loc_fine*nblk: nblk_cut_col + j_block_loc_fine*nblk, &
452 1+ i_block_loc_fine*nblk: nblk_cut_row + i_block_loc_fine*nblk)) )
453#endif
454 enddo ! j_block_loc_fine
455 enddo ! i_block_loc_fine
456
457 endif ! useCCL
458
459 enddo ! np_fine_1
460 endif ! (mod(np_t_fine,np_dirs_t) == my_pdir)
461
462 if (debug==1) successgpu = gpu_devicesynchronize()
463 call obj%timer%stop("elpa_transpose_row")
464
465#ifdef WITH_NVTX
466 call nvtxrangepop() ! transpose row/col
467#endif
468
469end subroutine
470
Fortran module to provide an abstract definition of the implementation. Do not use directly....
Definition elpa_abstract_impl.F90:50
Definition mod_elpa_pxgemm_helpers.F90:46
integer function find_nblk_mult_dirs(l_dirs, nblk, np_dirs, np_dir_fine, lcm)
Definition mod_elpa_pxgemm_helpers.F90:56
Definition elpa_abstract_impl.F90:73