Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2024.05.001
cannon_back_template.h
1// This file is part of ELPA.
2//
3// The ELPA library was originally created by the ELPA consortium,
4// consisting of the following organizations:
5//
6// - Max Planck Computing and Data Facility (MPCDF), formerly known as
7// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9// Informatik,
10// - Technische Universität München, Lehrstuhl für Informatik mit
11// Schwerpunkt Wissenschaftliches Rechnen,
12// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
14// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
15// and
16// - IBM Deutschland GmbH
17//
18// This particular source code file has been developed within the ELPA-AEO
19// project, which has been a joint effort of
20//
21// - Max Planck Computing and Data Facility (MPCDF), formerly known as
22// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
23// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
24// Informatik,
25// - Technische Universität München, Lehrstuhl für Informatik mit
26// Schwerpunkt Wissenschaftliches Rechnen,
27// - Technische Universität München, Lehrstuhl für Theoretische Chemie,
28// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
29
30// More information can be found here:
31// http://elpa.mpcdf.mpg.de/ and
32// http://elpa-aeo.mpcdf.mpg.de
33//
34// ELPA is free software: you can redistribute it and/or modify
35// it under the terms of the version 3 of the license of the
36// GNU Lesser General Public License as published by the Free
37// Software Foundation.
38//
39// ELPA is distributed in the hope that it will be useful,
40// but WITHOUT ANY WARRANTY; without even the implied warranty of
41// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
42// GNU Lesser General Public License for more details.
43//
44// You should have received a copy of the GNU Lesser General Public License
45// along with ELPA. If not, see <http://www.gnu.org/licenses/>
46//
47// ELPA reflects a substantial effort on the part of the original
48// ELPA consortium, and we ask you to respect the spirit of the
49// license that we chose: i.e., please contribute any changes you
50// may have back to the original ELPA library distribution, and keep
51// any derivatives of ELPA under the same license that we chose for
52// the original distribution, the GNU Lesser General Public License.
53//
54// Author: Valeriy Manin (Bergische Universität Wuppertal)
55// integrated into the ELPA library by Pavel Kus, Andreas Marek (MPCDF)
56// ported to GPU by Peter Karpov (MPCDF)
57
58// it seems that we need these two levels of indirection to correctly expand macros
59#define cannons_triang_rectangular_impl_expand2(SUFFIX) cannons_triang_rectangular_##SUFFIX
60#define cannons_triang_rectangular_impl_expand1(SUFFIX) cannons_triang_rectangular_impl_expand2(SUFFIX)
61#define cannons_triang_rectangular_impl cannons_triang_rectangular_impl_expand1(ELPA_IMPL_SUFFIX)
62
63#define cannons_triang_rectangular_c_impl_expand2(SUFFIX) cannons_triang_rectangular_c_##SUFFIX
64#define cannons_triang_rectangular_c_impl_expand1(SUFFIX) cannons_triang_rectangular_c_impl_expand2(SUFFIX)
65#define cannons_triang_rectangular_c_impl cannons_triang_rectangular_c_impl_expand1(ELPA_IMPL_SUFFIX)
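// Illustrative note (not part of the build): assuming, for example, that ELPA_IMPL_SUFFIX
// expands to double, the two indirection levels give
//   cannons_triang_rectangular_impl
//     -> cannons_triang_rectangular_impl_expand1(ELPA_IMPL_SUFFIX)
//     -> cannons_triang_rectangular_impl_expand2(double)
//     -> cannons_triang_rectangular_double
// With a single level, the ## pasting would happen before ELPA_IMPL_SUFFIX is expanded,
// yielding the literal token cannons_triang_rectangular_ELPA_IMPL_SUFFIX.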
66
67void cannons_triang_rectangular_impl(math_type* U, math_type* B, C_INT_TYPE np_rows, C_INT_TYPE np_cols,
68 C_INT_TYPE my_prow, C_INT_TYPE my_pcol, C_INT_TYPE_PTR U_desc, C_INT_TYPE_PTR B_desc,
69 math_type *Res, MPI_Comm row_comm, MPI_Comm col_comm,
70 int wantDebug, int useGPU, intptr_t *gpublasHandle)
71{
72 // Cannon's algorithm, non-blocking version
73 // Input:
74 // - U is an upper triangular matrix
75 // - B is a rectangular matrix
76 // Output:
77 // - Res is a full rectangular matrix, Res = U*B
78 // row_comm: communicator along rows
79 // col_comm: communicator along columns
80 // This function is used for the back transformation
81
82 C_INT_TYPE na, nb, nblk, width, na_rows, na_cols, nb_cols, Size_receive_U_now,
83 cols_in_buffer_U_my_initial, cols_in_buffer_U, rows_in_buffer_U, rows_in_buffer_U_now, cols_in_buffer_U_now, rows_in_buffer_U_my_initial;
84
85 C_INT_MPI_TYPE Size_receive_U_nowMPI, Size_receive_UMPI, Size_receive_BMPI;
86 C_INT_TYPE i, j, Size_send_U, Size_receive_U, Size_send_B, Size_receive_B, intNumber,
87 Buf_rows, Buf_cols_U, Buf_cols_B, curr_rows, num_of_iters, cols_in_buffer, rows_in_block,
88 curr_col_loc, cols_in_block, num_of_blocks_in_U_buffer, col_of_origin_U, b_rows_mult, b_cols_mult;
89
90 math_type *Buf_to_send_U, *Buf_to_receive_U, *Buf_to_send_B, *Buf_to_receive_B, *Buf_U, *PosBuff;
91
92 C_INT_TYPE where_to_send_U, from_where_to_receive_U, where_to_send_B, from_where_to_receive_B,
93 last_proc_col_B, last_proc_row_B, n, Size_U_stored, proc_col_min;
94
95 math_type *U_local_start, *Buf_pos, *B_local_start, *double_ptr, *CopyTo, *CopyFrom;
96
97 C_INT_TYPE ratio;
98
99 MPI_Status status;
100
101 C_INT_TYPE one = 1;
102 C_INT_TYPE zero = 0;
103 math_type dOne = 1.0;
104 math_type dZero = 0.0;
105
106 na = U_desc[2];
107 nblk = U_desc[4];
108 nb = B_desc[3];
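   // Note (descriptive only): assuming the standard ScaLAPACK array-descriptor layout in
   // 0-based C indexing (desc[2] = M global rows, desc[3] = N global columns, desc[4] = MB
   // row block size), na is the global size of U, nblk its block size, and nb the number
   // of columns of B.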
109
110 na_rows = numroc_(&na, &nblk, &my_prow, &zero, &np_rows);
111 na_cols = numroc_(&na, &nblk, &my_pcol, &zero, &np_cols);
112 nb_cols = numroc_(&nb, &nblk, &my_pcol, &zero, &np_cols);
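   // Illustrative example (descriptive only): numroc_ gives the local extent of a
   // block-cyclically distributed dimension. E.g. for na = 10, nblk = 2, np_rows = 3 the
   // five row blocks are dealt round-robin to process rows 0,1,2,0,1, so na_rows is
   // 4, 4 and 2 on process rows 0, 1 and 2, respectively.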
113
114 MPI_Request request_U_Recv;
115 MPI_Request request_U_Send;
116 MPI_Request request_B_Recv;
117 MPI_Request request_B_Send;
118
120 last_proc_col_B = ((nb-1)/nblk) % np_cols;
121 last_proc_row_B = ((na-1)/nblk) % np_rows;
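   // Illustrative example (descriptive only): these formulas identify the process column/row
   // that owns the last (possibly partial) block. E.g. nb = 10, nblk = 4, np_cols = 3:
   // (10-1)/4 = 2 and 2 % 3 = 2, so the last block-column (only 2 columns wide) resides on
   // process column 2.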
122
124
125 if(nb%nblk == 0)
126 if(my_pcol <= last_proc_col_B)
127 Buf_cols_B = nb_cols;
128 else
129 Buf_cols_B = nb_cols + nblk;
130 else
131 if(my_pcol < last_proc_col_B)
132 Buf_cols_B = nb_cols;
133 else if(my_pcol > last_proc_col_B)
134 Buf_cols_B = nb_cols + nblk;
135 else // if my_pcol == last_proc_col_B
136 Buf_cols_B = nb_cols + nblk - nb_cols%nblk;
137
138 if(na%nblk == 0)
139 if(my_prow <= last_proc_row_B)
140 Buf_rows = na_rows;
141 else
142 Buf_rows = na_rows + nblk;
143 else
144 if(my_prow < last_proc_row_B)
145 Buf_rows = na_rows;
146 else if(my_prow > last_proc_row_B)
147 Buf_rows = na_rows + nblk;
148 else // if my_prow == last_proc_row_B
149 Buf_rows = na_rows + nblk - na_rows%nblk;
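   // Descriptive note: where necessary, Buf_cols_B and Buf_rows are padded up to the next
   // multiple of nblk, so that the B buffers are large enough to hold any panel this
   // process may receive during the later circular shifts.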
150
151 ratio = np_cols/np_rows;
152
153 intNumber = ceil((math_type)na/(math_type)(np_cols*nblk)); // max. possible number of the local block columns of U
154 Size_U_stored = ratio*nblk*nblk*intNumber*(intNumber+1)/2 + 2; // number of local elements from the upper triangular part that every proc. has (max. possible value among all the procs.)
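   // Descriptive note: an arithmetic-progression bound. A process owns at most intNumber
   // local block-columns of U; their upper-triangular parts grow by ratio*nblk rows per
   // block-column (since np_cols = ratio*np_rows), so their sizes sum to at most
   // ratio*nblk*nblk*intNumber*(intNumber+1)/2 elements, and "+ 2" reserves room for the
   // column/row counts appended at the end of the buffer.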
155
156 Buf_to_send_U = malloc(ratio*Size_U_stored*sizeof(math_type));
157 Buf_to_receive_U = malloc(ratio*Size_U_stored*sizeof(math_type));
158 Buf_to_send_B = malloc(Buf_cols_B*Buf_rows*sizeof(math_type));
159 Buf_to_receive_B = malloc(Buf_cols_B*Buf_rows*sizeof(math_type));
160 if(ratio != 1)
161 Buf_U = malloc(Size_U_stored*sizeof(math_type)); // in this case we will receive data into an initial buffer and afterwards place the block-rows at the needed positions of the buffer used for the calculation
162
163 for(i = 0; i < na_rows*nb_cols; i++)
164 Res[i] = 0;
165
166 math_type *Buf_to_send_receive_U_dev;
167 math_type *Buf_to_send_receive_B_dev;
168 math_type *Res_dev;
169
170 math_type *U_local_start_dev, *B_local_start_dev;
171
172 if (useGPU){
173 gpuErrCheck( gpuMalloc((intptr_t *)&Buf_to_send_receive_U_dev, ratio*Size_U_stored*sizeof(math_type)) );
174 gpuErrCheck( gpuMalloc((intptr_t *)&Buf_to_send_receive_B_dev, Buf_cols_B*Buf_rows*sizeof(math_type)) );
175 gpuErrCheck( gpuMalloc((intptr_t *)&Res_dev, na_rows*nb_cols*sizeof(math_type)) );
176 gpuErrCheck( gpuMemset((intptr_t *)Res_dev, 0, na_rows*nb_cols*sizeof(math_type)) );
177 }
178
180
181 NVTX_RANGE_PUSH("initial reordering of U");
182
183 // here we assume that np_rows < np_cols; then I will send to <ratio> processors with a "leap" equal to np_rows; the same holds for the receive
184 if((ratio != 1)||(my_prow != 0)) // if grid is rectangular or my_prow is not 0
185 Buf_pos = Buf_to_send_U; // I will copy to the send buffer
186 else
187 Buf_pos = Buf_to_receive_U; // if grid is square and my_prow is 0, then I will copy to the receive buffer
188
189 // form array to send by block-columns; we need only upper triangular part
190 // find the first local block belonging to the upper part of matrix U
191 if(my_pcol >= my_prow) // if I am in the upper part of proc. grid
192 curr_col_loc = 0; // my first local block-column has block from the upper part of matrix
193 else
194 curr_col_loc = 1; //ceil((math_type)(((math_type)my_prow - (math_type)my_pcol)/(math_type)np_cols)) always will give 1 since np_cols > np_rows
195
196 num_of_iters = ceil((math_type)na_cols/(math_type)nblk); // my number of local block-columns
197 num_of_iters = num_of_iters - curr_col_loc; // I will exclude the first <curr_col_loc> block-columns since they do not have blocks from the upper part of matrix U
198 curr_col_loc = curr_col_loc*nblk; // local index of the found block-column
199
200 if(my_pcol >= my_prow)
201 rows_in_block = ceil(((math_type)(my_pcol + 1) - (math_type)my_prow)/(math_type)np_rows)*nblk;
202 else
203 rows_in_block = ratio*nblk;
204 cols_in_buffer_U_my_initial = 0;
205 Size_send_U = 0;
206 for(i = 0; i < num_of_iters; i++) // loop over my block-columns, which have blocks in the upper part of U
207 {
208 if(rows_in_block > na_rows)
209 rows_in_block = na_rows;
210
211 if ((na_cols - curr_col_loc) < nblk)
212 cols_in_block = na_cols - curr_col_loc; // how many columns do I have in the current block-column
213 else
214 cols_in_block = nblk;
215
216 if((rows_in_block > 0)&&(cols_in_block > 0))
217 {
218 double_ptr = &U[curr_col_loc*na_rows]; // pointer to start of the current block-column to be copied to buffer
219 C_LACPY("A", &rows_in_block, &cols_in_block, double_ptr, &na_rows, Buf_pos, &rows_in_block); // copy upper part of block-column in the buffer with LDA = length of the upper part of block-column
220 Buf_pos = Buf_pos + rows_in_block*cols_in_block; // go to the position where the next block-column will be copied
221 Size_send_U = Size_send_U + rows_in_block*cols_in_block;
222 cols_in_buffer_U_my_initial = cols_in_buffer_U_my_initial + cols_in_block;
223 }
224 curr_col_loc = curr_col_loc + nblk; // go to the next local block-column of my local array U
225 rows_in_block = rows_in_block + ratio*nblk;
226 }
227 rows_in_buffer_U_my_initial = rows_in_block - ratio*nblk; // remove redundant addition from the previous loop
228 *Buf_pos = (math_type)cols_in_buffer_U_my_initial; // write number of the columns at the end of the buffer; we will need this for further multiplications on the other processors
229 Buf_pos = Buf_pos + 1;
230 *Buf_pos = (math_type)rows_in_buffer_U_my_initial; // write number of the rows at the end of the buffer; we will need this for further multiplications on the other processors
231 Size_send_U = Size_send_U + 2;
232
233 // now we have the local buffer to send
234 // find the lowest processor column among those who will send me
235 proc_col_min = np_cols;
236 for(i = 0; i < ratio; i++)
237 {
238 from_where_to_receive_U = (my_pcol + my_prow + i*np_rows)%np_cols;
239 if(from_where_to_receive_U < proc_col_min)
240 proc_col_min = from_where_to_receive_U;
241 }
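   // Illustrative example (descriptive only): for np_rows = 2, np_cols = 4 (ratio = 2) the
   // process at (my_prow, my_pcol) = (1, 0) receives U panels from process columns
   // (0 + 1 + 0*2) % 4 = 1 and (0 + 1 + 1*2) % 4 = 3, so proc_col_min = 1 in that case.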
242
243 // do communications and form local buffers for calculations
244 Size_receive_U = 0; // size of the accumulated buffer
245 cols_in_buffer_U = 0; // number of columns in the accumulated buffer
246 rows_in_buffer_U = 0; // number of rows in the accumulated buffer
247 for(i = 0; i < ratio; i++)
248 {
249 where_to_send_U = (my_pcol - my_prow - i*np_rows + np_cols)%np_cols;
250 from_where_to_receive_U = (my_pcol + my_prow + i*np_rows)%np_cols;
251
252 // send and receive in the row_comm
253 if(ratio != 1) // if grid is not square
254 {
255 if(where_to_send_U != my_pcol) // if I need to send and receive on this step
256 {
257 MPI_Sendrecv(Buf_to_send_U, (C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_U, 0, Buf_U, (C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_U, 0, row_comm, &status);
258 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_U_nowMPI);
259 Size_receive_U_now = (C_INT_TYPE) Size_receive_U_nowMPI;
260 Size_receive_U = Size_receive_U + Size_receive_U_now - 2; // we need only the number of elements, so exclude the information about cols_in_buffer_U and rows_in_buffer_U
261
262 cols_in_buffer_U_now = Buf_U[Size_receive_U_now - 2];
263 cols_in_buffer_U = cols_in_buffer_U + cols_in_buffer_U_now;
264 rows_in_buffer_U_now = Buf_U[Size_receive_U_now - 1];
265
266 if(rows_in_buffer_U < rows_in_buffer_U_now)
267 rows_in_buffer_U = rows_in_buffer_U_now;
268
269 intNumber = from_where_to_receive_U/np_rows; // how many processors will send me blocks that have to be placed before the current ones
270 if(proc_col_min >= my_prow) // if among procs who will send me there is one with the full sets of block-rows in the upper part
271 CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber + 1)/2]; // here I will copy to; formula based on arithm. progression
272 else // if among procs who will send me there is one from the lower part of grid
273 if(from_where_to_receive_U < my_prow) // if I have just received from this processor from the lower part
274 CopyTo = &Buf_to_receive_U[nblk*nblk*ratio*(ratio - 1)/2]; // copy the first block of this processor after the first blocks from the other procs. that will send to me later (the first block-column of this proc. is in the lower part of the matrix)
275 else
276 CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber - 1)/2];
277 CopyFrom = Buf_U;
278 }
279 else // if I need to send to myself on this step, then I will copy from Buf_to_send_U to Buf_to_receive_U
280 {
281 cols_in_buffer_U_now = cols_in_buffer_U_my_initial;
282 cols_in_buffer_U = cols_in_buffer_U + cols_in_buffer_U_now;
283
284 rows_in_buffer_U_now = rows_in_buffer_U_my_initial;
285 if(rows_in_buffer_U < rows_in_buffer_U_now)
286 rows_in_buffer_U = rows_in_buffer_U_now;
287
288 intNumber = my_pcol/np_rows; // how many processors will send me blocks that have to be placed before the current ones
289 if(proc_col_min >= my_prow) // if among procs who will send me there is one with the full sets of block-rows in the upper part
290 CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber + 1)/2]; // here I will copy to; formula based on arithm. progression
291 else // if among procs who will send me there is one from the lower part of grid
292 if(my_pcol < my_prow) // if I have just received from this processor from the lower part (in this case it is me)
293 CopyTo = &Buf_to_receive_U[nblk*nblk*ratio*(ratio - 1)/2]; // copy the first block of this processor after the first blocks from the other procs. that will send to me later (the first block-column of this proc. is in the lower part of the matrix)
294 else
295 CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber - 1)/2];
296 CopyFrom = Buf_to_send_U;
297 Size_receive_U = Size_receive_U + Size_send_U - 2;
298 }
299
300 // copy by block-columns
301 intNumber = ceil((math_type)cols_in_buffer_U_now/(math_type)nblk); // how many block-columns I have received on this iteration
302 if(from_where_to_receive_U >= my_prow)
303 rows_in_block = ceil(((math_type)(from_where_to_receive_U + 1) - (math_type)my_prow)/(math_type)np_rows)*nblk; // number of rows in the first block-column of U buffer
304 else
305 rows_in_block = ratio*nblk;
306 for(j = 0; j < intNumber; j++)
307 {
308 if((j+1)*nblk < cols_in_buffer_U_now)
309 cols_in_block = nblk;
310 else
311 cols_in_block = cols_in_buffer_U_now - j*nblk;
312
313 C_LACPY("A", &rows_in_block, &cols_in_block, CopyFrom, &rows_in_block, CopyTo, &rows_in_block);
314
315 CopyFrom = CopyFrom + rows_in_block*cols_in_block;
316 CopyTo = CopyTo + ratio*rows_in_block*nblk + nblk*nblk*ratio*(ratio-1)/2; // I need to leave place for ratio block-columns of the other procs. of the lengths rows_in_block, (rows_in_block+nblk), (rows_in_block+2*nblk) and so on
317 rows_in_block = rows_in_block + ratio*nblk; // number of rows in the next block-columns
318 if(rows_in_block > rows_in_buffer_U_now)
319 rows_in_block = rows_in_buffer_U_now;
320 }
321 }
322 else // if grid is square
323 {
324 if(my_prow > 0)
325 {
326 MPI_Sendrecv(Buf_to_send_U, (C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_U, 0, Buf_to_receive_U, (C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_U, 0, row_comm, &status);
327 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI);
328 Size_receive_U = (C_INT_TYPE) Size_receive_UMPI;
329
330 cols_in_buffer_U = (C_INT_TYPE)Buf_to_receive_U[Size_receive_U-2];
331 rows_in_buffer_U = (C_INT_TYPE)Buf_to_receive_U[Size_receive_U-1];
332 }
333 else // if my_prow == 0, then I already have everything in my Buf_to_receive_U buffer
334 {
335 Size_receive_U = Size_send_U;
336 rows_in_buffer_U = rows_in_buffer_U_my_initial;
337 cols_in_buffer_U = cols_in_buffer_U_my_initial;
338 }
339 }
340 }
341 if(ratio != 1)
342 {
343 Buf_to_receive_U[Size_receive_U] = cols_in_buffer_U;
344 Buf_to_receive_U[Size_receive_U + 1] = rows_in_buffer_U;
345 Size_receive_U = Size_receive_U + 2;
346 }
347
348 NVTX_RANGE_POP(); // initial reordering of U
349
351
352 NVTX_RANGE_PUSH("initial reordering of B");
353
354 if(my_pcol > 0)
355 {
356 where_to_send_B = (my_prow - my_pcol + np_cols)%np_rows; // shift = my_pcol
357 from_where_to_receive_B = (my_pcol + my_prow)%np_rows;
358
359 // send and receive in the col_comm
360 if(where_to_send_B != my_prow) // for the rectangular proc grids it may be possible that I need to "send to myself"; if it is not the case, then I send
361 {
362 // form array to send
363 C_LACPY("A", &na_rows, &nb_cols, B, &na_rows, Buf_to_send_B, &na_rows);
364 MPI_Sendrecv(Buf_to_send_B, (C_INT_MPI_TYPE) (nb_cols*na_rows), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_B, 0, Buf_to_receive_B, (C_INT_MPI_TYPE) (nb_cols*Buf_rows), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_B, 0, col_comm, &status);
365 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_BMPI); // find out how many elements I have received
366 if (nb_cols!=0) Size_receive_B = (C_INT_TYPE) Size_receive_BMPI / nb_cols; // how many rows I have received
367 else Size_receive_B=0; // can happen only if nb_cols=Size_receive_BMPI=0
368 }
369 else
370 {
371 C_LACPY("A", &na_rows, &nb_cols, B, &na_rows, Buf_to_receive_B, &na_rows); // else I copy data like I have "received" it
372 Size_receive_B = na_rows;
373 }
374 }
375 else
376 {
377 C_LACPY("A", &na_rows, &nb_cols, B, &na_rows, Buf_to_receive_B, &na_rows); // if my_pcol == 0, I do not need to send; just copy the data as if I had "received" it
378 Size_receive_B = na_rows;
379 }
380
381 NVTX_RANGE_POP(); // initial reordering of B
382
384
385 where_to_send_U = (my_pcol - 1 + np_cols)%np_cols;
386 from_where_to_receive_U = (my_pcol + 1)%np_cols;
387 where_to_send_B = (my_prow - 1 + np_rows)%np_rows;
388 from_where_to_receive_B = (my_prow + 1)%np_rows;
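   // Descriptive note: the classic Cannon shift pattern. In every iteration of the loop
   // below, the current U panel is passed to the previous process column over row_comm and
   // the current B panel to the previous process row over col_comm, while the new panels
   // arrive from the next process column/row.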
389
390 NVTX_RANGE_PUSH("loop i<np_rows");
391 for(i = 1; i < np_rows; i++)
392 {
393 // at this moment I need to send my neighbour what I have in the "receive" arrays; that is why the "receive" and "send" array pointers are swapped
394 double_ptr = Buf_to_send_U;
395 Buf_to_send_U = Buf_to_receive_U;
396 Buf_to_receive_U = double_ptr;
397
398 double_ptr = Buf_to_send_B;
399 Buf_to_send_B = Buf_to_receive_B;
400 Buf_to_receive_B = double_ptr;
401
402 Size_send_U = Size_receive_U;
403 Size_send_B = Size_receive_B;
404
406
407 MPI_Isend(Buf_to_send_U, (C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_U, 0, row_comm, &request_U_Send);
408 MPI_Irecv(Buf_to_receive_U, (C_INT_MPI_TYPE) (ratio*Size_U_stored), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_U, 0, row_comm, &request_U_Recv);
409
411
412 MPI_Isend(Buf_to_send_B, (C_INT_MPI_TYPE) (Size_send_B*nb_cols), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_B, 0, col_comm, &request_B_Send);
413 MPI_Irecv(Buf_to_receive_B, (C_INT_MPI_TYPE) (Buf_rows*nb_cols), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_B, 0, col_comm, &request_B_Recv);
414
416
417 cols_in_buffer_U = (C_INT_TYPE)Buf_to_send_U[Size_receive_U-2];
418 rows_in_buffer_U = (C_INT_TYPE)Buf_to_send_U[Size_receive_U-1];
419 // find the minimal proc. column among those procs. that contributed to the current U buffer
420 proc_col_min = np_cols;
421 for(j = 0; j < ratio; j++)
422 {
423 col_of_origin_U = (my_pcol + my_prow + i - 1 + j*np_rows)%np_cols;
424 if(col_of_origin_U < proc_col_min)
425 proc_col_min = col_of_origin_U;
426 }
427 col_of_origin_U = proc_col_min;
428
429 num_of_blocks_in_U_buffer = ceil((math_type)cols_in_buffer_U/(math_type)nblk);
430
431 if (useGPU){
432 gpuErrCheck( gpuMemcpy((intptr_t *)Buf_to_send_receive_U_dev, (intptr_t *)Buf_to_send_U, ratio*Size_U_stored*sizeof(math_type), gpuMemcpyHostToDevice) );
433 gpuErrCheck( gpuMemcpy((intptr_t *)Buf_to_send_receive_B_dev, (intptr_t *)Buf_to_send_B, Buf_cols_B*Buf_rows*sizeof(math_type), gpuMemcpyHostToDevice) );
434
435 U_local_start_dev = Buf_to_send_receive_U_dev;
436 if (col_of_origin_U >= my_prow) B_local_start_dev = Buf_to_send_receive_B_dev;
437 else B_local_start_dev = Buf_to_send_receive_B_dev + nblk;
438 }
439 else {
440 U_local_start = Buf_to_send_U;
441 if (col_of_origin_U >= my_prow) B_local_start = Buf_to_send_B;
442 else B_local_start = Buf_to_send_B + nblk;
443 }
444
445 NVTX_RANGE_PUSH("loop j<num_of_blocks_in_U_buffer");
446 for(j = 0; j < num_of_blocks_in_U_buffer; j++)
447 {
448 curr_rows = (j+1)*nblk;
449 if (curr_rows > rows_in_buffer_U)
450 curr_rows = rows_in_buffer_U;
451
452 if((j+1)*nblk <= cols_in_buffer_U)
453 b_rows_mult = nblk;
454 else
455 b_rows_mult = cols_in_buffer_U - j*nblk;
456
457 if (Size_receive_B != 0){
458 NVTX_RANGE_PUSH("GEMM");
459 // Res = U_local_start*B_local_start
460 // Res = Buf_to_send_U*Buf_to_send_B
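                 // Descriptive note on the call below: m = curr_rows (rows of the current
                 // U block-column), n = nb_cols, k = b_rows_mult; alpha = beta = 1, so the
                 // product is accumulated into Res; the leading dimensions are curr_rows
                 // for U, Size_receive_B (rows of the received B panel) for B, and na_rows
                 // for Res.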
461 if (useGPU){
462 gpublasXgemm(gpublasHandle, 'N', 'N',
463 curr_rows, nb_cols, b_rows_mult, dOne,
464 U_local_start_dev, curr_rows,
465 B_local_start_dev, Size_receive_B, dOne,
466 Res_dev, na_rows);
467 if (wantDebug) gpuDeviceSynchronize();
468 }
469 else {
470 C_GEMM("N", "N", &curr_rows, &nb_cols, &b_rows_mult, &dOne,
471 U_local_start, &curr_rows,
472 B_local_start, &Size_receive_B, &dOne,
473 Res, &na_rows);
474 }
476 }
477
478 if (useGPU) {
479 U_local_start_dev += nblk*curr_rows;
480 B_local_start_dev += nblk;
481 }
482 else {
483 U_local_start += nblk*curr_rows;
484 B_local_start += nblk;
485 }
486 }
487 NVTX_RANGE_POP(); // loop j<num_of_blocks_in_U_buffer
488
489 MPI_Wait(&request_U_Send, &status);
490 MPI_Wait(&request_U_Recv, &status);
491 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI); // find out how many elements I have received
492 Size_receive_U = (C_INT_TYPE) Size_receive_UMPI;
493
494 MPI_Wait(&request_B_Send, &status);
495 MPI_Wait(&request_B_Recv, &status);
496 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_BMPI); // find out how many elements I have received
497 if (nb_cols!=0) Size_receive_B = (C_INT_TYPE) Size_receive_BMPI / nb_cols; // how many rows I have received
498 else Size_receive_B=0; // can happen only if nb_cols=Size_receive_BMPI=0
499
500 }
501 NVTX_RANGE_POP(); // loop i<np_rows
502
503 // last iteration
504 cols_in_buffer_U = (C_INT_TYPE)Buf_to_receive_U[Size_receive_U-2];
505 rows_in_buffer_U = (C_INT_TYPE)Buf_to_receive_U[Size_receive_U-1];
506 // find the minimal proc. column among those procs. that contributed to the current U buffer
507 proc_col_min = np_cols;
508 for(j = 0; j < ratio; j++)
509 {
510 col_of_origin_U = (my_pcol + my_prow + np_rows - 1 + j*np_rows)%np_cols;
511 if(col_of_origin_U < proc_col_min)
512 proc_col_min = col_of_origin_U;
513 }
514 col_of_origin_U = proc_col_min;
515
516 num_of_blocks_in_U_buffer = ceil((math_type)cols_in_buffer_U/(math_type)nblk);
517
518 if (useGPU){
519 gpuErrCheck( gpuMemcpy((intptr_t *)Buf_to_send_receive_U_dev, (intptr_t *)Buf_to_receive_U, ratio*Size_U_stored*sizeof(math_type), gpuMemcpyHostToDevice) );
520 gpuErrCheck( gpuMemcpy((intptr_t *)Buf_to_send_receive_B_dev, (intptr_t *)Buf_to_receive_B, Buf_cols_B*Buf_rows*sizeof(math_type), gpuMemcpyHostToDevice) );
521
522 U_local_start_dev = Buf_to_send_receive_U_dev;
523 if (col_of_origin_U >= my_prow) B_local_start_dev = Buf_to_send_receive_B_dev;
524 else B_local_start_dev = Buf_to_send_receive_B_dev + nblk;
525 }
526 else {
527 U_local_start = Buf_to_receive_U;
528 if (col_of_origin_U >= my_prow) B_local_start = Buf_to_receive_B;
529 else B_local_start = Buf_to_receive_B + nblk;
530 }
531
532 NVTX_RANGE_PUSH("loop j<num_of_blocks_in_U_buffer");
533 for(j = 0; j < num_of_blocks_in_U_buffer; j++)
534 {
535 curr_rows = (j+1)*nblk;
536 if (curr_rows > rows_in_buffer_U)
537 curr_rows = rows_in_buffer_U;
538
539 if((j+1)*nblk <= cols_in_buffer_U)
540 b_rows_mult = nblk;
541 else
542 b_rows_mult = cols_in_buffer_U - j*nblk;
543
544 if (Size_receive_B != 0) {
545 NVTX_RANGE_PUSH("GEMM_last");
546 // Res = U_local_start*B_local_start
547 // Res = Buf_to_receive_U*Buf_to_receive_B
548 if (useGPU){
549 gpublasXgemm(gpublasHandle, 'N', 'N',
550 curr_rows, nb_cols, b_rows_mult, dOne,
551 U_local_start_dev, curr_rows,
552 B_local_start_dev, Size_receive_B, dOne,
553 Res_dev, na_rows);
554 if (wantDebug) gpuDeviceSynchronize();
555 }
556 else {
557 C_GEMM("N", "N", &curr_rows, &nb_cols, &b_rows_mult, &dOne,
558 U_local_start, &curr_rows,
559 B_local_start, &Size_receive_B, &dOne,
560 Res, &na_rows);
561 }
563 }
564
565 if (useGPU) {
566 U_local_start_dev += nblk*curr_rows;
567 B_local_start_dev += nblk;
568 }
569 else {
570 U_local_start += nblk*curr_rows;
571 B_local_start += nblk;
572 }
573
574 }
575 NVTX_RANGE_POP(); // loop j<num_of_blocks_in_U_buffer
576
577 if (useGPU) {
578 gpuErrCheck( gpuMemcpy((intptr_t *)Res, (intptr_t *)Res_dev, na_rows*nb_cols*sizeof(math_type), gpuMemcpyDeviceToHost) );
579 gpuErrCheck( gpuFree((intptr_t *)Buf_to_send_receive_U_dev) );
580 gpuErrCheck( gpuFree((intptr_t *)Buf_to_send_receive_B_dev) );
581 gpuErrCheck( gpuFree((intptr_t *)Res_dev) );
582 }
583
584 free(Buf_to_send_U);
585 free(Buf_to_receive_U);
586 free(Buf_to_send_B);
587 free(Buf_to_receive_B);
588 if(ratio != 1)
589 free(Buf_U);
590}
591
592
593void cannons_triang_rectangular_c_impl(math_type* U, math_type* B, int local_rowsCast, int local_colsCast,
594 C_INT_TYPE_PTR U_desc, C_INT_TYPE_PTR B_desc, math_type *Res,
595 C_INT_MPI_TYPE row_comm, C_INT_MPI_TYPE col_comm,
596 int wantDebug, int useGPU, intptr_t *gpublasHandle)
597{
598 C_INT_TYPE local_rows, local_cols;
599
600 local_rows = (C_INT_TYPE) local_rowsCast;
601 local_cols = (C_INT_TYPE) local_colsCast;
602
603 MPI_Comm c_row_comm = MPI_Comm_f2c(row_comm);
604 MPI_Comm c_col_comm = MPI_Comm_f2c(col_comm);
605
606 C_INT_TYPE my_prow, my_pcol, np_rows, np_cols;
607 C_INT_MPI_TYPE my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI;
608
609 MPI_Comm_rank(c_row_comm, &my_prowMPI);
610 MPI_Comm_size(c_row_comm, &np_rowsMPI);
611 MPI_Comm_rank(c_col_comm, &my_pcolMPI);
612 MPI_Comm_size(c_col_comm, &np_colsMPI);
613
614 my_prow = (C_INT_TYPE) my_prowMPI;
615 my_pcol = (C_INT_TYPE) my_pcolMPI;
616 np_rows = (C_INT_TYPE) np_rowsMPI;
617 np_cols = (C_INT_TYPE) np_colsMPI;
618
619 // BEWARE
620 // in the Cannon algorithm code, the column and row communicators are exchanged
621 // what we usually call row_comm in ELPA is thus passed to the col_comm parameter of the function, and vice versa
622 // (order is swapped in the following call)
623 // It is a bit unfortunate, maybe it should be changed in the Cannon algorithm to comply with ELPA standard notation?
624 cannons_triang_rectangular_impl(U, B, np_rows, np_cols, my_prow, my_pcol, U_desc, B_desc, Res, c_col_comm, c_row_comm,
625 wantDebug, useGPU, gpublasHandle);
626}
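/*
   Illustrative usage sketch only (not part of this file): how a caller that holds C
   communicators for the process-grid rows and columns might invoke the wrapper of the
   instantiated type. The names grid_row_comm, grid_col_comm and the surrounding variables
   are hypothetical:

     MPI_Fint row_f = MPI_Comm_c2f(grid_row_comm);
     MPI_Fint col_f = MPI_Comm_c2f(grid_col_comm);
     cannons_triang_rectangular_c_impl(U, B, local_rows, local_cols,
                                       U_desc, B_desc, Res,
                                       row_f, col_f,
                                       wantDebug, useGPU, gpublasHandle);

   Note the BEWARE comment above: inside the wrapper the row and column communicators are
   swapped before cannons_triang_rectangular_impl is called.
*/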
627