Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2024.03.001
cannon_forw_template.c
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
//   Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//   Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
//   Schwerpunkt Wissenschaftliches Rechnen,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//   Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//   and
// - IBM Deutschland GmbH
//
// This particular source code file has been developed within the ELPA-AEO
// project, which has been a joint effort of
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
//   Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//   Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
//   Schwerpunkt Wissenschaftliches Rechnen,
// - Technische Universität München, Lehrstuhl für Theoretische Chemie,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/ and
// http://elpa-aeo.mpcdf.mpg.de
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Valeriy Manin (Bergische Universität Wuppertal)
// integrated into the ELPA library by Pavel Kus, Andreas Marek (MPCDF)
#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
#define C_INT_TYPE_PTR long int*
#define C_INT_TYPE long int
#define BLAS_KIND c_int64_t
#else
#define C_INT_TYPE_PTR int*
#define C_INT_TYPE int
#define BLAS_KIND c_int
#endif
#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
#define C_INT_MPI_TYPE_PTR long int*
#define C_INT_MPI_TYPE long int
#define MPI_KIND c_int64_t
#else
#define C_INT_MPI_TYPE_PTR int*
#define C_INT_MPI_TYPE int
#define MPI_KIND c_int
#endif

// it seems that we need these two levels of indirection to correctly expand macros
#define cannons_reduction_impl_expand2(SUFFIX) cannons_reduction_##SUFFIX
#define cannons_reduction_impl_expand1(SUFFIX) cannons_reduction_impl_expand2(SUFFIX)
#define cannons_reduction_impl cannons_reduction_impl_expand1(ELPA_IMPL_SUFFIX)

#define cannons_reduction_c_impl_expand2(SUFFIX) cannons_reduction_c_##SUFFIX
#define cannons_reduction_c_impl_expand1(SUFFIX) cannons_reduction_c_impl_expand2(SUFFIX)
#define cannons_reduction_c_impl cannons_reduction_c_impl_expand1(ELPA_IMPL_SUFFIX)
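// For example (assuming this template is instantiated with ELPA_IMPL_SUFFIX set to a
// precision/type suffix such as d or dc), cannons_reduction_impl expands to
// cannons_reduction_d or cannons_reduction_dc; the extra expand1/expand2 level forces
// ELPA_IMPL_SUFFIX itself to be expanded before the token pasting with ## happens.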

#include "../general/precision_typedefs.h"

#include "../helpers/lapack_interfaces.h"
#include "../helpers/scalapack_interfaces.h"

void cannons_reduction_impl(math_type* A, math_type* U, C_INT_TYPE np_rows, C_INT_TYPE np_cols, C_INT_TYPE my_prow, C_INT_TYPE my_pcol,
                            C_INT_TYPE_PTR a_desc, math_type *Res, C_INT_MPI_TYPE ToStore, MPI_Comm row_comm, MPI_Comm col_comm)
{
   // Input matrices:
   //   - A: full matrix
   //   - U: upper triangular matrix U^(-1)
   // Output matrix:
   //   - Res = U^(-H)*A*U^(-1), where U^(-H) := (U^(-1))^H
   // row_comm: communicator along rows
   // col_comm: communicator along columns
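   //
   // Structure of the implementation below: A is initially skewed along process rows and the
   // triangularly packed U along process columns, as in Cannon's algorithm. Then np_rows-1
   // ring shifts follow, each overlapping nonblocking neighbour exchanges with local GEMMs
   // that accumulate M = A*U^(-1) (upper part). Since A is Hermitian (symmetric in the real
   // case), a parallel (conjugate) transpose of M yields the lower part of U^(-H)*A in M_T;
   // a second, analogous Cannon pass multiplies M_T by U^(-1) from the right, and a final
   // transpose plus an upper-triangle copy assembles the full Res = U^(-H)*A*U^(-1). The
   // first <ToStore> received U panels are cached in U_stored so the second pass can skip
   // those column communications.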

   C_INT_TYPE na, nblk, i, j, Size_send_A, Size_receive_A, Size_send_U, Size_receive_U, Buf_rows, Buf_cols, pcol_where_to_send_A, pcol_from_where_to_receive_A, where_to_send_U, from_where_to_receive_U, last_proc_row, last_proc_col, cols_in_buffer_A, rows_in_buffer_A, intNumber;
   C_INT_TYPE ratio, num_of_iters, cols_in_buffer, rows_in_block, rows_in_buffer, curr_col_loc, cols_in_block, curr_col_glob, curr_row_loc, Size_receive_A_now, Nb, owner, cols_in_buffer_A_now;
   C_INT_MPI_TYPE Size_receive_A_nowMPI, Size_receive_AMPI, Size_receive_UMPI;

   math_type *Buf_to_send_A, *Buf_to_receive_A, *Buf_to_send_U, *Buf_to_receive_U, *data_ptr, *Buf_A, *Buf_pos, *U_local_start, *Res_ptr, *M, *M_T, *A_local_start, *U_local_start_curr, *U_stored, *CopyTo, *CopyFrom, *U_to_calc;

   C_INT_TYPE row_of_origin_U, rows_in_block_U, num_of_blocks_in_U_buffer, k, startPos, cols_in_buffer_U, rows_in_buffer_U, col_of_origin_A, curr_row_loc_res, curr_row_loc_A, curr_col_glob_res;
   C_INT_TYPE curr_col_loc_res, curr_col_loc_buf, proc_row_curr, curr_col_loc_U, A_local_index, LDA_A, LDA_A_new, index_row_A_for_LDA, ii, rows_in_block_U_curr, width, row_origin_U, rows_in_block_A, cols_in_buffer_A_my_initial, rows_in_buffer_A_my_initial, proc_col_min;
   C_INT_TYPE *SizesU;
   C_INT_TYPE Size_U_skewed, Size_U_stored, Curr_pos_in_U_stored, rows_in_buffer_A_now;
   math_type dOne = 1.0;
   math_type dZero = 0.0;
   C_INT_TYPE one = 1;
   C_INT_TYPE zero = 0;
   C_INT_TYPE na_rows, na_cols;

   MPI_Status status;
   MPI_Request request_A_Recv;
   MPI_Request request_A_Send;
   MPI_Request request_U_Recv;
   MPI_Request request_U_Send;

   na = a_desc[2];   // global matrix size (entry M_ of the ScaLAPACK array descriptor)
   nblk = a_desc[4]; // block size (entry MB_ of the descriptor; square blocks are assumed)
   // numroc_ gives the number of rows/columns of the block-cyclically distributed matrix
   // that are stored locally on this process
   na_rows = numroc_(&na, &nblk, &my_prow, &zero, &np_rows);
   na_cols = numroc_(&na, &nblk, &my_pcol, &zero, &np_cols);

   //if(ToStore > (np_rows -1))
   //   if((my_prow == 0)&&(my_pcol == 0))
   //      printf("Buffering level is larger than (np_rows-1) !!!\n");
   //if((my_prow == 0)&&(my_pcol == 0))
   //   printf("Buffering level = %d\n", ToStore);

   if (np_cols%np_rows != 0)
   {
      //if((my_prow == 0)&& (my_pcol ==0))
      //   printf("!!!!! np_cols must be a multiple of np_rows!!!!! I do nothing! \n");
      return;
   }

   if (np_cols < np_rows)
   {
      //if((my_prow == 0)&& (my_pcol ==0))
      //   printf("np_cols < np_rows \n");
      return;
   }

   ratio = np_cols/np_rows;
   last_proc_row = ((na-1)/nblk) % np_rows;   // processor row having the last block-row of matrix
   last_proc_col = ((na-1)/nblk) % np_cols;   // processor column having the last block-column of matrix

   if (na%nblk == 0) {
      if (my_pcol <= last_proc_col) {
         Buf_cols = na_cols;
      }
      else {
         Buf_cols = na_cols + nblk;
      }
   }
   else {
      if (my_pcol < last_proc_col) {
         Buf_cols = na_cols;
      }
      else if (my_pcol > last_proc_col) {
         Buf_cols = na_cols + nblk;
      }
      else {   // if my_pcol == last_proc_col
         Buf_cols = na_cols + nblk - na_cols%nblk;
      }
   }

   if (na%nblk == 0) {
      if (my_prow <= last_proc_row) {
         Buf_rows = na_rows + 1;   // Soheil: added + 1 to be able to accommodate for MPI message sizes in the case of pure blocked configuration e.g. na=100; with 9xMPI ranks and nblk=30 or with 4xMPI ranks and nblk=45
      }
      else {
         Buf_rows = na_rows + nblk;
      }
   }
   else {
      if (my_prow < last_proc_row) {
         Buf_rows = na_rows;
      }
      else if (my_prow > last_proc_row) {
         Buf_rows = na_rows + nblk;
      }
      else {   // if my_prow == last_proc_row
         Buf_rows = na_rows + nblk - na_rows%nblk;
      }
   }

   intNumber = ceil((math_type)na/(math_type)(np_cols*nblk));   // max. possible number of the local block columns of U
   Size_U_stored = ratio*nblk*nblk*intNumber*(intNumber+1)/2 + 2;   // number of local elements from the upper triangular part that every proc. has (max. possible value among all the procs.)
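   // The factor intNumber*(intNumber+1)/2 is the arithmetic-progression sum over the local
   // block-columns of the packed triangle, whose useful height grows by one block per
   // block-column (times ratio*nblk*nblk elements per block); the trailing +2 leaves room
   // for the bookkeeping counts that are appended at the end of the buffers below.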
196
197 U_stored = malloc((Size_U_stored*(ToStore+1))*sizeof(math_type));
198 SizesU = malloc(ToStore*sizeof(C_INT_TYPE)); // here will be stored the sizes of the buffers of U that I have stored
199 Buf_to_send_A = malloc(ratio*Buf_cols*Buf_rows*sizeof(math_type));
200 Buf_to_receive_A = malloc(ratio*Buf_cols*Buf_rows*sizeof(math_type));
201 Buf_to_send_U = malloc(Size_U_stored*sizeof(math_type));
202 Buf_to_receive_U = malloc(Size_U_stored*sizeof(math_type));
203 if(ratio != 1)
204 Buf_A = malloc(Buf_cols*Buf_rows*sizeof(math_type)); // in this case we will receive data into initial buffer and after place block-columns to the needed positions of buffer for calculation
205 M = malloc(na_rows*na_cols*sizeof(math_type));
206 M_T = malloc(na_rows*na_cols*sizeof(math_type));
207 for(i = 0; i < na_rows*na_cols; i++)
208 M[i] = 0;
209

   // here we assume that np_rows < np_cols; then I will send to <ratio> processors with a "leap" equal to np_rows; the same holds for receive
   if(ratio != 1) {
#ifdef WITH_NVTX
      nvtxRangePushA("LACPY");
#endif
      C_LACPY("A", &na_rows, &na_cols, A, &na_rows, Buf_to_send_A, &na_rows);   // copy A to Buf_to_send_A
#ifdef WITH_NVTX
      nvtxRangePop();
#endif
   }
   Size_receive_A = 0;

   // receive from different processors and place in my buffer for calculation;
   for(i = 0; i < ratio; i++)
   {
      pcol_where_to_send_A = (my_pcol - my_prow - i*np_rows + np_cols)%np_cols;
      pcol_from_where_to_receive_A = (my_pcol + my_prow + i*np_rows)%np_cols;
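      // This realizes the initial skew of Cannon's algorithm along process rows: column
      // my_pcol exchanges with the column shifted by my_prow (plus i*np_rows when the grid
      // is rectangular), so that after this loop every process holds the block-columns of A
      // that line up with its (skewed) panels of U.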

      // send and receive in the row_comm
      if(ratio != 1)   // if grid is not square
      {
         if(pcol_where_to_send_A != my_pcol)
         {
            MPI_Sendrecv(Buf_to_send_A, (C_INT_MPI_TYPE) (na_cols*na_rows), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) pcol_where_to_send_A, (C_INT_MPI_TYPE) zero,
                         Buf_A, (C_INT_MPI_TYPE) (na_rows*Buf_cols), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) pcol_from_where_to_receive_A, (C_INT_MPI_TYPE) zero,
                         row_comm, &status);
            MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_A_nowMPI);
            Size_receive_A_now = (C_INT_TYPE) Size_receive_A_nowMPI;
            Size_receive_A_now = Size_receive_A_now/na_rows;   // how many columns of A I have received
         }
         else {
            Size_receive_A_now = na_cols;
         }

         Size_receive_A = Size_receive_A + Size_receive_A_now;   // here accumulate number of columns of A that I will receive

         // now I need to copy the received block to my buffer for A
         intNumber = pcol_from_where_to_receive_A/np_rows;   // how many blocks I will receive, such that I will need to put them before the just received block

         CopyTo = &Buf_to_receive_A[intNumber*na_rows*nblk];   // here I will start copying the received buffer
         if (pcol_where_to_send_A != my_pcol) {
            CopyFrom = Buf_A;
         }
         else {
            CopyFrom = A;
         }

         intNumber = ceil((math_type)Size_receive_A_now/(math_type)nblk);   // how many block-columns I have received
         for(j = 0; j < intNumber; j++)
         {
            width = nblk;   // width of the current block column
            if(nblk*(j+1) > Size_receive_A_now)
               width = Size_receive_A_now - nblk*j;
            C_LACPY("A", &na_rows, &width, CopyFrom, &na_rows, CopyTo, &na_rows);
            CopyTo = CopyTo + na_rows*nblk*ratio;
            CopyFrom = CopyFrom + na_rows*nblk;
         }
      }

      else {   // if grid is square then simply receive from one processor to a calculation buffer
         if(my_prow > 0)
         {
            C_LACPY("A", &na_rows, &na_cols, A, &na_rows, Buf_to_send_A, &na_rows);   // copy A to Buf_to_send_A
            MPI_Sendrecv(Buf_to_send_A, (C_INT_MPI_TYPE) (na_cols*na_rows), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) pcol_where_to_send_A, (C_INT_MPI_TYPE) zero,
                         Buf_to_receive_A, (C_INT_MPI_TYPE) (na_rows*Buf_cols), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) pcol_from_where_to_receive_A, (C_INT_MPI_TYPE) zero,
                         row_comm, &status);
            MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI);
            Size_receive_A = (C_INT_TYPE) Size_receive_AMPI;
            Size_receive_A = Size_receive_A/na_rows;   // how many columns of A I have received
         }
         else
         {
            C_LACPY("A", &na_rows, &na_cols, A, &na_rows, Buf_to_receive_A, &na_rows);   // copy A to the received buffer if I do not need to send
            Size_receive_A = na_cols;
         }
      }
   }

   // form array to send by block-columns
   num_of_iters = ceil((math_type)na_cols/(math_type)nblk);   // number of my block-columns

   where_to_send_U = (my_prow - my_pcol + np_cols)%np_rows;   // shift = my_pcol; we assume that np_cols%np_rows = 0
   from_where_to_receive_U = (my_pcol + my_prow)%np_rows;
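
   // This is the matching initial skew for U, along process columns: row my_prow sends its
   // packed upper-triangular panel to the row shifted by my_pcol (mod np_rows), so the A and
   // U blocks meet on the right process for the first multiplication step.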
297
298 if (where_to_send_U == my_prow) { // if I will not need to send my local part of U, then copy my local data to the "received" buffer
299 Buf_pos = Buf_to_receive_U;
300 }
301 else {
302 Buf_pos = Buf_to_send_U; // else form the array to send
303 }
304 // find the first local block belonging to the upper part of matrix U
305 if (my_pcol >= my_prow) { // if I am in the upper part of proc. grid
306 curr_col_loc = 0; // my first local block-column has block from the upper part of matrix
307 }
308 else {
309 curr_col_loc = 1; //ceil((math_type)(((math_type)my_prow - (math_type)my_pcol)/(math_type)np_cols)) always will give 1 since np_cols > np_rows
310 }
311 num_of_iters = num_of_iters - curr_col_loc; // I will exclude the first <curr_col_loc> block-columns since they do not have blocks from the upper part of matrix U
312 curr_col_loc = curr_col_loc*nblk; // local index of the found block-column
313
314 if (my_pcol >= my_prow ) {
315 rows_in_block = ceil(((math_type)(my_pcol + 1) - (math_type)my_prow)/(math_type)np_rows)*nblk;
316 }
317 else {
318 rows_in_block = ratio*nblk;
319 }
320 Size_send_U = 0;
321 for(i = 0; i < num_of_iters; i++) // loop over my block-columns, which have blocks in the upper part of U
322 {
323 if (rows_in_block > na_rows) {
324 rows_in_block = na_rows;
325 }
326 if ((na_cols - curr_col_loc) < nblk) {
327 cols_in_block = na_cols - curr_col_loc; // how many columns do I have in the current block-column
328 }
329 else {
330 cols_in_block = nblk;
331 }
332 if ((rows_in_block > 0)&&(cols_in_block > 0))
333 {
334 data_ptr = &U[curr_col_loc*na_rows]; // pointer to start of the current block-column to be copied to buffer
335 C_LACPY("A", &rows_in_block, &cols_in_block, data_ptr, &na_rows, Buf_pos, &rows_in_block); // copy upper part of block-column in the buffer with LDA = length of the upper part of block-column
336 Buf_pos = Buf_pos + rows_in_block*cols_in_block; // go to the position where the next block-column will be copied
337 Size_send_U = Size_send_U + rows_in_block*cols_in_block;
338 }
339 curr_col_loc = curr_col_loc + nblk; // go to the next local block-column of my local array U
340 rows_in_block = rows_in_block + ratio*nblk;
341 }
342 rows_in_buffer = rows_in_block - ratio*nblk; // remove redundant addition from the previous loop
343 *Buf_pos = (math_type)rows_in_buffer; // write number of the rows at the end of the buffer; we will need this for further multiplications on the other processors
344 Size_send_U = Size_send_U + 1;
345
346 //send and receive
347 if (where_to_send_U != my_prow)
348 {
349 // send and receive in the col_comm
350 MPI_Sendrecv(Buf_to_send_U, (C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_U, (C_INT_MPI_TYPE) zero, Buf_to_receive_U, (C_INT_MPI_TYPE) (Buf_rows*na_cols), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_U, (C_INT_MPI_TYPE) zero, col_comm, &status);
351 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI); // find out how many elements I have received
352 Size_receive_U = (C_INT_TYPE) Size_receive_UMPI;
353 }
354 else {// if I do not need to send
355 Size_receive_U = Size_send_U; // how many elements I "have received"; the needed data I have already copied to the "receive" buffer
356 }
357 for(i = 0; i < Size_receive_U; i++)
358 U_stored[i] = Buf_to_receive_U[i];
359 Size_U_skewed = Size_receive_U;
360 Curr_pos_in_U_stored = Size_U_skewed;
361
363
364 pcol_where_to_send_A = (my_pcol - 1 + np_cols)%np_cols;
365 pcol_from_where_to_receive_A = (my_pcol + 1)%np_cols;
366 where_to_send_U = (my_prow - 1 + np_rows)%np_rows;
367 from_where_to_receive_U = (my_prow + 1)%np_rows;
368
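   // Main Cannon loop (first pass): np_rows-1 ring shifts. In each iteration the freshly
   // received A and U panels are posted for a nonblocking shift to the next neighbour
   // (A one process column to the left, U one process row up) while the local GEMM update
   // of M proceeds on the panels received in the previous iteration.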
   for(j = 1; j < np_rows; j++)
   {
      // at this moment I need to send to the neighbour what I have in the "received" arrays; that is why we exchange the pointers of the "received" and "send" arrays
      data_ptr = Buf_to_send_A;
      Buf_to_send_A = Buf_to_receive_A;
      Buf_to_receive_A = data_ptr;

      data_ptr = Buf_to_send_U;
      Buf_to_send_U = Buf_to_receive_U;
      Buf_to_receive_U = data_ptr;

      Size_send_A = Size_receive_A;   // number of block-columns of A and block-rows of U to send (that I have received on the previous step)
      MPI_Isend(Buf_to_send_A, (C_INT_MPI_TYPE) (Size_send_A*na_rows), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) pcol_where_to_send_A, (C_INT_MPI_TYPE) zero, row_comm, &request_A_Send);
      MPI_Irecv(Buf_to_receive_A, (C_INT_MPI_TYPE) (Buf_cols*na_rows*ratio), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) pcol_from_where_to_receive_A, (C_INT_MPI_TYPE) zero, row_comm, &request_A_Recv);

      Size_send_U = Size_receive_U;
      MPI_Isend(Buf_to_send_U, (C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_U, (C_INT_MPI_TYPE) zero, col_comm, &request_U_Send);
      MPI_Irecv(Buf_to_receive_U, (C_INT_MPI_TYPE) (Buf_rows*na_cols), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_U, (C_INT_MPI_TYPE) zero, col_comm, &request_U_Recv);

      rows_in_buffer = (C_INT_TYPE)Buf_to_send_U[Size_receive_U-1];   // row count that was stored at the end of the U buffer
      row_origin_U = (my_pcol + my_prow + np_cols + j - 1)%np_rows;

      if((my_pcol >= my_prow)&&(my_pcol >= row_origin_U))   // if I and the sender are from the upper part of the grid
      {
         cols_in_buffer = na_cols;   // then we have the same number of columns in the upper triangular part
         curr_col_loc_res = 0;   // all my block-columns have parts in the upper triangular part
         curr_col_loc_buf = 0;   // I use all the block-columns of the received buffer
      }
      if((my_pcol < my_prow)&&(my_pcol < row_origin_U))   // if I and the sender are from the lower part of the grid
      {
         cols_in_buffer = na_cols - nblk;   // then we have the same number of columns in the upper triangular part, but the first block-column was not included
         curr_col_loc_res = nblk;   // I start the update from the second block-column since the first one is in the lower triangular part
         curr_col_loc_buf = 0;   // I use all the block-columns of the received buffer
      }
      if((my_pcol >= my_prow)&&(my_pcol < row_origin_U))   // if I am from the upper part of the grid and the sender is from the lower part
      {
         cols_in_buffer = na_cols - nblk;   // then I have received one block-column less than I have
         curr_col_loc_res = nblk;   // all my block-columns have parts in the upper triangular part, but the first block-column of the received buffer corresponds to my second one
         curr_col_loc_buf = 0;   // I use all the block-columns of the received buffer
      }
      if((my_pcol < my_prow)&&(my_pcol >= row_origin_U))   // if I am from the lower part of the grid and the sender is from the upper part
      {
         cols_in_buffer = na_cols;   // then I have received the full set of block-columns
         curr_col_loc_res = nblk;   // I start the update from the second block-column since the first one is in the lower triangular part
         curr_col_loc_buf = nblk;   // I skip the first block-column of the buffer, since my first block-column is in the lower part
      }

      num_of_blocks_in_U_buffer = ceil(((math_type)cols_in_buffer - (math_type)curr_col_loc_buf)/(math_type)nblk);

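      // startPos: offset of the first useful element in the triangularly packed U buffer;
      // with curr_col_loc_buf equal to 0 or nblk (the only values set above), this skips
      // either nothing or exactly the nblk*nblk elements of the buffer's first block-column.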
      startPos = (curr_col_loc_buf + nblk)*curr_col_loc_buf/2;
      U_local_start = &Buf_to_send_U[startPos];
      Res_ptr = &M[curr_col_loc_res*na_rows];

      for (i = 0; i < num_of_blocks_in_U_buffer; i++)
      {
         curr_col_glob = (curr_col_loc_res/nblk)*nblk*np_cols + my_pcol*nblk;
         proc_row_curr = (curr_col_glob/nblk)%np_rows;
         rows_in_block_A = (curr_col_glob/(nblk*np_rows))*nblk;   // in A; not to go down beyond the upper triangular part
         if (my_prow <= proc_row_curr) {
            rows_in_block_A = rows_in_block_A + nblk;
         }
         if (rows_in_block_A > na_rows) {
            rows_in_block_A = na_rows;
         }
         if ((curr_col_loc_buf + nblk) <= cols_in_buffer) {
            cols_in_block = nblk;   // number of columns in the block of U that will take part in this calculation
         }
         else {
            cols_in_block = cols_in_buffer - curr_col_loc_buf;
         }

         rows_in_block_U = (curr_col_glob/(nblk*np_rows))*nblk;   // corresponds to columns in A;
         if (proc_row_curr >= row_origin_U) {
            rows_in_block_U = rows_in_block_U + nblk;
         }
         if (rows_in_block_U > rows_in_buffer) {
            rows_in_block_U = rows_in_buffer;
         }

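         // Local update: accumulate the panel product A_panel (rows_in_block_A x rows_in_block_U)
         // times U_block (rows_in_block_U x cols_in_block) into M. On the very first Cannon step
         // (j == 1) beta = 0 initializes M; afterwards beta = 1 accumulates.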
         if ((rows_in_block_A > 0)&&(cols_in_block > 0)) {
#ifdef WITH_NVTX
            nvtxRangePushA("GEMM_1");
#endif
            if (j == 1) {
               C_GEMM("N", "N", &rows_in_block_A, &cols_in_block, &rows_in_block_U, &dOne, Buf_to_send_A, &na_rows, U_local_start, &rows_in_block_U, &dZero, Res_ptr, &na_rows);
            }
            else {
               C_GEMM("N", "N", &rows_in_block_A, &cols_in_block, &rows_in_block_U, &dOne, Buf_to_send_A, &na_rows, U_local_start, &rows_in_block_U, &dOne, Res_ptr, &na_rows);
            }
#ifdef WITH_NVTX
            nvtxRangePop();
#endif
         }
         U_local_start = U_local_start + rows_in_block_U*cols_in_block;
         curr_col_loc_res = curr_col_loc_res + nblk;
         Res_ptr = &M[curr_col_loc_res*na_rows];
         curr_col_loc_buf = curr_col_loc_buf + nblk;
      }

      MPI_Wait(&request_A_Send, &status);
      MPI_Wait(&request_A_Recv, &status);
      MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI);   // find out how many elements I have received
      Size_receive_A = (C_INT_TYPE) Size_receive_AMPI;
      Size_receive_A = Size_receive_A / na_rows;

      MPI_Wait(&request_U_Send, &status);
      MPI_Wait(&request_U_Recv, &status);
      MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI);   // find out how many elements I have received
      Size_receive_U = (C_INT_TYPE) Size_receive_UMPI;

      if(j <= ToStore)
      {
         for(k = 0; k < Size_receive_U; k++)
            U_stored[Curr_pos_in_U_stored + k] = Buf_to_receive_U[k];
         Curr_pos_in_U_stored = Curr_pos_in_U_stored + Size_receive_U;
         SizesU[j-1] = Size_receive_U;
      }
   }

   // last iteration of the first pass: use the panels received in the final shift
   rows_in_buffer = (C_INT_TYPE)Buf_to_receive_U[Size_receive_U-1];
   row_origin_U = (my_pcol + my_prow + np_cols + np_rows - 1)%np_rows;

   if((my_pcol >= my_prow)&&(my_pcol >= row_origin_U))   // if I and the sender are from the upper part of the grid
   {
      cols_in_buffer = na_cols;   // then we have the same number of columns in the upper triangular part
      curr_col_loc_res = 0;   // all my block-columns have parts in the upper triangular part
      curr_col_loc_buf = 0;   // I use all the block-columns of the received buffer
   }
   if((my_pcol < my_prow)&&(my_pcol < row_origin_U))   // if I and the sender are from the lower part of the grid
   {
      cols_in_buffer = na_cols - nblk;   // then we have the same number of columns in the upper triangular part, but the first block-column was not included
      curr_col_loc_res = nblk;   // I start the update from the second block-column since the first one is in the lower triangular part
      curr_col_loc_buf = 0;   // I use all the block-columns of the received buffer
   }
   if((my_pcol >= my_prow)&&(my_pcol < row_origin_U))   // if I am from the upper part of the grid and the sender is from the lower part
   {
      cols_in_buffer = na_cols - nblk;   // then I have received one block-column less than I have
      curr_col_loc_res = nblk;   // all my block-columns have parts in the upper triangular part, but the first block-column of the received buffer corresponds to my second one
      curr_col_loc_buf = 0;   // I use all the block-columns of the received buffer
   }
   if((my_pcol < my_prow)&&(my_pcol >= row_origin_U))   // if I am from the lower part of the grid and the sender is from the upper part
   {
      cols_in_buffer = na_cols;   // then I have received the full set of block-columns
      curr_col_loc_res = nblk;   // I start the update from the second block-column since the first one is in the lower triangular part
      curr_col_loc_buf = nblk;   // I skip the first block-column of the buffer, since my first block-column is in the lower part
   }

   num_of_blocks_in_U_buffer = ceil(((math_type)cols_in_buffer - (math_type)curr_col_loc_buf)/(math_type)nblk);

   startPos = (curr_col_loc_buf + nblk)*curr_col_loc_buf/2;
   U_local_start = &Buf_to_receive_U[startPos];
   Res_ptr = &M[curr_col_loc_res*na_rows];

   for (i = 0; i < num_of_blocks_in_U_buffer; i++)
   {
      curr_col_glob = (curr_col_loc_res/nblk)*nblk*np_cols + my_pcol*nblk;
      proc_row_curr = (curr_col_glob/nblk)%np_rows;
      rows_in_block_A = (curr_col_glob/(nblk*np_rows))*nblk;   // in A; not to go down beyond the upper triangular part
      if (my_prow <= proc_row_curr) {
         rows_in_block_A = rows_in_block_A + nblk;
      }
      if (rows_in_block_A > na_rows) {
         rows_in_block_A = na_rows;
      }
      if ((curr_col_loc_buf + nblk) <= cols_in_buffer) {
         cols_in_block = nblk;   // number of columns in the block of U that will take part in this calculation
      }
      else {
         cols_in_block = cols_in_buffer - curr_col_loc_buf;
      }
      rows_in_block_U = (curr_col_glob/(nblk*np_rows))*nblk;   // corresponds to columns in A;
      if (proc_row_curr >= row_origin_U) {
         rows_in_block_U = rows_in_block_U + nblk;
      }
      if (rows_in_block_U > rows_in_buffer) {
         rows_in_block_U = rows_in_buffer;
      }
      if ((rows_in_block_A > 0)&&(cols_in_block > 0)) {
#ifdef WITH_NVTX
         nvtxRangePushA("GEMM_2");
#endif
         // here j == np_rows (value of the loop variable after the main loop); j == 1 only
         // if np_rows == 1, in which case M still has to be initialized with beta = 0
         if (j == 1) {
            C_GEMM("N", "N", &rows_in_block_A, &cols_in_block, &rows_in_block_U, &dOne, Buf_to_receive_A, &na_rows, U_local_start, &rows_in_block_U, &dZero, Res_ptr, &na_rows);
         }
         else {
            C_GEMM("N", "N", &rows_in_block_A, &cols_in_block, &rows_in_block_U, &dOne, Buf_to_receive_A, &na_rows, U_local_start, &rows_in_block_U, &dOne, Res_ptr, &na_rows);
         }
#ifdef WITH_NVTX
         nvtxRangePop();
#endif
      }
      U_local_start = U_local_start + rows_in_block_U*cols_in_block;
      curr_col_loc_res = curr_col_loc_res + nblk;
      Res_ptr = &M[curr_col_loc_res*na_rows];
      curr_col_loc_buf = curr_col_loc_buf + nblk;
   }

#ifdef WITH_NVTX
   nvtxRangePushA("PTRAN");
#endif
   C_PTRAN(&na, &na, &dOne, M, &one, &one, a_desc, &dZero, M_T, &one, &one, a_desc);   // now M_T has the lower part of U^(-H)*A
#ifdef WITH_NVTX
   nvtxRangePop();
#endif

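   // Second pass: multiply the lower part of U^(-H)*A (now in M_T) by U^(-1) from the right,
   // again with Cannon shifts; only the lower triangle of Res is produced here. The packing
   // below mirrors the first pass, but now the *lower* triangular part of M_T is sent.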

   // here we assume that np_rows < np_cols; then I will send to <ratio> processors with a "leap" equal to np_rows; the same holds for receive
   if ((ratio != 1)||(my_prow != 0)) {   // if grid is rectangular or my_prow is not 0
      Buf_pos = Buf_to_send_A;   // I will copy to the send buffer
   }
   else {
      Buf_pos = Buf_to_receive_A;   // if grid is square and my_prow is 0, then I will copy to the receive buffer
   }
   // form array to send by block-columns; we need only the lower triangular part
   num_of_iters = ceil((math_type)na_cols/(math_type)nblk);   // number of my block-columns

   cols_in_buffer_A_my_initial = 0;
   Size_send_A = 0;

   if (my_pcol <= my_prow)   // if I am from the lower part of the grid
   {
      curr_row_loc = 0;   // I will copy all my block-rows
      rows_in_buffer_A_my_initial = na_rows;
   }
   else
   {
      curr_row_loc = ceil((math_type)(((math_type)my_pcol - (math_type)my_prow)/(math_type)np_rows))*nblk;   // I will skip some of my block-rows
      rows_in_buffer_A_my_initial = na_rows - curr_row_loc;
   }

   for(i = 0; i < num_of_iters; i++)   // loop over my block-columns
   {
      curr_col_loc = i*nblk;   // local index of start of the current block-column
      rows_in_block = na_rows - curr_row_loc;   // how many rows do I have in the lower part of the current block-column

      if ((na_cols - curr_col_loc) < nblk) {
         cols_in_block = na_cols - curr_col_loc;   // how many columns do I have in the block-column
      }
      else {
         cols_in_block = nblk;
      }
      if ((rows_in_block > 0)&&(cols_in_block > 0))
      {
         A_local_start = &M_T[curr_col_loc*na_rows + curr_row_loc];
         C_LACPY("A", &rows_in_block, &cols_in_block, A_local_start, &na_rows, Buf_pos, &rows_in_block);   // copy lower part of block-column in the buffer with LDA = length of the lower part of block-column
         Buf_pos = Buf_pos + rows_in_block*cols_in_block;
         Size_send_A = Size_send_A + rows_in_block*cols_in_block;
         cols_in_buffer_A_my_initial = cols_in_buffer_A_my_initial + cols_in_block;
      }
      curr_row_loc = curr_row_loc + ratio*nblk;
   }
   *Buf_pos = (math_type)cols_in_buffer_A_my_initial;   // write number of the columns at the end of the buffer; we will need this for further multiplications on the other processors
   Size_send_A = Size_send_A + 1;

   // now we have the local buffer to send
   // find the lowest processor column among those who will send me
   proc_col_min = np_cols;
   for(i = 0; i < ratio; i++)
   {
      pcol_from_where_to_receive_A = (my_pcol + my_prow + i*np_rows)%np_cols;
      if(pcol_from_where_to_receive_A < proc_col_min)
         proc_col_min = pcol_from_where_to_receive_A;
   }
   // do communications and form local buffers for calculations
   Size_receive_A = 0;   // size of the accumulated buffer
   cols_in_buffer_A = 0;   // number of columns in the accumulated buffer
   rows_in_buffer_A = 0;   // number of rows in the accumulated buffer
   for(i = 0; i < ratio; i++)
   {
      pcol_where_to_send_A = (my_pcol - my_prow - i*np_rows + np_cols)%np_cols;
      pcol_from_where_to_receive_A = (my_pcol + my_prow + i*np_rows)%np_cols;

      // send and receive in the row_comm
      if(ratio != 1)   // if grid is not square
      {
         if(pcol_where_to_send_A != my_pcol)   // if I need to send and receive on this step
         {
            MPI_Sendrecv(Buf_to_send_A, (C_INT_MPI_TYPE) Size_send_A, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) pcol_where_to_send_A, (C_INT_MPI_TYPE) zero, Buf_A, (C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) pcol_from_where_to_receive_A, (C_INT_MPI_TYPE) zero, row_comm, &status);
            MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_A_nowMPI);
            Size_receive_A_now = (C_INT_TYPE) Size_receive_A_nowMPI;

            Size_receive_A = Size_receive_A + Size_receive_A_now - 1;   // we need only the number of elements, so exclude the information about cols_in_buffer_A

            cols_in_buffer_A_now = Buf_A[Size_receive_A_now-1];
            cols_in_buffer_A = cols_in_buffer_A + cols_in_buffer_A_now;

            // determine number of rows in the received buffer
            if(pcol_from_where_to_receive_A <= my_prow)   // if the source is from the lower part of the grid
            {
               rows_in_buffer_A_now = na_rows;
            }
            else
            {
               rows_in_buffer_A_now = na_rows - ceil((math_type)(((math_type)pcol_from_where_to_receive_A - (math_type)my_prow)/(math_type)np_rows))*nblk;   // some of the block-rows have been skipped
            }
            if(rows_in_buffer_A < rows_in_buffer_A_now)
               rows_in_buffer_A = rows_in_buffer_A_now;

            intNumber = pcol_from_where_to_receive_A/np_rows;   // how many processors will send me blocks, such that they will be placed before the current blocks
            if (proc_col_min <= my_prow) {   // if among the procs who will send me there is one with the full set of block-rows in the lower part
               CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*(intNumber-1)*intNumber/2)];   // here I will copy to; formula based on arithm. progression
            }
            else {
               CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*intNumber*(intNumber+1)/2)];   // otherwise, the first block-column will be shorter by one block
            }
            CopyFrom = Buf_A;
         }
         else   // if I need to send to myself on this step, then I will copy from Buf_to_send_A to Buf_to_receive_A
         {
            cols_in_buffer_A_now = cols_in_buffer_A_my_initial;
            cols_in_buffer_A = cols_in_buffer_A + cols_in_buffer_A_now;

            rows_in_buffer_A_now = rows_in_buffer_A_my_initial;
            if(rows_in_buffer_A < rows_in_buffer_A_now)
               rows_in_buffer_A = rows_in_buffer_A_now;

            intNumber = my_pcol/np_rows;   // how many processors will send me blocks, such that they will be placed before the current blocks
            if (proc_col_min <= my_prow) {   // if among the procs who will send me there is one with the full set of block-rows in the lower part
               CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*(intNumber-1)*intNumber/2)];   // here I will copy to; formula based on arithm. progression
            }
            else {
               CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*intNumber*(intNumber+1)/2)];   // otherwise, the first block-column will be shorter by one block
            }
            CopyFrom = Buf_to_send_A;

            Size_receive_A = Size_receive_A + Size_send_A - 1;
         }

         // copy by block-columns
         intNumber = ceil((math_type)cols_in_buffer_A_now/(math_type)nblk);   // how many block-columns I have received on this iteration
         rows_in_block = rows_in_buffer_A_now;
         for(j = 0; j < intNumber; j++)
         {
            if ((j+1)*nblk < cols_in_buffer_A_now) {
               cols_in_block = nblk;
            }
            else {
               cols_in_block = cols_in_buffer_A_now - j*nblk;
            }
            C_LACPY("A", &rows_in_block, &cols_in_block, CopyFrom, &rows_in_block, CopyTo, &rows_in_block);

            CopyFrom = CopyFrom + rows_in_block*cols_in_block;
            CopyTo = CopyTo + nblk*(ratio*rows_in_block - nblk*(ratio-1)*ratio/2);   // I need to leave room for ratio block-columns of the other procs. of the lengths rows_in_block, (rows_in_block-nblk), (rows_in_block-2*nblk) and so on
            rows_in_block = rows_in_block - ratio*nblk;   // number of rows in the next block-columns
         }
      }
      else   // if grid is square
      {
         if(my_prow > 0)
         {
            MPI_Sendrecv(Buf_to_send_A, (C_INT_MPI_TYPE) Size_send_A, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) pcol_where_to_send_A, (C_INT_MPI_TYPE) zero, Buf_to_receive_A, (C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) pcol_from_where_to_receive_A, (C_INT_MPI_TYPE) zero, row_comm, &status);
            MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI);
            Size_receive_A = (C_INT_TYPE) Size_receive_AMPI;

            cols_in_buffer_A = (C_INT_TYPE)Buf_to_receive_A[Size_receive_A-1];
            if(pcol_from_where_to_receive_A <= my_prow)   // if the source is from the lower part of the grid
            {
               rows_in_buffer_A = na_rows;
            }
            else
            {
               rows_in_buffer_A = na_rows - ceil((math_type)(((math_type)pcol_from_where_to_receive_A - (math_type)my_prow)/(math_type)np_rows))*nblk;   // some of the block-rows have been skipped
            }
         }
         else   // if my_prow == 0, then I have already everything in my Buf_to_receive_A buffer
         {
            Size_receive_A = Size_send_A;
            rows_in_buffer_A = rows_in_buffer_A_my_initial;
            cols_in_buffer_A = cols_in_buffer_A_my_initial;
         }
      }
   }
   if(ratio != 1)
   {
      Buf_to_receive_A[Size_receive_A] = cols_in_buffer_A;
      Buf_to_receive_A[Size_receive_A + 1] = rows_in_buffer_A;
      Size_receive_A = Size_receive_A + 2;
   }
   else
   {
      Buf_to_receive_A[Size_receive_A] = rows_in_buffer_A;
      Size_receive_A = Size_receive_A + 1;
   }

   Size_receive_U = Size_U_skewed;
   U_to_calc = U_stored;

   pcol_where_to_send_A = (my_pcol - 1 + np_cols)%np_cols;
   pcol_from_where_to_receive_A = (my_pcol + 1)%np_cols;
   where_to_send_U = (my_prow - 1 + np_rows)%np_rows;
   from_where_to_receive_U = (my_prow + 1)%np_rows;
   Curr_pos_in_U_stored = Size_U_skewed;

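   // Main Cannon loop (second pass). For the first <ToStore> iterations the U panels cached
   // in U_stored during the first pass are reused, so only A has to be shifted; afterwards U
   // is shifted through col_comm again, exactly as in the first pass.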
   for(j = 1; j < np_rows; j++)
   {
      // at this moment I need to send to the neighbour what I have in the "received" arrays; that is why we exchange the pointers of the "received" and "send" arrays
      data_ptr = Buf_to_send_A;
      Buf_to_send_A = Buf_to_receive_A;
      Buf_to_receive_A = data_ptr;

      if (j > ToStore)
      {
         data_ptr = Buf_to_send_U;
         Buf_to_send_U = Buf_to_receive_U;
         Buf_to_receive_U = data_ptr;
      }

      Size_send_A = Size_receive_A;
      MPI_Isend(Buf_to_send_A, (C_INT_MPI_TYPE) Size_send_A, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) pcol_where_to_send_A, (C_INT_MPI_TYPE) zero, row_comm, &request_A_Send);
      MPI_Irecv(Buf_to_receive_A, (C_INT_MPI_TYPE) (ratio*Size_U_stored), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) pcol_from_where_to_receive_A, (C_INT_MPI_TYPE) zero, row_comm, &request_A_Recv);

      Size_send_U = Size_receive_U;
      if (j > ToStore)
      {
         if(j > ToStore + 1)
         {
            MPI_Isend(Buf_to_send_U, (C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_U, (C_INT_MPI_TYPE) zero, col_comm, &request_U_Send);
            U_to_calc = Buf_to_send_U;
         }
         else {
            MPI_Isend(U_to_calc, (C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_U, (C_INT_MPI_TYPE) zero, col_comm, &request_U_Send);
         }
         MPI_Irecv(Buf_to_receive_U, (C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_U, (C_INT_MPI_TYPE) zero, col_comm, &request_U_Recv);
      }

      rows_in_buffer_U = (C_INT_TYPE)U_to_calc[Size_receive_U-1];
      row_of_origin_U = (my_pcol + my_prow + np_cols + j - 1)%np_rows;
      if (my_pcol >= row_of_origin_U) {
         cols_in_buffer_U = na_cols;
      }
      else {
         cols_in_buffer_U = na_cols - nblk;
      }
      cols_in_buffer_A = (C_INT_TYPE)Buf_to_send_A[Size_receive_A-2];
      rows_in_buffer_A = (C_INT_TYPE)Buf_to_send_A[Size_receive_A-1];
      // find the minimal pcol among those who have sent A for this iteration
      col_of_origin_A = np_cols;
      for(i = 0; i < ratio; i++)
      {
         intNumber = (my_pcol + my_prow + i*np_rows + np_cols + j - 1)%np_cols;
         if(intNumber < col_of_origin_A)
            col_of_origin_A = intNumber;
      }

      // find block-column of the result to start update with
      if (my_pcol >= row_of_origin_U) {   // if origin of U is from the upper part
         curr_col_loc_res = 0;   // then I update all columns of Result
      }
      else {
         curr_col_loc_res = nblk;   // the first block column of U corresponds to my second one and I do not need to update the first block-column
      }
      num_of_blocks_in_U_buffer = ceil((math_type)((math_type)cols_in_buffer_U/(math_type)nblk));
      if (my_pcol >= row_of_origin_U) {   // if origin of U is from the upper part
         rows_in_block_U = ceil(((math_type)(my_pcol + 1) - (math_type)row_of_origin_U)/(math_type)np_rows)*nblk;   // blocks in the first block-column of U buffer
      }
      else {
         rows_in_block_U = ratio*nblk;
      }
      U_local_start = U_to_calc;

      for (i = 0; i < num_of_blocks_in_U_buffer; i++)
      {
         // find block-row of the result to start update with; we need to update only the lower triangular part of the result
         curr_col_glob_res = np_cols*nblk*(curr_col_loc_res/nblk) + curr_col_loc_res%nblk + ((np_cols+my_pcol)%np_cols)*nblk;   // global index of the first column to be updated
         // now we need to find my smallest local row index such that the corresponding global index is larger than or equal to <curr_col_glob_res>
         Nb = curr_col_glob_res/nblk;   // how many global block-rows are before the needed one
         owner = Nb%np_rows;   // proc. row index of the owner of the row with the global index equal to <curr_col_glob_res> (it is not necessarily me)
         curr_row_loc_res = (Nb/np_rows)*nblk;
         if(my_prow < owner)
            curr_row_loc_res = curr_row_loc_res + nblk;

         curr_row_loc_A = curr_row_loc_res;   // it is impossible that both col_of_origin_A and row_of_origin_U are from the upper part
         if(col_of_origin_A > my_prow)
            curr_row_loc_A = curr_row_loc_A - nblk;

         rows_in_block = rows_in_buffer_A - curr_row_loc_A;   // rows in the current block of A

         curr_col_loc_U = i*nblk;   // local index in the buffer U of the current column

         if ((curr_col_loc_U + nblk) <= cols_in_buffer_U) {
            cols_in_block = nblk;   // number of columns in the block of U that will take part in this calculation
         }
         else {
            cols_in_block = cols_in_buffer_U - curr_col_loc_U;
         }
         if (rows_in_block_U > rows_in_buffer_U) {
            rows_in_block_U = rows_in_buffer_U;   // rows in the current column of U; also a leading dimension for U
         }
         A_local_index = curr_row_loc_A;
         A_local_start = &Buf_to_send_A[A_local_index];
         Res_ptr = &Res[curr_col_loc_res*na_rows + curr_row_loc_res];

         LDA_A = rows_in_buffer_A;
         LDA_A_new = LDA_A;
         if ((rows_in_block > 0)&&(cols_in_block > 0))
         {
            U_local_start_curr = U_local_start;

            // loop over block-columns of the "active" part of the A buffer; the panels are
            // packed with decreasing leading dimension, so LDA_A shrinks by nblk per step
            for (ii = 0; ii < ceil((math_type)rows_in_block_U/(math_type)nblk); ii++)
            {
               if ((ii+1)*nblk <= cols_in_buffer_A) {
                  rows_in_block_U_curr = nblk;
               }
               else {
                  rows_in_block_U_curr = cols_in_buffer_A - ii*nblk;
               }
#ifdef WITH_NVTX
               nvtxRangePushA("GEMM_3");
#endif
               if ((j == 1)&&(ii == 0)) {
                  C_GEMM("N", "N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &dOne, A_local_start, &LDA_A, U_local_start_curr, &rows_in_block_U, &dZero, Res_ptr, &na_rows);
               }
               else {
                  C_GEMM("N", "N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &dOne, A_local_start, &LDA_A, U_local_start_curr, &rows_in_block_U, &dOne, Res_ptr, &na_rows);
               }
#ifdef WITH_NVTX
               nvtxRangePop();
#endif
               LDA_A_new = LDA_A_new - nblk;

               U_local_start_curr = U_local_start_curr + rows_in_block_U_curr;
               A_local_index = A_local_index - LDA_A + LDA_A*nblk + LDA_A_new;
               A_local_start = &Buf_to_send_A[A_local_index];
               LDA_A = LDA_A_new;
            }
         }

         U_local_start = U_local_start + rows_in_block_U*cols_in_block;
         curr_col_loc_res = curr_col_loc_res + nblk;
         rows_in_block_U = rows_in_block_U + ratio*nblk;
      }

      MPI_Wait(&request_A_Send, &status);
      MPI_Wait(&request_A_Recv, &status);
      MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI);   // find out how many elements I have received
      Size_receive_A = (C_INT_TYPE) Size_receive_AMPI;

      if (j <= ToStore)
      {
         U_to_calc = &U_stored[Curr_pos_in_U_stored];
         Curr_pos_in_U_stored = Curr_pos_in_U_stored + SizesU[j-1];
         Size_receive_U = SizesU[j-1];
      }
      else
      {
         MPI_Wait(&request_U_Send, &status);
         MPI_Wait(&request_U_Recv, &status);
         MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI);   // find out how many elements I have received
         Size_receive_U = (C_INT_TYPE) Size_receive_UMPI;
      }
   }

   // last iteration of the second pass
   if(ToStore < np_rows - 1)
      U_to_calc = Buf_to_receive_U;
   rows_in_buffer_U = (C_INT_TYPE)U_to_calc[Size_receive_U-1];
   row_of_origin_U = (my_pcol + my_prow + np_cols + j - 1)%np_rows;
   if (my_pcol >= row_of_origin_U) {
      cols_in_buffer_U = na_cols;
   }
   else {
      cols_in_buffer_U = na_cols - nblk;
   }
   cols_in_buffer_A = (C_INT_TYPE)Buf_to_receive_A[Size_receive_A-2];
   rows_in_buffer_A = (C_INT_TYPE)Buf_to_receive_A[Size_receive_A-1];
   // find the minimal pcol among those who have sent A for this iteration
   col_of_origin_A = np_cols;
   for(i = 0; i < ratio; i++)
   {
      intNumber = (my_pcol + my_prow + i*np_rows + np_cols + np_rows - 1)%np_cols;
      if(intNumber < col_of_origin_A)
         col_of_origin_A = intNumber;
   }

   // find block-column of the result to start update with
   if (my_pcol >= row_of_origin_U) {   // if origin of U is from the upper part
      curr_col_loc_res = 0;   // then I update all columns of Result
   }
   else {
      curr_col_loc_res = nblk;   // the first block column of U corresponds to my second one and I do not need to update the first block-column
   }
   num_of_blocks_in_U_buffer = ceil((math_type)((math_type)cols_in_buffer_U/(math_type)nblk));
   if (my_pcol >= row_of_origin_U) {   // if origin of U is from the upper part
      rows_in_block_U = ceil(((math_type)(my_pcol + 1) - (math_type)row_of_origin_U)/(math_type)np_rows)*nblk;   // blocks in the first block-column of U buffer
   }
   else {
      rows_in_block_U = ratio*nblk;
   }
   U_local_start = U_to_calc;

   for (i = 0; i < num_of_blocks_in_U_buffer; i++)
   {
      // find block-row of the result to start update with; we need to update only the lower triangular part of the result
      curr_col_glob_res = np_cols*nblk*(curr_col_loc_res/nblk) + curr_col_loc_res%nblk + ((np_cols+my_pcol)%np_cols)*nblk;   // global index of the first column to be updated
      // now we need to find my smallest local row index such that the corresponding global index is larger than or equal to <curr_col_glob_res>
      Nb = curr_col_glob_res/nblk;   // how many global block-rows are before the needed one
      owner = Nb%np_rows;   // proc. row index of the owner of the row with the global index equal to <curr_col_glob_res> (it is not necessarily me)
      curr_row_loc_res = (Nb/np_rows)*nblk;
      if(my_prow < owner)
         curr_row_loc_res = curr_row_loc_res + nblk;

      curr_row_loc_A = curr_row_loc_res;   // it is impossible that both col_of_origin_A and row_of_origin_U are from the upper part
      if(col_of_origin_A > my_prow)
         curr_row_loc_A = curr_row_loc_A - nblk;

      rows_in_block = rows_in_buffer_A - curr_row_loc_A;   // rows in the current block of A

      curr_col_loc_U = i*nblk;   // local index in the buffer U of the current column

      if ((curr_col_loc_U + nblk) <= cols_in_buffer_U) {
         cols_in_block = nblk;   // number of columns in the block of U that will take part in this calculation
      }
      else {
         cols_in_block = cols_in_buffer_U - curr_col_loc_U;
      }
      if (rows_in_block_U > rows_in_buffer_U) {
         rows_in_block_U = rows_in_buffer_U;
      }

      A_local_index = curr_row_loc_A;
      A_local_start = &Buf_to_receive_A[A_local_index];
      Res_ptr = &Res[curr_col_loc_res*na_rows + curr_row_loc_res];
      LDA_A = rows_in_buffer_A;
      LDA_A_new = LDA_A;
      if ((rows_in_block > 0)&&(cols_in_block > 0))
      {
         U_local_start_curr = U_local_start;

         // loop over block-columns of the "active" part of the A buffer
         for (ii = 0; ii < ceil((math_type)rows_in_block_U/(math_type)nblk); ii++)
         {
            if ((ii+1)*nblk <= cols_in_buffer_A) {
               rows_in_block_U_curr = nblk;
            }
            else {
               rows_in_block_U_curr = cols_in_buffer_A - ii*nblk;
            }
#ifdef WITH_NVTX
            nvtxRangePushA("GEMM_4");
#endif
            if ((j == 1)&&(ii == 0)) {
               C_GEMM("N", "N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &dOne, A_local_start, &LDA_A, U_local_start_curr, &rows_in_block_U, &dZero, Res_ptr, &na_rows);
            }
            else {
               C_GEMM("N", "N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &dOne, A_local_start, &LDA_A, U_local_start_curr, &rows_in_block_U, &dOne, Res_ptr, &na_rows);
            }
#ifdef WITH_NVTX
            nvtxRangePop();
#endif
            LDA_A_new = LDA_A_new - nblk;

            U_local_start_curr = U_local_start_curr + rows_in_block_U_curr;
            A_local_index = A_local_index - (LDA_A - rows_in_block) + LDA_A*nblk + LDA_A_new - rows_in_block;
            A_local_start = &Buf_to_receive_A[A_local_index];
            LDA_A = LDA_A_new;
         }
      }

      U_local_start = U_local_start + rows_in_block_U*cols_in_block;
      curr_col_loc_res = curr_col_loc_res + nblk;
      rows_in_block_U = rows_in_block_U + ratio*nblk;
   }

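   // Only the lower triangle of Res has been computed so far; transpose it into M and copy
   // the upper triangle back, so that Res finally holds the full Hermitian matrix
   // U^(-H)*A*U^(-1).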
#ifdef WITH_NVTX
   nvtxRangePushA("PTRAN");
#endif
   C_PTRAN(&na, &na, &dOne, Res, &one, &one, a_desc, &dZero, M, &one, &one, a_desc);
#ifdef WITH_NVTX
   nvtxRangePop();
#endif

#ifdef WITH_NVTX
   nvtxRangePushA("PLACPY");
#endif
   C_PLACPY("U", &na, &na, M, &one, &one, a_desc, Res, &one, &one, a_desc);
#ifdef WITH_NVTX
   nvtxRangePop();
#endif

   free(Buf_to_send_A);
   free(Buf_to_receive_A);
   free(Buf_to_send_U);
   free(Buf_to_receive_U);
   free(M);
   free(M_T);
   if(ratio != 1)
      free(Buf_A);
   free(U_stored);
   free(SizesU);
}

void cannons_reduction_c_impl(math_type* A, math_type* U, int local_rowsCast, int local_colsCast,
                              C_INT_TYPE_PTR a_desc, math_type *Res, C_INT_MPI_TYPE ToStore, C_INT_MPI_TYPE row_comm, C_INT_MPI_TYPE col_comm)
{
   C_INT_TYPE local_rows, local_cols;
   local_rows = (C_INT_TYPE) local_rowsCast;
   local_cols = (C_INT_TYPE) local_colsCast;

   // convert the Fortran communicator handles passed in from ELPA into C handles
   MPI_Comm c_row_comm = MPI_Comm_f2c(row_comm);
   MPI_Comm c_col_comm = MPI_Comm_f2c(col_comm);

   C_INT_TYPE my_prow, my_pcol, np_rows, np_cols;
   C_INT_MPI_TYPE my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI;

   MPI_Comm_rank(c_row_comm, &my_prowMPI);
   MPI_Comm_size(c_row_comm, &np_rowsMPI);
   MPI_Comm_rank(c_col_comm, &my_pcolMPI);
   MPI_Comm_size(c_col_comm, &np_colsMPI);

   my_prow = (C_INT_TYPE) my_prowMPI;
   my_pcol = (C_INT_TYPE) my_pcolMPI;
   np_rows = (C_INT_TYPE) np_rowsMPI;
   np_cols = (C_INT_TYPE) np_colsMPI;

   // BEWARE
   // in Cannon's algorithm, column and row communicators are exchanged:
   // what we usually call row_comm in ELPA is thus passed to the col_comm parameter of the function and vice versa
   // (the order is swapped in the following call)
   // It is a bit unfortunate; maybe it should be changed in the Cannon algorithm to comply with the ELPA standard notation?

   // ELPA convention:   col_comm means that the data is sent in the direction where my_pcol changes (my_prow = const). Size of col_comm is np_cols
   // Cannon convention: col_comm means that the data is sent in the direction where my_pcol = const (my_prow changes). Size of col_comm is np_rows
   // Example of a 2D process grid:
   //   A1 A2 A3 A4
   //   B1 B2 B3 B4
   //   C1 C2 C3 C4
   // In ELPA, {B1, B2, B3, B4} belong to the same col_comm; in Cannon they belong to the same row_comm
   cannons_reduction_impl(A, U, np_rows, np_cols, my_prow, my_pcol, a_desc, Res, ToStore, c_col_comm, c_row_comm);
}