Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2024.05.001
cannon_forw_template.h
1// This file is part of ELPA.
2//
3// The ELPA library was originally created by the ELPA consortium,
4// consisting of the following organizations:
5//
6// - Max Planck Computing and Data Facility (MPCDF), formerly known as
7// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9// Informatik,
10// - Technische Universität München, Lehrstuhl für Informatik mit
11// Schwerpunkt Wissenschaftliches Rechnen,
12// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
14// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
15// and
16// - IBM Deutschland GmbH
17//
18// This particular source code file has been developed within the ELPA-AEO
19// project, which has been a joint effort of
20//
21// - Max Planck Computing and Data Facility (MPCDF), formerly known as
22// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
23// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
24// Informatik,
25// - Technische Universität München, Lehrstuhl für Informatik mit
26// Schwerpunkt Wissenschaftliches Rechnen,
27// - Technische Universität München, Lehrstuhl für Theoretische Chemie,
28// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
29
30// More information can be found here:
31// http://elpa.mpcdf.mpg.de/ and
32// http://elpa-aeo.mpcdf.mpg.de
33//
34// ELPA is free software: you can redistribute it and/or modify
35// it under the terms of version 3 of the
36// GNU Lesser General Public License as published by the Free
37// Software Foundation.
38//
39// ELPA is distributed in the hope that it will be useful,
40// but WITHOUT ANY WARRANTY; without even the implied warranty of
41// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
42// GNU Lesser General Public License for more details.
43//
44// You should have received a copy of the GNU Lesser General Public License
45// along with ELPA. If not, see <http://www.gnu.org/licenses/>
46//
47// ELPA reflects a substantial effort on the part of the original
48// ELPA consortium, and we ask you to respect the spirit of the
49// license that we chose: i.e., please contribute any changes you
50// may have back to the original ELPA library distribution, and keep
51// any derivatives of ELPA under the same license that we chose for
52// the original distribution, the GNU Lesser General Public License.
53//
54// Author: Valeriy Manin (Bergische Universität Wuppertal)
55// integrated into the ELPA library by Pavel Kus, Andreas Marek (MPCDF)
56// ported to GPU by Peter Karpov (MPCDF)
57
58#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
59#define C_INT_TYPE_PTR long int*
60#define C_INT_TYPE long int
61#define BLAS_KIND c_int64_t
62#else
63#define C_INT_TYPE_PTR int*
64#define C_INT_TYPE int
65#define BLAS_KIND c_int
66#endif
67#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
68#define C_INT_MPI_TYPE_PTR long int*
69#define C_INT_MPI_TYPE long int
70#define MPI_KIND c_int64_t
71#else
72#define C_INT_MPI_TYPE_PTR int*
73#define C_INT_MPI_TYPE int
74#define MPI_KIND c_int
75#endif
76
77// it seems that we need these two levels of indirection to correctly expand the macros (see the expansion sketch below)
78#define cannons_reduction_impl_expand2(SUFFIX) cannons_reduction_##SUFFIX
79#define cannons_reduction_impl_expand1(SUFFIX) cannons_reduction_impl_expand2(SUFFIX)
80#define cannons_reduction_impl cannons_reduction_impl_expand1(ELPA_IMPL_SUFFIX)
81
82#define cannons_reduction_c_impl_expand2(SUFFIX) cannons_reduction_c_##SUFFIX
83#define cannons_reduction_c_impl_expand1(SUFFIX) cannons_reduction_c_impl_expand2(SUFFIX)
84#define cannons_reduction_c_impl cannons_reduction_c_impl_expand1(ELPA_IMPL_SUFFIX)
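// For illustration, assuming the including file defines ELPA_IMPL_SUFFIX as, say, double,
// the two-level indirection expands roughly as follows:
//   cannons_reduction_impl
//     -> cannons_reduction_impl_expand1(double)   // ELPA_IMPL_SUFFIX is expanded first
//     -> cannons_reduction_impl_expand2(double)
//     -> cannons_reduction_double
// and cannons_reduction_c_impl expands analogously to cannons_reduction_c_double.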
85
86#include "../general/precision_typedefs.h"
87
88#include "../helpers/lapack_interfaces.h"
89#include "../helpers/scalapack_interfaces.h"
90
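// This header is written as a precision template: math_type (set up via precision_typedefs.h above),
// ELPA_IMPL_SUFFIX, MPI_MATH_DATATYPE_PRECISION_C and the C_LACPY/C_GEMM/C_PTRAN/C_PLACPY wrappers
// are expected to be provided by the including compilation unit, presumably once per real/complex
// and single/double variant, so that a separate cannons_reduction_<suffix> is generated for each precision.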
91void cannons_reduction_impl(math_type* A, math_type* U,
92 C_INT_TYPE np_rows, C_INT_TYPE np_cols, C_INT_TYPE my_prow, C_INT_TYPE my_pcol,
93 C_INT_TYPE_PTR a_desc, math_type *Res, C_INT_MPI_TYPE ToStore,
94 MPI_Comm row_comm, MPI_Comm col_comm,
95 int wantDebug, int useGPU, intptr_t *gpublasHandle)
96{
97 // Input matrices:
98 // - A: full matrix
99 // - U: upper triangular matrix U^(-1)
100 // Output matrix:
101 // - Res = U^(-H)*A*U^(-1), where U^(-H) := (U^(-1))^H
102 // ToStore = cannon_buffer_size. Increasing the buffer size might make it faster, but costs memory. By default cannon_buffer_size=0
103 // GPU port supports only ToStore=0
104 // row_comm: communicator along rows
105 // col_comm: communicator along columns
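   // Overall scheme, as implemented below:
   //   (1) skew the initial distributions: A along row_comm, the packed upper-triangular U^(-1) along col_comm;
   //   (2) a first sweep of ring shifts and local GEMMs accumulates M = A*U^(-1); the C_PTRAN call then
   //       forms M_T = M^H, i.e. (the lower part of) U^(-H)*A, which presumes A is Hermitian/symmetric;
   //   (3) a second, analogous sweep accumulates Res = M_T*U^(-1) = U^(-H)*A*U^(-1); a final
   //       C_PTRAN + C_PLACPY("U", ...) returns the result consistently in the upper triangle of Res.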
106
107 C_INT_TYPE na, nblk, i, j, Size_send_A, Size_receive_A, Size_send_U, Size_receive_U, Buf_rows, Buf_cols,
108 pcol_where_to_send_A, pcol_from_where_to_receive_A, where_to_send_U, from_where_to_receive_U,
109 last_proc_row, last_proc_col, cols_in_buffer_A, rows_in_buffer_A, intNumber;
110 C_INT_TYPE ratio, num_of_iters, cols_in_buffer, rows_in_block, rows_in_buffer, curr_col_loc, cols_in_block,
111 curr_col_glob, curr_row_loc, Size_receive_A_now, Nb, owner, cols_in_buffer_A_now;
112 C_INT_MPI_TYPE Size_receive_A_nowMPI, Size_receive_AMPI, Size_receive_UMPI;
113
114 math_type *Buf_to_send_A, *Buf_to_receive_A, *Buf_to_send_U, *Buf_to_receive_U, *data_ptr, *Buf_A,
115 *Buf_pos, *U_local_start, *Res_ptr, *M, *M_T, *A_local_start, *U_local_start_curr, *U_stored,
116 *CopyTo, *CopyFrom, *U_to_calc;
117
118 C_INT_TYPE row_of_origin_U, rows_in_block_U, num_of_blocks_in_U_buffer, k, startPos, cols_in_buffer_U,
119 rows_in_buffer_U, col_of_origin_A, curr_row_loc_res, curr_row_loc_A, curr_col_glob_res;
120 C_INT_TYPE curr_col_loc_res, curr_col_loc_buf, proc_row_curr, curr_col_loc_U, A_local_index,
121 LDA_A, LDA_A_new, index_row_A_for_LDA, ii, rows_in_block_U_curr, width, row_origin_U,
122 rows_in_block_A, cols_in_buffer_A_my_initial, rows_in_buffer_A_my_initial, proc_col_min;
123 C_INT_TYPE *SizesU;
124 C_INT_TYPE Size_U_skewed, Size_U_stored, Curr_pos_in_U_stored, rows_in_buffer_A_now;
125 math_type dOne = 1.0;
126 math_type dZero = 0.0;
127 C_INT_TYPE one = 1;
128 C_INT_TYPE zero = 0;
129 C_INT_TYPE na_rows, na_cols;
130
131 MPI_Status status;
132 MPI_Request request_A_Recv;
133 MPI_Request request_A_Send;
134 MPI_Request request_U_Recv;
135 MPI_Request request_U_Send;
136
137 na = a_desc[2];
138 nblk = a_desc[4];
139 na_rows = numroc_(&na, &nblk, &my_prow, &zero, &np_rows);
140 na_cols = numroc_(&na, &nblk, &my_pcol, &zero, &np_cols);
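   // a_desc is a standard ScaLAPACK array descriptor: entry 2 (0-based) holds the global matrix
   // dimension and entry 4 the (square) block size; numroc_ returns how many rows/columns of the
   // block-cyclically distributed matrix this process row/column owns.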
141
142 //if(ToStore > (np_rows -1))
143 // if((my_prow == 0)&&(my_pcol == 0))
144 // printf("Buffering level is larger than (np_rows-1) !!!\n");
145 //if((my_prow == 0)&&(my_pcol == 0))
146 // printf("Buffering level = %d\n", ToStore);
147
149 if (np_cols%np_rows != 0)
150 {
151 //if((my_prow == 0)&& (my_pcol ==0))
152 // printf("!!!!! np_cols must be a multiple of np_rows!!!!! I do nothing! \n");
153 return;
154 }
155
156 if (np_cols < np_rows)
157 {
158 //if((my_prow == 0)&& (my_pcol ==0))
159 // printf("np_cols < np_rows \n");
160 return;
161 }
162
163 ratio = np_cols/np_rows;
164 last_proc_row = ((na-1)/nblk) % np_rows; // processor row having the last block-row of matrix
165 last_proc_col = ((na-1)/nblk) % np_cols; // processor column having the last block-column of matrix
166
168 if (na%nblk == 0) {
169 if (my_pcol <= last_proc_col) {
170 Buf_cols = na_cols;
171 }
172 else {
173 Buf_cols = na_cols + nblk;
174 }
175 }
176 else {
177 if (my_pcol < last_proc_col) {
178 Buf_cols = na_cols;
179 }
180 else if (my_pcol > last_proc_col) {
181 Buf_cols = na_cols + nblk;
182 }
183 else { // if my_pcol == last_proc_col
184 Buf_cols = na_cols + nblk - na_cols%nblk;
185 }
186 }
187
188 if (na%nblk == 0) {
189 if (my_prow <= last_proc_row) {
190 Buf_rows = na_rows + 1; // Soheil: added + 1 to be able to accommodate the MPI message sizes in the case of a purely blocked configuration, e.g. na=100 with 9 MPI ranks and nblk=30, or with 4 MPI ranks and nblk=45
191 }
192 else {
193 Buf_rows = na_rows + nblk;
194 }
195 }
196 else {
197 if (my_prow < last_proc_row) {
198 Buf_rows = na_rows;
199 }
200 else if (my_prow > last_proc_row) {
201 Buf_rows = na_rows + nblk;
202 }
203 else { // if my_prow == last_proc_row
204 Buf_rows = na_rows + nblk - na_rows%nblk;
205 }
206 }
207
208 intNumber = ceil((math_type)na/(math_type)(np_cols*nblk)); // max. possible number of the local block columns of U
209 Size_U_stored = ratio*nblk*nblk*intNumber*(intNumber+1)/2 + 2; // number of local elements from the upper triangular part that every proc. has (max. possible value among all the procs.)
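   // Rough derivation of this bound: the k-th local block-column of the packed upper-triangular
   // U^(-1) holds at most k*nblk rows of width nblk, so summing k = 1..intNumber gives
   // nblk*nblk*intNumber*(intNumber+1)/2 elements; the factor ratio accounts for the up to <ratio>
   // column sets a process may hold after the skew, and the "+ 2" leaves room for the count
   // value(s) appended at the end of the send/receive buffers below.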
210
211 U_stored = malloc((Size_U_stored*(ToStore+1))*sizeof(math_type));
212 SizesU = malloc(ToStore*sizeof(C_INT_TYPE)); // here will be stored the sizes of the buffers of U that I have stored
213 Buf_to_send_A = malloc(ratio*Buf_cols*Buf_rows*sizeof(math_type));
214 Buf_to_receive_A = malloc(ratio*Buf_cols*Buf_rows*sizeof(math_type));
215 Buf_to_send_U = malloc(Size_U_stored*sizeof(math_type));
216 Buf_to_receive_U = malloc(Size_U_stored*sizeof(math_type));
217 if(ratio != 1)
218 Buf_A = malloc(Buf_cols*Buf_rows*sizeof(math_type)); // in this case we first receive the data into this initial buffer and afterwards place the block-columns at the needed positions of the calculation buffer
219 M = calloc(na_rows*na_cols, sizeof(math_type));
220 M_T = malloc(na_rows*na_cols*sizeof(math_type));
221
222 math_type *Buf_to_send_receive_A_dev;
223 math_type *Buf_to_send_receive_U_dev;
224 math_type *M_dev;
225
226 math_type *U_local_start_dev, *Res_ptr_dev; // pointers for first PxGEMM
227 math_type *A_local_start_dev, *U_local_start_curr_dev; // pointers for second PxGEMM
228
229 if (useGPU){
231
232 gpuErrCheck( gpuMalloc((intptr_t *)&Buf_to_send_receive_A_dev , ratio*Buf_cols*Buf_rows*sizeof(math_type)) );
233 gpuErrCheck( gpuMalloc((intptr_t *)&Buf_to_send_receive_U_dev , Size_U_stored*sizeof(math_type)) );
234 gpuErrCheck( gpuMalloc((intptr_t *)&M_dev, na_rows*na_cols*sizeof(math_type)) );
235 gpuErrCheck( gpuMemset((intptr_t *)M_dev, 0, na_rows*na_cols*sizeof(math_type)) );
236 }
237
239
240 NVTX_RANGE_PUSH("initial reordering of A");
241
242 // here we assume that np_rows <= np_cols; then I will send to <ratio> processors with a "leap" of np_rows; the same holds for the receive
243 if(ratio != 1) {
244#ifdef WITH_NVTX
245 nvtxRangePushA("LACPY");
246#endif
247 C_LACPY("A", &na_rows, &na_cols, A, &na_rows, Buf_to_send_A, &na_rows); // copy A to Buf_to_send_A
248#ifdef WITH_NVTX
249 nvtxRangePop();
250#endif
251 }
252 Size_receive_A = 0;
253
254 // receive from different processors and place in my buffer for calculation;
255 for(i = 0; i < ratio; i++)
256 {
257 pcol_where_to_send_A = (my_pcol - my_prow - i*np_rows + np_cols)%np_cols;
258 pcol_from_where_to_receive_A = (my_pcol + my_prow + i*np_rows)%np_cols;
259
260 // send and receive in the row_comm
261 if(ratio != 1) // if grid is not square
262 {
263 if(pcol_where_to_send_A != my_pcol)
264 {
265 MPI_Sendrecv(Buf_to_send_A, (C_INT_MPI_TYPE) (na_cols*na_rows) , MPI_MATH_DATATYPE_PRECISION_C,
266 (C_INT_MPI_TYPE) pcol_where_to_send_A , (C_INT_MPI_TYPE) zero,
267 Buf_A , (C_INT_MPI_TYPE) (na_rows*Buf_cols), MPI_MATH_DATATYPE_PRECISION_C,
268 (C_INT_MPI_TYPE) pcol_from_where_to_receive_A, (C_INT_MPI_TYPE) zero,
269 row_comm, &status);
270 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_A_nowMPI);
271 Size_receive_A_now = (C_INT_TYPE) Size_receive_A_nowMPI;
272 Size_receive_A_now = Size_receive_A_now/na_rows; // how many columns of A I have received
273 }
274 else {
275 Size_receive_A_now = na_cols;
276 }
277
278 Size_receive_A = Size_receive_A + Size_receive_A_now; // here accumulate number of columns of A that I will receive
279
280 // now I need to copy the received block to my buffer for A
281 intNumber = pcol_from_where_to_receive_A/np_rows; // how many blocks I will receive, such that I will need to put them before the just received block
282
283 CopyTo = &Buf_to_receive_A[intNumber*na_rows*nblk]; // here I will start copying the received buffer
284 if (pcol_where_to_send_A != my_pcol) {
285 CopyFrom = Buf_A;
286 }
287 else {
288 CopyFrom = A;
289 }
290
291 intNumber = ceil((math_type)Size_receive_A_now/(math_type)nblk); // how many block-columns I have received
292 for(j = 0; j < intNumber; j++)
293 {
294 width = nblk; // width of the current block column
295 if(nblk*(j+1) > Size_receive_A_now)
296 width = Size_receive_A_now - nblk*j;
297 C_LACPY("A", &na_rows, &width, CopyFrom, &na_rows, CopyTo, &na_rows);
298 CopyTo = CopyTo + na_rows*nblk*ratio;
299 CopyFrom = CopyFrom + na_rows*nblk;
300 }
301 }
302
303 else { // if grid is square then simply receive from one processor to a calculation buffer
304 if(my_prow > 0)
305 {
306 C_LACPY("A", &na_rows, &na_cols, A, &na_rows, Buf_to_send_A, &na_rows); // copy A to Buf_to_send_A
307 MPI_Sendrecv(Buf_to_send_A , (C_INT_MPI_TYPE) (na_cols*na_rows) , MPI_MATH_DATATYPE_PRECISION_C,
308 (C_INT_MPI_TYPE) pcol_where_to_send_A , (C_INT_MPI_TYPE) zero,
309 Buf_to_receive_A, (C_INT_MPI_TYPE) (na_rows*Buf_cols), MPI_MATH_DATATYPE_PRECISION_C,
310 (C_INT_MPI_TYPE) pcol_from_where_to_receive_A, (C_INT_MPI_TYPE) zero,
311 row_comm, &status);
312 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI);
313 Size_receive_A = (C_INT_TYPE) Size_receive_AMPI;
314 Size_receive_A = Size_receive_A/na_rows; // how many columns of A I have received
315 }
316 else
317 {
318 C_LACPY("A", &na_rows, &na_cols, A, &na_rows, Buf_to_receive_A, &na_rows); // copy A to the received buffer if I do not need to send
319 Size_receive_A = na_cols;
320 }
321 }
322 }
323
324 NVTX_RANGE_POP(); // initial reordering of A
325
327
328 NVTX_RANGE_PUSH("initial reordering of U");
329
330 // form array to send by block-columns
331 num_of_iters = ceil((math_type)na_cols/(math_type)nblk); // number of my block-columns
332
333 where_to_send_U = (my_prow - my_pcol + np_cols)%np_rows; // shift = my_pcol; we assume that np_cols%np_rows = 0
334 from_where_to_receive_U = (my_pcol + my_prow)%np_rows;
335
336 if (where_to_send_U == my_prow) { // if I will not need to send my local part of U, then copy my local data to the "received" buffer
337 Buf_pos = Buf_to_receive_U;
338 }
339 else {
340 Buf_pos = Buf_to_send_U; // else form the array to send
341 }
342 // find the first local block belonging to the upper part of matrix U
343 if (my_pcol >= my_prow) { // if I am in the upper part of proc. grid
344 curr_col_loc = 0; // my first local block-column has block from the upper part of matrix
345 }
346 else {
347 curr_col_loc = 1; //ceil((math_type)(((math_type)my_prow - (math_type)my_pcol)/(math_type)np_cols)) always will give 1 since np_cols > np_rows
348 }
349 num_of_iters = num_of_iters - curr_col_loc; // I will exclude the first <curr_col_loc> block-columns since they do not have blocks from the upper part of matrix U
350 curr_col_loc = curr_col_loc*nblk; // local index of the found block-column
351
352 if (my_pcol >= my_prow ) {
353 rows_in_block = ceil(((math_type)(my_pcol + 1) - (math_type)my_prow)/(math_type)np_rows)*nblk;
354 }
355 else {
356 rows_in_block = ratio*nblk;
357 }
358 Size_send_U = 0;
359 for(i = 0; i < num_of_iters; i++) // loop over my block-columns, which have blocks in the upper part of U
360 {
361 if (rows_in_block > na_rows) {
362 rows_in_block = na_rows;
363 }
364 if ((na_cols - curr_col_loc) < nblk) {
365 cols_in_block = na_cols - curr_col_loc; // how many columns do I have in the current block-column
366 }
367 else {
368 cols_in_block = nblk;
369 }
370 if ((rows_in_block > 0)&&(cols_in_block > 0))
371 {
372 data_ptr = &U[curr_col_loc*na_rows]; // pointer to start of the current block-column to be copied to buffer
373 C_LACPY("A", &rows_in_block, &cols_in_block, data_ptr, &na_rows, Buf_pos, &rows_in_block); // copy upper part of block-column in the buffer with LDA = length of the upper part of block-column
374 Buf_pos = Buf_pos + rows_in_block*cols_in_block; // go to the position where the next block-column will be copied
375 Size_send_U = Size_send_U + rows_in_block*cols_in_block;
376 }
377 curr_col_loc = curr_col_loc + nblk; // go to the next local block-column of my local array U
378 rows_in_block = rows_in_block + ratio*nblk;
379 }
380 rows_in_buffer = rows_in_block - ratio*nblk; // remove redundant addition from the previous loop
381 *Buf_pos = (math_type)rows_in_buffer; // write number of the rows at the end of the buffer; we will need this for further multiplications on the other processors
382 Size_send_U = Size_send_U + 1;
383
384 //send and receive
385 if (where_to_send_U != my_prow)
386 {
387 // send and receive in the col_comm
388 MPI_Sendrecv(Buf_to_send_U , (C_INT_MPI_TYPE) Size_send_U , MPI_MATH_DATATYPE_PRECISION_C,
389 (C_INT_MPI_TYPE) where_to_send_U , (C_INT_MPI_TYPE) zero,
390 Buf_to_receive_U, (C_INT_MPI_TYPE) (Buf_rows*na_cols), MPI_MATH_DATATYPE_PRECISION_C,
391 (C_INT_MPI_TYPE) from_where_to_receive_U, (C_INT_MPI_TYPE) zero,
392 col_comm, &status);
393 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI); // find out how many elements I have received
394 Size_receive_U = (C_INT_TYPE) Size_receive_UMPI;
395 }
396 else {// if I do not need to send
397 Size_receive_U = Size_send_U; // how many elements I "have received"; the needed data I have already copied to the "receive" buffer
398 }
399 for(i = 0; i < Size_receive_U; i++)
400 U_stored[i] = Buf_to_receive_U[i];
401 Size_U_skewed = Size_receive_U;
402 Curr_pos_in_U_stored = Size_U_skewed;
403
404 NVTX_RANGE_POP(); // initial reordering of U
405
407
408 pcol_where_to_send_A = (my_pcol - 1 + np_cols)%np_cols;
409 pcol_from_where_to_receive_A = (my_pcol + 1)%np_cols;
410 where_to_send_U = (my_prow - 1 + np_rows)%np_rows;
411 from_where_to_receive_U = (my_prow + 1)%np_rows;
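   // Main Cannon loop: in each iteration every process ships its current A panel one process
   // column to the "left" along row_comm and its current U panel one process row "up" along
   // col_comm (both with wrap-around), overlapping the MPI_Isend/MPI_Irecv with the local GEMM
   // updates of M; the U panels received in the first ToStore iterations are additionally cached
   // in U_stored for reuse in the second sweep.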
412
413 NVTX_RANGE_PUSH("loop j<np_rows");
414 for(j = 1; j < np_rows; j++)
415 {
416 // at this moment I need to send to the neighbour what I have in the "received" arrays; that is why I exchange the pointers of the "received" and "send" arrays
417 data_ptr = Buf_to_send_A;
418 Buf_to_send_A = Buf_to_receive_A;
419 Buf_to_receive_A = data_ptr;
420
421 data_ptr = Buf_to_send_U;
422 Buf_to_send_U = Buf_to_receive_U;
423 Buf_to_receive_U = data_ptr;
424
426 Size_send_A = Size_receive_A; // number of block-columns of A and block-rows of U to send (that I have received on the previous step)
427 MPI_Isend(Buf_to_send_A, (C_INT_MPI_TYPE) (Size_send_A*na_rows), MPI_MATH_DATATYPE_PRECISION_C,
428 (C_INT_MPI_TYPE) pcol_where_to_send_A, (C_INT_MPI_TYPE) zero, row_comm, &request_A_Send);
429 MPI_Irecv(Buf_to_receive_A, (C_INT_MPI_TYPE) (Buf_cols*na_rows*ratio), MPI_MATH_DATATYPE_PRECISION_C,
430 (C_INT_MPI_TYPE) pcol_from_where_to_receive_A, (C_INT_MPI_TYPE) zero, row_comm, &request_A_Recv);
431
433 Size_send_U = Size_receive_U;
434 MPI_Isend(Buf_to_send_U, (C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C,
435 (C_INT_MPI_TYPE) where_to_send_U, (C_INT_MPI_TYPE) zero, col_comm, &request_U_Send);
436 MPI_Irecv(Buf_to_receive_U, (C_INT_MPI_TYPE) (Buf_rows*na_cols), MPI_MATH_DATATYPE_PRECISION_C,
437 (C_INT_MPI_TYPE) from_where_to_receive_U, (C_INT_MPI_TYPE) zero, col_comm, &request_U_Recv);
438
440 rows_in_buffer = (C_INT_TYPE)Buf_to_send_U[Size_receive_U-1]; // copied above: *Buf_pos = (math_type)rows_in_buffer;
441 row_origin_U = (my_pcol + my_prow + np_cols + j - 1)%np_rows;
442
443 if((my_pcol >= my_prow)&&(my_pcol >= row_origin_U)) // if I and sender are from the upper part of grid
444 {
445 cols_in_buffer = na_cols; // then we have the same number of columns in the upper triangular part
446 curr_col_loc_res = 0; // all my block-columns have parts in the upper triangular part
447 curr_col_loc_buf = 0; // I use all the block-columns of the received buffer
448 }
449 if((my_pcol < my_prow)&&(my_pcol < row_origin_U)) // if I and sender are from the lower part of grid
450 {
451 cols_in_buffer = na_cols - nblk; // then we have the same number of columns in the upper triangular part, but the first block-column was not included
452 curr_col_loc_res = nblk; // I start the update from the second block-column since the first one is in the lower triangular part
453 curr_col_loc_buf = 0; // I use all the block-columns of the received buffer
454 }
455 if((my_pcol >= my_prow)&&(my_pcol < row_origin_U)) // if I am from the upper part of grid and sender is from the lower part
456 {
457 cols_in_buffer = na_cols - nblk; // then I have received one block-column fewer than I have myself
458 curr_col_loc_res = nblk; // all my block-columns have parts in the upper triangular part, but the first block-column of the received buffers corresponds to my second one
459 curr_col_loc_buf = 0; // I use all the block-columns of the received buffer
460 }
461 if((my_pcol < my_prow)&&(my_pcol >= row_origin_U)) // if I am from the lower part of grid and sender is from the upper part
462 {
463 cols_in_buffer = na_cols; // then I have received the full set of block-columns
464 curr_col_loc_res = nblk; // I start the update from the second block-column since the first one is in the lower triangular part
465 curr_col_loc_buf = nblk; // I skip the first block-column of the buffer, since my first block-column is in the lower part
466 }
467
468 num_of_blocks_in_U_buffer = ceil(((math_type)cols_in_buffer - (math_type)curr_col_loc_buf)/(math_type)nblk);
469
470 startPos = (curr_col_loc_buf + nblk)*curr_col_loc_buf/2;
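   // startPos is the element offset into the packed U buffer at which the block-columns relevant
   // for my part of the result begin; with curr_col_loc_buf being either 0 or nblk here, this
   // evaluates to 0 or nblk*nblk respectively.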
471
472 if (useGPU){
473 gpuErrCheck( gpuMemcpy((intptr_t *)Buf_to_send_receive_A_dev, (intptr_t *)Buf_to_send_A, ratio*Buf_cols*Buf_rows*sizeof(math_type), gpuMemcpyHostToDevice) );
474 gpuErrCheck( gpuMemcpy((intptr_t *)Buf_to_send_receive_U_dev, (intptr_t *)Buf_to_send_U, Size_U_stored*sizeof(math_type), gpuMemcpyHostToDevice) );
475 }
476
477 if (useGPU){
478 U_local_start_dev = Buf_to_send_receive_U_dev + startPos;
479 Res_ptr_dev = M_dev + curr_col_loc_res*na_rows;
480 }
481 else {
482 U_local_start = &Buf_to_send_U[startPos];
483 Res_ptr = &M[curr_col_loc_res*na_rows];
484 }
485
486 NVTX_RANGE_PUSH("loop i<num_of_blocks_in_U_buffer");
487 for (i = 0; i < num_of_blocks_in_U_buffer; i++)
488 {
489 curr_col_glob = (curr_col_loc_res/nblk)*nblk*np_cols + my_pcol*nblk;
490 proc_row_curr = (curr_col_glob/nblk)%np_rows;
491 rows_in_block_A = (curr_col_glob/(nblk*np_rows))*nblk; // in A; not to go down beyond the upper triangular part
492 if (my_prow <= proc_row_curr) {
493 rows_in_block_A = rows_in_block_A + nblk;
494 }
495 if (rows_in_block_A > na_rows) {
496 rows_in_block_A = na_rows;
497 }
498 if ((curr_col_loc_buf + nblk) <= cols_in_buffer) {
499 cols_in_block = nblk; // number of columns in the block of U that will take part in this calculation
500 }
501 else {
502 cols_in_block = cols_in_buffer - curr_col_loc_buf;
503 }
504
505 rows_in_block_U = (curr_col_glob/(nblk*np_rows))*nblk; // corresponds to columns in A;
506 if (proc_row_curr >= row_origin_U) {
507 rows_in_block_U = rows_in_block_U + nblk;
508 }
509 if (rows_in_block_U > rows_in_buffer) {
510 rows_in_block_U = rows_in_buffer;
511 }
512
513 if ((rows_in_block_A > 0)&&(cols_in_block > 0)) {
514
515 NVTX_RANGE_PUSH("GEMM_1");
516 // Res_ptr = Buf_to_send_A*U_local_start + Res_ptr
517 // M = Buf_to_send_A*Buf_to_send_U + M
518 if (useGPU){
519 gpublasXgemm(gpublasHandle, 'N', 'N',
520 rows_in_block_A, cols_in_block, rows_in_block_U, dOne,
521 Buf_to_send_receive_A_dev, na_rows,
522 U_local_start_dev, rows_in_block_U, dOne,
523 Res_ptr_dev, na_rows);
524 if (wantDebug) gpuDeviceSynchronize();
525 }
526 else {
527 C_GEMM("N", "N", &rows_in_block_A, &cols_in_block, &rows_in_block_U, &dOne,
528 Buf_to_send_A, &na_rows, U_local_start, &rows_in_block_U, &dOne, Res_ptr, &na_rows);
529 }
530 NVTX_RANGE_POP(); // GEMM_1
531 }
532
533 curr_col_loc_res = curr_col_loc_res + nblk;
534 curr_col_loc_buf = curr_col_loc_buf + nblk;
535
536 if (useGPU) {
537 U_local_start_dev = U_local_start_dev + rows_in_block_U*cols_in_block;
538 Res_ptr_dev = M_dev + curr_col_loc_res*na_rows;
539 }
540 else {
541 U_local_start = U_local_start + rows_in_block_U*cols_in_block;
542 Res_ptr = &M[curr_col_loc_res*na_rows];
543 }
544
545 }
546 NVTX_RANGE_POP(); // loop i<num_of_blocks_in_U_buffer
547
548 MPI_Wait(&request_A_Send, &status);
549 MPI_Wait(&request_A_Recv, &status);
550
551 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI); // find out how many elements I have received
552 Size_receive_A = (C_INT_TYPE) Size_receive_AMPI;
553 Size_receive_A = Size_receive_A / na_rows;
554
555
556 MPI_Wait(&request_U_Send, &status);
557 MPI_Wait(&request_U_Recv, &status);
558 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI); // find out how many elements I have received
559 Size_receive_U = (C_INT_TYPE) Size_receive_UMPI;
561 if(j <= ToStore)
562 {
563 for(k = 0; k < Size_receive_U; k++)
564 U_stored[Curr_pos_in_U_stored + k] = Buf_to_receive_U[k];
565 Curr_pos_in_U_stored = Curr_pos_in_U_stored + Size_receive_U;
566 SizesU[j-1] = Size_receive_U;
567 }
568 }
569 NVTX_RANGE_POP(); // loop j<np_rows
570
571
573 rows_in_buffer = (C_INT_TYPE)Buf_to_receive_U[Size_receive_U-1];
574 row_origin_U = (my_pcol + my_prow + np_cols + np_rows -1)%np_rows;
575
576 if((my_pcol >= my_prow)&&(my_pcol >= row_origin_U)) // if I and sender are from the upper part of grid
577 {
578 cols_in_buffer = na_cols; // then we have the same number of columns in the upper triangular part
579 curr_col_loc_res = 0; // all my block-columns have parts in the upper triangular part
580 curr_col_loc_buf = 0; // I use all the block-columns of the received buffer
581 }
582 if((my_pcol < my_prow)&&(my_pcol < row_origin_U)) // if I and sender are from the lower part of grid
583 {
584 cols_in_buffer = na_cols - nblk; // then we have the same number of columns in the upper triangular part, but the first block-column was not included
585 curr_col_loc_res = nblk; // I start the update from the second block-column since the first one is in the lower triangular part
586 curr_col_loc_buf = 0; // I use all the block-columns of the received buffer
587 }
588 if((my_pcol >= my_prow)&&(my_pcol < row_origin_U)) // if I am from the upper part of grid and sender is from the lower part
589 {
590 cols_in_buffer = na_cols - nblk; // then I have received one block-column fewer than I have myself
591 curr_col_loc_res = nblk; // all my block-columns have parts in the upper triangular part, but the first block-column of the received buffers corresponds to my second one
592 curr_col_loc_buf = 0; // I use all the block-columns of the received buffer
593 }
594 if((my_pcol < my_prow)&&(my_pcol >= row_origin_U)) // if I am from the lower part of grid and sender is from the upper part
595 {
596 cols_in_buffer = na_cols; // then I have received the full set of block-columns
597 curr_col_loc_res = nblk; // I start the update from the second block-column since the first one is in the lower triangular part
598 curr_col_loc_buf = nblk; // I skip the first block-column of the buffer, since my first block-column is in the lower part
599 }
600
601 num_of_blocks_in_U_buffer = ceil(((math_type)cols_in_buffer - (math_type)curr_col_loc_buf)/(math_type)nblk);
602
603 startPos = (curr_col_loc_buf + nblk)*curr_col_loc_buf/2;
604
605 if (useGPU){
606 gpuErrCheck( gpuMemcpy((intptr_t *)Buf_to_send_receive_A_dev, (intptr_t *)Buf_to_receive_A, ratio*Buf_cols*Buf_rows*sizeof(math_type), gpuMemcpyHostToDevice) );
607 gpuErrCheck( gpuMemcpy((intptr_t *)Buf_to_send_receive_U_dev, (intptr_t *)Buf_to_receive_U, Size_U_stored*sizeof(math_type), gpuMemcpyHostToDevice) );
608 }
609
610 if (useGPU){
611 U_local_start_dev = Buf_to_send_receive_U_dev + startPos;
612 Res_ptr_dev = M_dev + curr_col_loc_res*na_rows;
613 }
614 else {
615 U_local_start = &Buf_to_receive_U[startPos];
616 Res_ptr = &M[curr_col_loc_res*na_rows];
617 }
618
619#ifdef WITH_NVTX
620 nvtxRangePushA("loop-last i<num_of_blocks_in_U_buffer");
621#endif
622 for (i = 0; i < num_of_blocks_in_U_buffer; i++)
623 {
624 curr_col_glob = (curr_col_loc_res/nblk)*nblk*np_cols + my_pcol*nblk;
625 proc_row_curr = (curr_col_glob/nblk)%np_rows;
626 rows_in_block_A = (curr_col_glob/(nblk*np_rows))*nblk; // in A; not to go down beyond the upper triangular part
627 if (my_prow <= proc_row_curr) {
628 rows_in_block_A = rows_in_block_A + nblk;
629 }
630 if (rows_in_block_A > na_rows) {
631 rows_in_block_A = na_rows;
632 }
633 if ((curr_col_loc_buf + nblk) <= cols_in_buffer) {
634 cols_in_block = nblk; // number of columns in the block of U that will take part in this calculation
635 }
636 else {
637 cols_in_block = cols_in_buffer - curr_col_loc_buf;
638 }
639 rows_in_block_U = (curr_col_glob/(nblk*np_rows))*nblk; // corresponds to columns in A;
640 if (proc_row_curr >= row_origin_U) {
641 rows_in_block_U = rows_in_block_U + nblk;
642 }
643 if (rows_in_block_U > rows_in_buffer) {
644 rows_in_block_U = rows_in_buffer;
645 }
646 if ((rows_in_block_A > 0)&&(cols_in_block > 0)) {
647#ifdef WITH_NVTX
648 nvtxRangePushA("GEMM_1_last");
649#endif
650 // Res_ptr = Buf_to_receive_A*U_local_start + Res_ptr
651 // M = Buf_to_receive_A*Buf_to_receive_U + M
652 if (useGPU){
653 gpublasXgemm(gpublasHandle, 'N', 'N',
654 rows_in_block_A, cols_in_block, rows_in_block_U, dOne,
655 Buf_to_send_receive_A_dev, na_rows,
656 U_local_start_dev, rows_in_block_U, dOne,
657 Res_ptr_dev, na_rows);
658 if (wantDebug) gpuDeviceSynchronize();
659 }
660 else {
661 C_GEMM("N", "N", &rows_in_block_A, &cols_in_block, &rows_in_block_U, &dOne,
662 Buf_to_receive_A, &na_rows, U_local_start, &rows_in_block_U, &dOne, Res_ptr, &na_rows);
663 }
664#ifdef WITH_NVTX
665 nvtxRangePop();
666#endif
667 }
668
669 curr_col_loc_res = curr_col_loc_res + nblk;
670 curr_col_loc_buf = curr_col_loc_buf + nblk;
671
672 if (useGPU) {
673 U_local_start_dev = U_local_start_dev + rows_in_block_U*cols_in_block;
674 Res_ptr_dev = M_dev + curr_col_loc_res*na_rows;
675 }
676 else {
677 U_local_start = U_local_start + rows_in_block_U*cols_in_block;
678 Res_ptr = &M[curr_col_loc_res*na_rows];
679 }
680
681 }
682#ifdef WITH_NVTX
683 nvtxRangePop(); // loop-last i<num_of_blocks_in_U_buffer
684#endif
685
686 if (useGPU) gpuErrCheck( gpuMemcpy((intptr_t *)M, (intptr_t *)M_dev, na_rows*na_cols*sizeof(math_type), gpuMemcpyDeviceToHost) );
687
689#ifdef WITH_NVTX
690 nvtxRangePushA("PTRAN");
691#endif
692 C_PTRAN(&na, &na, &dOne, M, &one, &one, a_desc, &dZero, M_T, &one, &one, a_desc); // M_T <- M, now M_T has lower part of U^(-H)*A
693#ifdef WITH_NVTX
694 nvtxRangePop();
695#endif
696
698
700#ifdef WITH_NVTX
701 nvtxRangePushA("initial reordering of A");
702#endif
703
704 // here we assume that np_rows <= np_cols; then I will send to <ratio> processors with a "leap" of np_rows; the same holds for the receive
705 if ((ratio != 1)||(my_prow != 0)) { // if grid is rectangular or my_prow is not 0
706 Buf_pos = Buf_to_send_A; // I will copy to the send buffer
707 }
708 else {
709 Buf_pos = Buf_to_receive_A; // if grid is square and my_prow is 0, then I will copy to the received buffer
710 }
711 // form array to send by block-columns; we need only lower triangular part
712 num_of_iters = ceil((math_type)na_cols/(math_type)nblk); // number of my block-columns
713
714 cols_in_buffer_A_my_initial = 0;
715 Size_send_A = 0;
716
717 if (my_pcol <= my_prow) // if I am from the lower part of grid
718 {
719 curr_row_loc = 0; // I will copy all my block-rows
720 rows_in_buffer_A_my_initial = na_rows;
721 }
722 else
723 {
724 curr_row_loc = ceil((math_type)(((math_type)my_pcol - (math_type)my_prow)/(math_type)np_rows))*nblk; // I will skip some of my block-rows
725 rows_in_buffer_A_my_initial = na_rows - curr_row_loc;
726 }
727
728 for(i = 0; i < num_of_iters; i++) // loop over my block-columns
729 {
730 curr_col_loc = i*nblk; // local index of start of the current block-column
731 rows_in_block = na_rows - curr_row_loc; // how many rows do I have in the lower part of the current block-column
732
733 if ((na_cols - curr_col_loc) < nblk) {
734 cols_in_block = na_cols - curr_col_loc; // how many columns do I have in the block-column
735 }
736 else {
737 cols_in_block = nblk;
738 }
739 if ((rows_in_block > 0)&&(cols_in_block > 0))
740 {
741 A_local_start = &M_T[curr_col_loc*na_rows + curr_row_loc];
742 C_LACPY("A", &rows_in_block, &cols_in_block, A_local_start, &na_rows, Buf_pos, &rows_in_block); // copy lower part of block-column in the buffer with LDA = length of the lower part of block-column
743 Buf_pos = Buf_pos + rows_in_block*cols_in_block;
744 Size_send_A = Size_send_A + rows_in_block*cols_in_block;
745 cols_in_buffer_A_my_initial = cols_in_buffer_A_my_initial + cols_in_block;
746 }
747 curr_row_loc = curr_row_loc + ratio*nblk;
748 }
749 *Buf_pos = (math_type)cols_in_buffer_A_my_initial; // write the number of columns at the end of the buffer; we will need this for further multiplications on the other processors
750 Size_send_A = Size_send_A + 1;
751
752 // now we have the local buffer to send
753 // find the lowest processor column among those who will send me
754 proc_col_min = np_cols;
755 for(i = 0; i < ratio; i++)
756 {
757 pcol_from_where_to_receive_A = (my_pcol + my_prow + i*np_rows)%np_cols;
758 if(pcol_from_where_to_receive_A < proc_col_min)
759 proc_col_min = pcol_from_where_to_receive_A;
760 }
761 // do communications and form local buffers for calculations
762 Size_receive_A = 0; // size of the accumulated buffer
763 cols_in_buffer_A = 0; // number of columns in the accumulated buffer
764 rows_in_buffer_A = 0; // number of rows in the accumulated buffer
765 for(i = 0; i < ratio; i++)
766 {
767 pcol_where_to_send_A = (my_pcol - my_prow - i*np_rows + np_cols)%np_cols;
768 pcol_from_where_to_receive_A = (my_pcol + my_prow + i*np_rows)%np_cols;
769
770 // send and receive in the row_comm
771 if(ratio != 1) // if grid is not square
772 {
773 if(pcol_where_to_send_A != my_pcol) // if I need to send and receive on this step
774 {
775 MPI_Sendrecv(Buf_to_send_A, (C_INT_MPI_TYPE) Size_send_A , MPI_MATH_DATATYPE_PRECISION_C,
776 (C_INT_MPI_TYPE) pcol_where_to_send_A , (C_INT_MPI_TYPE) zero,
777 Buf_A , (C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C,
778 (C_INT_MPI_TYPE) pcol_from_where_to_receive_A , (C_INT_MPI_TYPE) zero,
779 row_comm, &status);
780
781 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_A_nowMPI);
782 Size_receive_A_now = (C_INT_TYPE) Size_receive_A_nowMPI;
783
784 Size_receive_A = Size_receive_A + Size_receive_A_now - 1; // we need only number of elements, so exclude information about cols_in_buffer_A
785
786 cols_in_buffer_A_now = Buf_A[Size_receive_A_now-1];
787 cols_in_buffer_A = cols_in_buffer_A + cols_in_buffer_A_now;
788
789 // determine number of rows in the received buffer
790 if(pcol_from_where_to_receive_A <= my_prow) // if source is from the lower part of grid
791 {
792 rows_in_buffer_A_now = na_rows;
793 }
794 else
795 {
796 rows_in_buffer_A_now = na_rows - ceil((math_type)(((math_type)pcol_from_where_to_receive_A - (math_type)my_prow)/(math_type)np_rows))*nblk; // some of the block-rows have been skipped
797 }
798 if(rows_in_buffer_A < rows_in_buffer_A_now)
799 rows_in_buffer_A = rows_in_buffer_A_now;
800
801 intNumber = pcol_from_where_to_receive_A/np_rows; // how many processors will send me blocks, such that they will be placed before the current blocks
802 if (proc_col_min <= my_prow) { // if among procs who will send me there is one with the full sets of block-rows in the lower part
803 CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*(intNumber-1)*intNumber/2)]; // here I will copy to; formula based on arithm. progression
804 }
805 else {
806 CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*intNumber*(intNumber+1)/2)]; // otherwise, the first block-column will be shorter by one block
807 }
808 CopyFrom = Buf_A;
809 }
810 else // if I need to send to myself on this step, then I will copy from Buf_to_send_A to Buf_to_receive_A
811 {
812 cols_in_buffer_A_now = cols_in_buffer_A_my_initial;
813 cols_in_buffer_A = cols_in_buffer_A + cols_in_buffer_A_now;
814
815 rows_in_buffer_A_now = rows_in_buffer_A_my_initial;
816 if(rows_in_buffer_A < rows_in_buffer_A_now)
817 rows_in_buffer_A = rows_in_buffer_A_now;
818
819 intNumber = my_pcol/np_rows; // how many processors will send me blocks, such that they will be placed before the current blocks
820 if (proc_col_min <= my_prow) { // if among procs who will send me there is one with the full sets of block-rows in the lower part
821 CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*(intNumber-1)*intNumber/2)]; // here I will copy to; formula based on arithm. progression
822 }
823 else {
824 CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*intNumber*(intNumber+1)/2)]; // otherwise, the first block-column will be shorter by one block
825 }
826 CopyFrom = Buf_to_send_A;
827
828 Size_receive_A = Size_receive_A + Size_send_A - 1;
829 }
830
831 // copy by block-columns
832 intNumber = ceil((math_type)cols_in_buffer_A_now/(math_type)nblk); // how many block-columns I have received on this iteration
833 rows_in_block = rows_in_buffer_A_now;
834 for(j = 0; j < intNumber; j++)
835 {
836 if ((j+1)*nblk < cols_in_buffer_A_now) {
837 cols_in_block = nblk;
838 }
839 else {
840 cols_in_block = cols_in_buffer_A_now - j*nblk;
841 }
842 C_LACPY("A", &rows_in_block, &cols_in_block, CopyFrom, &rows_in_block, CopyTo, &rows_in_block);
843
844 CopyFrom = CopyFrom + rows_in_block*cols_in_block;
845 CopyTo = CopyTo + nblk*(ratio*rows_in_block - nblk*(ratio-1)*ratio/2); // I need to leave place for ratio block-columns of the other procs. of the lengths rows_in_block, (rows_in_block-nblk), (rows_in_block-2*nblk) and so on
846 rows_in_block = rows_in_block - ratio*nblk; // number of rows in the next block-columns
847 }
848 }
849 else // if grid is square
850 {
851 if(my_prow > 0)
852 {
853 MPI_Sendrecv(Buf_to_send_A , (C_INT_MPI_TYPE) Size_send_A , MPI_MATH_DATATYPE_PRECISION_C,
854 (C_INT_MPI_TYPE) pcol_where_to_send_A , (C_INT_MPI_TYPE) zero,
855 Buf_to_receive_A, (C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C,
856 (C_INT_MPI_TYPE) pcol_from_where_to_receive_A, (C_INT_MPI_TYPE) zero,
857 row_comm, &status);
858
859 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI);
860
861 Size_receive_A = (C_INT_TYPE) Size_receive_AMPI;
862
863 cols_in_buffer_A = (C_INT_TYPE)Buf_to_receive_A[Size_receive_A-1];
864 if(pcol_from_where_to_receive_A <= my_prow) // if source is from the lower part of grid
865 {
866 rows_in_buffer_A = na_rows;
867 }
868 else
869 {
870 rows_in_buffer_A = na_rows - ceil((math_type)(((math_type)pcol_from_where_to_receive_A - (math_type)my_prow)/(math_type)np_rows))*nblk; // some of the block-rows have been skipped
871 }
872 }
873 else // if my_prow == 0, then I have already everything in my Buf_to_receive_A buffer
874 {
875 Size_receive_A = Size_send_A;
876 rows_in_buffer_A = rows_in_buffer_A_my_initial;
877 cols_in_buffer_A = cols_in_buffer_A_my_initial;
878 }
879 }
880 }
881 if(ratio != 1)
882 {
883 Buf_to_receive_A[Size_receive_A] = cols_in_buffer_A;
884 Buf_to_receive_A[Size_receive_A + 1] = rows_in_buffer_A;
885 Size_receive_A = Size_receive_A + 2;
886 }
887 else
888 {
889 Buf_to_receive_A[Size_receive_A] = rows_in_buffer_A;
890 Size_receive_A = Size_receive_A + 1;
891 }
892
893#ifdef WITH_NVTX
894 nvtxRangePop(); // initial reordering of A
895#endif
896
898
899 Size_receive_U = Size_U_skewed;
900 U_to_calc = U_stored;
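   // Second sweep: for the first ToStore iterations the U panels cached in U_stored during the
   // first sweep are reused via U_to_calc (their sizes were recorded in SizesU), so only A needs
   // to be communicated; from iteration ToStore+1 onwards U is again shifted along col_comm.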
901
903
904 pcol_where_to_send_A = (my_pcol - 1 + np_cols)%np_cols;
905 pcol_from_where_to_receive_A = (my_pcol + 1)%np_cols;
906 where_to_send_U = (my_prow - 1 + np_rows)%np_rows;
907 from_where_to_receive_U = (my_prow + 1)%np_rows;
908 Curr_pos_in_U_stored = Size_U_skewed;
909
910 if (useGPU) {
911 gpuErrCheck( gpuMemset((intptr_t *)M_dev, 0, na_rows*na_cols*sizeof(math_type)) ); // Reset the buffer
912 }
913
914#ifdef WITH_NVTX
915 nvtxRangePushA("loop j<np_rows");
916#endif
917 for(j = 1; j < np_rows; j++)
918 {
919 // at this moment I need to send to the neighbour what I have in the "received" arrays; that is why I exchange the pointers of the "received" and "send" arrays
920 data_ptr = Buf_to_send_A;
921 Buf_to_send_A = Buf_to_receive_A;
922 Buf_to_receive_A = data_ptr;
923
924 if (j > ToStore)
925 {
926 data_ptr = Buf_to_send_U;
927 Buf_to_send_U = Buf_to_receive_U;
928 Buf_to_receive_U = data_ptr;
929 }
930
932 Size_send_A = Size_receive_A;
933 MPI_Isend(Buf_to_send_A, (C_INT_MPI_TYPE) Size_send_A, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) pcol_where_to_send_A, (C_INT_MPI_TYPE) zero, row_comm, &request_A_Send);
934 MPI_Irecv(Buf_to_receive_A, (C_INT_MPI_TYPE) (ratio*Size_U_stored), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) pcol_from_where_to_receive_A, (C_INT_MPI_TYPE) zero, row_comm, &request_A_Recv);
935
937 Size_send_U = Size_receive_U;
938 if (j > ToStore)
939 {
940 if(j > ToStore + 1)
941 {
942 MPI_Isend(Buf_to_send_U, (C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_U, (C_INT_MPI_TYPE) zero, col_comm, &request_U_Send);
943 U_to_calc = Buf_to_send_U;
944 }
945 else {
946 MPI_Isend(U_to_calc, (C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_U, (C_INT_MPI_TYPE) zero, col_comm, &request_U_Send);
947 }
948 MPI_Irecv(Buf_to_receive_U, (C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_U, (C_INT_MPI_TYPE) zero, col_comm, &request_U_Recv);
949 }
950
952 rows_in_buffer_U = (C_INT_TYPE)U_to_calc[Size_receive_U-1];
953 row_of_origin_U = (my_pcol + my_prow + np_cols + j - 1)%np_rows;
954 if (my_pcol >= row_of_origin_U) {
955 cols_in_buffer_U = na_cols;
956 }
957 else {
958 cols_in_buffer_U = na_cols - nblk;
959 }
960 cols_in_buffer_A = (C_INT_TYPE)Buf_to_send_A[Size_receive_A-2];
961 rows_in_buffer_A = (C_INT_TYPE)Buf_to_send_A[Size_receive_A-1];
962 // find the minimal pcol among those who have sent A for this iteration
963 col_of_origin_A = np_cols;
964 for(i = 0; i < ratio; i++)
965 {
966 intNumber = (my_pcol + my_prow + i*np_rows + np_cols + j - 1)%np_cols;
967 if(intNumber < col_of_origin_A)
968 col_of_origin_A = intNumber;
969 }
970
972 // find block-column of the result to start update with
973 if (my_pcol >= row_of_origin_U) { // if origin of U is from the upper part
974 curr_col_loc_res = 0; // then I update all columns of Result
975 }
976 else {
977 curr_col_loc_res = nblk; // the first block column of U corresponds to my second one and I do not need to update the first block-column
978 }
979 num_of_blocks_in_U_buffer = ceil((math_type)((math_type)cols_in_buffer_U/(math_type)nblk));
980 if (my_pcol >= row_of_origin_U) { // if origin of U is from the upper part
981 rows_in_block_U = ceil(((math_type)(my_pcol + 1) - (math_type)row_of_origin_U)/(math_type)np_rows)*nblk; // blocks in the first block-column of U buffer
982 }
983 else {
984 rows_in_block_U = ratio*nblk;
985 }
986
987 if (useGPU){
988 gpuErrCheck( gpuMemcpy((intptr_t *)Buf_to_send_receive_A_dev, (intptr_t *)Buf_to_send_A, ratio*Buf_cols*Buf_rows*sizeof(math_type), gpuMemcpyHostToDevice) );
989 gpuErrCheck( gpuMemcpy((intptr_t *)Buf_to_send_receive_U_dev, (intptr_t *)U_to_calc, Size_U_stored*sizeof(math_type), gpuMemcpyHostToDevice) );
990 }
991
992 if (useGPU){
993 U_local_start_dev = Buf_to_send_receive_U_dev;
994 }
995 else {
996 U_local_start = U_to_calc;
997 }
998
999 NVTX_RANGE_PUSH("loop i<num_of_blocks_in_U_buffer");
1000 for (i = 0; i < num_of_blocks_in_U_buffer; i++)
1001 {
1002 // find block-row of the result to start update with; we need to update only lower triangular part of result
1003 curr_col_glob_res = np_cols*nblk*(curr_col_loc_res/nblk) + curr_col_loc_res%nblk + ((np_cols+my_pcol)%np_cols)*nblk; // global index of the first column to be updated
1004 // now we need to find the smallest local row index such that the corresponding global index is larger than or equal to <curr_col_glob_res>
1005 Nb = curr_col_glob_res/nblk; // how many global block-rows are before the needed one
1006 owner = Nb%np_rows; // proc. row index of the owner of row with the global index equal to <curr_col_glob_res> (it is not necessarily me)
1007 curr_row_loc_res = (Nb/np_rows)*nblk;
1008 if(my_prow < owner)
1009 curr_row_loc_res = curr_row_loc_res + nblk;
1010
1011 curr_row_loc_A = curr_row_loc_res; // it is impossible that both col_of_origin_A and row_of_origin_U are from the upper part
1012 if(col_of_origin_A > my_prow)
1013 curr_row_loc_A = curr_row_loc_A - nblk;
1014
1015 rows_in_block = rows_in_buffer_A - curr_row_loc_A; // rows in current block of A
1016
1017 curr_col_loc_U = i*nblk; // local index in the buffer U of the current column
1018
1019 if ((curr_col_loc_U + nblk) <= cols_in_buffer_U) {
1020 cols_in_block = nblk; // number of columns in the block of U that will take part in this calculation
1021 }
1022 else {
1023 cols_in_block = cols_in_buffer_U - curr_col_loc_U;
1024 }
1025 if (rows_in_block_U > rows_in_buffer_U) {
1026 rows_in_block_U = rows_in_buffer_U; // rows in current column of U; also a leading dimension for U
1027 }
1028 A_local_index = curr_row_loc_A;
1029
1030 if (useGPU){
1031 A_local_start_dev = Buf_to_send_receive_A_dev + A_local_index;
1032 Res_ptr_dev = M_dev + curr_col_loc_res*na_rows + curr_row_loc_res; // we reuse M_dev buffer instead of introducing Res_dev
1033 }
1034 else {
1035 A_local_start = &Buf_to_send_A[A_local_index];
1036 Res_ptr = &Res[curr_col_loc_res*na_rows + curr_row_loc_res];
1037 }
1038
1039 LDA_A = rows_in_buffer_A;
1040 LDA_A_new = LDA_A;
1041 if ((rows_in_block > 0)&&(cols_in_block > 0))
1042 {
1043 if (useGPU){
1044 U_local_start_curr_dev = U_local_start_dev;
1045 }
1046 else {
1047 U_local_start_curr = U_local_start;
1048 }
1049
1050 // loop over block-columns of the "active" part of the A buffer
1051 for (ii = 0; ii < ceil((math_type)rows_in_block_U/(math_type)nblk); ii++)
1052 {
1053 if ((ii+1)*nblk <= cols_in_buffer_A) {
1054 rows_in_block_U_curr = nblk;
1055 }
1056 else {
1057 rows_in_block_U_curr = cols_in_buffer_A - ii*nblk;
1058 }
1059
1060 NVTX_RANGE_PUSH("GEMM_2");
1061 // Res_ptr = A_local_start*U_local_start_curr + Res_ptr
1062 // Res = Buf_to_send_A*Buf_to_send_U + Res
1063 if (useGPU){
1064 gpublasXgemm(gpublasHandle, 'N', 'N',
1065 rows_in_block, cols_in_block, rows_in_block_U_curr, dOne,
1066 A_local_start_dev, LDA_A,
1067 U_local_start_curr_dev, rows_in_block_U, dOne,
1068 Res_ptr_dev, na_rows);
1069 if (wantDebug) gpuDeviceSynchronize();
1070 }
1071 else {
1072 C_GEMM("N", "N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &dOne,
1073 A_local_start, &LDA_A, U_local_start_curr, &rows_in_block_U, &dOne, Res_ptr, &na_rows);
1074 }
1076
1077 LDA_A_new = LDA_A_new - nblk;
1078 A_local_index = A_local_index - LDA_A + LDA_A*nblk + LDA_A_new;
1079
1080 if (useGPU){
1081 A_local_start_dev = Buf_to_send_receive_A_dev + A_local_index;
1082 U_local_start_curr_dev = U_local_start_curr_dev + rows_in_block_U_curr;
1083 }
1084 else {
1085 A_local_start = &Buf_to_send_A[A_local_index];
1086 U_local_start_curr = U_local_start_curr + rows_in_block_U_curr;
1087 }
1088
1089 LDA_A = LDA_A_new;
1090 }
1091 }
1092
1093 if (useGPU){
1094 U_local_start_dev = U_local_start_dev + rows_in_block_U*cols_in_block;
1095 }
1096 else {
1097 U_local_start = U_local_start + rows_in_block_U*cols_in_block;
1098 }
1099
1100 curr_col_loc_res = curr_col_loc_res + nblk;
1101 rows_in_block_U = rows_in_block_U + ratio*nblk;
1102 }
1103 NVTX_RANGE_POP(); // loop i<num_of_blocks_in_U_buffer
1104
1105 MPI_Wait(&request_A_Send, &status);
1106 MPI_Wait(&request_A_Recv, &status);
1107 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI); // find out how many elements I have received
1108 Size_receive_A = (C_INT_TYPE) Size_receive_AMPI;
1109
1110 if (j <= ToStore)
1111 {
1112 U_to_calc = &U_stored[Curr_pos_in_U_stored];
1113 Curr_pos_in_U_stored = Curr_pos_in_U_stored + SizesU[j-1];
1114 Size_receive_U = SizesU[j-1];
1115 }
1116 else
1117 {
1118 MPI_Wait(&request_U_Send, &status);
1119 MPI_Wait(&request_U_Recv, &status);
1120 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI); // find out how many elements I have received
1121 Size_receive_U = (C_INT_TYPE) Size_receive_UMPI;
1122 }
1123 }
1124#ifdef WITH_NVTX
1125 nvtxRangePop(); // loop j<np_rows
1126#endif
1127
1128
1130 if(ToStore < np_rows - 1)
1131 U_to_calc = Buf_to_receive_U;
1132 rows_in_buffer_U = (C_INT_TYPE)U_to_calc[Size_receive_U-1];
1133 row_of_origin_U = (my_pcol + my_prow + np_cols + j - 1)%np_rows;
1134 if (my_pcol >= row_of_origin_U) {
1135 cols_in_buffer_U = na_cols;
1136 }
1137 else {
1138 cols_in_buffer_U = na_cols - nblk;
1139 }
1140 cols_in_buffer_A = (C_INT_TYPE)Buf_to_receive_A[Size_receive_A-2];
1141 rows_in_buffer_A = (C_INT_TYPE)Buf_to_receive_A[Size_receive_A-1];
1142 // find the minimal pcol among those who have sent A for this iteration
1143 col_of_origin_A = np_cols;
1144 for(i = 0; i < ratio; i++)
1145 {
1146 intNumber = (my_pcol + my_prow + i*np_rows + np_cols + np_rows - 1)%np_cols;
1147 if(intNumber < col_of_origin_A)
1148 col_of_origin_A = intNumber;
1149 }
1150
1151 // find block-column of the result to start update with
1152 if (my_pcol >= row_of_origin_U) { // if origin of U is from the upper part
1153 curr_col_loc_res = 0; // then I update all columns of Result
1154 }
1155 else {
1156 curr_col_loc_res = nblk; // the first block column of U corresponds to my second one and I do not need to update the first block-column
1157 }
1158 num_of_blocks_in_U_buffer = ceil((math_type)((math_type)cols_in_buffer_U/(math_type)nblk));
1159 if (my_pcol >= row_of_origin_U) { // if origin of U is from the upper part
1160 rows_in_block_U = ceil(((math_type)(my_pcol + 1) - (math_type)row_of_origin_U)/(math_type)np_rows)*nblk; // blocks in the first block-column of U buffer
1161 }
1162 else {
1163 rows_in_block_U = ratio*nblk;
1164 }
1165
1166 if (useGPU){
1167 gpuErrCheck( gpuMemcpy((intptr_t *)Buf_to_send_receive_A_dev, (intptr_t *)Buf_to_receive_A, ratio*Buf_cols*Buf_rows*sizeof(math_type), gpuMemcpyHostToDevice) );
1168 gpuErrCheck( gpuMemcpy((intptr_t *)Buf_to_send_receive_U_dev, (intptr_t *)U_to_calc, Size_U_stored*sizeof(math_type), gpuMemcpyHostToDevice) );
1169 }
1170
1171 if (useGPU){
1172 U_local_start_dev = Buf_to_send_receive_U_dev;
1173 }
1174 else {
1175 U_local_start = U_to_calc;
1176 }
1177
1178 for (i = 0; i < num_of_blocks_in_U_buffer; i++)
1179 {
1180 // find block-row of the result to start update with; we need to update only lower triangular part of result
1181 curr_col_glob_res = np_cols*nblk*(curr_col_loc_res/nblk) + curr_col_loc_res%nblk + ((np_cols+my_pcol)%np_cols)*nblk; // global index of the first column to be updated
1182 // now we need to find the smallest local row index such that the corresponding global index is larger than or equal to <curr_col_glob_res>
1183 Nb = curr_col_glob_res/nblk; // how many global block-rows are before the needed one
1184 owner = Nb%np_rows; // proc. row index of the owner of row with the global index equal to <curr_col_glob_res> (it is not necessarily me)
1185 curr_row_loc_res = (Nb/np_rows)*nblk;
1186 if(my_prow < owner)
1187 curr_row_loc_res = curr_row_loc_res + nblk;
1188
1189 curr_row_loc_A = curr_row_loc_res; // it is impossible that both col_of_origin_A and row_of_origin_U are from the upper part
1190 if(col_of_origin_A > my_prow)
1191 curr_row_loc_A = curr_row_loc_A - nblk;
1192
1193 rows_in_block = rows_in_buffer_A - curr_row_loc_A; // rows in the current block of A
1194
1195 curr_col_loc_U = i*nblk; // local index in the buffer U of the current column
1196
1197 if ((curr_col_loc_U + nblk) <= cols_in_buffer_U) {
1198 cols_in_block = nblk; // number of columns in the block of U that will take part in this calculation
1199 }
1200 else {
1201 cols_in_block = cols_in_buffer_U - curr_col_loc_U;
1202 }
1203 if (rows_in_block_U > rows_in_buffer_U) {
1204 rows_in_block_U = rows_in_buffer_U;
1205 }
1206
1207 A_local_index = curr_row_loc_A;
1208
1209 if (useGPU){
1210 A_local_start_dev = Buf_to_send_receive_A_dev + A_local_index;
1211 Res_ptr_dev = M_dev + curr_col_loc_res*na_rows + curr_row_loc_res; // we reuse M_dev buffer instead of introducing Res_dev
1212 }
1213 else {
1214 A_local_start = &Buf_to_receive_A[A_local_index];
1215 Res_ptr = &Res[curr_col_loc_res*na_rows + curr_row_loc_res];
1216 }
1217
1218 LDA_A = rows_in_buffer_A;
1219 LDA_A_new = LDA_A;
1220
1221 if ((rows_in_block > 0) &&(cols_in_block > 0))
1222 {
1223 if (useGPU){
1224 U_local_start_curr_dev = U_local_start_dev;
1225 }
1226 else {
1227 U_local_start_curr = U_local_start;
1228 }
1229
1230 // loop over block-columns of the "active" part of the A buffer
1231 for (ii = 0; ii < ceil((math_type)rows_in_block_U/(math_type)nblk); ii++)
1232 {
1233 if ((ii+1)*nblk <= cols_in_buffer_A) {
1234 rows_in_block_U_curr = nblk;
1235 }
1236 else {
1237 rows_in_block_U_curr = cols_in_buffer_A - ii*nblk;
1238 }
1239
1240 NVTX_RANGE_PUSH("GEMM_2_last");
1241 // Res_ptr = A_local_start*U_local_start_curr + Res_ptr
1242 // Res = Buf_to_send_A*Buf_to_send_U + Res
1243 if (useGPU){
1244 gpublasXgemm(gpublasHandle, 'N', 'N',
1245 rows_in_block, cols_in_block, rows_in_block_U_curr, dOne,
1246 A_local_start_dev, LDA_A,
1247 U_local_start_curr_dev, rows_in_block_U, dOne,
1248 Res_ptr_dev, na_rows);
1249 if (wantDebug) gpuDeviceSynchronize();
1250 }
1251 else {
1252 C_GEMM("N", "N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &dOne,
1253 A_local_start, &LDA_A, U_local_start_curr, &rows_in_block_U, &dOne, Res_ptr, &na_rows);
1254 }
1256
1257 LDA_A_new = LDA_A_new - nblk;
1258 A_local_index = A_local_index - (LDA_A - rows_in_block) + LDA_A*nblk + LDA_A_new - rows_in_block;
1259
1260 if (useGPU){
1261 A_local_start_dev = Buf_to_send_receive_A_dev + A_local_index;
1262 U_local_start_curr_dev = U_local_start_curr_dev + rows_in_block_U_curr;
1263 }
1264 else {
1265 A_local_start = &Buf_to_receive_A[A_local_index];
1266 U_local_start_curr = U_local_start_curr + rows_in_block_U_curr;
1267 }
1268
1269 LDA_A = LDA_A_new;
1270 }
1271 }
1272
1273 if (useGPU){
1274 U_local_start_dev = U_local_start_dev + rows_in_block_U*cols_in_block;
1275 }
1276 else {
1277 U_local_start = U_local_start + rows_in_block_U*cols_in_block;
1278 }
1279
1280 curr_col_loc_res = curr_col_loc_res + nblk;
1281 rows_in_block_U = rows_in_block_U + ratio*nblk;
1282 }
1283
1284 if (useGPU) gpuErrCheck( gpuMemcpy((intptr_t *)Res, (intptr_t *)M_dev, na_rows*na_cols*sizeof(math_type), gpuMemcpyDeviceToHost) );
1285
1286#ifdef WITH_NVTX
1287 nvtxRangePushA("PTRAN");
1288#endif
1289 C_PTRAN(&na, &na, &dOne, Res, &one, &one, a_desc, &dZero, M, &one, &one, a_desc); // M <- Res^T (or M <- Res^H)
1290#ifdef WITH_NVTX
1291 nvtxRangePop();
1292#endif
1293
1294#ifdef WITH_NVTX
1295 nvtxRangePushA("PLACPY");
1296#endif
1297 C_PLACPY("U", &na, &na, M, &one, &one, a_desc, Res, &one, &one, a_desc); // Res <- M (upper triangular part only)
1298#ifdef WITH_NVTX
1299 nvtxRangePop();
1300#endif
1301
1302 if (useGPU){
1303 gpuErrCheck( gpuFree((intptr_t *)Buf_to_send_receive_A_dev) );
1304 gpuErrCheck( gpuFree((intptr_t *)Buf_to_send_receive_U_dev) );
1305 gpuErrCheck( gpuFree((intptr_t *)M_dev) );
1306 }
1307
1308 free(Buf_to_send_A);
1309 free(Buf_to_receive_A);
1310 free(Buf_to_send_U);
1311 free(Buf_to_receive_U);
1312 free(M);
1313 free(M_T);
1314 if(ratio != 1)
1315 free(Buf_A);
1316 free(U_stored);
1317 free(SizesU);
1318
1319}
1320
1321void cannons_reduction_c_impl(math_type* A, math_type* U, int local_rowsCast, int local_colsCast,
1322 C_INT_TYPE_PTR a_desc, math_type *Res, C_INT_MPI_TYPE ToStore,
1323 C_INT_MPI_TYPE row_comm, C_INT_MPI_TYPE col_comm,
1324 int wantDebug, int useGPU, intptr_t *gpublasHandle)
1325{
1326 C_INT_TYPE local_rows, local_cols;
1327 local_rows = (C_INT_TYPE) local_rowsCast;
1328 local_cols = (C_INT_TYPE) local_colsCast;
1329
1330 MPI_Comm c_row_comm = MPI_Comm_f2c(row_comm);
1331 MPI_Comm c_col_comm = MPI_Comm_f2c(col_comm);
1332
1333
1334 C_INT_TYPE my_prow, my_pcol, np_rows, np_cols;
1335 C_INT_MPI_TYPE my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI;
1336
1337 MPI_Comm_rank(c_row_comm, &my_prowMPI);
1338 MPI_Comm_size(c_row_comm, &np_rowsMPI);
1339 MPI_Comm_rank(c_col_comm, &my_pcolMPI);
1340 MPI_Comm_size(c_col_comm, &np_colsMPI);
1341
1342 my_prow = (C_INT_TYPE) my_prowMPI;
1343 my_pcol = (C_INT_TYPE) my_pcolMPI;
1344 np_rows = (C_INT_TYPE) np_rowsMPI;
1345 np_cols = (C_INT_TYPE) np_colsMPI;
1346
1347 // BEWARE
1348 // in Cannon's algorithm, the column and row communicators are exchanged:
1349 // what we usually call row_comm in ELPA is thus passed to the col_comm parameter of the function and vice versa
1350 // (the order is swapped in the following call).
1351 // It is a bit unfortunate; maybe it should be changed in the Cannon algorithm to comply with the ELPA standard notation.
1352
1353 // ELPA convention : col_comm means that the data is sent in the direction where my_pcol changes (my_prow=const). Size of col_comm is np_cols
1354 // Cannon convention: col_comm means that the data is sent in the direction where my_pcol=const (my_prow changes). Size of col_comm is np_rows
1355 // Example of 2D process grid:
1356 // A1 A2 A3 A4
1357 // B1 B2 B3 B4
1358 // C1 C2 C3 C4
1359 // In ELPA, {B1, B2, B3, B4} belong to the same col_comm, in cannon they belong to the same row_comm
1360 cannons_reduction_impl(A, U, np_rows, np_cols, my_prow, my_pcol,
1361 a_desc, Res, ToStore, c_col_comm, c_row_comm, wantDebug, useGPU, gpublasHandle);
1362}
1363