69 math_type *Res, MPI_Comm row_comm, MPI_Comm col_comm,
70 int wantDebug,
int useGPU, intptr_t *gpublasHandle)
82 C_INT_TYPE na, nb, nblk, width, na_rows, na_cols, nb_cols, Size_receive_U_now,
83 cols_in_buffer_U_my_initial, cols_in_buffer_U, rows_in_buffer_U, rows_in_buffer_U_now, cols_in_buffer_U_now, rows_in_buffer_U_my_initial;
85 C_INT_MPI_TYPE Size_receive_U_nowMPI, Size_receive_UMPI, Size_receive_BMPI;
86 C_INT_TYPE i, j, Size_send_U, Size_receive_U, Size_send_B, Size_receive_B, intNumber,
87 Buf_rows, Buf_cols_U, Buf_cols_B, curr_rows, num_of_iters, cols_in_buffer, rows_in_block,
88 curr_col_loc, cols_in_block, num_of_blocks_in_U_buffer, col_of_origin_U, b_rows_mult, b_cols_mult;
90 math_type *Buf_to_send_U, *Buf_to_receive_U, *Buf_to_send_B, *Buf_to_receive_B, *Buf_U, *PosBuff;
92 C_INT_TYPE where_to_send_U, from_where_to_receive_U, where_to_send_B, from_where_to_receive_B,
93 last_proc_col_B, last_proc_row_B, n, Size_U_stored, proc_col_min;
95 math_type *U_local_start, *Buf_pos, *B_local_start, *double_ptr, *CopyTo, *CopyFrom;
103 math_type dOne = 1.0;
104 math_type dZero = 0.0;
110 na_rows = numroc_(&na, &nblk, &my_prow, &zero, &np_rows);
111 na_cols = numroc_(&na, &nblk, &my_pcol, &zero, &np_cols);
112 nb_cols = numroc_(&nb, &nblk, &my_pcol, &zero, &np_cols);
114 MPI_Request request_U_Recv;
115 MPI_Request request_U_Send;
116 MPI_Request request_B_Recv;
117 MPI_Request request_B_Send;
120 last_proc_col_B = ((nb-1)/nblk) % np_cols;
121 last_proc_row_B = ((na-1)/nblk) % np_rows;
126 if(my_pcol <= last_proc_col_B)
127 Buf_cols_B = nb_cols;
129 Buf_cols_B = nb_cols + nblk;
131 if(my_pcol < last_proc_col_B)
132 Buf_cols_B = nb_cols;
133 else if(my_pcol > last_proc_col_B)
134 Buf_cols_B = nb_cols + nblk;
136 Buf_cols_B = nb_cols + nblk - nb_cols%nblk;
139 if(my_prow <= last_proc_row_B)
142 Buf_rows = na_rows + nblk;
144 if(my_prow < last_proc_row_B)
146 else if(my_prow > last_proc_row_B)
147 Buf_rows = na_rows + nblk;
149 Buf_rows = na_rows + nblk - na_rows%nblk;
151 ratio = np_cols/np_rows;
153 intNumber = ceil((math_type)na/(math_type)(np_cols*nblk));
154 Size_U_stored = ratio*nblk*nblk*intNumber*(intNumber+1)/2 + 2;
156 Buf_to_send_U = malloc(ratio*Size_U_stored*
sizeof(math_type));
157 Buf_to_receive_U = malloc(ratio*Size_U_stored*
sizeof(math_type));
158 Buf_to_send_B = malloc(Buf_cols_B*Buf_rows*
sizeof(math_type));
159 Buf_to_receive_B = malloc(Buf_cols_B*Buf_rows*
sizeof(math_type));
161 Buf_U = malloc(Size_U_stored*
sizeof(math_type));
163 for(i = 0; i < na_rows*nb_cols; i++)
166 math_type *Buf_to_send_receive_U_dev;
167 math_type *Buf_to_send_receive_B_dev;
170 math_type *U_local_start_dev, *B_local_start_dev;
173 gpuErrCheck(
gpuMalloc((intptr_t *)&Buf_to_send_receive_U_dev, ratio*Size_U_stored*
sizeof(math_type)) );
174 gpuErrCheck(
gpuMalloc((intptr_t *)&Buf_to_send_receive_B_dev, Buf_cols_B*Buf_rows*
sizeof(math_type)) );
184 if((ratio != 1)||(my_prow != 0))
185 Buf_pos = Buf_to_send_U;
187 Buf_pos = Buf_to_receive_U;
191 if(my_pcol >= my_prow)
196 num_of_iters = ceil((math_type)na_cols/(math_type)nblk);
197 num_of_iters = num_of_iters - curr_col_loc;
198 curr_col_loc = curr_col_loc*nblk;
200 if(my_pcol >= my_prow)
201 rows_in_block = ceil(((math_type)(my_pcol + 1) - (math_type)my_prow)/(math_type)np_rows)*nblk;
203 rows_in_block = ratio*nblk;
204 cols_in_buffer_U_my_initial = 0;
206 for(i = 0; i < num_of_iters; i++)
208 if(rows_in_block > na_rows)
209 rows_in_block = na_rows;
211 if ((na_cols - curr_col_loc) < nblk)
212 cols_in_block = na_cols - curr_col_loc;
214 cols_in_block = nblk;
216 if((rows_in_block > 0)&&(cols_in_block > 0))
218 double_ptr = &U[curr_col_loc*na_rows];
219 C_LACPY(
"A", &rows_in_block, &cols_in_block, double_ptr, &na_rows, Buf_pos, &rows_in_block);
220 Buf_pos = Buf_pos + rows_in_block*cols_in_block;
221 Size_send_U = Size_send_U + rows_in_block*cols_in_block;
222 cols_in_buffer_U_my_initial = cols_in_buffer_U_my_initial + cols_in_block;
224 curr_col_loc = curr_col_loc + nblk;
225 rows_in_block = rows_in_block + ratio*nblk;
227 rows_in_buffer_U_my_initial = rows_in_block - ratio*nblk;
228 *Buf_pos = (math_type)cols_in_buffer_U_my_initial;
229 Buf_pos = Buf_pos + 1;
230 *Buf_pos = (math_type)rows_in_buffer_U_my_initial;
231 Size_send_U = Size_send_U + 2;
235 proc_col_min = np_cols;
236 for(i = 0; i < ratio; i++)
238 from_where_to_receive_U = (my_pcol + my_prow + i*np_rows)%np_cols;
239 if(from_where_to_receive_U < proc_col_min)
240 proc_col_min = from_where_to_receive_U;
245 cols_in_buffer_U = 0;
246 rows_in_buffer_U = 0;
247 for(i = 0; i < ratio; i++)
249 where_to_send_U = (my_pcol - my_prow - i*np_rows + np_cols)%np_cols;
250 from_where_to_receive_U = (my_pcol + my_prow + i*np_rows)%np_cols;
255 if(where_to_send_U != my_pcol)
257 MPI_Sendrecv(Buf_to_send_U, (
C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) where_to_send_U, 0, Buf_U, (
C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) from_where_to_receive_U, 0, row_comm, &status);
258 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_U_nowMPI);
259 Size_receive_U_now = (
C_INT_TYPE) Size_receive_U_nowMPI;
260 Size_receive_U = Size_receive_U + Size_receive_U_now - 2;
262 cols_in_buffer_U_now = Buf_U[Size_receive_U_now - 2];
263 cols_in_buffer_U = cols_in_buffer_U + cols_in_buffer_U_now;
264 rows_in_buffer_U_now = Buf_U[Size_receive_U_now - 1];
266 if(rows_in_buffer_U < rows_in_buffer_U_now)
267 rows_in_buffer_U = rows_in_buffer_U_now;
269 intNumber = from_where_to_receive_U/np_rows;
270 if(proc_col_min >= my_prow)
271 CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber + 1)/2];
273 if(from_where_to_receive_U < my_prow)
274 CopyTo = &Buf_to_receive_U[nblk*nblk*ratio*(ratio - 1)/2];
276 CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber - 1)/2];
281 cols_in_buffer_U_now = cols_in_buffer_U_my_initial;
282 cols_in_buffer_U = cols_in_buffer_U + cols_in_buffer_U_now;
284 rows_in_buffer_U_now = rows_in_buffer_U_my_initial;
285 if(rows_in_buffer_U < rows_in_buffer_U_now)
286 rows_in_buffer_U = rows_in_buffer_U_now;
288 intNumber = my_pcol/np_rows;
289 if(proc_col_min >= my_prow)
290 CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber + 1)/2];
292 if(my_pcol < my_prow)
293 CopyTo = &Buf_to_receive_U[nblk*nblk*ratio*(ratio - 1)/2];
295 CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber - 1)/2];
296 CopyFrom = Buf_to_send_U;
297 Size_receive_U = Size_receive_U + Size_send_U - 2;
301 intNumber = ceil((math_type)cols_in_buffer_U_now/(math_type)nblk);
302 if(from_where_to_receive_U >= my_prow)
303 rows_in_block = ceil(((math_type)(from_where_to_receive_U + 1) - (math_type)my_prow)/(math_type)np_rows)*nblk;
305 rows_in_block = ratio*nblk;
306 for(j = 0; j < intNumber; j++)
308 if((j+1)*nblk < cols_in_buffer_U_now)
309 cols_in_block = nblk;
311 cols_in_block = cols_in_buffer_U_now - j*nblk;
313 C_LACPY(
"A", &rows_in_block, &cols_in_block, CopyFrom, &rows_in_block, CopyTo, &rows_in_block);
315 CopyFrom = CopyFrom + rows_in_block*cols_in_block;
316 CopyTo = CopyTo + ratio*rows_in_block*nblk + nblk*nblk*ratio*(ratio-1)/2;
317 rows_in_block = rows_in_block + ratio*nblk;
318 if(rows_in_block > rows_in_buffer_U_now)
319 rows_in_block = rows_in_buffer_U_now;
326 MPI_Sendrecv(Buf_to_send_U, (
C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) where_to_send_U, 0, Buf_to_receive_U, (
C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) from_where_to_receive_U, 0, row_comm, &status);
327 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI);
328 Size_receive_U = (
C_INT_TYPE) Size_receive_UMPI;
330 cols_in_buffer_U = (
C_INT_TYPE)Buf_to_receive_U[Size_receive_U-2];
331 rows_in_buffer_U = (
C_INT_TYPE)Buf_to_receive_U[Size_receive_U-1];
335 Size_receive_U = Size_send_U;
336 rows_in_buffer_U = rows_in_buffer_U_my_initial;
337 cols_in_buffer_U = cols_in_buffer_U_my_initial;
343 Buf_to_receive_U[Size_receive_U] = cols_in_buffer_U;
344 Buf_to_receive_U[Size_receive_U + 1] = rows_in_buffer_U;
345 Size_receive_U = Size_receive_U + 2;
356 where_to_send_B = (my_prow - my_pcol + np_cols)%np_rows;
357 from_where_to_receive_B = (my_pcol + my_prow)%np_rows;
360 if(where_to_send_B != my_prow)
363 C_LACPY(
"A", &na_rows, &nb_cols, B, &na_rows, Buf_to_send_B, &na_rows);
364 MPI_Sendrecv(Buf_to_send_B, (
C_INT_MPI_TYPE) (nb_cols*na_rows), MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) where_to_send_B, 0, Buf_to_receive_B, (
C_INT_MPI_TYPE) (nb_cols*Buf_rows), MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) from_where_to_receive_B, 0, col_comm, &status);
365 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_BMPI);
366 if (nb_cols!=0) Size_receive_B = (
C_INT_TYPE) Size_receive_BMPI / nb_cols;
367 else Size_receive_B=0;
371 C_LACPY(
"A", &na_rows, &nb_cols, B, &na_rows, Buf_to_receive_B, &na_rows);
372 Size_receive_B = na_rows;
377 C_LACPY(
"A", &na_rows, &nb_cols, B, &na_rows, Buf_to_receive_B, &na_rows);
378 Size_receive_B = na_rows;
385 where_to_send_U = (my_pcol - 1 + np_cols)%np_cols;
386 from_where_to_receive_U = (my_pcol + 1)%np_cols;
387 where_to_send_B = (my_prow - 1 + np_rows)%np_rows;
388 from_where_to_receive_B = (my_prow + 1)%np_rows;
391 for(i = 1; i < np_rows; i++)
394 double_ptr = Buf_to_send_U;
395 Buf_to_send_U = Buf_to_receive_U;
396 Buf_to_receive_U = double_ptr;
398 double_ptr = Buf_to_send_B;
399 Buf_to_send_B = Buf_to_receive_B;
400 Buf_to_receive_B = double_ptr;
402 Size_send_U = Size_receive_U;
403 Size_send_B = Size_receive_B;
407 MPI_Isend(Buf_to_send_U, (
C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) where_to_send_U, 0, row_comm, &request_U_Send);
408 MPI_Irecv(Buf_to_receive_U, (
C_INT_MPI_TYPE) (ratio*Size_U_stored), MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) from_where_to_receive_U, 0, row_comm, &request_U_Recv);
412 MPI_Isend(Buf_to_send_B, (
C_INT_MPI_TYPE) (Size_send_B*nb_cols), MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) where_to_send_B, 0, col_comm, &request_B_Send);
413 MPI_Irecv(Buf_to_receive_B, (
C_INT_MPI_TYPE) (Buf_rows*nb_cols), MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) from_where_to_receive_B, 0, col_comm, &request_B_Recv);
417 cols_in_buffer_U = (
C_INT_TYPE)Buf_to_send_U[Size_receive_U-2];
418 rows_in_buffer_U = (
C_INT_TYPE)Buf_to_send_U[Size_receive_U-1];
420 proc_col_min = np_cols;
421 for(j = 0; j < ratio; j++)
423 col_of_origin_U = (my_pcol + my_prow + i - 1 + j*np_rows)%np_cols;
424 if(col_of_origin_U < proc_col_min)
425 proc_col_min = col_of_origin_U;
427 col_of_origin_U = proc_col_min;
429 num_of_blocks_in_U_buffer = ceil((math_type)cols_in_buffer_U/(math_type)nblk);
435 U_local_start_dev = Buf_to_send_receive_U_dev;
436 if (col_of_origin_U >= my_prow) B_local_start_dev = Buf_to_send_receive_B_dev;
437 else B_local_start_dev = Buf_to_send_receive_B_dev + nblk;
440 U_local_start = Buf_to_send_U;
441 if (col_of_origin_U >= my_prow) B_local_start = Buf_to_send_B;
442 else B_local_start = Buf_to_send_B + nblk;
446 for(j = 0; j < num_of_blocks_in_U_buffer; j++)
448 curr_rows = (j+1)*nblk;
449 if (curr_rows > rows_in_buffer_U)
450 curr_rows = rows_in_buffer_U;
452 if((j+1)*nblk <= cols_in_buffer_U)
455 b_rows_mult = cols_in_buffer_U - j*nblk;
457 if (Size_receive_B != 0){
463 curr_rows, nb_cols, b_rows_mult, dOne,
464 U_local_start_dev, curr_rows,
465 B_local_start_dev, Size_receive_B, dOne,
470 C_GEMM(
"N",
"N", &curr_rows, &nb_cols, &b_rows_mult, &dOne,
471 U_local_start, &curr_rows,
472 B_local_start, &Size_receive_B, &dOne,
479 U_local_start_dev += nblk*curr_rows;
480 B_local_start_dev += nblk;
483 U_local_start += nblk*curr_rows;
484 B_local_start += nblk;
489 MPI_Wait(&request_U_Send, &status);
490 MPI_Wait(&request_U_Recv, &status);
491 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI);
492 Size_receive_U = (
C_INT_TYPE) Size_receive_UMPI;
494 MPI_Wait(&request_B_Send, &status);
495 MPI_Wait(&request_B_Recv, &status);
496 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_BMPI);
497 if (nb_cols!=0) Size_receive_B = (
C_INT_TYPE) Size_receive_BMPI / nb_cols;
498 else Size_receive_B=0;
504 cols_in_buffer_U = (
C_INT_TYPE)Buf_to_receive_U[Size_receive_U-2];
505 rows_in_buffer_U = (
C_INT_TYPE)Buf_to_receive_U[Size_receive_U-1];
507 proc_col_min = np_cols;
508 for(j = 0; j < ratio; j++)
510 col_of_origin_U = (my_pcol + my_prow + np_rows - 1 + j*np_rows)%np_cols;
511 if(col_of_origin_U < proc_col_min)
512 proc_col_min = col_of_origin_U;
514 col_of_origin_U = proc_col_min;
516 num_of_blocks_in_U_buffer = ceil((math_type)cols_in_buffer_U/(math_type)nblk);
522 U_local_start_dev = Buf_to_send_receive_U_dev;
523 if (col_of_origin_U >= my_prow) B_local_start_dev = Buf_to_send_receive_B_dev;
524 else B_local_start_dev = Buf_to_send_receive_B_dev + nblk;
527 U_local_start = Buf_to_receive_U;
528 if (col_of_origin_U >= my_prow) B_local_start = Buf_to_receive_B;
529 else B_local_start = Buf_to_receive_B + nblk;
533 for(j = 0; j < num_of_blocks_in_U_buffer; j++)
535 curr_rows = (j+1)*nblk;
536 if (curr_rows > rows_in_buffer_U)
537 curr_rows = rows_in_buffer_U;
539 if((j+1)*nblk <= cols_in_buffer_U)
542 b_rows_mult = cols_in_buffer_U - j*nblk;
544 if (Size_receive_B != 0) {
550 curr_rows, nb_cols, b_rows_mult, dOne,
551 U_local_start_dev, curr_rows,
552 B_local_start_dev, Size_receive_B, dOne,
557 C_GEMM(
"N",
"N", &curr_rows, &nb_cols, &b_rows_mult, &dOne,
558 U_local_start, &curr_rows,
559 B_local_start, &Size_receive_B, &dOne,
566 U_local_start_dev += nblk*curr_rows;
567 B_local_start_dev += nblk;
570 U_local_start += nblk*curr_rows;
571 B_local_start += nblk;
585 free(Buf_to_receive_U);
587 free(Buf_to_receive_B);