/*
 * Local declarations for the routine body below (the function header lies
 * outside this excerpt).
 *  - C_INT_TYPE scalars: sizes, loop counters, block offsets and partner ranks.
 *  - C_INT_MPI_TYPE scalars: element counts filled in by MPI_Get_count and then
 *    cast to C_INT_TYPE.
 *  - math_type pointers: communication buffers, work matrices and cursors
 *    into them.
 */
101 C_INT_TYPE na, nblk, i, j, Size_send_A, Size_receive_A, Size_send_U, Size_receive_U, Buf_rows, Buf_cols, pcol_where_to_send_A, pcol_from_where_to_receive_A, where_to_send_U, from_where_to_receive_U, last_proc_row, last_proc_col, cols_in_buffer_A, rows_in_buffer_A, intNumber;
102 C_INT_TYPE ratio, num_of_iters, cols_in_buffer, rows_in_block, rows_in_buffer, curr_col_loc, cols_in_block, curr_col_glob, curr_row_loc, Size_receive_A_now, Nb, owner, cols_in_buffer_A_now;
103 C_INT_MPI_TYPE Size_receive_A_nowMPI, Size_receive_AMPI, Size_receive_UMPI;
105 math_type *Buf_to_send_A, *Buf_to_receive_A, *Buf_to_send_U, *Buf_to_receive_U, *data_ptr, *Buf_A, *Buf_pos, *U_local_start, *Res_ptr, *M, *M_T, *A_local_start, *U_local_start_curr, *U_stored, *CopyTo, *CopyFrom, *U_to_calc;
107 C_INT_TYPE row_of_origin_U, rows_in_block_U, num_of_blocks_in_U_buffer, k, startPos, cols_in_buffer_U, rows_in_buffer_U, col_of_origin_A, curr_row_loc_res, curr_row_loc_A, curr_col_glob_res;
108 C_INT_TYPE curr_col_loc_res, curr_col_loc_buf, proc_row_curr, curr_col_loc_U, A_local_index, LDA_A, LDA_A_new, index_row_A_for_LDA, ii, rows_in_block_U_curr, width, row_origin_U, rows_in_block_A, cols_in_buffer_A_my_initial, rows_in_buffer_A_my_initial, proc_col_min;
110 C_INT_TYPE Size_U_skewed, Size_U_stored, Curr_pos_in_U_stored, rows_in_buffer_A_now;
/* Constants passed by address as the alpha/beta arguments of C_GEMM and C_PTRAN. */
111 math_type dOne = 1.0;
112 math_type dZero = 0.0;
/* Request handles that the code below hands to MPI_Wait. */
118 MPI_Request request_A_Recv;
119 MPI_Request request_A_Send;
120 MPI_Request request_U_Recv;
121 MPI_Request request_U_Send;
/*
 * Local row and column extents of this process, obtained from numroc_ with the
 * global size na, the block size nblk, the process coordinate and the grid
 * dimension. na and nblk are declared above; their assignment is not visible
 * in this excerpt.
 */
125 na_rows = numroc_(&na, &nblk, &my_prow, &zero, &np_rows);
126 na_cols = numroc_(&na, &nblk, &my_pcol, &zero, &np_cols);
/*
 * Process-grid checks; their bodies are not visible in this excerpt.
 * NOTE(review): the second condition parses as (np_cols < np_rows) != 0, which
 * is the same as np_cols < np_rows; the trailing != 0 looks copied from the
 * first check -- confirm.
 */
135 if (np_cols%np_rows != 0)
142 if (np_cols < np_rows != 0)
/* Integer ratio of the grid dimensions. */
149 ratio = np_cols/np_rows;
/* Index of the block containing global index na-1, taken modulo each grid dimension. */
150 last_proc_row = ((na-1)/nblk) % np_rows;
151 last_proc_col = ((na-1)/nblk) % np_cols;
/*
 * Column and row capacities used to size the exchange buffers. Several
 * alternative assignments are listed; the else lines (and anything selecting
 * between the two variants of each test) are not visible here, so which
 * assignment takes effect cannot be read off this excerpt.
 */
155 if (my_pcol <= last_proc_col) {
159 Buf_cols = na_cols + nblk;
163 if (my_pcol < last_proc_col) {
166 else if (my_pcol > last_proc_col) {
167 Buf_cols = na_cols + nblk;
170 Buf_cols = na_cols + nblk - na_cols%nblk;
175 if (my_prow <= last_proc_row) {
176 Buf_rows = na_rows + 1;
179 Buf_rows = na_rows + nblk;
183 if (my_prow < last_proc_row) {
186 else if (my_prow > last_proc_row) {
187 Buf_rows = na_rows + nblk;
190 Buf_rows = na_rows + nblk - na_rows%nblk;
/*
 * intNumber = ceil(na / (np_cols*nblk)). Size_U_stored is the element count
 * used below for the U buffers: ratio * nblk^2 * intNumber*(intNumber+1)/2,
 * plus 2 extra elements.
 */
194 intNumber = ceil((math_type)na/(math_type)(np_cols*nblk));
195 Size_U_stored = ratio*nblk*nblk*intNumber*(intNumber+1)/2 + 2;
/*
 * Heap buffers used by the rest of the routine.
 *  - U_stored holds Size_U_stored*(ToStore+1) elements.
 *  - The two A exchange buffers hold ratio*Buf_cols*Buf_rows elements each.
 *  - The two U exchange buffers hold Size_U_stored elements each.
 *  - Buf_A holds Buf_cols*Buf_rows elements.
 *  - M and M_T hold na_rows*na_cols elements each.
 * NOTE(review): no NULL check on any malloc result is visible in this excerpt.
 * NOTE(review): the element counts are multiplied in C_INT_TYPE before the
 * multiplication by sizeof; confirm this cannot overflow for large matrices.
 */
197 U_stored = malloc((Size_U_stored*(ToStore+1))*
sizeof(math_type));
199 Buf_to_send_A = malloc(ratio*Buf_cols*Buf_rows*
sizeof(math_type));
200 Buf_to_receive_A = malloc(ratio*Buf_cols*Buf_rows*
sizeof(math_type));
201 Buf_to_send_U = malloc(Size_U_stored*
sizeof(math_type));
202 Buf_to_receive_U = malloc(Size_U_stored*
sizeof(math_type));
204 Buf_A = malloc(Buf_cols*Buf_rows*
sizeof(math_type));
205 M = malloc(na_rows*na_cols*
sizeof(math_type));
206 M_T = malloc(na_rows*na_cols*
sizeof(math_type));
/* Loop over all na_rows*na_cols local elements; its body is not visible in this excerpt. */
207 for(i = 0; i < na_rows*na_cols; i++)
/* Copy the whole local na_rows x na_cols part of A into the A send buffer (leading dimension na_rows on both sides). */
215 nvtxRangePushA(
"LACPY");
217 C_LACPY(
"A", &na_rows, &na_cols, A, &na_rows, Buf_to_send_A, &na_rows);
/*
 * Initial redistribution of A: one step per i in [0, ratio).
 * The MPI call that fills status before MPI_Get_count, the else lines and the
 * closing braces are not visible in this excerpt.
 */
225 for(i = 0; i < ratio; i++)
/* Partner columns for step i: both are offset by my_prow + i*np_rows, in opposite directions, modulo np_cols. */
227 pcol_where_to_send_A = (my_pcol - my_prow - i*np_rows + np_cols)%np_cols;
228 pcol_from_where_to_receive_A = (my_pcol + my_prow + i*np_rows)%np_cols;
233 if(pcol_where_to_send_A != my_pcol)
/* The received element count is divided by na_rows, i.e. converted to a number of columns of height na_rows. */
238 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_A_nowMPI);
239 Size_receive_A_now = (
C_INT_TYPE) Size_receive_A_nowMPI;
240 Size_receive_A_now = Size_receive_A_now/na_rows;
/* Alternative value na_cols; presumably the branch where the partner is this process itself -- TODO confirm. */
243 Size_receive_A_now = na_cols;
/* Running total of columns gathered so far. */
246 Size_receive_A = Size_receive_A + Size_receive_A_now;
/* Destination starts intNumber blocks of nblk columns into the receive buffer. */
249 intNumber = pcol_from_where_to_receive_A/np_rows;
251 CopyTo = &Buf_to_receive_A[intNumber*na_rows*nblk];
252 if (pcol_where_to_send_A != my_pcol) {
/*
 * Copy the data in blocks of at most nblk columns. The source advances by one
 * block per step while the destination advances by ratio blocks. The
 * assignments of CopyFrom and of the default width are not visible here.
 */
259 intNumber = ceil((math_type)Size_receive_A_now/(math_type)nblk);
260 for(j = 0; j < intNumber; j++)
/* Last block may be narrower than nblk. */
263 if(nblk*(j+1) > Size_receive_A_now)
264 width = Size_receive_A_now - nblk*j;
265 C_LACPY(
"A", &na_rows, &width, CopyFrom, &na_rows, CopyTo, &na_rows);
266 CopyTo = CopyTo + na_rows*nblk*ratio;
267 CopyFrom = CopyFrom + na_rows*nblk;
/*
 * Further statements that stage A and derive Size_receive_A as a column count.
 * Their enclosing conditions are not visible; they look like the path used
 * when a single exchange is enough -- TODO confirm.
 */
274 C_LACPY(
"A", &na_rows, &na_cols, A, &na_rows, Buf_to_send_A, &na_rows);
278 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI);
279 Size_receive_A = (
C_INT_TYPE) Size_receive_AMPI;
280 Size_receive_A = Size_receive_A/na_rows;
284 C_LACPY(
"A", &na_rows, &na_cols, A, &na_rows, Buf_to_receive_A, &na_rows);
285 Size_receive_A = na_cols;
/*
 * Pack part of the local U into a contiguous buffer, exchange it along the
 * process column and keep a copy in U_stored.
 * Else lines, closing braces and the initialisation of curr_col_loc and
 * Size_send_U are not visible in this excerpt.
 */
/* Number of local column blocks of width nblk. */
293 num_of_iters = ceil((math_type)na_cols/(math_type)nblk);
/* Partner process rows for the exchange of U. */
295 where_to_send_U = (my_prow - my_pcol + np_cols)%np_rows;
296 from_where_to_receive_U = (my_pcol + my_prow)%np_rows;
/* Pack directly into the receive buffer when the destination is this process, otherwise into the send buffer. */
298 if (where_to_send_U == my_prow) {
299 Buf_pos = Buf_to_receive_U;
302 Buf_pos = Buf_to_send_U;
/* Skip curr_col_loc leading column blocks, then convert curr_col_loc from a block count to a column offset. */
305 if (my_pcol >= my_prow) {
311 num_of_iters = num_of_iters - curr_col_loc;
312 curr_col_loc = curr_col_loc*nblk;
/* Row count of the first packed block. */
314 if (my_pcol >= my_prow ) {
315 rows_in_block = ceil(((math_type)(my_pcol + 1) - (math_type)my_prow)/(math_type)np_rows)*nblk;
318 rows_in_block = ratio*nblk;
/*
 * For each column block, copy the top rows_in_block rows (capped at na_rows)
 * into the buffer with leading dimension rows_in_block, i.e. densely packed.
 * The row count grows by ratio*nblk from one column block to the next.
 */
321 for(i = 0; i < num_of_iters; i++)
323 if (rows_in_block > na_rows) {
324 rows_in_block = na_rows;
326 if ((na_cols - curr_col_loc) < nblk) {
327 cols_in_block = na_cols - curr_col_loc;
330 cols_in_block = nblk;
332 if ((rows_in_block > 0)&&(cols_in_block > 0))
334 data_ptr = &U[curr_col_loc*na_rows];
335 C_LACPY(
"A", &rows_in_block, &cols_in_block, data_ptr, &na_rows, Buf_pos, &rows_in_block);
336 Buf_pos = Buf_pos + rows_in_block*cols_in_block;
337 Size_send_U = Size_send_U + rows_in_block*cols_in_block;
339 curr_col_loc = curr_col_loc + nblk;
340 rows_in_block = rows_in_block + ratio*nblk;
/* Append one trailing element holding the row count of the last packed block; readers below fetch it from position size-1. */
342 rows_in_buffer = rows_in_block - ratio*nblk;
343 *Buf_pos = (math_type)rows_in_buffer;
344 Size_send_U = Size_send_U + 1;
/*
 * Exchange along col_comm unless the destination is this process.
 * NOTE(review): the receive count passed is Buf_rows*na_cols, while
 * Buf_to_receive_U is allocated with Size_U_stored elements; confirm the
 * former never exceeds the latter.
 */
347 if (where_to_send_U != my_prow)
350 MPI_Sendrecv(Buf_to_send_U, (
C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) where_to_send_U, (
C_INT_MPI_TYPE) zero, Buf_to_receive_U, (
C_INT_MPI_TYPE) (Buf_rows*na_cols), MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) from_where_to_receive_U, (
C_INT_MPI_TYPE) zero, col_comm, &status);
351 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI);
352 Size_receive_U = (
C_INT_TYPE) Size_receive_UMPI;
/* Alternative: nothing was exchanged, so the received size equals the packed size. */
355 Size_receive_U = Size_send_U;
/* Keep a copy of this first U buffer at the start of U_stored and remember its length. */
357 for(i = 0; i < Size_receive_U; i++)
358 U_stored[i] = Buf_to_receive_U[i];
359 Size_U_skewed = Size_receive_U;
360 Curr_pos_in_U_stored = Size_U_skewed;
/*
 * First shift-and-multiply phase, accumulating into M.
 * Neighbours for the shift by one position: A goes to the previous process
 * column, U to the previous process row, both with wrap-around.
 */
364 pcol_where_to_send_A = (my_pcol - 1 + np_cols)%np_cols;
365 pcol_from_where_to_receive_A = (my_pcol + 1)%np_cols;
366 where_to_send_U = (my_prow - 1 + np_rows)%np_rows;
367 from_where_to_receive_U = (my_prow + 1)%np_rows;
/*
 * np_rows - 1 steps. The non-blocking sends/receives that initialise the
 * request handles waited on below, and all closing braces, are not visible in
 * this excerpt.
 */
369 for(j = 1; j < np_rows; j++)
/* Swap roles: what was received in the previous step is now sent on and multiplied. */
372 data_ptr = Buf_to_send_A;
373 Buf_to_send_A = Buf_to_receive_A;
374 Buf_to_receive_A = data_ptr;
376 data_ptr = Buf_to_send_U;
377 Buf_to_send_U = Buf_to_receive_U;
378 Buf_to_receive_U = data_ptr;
381 Size_send_A = Size_receive_A;
386 Size_send_U = Size_receive_U;
/* The last element of the U buffer is the row-count trailer appended at packing time. */
391 rows_in_buffer = (int)Buf_to_send_U[Size_receive_U-1];
392 row_origin_U = (my_pcol + my_prow + np_cols + j - 1)%np_rows;
/*
 * Four mutually exclusive cases on (my_pcol >= my_prow, my_pcol >= row_origin_U)
 * choose how many columns the buffer holds and where reading (buffer) and
 * writing (result) start.
 */
394 if((my_pcol >= my_prow)&&(my_pcol >= row_origin_U))
396 cols_in_buffer = na_cols;
397 curr_col_loc_res = 0;
398 curr_col_loc_buf = 0;
400 if((my_pcol < my_prow)&&(my_pcol < row_origin_U))
402 cols_in_buffer = na_cols - nblk;
403 curr_col_loc_res = nblk;
404 curr_col_loc_buf = 0;
406 if((my_pcol >= my_prow)&&(my_pcol < row_origin_U))
408 cols_in_buffer = na_cols - nblk;
409 curr_col_loc_res = nblk;
410 curr_col_loc_buf = 0;
412 if((my_pcol < my_prow)&&(my_pcol >= row_origin_U))
414 cols_in_buffer = na_cols;
415 curr_col_loc_res = nblk;
416 curr_col_loc_buf = nblk;
419 num_of_blocks_in_U_buffer = ceil(((math_type)cols_in_buffer - (math_type)curr_col_loc_buf)/(math_type)nblk);
/* startPos is 0 when curr_col_loc_buf is 0 and nblk*nblk when it is nblk. */
421 startPos = (curr_col_loc_buf + nblk)*curr_col_loc_buf/2;
422 U_local_start = &Buf_to_send_U[startPos];
423 Res_ptr = &M[curr_col_loc_res*na_rows];
/* One GEMM per column block of the result. */
425 for (i = 0; i < num_of_blocks_in_U_buffer; i++)
/* Global column of the current result block and the process row derived from it. */
427 curr_col_glob = (curr_col_loc_res/nblk)*nblk*np_cols + my_pcol*nblk;
428 proc_row_curr = (curr_col_glob/nblk)%np_rows;
/* Rows of the A buffer used for this block, capped at na_rows. */
429 rows_in_block_A = (curr_col_glob/(nblk*np_rows))*nblk;
430 if (my_prow <= proc_row_curr) {
431 rows_in_block_A = rows_in_block_A + nblk;
433 if (rows_in_block_A > na_rows) {
434 rows_in_block_A = na_rows;
/* Width of the block: nblk, or the remainder for the last block. */
436 if ((curr_col_loc_buf + nblk) <= cols_in_buffer) {
437 cols_in_block = nblk;
440 cols_in_block = cols_in_buffer - curr_col_loc_buf;
/* Rows of the packed U block (the inner dimension of the product), capped at rows_in_buffer. */
443 rows_in_block_U = (curr_col_glob/(nblk*np_rows))*nblk;
444 if (proc_row_curr >= row_origin_U) {
445 rows_in_block_U = rows_in_block_U + nblk;
447 if (rows_in_block_U > rows_in_buffer) {
448 rows_in_block_U = rows_in_buffer;
451 if ((rows_in_block_A > 0)&&(cols_in_block > 0)) {
453 nvtxRangePushA(
"GEMM_1");
/*
 * Res_ptr block <- Buf_to_send_A * U block. The two calls differ only in beta
 * (dZero overwrites, dOne accumulates); the condition choosing between them is
 * not visible in this excerpt.
 */
456 C_GEMM(
"N",
"N", &rows_in_block_A, &cols_in_block, &rows_in_block_U, &dOne, Buf_to_send_A, &na_rows, U_local_start, &rows_in_block_U, &dZero, Res_ptr, &na_rows);
459 C_GEMM(
"N",
"N", &rows_in_block_A, &cols_in_block, &rows_in_block_U, &dOne, Buf_to_send_A, &na_rows, U_local_start, &rows_in_block_U, &dOne, Res_ptr, &na_rows);
/* Advance to the next packed U block and the next result column block. */
465 U_local_start = U_local_start + rows_in_block_U*cols_in_block;
466 curr_col_loc_res = curr_col_loc_res + nblk;
467 Res_ptr = &M[curr_col_loc_res*na_rows];
468 curr_col_loc_buf = curr_col_loc_buf + nblk;
/* Complete the A transfer; the count is taken from the status of the receive and converted to columns. */
471 MPI_Wait(&request_A_Send, &status);
472 MPI_Wait(&request_A_Recv, &status);
474 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI);
475 Size_receive_A = (
C_INT_TYPE) Size_receive_AMPI;
476 Size_receive_A = Size_receive_A / na_rows;
/* Complete the U transfer. */
479 MPI_Wait(&request_U_Send, &status);
480 MPI_Wait(&request_U_Recv, &status);
481 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI);
482 Size_receive_U = (
C_INT_TYPE) Size_receive_UMPI;
/* Append the received U buffer to U_stored and record its length in SizesU (any guarding condition is not visible here). */
486 for(k = 0; k < Size_receive_U; k++)
487 U_stored[Curr_pos_in_U_stored + k] = Buf_to_receive_U[k];
488 Curr_pos_in_U_stored = Curr_pos_in_U_stored + Size_receive_U;
489 SizesU[j-1] = Size_receive_U;
/*
 * Same block-wise multiplication as in the loop before it, applied to the
 * buffers received last (Buf_to_receive_A, Buf_to_receive_U). row_origin_U
 * uses np_rows - 1 where the loop uses j - 1. Else lines and closing braces
 * are not visible in this excerpt.
 */
/* Row-count trailer stored as the last element of the U buffer. */
494 rows_in_buffer = (
C_INT_TYPE)Buf_to_receive_U[Size_receive_U-1];
495 row_origin_U = (my_pcol + my_prow + np_cols + np_rows -1)%np_rows;
/* Four mutually exclusive cases selecting buffer width and the start columns in buffer and result. */
497 if((my_pcol >= my_prow)&&(my_pcol >= row_origin_U))
499 cols_in_buffer = na_cols;
500 curr_col_loc_res = 0;
501 curr_col_loc_buf = 0;
503 if((my_pcol < my_prow)&&(my_pcol < row_origin_U))
505 cols_in_buffer = na_cols - nblk;
506 curr_col_loc_res = nblk;
507 curr_col_loc_buf = 0;
509 if((my_pcol >= my_prow)&&(my_pcol < row_origin_U))
511 cols_in_buffer = na_cols - nblk;
512 curr_col_loc_res = nblk;
513 curr_col_loc_buf = 0;
515 if((my_pcol < my_prow)&&(my_pcol >= row_origin_U))
517 cols_in_buffer = na_cols;
518 curr_col_loc_res = nblk;
519 curr_col_loc_buf = nblk;
522 num_of_blocks_in_U_buffer = ceil(((math_type)cols_in_buffer - (math_type)curr_col_loc_buf)/(math_type)nblk);
/* startPos is 0 when curr_col_loc_buf is 0 and nblk*nblk when it is nblk. */
524 startPos = (curr_col_loc_buf + nblk)*curr_col_loc_buf/2;
525 U_local_start = &Buf_to_receive_U[startPos];
526 Res_ptr = &M[curr_col_loc_res*na_rows];
/* One GEMM per column block of the result. */
528 for (i = 0; i < num_of_blocks_in_U_buffer; i++)
530 curr_col_glob = (curr_col_loc_res/nblk)*nblk*np_cols + my_pcol*nblk;
531 proc_row_curr = (curr_col_glob/nblk)%np_rows;
/* Rows of the A buffer used for this block, capped at na_rows. */
532 rows_in_block_A = (curr_col_glob/(nblk*np_rows))*nblk;
533 if (my_prow <= proc_row_curr) {
534 rows_in_block_A = rows_in_block_A + nblk;
536 if (rows_in_block_A > na_rows) {
537 rows_in_block_A = na_rows;
/* Width of the block: nblk, or the remainder for the last block. */
539 if ((curr_col_loc_buf + nblk) <= cols_in_buffer) {
540 cols_in_block = nblk;
543 cols_in_block = cols_in_buffer - curr_col_loc_buf;
/* Inner dimension of the product, capped at rows_in_buffer. */
545 rows_in_block_U = (curr_col_glob/(nblk*np_rows))*nblk;
546 if (proc_row_curr >= row_origin_U) {
547 rows_in_block_U = rows_in_block_U + nblk;
549 if (rows_in_block_U > rows_in_buffer) {
550 rows_in_block_U = rows_in_buffer;
552 if ((rows_in_block_A > 0)&&(cols_in_block > 0)) {
554 nvtxRangePushA(
"GEMM_2");
/* Two calls differing only in beta (dZero overwrites, dOne accumulates); the selecting condition is not visible here. */
557 C_GEMM(
"N",
"N", &rows_in_block_A, &cols_in_block, &rows_in_block_U, &dOne, Buf_to_receive_A, &na_rows, U_local_start, &rows_in_block_U, &dZero, Res_ptr, &na_rows);
560 C_GEMM(
"N",
"N", &rows_in_block_A, &cols_in_block, &rows_in_block_U, &dOne, Buf_to_receive_A, &na_rows, U_local_start, &rows_in_block_U, &dOne, Res_ptr, &na_rows);
/* Advance to the next packed U block and the next result column block. */
566 U_local_start = U_local_start + rows_in_block_U*cols_in_block;
567 curr_col_loc_res = curr_col_loc_res + nblk;
568 Res_ptr = &M[curr_col_loc_res*na_rows];
569 curr_col_loc_buf = curr_col_loc_buf + nblk;
/*
 * Distributed transpose call with source M and destination M_T over the full
 * na x na matrix, scale factors dOne and dZero (presumably M_T <- M^T --
 * confirm against the C_PTRAN wrapper).
 */
574 nvtxRangePushA(
"PTRAN");
576 C_PTRAN(&na, &na, &dOne, M, &one, &one, a_desc, &dZero, M_T, &one, &one, a_desc);
/*
 * Pack part of M_T into a contiguous buffer for the second exchange of A.
 * Else lines, closing braces and the initialisation of Size_send_A and (for
 * one branch) curr_row_loc are not visible in this excerpt.
 */
/* Choose the buffer to pack into. */
586 if ((ratio != 1)||(my_prow != 0)) {
587 Buf_pos = Buf_to_send_A;
590 Buf_pos = Buf_to_receive_A;
593 num_of_iters = ceil((math_type)na_cols/(math_type)nblk);
595 cols_in_buffer_A_my_initial = 0;
/* First row to pack and the resulting row count of the first block. */
598 if (my_pcol <= my_prow)
601 rows_in_buffer_A_my_initial = na_rows;
605 curr_row_loc = ceil((math_type)(((math_type)my_pcol - (math_type)my_prow)/(math_type)np_rows))*nblk;
606 rows_in_buffer_A_my_initial = na_rows - curr_row_loc;
/*
 * For each column block, copy rows curr_row_loc .. na_rows-1 into the buffer
 * with leading dimension rows_in_block (densely packed). The starting row
 * moves down by ratio*nblk per column block.
 */
609 for(i = 0; i < num_of_iters; i++)
611 curr_col_loc = i*nblk;
612 rows_in_block = na_rows - curr_row_loc;
614 if ((na_cols - curr_col_loc) < nblk) {
615 cols_in_block = na_cols - curr_col_loc;
618 cols_in_block = nblk;
620 if ((rows_in_block > 0)&&(cols_in_block > 0))
622 A_local_start = &M_T[curr_col_loc*na_rows + curr_row_loc];
623 C_LACPY(
"A", &rows_in_block, &cols_in_block, A_local_start, &na_rows, Buf_pos, &rows_in_block);
624 Buf_pos = Buf_pos + rows_in_block*cols_in_block;
625 Size_send_A = Size_send_A + rows_in_block*cols_in_block;
626 cols_in_buffer_A_my_initial = cols_in_buffer_A_my_initial + cols_in_block;
628 curr_row_loc = curr_row_loc + ratio*nblk;
/* Append one trailing element holding the number of packed columns. */
630 *Buf_pos = (math_type)cols_in_buffer_A_my_initial;
631 Size_send_A = Size_send_A + 1;
/*
 * Second redistribution of A (the packed M_T data) along the process row.
 * Else lines, closing braces and the assignment of CopyFrom for received data
 * are not visible in this excerpt.
 */
/* Smallest source column over all ratio steps. */
635 proc_col_min = np_cols;
636 for(i = 0; i < ratio; i++)
638 pcol_from_where_to_receive_A = (my_pcol + my_prow + i*np_rows)%np_cols;
639 if(pcol_from_where_to_receive_A < proc_col_min)
640 proc_col_min = pcol_from_where_to_receive_A;
/* Totals over all pieces: columns are summed, rows take the maximum. */
644 cols_in_buffer_A = 0;
645 rows_in_buffer_A = 0;
646 for(i = 0; i < ratio; i++)
648 pcol_where_to_send_A = (my_pcol - my_prow - i*np_rows + np_cols)%np_cols;
649 pcol_from_where_to_receive_A = (my_pcol + my_prow + i*np_rows)%np_cols;
/*
 * Exchange with another process column over row_comm, receiving into Buf_A.
 * NOTE(review): the receive capacity passed is Size_U_stored, while Buf_A is
 * allocated with Buf_cols*Buf_rows elements; confirm the buffer is large enough.
 */
654 if(pcol_where_to_send_A != my_pcol)
656 MPI_Sendrecv(Buf_to_send_A, (
C_INT_MPI_TYPE) Size_send_A, MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) pcol_where_to_send_A, (
C_INT_MPI_TYPE) zero, Buf_A, (
C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) pcol_from_where_to_receive_A, (
C_INT_MPI_TYPE) zero, row_comm, &status);
657 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_A_nowMPI);
658 Size_receive_A_now = (
C_INT_TYPE) Size_receive_A_nowMPI;
/* The trailing column-count element is not counted as payload. */
660 Size_receive_A = Size_receive_A + Size_receive_A_now - 1;
/* Column count read from the trailer of the received piece (implicit conversion from math_type). */
662 cols_in_buffer_A_now = Buf_A[Size_receive_A_now-1];
663 cols_in_buffer_A = cols_in_buffer_A + cols_in_buffer_A_now;
/* Row count of the received piece, derived from the source column. */
666 if(pcol_from_where_to_receive_A <= my_prow)
668 rows_in_buffer_A_now = na_rows;
672 rows_in_buffer_A_now = na_rows - ceil((math_type)(((math_type)pcol_from_where_to_receive_A - (math_type)my_prow)/(math_type)np_rows))*nblk;
674 if(rows_in_buffer_A < rows_in_buffer_A_now)
675 rows_in_buffer_A = rows_in_buffer_A_now;
/* Destination offset of the first block of this piece; two formulas depending on proc_col_min. */
677 intNumber = pcol_from_where_to_receive_A/np_rows;
678 if (proc_col_min <= my_prow) {
679 CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*(intNumber-1)*intNumber/2)];
682 CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*intNumber*(intNumber+1)/2)];
/* Counterpart for the piece that stays on this process: sizes come from the local packing step and the source is Buf_to_send_A. */
688 cols_in_buffer_A_now = cols_in_buffer_A_my_initial;
689 cols_in_buffer_A = cols_in_buffer_A + cols_in_buffer_A_now;
691 rows_in_buffer_A_now = rows_in_buffer_A_my_initial;
692 if(rows_in_buffer_A < rows_in_buffer_A_now)
693 rows_in_buffer_A = rows_in_buffer_A_now;
695 intNumber = my_pcol/np_rows;
696 if (proc_col_min <= my_prow) {
697 CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*(intNumber-1)*intNumber/2)];
700 CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*intNumber*(intNumber+1)/2)];
702 CopyFrom = Buf_to_send_A;
704 Size_receive_A = Size_receive_A + Size_send_A - 1;
/*
 * Copy the piece block by block into Buf_to_receive_A. Each block is
 * rows_in_block x cols_in_block and densely packed on both sides; the row
 * count shrinks by ratio*nblk per block.
 */
708 intNumber = ceil((math_type)cols_in_buffer_A_now/(math_type)nblk);
709 rows_in_block = rows_in_buffer_A_now;
710 for(j = 0; j < intNumber; j++)
712 if ((j+1)*nblk < cols_in_buffer_A_now) {
713 cols_in_block = nblk;
716 cols_in_block = cols_in_buffer_A_now - j*nblk;
718 C_LACPY(
"A", &rows_in_block, &cols_in_block, CopyFrom, &rows_in_block, CopyTo, &rows_in_block);
720 CopyFrom = CopyFrom + rows_in_block*cols_in_block;
721 CopyTo = CopyTo + nblk*(ratio*rows_in_block - nblk*(ratio-1)*ratio/2);
722 rows_in_block = rows_in_block - ratio*nblk;
/*
 * Statements for a direct exchange straight into Buf_to_receive_A; the
 * enclosing conditions are not visible (this looks like the single-exchange
 * path -- TODO confirm).
 * NOTE(review): the receive capacity passed is Size_U_stored, while
 * Buf_to_receive_A is allocated with ratio*Buf_cols*Buf_rows elements; confirm.
 */
729 MPI_Sendrecv(Buf_to_send_A, (
C_INT_MPI_TYPE) Size_send_A, MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) pcol_where_to_send_A, (
C_INT_MPI_TYPE) zero, Buf_to_receive_A, (
C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) pcol_from_where_to_receive_A, (
C_INT_MPI_TYPE) zero, row_comm, &status);
730 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI);
731 Size_receive_A = (
C_INT_TYPE) Size_receive_AMPI;
733 cols_in_buffer_A = (
C_INT_TYPE)Buf_to_receive_A[Size_receive_A-1];
734 if(pcol_from_where_to_receive_A <= my_prow)
736 rows_in_buffer_A = na_rows;
740 rows_in_buffer_A = na_rows - ceil((math_type)(((math_type)pcol_from_where_to_receive_A - (math_type)my_prow)/(math_type)np_rows))*nblk;
/* No exchange: sizes are those of the locally packed data. */
745 Size_receive_A = Size_send_A;
746 rows_in_buffer_A = rows_in_buffer_A_my_initial;
747 cols_in_buffer_A = cols_in_buffer_A_my_initial;
/*
 * Append trailer elements to the A buffer. Two forms are listed: column count
 * and row count (two elements), or row count only (one element); the
 * conditions selecting between them are not visible. Later code reads the
 * column count at position size-2 and the row count at size-1.
 */
753 Buf_to_receive_A[Size_receive_A] = cols_in_buffer_A;
754 Buf_to_receive_A[Size_receive_A + 1] = rows_in_buffer_A;
755 Size_receive_A = Size_receive_A + 2;
759 Buf_to_receive_A[Size_receive_A] = rows_in_buffer_A;
760 Size_receive_A = Size_receive_A + 1;
/*
 * Second shift-and-multiply phase, writing into Res.
 * Start from the first U buffer kept in U_stored.
 */
765 Size_receive_U = Size_U_skewed;
766 U_to_calc = U_stored;
/* Neighbours for the shift by one position, as in the first phase. */
770 pcol_where_to_send_A = (my_pcol - 1 + np_cols)%np_cols;
771 pcol_from_where_to_receive_A = (my_pcol + 1)%np_cols;
772 where_to_send_U = (my_prow - 1 + np_rows)%np_rows;
773 from_where_to_receive_U = (my_prow + 1)%np_rows;
774 Curr_pos_in_U_stored = Size_U_skewed;
/*
 * np_rows - 1 steps. The non-blocking sends/receives, the conditions on
 * ToStore, the else lines and the closing braces are not visible in this
 * excerpt.
 */
776 for(j = 1; j < np_rows; j++)
/* Swap roles of the A and U send/receive buffers. */
779 data_ptr = Buf_to_send_A;
780 Buf_to_send_A = Buf_to_receive_A;
781 Buf_to_receive_A = data_ptr;
785 data_ptr = Buf_to_send_U;
786 Buf_to_send_U = Buf_to_receive_U;
787 Buf_to_receive_U = data_ptr;
791 Size_send_A = Size_receive_A;
796 Size_send_U = Size_receive_U;
802 U_to_calc = Buf_to_send_U;
/* Row count of U from its trailer; column count depends on where the buffer originated. */
811 rows_in_buffer_U = (
C_INT_TYPE)U_to_calc[Size_receive_U-1];
812 row_of_origin_U = (my_pcol + my_prow + np_cols + j - 1)%np_rows;
813 if (my_pcol >= row_of_origin_U) {
814 cols_in_buffer_U = na_cols;
817 cols_in_buffer_U = na_cols - nblk;
/* Column and row counts of the A buffer, read from its two trailing elements. */
819 cols_in_buffer_A = (
C_INT_TYPE)Buf_to_send_A[Size_receive_A-2];
820 rows_in_buffer_A = (
C_INT_TYPE)Buf_to_send_A[Size_receive_A-1];
/* Smallest originating column of the A pieces held in this step. */
822 col_of_origin_A = np_cols;
823 for(i = 0; i < ratio; i++)
825 intNumber = (my_pcol + my_prow + i*np_rows + np_cols + j - 1)%np_cols;
826 if(intNumber < col_of_origin_A)
827 col_of_origin_A = intNumber;
/* First result column block and the row count of the first packed U block. */
832 if (my_pcol >= row_of_origin_U) {
833 curr_col_loc_res = 0;
836 curr_col_loc_res = nblk;
838 num_of_blocks_in_U_buffer = ceil((math_type)((math_type)cols_in_buffer_U/(math_type)nblk));
839 if (my_pcol >= row_of_origin_U) {
840 rows_in_block_U = ceil(((math_type)(my_pcol + 1) - (math_type)row_of_origin_U)/(math_type)np_rows)*nblk;
843 rows_in_block_U = ratio*nblk;
845 U_local_start = U_to_calc;
/* One pass per column block of U. */
847 for (i = 0; i < num_of_blocks_in_U_buffer; i++)
/* Global column and global block index of the current result block. */
850 curr_col_glob_res = np_cols*nblk*(curr_col_loc_res/nblk) + curr_col_loc_res%nblk + ((np_cols+my_pcol)%np_cols)*nblk;
852 Nb = curr_col_glob_res/nblk;
/* First local row of the result block; the condition guarding the + nblk adjustment is not visible. */
854 curr_row_loc_res = (Nb/np_rows)*nblk;
856 curr_row_loc_res = curr_row_loc_res + nblk;
/* Matching first row inside the packed A buffer. */
858 curr_row_loc_A = curr_row_loc_res;
859 if(col_of_origin_A > my_prow)
860 curr_row_loc_A = curr_row_loc_A - nblk;
862 rows_in_block = rows_in_buffer_A - curr_row_loc_A;
864 curr_col_loc_U = i*nblk;
866 if ((curr_col_loc_U + nblk) <= cols_in_buffer_U) {
867 cols_in_block = nblk;
870 cols_in_block = cols_in_buffer_U - curr_col_loc_U;
872 if (rows_in_block_U > rows_in_buffer_U) {
873 rows_in_block_U = rows_in_buffer_U;
875 A_local_index = curr_row_loc_A;
876 A_local_start = &Buf_to_send_A[A_local_index];
877 Res_ptr = &Res[curr_col_loc_res*na_rows + curr_row_loc_res];
879 LDA_A = rows_in_buffer_A;
881 if ((rows_in_block > 0)&&(cols_in_block > 0))
883 U_local_start_curr = U_local_start;
/* Split the inner dimension into slices of at most nblk and issue one GEMM per slice. */
886 for (ii = 0; ii < ceil((math_type)rows_in_block_U/(math_type)nblk); ii++)
888 if ((ii+1)*nblk <= cols_in_buffer_A) {
889 rows_in_block_U_curr = nblk;
892 rows_in_block_U_curr = cols_in_buffer_A - ii*nblk;
895 nvtxRangePushA(
"GEMM_3");
/* First slice of the first step overwrites Res (beta = dZero); the other call accumulates (beta = dOne). */
897 if ((j == 1)&&(ii == 0)) {
898 C_GEMM(
"N",
"N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &dOne, A_local_start, &LDA_A, U_local_start_curr, &rows_in_block_U, &dZero, Res_ptr, &na_rows);
901 C_GEMM(
"N",
"N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &dOne, A_local_start, &LDA_A, U_local_start_curr, &rows_in_block_U, &dOne, Res_ptr, &na_rows);
/*
 * Move to the next slice of A and U. LDA_A_new is decremented here, but its
 * initialisation is not visible in this excerpt, and the GEMM calls above
 * always pass LDA_A.
 */
906 LDA_A_new = LDA_A_new - nblk;
908 U_local_start_curr = U_local_start_curr + rows_in_block_U_curr;
909 A_local_index = A_local_index - LDA_A + LDA_A*nblk + LDA_A_new;
910 A_local_start = &Buf_to_send_A[A_local_index];
/* Next packed U block; its row count grows by ratio*nblk. */
915 U_local_start = U_local_start + rows_in_block_U*cols_in_block;
916 curr_col_loc_res = curr_col_loc_res + nblk;
917 rows_in_block_U = rows_in_block_U + ratio*nblk;
/* Complete the A transfer and take the received element count. */
920 MPI_Wait(&request_A_Send, &status);
921 MPI_Wait(&request_A_Recv, &status);
922 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI);
923 Size_receive_A = (
C_INT_TYPE) Size_receive_AMPI;
/* Next U buffer taken from U_stored, using the lengths recorded in SizesU. */
927 U_to_calc = &U_stored[Curr_pos_in_U_stored];
928 Curr_pos_in_U_stored = Curr_pos_in_U_stored + SizesU[j-1];
929 Size_receive_U = SizesU[j-1];
/* Alternative: complete the U transfer and take the received element count (the selecting condition is not visible). */
933 MPI_Wait(&request_U_Send, &status);
934 MPI_Wait(&request_U_Recv, &status);
935 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI);
936 Size_receive_U = (
C_INT_TYPE) Size_receive_UMPI;
/*
 * Same block-wise multiplication as in the loop before it, applied to
 * Buf_to_receive_A and the current U buffer. Else lines and closing braces are
 * not visible in this excerpt.
 * NOTE(review): if these statements run after the loop over j has finished,
 * j equals np_rows here (for np_rows > 1), so the j - 1 offset below equals
 * np_rows - 1 and the (j == 1) test further down is only true when np_rows
 * is 1 -- TODO confirm.
 */
/* Use the freshly received U buffer when fewer than np_rows - 1 buffers were stored. */
941 if(ToStore < np_rows - 1)
942 U_to_calc = Buf_to_receive_U;
/* Row count of U from its trailer; column count depends on where the buffer originated. */
943 rows_in_buffer_U = (
C_INT_TYPE)U_to_calc[Size_receive_U-1];
944 row_of_origin_U = (my_pcol + my_prow + np_cols + j - 1)%np_rows;
945 if (my_pcol >= row_of_origin_U) {
946 cols_in_buffer_U = na_cols;
949 cols_in_buffer_U = na_cols - nblk;
/* Column and row counts of the A buffer, read from its two trailing elements. */
951 cols_in_buffer_A = (
C_INT_TYPE)Buf_to_receive_A[Size_receive_A-2];
952 rows_in_buffer_A = (
C_INT_TYPE)Buf_to_receive_A[Size_receive_A-1];
/* Smallest originating column of the A pieces held in this step. */
954 col_of_origin_A = np_cols;
955 for(i = 0; i < ratio; i++)
957 intNumber = (my_pcol + my_prow + i*np_rows + np_cols + np_rows - 1)%np_cols;
958 if(intNumber < col_of_origin_A)
959 col_of_origin_A = intNumber;
/* First result column block and the row count of the first packed U block. */
963 if (my_pcol >= row_of_origin_U) {
964 curr_col_loc_res = 0;
967 curr_col_loc_res = nblk;
969 num_of_blocks_in_U_buffer = ceil((math_type)((math_type)cols_in_buffer_U/(math_type)nblk));
970 if (my_pcol >= row_of_origin_U) {
971 rows_in_block_U = ceil(((math_type)(my_pcol + 1) - (math_type)row_of_origin_U)/(math_type)np_rows)*nblk;
974 rows_in_block_U = ratio*nblk;
976 U_local_start = U_to_calc;
/* One pass per column block of U. */
978 for (i = 0; i < num_of_blocks_in_U_buffer; i++)
/* Global column and global block index of the current result block. */
981 curr_col_glob_res = np_cols*nblk*(curr_col_loc_res/nblk) + curr_col_loc_res%nblk + ((np_cols+my_pcol)%np_cols)*nblk;
983 Nb = curr_col_glob_res/nblk;
/* First local row of the result block; the condition guarding the + nblk adjustment is not visible. */
985 curr_row_loc_res = (Nb/np_rows)*nblk;
987 curr_row_loc_res = curr_row_loc_res + nblk;
/* Matching first row inside the packed A buffer. */
989 curr_row_loc_A = curr_row_loc_res;
990 if(col_of_origin_A > my_prow)
991 curr_row_loc_A = curr_row_loc_A - nblk;
993 rows_in_block = rows_in_buffer_A - curr_row_loc_A;
995 curr_col_loc_U = i*nblk;
997 if ((curr_col_loc_U + nblk) <= cols_in_buffer_U) {
998 cols_in_block = nblk;
1001 cols_in_block = cols_in_buffer_U - curr_col_loc_U;
1003 if (rows_in_block_U > rows_in_buffer_U) {
1004 rows_in_block_U = rows_in_buffer_U;
1007 A_local_index = curr_row_loc_A;
1008 A_local_start = &Buf_to_receive_A[A_local_index];
1009 Res_ptr = &Res[curr_col_loc_res*na_rows + curr_row_loc_res];
1010 LDA_A = rows_in_buffer_A;
1012 if ((rows_in_block > 0) &&(cols_in_block > 0))
1014 U_local_start_curr = U_local_start;
/* Split the inner dimension into slices of at most nblk and issue one GEMM per slice. */
1017 for (ii = 0; ii < ceil((math_type)rows_in_block_U/(math_type)nblk); ii++)
1019 if ((ii+1)*nblk <= cols_in_buffer_A) {
1020 rows_in_block_U_curr = nblk;
1023 rows_in_block_U_curr = cols_in_buffer_A - ii*nblk;
1026 nvtxRangePushA(
"GEMM_4");
/* beta = dZero overwrites Res, beta = dOne accumulates into it. */
1028 if ((j == 1)&&(ii == 0)) {
1029 C_GEMM(
"N",
"N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &dOne, A_local_start, &LDA_A, U_local_start_curr, &rows_in_block_U, &dZero, Res_ptr, &na_rows);
1032 C_GEMM(
"N",
"N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &dOne, A_local_start, &LDA_A, U_local_start_curr, &rows_in_block_U, &dOne, Res_ptr, &na_rows);
/*
 * Move to the next slice. The index update simplifies algebraically to
 * A_local_index - LDA_A + LDA_A*nblk + LDA_A_new (the rows_in_block terms
 * cancel). The initialisation of LDA_A_new is not visible in this excerpt.
 */
1037 LDA_A_new = LDA_A_new - nblk;
1039 U_local_start_curr = U_local_start_curr + rows_in_block_U_curr;
1040 A_local_index = A_local_index - (LDA_A - rows_in_block) + LDA_A*nblk + LDA_A_new - rows_in_block;
1041 A_local_start = &Buf_to_receive_A[A_local_index];
/* Next packed U block; its row count grows by ratio*nblk. */
1046 U_local_start = U_local_start + rows_in_block_U*cols_in_block;
1047 curr_col_loc_res = curr_col_loc_res + nblk;
1048 rows_in_block_U = rows_in_block_U + ratio*nblk;
/*
 * Distributed transpose call with source Res and destination M over the full
 * na x na matrix, scale factors dOne and dZero (presumably M <- Res^T --
 * confirm against the C_PTRAN wrapper).
 */
1052 nvtxRangePushA(
"PTRAN");
1054 C_PTRAN(&na, &na, &dOne, Res, &one, &one, a_desc, &dZero, M, &one, &one, a_desc);
/* Distributed copy from M into Res with uplo "U" (presumably only the upper triangle is overwritten -- confirm). */
1060 nvtxRangePushA(
"PLACPY");
1062 C_PLACPY(
"U", &na, &na, M, &one, &one, a_desc, Res, &one, &one, a_desc);
/*
 * Release the four exchange buffers.
 * NOTE(review): U_stored, Buf_A, M and M_T are also allocated earlier in this
 * routine; their release is not visible in this excerpt -- confirm they are
 * freed after this point.
 */
1067 free(Buf_to_send_A);
1068 free(Buf_to_receive_A);
1069 free(Buf_to_send_U);
1070 free(Buf_to_receive_U);