94 MPI_Comm row_comm, MPI_Comm col_comm,
95 int wantDebug,
int useGPU, intptr_t *gpublasHandle)
107 C_INT_TYPE na, nblk, i, j, Size_send_A, Size_receive_A, Size_send_U, Size_receive_U, Buf_rows, Buf_cols,
108 pcol_where_to_send_A, pcol_from_where_to_receive_A, where_to_send_U, from_where_to_receive_U,
109 last_proc_row, last_proc_col, cols_in_buffer_A, rows_in_buffer_A, intNumber;
110 C_INT_TYPE ratio, num_of_iters, cols_in_buffer, rows_in_block, rows_in_buffer, curr_col_loc, cols_in_block,
111 curr_col_glob, curr_row_loc, Size_receive_A_now, Nb, owner, cols_in_buffer_A_now;
112 C_INT_MPI_TYPE Size_receive_A_nowMPI, Size_receive_AMPI, Size_receive_UMPI;
114 math_type *Buf_to_send_A, *Buf_to_receive_A, *Buf_to_send_U, *Buf_to_receive_U, *data_ptr, *Buf_A,
115 *Buf_pos, *U_local_start, *Res_ptr, *M, *M_T, *A_local_start, *U_local_start_curr, *U_stored,
116 *CopyTo, *CopyFrom, *U_to_calc;
118 C_INT_TYPE row_of_origin_U, rows_in_block_U, num_of_blocks_in_U_buffer, k, startPos, cols_in_buffer_U,
119 rows_in_buffer_U, col_of_origin_A, curr_row_loc_res, curr_row_loc_A, curr_col_glob_res;
120 C_INT_TYPE curr_col_loc_res, curr_col_loc_buf, proc_row_curr, curr_col_loc_U, A_local_index,
121 LDA_A, LDA_A_new, index_row_A_for_LDA, ii, rows_in_block_U_curr, width, row_origin_U,
122 rows_in_block_A, cols_in_buffer_A_my_initial, rows_in_buffer_A_my_initial, proc_col_min;
124 C_INT_TYPE Size_U_skewed, Size_U_stored, Curr_pos_in_U_stored, rows_in_buffer_A_now;
125 math_type dOne = 1.0;
126 math_type dZero = 0.0;
132 MPI_Request request_A_Recv;
133 MPI_Request request_A_Send;
134 MPI_Request request_U_Recv;
135 MPI_Request request_U_Send;
139 na_rows = numroc_(&na, &nblk, &my_prow, &zero, &np_rows);
140 na_cols = numroc_(&na, &nblk, &my_pcol, &zero, &np_cols);
149 if (np_cols%np_rows != 0)
156 if (np_cols < np_rows != 0)
163 ratio = np_cols/np_rows;
164 last_proc_row = ((na-1)/nblk) % np_rows;
165 last_proc_col = ((na-1)/nblk) % np_cols;
169 if (my_pcol <= last_proc_col) {
173 Buf_cols = na_cols + nblk;
177 if (my_pcol < last_proc_col) {
180 else if (my_pcol > last_proc_col) {
181 Buf_cols = na_cols + nblk;
184 Buf_cols = na_cols + nblk - na_cols%nblk;
189 if (my_prow <= last_proc_row) {
190 Buf_rows = na_rows + 1;
193 Buf_rows = na_rows + nblk;
197 if (my_prow < last_proc_row) {
200 else if (my_prow > last_proc_row) {
201 Buf_rows = na_rows + nblk;
204 Buf_rows = na_rows + nblk - na_rows%nblk;
208 intNumber = ceil((math_type)na/(math_type)(np_cols*nblk));
209 Size_U_stored = ratio*nblk*nblk*intNumber*(intNumber+1)/2 + 2;
211 U_stored = malloc((Size_U_stored*(ToStore+1))*
sizeof(math_type));
213 Buf_to_send_A = malloc(ratio*Buf_cols*Buf_rows*
sizeof(math_type));
214 Buf_to_receive_A = malloc(ratio*Buf_cols*Buf_rows*
sizeof(math_type));
215 Buf_to_send_U = malloc(Size_U_stored*
sizeof(math_type));
216 Buf_to_receive_U = malloc(Size_U_stored*
sizeof(math_type));
218 Buf_A = malloc(Buf_cols*Buf_rows*
sizeof(math_type));
219 M = calloc(na_rows*na_cols,
sizeof(math_type));
220 M_T = malloc(na_rows*na_cols*
sizeof(math_type));
222 math_type *Buf_to_send_receive_A_dev;
223 math_type *Buf_to_send_receive_U_dev;
226 math_type *U_local_start_dev, *Res_ptr_dev;
227 math_type *A_local_start_dev, *U_local_start_curr_dev;
232 gpuErrCheck(
gpuMalloc((intptr_t *)&Buf_to_send_receive_A_dev , ratio*Buf_cols*Buf_rows*
sizeof(math_type)) );
233 gpuErrCheck(
gpuMalloc((intptr_t *)&Buf_to_send_receive_U_dev , Size_U_stored*
sizeof(math_type)) );
245 nvtxRangePushA(
"LACPY");
247 C_LACPY(
"A", &na_rows, &na_cols, A, &na_rows, Buf_to_send_A, &na_rows);
255 for(i = 0; i < ratio; i++)
257 pcol_where_to_send_A = (my_pcol - my_prow - i*np_rows + np_cols)%np_cols;
258 pcol_from_where_to_receive_A = (my_pcol + my_prow + i*np_rows)%np_cols;
263 if(pcol_where_to_send_A != my_pcol)
265 MPI_Sendrecv(Buf_to_send_A, (
C_INT_MPI_TYPE) (na_cols*na_rows) , MPI_MATH_DATATYPE_PRECISION_C,
267 Buf_A , (
C_INT_MPI_TYPE) (na_rows*Buf_cols), MPI_MATH_DATATYPE_PRECISION_C,
270 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_A_nowMPI);
271 Size_receive_A_now = (
C_INT_TYPE) Size_receive_A_nowMPI;
272 Size_receive_A_now = Size_receive_A_now/na_rows;
275 Size_receive_A_now = na_cols;
278 Size_receive_A = Size_receive_A + Size_receive_A_now;
281 intNumber = pcol_from_where_to_receive_A/np_rows;
283 CopyTo = &Buf_to_receive_A[intNumber*na_rows*nblk];
284 if (pcol_where_to_send_A != my_pcol) {
291 intNumber = ceil((math_type)Size_receive_A_now/(math_type)nblk);
292 for(j = 0; j < intNumber; j++)
295 if(nblk*(j+1) > Size_receive_A_now)
296 width = Size_receive_A_now - nblk*j;
297 C_LACPY(
"A", &na_rows, &width, CopyFrom, &na_rows, CopyTo, &na_rows);
298 CopyTo = CopyTo + na_rows*nblk*ratio;
299 CopyFrom = CopyFrom + na_rows*nblk;
306 C_LACPY(
"A", &na_rows, &na_cols, A, &na_rows, Buf_to_send_A, &na_rows);
307 MPI_Sendrecv(Buf_to_send_A , (
C_INT_MPI_TYPE) (na_cols*na_rows) , MPI_MATH_DATATYPE_PRECISION_C,
309 Buf_to_receive_A, (
C_INT_MPI_TYPE) (na_rows*Buf_cols), MPI_MATH_DATATYPE_PRECISION_C,
312 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI);
313 Size_receive_A = (
C_INT_TYPE) Size_receive_AMPI;
314 Size_receive_A = Size_receive_A/na_rows;
318 C_LACPY(
"A", &na_rows, &na_cols, A, &na_rows, Buf_to_receive_A, &na_rows);
319 Size_receive_A = na_cols;
331 num_of_iters = ceil((math_type)na_cols/(math_type)nblk);
333 where_to_send_U = (my_prow - my_pcol + np_cols)%np_rows;
334 from_where_to_receive_U = (my_pcol + my_prow)%np_rows;
336 if (where_to_send_U == my_prow) {
337 Buf_pos = Buf_to_receive_U;
340 Buf_pos = Buf_to_send_U;
343 if (my_pcol >= my_prow) {
349 num_of_iters = num_of_iters - curr_col_loc;
350 curr_col_loc = curr_col_loc*nblk;
352 if (my_pcol >= my_prow ) {
353 rows_in_block = ceil(((math_type)(my_pcol + 1) - (math_type)my_prow)/(math_type)np_rows)*nblk;
356 rows_in_block = ratio*nblk;
359 for(i = 0; i < num_of_iters; i++)
361 if (rows_in_block > na_rows) {
362 rows_in_block = na_rows;
364 if ((na_cols - curr_col_loc) < nblk) {
365 cols_in_block = na_cols - curr_col_loc;
368 cols_in_block = nblk;
370 if ((rows_in_block > 0)&&(cols_in_block > 0))
372 data_ptr = &U[curr_col_loc*na_rows];
373 C_LACPY(
"A", &rows_in_block, &cols_in_block, data_ptr, &na_rows, Buf_pos, &rows_in_block);
374 Buf_pos = Buf_pos + rows_in_block*cols_in_block;
375 Size_send_U = Size_send_U + rows_in_block*cols_in_block;
377 curr_col_loc = curr_col_loc + nblk;
378 rows_in_block = rows_in_block + ratio*nblk;
380 rows_in_buffer = rows_in_block - ratio*nblk;
381 *Buf_pos = (math_type)rows_in_buffer;
382 Size_send_U = Size_send_U + 1;
385 if (where_to_send_U != my_prow)
388 MPI_Sendrecv(Buf_to_send_U , (
C_INT_MPI_TYPE) Size_send_U , MPI_MATH_DATATYPE_PRECISION_C,
390 Buf_to_receive_U, (
C_INT_MPI_TYPE) (Buf_rows*na_cols), MPI_MATH_DATATYPE_PRECISION_C,
393 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI);
394 Size_receive_U = (
C_INT_TYPE) Size_receive_UMPI;
397 Size_receive_U = Size_send_U;
399 for(i = 0; i < Size_receive_U; i++)
400 U_stored[i] = Buf_to_receive_U[i];
401 Size_U_skewed = Size_receive_U;
402 Curr_pos_in_U_stored = Size_U_skewed;
408 pcol_where_to_send_A = (my_pcol - 1 + np_cols)%np_cols;
409 pcol_from_where_to_receive_A = (my_pcol + 1)%np_cols;
410 where_to_send_U = (my_prow - 1 + np_rows)%np_rows;
411 from_where_to_receive_U = (my_prow + 1)%np_rows;
414 for(j = 1; j < np_rows; j++)
417 data_ptr = Buf_to_send_A;
418 Buf_to_send_A = Buf_to_receive_A;
419 Buf_to_receive_A = data_ptr;
421 data_ptr = Buf_to_send_U;
422 Buf_to_send_U = Buf_to_receive_U;
423 Buf_to_receive_U = data_ptr;
426 Size_send_A = Size_receive_A;
427 MPI_Isend(Buf_to_send_A, (
C_INT_MPI_TYPE) (Size_send_A*na_rows), MPI_MATH_DATATYPE_PRECISION_C,
429 MPI_Irecv(Buf_to_receive_A, (
C_INT_MPI_TYPE) (Buf_cols*na_rows*ratio), MPI_MATH_DATATYPE_PRECISION_C,
433 Size_send_U = Size_receive_U;
434 MPI_Isend(Buf_to_send_U, (
C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C,
436 MPI_Irecv(Buf_to_receive_U, (
C_INT_MPI_TYPE) (Buf_rows*na_cols), MPI_MATH_DATATYPE_PRECISION_C,
440 rows_in_buffer = (int)Buf_to_send_U[Size_receive_U-1];
441 row_origin_U = (my_pcol + my_prow + np_cols + j - 1)%np_rows;
443 if((my_pcol >= my_prow)&&(my_pcol >= row_origin_U))
445 cols_in_buffer = na_cols;
446 curr_col_loc_res = 0;
447 curr_col_loc_buf = 0;
449 if((my_pcol < my_prow)&&(my_pcol < row_origin_U))
451 cols_in_buffer = na_cols - nblk;
452 curr_col_loc_res = nblk;
453 curr_col_loc_buf = 0;
455 if((my_pcol >= my_prow)&&(my_pcol < row_origin_U))
457 cols_in_buffer = na_cols - nblk;
458 curr_col_loc_res = nblk;
459 curr_col_loc_buf = 0;
461 if((my_pcol < my_prow)&&(my_pcol >= row_origin_U))
463 cols_in_buffer = na_cols;
464 curr_col_loc_res = nblk;
465 curr_col_loc_buf = nblk;
468 num_of_blocks_in_U_buffer = ceil(((math_type)cols_in_buffer - (math_type)curr_col_loc_buf)/(math_type)nblk);
470 startPos = (curr_col_loc_buf + nblk)*curr_col_loc_buf/2;
478 U_local_start_dev = Buf_to_send_receive_U_dev + startPos;
479 Res_ptr_dev = M_dev + curr_col_loc_res*na_rows;
482 U_local_start = &Buf_to_send_U[startPos];
483 Res_ptr = &M[curr_col_loc_res*na_rows];
487 for (i = 0; i < num_of_blocks_in_U_buffer; i++)
489 curr_col_glob = (curr_col_loc_res/nblk)*nblk*np_cols + my_pcol*nblk;
490 proc_row_curr = (curr_col_glob/nblk)%np_rows;
491 rows_in_block_A = (curr_col_glob/(nblk*np_rows))*nblk;
492 if (my_prow <= proc_row_curr) {
493 rows_in_block_A = rows_in_block_A + nblk;
495 if (rows_in_block_A > na_rows) {
496 rows_in_block_A = na_rows;
498 if ((curr_col_loc_buf + nblk) <= cols_in_buffer) {
499 cols_in_block = nblk;
502 cols_in_block = cols_in_buffer - curr_col_loc_buf;
505 rows_in_block_U = (curr_col_glob/(nblk*np_rows))*nblk;
506 if (proc_row_curr >= row_origin_U) {
507 rows_in_block_U = rows_in_block_U + nblk;
509 if (rows_in_block_U > rows_in_buffer) {
510 rows_in_block_U = rows_in_buffer;
513 if ((rows_in_block_A > 0)&&(cols_in_block > 0)) {
520 rows_in_block_A, cols_in_block, rows_in_block_U, dOne,
521 Buf_to_send_receive_A_dev, na_rows,
522 U_local_start_dev, rows_in_block_U, dOne,
523 Res_ptr_dev, na_rows);
527 C_GEMM(
"N",
"N", &rows_in_block_A, &cols_in_block, &rows_in_block_U, &dOne,
528 Buf_to_send_A, &na_rows, U_local_start, &rows_in_block_U, &dOne, Res_ptr, &na_rows);
533 curr_col_loc_res = curr_col_loc_res + nblk;
534 curr_col_loc_buf = curr_col_loc_buf + nblk;
537 U_local_start_dev = U_local_start_dev + rows_in_block_U*cols_in_block;
538 Res_ptr_dev = M_dev + curr_col_loc_res*na_rows;
541 U_local_start = U_local_start + rows_in_block_U*cols_in_block;
542 Res_ptr = &M[curr_col_loc_res*na_rows];
548 MPI_Wait(&request_A_Send, &status);
549 MPI_Wait(&request_A_Recv, &status);
551 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI);
552 Size_receive_A = (
C_INT_TYPE) Size_receive_AMPI;
553 Size_receive_A = Size_receive_A / na_rows;
556 MPI_Wait(&request_U_Send, &status);
557 MPI_Wait(&request_U_Recv, &status);
558 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI);
559 Size_receive_U = (
C_INT_TYPE) Size_receive_UMPI;
563 for(k = 0; k < Size_receive_U; k++)
564 U_stored[Curr_pos_in_U_stored + k] = Buf_to_receive_U[k];
565 Curr_pos_in_U_stored = Curr_pos_in_U_stored + Size_receive_U;
566 SizesU[j-1] = Size_receive_U;
573 rows_in_buffer = (
C_INT_TYPE)Buf_to_receive_U[Size_receive_U-1];
574 row_origin_U = (my_pcol + my_prow + np_cols + np_rows -1)%np_rows;
576 if((my_pcol >= my_prow)&&(my_pcol >= row_origin_U))
578 cols_in_buffer = na_cols;
579 curr_col_loc_res = 0;
580 curr_col_loc_buf = 0;
582 if((my_pcol < my_prow)&&(my_pcol < row_origin_U))
584 cols_in_buffer = na_cols - nblk;
585 curr_col_loc_res = nblk;
586 curr_col_loc_buf = 0;
588 if((my_pcol >= my_prow)&&(my_pcol < row_origin_U))
590 cols_in_buffer = na_cols - nblk;
591 curr_col_loc_res = nblk;
592 curr_col_loc_buf = 0;
594 if((my_pcol < my_prow)&&(my_pcol >= row_origin_U))
596 cols_in_buffer = na_cols;
597 curr_col_loc_res = nblk;
598 curr_col_loc_buf = nblk;
601 num_of_blocks_in_U_buffer = ceil(((math_type)cols_in_buffer - (math_type)curr_col_loc_buf)/(math_type)nblk);
603 startPos = (curr_col_loc_buf + nblk)*curr_col_loc_buf/2;
611 U_local_start_dev = Buf_to_send_receive_U_dev + startPos;
612 Res_ptr_dev = M_dev + curr_col_loc_res*na_rows;
615 U_local_start = &Buf_to_receive_U[startPos];
616 Res_ptr = &M[curr_col_loc_res*na_rows];
620 nvtxRangePushA(
"loop-last i<num_of_blocks_in_U_buffer");
622 for (i = 0; i < num_of_blocks_in_U_buffer; i++)
624 curr_col_glob = (curr_col_loc_res/nblk)*nblk*np_cols + my_pcol*nblk;
625 proc_row_curr = (curr_col_glob/nblk)%np_rows;
626 rows_in_block_A = (curr_col_glob/(nblk*np_rows))*nblk;
627 if (my_prow <= proc_row_curr) {
628 rows_in_block_A = rows_in_block_A + nblk;
630 if (rows_in_block_A > na_rows) {
631 rows_in_block_A = na_rows;
633 if ((curr_col_loc_buf + nblk) <= cols_in_buffer) {
634 cols_in_block = nblk;
637 cols_in_block = cols_in_buffer - curr_col_loc_buf;
639 rows_in_block_U = (curr_col_glob/(nblk*np_rows))*nblk;
640 if (proc_row_curr >= row_origin_U) {
641 rows_in_block_U = rows_in_block_U + nblk;
643 if (rows_in_block_U > rows_in_buffer) {
644 rows_in_block_U = rows_in_buffer;
646 if ((rows_in_block_A > 0)&&(cols_in_block > 0)) {
648 nvtxRangePushA(
"GEMM_1_last");
654 rows_in_block_A, cols_in_block, rows_in_block_U, dOne,
655 Buf_to_send_receive_A_dev, na_rows,
656 U_local_start_dev, rows_in_block_U, dOne,
657 Res_ptr_dev, na_rows);
661 C_GEMM(
"N",
"N", &rows_in_block_A, &cols_in_block, &rows_in_block_U, &dOne,
662 Buf_to_receive_A, &na_rows, U_local_start, &rows_in_block_U, &dOne, Res_ptr, &na_rows);
669 curr_col_loc_res = curr_col_loc_res + nblk;
670 curr_col_loc_buf = curr_col_loc_buf + nblk;
673 U_local_start_dev = U_local_start_dev + rows_in_block_U*cols_in_block;
674 Res_ptr_dev = M_dev + curr_col_loc_res*na_rows;
677 U_local_start = U_local_start + rows_in_block_U*cols_in_block;
678 Res_ptr = &M[curr_col_loc_res*na_rows];
690 nvtxRangePushA(
"PTRAN");
692 C_PTRAN(&na, &na, &dOne, M, &one, &one, a_desc, &dZero, M_T, &one, &one, a_desc);
701 nvtxRangePushA(
"initial reordering of A");
705 if ((ratio != 1)||(my_prow != 0)) {
706 Buf_pos = Buf_to_send_A;
709 Buf_pos = Buf_to_receive_A;
712 num_of_iters = ceil((math_type)na_cols/(math_type)nblk);
714 cols_in_buffer_A_my_initial = 0;
717 if (my_pcol <= my_prow)
720 rows_in_buffer_A_my_initial = na_rows;
724 curr_row_loc = ceil((math_type)(((math_type)my_pcol - (math_type)my_prow)/(math_type)np_rows))*nblk;
725 rows_in_buffer_A_my_initial = na_rows - curr_row_loc;
728 for(i = 0; i < num_of_iters; i++)
730 curr_col_loc = i*nblk;
731 rows_in_block = na_rows - curr_row_loc;
733 if ((na_cols - curr_col_loc) < nblk) {
734 cols_in_block = na_cols - curr_col_loc;
737 cols_in_block = nblk;
739 if ((rows_in_block > 0)&&(cols_in_block > 0))
741 A_local_start = &M_T[curr_col_loc*na_rows + curr_row_loc];
742 C_LACPY(
"A", &rows_in_block, &cols_in_block, A_local_start, &na_rows, Buf_pos, &rows_in_block);
743 Buf_pos = Buf_pos + rows_in_block*cols_in_block;
744 Size_send_A = Size_send_A + rows_in_block*cols_in_block;
745 cols_in_buffer_A_my_initial = cols_in_buffer_A_my_initial + cols_in_block;
747 curr_row_loc = curr_row_loc + ratio*nblk;
749 *Buf_pos = (math_type)cols_in_buffer_A_my_initial;
750 Size_send_A = Size_send_A + 1;
754 proc_col_min = np_cols;
755 for(i = 0; i < ratio; i++)
757 pcol_from_where_to_receive_A = (my_pcol + my_prow + i*np_rows)%np_cols;
758 if(pcol_from_where_to_receive_A < proc_col_min)
759 proc_col_min = pcol_from_where_to_receive_A;
763 cols_in_buffer_A = 0;
764 rows_in_buffer_A = 0;
765 for(i = 0; i < ratio; i++)
767 pcol_where_to_send_A = (my_pcol - my_prow - i*np_rows + np_cols)%np_cols;
768 pcol_from_where_to_receive_A = (my_pcol + my_prow + i*np_rows)%np_cols;
773 if(pcol_where_to_send_A != my_pcol)
775 MPI_Sendrecv(Buf_to_send_A, (
C_INT_MPI_TYPE) Size_send_A , MPI_MATH_DATATYPE_PRECISION_C,
777 Buf_A , (
C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C,
781 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_A_nowMPI);
782 Size_receive_A_now = (
C_INT_TYPE) Size_receive_A_nowMPI;
784 Size_receive_A = Size_receive_A + Size_receive_A_now - 1;
786 cols_in_buffer_A_now = Buf_A[Size_receive_A_now-1];
787 cols_in_buffer_A = cols_in_buffer_A + cols_in_buffer_A_now;
790 if(pcol_from_where_to_receive_A <= my_prow)
792 rows_in_buffer_A_now = na_rows;
796 rows_in_buffer_A_now = na_rows - ceil((math_type)(((math_type)pcol_from_where_to_receive_A - (math_type)my_prow)/(math_type)np_rows))*nblk;
798 if(rows_in_buffer_A < rows_in_buffer_A_now)
799 rows_in_buffer_A = rows_in_buffer_A_now;
801 intNumber = pcol_from_where_to_receive_A/np_rows;
802 if (proc_col_min <= my_prow) {
803 CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*(intNumber-1)*intNumber/2)];
806 CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*intNumber*(intNumber+1)/2)];
812 cols_in_buffer_A_now = cols_in_buffer_A_my_initial;
813 cols_in_buffer_A = cols_in_buffer_A + cols_in_buffer_A_now;
815 rows_in_buffer_A_now = rows_in_buffer_A_my_initial;
816 if(rows_in_buffer_A < rows_in_buffer_A_now)
817 rows_in_buffer_A = rows_in_buffer_A_now;
819 intNumber = my_pcol/np_rows;
820 if (proc_col_min <= my_prow) {
821 CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*(intNumber-1)*intNumber/2)];
824 CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*intNumber*(intNumber+1)/2)];
826 CopyFrom = Buf_to_send_A;
828 Size_receive_A = Size_receive_A + Size_send_A - 1;
832 intNumber = ceil((math_type)cols_in_buffer_A_now/(math_type)nblk);
833 rows_in_block = rows_in_buffer_A_now;
834 for(j = 0; j < intNumber; j++)
836 if ((j+1)*nblk < cols_in_buffer_A_now) {
837 cols_in_block = nblk;
840 cols_in_block = cols_in_buffer_A_now - j*nblk;
842 C_LACPY(
"A", &rows_in_block, &cols_in_block, CopyFrom, &rows_in_block, CopyTo, &rows_in_block);
844 CopyFrom = CopyFrom + rows_in_block*cols_in_block;
845 CopyTo = CopyTo + nblk*(ratio*rows_in_block - nblk*(ratio-1)*ratio/2);
846 rows_in_block = rows_in_block - ratio*nblk;
853 MPI_Sendrecv(Buf_to_send_A , (
C_INT_MPI_TYPE) Size_send_A , MPI_MATH_DATATYPE_PRECISION_C,
855 Buf_to_receive_A, (
C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C,
859 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI);
861 Size_receive_A = (
C_INT_TYPE) Size_receive_AMPI;
863 cols_in_buffer_A = (
C_INT_TYPE)Buf_to_receive_A[Size_receive_A-1];
864 if(pcol_from_where_to_receive_A <= my_prow)
866 rows_in_buffer_A = na_rows;
870 rows_in_buffer_A = na_rows - ceil((math_type)(((math_type)pcol_from_where_to_receive_A - (math_type)my_prow)/(math_type)np_rows))*nblk;
875 Size_receive_A = Size_send_A;
876 rows_in_buffer_A = rows_in_buffer_A_my_initial;
877 cols_in_buffer_A = cols_in_buffer_A_my_initial;
883 Buf_to_receive_A[Size_receive_A] = cols_in_buffer_A;
884 Buf_to_receive_A[Size_receive_A + 1] = rows_in_buffer_A;
885 Size_receive_A = Size_receive_A + 2;
889 Buf_to_receive_A[Size_receive_A] = rows_in_buffer_A;
890 Size_receive_A = Size_receive_A + 1;
899 Size_receive_U = Size_U_skewed;
900 U_to_calc = U_stored;
904 pcol_where_to_send_A = (my_pcol - 1 + np_cols)%np_cols;
905 pcol_from_where_to_receive_A = (my_pcol + 1)%np_cols;
906 where_to_send_U = (my_prow - 1 + np_rows)%np_rows;
907 from_where_to_receive_U = (my_prow + 1)%np_rows;
908 Curr_pos_in_U_stored = Size_U_skewed;
915 nvtxRangePushA(
"loop j<np_rows");
917 for(j = 1; j < np_rows; j++)
920 data_ptr = Buf_to_send_A;
921 Buf_to_send_A = Buf_to_receive_A;
922 Buf_to_receive_A = data_ptr;
926 data_ptr = Buf_to_send_U;
927 Buf_to_send_U = Buf_to_receive_U;
928 Buf_to_receive_U = data_ptr;
932 Size_send_A = Size_receive_A;
937 Size_send_U = Size_receive_U;
943 U_to_calc = Buf_to_send_U;
952 rows_in_buffer_U = (
C_INT_TYPE)U_to_calc[Size_receive_U-1];
953 row_of_origin_U = (my_pcol + my_prow + np_cols + j - 1)%np_rows;
954 if (my_pcol >= row_of_origin_U) {
955 cols_in_buffer_U = na_cols;
958 cols_in_buffer_U = na_cols - nblk;
960 cols_in_buffer_A = (
C_INT_TYPE)Buf_to_send_A[Size_receive_A-2];
961 rows_in_buffer_A = (
C_INT_TYPE)Buf_to_send_A[Size_receive_A-1];
963 col_of_origin_A = np_cols;
964 for(i = 0; i < ratio; i++)
966 intNumber = (my_pcol + my_prow + i*np_rows + np_cols + j - 1)%np_cols;
967 if(intNumber < col_of_origin_A)
968 col_of_origin_A = intNumber;
973 if (my_pcol >= row_of_origin_U) {
974 curr_col_loc_res = 0;
977 curr_col_loc_res = nblk;
979 num_of_blocks_in_U_buffer = ceil((math_type)((math_type)cols_in_buffer_U/(math_type)nblk));
980 if (my_pcol >= row_of_origin_U) {
981 rows_in_block_U = ceil(((math_type)(my_pcol + 1) - (math_type)row_of_origin_U)/(math_type)np_rows)*nblk;
984 rows_in_block_U = ratio*nblk;
993 U_local_start_dev = Buf_to_send_receive_U_dev;
996 U_local_start = U_to_calc;
1000 for (i = 0; i < num_of_blocks_in_U_buffer; i++)
1003 curr_col_glob_res = np_cols*nblk*(curr_col_loc_res/nblk) + curr_col_loc_res%nblk + ((np_cols+my_pcol)%np_cols)*nblk;
1005 Nb = curr_col_glob_res/nblk;
1007 curr_row_loc_res = (Nb/np_rows)*nblk;
1009 curr_row_loc_res = curr_row_loc_res + nblk;
1011 curr_row_loc_A = curr_row_loc_res;
1012 if(col_of_origin_A > my_prow)
1013 curr_row_loc_A = curr_row_loc_A - nblk;
1015 rows_in_block = rows_in_buffer_A - curr_row_loc_A;
1017 curr_col_loc_U = i*nblk;
1019 if ((curr_col_loc_U + nblk) <= cols_in_buffer_U) {
1020 cols_in_block = nblk;
1023 cols_in_block = cols_in_buffer_U - curr_col_loc_U;
1025 if (rows_in_block_U > rows_in_buffer_U) {
1026 rows_in_block_U = rows_in_buffer_U;
1028 A_local_index = curr_row_loc_A;
1031 A_local_start_dev = Buf_to_send_receive_A_dev + A_local_index;
1032 Res_ptr_dev = M_dev + curr_col_loc_res*na_rows + curr_row_loc_res;
1035 A_local_start = &Buf_to_send_A[A_local_index];
1036 Res_ptr = &Res[curr_col_loc_res*na_rows + curr_row_loc_res];
1039 LDA_A = rows_in_buffer_A;
1041 if ((rows_in_block > 0)&&(cols_in_block > 0))
1044 U_local_start_curr_dev = U_local_start_dev;
1047 U_local_start_curr = U_local_start;
1051 for (ii = 0; ii < ceil((math_type)rows_in_block_U/(math_type)nblk); ii++)
1053 if ((ii+1)*nblk <= cols_in_buffer_A) {
1054 rows_in_block_U_curr = nblk;
1057 rows_in_block_U_curr = cols_in_buffer_A - ii*nblk;
1065 rows_in_block, cols_in_block, rows_in_block_U_curr, dOne,
1066 A_local_start_dev, LDA_A,
1067 U_local_start_curr_dev, rows_in_block_U, dOne,
1068 Res_ptr_dev, na_rows);
1072 C_GEMM(
"N",
"N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &dOne,
1073 A_local_start, &LDA_A, U_local_start_curr, &rows_in_block_U, &dOne, Res_ptr, &na_rows);
1077 LDA_A_new = LDA_A_new - nblk;
1078 A_local_index = A_local_index - LDA_A + LDA_A*nblk + LDA_A_new;
1081 A_local_start_dev = Buf_to_send_receive_A_dev + A_local_index;
1082 U_local_start_curr_dev = U_local_start_curr_dev + rows_in_block_U_curr;
1085 A_local_start = &Buf_to_send_A[A_local_index];
1086 U_local_start_curr = U_local_start_curr + rows_in_block_U_curr;
1094 U_local_start_dev = U_local_start_dev + rows_in_block_U*cols_in_block;
1097 U_local_start = U_local_start + rows_in_block_U*cols_in_block;
1100 curr_col_loc_res = curr_col_loc_res + nblk;
1101 rows_in_block_U = rows_in_block_U + ratio*nblk;
1105 MPI_Wait(&request_A_Send, &status);
1106 MPI_Wait(&request_A_Recv, &status);
1107 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI);
1108 Size_receive_A = (
C_INT_TYPE) Size_receive_AMPI;
1112 U_to_calc = &U_stored[Curr_pos_in_U_stored];
1113 Curr_pos_in_U_stored = Curr_pos_in_U_stored + SizesU[j-1];
1114 Size_receive_U = SizesU[j-1];
1118 MPI_Wait(&request_U_Send, &status);
1119 MPI_Wait(&request_U_Recv, &status);
1120 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI);
1121 Size_receive_U = (
C_INT_TYPE) Size_receive_UMPI;
1130 if(ToStore < np_rows - 1)
1131 U_to_calc = Buf_to_receive_U;
1132 rows_in_buffer_U = (
C_INT_TYPE)U_to_calc[Size_receive_U-1];
1133 row_of_origin_U = (my_pcol + my_prow + np_cols + j - 1)%np_rows;
1134 if (my_pcol >= row_of_origin_U) {
1135 cols_in_buffer_U = na_cols;
1138 cols_in_buffer_U = na_cols - nblk;
1140 cols_in_buffer_A = (
C_INT_TYPE)Buf_to_receive_A[Size_receive_A-2];
1141 rows_in_buffer_A = (
C_INT_TYPE)Buf_to_receive_A[Size_receive_A-1];
1143 col_of_origin_A = np_cols;
1144 for(i = 0; i < ratio; i++)
1146 intNumber = (my_pcol + my_prow + i*np_rows + np_cols + np_rows - 1)%np_cols;
1147 if(intNumber < col_of_origin_A)
1148 col_of_origin_A = intNumber;
1152 if (my_pcol >= row_of_origin_U) {
1153 curr_col_loc_res = 0;
1156 curr_col_loc_res = nblk;
1158 num_of_blocks_in_U_buffer = ceil((math_type)((math_type)cols_in_buffer_U/(math_type)nblk));
1159 if (my_pcol >= row_of_origin_U) {
1160 rows_in_block_U = ceil(((math_type)(my_pcol + 1) - (math_type)row_of_origin_U)/(math_type)np_rows)*nblk;
1163 rows_in_block_U = ratio*nblk;
1172 U_local_start_dev = Buf_to_send_receive_U_dev;
1175 U_local_start = U_to_calc;
1178 for (i = 0; i < num_of_blocks_in_U_buffer; i++)
1181 curr_col_glob_res = np_cols*nblk*(curr_col_loc_res/nblk) + curr_col_loc_res%nblk + ((np_cols+my_pcol)%np_cols)*nblk;
1183 Nb = curr_col_glob_res/nblk;
1185 curr_row_loc_res = (Nb/np_rows)*nblk;
1187 curr_row_loc_res = curr_row_loc_res + nblk;
1189 curr_row_loc_A = curr_row_loc_res;
1190 if(col_of_origin_A > my_prow)
1191 curr_row_loc_A = curr_row_loc_A - nblk;
1193 rows_in_block = rows_in_buffer_A - curr_row_loc_A;
1195 curr_col_loc_U = i*nblk;
1197 if ((curr_col_loc_U + nblk) <= cols_in_buffer_U) {
1198 cols_in_block = nblk;
1201 cols_in_block = cols_in_buffer_U - curr_col_loc_U;
1203 if (rows_in_block_U > rows_in_buffer_U) {
1204 rows_in_block_U = rows_in_buffer_U;
1207 A_local_index = curr_row_loc_A;
1210 A_local_start_dev = Buf_to_send_receive_A_dev + A_local_index;
1211 Res_ptr_dev = M_dev + curr_col_loc_res*na_rows + curr_row_loc_res;
1214 A_local_start = &Buf_to_receive_A[A_local_index];
1215 Res_ptr = &Res[curr_col_loc_res*na_rows + curr_row_loc_res];
1218 LDA_A = rows_in_buffer_A;
1221 if ((rows_in_block > 0) &&(cols_in_block > 0))
1224 U_local_start_curr_dev = U_local_start_dev;
1227 U_local_start_curr = U_local_start;
1231 for (ii = 0; ii < ceil((math_type)rows_in_block_U/(math_type)nblk); ii++)
1233 if ((ii+1)*nblk <= cols_in_buffer_A) {
1234 rows_in_block_U_curr = nblk;
1237 rows_in_block_U_curr = cols_in_buffer_A - ii*nblk;
1245 rows_in_block, cols_in_block, rows_in_block_U_curr, dOne,
1246 A_local_start_dev, LDA_A,
1247 U_local_start_curr_dev, rows_in_block_U, dOne,
1248 Res_ptr_dev, na_rows);
1252 C_GEMM(
"N",
"N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &dOne,
1253 A_local_start, &LDA_A, U_local_start_curr, &rows_in_block_U, &dOne, Res_ptr, &na_rows);
1257 LDA_A_new = LDA_A_new - nblk;
1258 A_local_index = A_local_index - (LDA_A - rows_in_block) + LDA_A*nblk + LDA_A_new - rows_in_block;
1261 A_local_start_dev = Buf_to_send_receive_A_dev + A_local_index;
1262 U_local_start_curr_dev = U_local_start_curr_dev + rows_in_block_U_curr;
1265 A_local_start = &Buf_to_receive_A[A_local_index];
1266 U_local_start_curr = U_local_start_curr + rows_in_block_U_curr;
1274 U_local_start_dev = U_local_start_dev + rows_in_block_U*cols_in_block;
1277 U_local_start = U_local_start + rows_in_block_U*cols_in_block;
1280 curr_col_loc_res = curr_col_loc_res + nblk;
1281 rows_in_block_U = rows_in_block_U + ratio*nblk;
1287 nvtxRangePushA(
"PTRAN");
1289 C_PTRAN(&na, &na, &dOne, Res, &one, &one, a_desc, &dZero, M, &one, &one, a_desc);
1295 nvtxRangePushA(
"PLACPY");
1297 C_PLACPY(
"U", &na, &na, M, &one, &one, a_desc, Res, &one, &one, a_desc);
1308 free(Buf_to_send_A);
1309 free(Buf_to_receive_A);
1310 free(Buf_to_send_U);
1311 free(Buf_to_receive_U);