85void cannons_triang_rectangular_impl(math_type* U, math_type* B,
C_INT_TYPE np_rows,
C_INT_TYPE np_cols,
C_INT_TYPE my_prow,
C_INT_TYPE my_pcol,
C_INT_TYPE_PTR U_desc,
C_INT_TYPE_PTR b_desc, math_type *Res, MPI_Comm row_comm, MPI_Comm col_comm)
97 C_INT_TYPE na, nb, nblk, width, na_rows, na_cols, nb_cols, cols_in_buffer_U_my_initial, cols_in_buffer_U, rows_in_buffer_U, Size_receive_U_now, rows_in_buffer_U_now, cols_in_buffer_U_now, rows_in_buffer_U_my_initial;
99 C_INT_MPI_TYPE Size_receive_U_nowMPI, Size_receive_UMPI, Size_receive_BMPI;
100 C_INT_TYPE i, j, Size_send_U, Size_receive_U, Size_send_B, Size_receive_B, intNumber, Buf_rows, Buf_cols_U, Buf_cols_B, curr_rows, num_of_iters, cols_in_buffer, rows_in_block, curr_col_loc, cols_in_block, num_of_blocks_in_U_buffer, col_of_origin_U, b_rows_mult, b_cols_mult;
102 math_type *Buf_to_send_U, *Buf_to_receive_U, *Buf_to_send_B, *Buf_to_receive_B, *Buf_U, *PosBuff;
104 C_INT_TYPE where_to_send_U, from_where_to_receive_U, where_to_send_B, from_where_to_receive_B, last_proc_col_B, last_proc_row_B, n, Size_U_stored, proc_col_min;
106 math_type *U_local_start, *Buf_pos, *B_local_start, *double_ptr, *CopyTo, *CopyFrom;
114 math_type done = 1.0;
115 math_type dzero = 0.0;
121 na_rows = numroc_(&na, &nblk, &my_prow, &zero, &np_rows);
122 na_cols = numroc_(&na, &nblk, &my_pcol, &zero, &np_cols);
123 nb_cols = numroc_(&nb, &nblk, &my_pcol, &zero, &np_cols);
125 MPI_Request request_U_Recv;
126 MPI_Request request_U_Send;
127 MPI_Request request_B_Recv;
128 MPI_Request request_B_Send;
131 last_proc_col_B = ((nb-1)/nblk) % np_cols;
132 last_proc_row_B = ((na-1)/nblk) % np_rows;
137 if(my_pcol <= last_proc_col_B)
138 Buf_cols_B = nb_cols;
140 Buf_cols_B = nb_cols + nblk;
142 if(my_pcol < last_proc_col_B)
143 Buf_cols_B = nb_cols;
144 else if(my_pcol > last_proc_col_B)
145 Buf_cols_B = nb_cols + nblk;
147 Buf_cols_B = nb_cols + nblk - nb_cols%nblk;
150 if(my_prow <= last_proc_row_B)
153 Buf_rows = na_rows + nblk;
155 if(my_prow < last_proc_row_B)
157 else if(my_prow > last_proc_row_B)
158 Buf_rows = na_rows + nblk;
160 Buf_rows = na_rows + nblk - na_rows%nblk;
162 ratio = np_cols/np_rows;
164 intNumber = ceil((math_type)na/(math_type)(np_cols*nblk));
165 Size_U_stored = ratio*nblk*nblk*intNumber*(intNumber+1)/2 + 2;
167 Buf_to_send_U = malloc(ratio*Size_U_stored*
sizeof(math_type));
168 Buf_to_receive_U = malloc(ratio*Size_U_stored*
sizeof(math_type));
169 Buf_to_send_B = malloc(Buf_cols_B*Buf_rows*
sizeof(math_type));
170 Buf_to_receive_B = malloc(Buf_cols_B*Buf_rows*
sizeof(math_type));
172 Buf_U = malloc(Size_U_stored*
sizeof(math_type));
174 for(i = 0; i < na_rows*nb_cols; i++)
180 if((ratio != 1)||(my_prow != 0))
181 Buf_pos = Buf_to_send_U;
183 Buf_pos = Buf_to_receive_U;
187 if(my_pcol >= my_prow)
192 num_of_iters = ceil((math_type)na_cols/(math_type)nblk);
193 num_of_iters = num_of_iters - curr_col_loc;
194 curr_col_loc = curr_col_loc*nblk;
196 if(my_pcol >= my_prow )
197 rows_in_block = ceil(((math_type)(my_pcol + 1) - (math_type)my_prow)/(math_type)np_rows)*nblk;
199 rows_in_block = ratio*nblk;
200 cols_in_buffer_U_my_initial = 0;
202 for(i = 0; i < num_of_iters; i++)
204 if(rows_in_block > na_rows)
205 rows_in_block = na_rows;
207 if ((na_cols - curr_col_loc) < nblk)
208 cols_in_block = na_cols - curr_col_loc;
210 cols_in_block = nblk;
212 if((rows_in_block > 0)&&(cols_in_block > 0))
214 double_ptr = &U[curr_col_loc*na_rows];
215 C_LACPY(
"A", &rows_in_block, &cols_in_block, double_ptr, &na_rows, Buf_pos, &rows_in_block);
216 Buf_pos = Buf_pos + rows_in_block*cols_in_block;
217 Size_send_U = Size_send_U + rows_in_block*cols_in_block;
218 cols_in_buffer_U_my_initial = cols_in_buffer_U_my_initial + cols_in_block;
220 curr_col_loc = curr_col_loc + nblk;
221 rows_in_block = rows_in_block + ratio*nblk;
223 rows_in_buffer_U_my_initial = rows_in_block - ratio*nblk;
224 *Buf_pos = (math_type)cols_in_buffer_U_my_initial;
225 Buf_pos = Buf_pos + 1;
226 *Buf_pos = (math_type)rows_in_buffer_U_my_initial;
227 Size_send_U = Size_send_U + 2;
231 proc_col_min = np_cols;
232 for(i = 0; i < ratio; i++)
234 from_where_to_receive_U = (my_pcol + my_prow + i*np_rows)%np_cols;
235 if(from_where_to_receive_U < proc_col_min)
236 proc_col_min = from_where_to_receive_U;
241 cols_in_buffer_U = 0;
242 rows_in_buffer_U = 0;
243 for(i = 0; i < ratio; i++)
245 where_to_send_U = (my_pcol - my_prow - i*np_rows + np_cols)%np_cols;
246 from_where_to_receive_U = (my_pcol + my_prow + i*np_rows)%np_cols;
251 if(where_to_send_U != my_pcol)
253 MPI_Sendrecv(Buf_to_send_U, (
C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) where_to_send_U, 0, Buf_U, (
C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) from_where_to_receive_U, 0, row_comm, &status);
254 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_U_nowMPI);
255 Size_receive_U_now = (
C_INT_TYPE) Size_receive_U_nowMPI;
256 Size_receive_U = Size_receive_U + Size_receive_U_now - 2;
258 cols_in_buffer_U_now = Buf_U[Size_receive_U_now - 2];
259 cols_in_buffer_U = cols_in_buffer_U + cols_in_buffer_U_now;
260 rows_in_buffer_U_now = Buf_U[Size_receive_U_now - 1];
262 if(rows_in_buffer_U < rows_in_buffer_U_now)
263 rows_in_buffer_U = rows_in_buffer_U_now;
265 intNumber = from_where_to_receive_U/np_rows;
266 if(proc_col_min >= my_prow)
267 CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber + 1)/2];
269 if(from_where_to_receive_U < my_prow)
270 CopyTo = &Buf_to_receive_U[nblk*nblk*ratio*(ratio - 1)/2];
272 CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber - 1)/2];
277 cols_in_buffer_U_now = cols_in_buffer_U_my_initial;
278 cols_in_buffer_U = cols_in_buffer_U + cols_in_buffer_U_now;
280 rows_in_buffer_U_now = rows_in_buffer_U_my_initial;
281 if(rows_in_buffer_U < rows_in_buffer_U_now)
282 rows_in_buffer_U = rows_in_buffer_U_now;
284 intNumber = my_pcol/np_rows;
285 if(proc_col_min >= my_prow)
286 CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber + 1)/2];
288 if(my_pcol < my_prow)
289 CopyTo = &Buf_to_receive_U[nblk*nblk*ratio*(ratio - 1)/2];
291 CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber - 1)/2];
292 CopyFrom = Buf_to_send_U;
293 Size_receive_U = Size_receive_U + Size_send_U - 2;
297 intNumber = ceil((math_type)cols_in_buffer_U_now/(math_type)nblk);
298 if(from_where_to_receive_U >= my_prow)
299 rows_in_block = ceil(((math_type)(from_where_to_receive_U + 1) - (math_type)my_prow)/(math_type)np_rows)*nblk;
301 rows_in_block = ratio*nblk;
302 for(j = 0; j < intNumber; j++)
304 if((j+1)*nblk < cols_in_buffer_U_now)
305 cols_in_block = nblk;
307 cols_in_block = cols_in_buffer_U_now - j*nblk;
309 C_LACPY(
"A", &rows_in_block, &cols_in_block, CopyFrom, &rows_in_block, CopyTo, &rows_in_block);
311 CopyFrom = CopyFrom + rows_in_block*cols_in_block;
312 CopyTo = CopyTo + ratio*rows_in_block*nblk + nblk*nblk*ratio*(ratio-1)/2;
313 rows_in_block = rows_in_block + ratio*nblk;
314 if(rows_in_block > rows_in_buffer_U_now)
315 rows_in_block = rows_in_buffer_U_now;
322 MPI_Sendrecv(Buf_to_send_U, (
C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) where_to_send_U, 0, Buf_to_receive_U, (
C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) from_where_to_receive_U, 0, row_comm, &status);
323 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI);
324 Size_receive_U = (
C_INT_TYPE) Size_receive_UMPI;
326 cols_in_buffer_U = (
C_INT_TYPE)Buf_to_receive_U[Size_receive_U-2];
327 rows_in_buffer_U = (
C_INT_TYPE)Buf_to_receive_U[Size_receive_U-1];
331 Size_receive_U = Size_send_U;
332 rows_in_buffer_U = rows_in_buffer_U_my_initial;
333 cols_in_buffer_U = cols_in_buffer_U_my_initial;
339 Buf_to_receive_U[Size_receive_U] = cols_in_buffer_U;
340 Buf_to_receive_U[Size_receive_U + 1] = rows_in_buffer_U;
341 Size_receive_U = Size_receive_U + 2;
348 where_to_send_B = (my_prow - my_pcol + np_cols)%np_rows;
349 from_where_to_receive_B = (my_pcol + my_prow)%np_rows;
352 if(where_to_send_B != my_prow)
355 C_LACPY(
"A", &na_rows, &nb_cols, B, &na_rows, Buf_to_send_B, &na_rows);
356 MPI_Sendrecv(Buf_to_send_B, (
C_INT_MPI_TYPE) (nb_cols*na_rows), MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) where_to_send_B, 0, Buf_to_receive_B, (
C_INT_MPI_TYPE) (nb_cols*Buf_rows), MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) from_where_to_receive_B, 0, col_comm, &status);
357 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_BMPI);
358 if (nb_cols!=0) Size_receive_B = (
C_INT_TYPE) Size_receive_BMPI / nb_cols;
359 else Size_receive_B=0;
363 C_LACPY(
"A", &na_rows, &nb_cols, B, &na_rows, Buf_to_receive_B, &na_rows);
364 Size_receive_B = na_rows;
369 C_LACPY(
"A", &na_rows, &nb_cols, B, &na_rows, Buf_to_receive_B, &na_rows);
370 Size_receive_B = na_rows;
374 where_to_send_U = (my_pcol - 1 + np_cols)%np_cols;
375 from_where_to_receive_U = (my_pcol + 1)%np_cols;
376 where_to_send_B = (my_prow - 1 + np_rows)%np_rows;
377 from_where_to_receive_B = (my_prow + 1)%np_rows;
379 for(i = 1; i < np_rows; i++)
382 double_ptr = Buf_to_send_U;
383 Buf_to_send_U = Buf_to_receive_U;
384 Buf_to_receive_U = double_ptr;
386 double_ptr = Buf_to_send_B;
387 Buf_to_send_B = Buf_to_receive_B;
388 Buf_to_receive_B = double_ptr;
390 Size_send_U = Size_receive_U;
391 Size_send_B = Size_receive_B;
394 MPI_Isend(Buf_to_send_U, (
C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) where_to_send_U, 0, row_comm, &request_U_Send);
395 MPI_Irecv(Buf_to_receive_U, (
C_INT_MPI_TYPE) (ratio*Size_U_stored), MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) from_where_to_receive_U, 0, row_comm, &request_U_Recv);
397 MPI_Isend(Buf_to_send_B, (
C_INT_MPI_TYPE) (Size_send_B*nb_cols), MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) where_to_send_B, 0, col_comm, &request_B_Send);
398 MPI_Irecv(Buf_to_receive_B, (
C_INT_MPI_TYPE) (Buf_rows*nb_cols), MPI_MATH_DATATYPE_PRECISION_C, (
C_INT_MPI_TYPE) from_where_to_receive_B, 0, col_comm, &request_B_Recv);
400 cols_in_buffer_U = (
C_INT_TYPE)Buf_to_send_U[Size_receive_U-2];
401 rows_in_buffer_U = (
C_INT_TYPE)Buf_to_send_U[Size_receive_U-1];
403 proc_col_min = np_cols;
404 for(j = 0; j < ratio; j++)
406 col_of_origin_U = (my_pcol + my_prow + i - 1 + j*np_rows)%np_cols;
407 if(col_of_origin_U < proc_col_min)
408 proc_col_min = col_of_origin_U;
410 col_of_origin_U = proc_col_min;
412 num_of_blocks_in_U_buffer = ceil((math_type)cols_in_buffer_U/(math_type)nblk);
414 if (col_of_origin_U >= my_prow)
415 B_local_start = Buf_to_send_B;
417 B_local_start = Buf_to_send_B + nblk;
419 U_local_start = Buf_to_send_U;
421 for(j = 0; j < num_of_blocks_in_U_buffer; j++)
423 curr_rows = (j+1)*nblk;
424 if (curr_rows > rows_in_buffer_U)
425 curr_rows = rows_in_buffer_U;
427 if((j+1)*nblk <= cols_in_buffer_U)
430 b_rows_mult = cols_in_buffer_U - j*nblk;
432 if(Size_receive_B!=0) C_GEMM(
"N",
"N", &curr_rows, &nb_cols, &b_rows_mult, &done, U_local_start, &curr_rows, B_local_start, &Size_receive_B, &done, Res, &na_rows);
434 U_local_start = U_local_start + nblk*curr_rows;
435 B_local_start = B_local_start + nblk;
438 MPI_Wait(&request_U_Send, &status);
439 MPI_Wait(&request_U_Recv, &status);
440 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI);
441 Size_receive_U = (
C_INT_TYPE) Size_receive_UMPI;
443 MPI_Wait(&request_B_Send, &status);
444 MPI_Wait(&request_B_Recv, &status);
445 MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_BMPI);
446 if (nb_cols!=0) Size_receive_B = (
C_INT_TYPE) Size_receive_BMPI / nb_cols;
447 else Size_receive_B=0;
452 cols_in_buffer_U = (
C_INT_TYPE)Buf_to_receive_U[Size_receive_U-2];
453 rows_in_buffer_U = (
C_INT_TYPE)Buf_to_receive_U[Size_receive_U-1];
455 proc_col_min = np_cols;
456 for(j = 0; j < ratio; j++)
458 col_of_origin_U = (my_pcol + my_prow + np_rows - 1 + j*np_rows)%np_cols;
459 if(col_of_origin_U < proc_col_min)
460 proc_col_min = col_of_origin_U;
462 col_of_origin_U = proc_col_min;
464 num_of_blocks_in_U_buffer = ceil((math_type)cols_in_buffer_U/(math_type)nblk);
466 if (col_of_origin_U >= my_prow)
467 B_local_start = Buf_to_receive_B;
469 B_local_start = Buf_to_receive_B + nblk;
471 U_local_start = Buf_to_receive_U;
473 for(j = 0; j < num_of_blocks_in_U_buffer; j++)
475 curr_rows = (j+1)*nblk;
476 if (curr_rows > rows_in_buffer_U)
477 curr_rows = rows_in_buffer_U;
479 if((j+1)*nblk <= cols_in_buffer_U)
482 b_rows_mult = cols_in_buffer_U - j*nblk;
484 if(Size_receive_B!=0) C_GEMM(
"N",
"N", &curr_rows, &nb_cols, &b_rows_mult, &done, U_local_start, &curr_rows, B_local_start, &Size_receive_B, &done, Res, &na_rows);
486 U_local_start = U_local_start + nblk*curr_rows;
487 B_local_start = B_local_start + nblk;
491 free(Buf_to_receive_U);
493 free(Buf_to_receive_B);