Eigenvalue SoLvers for Petaflop-Applications (ELPA) 2025.06.001.rc1
merge_recursive_template.F90
#ifdef SOLVE_TRIDI_GPU_BUILD
recursive subroutine merge_recursive_gpu_&
           &precision &
           (obj, np_off, nprocs, ldq, matrixcols, nblk, &
           l_col, p_col, l_col_bc, p_col_bc, limits, &
           np_cols, na, q_dev, d, e, &
           mpi_comm_all, mpi_comm_rows, mpi_comm_cols, &
           usegpu, wantdebug, success, max_threads)
#else
recursive subroutine merge_recursive_cpu_&
           &precision &
           (obj, np_off, nprocs, ldq, matrixcols, nblk, &
           l_col, p_col, l_col_bc, p_col_bc, limits, &
           np_cols, na, q, d, e, &
           mpi_comm_all, mpi_comm_rows, mpi_comm_cols, &
           usegpu, wantdebug, success, max_threads)
#endif

  use precision
!#ifdef WITH_OPENMP_TRADITIONAL
!  use elpa_omp
!#endif
  use elpa_mpi
  use elpa_utilities
#if defined(WITH_NVIDIA_GPU_VERSION) && defined(WITH_NVTX)
  use cuda_functions ! for NVTX labels
#elif defined(WITH_AMD_GPU_VERSION) && defined(WITH_ROCTX)
  use hip_functions  ! for ROCTX labels
#endif
  implicit none

  ! noff is always a multiple of nblk_ev
  ! nlen-noff is always > nblk_ev

  class(elpa_abstract_impl_t), intent(inout) :: obj
  integer(kind=ik), intent(in)               :: max_threads
  integer(kind=ik), intent(in)               :: mpi_comm_all, mpi_comm_rows, mpi_comm_cols
  integer(kind=ik), intent(in)               :: ldq, matrixcols, nblk, na, np_cols
  integer(kind=ik), intent(in)               :: l_col_bc(na), p_col_bc(na), l_col(na), p_col(na), &
                                                limits(0:np_cols)
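
  ! Note on the index arguments, as inferred from their use below: limits(np) marks the
  ! global boundary of the eigenvalue subproblem owned by process column np, so
  ! limits(np_off) is this group's offset and limits(np_off+nprocs) its upper end.
  ! l_col/p_col appear to give the local column index and owning process column of each
  ! global column in the current dense distribution, while l_col_bc/p_col_bc describe
  ! the final block-cyclic distribution used in the last merge.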

  integer(kind=c_intptr_t)                   :: q_dev
#ifdef USE_ASSUMED_SIZE
#ifdef SOLVE_TRIDI_GPU_BUILD
  real(kind=real_datatype)                   :: q(ldq,matrixcols)
#else
  real(kind=real_datatype)                   :: q(ldq,*)
#endif
#else
  real(kind=real_datatype)                   :: q(ldq,matrixcols)
#endif

#ifdef WITH_MPI
  integer(kind=MPI_KIND)                     :: mpierr, my_pcolmpi
#endif
  integer(kind=ik)                           :: my_pcol
  real(kind=real_datatype), intent(inout)    :: d(na), e(na)
  integer(kind=ik)                           :: np_off, nprocs
  integer(kind=ik)                           :: np1, np2, noff, nlen, nmid, n
  logical, intent(in)                        :: usegpu, wantdebug
  logical, intent(out)                       :: success

  success = .true.

#ifdef WITH_MPI
  call obj%timer%start("mpi_communication")
  call mpi_comm_rank(int(mpi_comm_cols,kind=mpi_kind), my_pcolmpi, mpierr)

  my_pcol = int(my_pcolmpi,kind=c_int)
  call obj%timer%stop("mpi_communication")
#endif

  if (nprocs<=1) then
    ! Safety check only
    if (wantdebug) write(error_unit,*) "ELPA1_merge_recursive: INTERNAL error merge_recursive: nprocs=",nprocs
    success = .false.
    return
  endif

  ! Split problem into 2 subproblems of size np1 / np2

  np1 = nprocs/2
  np2 = nprocs-np1
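
  ! Each half of the process-column group first merges its own subproblems
  ! recursively; the two partial results are then combined by merge_systems below.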

  if (np1 > 1) then
    if (usegpu) then
      call merge_recursive_gpu_&
           &precision &
           (obj, np_off, np1, ldq, matrixcols, nblk, &
           l_col, p_col, l_col_bc, p_col_bc, limits, &
           np_cols, na, q_dev, d, e, &
           mpi_comm_all, mpi_comm_rows, mpi_comm_cols, &
           usegpu, wantdebug, success, max_threads)
    else
      call merge_recursive_cpu_&
           &precision &
           (obj, np_off, np1, ldq, matrixcols, nblk, &
           l_col, p_col, l_col_bc, p_col_bc, limits, &
           np_cols, na, q, d, e, &
           mpi_comm_all, mpi_comm_rows, mpi_comm_cols, &
           usegpu, wantdebug, success, max_threads)
    endif ! useGPU
  endif ! (np1 > 1)

  if (.not.(success)) then
    write(error_unit,*) "Error in merge_recursive. Aborting..."
    return
  endif

  if (np2 > 1) then
    if (usegpu) then
      call merge_recursive_gpu_&
           &precision &
           (obj, np_off+np1, np2, ldq, matrixcols, nblk, &
           l_col, p_col, l_col_bc, p_col_bc, limits, &
           np_cols, na, q_dev, d, e, &
           mpi_comm_all, mpi_comm_rows, mpi_comm_cols, &
           usegpu, wantdebug, success, max_threads)
    else
      call merge_recursive_cpu_&
           &precision &
           (obj, np_off+np1, np2, ldq, matrixcols, nblk, &
           l_col, p_col, l_col_bc, p_col_bc, limits, &
           np_cols, na, q, d, e, &
           mpi_comm_all, mpi_comm_rows, mpi_comm_cols, &
           usegpu, wantdebug, success, max_threads)
    endif ! useGPU
  endif ! (np2 > 1)

  if (.not.(success)) then
134 write(error_unit,*) "Error in merge_recursice. Aborting..."
135 return
136 endif
137
138 noff = limits(np_off)
139 nmid = limits(np_off+np1) - noff
140 nlen = limits(np_off+nprocs) - noff
141
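  ! Exchange the eigenvalues d of the two half-problems between the halves, so that
  ! every process column of this group holds d(noff+1:noff+nlen) before the final merge.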
#ifdef WITH_MPI
  call obj%timer%start("mpi_communication")
  if (my_pcol==np_off) then
    do n=np_off+np1,np_off+nprocs-1
      call mpi_send(d(noff+1), int(nmid,kind=mpi_kind), mpi_real_precision, int(n,kind=mpi_kind), 12_mpi_kind, &
                    int(mpi_comm_cols,kind=mpi_kind), mpierr)
    enddo
  endif
  call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
#endif /* WITH_MPI */

  if (my_pcol>=np_off+np1 .and. my_pcol<np_off+nprocs) then
#ifdef WITH_MPI
    call obj%timer%start("mpi_communication")
    call mpi_recv(d(noff+1), int(nmid,kind=mpi_kind), mpi_real_precision, int(np_off,kind=mpi_kind), 12_mpi_kind, &
                  int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
    call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
!   d(noff+1:noff+1+nmid-1) = d(noff+1:noff+1+nmid-1)
#endif /* WITH_MPI */
  endif

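  ! Reverse direction: process column np_off+np1 sends the second half's eigenvalues
  ! to all process columns of the first half.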
  if (my_pcol==np_off+np1) then
    do n=np_off,np_off+np1-1
#ifdef WITH_MPI
      call obj%timer%start("mpi_communication")
      call mpi_send(d(noff+nmid+1), int(nlen-nmid,kind=mpi_kind), mpi_real_precision, int(n,kind=mpi_kind), &
                    15_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpierr)
      call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
#endif /* WITH_MPI */

    enddo
  endif

  if (my_pcol>=np_off .and. my_pcol<np_off+np1) then
#ifdef WITH_MPI
    call obj%timer%start("mpi_communication")
    call mpi_recv(d(noff+nmid+1), int(nlen-nmid,kind=mpi_kind), mpi_real_precision, int(np_off+np1,kind=mpi_kind), &
                  15_mpi_kind, int(mpi_comm_cols,kind=mpi_kind), mpi_status_ignore, mpierr)
    call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
!   d(noff+nmid+1:noff+nmid+1+nlen-nmid-1) = d(noff+nmid+1:noff+nmid+1+nlen-nmid-1)
#endif /* WITH_MPI */
  endif
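
  ! merge_systems combines the two half-problems (the rank-one-update merge step of the
  ! divide-and-conquer algorithm). The index arrays passed differ depending on whether
  ! this is the final merge (block-cyclic result distribution) or an intermediate one
  ! (dense column distribution).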
  if (nprocs == np_cols) then

    ! Last merge, result distribution must be block cyclic, noff==0,
    ! p_col_bc is set so that only nev eigenvalues are calculated
    if (usegpu) then
      nvtx_range_push("merge_systems_gpu")
      call merge_systems_gpu_&
           &precision &
           (obj, nlen, nmid, d(noff+1), e(noff+nmid), q_dev, ldq, noff, &
           nblk, matrixcols, int(mpi_comm_rows,kind=ik), int(mpi_comm_cols,kind=ik), &
           l_col, p_col, &
           l_col_bc, p_col_bc, np_off, nprocs, usegpu, wantdebug, success, max_threads)
      nvtx_range_pop("merge_systems_gpu")
    else
      call merge_systems_cpu_&
           &precision &
           (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
           nblk, matrixcols, int(mpi_comm_rows,kind=ik), int(mpi_comm_cols,kind=ik), &
           l_col, p_col, &
           l_col_bc, p_col_bc, np_off, nprocs, usegpu, wantdebug, success, max_threads)
    endif
    if (.not.(success)) then
      write(error_unit,*) "Error in merge_systems: Aborting..."
      return
    endif

  else
    ! Not last merge, leave dense column distribution
    if (usegpu) then
      nvtx_range_push("merge_systems_gpu")
      call merge_systems_gpu_&
           &precision &
           (obj, nlen, nmid, d(noff+1), e(noff+nmid), q_dev, ldq, noff, &
           nblk, matrixcols, int(mpi_comm_rows,kind=ik), int(mpi_comm_cols,kind=ik), &
           l_col(noff+1), p_col(noff+1), &
           l_col(noff+1), p_col(noff+1), np_off, nprocs, usegpu, wantdebug, success, &
           max_threads)
      nvtx_range_pop("merge_systems_gpu")
    else
      call merge_systems_cpu_&
           &precision &
           (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
           nblk, matrixcols, int(mpi_comm_rows,kind=ik), int(mpi_comm_cols,kind=ik), &
           l_col(noff+1), p_col(noff+1), &
           l_col(noff+1), p_col(noff+1), np_off, nprocs, usegpu, wantdebug, success, &
           max_threads)
    endif
    if (.not.(success)) then
      write(error_unit,*) "Error in merge_systems: Aborting..."
      return
    endif
  endif

end