Neko 1.99.3
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
cuda_intf.F90
Go to the documentation of this file.
1! Copyright (c) 2021-2025, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
35 use utils, only : neko_error
36 use, intrinsic :: iso_c_binding
37 !$ use omp_lib
38 implicit none
39
40#ifdef HAVE_CUDA
41
43 enum, bind(c)
44 enumerator :: cudasuccess = 0
45 enumerator :: cudaerrorinvalidvalue = 1
46 enumerator :: cudaerrormemoryallocation = 2
48 end enum
49
51 enum, bind(c)
52 enumerator :: cudamemcpyhosttohost = 0
53 enumerator :: cudamemcpyhosttodevice = 1
54 enumerator :: cudamemcpydevicetohost = 2
55 enumerator :: cudamemcpydevicetodevice = 3
56 enumerator :: cudamemcpydefault = 4
57 end enum
58
59 interface
60 integer(c_int) function cudamalloc(ptr_d, s) &
61 bind(c, name = 'cudaMalloc')
62 use, intrinsic :: iso_c_binding
63 implicit none
64 type(c_ptr) :: ptr_d
65 integer(c_size_t), value :: s
66 end function cudamalloc
67
68 integer(c_int) function cudafree(ptr_d) &
69 bind(c, name = 'cudaFree')
70 use, intrinsic :: iso_c_binding
71 implicit none
72 type(c_ptr), value :: ptr_d
73 end function cudafree
74
75 integer(c_int) function cudamemcpy(ptr_dst, ptr_src, s, dir) &
76 bind(c, name = 'cudaMemcpy')
77 use, intrinsic :: iso_c_binding
78 implicit none
79 type(c_ptr), value :: ptr_dst, ptr_src
80 integer(c_size_t), value :: s
81 integer(c_int), value :: dir
82 end function cudamemcpy
83
84 integer(c_int) function cudamemcpyasync(ptr_dst, ptr_src, s, dir, stream) &
85 bind(c, name = 'cudaMemcpyAsync')
86 use, intrinsic :: iso_c_binding
87 implicit none
88 type(c_ptr), value :: ptr_dst, ptr_src, stream
89 integer(c_size_t), value :: s
90 integer(c_int), value :: dir
91 end function cudamemcpyasync
92
93 integer(c_int) function cudamemsetasync(ptr, v, s, stream) &
94 bind(c, name = 'cudaMemsetAsync')
95 use, intrinsic :: iso_c_binding
96 implicit none
97 type(c_ptr), value :: ptr, stream
98 integer(c_int), value :: v
99 integer(c_size_t), value :: s
100 end function cudamemsetasync
101
102 integer(c_int) function cudadevicesynchronize() &
103 bind(c, name = 'cudaDeviceSynchronize')
104 use, intrinsic :: iso_c_binding
105 implicit none
106 end function cudadevicesynchronize
107
108 integer(c_int) function cudadevicereset() &
109 bind(c, name = 'cudaDeviceReset')
110 use, intrinsic :: iso_c_binding
111 implicit none
112 end function cudadevicereset
113
114 integer(c_int) function cudagetdeviceproperties(prop, device) &
115 bind(c, name = 'cudaGetDeviceProperties')
116 use, intrinsic :: iso_c_binding
117 implicit none
118 type(c_ptr), value :: prop
119 integer(c_int), value :: device
120 end function cudagetdeviceproperties
121
122 integer(c_int) function cudastreamcreate(stream) &
123 bind(c, name = 'cudaStreamCreate')
124 use, intrinsic :: iso_c_binding
125 implicit none
126 type(c_ptr) :: stream
127 end function cudastreamcreate
128
129 integer(c_int) function cudastreamcreatewithflags(stream, flags) &
130 bind(c, name = 'cudaStreamCreateWithFlags')
131 use, intrinsic :: iso_c_binding
132 implicit none
133 type(c_ptr) :: stream
134 integer(c_int), value :: flags
135 end function cudastreamcreatewithflags
136
137 integer(c_int) function cudastreamcreatewithpriority(stream, flags, prio) &
138 bind(c, name = 'cudaStreamCreateWithPriority')
139 use, intrinsic :: iso_c_binding
140 implicit none
141 type(c_ptr) :: stream
142 integer(c_int), value :: flags, prio
144
145 integer(c_int) function cudastreamdestroy(steam) &
146 bind(c, name = 'cudaStreamDestroy')
147 use, intrinsic :: iso_c_binding
148 implicit none
149 type(c_ptr), value :: steam
150 end function cudastreamdestroy
151
152 integer(c_int) function cudastreamsynchronize(stream) &
153 bind(c, name = 'cudaStreamSynchronize')
154 use, intrinsic :: iso_c_binding
155 implicit none
156 type(c_ptr), value :: stream
157 end function cudastreamsynchronize
158
159 integer(c_int) function cudastreamwaitevent(stream, event, flags) &
160 bind(c, name = 'cudaStreamWaitEvent')
161 use, intrinsic :: iso_c_binding
162 implicit none
163 type(c_ptr), value :: stream, event
164 integer(c_int), value :: flags
165 end function cudastreamwaitevent
166
167 integer(c_int) function cudadevicegetstreampriorityrange &
168 (low_prio, high_prio) &
169 bind(c, name = 'cudaDeviceGetStreamPriorityRange')
170 use, intrinsic :: iso_c_binding
171 implicit none
172 integer(c_int) :: low_prio, high_prio
174
175 integer(c_int) function cudaprofilerstart() &
176 bind(c, name = 'cudaProfilerStart')
177 use, intrinsic :: iso_c_binding
178 implicit none
179 end function cudaprofilerstart
180
181 integer(c_int) function cudaprofilerstop() &
182 bind(c, name = 'cudaProfilerStop')
183 use, intrinsic :: iso_c_binding
184 implicit none
185 end function cudaprofilerstop
186
187 integer(c_int) function cudaeventcreate(event) &
188 bind(c, name = 'cudaEventCreate')
189 use, intrinsic :: iso_c_binding
190 implicit none
191 type(c_ptr) :: event
192 end function cudaeventcreate
193
194 integer(c_int) function cudaeventdestroy(event) &
195 bind(c, name = 'cudaEventDestroy')
196 use, intrinsic :: iso_c_binding
197 implicit none
198 type(c_ptr), value :: event
199 end function cudaeventdestroy
200
201 integer(c_int) function cudaeventcreatewithflags(event, flags) &
202 bind(c, name = 'cudaEventCreateWithFlags')
203 use, intrinsic :: iso_c_binding
204 implicit none
205 type(c_ptr) :: event
206 integer(c_int), value :: flags
207 end function cudaeventcreatewithflags
208
209 integer(c_int) function cudaeventrecord(event, stream) &
210 bind(c, name = 'cudaEventRecord')
211 use, intrinsic :: iso_c_binding
212 implicit none
213 type(c_ptr), value :: event, stream
214 end function cudaeventrecord
215
216 integer(c_int) function cudaeventsynchronize(event) &
217 bind(c, name = 'cudaEventSynchronize')
218 use, intrinsic :: iso_c_binding
219 implicit none
220 type(c_ptr), value :: event
221 end function cudaeventsynchronize
222
223 integer(c_int) function cudagetdevicecount(device_count) &
224 bind(c, name = 'cudaGetDeviceCount')
225 use, intrinsic :: iso_c_binding
226 implicit none
227 integer(c_int) :: device_count
228 end function cudagetdevicecount
229
230 integer(c_int) function cudagetdevice(device) &
231 bind(c, name = 'cudaGetDevice')
232 use, intrinsic :: iso_c_binding
233 implicit none
234 integer(c_int) :: device
235 end function cudagetdevice
236
237 integer(c_int) function cudasetdevice(device) &
238 bind(c, name = 'cudaSetDevice')
239 use, intrinsic :: iso_c_binding
240 implicit none
241 integer(c_int), value :: device
242 end function cudasetdevice
243 end interface
244
245contains
246
247 subroutine cuda_init(glb_cmd_queue, aux_cmd_queue, &
248 STRM_HIGH_PRIO, STRM_LOW_PRIO)
249 type(c_ptr), intent(inout) :: glb_cmd_queue
250 type(c_ptr), intent(inout) :: aux_cmd_queue
251 integer, intent(inout) :: STRM_HIGH_PRIO
252 integer, intent(inout) :: STRM_LOW_PRIO
253 integer(c_int) :: device_id
254 integer :: nthrds = 1
255
256 !$omp parallel
257 !$omp master
258 !$ nthrds = omp_get_num_threads()
259 !$omp end master
260 !$omp end parallel
261
262 ! Ensure that all threads are assigned to the same device
263 if (nthrds .gt. 1) then
264 if (cudagetdevice(device_id) .ne. cudasuccess) then
265 call neko_error('Error retrieving device id')
266 end if
267
268 !$omp parallel
269 if (cudasetdevice(device_id) .ne. cudasuccess) then
270 call neko_error('Error setting device id')
271 end if
272 !$omp end parallel
273 end if
274
275 if (cudadevicegetstreampriorityrange(strm_low_prio, strm_high_prio) &
276 .ne. cudasuccess) then
277 call neko_error('Error retrieving stream priority range')
278 end if
279
280 if (cudastreamcreatewithpriority(glb_cmd_queue, 1, strm_high_prio) &
281 .ne. cudasuccess) then
282 call neko_error('Error creating main stream')
283 end if
284
285 if (cudastreamcreatewithpriority(aux_cmd_queue, 1, strm_low_prio) &
286 .ne. cudasuccess) then
287 call neko_error('Error creating aux stream')
288 end if
289 end subroutine cuda_init
290
291 subroutine cuda_finalize(glb_cmd_queue, aux_cmd_queue)
292 type(c_ptr), intent(inout) :: glb_cmd_queue
293 type(c_ptr), intent(inout) :: aux_cmd_queue
294 integer(c_int) :: ierr
295
296 if (cudastreamdestroy(glb_cmd_queue) .ne. cudasuccess) then
297 call neko_error('Error destroying main stream')
298 end if
299
300 if (cudastreamdestroy(aux_cmd_queue) .ne. cudasuccess) then
301 call neko_error('Error destroying aux stream')
302 end if
303
304 ! Best-effort context teardown to release runtime-owned allocations.
305 ierr = cudadevicesynchronize()
306 ierr = cudadevicereset()
307 end subroutine cuda_finalize
308
309 subroutine cuda_device_name(name)
310 character(len=*), intent(inout) :: name
311 character(kind=c_char, len=8192), target :: prop
312 integer :: end_pos
313
314 !
315 ! Yes this is an ugly hack!
316 ! Since we're only interested in the device name (first 256 bytes)
317 ! we pass down a large enough chunk of memory to the cuda runtime
318 ! and extract what we need later on
319 !
320 ! This will of course break if sizeof(cudaDeviceProp) > 8192
321 !
322
323 if (cudagetdeviceproperties(c_loc(prop), 0) .ne. cudasuccess) then
324 call neko_error('Failed to query device')
325 end if
326
327 end_pos = scan(prop(1:256), c_null_char)
328 if (end_pos .ge. 2) then
329 name(1:end_pos-1) = prop(1:end_pos-1)
330 end if
331 end subroutine cuda_device_name
332
334 integer function cuda_device_count()
335 integer(c_int) :: num_devices
336
337 if (cudagetdevicecount(num_devices) .ne. cudasuccess) then
338 call neko_error('Error retrieving device count')
339 end if
340
341 cuda_device_count = num_devices
342 end function cuda_device_count
343
344#endif
345
346end module cuda_intf
Fortran CUDA interface.
Definition cuda_intf.F90:34
subroutine cuda_device_name(name)
subroutine cuda_finalize(glb_cmd_queue, aux_cmd_queue)
@ cudamemcpydevicetohost
Definition cuda_intf.F90:54
@ cudamemcpydefault
Definition cuda_intf.F90:56
@ cudamemcpyhosttohost
Definition cuda_intf.F90:52
@ cudamemcpydevicetodevice
Definition cuda_intf.F90:55
@ cudamemcpyhosttodevice
Definition cuda_intf.F90:53
integer function cuda_device_count()
Return the number of avaialble CUDA devices.
@ cudaerrorinitializationerror
Definition cuda_intf.F90:47
@ cudaerrormemoryallocation
Definition cuda_intf.F90:46
@ cudaerrorinvalidvalue
Definition cuda_intf.F90:45
subroutine cuda_init(glb_cmd_queue, aux_cmd_queue, strm_high_prio, strm_low_prio)
Device abstraction, common interface for various accelerators.
Definition device.F90:34
Utilities.
Definition utils.f90:35