Neko 1.99.1
A portable framework for high-order spectral element flow simulations
Loading...
Searching...
No Matches
cuda_intf.F90
Go to the documentation of this file.
1! Copyright (c) 2021-2025, The Neko Authors
2! All rights reserved.
3!
4! Redistribution and use in source and binary forms, with or without
5! modification, are permitted provided that the following conditions
6! are met:
7!
8! * Redistributions of source code must retain the above copyright
9! notice, this list of conditions and the following disclaimer.
10!
11! * Redistributions in binary form must reproduce the above
12! copyright notice, this list of conditions and the following
13! disclaimer in the documentation and/or other materials provided
14! with the distribution.
15!
16! * Neither the name of the authors nor the names of its
17! contributors may be used to endorse or promote products derived
18! from this software without specific prior written permission.
19!
20! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31! POSSIBILITY OF SUCH DAMAGE.
32!
35 use utils, only : neko_error
36 use, intrinsic :: iso_c_binding
37 !$ use omp_lib
38 implicit none
39
40#ifdef HAVE_CUDA
41
43 enum, bind(c)
44 enumerator :: cudasuccess = 0
45 enumerator :: cudaerrorinvalidvalue = 1
46 enumerator :: cudaerrormemoryallocation = 2
48 end enum
49
51 enum, bind(c)
52 enumerator :: cudamemcpyhosttohost = 0
53 enumerator :: cudamemcpyhosttodevice = 1
54 enumerator :: cudamemcpydevicetohost = 2
55 enumerator :: cudamemcpydevicetodevice = 3
56 enumerator :: cudamemcpydefault = 4
57 end enum
58
59 interface
60 integer(c_int) function cudamalloc(ptr_d, s) &
61 bind(c, name = 'cudaMalloc')
62 use, intrinsic :: iso_c_binding
63 implicit none
64 type(c_ptr) :: ptr_d
65 integer(c_size_t), value :: s
66 end function cudamalloc
67 end interface
68
69 interface
70 integer(c_int) function cudafree(ptr_d) &
71 bind(c, name = 'cudaFree')
72 use, intrinsic :: iso_c_binding
73 implicit none
74 type(c_ptr), value :: ptr_d
75 end function cudafree
76 end interface
77
78 interface
79 integer(c_int) function cudamemcpy(ptr_dst, ptr_src, s, dir) &
80 bind(c, name = 'cudaMemcpy')
81 use, intrinsic :: iso_c_binding
82 implicit none
83 type(c_ptr), value :: ptr_dst, ptr_src
84 integer(c_size_t), value :: s
85 integer(c_int), value :: dir
86 end function cudamemcpy
87 end interface
88
89 interface
90 integer(c_int) function cudamemcpyasync(ptr_dst, ptr_src, s, dir, stream) &
91 bind(c, name = 'cudaMemcpyAsync')
92 use, intrinsic :: iso_c_binding
93 implicit none
94 type(c_ptr), value :: ptr_dst, ptr_src, stream
95 integer(c_size_t), value :: s
96 integer(c_int), value :: dir
97 end function cudamemcpyasync
98 end interface
99
100 interface
101 integer(c_int) function cudamemsetasync(ptr, v, s, stream) &
102 bind(c, name = 'cudaMemsetAsync')
103 use, intrinsic :: iso_c_binding
104 implicit none
105 type(c_ptr), value :: ptr, stream
106 integer(c_int), value :: v
107 integer(c_size_t), value :: s
108 end function cudamemsetasync
109 end interface
110
111 interface
112 integer(c_int) function cudadevicesynchronize() &
113 bind(c, name = 'cudaDeviceSynchronize')
114 use, intrinsic :: iso_c_binding
115 implicit none
116 end function cudadevicesynchronize
117 end interface
118
119 interface
120 integer(c_int) function cudagetdeviceproperties(prop, device) &
121 bind(c, name = 'cudaGetDeviceProperties')
122 use, intrinsic :: iso_c_binding
123 implicit none
124 type(c_ptr), value :: prop
125 integer(c_int), value :: device
126 end function cudagetdeviceproperties
127 end interface
128
129 interface
130 integer(c_int) function cudastreamcreate(stream) &
131 bind(c, name = 'cudaStreamCreate')
132 use, intrinsic :: iso_c_binding
133 implicit none
134 type(c_ptr) :: stream
135 end function cudastreamcreate
136 end interface
137
138 interface
139 integer(c_int) function cudastreamcreatewithflags(stream, flags) &
140 bind(c, name = 'cudaStreamCreateWithFlags')
141 use, intrinsic :: iso_c_binding
142 implicit none
143 type(c_ptr) :: stream
144 integer(c_int), value :: flags
145 end function cudastreamcreatewithflags
146 end interface
147
148 interface
149 integer(c_int) function cudastreamcreatewithpriority(stream, flags, prio) &
150 bind(c, name = 'cudaStreamCreateWithPriority')
151 use, intrinsic :: iso_c_binding
152 implicit none
153 type(c_ptr) :: stream
154 integer(c_int), value :: flags, prio
156 end interface
157
158 interface
159 integer(c_int) function cudastreamdestroy(steam) &
160 bind(c, name = 'cudaStreamDestroy')
161 use, intrinsic :: iso_c_binding
162 implicit none
163 type(c_ptr), value :: steam
164 end function cudastreamdestroy
165 end interface
166
167 interface
168 integer(c_int) function cudastreamsynchronize(stream) &
169 bind(c, name = 'cudaStreamSynchronize')
170 use, intrinsic :: iso_c_binding
171 implicit none
172 type(c_ptr), value :: stream
173 end function cudastreamsynchronize
174 end interface
175
176 interface
177 integer(c_int) function cudastreamwaitevent(stream, event, flags) &
178 bind(c, name = 'cudaStreamWaitEvent')
179 use, intrinsic :: iso_c_binding
180 implicit none
181 type(c_ptr), value :: stream, event
182 integer(c_int), value :: flags
183 end function cudastreamwaitevent
184 end interface
185
186 interface
187 integer(c_int) function cudadevicegetstreampriorityrange &
188 (low_prio, high_prio) &
189 bind(c, name = 'cudaDeviceGetStreamPriorityRange')
190 use, intrinsic :: iso_c_binding
191 implicit none
192 integer(c_int) :: low_prio, high_prio
194 end interface
195
196 interface
197 integer(c_int) function cudaprofilerstart() &
198 bind(c, name = 'cudaProfilerStart')
199 use, intrinsic :: iso_c_binding
200 implicit none
201 end function cudaprofilerstart
202 end interface
203
204 interface
205 integer(c_int) function cudaprofilerstop() &
206 bind(c, name = 'cudaProfilerStop')
207 use, intrinsic :: iso_c_binding
208 implicit none
209 end function cudaprofilerstop
210 end interface
211
212 interface
213 integer(c_int) function cudaeventcreate(event) &
214 bind(c, name = 'cudaEventCreate')
215 use, intrinsic :: iso_c_binding
216 implicit none
217 type(c_ptr) :: event
218 end function cudaeventcreate
219 end interface
220
221 interface
222 integer(c_int) function cudaeventdestroy(event) &
223 bind(c, name = 'cudaEventDestroy')
224 use, intrinsic :: iso_c_binding
225 implicit none
226 type(c_ptr), value :: event
227 end function cudaeventdestroy
228 end interface
229
230 interface
231 integer(c_int) function cudaeventcreatewithflags(event, flags) &
232 bind(c, name = 'cudaEventCreateWithFlags')
233 use, intrinsic :: iso_c_binding
234 implicit none
235 type(c_ptr) :: event
236 integer(c_int), value :: flags
237 end function cudaeventcreatewithflags
238 end interface
239
240 interface
241 integer(c_int) function cudaeventrecord(event, stream) &
242 bind(c, name = 'cudaEventRecord')
243 use, intrinsic :: iso_c_binding
244 implicit none
245 type(c_ptr), value :: event, stream
246 end function cudaeventrecord
247 end interface
248
249 interface
250 integer(c_int) function cudaeventsynchronize(event) &
251 bind(c, name = 'cudaEventSynchronize')
252 use, intrinsic :: iso_c_binding
253 implicit none
254 type(c_ptr), value :: event
255 end function cudaeventsynchronize
256 end interface
257
258 interface
259 integer(c_int) function cudagetdevicecount(device_count) &
260 bind(c, name = 'cudaGetDeviceCount')
261 use, intrinsic :: iso_c_binding
262 implicit none
263 integer(c_int) :: device_count
264 end function cudagetdevicecount
265 end interface
266
267 interface
268 integer(c_int) function cudagetdevice(device) &
269 bind(c, name = 'cudaGetDevice')
270 use, intrinsic :: iso_c_binding
271 implicit none
272 integer(c_int) :: device
273 end function cudagetdevice
274 end interface
275
276 interface
277 integer(c_int) function cudasetdevice(device) &
278 bind(c, name = 'cudaSetDevice')
279 use, intrinsic :: iso_c_binding
280 implicit none
281 integer(c_int), value :: device
282 end function cudasetdevice
283 end interface
284
285contains
286
287 subroutine cuda_init(glb_cmd_queue, aux_cmd_queue, &
288 STRM_HIGH_PRIO, STRM_LOW_PRIO)
289 type(c_ptr), intent(inout) :: glb_cmd_queue
290 type(c_ptr), intent(inout) :: aux_cmd_queue
291 integer, intent(inout) :: STRM_HIGH_PRIO
292 integer, intent(inout) :: STRM_LOW_PRIO
293 integer(c_int) :: device_id
294 integer :: nthrds = 1
295
296 !$omp parallel
297 !$omp master
298 !$ nthrds = omp_get_num_threads()
299 !$omp end master
300 !$omp end parallel
301
302 ! Ensure that all threads are assigned to the same device
303 if (nthrds .gt. 1) then
304 if (cudagetdevice(device_id) .ne. cudasuccess) then
305 call neko_error('Error retrieving device id')
306 end if
307
308 !$omp parallel
309 if (cudasetdevice(device_id) .ne. cudasuccess) then
310 call neko_error('Error setting device id')
311 end if
312 !$omp end parallel
313 end if
314
315 if (cudadevicegetstreampriorityrange(strm_low_prio, strm_high_prio) &
316 .ne. cudasuccess) then
317 call neko_error('Error retrieving stream priority range')
318 end if
319
320 if (cudastreamcreatewithpriority(glb_cmd_queue, 1, strm_high_prio) &
321 .ne. cudasuccess) then
322 call neko_error('Error creating main stream')
323 end if
324
325 if (cudastreamcreatewithpriority(aux_cmd_queue, 1, strm_low_prio) &
326 .ne. cudasuccess) then
327 call neko_error('Error creating aux stream')
328 end if
329 end subroutine cuda_init
330
331 subroutine cuda_finalize(glb_cmd_queue, aux_cmd_queue)
332 type(c_ptr), intent(inout) :: glb_cmd_queue
333 type(c_ptr), intent(inout) :: aux_cmd_queue
334
335 if (cudastreamdestroy(glb_cmd_queue) .ne. cudasuccess) then
336 call neko_error('Error destroying main stream')
337 end if
338
339 if (cudastreamdestroy(aux_cmd_queue) .ne. cudasuccess) then
340 call neko_error('Error destroying aux stream')
341 end if
342 end subroutine cuda_finalize
343
344 subroutine cuda_device_name(name)
345 character(len=*), intent(inout) :: name
346 character(kind=c_char, len=8192), target :: prop
347 integer :: end_pos
348
349 !
350 ! Yes this is an ugly hack!
351 ! Since we're only interested in the device name (first 256 bytes)
352 ! we pass down a large enough chunk of memory to the cuda runtime
353 ! and extract what we need later on
354 !
355 ! This will of course break if sizeof(cudaDeviceProp) > 8192
356 !
357
358 if (cudagetdeviceproperties(c_loc(prop), 0) .ne. cudasuccess) then
359 call neko_error('Failed to query device')
360 end if
361
362 end_pos = scan(prop(1:256), c_null_char)
363 if (end_pos .ge. 2) then
364 name(1:end_pos-1) = prop(1:end_pos-1)
365 end if
366 end subroutine cuda_device_name
367
369 integer function cuda_device_count()
370 integer(c_int) :: num_devices
371
372 if (cudagetdevicecount(num_devices) .ne. cudasuccess) then
373 call neko_error('Error retrieving device count')
374 end if
375
376 cuda_device_count = num_devices
377 end function cuda_device_count
378
379#endif
380
381end module cuda_intf
Fortran CUDA interface.
Definition cuda_intf.F90:34
subroutine cuda_device_name(name)
subroutine cuda_finalize(glb_cmd_queue, aux_cmd_queue)
@ cudamemcpydevicetohost
Definition cuda_intf.F90:54
@ cudamemcpydefault
Definition cuda_intf.F90:56
@ cudamemcpyhosttohost
Definition cuda_intf.F90:52
@ cudamemcpydevicetodevice
Definition cuda_intf.F90:55
@ cudamemcpyhosttodevice
Definition cuda_intf.F90:53
integer function cuda_device_count()
Return the number of avaialble CUDA devices.
@ cudaerrorinitializationerror
Definition cuda_intf.F90:47
@ cudaerrormemoryallocation
Definition cuda_intf.F90:46
@ cudaerrorinvalidvalue
Definition cuda_intf.F90:45
subroutine cuda_init(glb_cmd_queue, aux_cmd_queue, strm_high_prio, strm_low_prio)
Device abstraction, common interface for various accelerators.
Definition device.F90:34
Utilities.
Definition utils.f90:35