Neko  0.9.99
A portable framework for high-order spectral element flow simulations
cuda_intf.F90
Go to the documentation of this file.
1 ! Copyright (c) 2021-2022, The Neko Authors
2 ! All rights reserved.
3 !
4 ! Redistribution and use in source and binary forms, with or without
5 ! modification, are permitted provided that the following conditions
6 ! are met:
7 !
8 ! * Redistributions of source code must retain the above copyright
9 ! notice, this list of conditions and the following disclaimer.
10 !
11 ! * Redistributions in binary form must reproduce the above
12 ! copyright notice, this list of conditions and the following
13 ! disclaimer in the documentation and/or other materials provided
14 ! with the distribution.
15 !
16 ! * Neither the name of the authors nor the names of its
17 ! contributors may be used to endorse or promote products derived
18 ! from this software without specific prior written permission.
19 !
20 ! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 ! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 ! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23 ! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24 ! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25 ! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26 ! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 ! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 ! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 ! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30 ! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 ! POSSIBILITY OF SUCH DAMAGE.
32 !
34 module cuda_intf
35  use utils, only : neko_error
36  use, intrinsic :: iso_c_binding
37  !$ use omp_lib
38  implicit none
39 
40 #ifdef HAVE_CUDA
41 
43  type(c_ptr), bind(c) :: glb_cmd_queue = c_null_ptr
44 
46  type(c_ptr), bind(c) :: aux_cmd_queue = c_null_ptr
47 
49  integer :: strm_high_prio
50 
52  integer :: strm_low_prio
53 
55  enum, bind(c)
56  enumerator :: cudasuccess = 0
57  enumerator :: cudaerrorinvalidvalue = 1
58  enumerator :: cudaerrormemoryallocation = 2
59  enumerator :: cudaerrorinitializationerror = 3
60  end enum
61 
63  enum, bind(c)
64  enumerator :: cudamemcpyhosttohost = 0
65  enumerator :: cudamemcpyhosttodevice = 1
66  enumerator :: cudamemcpydevicetohost = 2
67  enumerator :: cudamemcpydevicetodevice = 3
68  enumerator :: cudamemcpydefault = 4
69  end enum
70 
71  interface
72  integer(c_int) function cudamalloc(ptr_d, s) &
73  bind(c, name = 'cudaMalloc')
74  use, intrinsic :: iso_c_binding
75  implicit none
76  type(c_ptr) :: ptr_d
77  integer(c_size_t), value :: s
78  end function cudamalloc
79  end interface
80 
81  interface
82  integer(c_int) function cudafree(ptr_d) &
83  bind(c, name = 'cudaFree')
84  use, intrinsic :: iso_c_binding
85  implicit none
86  type(c_ptr), value :: ptr_d
87  end function cudafree
88  end interface
89 
90  interface
91  integer(c_int) function cudamemcpy(ptr_dst, ptr_src, s, dir) &
92  bind(c, name = 'cudaMemcpy')
93  use, intrinsic :: iso_c_binding
94  implicit none
95  type(c_ptr), value :: ptr_dst, ptr_src
96  integer(c_size_t), value :: s
97  integer(c_int), value :: dir
98  end function cudamemcpy
99  end interface
100 
101  interface
102  integer(c_int) function cudamemcpyasync(ptr_dst, ptr_src, s, dir, stream) &
103  bind(c, name = 'cudaMemcpyAsync')
104  use, intrinsic :: iso_c_binding
105  implicit none
106  type(c_ptr), value :: ptr_dst, ptr_src, stream
107  integer(c_size_t), value :: s
108  integer(c_int), value :: dir
109  end function cudamemcpyasync
110  end interface
111 
112  interface
113  integer(c_int) function cudadevicesynchronize() &
114  bind(c, name = 'cudaDeviceSynchronize')
115  use, intrinsic :: iso_c_binding
116  implicit none
117  end function cudadevicesynchronize
118  end interface
119 
120  interface
121  integer(c_int) function cudagetdeviceproperties(prop, device) &
122  bind(c, name = 'cudaGetDeviceProperties')
123  use, intrinsic :: iso_c_binding
124  implicit none
125  type(c_ptr), value :: prop
126  integer(c_int), value :: device
127  end function cudagetdeviceproperties
128  end interface
129 
130  interface
131  integer(c_int) function cudastreamcreate(stream) &
132  bind(c, name = 'cudaStreamCreate')
133  use, intrinsic :: iso_c_binding
134  implicit none
135  type(c_ptr) :: stream
136  end function cudastreamcreate
137  end interface
138 
139  interface
140  integer(c_int) function cudastreamcreatewithflags(stream, flags) &
141  bind(c, name = 'cudaStreamCreateWithFlags')
142  use, intrinsic :: iso_c_binding
143  implicit none
144  type(c_ptr) :: stream
145  integer(c_int), value :: flags
146  end function cudastreamcreatewithflags
147  end interface
148 
149  interface
150  integer(c_int) function cudastreamcreatewithpriority(stream, flags, prio) &
151  bind(c, name = 'cudaStreamCreateWithPriority')
152  use, intrinsic :: iso_c_binding
153  implicit none
154  type(c_ptr) :: stream
155  integer(c_int), value :: flags, prio
156  end function cudastreamcreatewithpriority
157  end interface
158 
159  interface
160  integer(c_int) function cudastreamdestroy(steam) &
161  bind(c, name = 'cudaStreamDestroy')
162  use, intrinsic :: iso_c_binding
163  implicit none
164  type(c_ptr), value :: steam
165  end function cudastreamdestroy
166  end interface
167 
168  interface
169  integer(c_int) function cudastreamsynchronize(stream) &
170  bind(c, name = 'cudaStreamSynchronize')
171  use, intrinsic :: iso_c_binding
172  implicit none
173  type(c_ptr), value :: stream
174  end function cudastreamsynchronize
175  end interface
176 
177  interface
178  integer(c_int) function cudastreamwaitevent(stream, event, flags) &
179  bind(c, name = 'cudaStreamWaitEvent')
180  use, intrinsic :: iso_c_binding
181  implicit none
182  type(c_ptr), value :: stream, event
183  integer(c_int), value :: flags
184  end function cudastreamwaitevent
185  end interface
186 
187  interface
188  integer(c_int) function cudadevicegetstreampriorityrange &
189  (low_prio, high_prio) &
190  bind(c, name = 'cudaDeviceGetStreamPriorityRange')
191  use, intrinsic :: iso_c_binding
192  implicit none
193  integer(c_int) :: low_prio, high_prio
194  end function cudadevicegetstreampriorityrange
195  end interface
196 
197  interface
198  integer(c_int) function cudaprofilerstart() &
199  bind(c, name = 'cudaProfilerStart')
200  use, intrinsic :: iso_c_binding
201  implicit none
202  end function cudaprofilerstart
203  end interface
204 
205  interface
206  integer(c_int) function cudaprofilerstop() &
207  bind(c, name = 'cudaProfilerStop')
208  use, intrinsic :: iso_c_binding
209  implicit none
210  end function cudaprofilerstop
211  end interface
212 
213  interface
214  integer(c_int) function cudaeventcreate(event) &
215  bind(c, name = 'cudaEventCreate')
216  use, intrinsic :: iso_c_binding
217  implicit none
218  type(c_ptr) :: event
219  end function cudaeventcreate
220  end interface
221 
222  interface
223  integer(c_int) function cudaeventdestroy(event) &
224  bind(c, name = 'cudaEventDestroy')
225  use, intrinsic :: iso_c_binding
226  implicit none
227  type(c_ptr), value :: event
228  end function cudaeventdestroy
229  end interface
230 
231  interface
232  integer(c_int) function cudaeventcreatewithflags(event, flags) &
233  bind(c, name = 'cudaEventCreateWithFlags')
234  use, intrinsic :: iso_c_binding
235  implicit none
236  type(c_ptr) :: event
237  integer(c_int), value :: flags
238  end function cudaeventcreatewithflags
239  end interface
240 
241  interface
242  integer(c_int) function cudaeventrecord(event, stream) &
243  bind(c, name = 'cudaEventRecord')
244  use, intrinsic :: iso_c_binding
245  implicit none
246  type(c_ptr), value :: event, stream
247  end function cudaeventrecord
248  end interface
249 
250  interface
251  integer(c_int) function cudaeventsynchronize(event) &
252  bind(c, name = 'cudaEventSynchronize')
253  use, intrinsic :: iso_c_binding
254  implicit none
255  type(c_ptr), value :: event
256  end function cudaeventsynchronize
257  end interface
258 
259  interface
260  integer(c_int) function cudagetdevicecount(device_count) &
261  bind(c, name = 'cudaGetDeviceCount')
262  use, intrinsic :: iso_c_binding
263  implicit none
264  integer(c_int) :: device_count
265  end function cudagetdevicecount
266  end interface
267 
268  interface
269  integer(c_int) function cudagetdevice(device) &
270  bind(c, name = 'cudaGetDevice')
271  use, intrinsic :: iso_c_binding
272  implicit none
273  integer(c_int) :: device
274  end function cudagetdevice
275  end interface
276 
277  interface
278  integer(c_int) function cudasetdevice(device) &
279  bind(c, name = 'cudaSetDevice')
280  use, intrinsic :: iso_c_binding
281  implicit none
282  integer(c_int), value :: device
283  end function cudasetdevice
284  end interface
285 
286 contains
287 
288  subroutine cuda_init
289  integer(c_int) :: device_id
290  integer :: nthrds = 1
291 
292  !$omp parallel
293  !$omp master
294  !$ nthrds = omp_get_num_threads()
295  !$omp end master
296  !$omp end parallel
297 
298  ! Ensure that all threads are assigned to the same device
299  if (nthrds .gt. 1) then
300  if (cudagetdevice(device_id) .ne. cudasuccess) then
301  call neko_error('Error retrieving device id')
302  end if
303 
304  !$omp parallel
305  if (cudasetdevice(device_id) .ne. cudasuccess) then
306  call neko_error('Error setting device id')
307  end if
308  !$omp end parallel
309  end if
310 
311  if (cudadevicegetstreampriorityrange(strm_low_prio, strm_high_prio) &
312  .ne. cudasuccess) then
313  call neko_error('Error retrieving stream priority range')
314  end if
315 
316  if (cudastreamcreatewithpriority(glb_cmd_queue, 1, strm_high_prio) &
317  .ne. cudasuccess) then
318  call neko_error('Error creating main stream')
319  end if
320 
321  if (cudastreamcreatewithpriority(aux_cmd_queue, 1, strm_low_prio) &
322  .ne. cudasuccess) then
323  call neko_error('Error creating aux stream')
324  end if
325  end subroutine cuda_init
326 
327  subroutine cuda_finalize
328  if (cudastreamdestroy(glb_cmd_queue) .ne. cudasuccess) then
329  call neko_error('Error destroying main stream')
330  end if
331 
332  if (cudastreamdestroy(aux_cmd_queue) .ne. cudasuccess) then
333  call neko_error('Error destroying aux stream')
334  end if
335  end subroutine cuda_finalize
336 
337  subroutine cuda_device_name(name)
338  character(len=*), intent(inout) :: name
339  character(kind=c_char, len=8192), target :: prop
340  integer :: end_pos
341 
342  !
343  ! Yes this is an ugly hack!
344  ! Since we're only interested in the device name (first 256 bytes)
345  ! we pass down a large enough chunk of memory to the cuda runtime
346  ! and extract what we need later on
347  !
348  ! This will of course break if sizeof(cudaDeviceProp) > 8192
349  !
350 
351  if (cudagetdeviceproperties(c_loc(prop), 0) .ne. cudasuccess) then
352  call neko_error('Failed to query device')
353  end if
354 
355  end_pos = scan(prop(1:256), c_null_char)
356  if (end_pos .ge. 2) then
357  name(1:end_pos-1) = prop(1:end_pos-1)
358  end if
359  end subroutine cuda_device_name
360 
362  integer function cuda_device_count()
363  integer(c_int) :: num_devices
364 
365  if (cudagetdevicecount(num_devices) .ne. cudasuccess) then
366  call neko_error('Error retrieving device count')
367  end if
368 
369  cuda_device_count = num_devices
370  end function cuda_device_count
371 
372 #endif
373 
374 end module cuda_intf
Fortran CUDA interface.
Definition: cuda_intf.F90:34
subroutine cuda_device_name(name)
Definition: cuda_intf.F90:338
type(c_ptr), bind(C) aux_cmd_queue
Aux HIP command queue.
Definition: cuda_intf.F90:46
subroutine cuda_finalize
Definition: cuda_intf.F90:328
@ cudamemcpydevicetohost
Definition: cuda_intf.F90:66
@ cudamemcpydefault
Definition: cuda_intf.F90:68
@ cudamemcpyhosttohost
Definition: cuda_intf.F90:64
@ cudamemcpydevicetodevice
Definition: cuda_intf.F90:67
@ cudamemcpyhosttodevice
Definition: cuda_intf.F90:65
subroutine cuda_init
Definition: cuda_intf.F90:289
integer function cuda_device_count()
Return the number of avaialble CUDA devices.
Definition: cuda_intf.F90:363
@ cudaerrorinitializationerror
Definition: cuda_intf.F90:59
@ cudaerrormemoryallocation
Definition: cuda_intf.F90:58
@ cudaerrorinvalidvalue
Definition: cuda_intf.F90:57
type(c_ptr), bind(C) glb_cmd_queue
Global HIP command queue.
Definition: cuda_intf.F90:43
integer strm_low_prio
Low priority stream setting.
Definition: cuda_intf.F90:52
integer strm_high_prio
High priority stream setting.
Definition: cuda_intf.F90:49
Device abstraction, common interface for various accelerators.
Definition: device.F90:34
Utilities.
Definition: utils.f90:35