Neko  0.9.0
A portable framework for high-order spectral element flow simulations
math.hip
Go to the documentation of this file.
1 /*
2  Copyright (c) 2021-2023, The Neko Authors
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions
7  are met:
8 
9  * Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11 
12  * Redistributions in binary form must reproduce the above
13  copyright notice, this list of conditions and the following
14  disclaimer in the documentation and/or other materials provided
15  with the distribution.
16 
17  * Neither the name of the authors nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  POSSIBILITY OF SUCH DAMAGE.
33 */
34 
35 #include <hip/hip_runtime.h>
36 #include <device/device_config.h>
37 #include <device/hip/check.h>
38 #include "math_kernel.h"
39 
40 extern "C" {
41 
44 
48  void hip_copy(void *a, void *b, int *n) {
49  HIP_CHECK(hipMemcpyAsync(a, b, (*n) * sizeof(real),
50  hipMemcpyDeviceToDevice,
51  (hipStream_t) glb_cmd_queue));
52  }
53 
57  void hip_masked_copy(void *a, void *b, void *mask, int *n, int *m) {
58 
59  const dim3 nthrds(1024, 1, 1);
60  const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
61 
62  hipLaunchKernelGGL(HIP_KERNEL_NAME(masked_copy_kernel<real>),
63  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
64  (real *) a, (real *) b, (int *) mask, *n, *m);
65 
66  HIP_CHECK(hipGetLastError());
67 
68  }
69 
70 
74  void hip_masked_red_copy(void *a, void *b, void *mask, int *n, int *m) {
75 
76  const dim3 nthrds(1024, 1, 1);
77  const dim3 nblcks(((*m)+1024 - 1)/ 1024, 1, 1);
78 
79  hipLaunchKernelGGL(HIP_KERNEL_NAME(masked_red_copy_kernel<real>),
80  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
81  (real *) a, (real *) b, (int *) mask, *n, *m);
82 
83  HIP_CHECK(hipGetLastError());
84 
85  }
86 
90  void hip_cfill_mask(void* a, real* c, int* size, void* mask, int* mask_size) {
91 
92  const dim3 nthrds(1024, 1, 1);
93  const dim3 nblcks(((*mask_size) + 1024 - 1) / 1024, 1, 1);
94 
95  hipLaunchKernelGGL(HIP_KERNEL_NAME(cfill_mask_kernel<real>),
96  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
97  (real*)a, *c, *size, (int*)mask, *mask_size);
98 
99  HIP_CHECK(hipGetLastError());
100  }
101 
105  void hip_rzero(void *a, int *n) {
106  HIP_CHECK(hipMemsetAsync(a, 0, (*n) * sizeof(real),
107  (hipStream_t) glb_cmd_queue));
108  }
109 
113  void hip_cmult(void *a, real *c, int *n) {
114 
115  const dim3 nthrds(1024, 1, 1);
116  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
117 
118  hipLaunchKernelGGL(HIP_KERNEL_NAME(cmult_kernel<real>),
119  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
120  (real *) a, *c, *n);
121  HIP_CHECK(hipGetLastError());
122 
123  }
124 
128  void hip_cmult2(void *a, void *b, real *c, int *n) {
129 
130  const dim3 nthrds(1024, 1, 1);
131  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
132 
133  hipLaunchKernelGGL(HIP_KERNEL_NAME(cmult2_kernel<real>),
134  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
135  (real *) a,(real *) b, *c, *n);
136  HIP_CHECK(hipGetLastError());
137 
138  }
142  void hip_cadd(void *a, real *c, int *n) {
143 
144  const dim3 nthrds(1024, 1, 1);
145  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
146 
147  hipLaunchKernelGGL(HIP_KERNEL_NAME(cadd_kernel<real>),
148  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
149  (real *) a, *c, *n);
150  HIP_CHECK(hipGetLastError());
151  }
152 
157  void hip_cadd2(void *a, void *b, real *c, int *n) {
158 
159  const dim3 nthrds(1024, 1, 1);
160  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
161 
162  hipLaunchKernelGGL(HIP_KERNEL_NAME(cadd2_kernel<real>),
163  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
164  (real *) a, (real *) b, *c, *n);
165  HIP_CHECK(hipGetLastError());
166  }
167 
171  void hip_cfill(void *a, real *c, int *n) {
172 
173  const dim3 nthrds(1024, 1, 1);
174  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
175  if (*n > 0) {
176  hipLaunchKernelGGL(HIP_KERNEL_NAME(cfill_kernel<real>),
177  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
178  (real *) a, *c, *n);
179  HIP_CHECK(hipGetLastError());
180  }
181  }
182 
187  void hip_add2(void *a, void *b, int *n) {
188 
189  const dim3 nthrds(1024, 1, 1);
190  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
191 
192  hipLaunchKernelGGL(HIP_KERNEL_NAME(add2_kernel<real>),
193  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
194  (real *) a, (real *) b, *n);
195  HIP_CHECK(hipGetLastError());
196  }
197 
202  void hip_add4(void *a, void *b, void *c, void *d, int *n) {
203 
204  const dim3 nthrds(1024, 1, 1);
205  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
206 
207  hipLaunchKernelGGL(HIP_KERNEL_NAME(add4_kernel<real>),
208  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
209  (real *) a, (real *) b, (real *) c, (real *) d, *n);
210  HIP_CHECK(hipGetLastError());
211  }
212 
218  void hip_add2s1(void *a, void *b, real *c1, int *n) {
219 
220  const dim3 nthrds(1024, 1, 1);
221  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
222 
223  hipLaunchKernelGGL(HIP_KERNEL_NAME(add2s1_kernel<real>),
224  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
225  (real *) a, (real *) b,
226  *c1, *n);
227  HIP_CHECK(hipGetLastError());
228  }
229 
235  void hip_add2s2(void *a, void *b, real *c1, int *n) {
236 
237  const dim3 nthrds(1024, 1, 1);
238  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
239 
240  hipLaunchKernelGGL(HIP_KERNEL_NAME(add2s2_kernel<real>),
241  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
242  (real *) a, (real *) b,
243  *c1, *n);
244  HIP_CHECK(hipGetLastError());
245  }
246 
252  void hip_addsqr2s2(void *a, void *b, real *c1, int *n) {
253 
254  const dim3 nthrds(1024, 1, 1);
255  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
256 
257  hipLaunchKernelGGL(HIP_KERNEL_NAME(addsqr2s2_kernel<real>),
258  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
259  (real *) a, (real *) b,
260  *c1, *n);
261  HIP_CHECK(hipGetLastError());
262  }
263 
269  void hip_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n) {
270 
271  const dim3 nthrds(1024, 1, 1);
272  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
273 
274  hipLaunchKernelGGL(HIP_KERNEL_NAME(add3s2_kernel<real>),
275  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
276  (real *) a, (real *) b, (real *) c,
277  *c1, *c2, *n);
278  HIP_CHECK(hipGetLastError());
279  }
280 
285  void hip_invcol1(void *a, int *n) {
286 
287  const dim3 nthrds(1024, 1, 1);
288  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
289 
290  hipLaunchKernelGGL(HIP_KERNEL_NAME(invcol1_kernel<real>),
291  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
292  (real *) a, *n);
293  HIP_CHECK(hipGetLastError());
294  }
295 
300  void hip_invcol2(void *a, void *b, int *n) {
301 
302  const dim3 nthrds(1024, 1, 1);
303  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
304 
305  hipLaunchKernelGGL(HIP_KERNEL_NAME(invcol2_kernel<real>),
306  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
307  (real *) a, (real *) b, *n);
308  HIP_CHECK(hipGetLastError());
309  }
310 
315  void hip_col2(void *a, void *b, int *n) {
316 
317  const dim3 nthrds(1024, 1, 1);
318  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
319 
320  hipLaunchKernelGGL(HIP_KERNEL_NAME(col2_kernel<real>),
321  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
322  (real *) a, (real *) b, *n);
323  HIP_CHECK(hipGetLastError());
324  }
325 
330  void hip_col3(void *a, void *b, void *c, int *n) {
331 
332  const dim3 nthrds(1024, 1, 1);
333  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
334 
335  hipLaunchKernelGGL(HIP_KERNEL_NAME(col3_kernel<real>),
336  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
337  (real *) a, (real *) b, (real *) c, *n);
338  HIP_CHECK(hipGetLastError());
339  }
340 
345  void hip_subcol3(void *a, void *b, void *c, int *n) {
346 
347  const dim3 nthrds(1024, 1, 1);
348  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
349 
350  hipLaunchKernelGGL(HIP_KERNEL_NAME(subcol3_kernel<real>),
351  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
352  (real *) a, (real *) b, (real *) c, *n);
353  HIP_CHECK(hipGetLastError());
354  }
355 
360  void hip_sub2(void *a, void *b, int *n) {
361 
362  const dim3 nthrds(1024, 1, 1);
363  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
364 
365  hipLaunchKernelGGL(HIP_KERNEL_NAME(sub2_kernel<real>),
366  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
367  (real *) a, (real *) b, *n);
368  HIP_CHECK(hipGetLastError());
369  }
370 
375  void hip_sub3(void *a, void *b, void *c, int *n) {
376 
377  const dim3 nthrds(1024, 1, 1);
378  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
379 
380  hipLaunchKernelGGL(HIP_KERNEL_NAME(sub3_kernel<real>),
381  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
382  (real *) a, (real *) b, (real *) c, *n);
383  HIP_CHECK(hipGetLastError());
384  }
385 
390  void hip_addcol3(void *a, void *b, void *c, int *n) {
391 
392  const dim3 nthrds(1024, 1, 1);
393  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
394 
395  hipLaunchKernelGGL(HIP_KERNEL_NAME(addcol3_kernel<real>),
396  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
397  (real *) a, (real *) b, (real *) c, *n);
398  HIP_CHECK(hipGetLastError());
399  }
400 
405  void hip_addcol4(void *a, void *b, void *c, void *d, int *n) {
406 
407  const dim3 nthrds(1024, 1, 1);
408  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
409 
410  hipLaunchKernelGGL(HIP_KERNEL_NAME(addcol4_kernel<real>),
411  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
412  (real *) a, (real *) b, (real *) c, (real *) d, *n);
413  HIP_CHECK(hipGetLastError());
414  }
415 
420  void hip_vdot3(void *dot, void *u1, void *u2, void *u3,
421  void *v1, void *v2, void *v3, int *n) {
422 
423  const dim3 nthrds(1024, 1, 1);
424  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
425 
426  hipLaunchKernelGGL(HIP_KERNEL_NAME(vdot3_kernel<real>),
427  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
428  (real *) dot, (real *) u1, (real *) u2, (real *) u3,
429  (real *) v1, (real *) v2, (real *) v3, *n);
430  HIP_CHECK(hipGetLastError());
431  }
432 
437  void hip_vcross(void *u1, void *u2, void *u3,
438  void *v1, void *v2, void *v3,
439  void *w1, void *w2, void *w3, int *n) {
440 
441  const dim3 nthrds(1024, 1, 1);
442  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
443 
444  hipLaunchKernelGGL(HIP_KERNEL_NAME(vcross_kernel<real>),
445  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
446  (real *) u1, (real *) u2, (real *) u3,
447  (real *) v1, (real *) v2, (real *) v3,
448  (real *) w1, (real *) w2, (real *) w3, *n);
449  HIP_CHECK(hipGetLastError());
450  }
451 
452 
453  /*
454  * Reduction buffer
455  */
456  int red_s = 0;
457  real * bufred = NULL;
458  real * bufred_d = NULL;
459 
464  real hip_vlsc3(void *u, void *v, void *w, int *n) {
465 
466  const dim3 nthrds(1024, 1, 1);
467  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
468  const int nb = ((*n) + 1024 - 1)/ 1024;
469  const hipStream_t stream = (hipStream_t) glb_cmd_queue;
470 
471  if ( nb > red_s){
472  red_s = nb;
473  if (bufred != NULL) {
474  HIP_CHECK(hipHostFree(bufred));
475  HIP_CHECK(hipFree(bufred_d));
476  }
477  HIP_CHECK(hipHostMalloc(&bufred,nb*sizeof(real),hipHostMallocDefault));
478  HIP_CHECK(hipMalloc(&bufred_d, nb*sizeof(real)));
479  }
480 
481  hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc3_kernel<real>),
482  nblcks, nthrds, 0, stream,
483  (real *) u, (real *) v,
484  (real *) w, bufred_d, *n);
485  HIP_CHECK(hipGetLastError());
486  hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
487  1, 1024, 0, stream, bufred_d, nb);
488  HIP_CHECK(hipGetLastError());
489 
490  HIP_CHECK(hipMemcpyAsync(bufred, bufred_d, sizeof(real),
491  hipMemcpyDeviceToHost, stream));
492  HIP_CHECK(hipStreamSynchronize(stream));
493 
494  return bufred[0];
495  }
496 
497 
502  real hip_glsc3(void *a, void *b, void *c, int *n) {
503 
504  const dim3 nthrds(1024, 1, 1);
505  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
506  const int nb = ((*n) + 1024 - 1)/ 1024;
507  const hipStream_t stream = (hipStream_t) glb_cmd_queue;
508 
509  if ( nb > red_s){
510  red_s = nb;
511  if (bufred != NULL) {
512  HIP_CHECK(hipHostFree(bufred));
513  HIP_CHECK(hipFree(bufred_d));
514  }
515  HIP_CHECK(hipHostMalloc(&bufred,nb*sizeof(real),hipHostMallocDefault));
516  HIP_CHECK(hipMalloc(&bufred_d, nb*sizeof(real)));
517  }
518 
519  hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc3_kernel<real>),
520  nblcks, nthrds, 0, stream,
521  (real *) a, (real *) b,
522  (real *) c, bufred_d, *n);
523  HIP_CHECK(hipGetLastError());
524  hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
525  1, 1024, 0, stream, bufred_d, nb);
526  HIP_CHECK(hipGetLastError());
527 
528 #ifdef HAVE_DEVICE_MPI
529  HIP_CHECK(hipStreamSynchronize(stream));
531 #else
532  HIP_CHECK(hipMemcpyAsync(bufred, bufred_d, sizeof(real),
533  hipMemcpyDeviceToHost, stream));
534  HIP_CHECK(hipStreamSynchronize(stream));
535 #endif
536  return bufred[0];
537  }
538 
543  void hip_glsc3_many(real *h, void * w, void *v,void *mult, int *j, int *n){
544  int pow2 = 1;
545  while(pow2 < (*j)){
546  pow2 = 2*pow2;
547  }
548  const int nt = 1024/pow2;
549  const dim3 nthrds(pow2, nt, 1);
550  const dim3 nblcks(((*n)+nt - 1)/nt, 1, 1);
551  const dim3 nthrds_red(1024,1,1);
552  const dim3 nblcks_red( (*j),1,1);
553  const int nb = ((*n) + nt - 1)/nt;
554  const hipStream_t stream = (hipStream_t) glb_cmd_queue;
555 
556  if((*j)*nb>red_s){
557  red_s = (*j)*nb;
558  if (bufred != NULL) {
559  HIP_CHECK(hipHostFree(bufred));
560  HIP_CHECK(hipFree(bufred_d));
561  }
562  HIP_CHECK(hipHostMalloc(&bufred,(*j)*nb*sizeof(real),hipHostMallocDefault));
563  HIP_CHECK(hipMalloc(&bufred_d, (*j)*nb*sizeof(real)));
564  }
565  hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc3_many_kernel<real>),
566  nblcks, nthrds, 0, stream,
567  (const real *) w, (const real **) v,
568  (const real *)mult, bufred_d, *j, *n);
569  HIP_CHECK(hipGetLastError());
570 
571  hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc3_reduce_kernel<real>),
572  nblcks_red, nthrds_red, 0, stream,
573  bufred_d, nb, *j);
574  HIP_CHECK(hipGetLastError());
575 
576 #ifdef HAVE_DEVICE_MPI
577  HIP_CHECK(hipStreamSynchronize(stream));
579 #else
580  HIP_CHECK(hipMemcpyAsync(h, bufred_d, (*j)* sizeof(real),
581  hipMemcpyDeviceToHost, stream));
582  HIP_CHECK(hipStreamSynchronize(stream));
583 #endif
584  }
585 
592  void hip_add2s2_many(void *x, void **p, void *alpha, int *j, int *n) {
593 
594  const dim3 nthrds(1024, 1, 1);
595  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
596 
597  hipLaunchKernelGGL(HIP_KERNEL_NAME(add2s2_many_kernel<real>),
598  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
599  (real *) x, (const real **) p, (real *) alpha, *j, *n);
600  HIP_CHECK(hipGetLastError());
601 
602  }
603 
608  void hip_add3(void *a, void *b, void *c, int *n) {
609 
610  const dim3 nthrds(1024, 1, 1);
611  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
612 
613  hipLaunchKernelGGL(HIP_KERNEL_NAME(add3_kernel<real>),
614  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
615  (real *) a, (real *) b, (real *) c, *n);
616  HIP_CHECK(hipGetLastError());
617  }
618 
623  real hip_glsc2(void *a, void *b, int *n) {
624 
625  const dim3 nthrds(1024, 1, 1);
626  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
627  const int nb = ((*n) + 1024 - 1)/ 1024;
628  const hipStream_t stream = (hipStream_t) glb_cmd_queue;
629 
630  if ( nb > red_s){
631  red_s = nb;
632  if (bufred != NULL) {
633  HIP_CHECK(hipHostFree(bufred));
634  HIP_CHECK(hipFree(bufred_d));
635  }
636  HIP_CHECK(hipHostMalloc(&bufred,nb*sizeof(real),hipHostMallocDefault));
637  HIP_CHECK(hipMalloc(&bufred_d, nb*sizeof(real)));
638  }
639 
640  hipLaunchKernelGGL(HIP_KERNEL_NAME(glsc2_kernel<real>),
641  nblcks, nthrds, 0, stream,
642  (real *) a, (real *) b, bufred_d, *n);
643  HIP_CHECK(hipGetLastError());
644  hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
645  1, 1024, 0, stream, bufred_d, nb);
646  HIP_CHECK(hipGetLastError());
647 
648 #ifdef HAVE_DEVICE_MPI
649  HIP_CHECK(hipStreamSynchronize(stream));
651 #else
652  HIP_CHECK(hipMemcpyAsync(bufred, bufred_d, sizeof(real),
653  hipMemcpyDeviceToHost, stream));
654  HIP_CHECK(hipStreamSynchronize(stream));
655 #endif
656  return bufred[0];
657  }
658 
663  real hip_glsum(void *a, int *n) {
664  const dim3 nthrds(1024, 1, 1);
665  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
666  const int nb = ((*n) + 1024 - 1)/ 1024;
667  const hipStream_t stream = (hipStream_t) glb_cmd_queue;
668 
669  if ( nb > red_s){
670  red_s = nb;
671  if (bufred != NULL) {
672  HIP_CHECK(hipHostFree(bufred));
673  HIP_CHECK(hipFree(bufred_d));
674  }
675  HIP_CHECK(hipHostMalloc(&bufred,nb*sizeof(real),hipHostMallocDefault));
676  HIP_CHECK(hipMalloc(&bufred_d, nb*sizeof(real)));
677  }
678  if( *n > 0) {
679  hipLaunchKernelGGL(HIP_KERNEL_NAME(glsum_kernel<real>),
680  nblcks, nthrds, 0, stream,
681  (real *) a, bufred_d, *n);
682  HIP_CHECK(hipGetLastError());
683  hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel<real>),
684  1, 1024, 0, stream, bufred_d, nb);
685  HIP_CHECK(hipGetLastError());
686  }
687 #ifdef HAVE_DEVICE_MPI
688  HIP_CHECK(hipStreamSynchronize(stream));
690 #else
691  HIP_CHECK(hipMemcpyAsync(bufred, bufred_d,sizeof(real),
692  hipMemcpyDeviceToHost, stream));
693  HIP_CHECK(hipStreamSynchronize(stream));
694 #endif
695  return bufred[0];
696  }
697 
701  void hip_absval(void *a, int *n) {
702 
703  const dim3 nthrds(1024, 1, 1);
704  const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
705 
706  hipLaunchKernelGGL(HIP_KERNEL_NAME(absval_kernel<real>),
707  nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
708  (real *) a, *n);
709  HIP_CHECK(hipGetLastError());
710 
711  }
712 }
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ u
__global__ void T *__restrict__ T *__restrict__ const T *__restrict__ const T *__restrict__ v
const int j
__global__ void const T *__restrict__ x
Definition: cdtp_kernel.h:106
__global__ void const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ const T *__restrict__ w3
Definition: cdtp_kernel.h:113
double real
Definition: device_config.h:12
void * glb_cmd_queue
#define DEVICE_MPI_SUM
Definition: device_mpi_op.h:9
void device_mpi_allreduce(void *buf_d, void *buf, int count, int nbytes, int op)
#define HIP_CHECK(err)
Definition: check.h:8
void hip_vdot3(void *dot, void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, int *n)
Definition: math.hip:420
void hip_vcross(void *u1, void *u2, void *u3, void *v1, void *v2, void *v3, void *w1, void *w2, void *w3, int *n)
Definition: math.hip:437
void hip_cmult2(void *a, void *b, real *c, int *n)
Definition: math.hip:128
real hip_glsc3(void *a, void *b, void *c, int *n)
Definition: math.hip:502
void hip_cfill_mask(void *a, real *c, int *size, void *mask, int *mask_size)
Definition: math.hip:90
void hip_invcol2(void *a, void *b, int *n)
Definition: math.hip:300
void hip_cadd2(void *a, void *b, real *c, int *n)
Definition: math.hip:157
void hip_masked_red_copy(void *a, void *b, void *mask, int *n, int *m)
Definition: math.hip:74
void hip_invcol1(void *a, int *n)
Definition: math.hip:285
void hip_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n)
Definition: math.hip:269
void hip_subcol3(void *a, void *b, void *c, int *n)
Definition: math.hip:345
void hip_col3(void *a, void *b, void *c, int *n)
Definition: math.hip:330
real hip_glsc2(void *a, void *b, int *n)
Definition: math.hip:623
void hip_masked_copy(void *a, void *b, void *mask, int *n, int *m)
Definition: math.hip:57
void hip_copy(void *a, void *b, int *n)
Definition: math.hip:48
void hip_add2(void *a, void *b, int *n)
Definition: math.hip:187
real hip_vlsc3(void *u, void *v, void *w, int *n)
Definition: math.hip:464
void hip_add3(void *a, void *b, void *c, int *n)
Definition: math.hip:608
real * bufred
Definition: math.hip:457
void hip_addsqr2s2(void *a, void *b, real *c1, int *n)
Definition: math.hip:252
void hip_add2s2(void *a, void *b, real *c1, int *n)
Definition: math.hip:235
void hip_rzero(void *a, int *n)
Definition: math.hip:105
void hip_sub2(void *a, void *b, int *n)
Definition: math.hip:360
void hip_cadd(void *a, real *c, int *n)
Definition: math.hip:142
real hip_glsum(void *a, int *n)
Definition: math.hip:663
int red_s
Definition: math.hip:456
void hip_addcol3(void *a, void *b, void *c, int *n)
Definition: math.hip:390
void hip_cfill(void *a, real *c, int *n)
Definition: math.hip:171
void hip_absval(void *a, int *n)
Definition: math.hip:701
void hip_col2(void *a, void *b, int *n)
Definition: math.hip:315
void hip_glsc3_many(real *h, void *w, void *v, void *mult, int *j, int *n)
Definition: math.hip:543
void hip_sub3(void *a, void *b, void *c, int *n)
Definition: math.hip:375
void hip_add4(void *a, void *b, void *c, void *d, int *n)
Definition: math.hip:202
void hip_cmult(void *a, real *c, int *n)
Definition: math.hip:113
void hip_add2s2_many(void *x, void **p, void *alpha, int *j, int *n)
Definition: math.hip:592
void hip_addcol4(void *a, void *b, void *c, void *d, int *n)
Definition: math.hip:405
real * bufred_d
Definition: math.hip:458
void hip_add2s1(void *a, void *b, real *c1, int *n)
Definition: math.hip:218