cmdr2 committed
Commit 4bec2e4 · 1 Parent(s): f959b90

cuda: unary ops as float + de-duplicate (ggml/1130)

ggml/src/ggml-cuda/clamp.cu CHANGED
@@ -1,20 +1,24 @@
 #include "clamp.cuh"
 
+static __device__ __forceinline__ float op_clamp(float x, float min, float max) {
+    return fminf(fmaxf(x, min), max);
+}
+
 template <class T>
-static __global__ void op_clamp(const T * x, T * dst, const T min, const T max, const int k) {
+static __global__ void op_clamp_kernel(const T * x, T * dst, const T min, const T max, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
         return;
     }
 
-    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
+    dst[i] = (T)op_clamp((float)x[i], (float)min, (float)max);
 }
 
 template <class T>
 static void clamp_cuda(const T * x, T * dst, const T min, const T max, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
-    op_clamp<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
+    op_clamp_kernel<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
 }
 
 
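The kernel above stays templated on the element type T, but the arithmetic itself now runs in float and the result is cast back to T on store, so the float and half paths share one code path. Below is a minimal standalone sketch of that pattern applied to half-precision data; it is not part of the commit, and BLOCK_SIZE, the file name, and the main() harness are illustrative assumptions (ggml uses CUDA_CLAMP_BLOCK_SIZE from clamp.cuh and its own half typedef).

// clamp_half_sketch.cu (hypothetical file, compile with: nvcc clamp_half_sketch.cu)
#include <cstdio>
#include <cuda_runtime.h>
#include <cuda_fp16.h>

#define BLOCK_SIZE 256   // stand-in for CUDA_CLAMP_BLOCK_SIZE

// same shape as the new ggml code: a float-only device op ...
static __device__ __forceinline__ float op_clamp(float x, float min, float max) {
    return fminf(fmaxf(x, min), max);
}

// ... and a kernel templated on the storage type that converts to float and back
template <class T>
static __global__ void op_clamp_kernel(const T * x, T * dst, const T min, const T max, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = (T)op_clamp((float)x[i], (float)min, (float)max);
}

int main() {
    const int k = 8;
    __half h_in[k], h_out[k];
    for (int i = 0; i < k; ++i) {
        h_in[i] = __float2half(i - 4.0f);   // -4, -3, ..., 3
    }

    __half *d_in, *d_out;
    cudaMalloc((void **)&d_in,  k*sizeof(__half));
    cudaMalloc((void **)&d_out, k*sizeof(__half));
    cudaMemcpy(d_in, h_in, k*sizeof(__half), cudaMemcpyHostToDevice);

    const int num_blocks = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    // clamp the half data to [-1, 1]; the math inside the kernel runs in float
    op_clamp_kernel<<<num_blocks, BLOCK_SIZE>>>(d_in, d_out, __float2half(-1.0f), __float2half(1.0f), k);

    cudaMemcpy(h_out, d_out, k*sizeof(__half), cudaMemcpyDeviceToHost);
    for (int i = 0; i < k; ++i) {
        printf("clamp(%.1f) = %.1f\n", __half2float(h_in[i]), __half2float(h_out[i]));
    }

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}

For half inputs the comparison and min/max logic runs in float, so both element types follow exactly the same arithmetic; only the loads and stores differ.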
ggml/src/ggml-cuda/unary.cu CHANGED
@@ -1,447 +1,213 @@
 #include "unary.cuh"
 
-template <class T>
-static __global__ void op_abs(const T * x, T * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = fabsf(x[i]);
-}
-
-template <class T>
-static __global__ void op_sgn(const T * x, T * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = (T)(x[i] > (T)0.f ? 1.f : ((x[i] < (T)0.f ? -1.f : 0.f)));
-}
-
-template <class T>
-static __global__ void op_neg(const T * x, T * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = -x[i];
-}
-
-template <class T>
-static __global__ void op_step(const T * x, T * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = x[i] > (T)0.0f;
-}
-
-template <class T>
-static __global__ void op_gelu(const T * x, T * dst, const int k) {
-    const T GELU_COEF_A = 0.044715f;
-    const T SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    T xi = x[i];
-    dst[i] = (T)0.5f*xi*((T)1.0f + (T)tanhf(SQRT_2_OVER_PI*xi*((T)1.0f + GELU_COEF_A*xi*xi)));
-}
-
-template <class T>
-static __global__ void op_gelu_quick(const T * x, T * dst, int k) {
-    const T GELU_QUICK_COEF = -1.702f;
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] * ((T)1.0f / ((T)1.0f + (T)expf(GELU_QUICK_COEF * x[i])));
-}
-
-template <class T>
-static __global__ void op_silu(const T * x, T * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] / ((T)1.0f + (T)expf(-x[i]));
-}
-
-template <class T>
-static __global__ void op_silu_back(
-    const T * grad, const T * xf, T * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    const T xfi = xf[i];
-    const T s = (T)1.0f / ((T)1.0f + (T)expf(-xfi));
-    dst[i] = grad[i] * s * ((T)1.0f + xfi * ((T)1.0f - s));
-}
-
-template <class T>
-static __global__ void op_tanh(const T * x, T * dst, int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-    if (i >= k) {
-        return;
-    }
-    dst[i] = tanhf(x[i]);
-}
-
-template <class T>
-static __global__ void op_relu(const T * x, T * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = fmaxf(x[i], 0);
-}
-
-template <class T>
-static __global__ void op_sigmoid(const T * x, T * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = (T)1.0f / ((T)1.0f + (T)expf(-x[i]));
-}
-
-template <class T>
-static __global__ void op_hardsigmoid(const T * x, T * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + (T)3.0f) / (T)6.0f));
-}
-
-template <class T>
-static __global__ void op_hardswish(const T * x, T * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] * (T)fminf(1.0f, fmaxf(0.0f, (x[i] + (T)3.0f) / (T)6.0f));
-}
-
-template <class T>
-static __global__ void op_exp(const T * x, T * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = expf(x[i]);
-}
-
-template <class T>
-static __global__ void op_leaky_relu(const T * x, T * dst, const int k, const float negative_slope) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-    if (i >= k) {
-        return;
-    }
-    dst[i] = (T)fmaxf(x[i], 0) + (T)fminf(x[i], 0.0f) * (T)negative_slope;
-}
-
-template <class T>
-static __global__ void op_sqr(const T * x, T * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] * x[i];
-}
-
-template <class T>
-static __global__ void op_sqrt(const T * x, T * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = sqrtf(x[i]);
-}
-
-template <class T>
-static __global__ void op_sin(const T * x, T * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = sinf(x[i]);
-}
-
-template <class T>
-static __global__ void op_cos(const T * x, T * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = cosf(x[i]);
-}
-
-template <class T>
-static __global__ void op_log(const T * x, T * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = logf(x[i]);
-}
-
-template <class T>
-static void abs_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
-    op_abs<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-template <class T>
-static void sgn_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
-    op_sgn<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-template <class T>
-static void neg_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
-    op_neg<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-template <class T>
-static void step_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_STEP_BLOCK_SIZE - 1) / CUDA_STEP_BLOCK_SIZE;
-    op_step<<<num_blocks, CUDA_STEP_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-template <class T>
-static void gelu_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
-    op_gelu<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-template <class T>
-static void gelu_quick_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
-    op_gelu_quick<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-template <class T>
-static void silu_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
-    op_silu<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-template <class T>
-static void silu_back_cuda(const T * grad, const T * x, T * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_SILU_BACK_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
-    op_silu_back<<<num_blocks, CUDA_SILU_BACK_BLOCK_SIZE, 0, stream>>>(grad, x, dst, k);
-}
-
-template <class T>
-static void tanh_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
-    op_tanh<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-template <class T>
-static void relu_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
-    op_relu<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-template <class T>
-static void sigmoid_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_SIGMOID_BLOCK_SIZE - 1) / CUDA_SIGMOID_BLOCK_SIZE;
-    op_sigmoid<<<num_blocks, CUDA_SIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-template <class T>
-static void hardsigmoid_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
-    op_hardsigmoid<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-template <class T>
-static void hardswish_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE;
-    op_hardswish<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-template <class T>
-static void exp_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_EXP_BLOCK_SIZE - 1) / CUDA_EXP_BLOCK_SIZE;
-    op_exp<<<num_blocks, CUDA_EXP_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-template <class T>
-static void leaky_relu_cuda(const T * x, T * dst, const int k, const float negative_slope, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
-    op_leaky_relu<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
-}
-
-template <class T>
-static void sqr_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
-    op_sqr<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-template <class T>
-static void sqrt_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_SQRT_BLOCK_SIZE - 1) / CUDA_SQRT_BLOCK_SIZE;
-    op_sqrt<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-template <class T>
-static void sin_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_SIN_BLOCK_SIZE - 1) / CUDA_SIN_BLOCK_SIZE;
-    op_sin<<<num_blocks, CUDA_SIN_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-template <class T>
-static void cos_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_COS_BLOCK_SIZE - 1) / CUDA_COS_BLOCK_SIZE;
-    op_cos<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-template <class T>
-static void log_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_COS_BLOCK_SIZE - 1) / CUDA_COS_BLOCK_SIZE;
-    op_log<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-void ggml_cuda_op_abs(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const void * src0_d = src0->data;
-    void * dst_d = dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (src0->type == GGML_TYPE_F16) {
-        abs_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
-    } else {
-        abs_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
-    }
-}
-
-void ggml_cuda_op_sgn(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const void * src0_d = src0->data;
-    void * dst_d = dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (src0->type == GGML_TYPE_F16) {
-        sgn_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
-    } else {
-        sgn_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
-    }
-}
-
-void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const void * src0_d = src0->data;
-    void * dst_d = dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (src0->type == GGML_TYPE_F16) {
-        neg_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
-    } else {
-        neg_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
-    }
-}
-
-void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const void * src0_d = src0->data;
-    void * dst_d = dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (src0->type == GGML_TYPE_F16) {
-        step_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
-    } else {
-        step_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
-    }
-}
-
-void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const void * src0_d = src0->data;
-    void * dst_d = dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (src0->type == GGML_TYPE_F16) {
-        gelu_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
-    } else {
-        gelu_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
-    }
-}
-
-void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const void * src0_d = src0->data;
-    void * dst_d = dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (src0->type == GGML_TYPE_F16) {
-        silu_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
-    } else {
-        silu_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
-    }
-}
+static __device__ __forceinline__ float op_abs(float x) {
+    return fabsf(x);
+}
+
+static __device__ __forceinline__ float op_sgn(float x) {
+    return (x > 0.f ? 1.f : ((x < 0.f ? -1.f : 0.f)));
+}
+
+static __device__ __forceinline__ float op_neg(float x) {
+    return -x;
+}
+
+static __device__ __forceinline__ float op_step(float x) {
+    return x > 0.0f;
+}
+
+static __device__ __forceinline__ float op_gelu(float x) {
+    const float GELU_COEF_A = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+
+    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
+}
+
+static __device__ __forceinline__ float op_gelu_quick(float x) {
+    const float GELU_QUICK_COEF = -1.702f;
+
+    return x * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x)));
+}
+
+static __device__ __forceinline__ float op_silu(float x) {
+    return x / (1.0f + expf(-x));
+}
+
+static __device__ __forceinline__ float op_tanh(float x) {
+    return tanhf(x);
+}
+
+static __device__ __forceinline__ float op_relu(float x) {
+    return fmaxf(x, 0);
+}
+
+static __device__ __forceinline__ float op_sigmoid(float x) {
+    return 1.0f / (1.0f + expf(-x));
+}
+
+static __device__ __forceinline__ float op_hardsigmoid(float x) {
+    return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
+}
+
+static __device__ __forceinline__ float op_hardswish(float x) {
+    return x * fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
+}
+
+static __device__ __forceinline__ float op_exp(float x) {
+    return expf(x);
+}
+
+static __device__ __forceinline__ float op_sqr(float x) {
+    return x * x;
+}
+
+static __device__ __forceinline__ float op_sqrt(float x) {
+    return sqrtf(x);
+}
+
+static __device__ __forceinline__ float op_sin(float x) {
+    return sinf(x);
+}
+
+static __device__ __forceinline__ float op_cos(float x) {
+    return cosf(x);
+}
+
+static __device__ __forceinline__ float op_log(float x) {
+    return logf(x);
+}
+
+template <float (*op)(float), typename T>
+static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = (T)op((float)x[i]);
+}
+
+template <float (*op)(float), typename T>
+static void unary_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
+    unary_op_kernel<op><<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+template <float (*op)(float)>
+void ggml_cuda_op_unary(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
+
+    if (src0->type == GGML_TYPE_F16) {
+        unary_cuda<op>((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        unary_cuda<op>((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
+}
+
+void ggml_cuda_op_abs(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_abs>(ctx, dst);
+}
+
+void ggml_cuda_op_sgn(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_sgn>(ctx, dst);
+}
+
+void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_neg>(ctx, dst);
+}
+
+void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_step>(ctx, dst);
+}
+
+void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_gelu>(ctx, dst);
+}
+
+void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_gelu_quick>(ctx, dst);
+}
+
+void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_silu>(ctx, dst);
+}
+
+void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_tanh>(ctx, dst);
+}
+
+void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_relu>(ctx, dst);
+}
+
+void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_sigmoid>(ctx, dst);
+}
+
+void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_hardsigmoid>(ctx, dst);
+}
+
+void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_hardswish>(ctx, dst);
+}
+
+void ggml_cuda_op_exp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_exp>(ctx, dst);
+}
+
+void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_sqr>(ctx, dst);
+}
+
+void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_sqrt>(ctx, dst);
+}
+
+void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_sin>(ctx, dst);
+}
+
+void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_cos>(ctx, dst);
+}
+
+void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_log>(ctx, dst);
+}
+
+/* silu_back */
+
+static __device__ __forceinline__ float op_silu_back(float grad, float x) {
+    const float s = 1.0f / (1.0f + expf(-x));
+    return grad * s * (1.0f + x * (1.0f - s));
+}
+
+template <class T>
+static __global__ void silu_back_kernel(const T * grad, const T * xf, T * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = (T)op_silu_back((float)grad[i], (float)xf[i]);
+}
+
+template <class T>
+static void silu_back_cuda(const T * grad, const T * x, T * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SILU_BACK_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
+    silu_back_kernel<<<num_blocks, CUDA_SILU_BACK_BLOCK_SIZE, 0, stream>>>(grad, x, dst, k);
+}
 
 void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -467,137 +233,27 @@ void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
 }
 }
 
-void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const void * src0_d = src0->data;
-    void * dst_d = dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (src0->type == GGML_TYPE_F16) {
-        gelu_quick_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
-    } else {
-        gelu_quick_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
-    }
-}
-
-void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const void * src0_d = src0->data;
-    void * dst_d = dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (src0->type == GGML_TYPE_F16) {
-        tanh_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
-    } else {
-        tanh_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
-    }
-}
-
-void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const void * src0_d = src0->data;
-    void * dst_d = dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (src0->type == GGML_TYPE_F16) {
-        relu_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
-    } else {
-        relu_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
-    }
-}
-
-void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const void * src0_d = src0->data;
-    void * dst_d = dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (src0->type == GGML_TYPE_F16) {
-        sigmoid_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
-    } else {
-        sigmoid_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
-    }
-}
-
-void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const void * src0_d = src0->data;
-    void * dst_d = dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (src0->type == GGML_TYPE_F16) {
-        hardsigmoid_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
-    } else {
-        hardsigmoid_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
-    }
-}
-
-void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const void * src0_d = src0->data;
-    void * dst_d = dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (src0->type == GGML_TYPE_F16) {
-        hardswish_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
-    } else {
-        hardswish_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
-    }
-}
-
-void ggml_cuda_op_exp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const void * src0_d = src0->data;
-    void * dst_d = dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (src0->type == GGML_TYPE_F16) {
-        exp_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
-    } else {
-        exp_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
-    }
-}
+/* leaky relu */
+
+static __device__ __forceinline__ float op_leaky_relu(float x, const float negative_slope) {
+    return fmaxf(x, 0) + fminf(x, 0.0f) * negative_slope;
+}
+
+template <class T>
+static __global__ void leaky_relu_kernel(const T * x, T * dst, const int k, const float negative_slope) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = (T)op_leaky_relu((float)x[i], negative_slope);
+}
+
+template <class T>
+static void leaky_relu_cuda(const T * x, T * dst, const int k, const float negative_slope, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    leaky_relu_kernel<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
+}
 
 void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -621,98 +277,3 @@ void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
         leaky_relu_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), negative_slope, stream);
     }
 }
-
-void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const void * src0_d = src0->data;
-    void * dst_d = dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (src0->type == GGML_TYPE_F16) {
-        sqr_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
-    } else {
-        sqr_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
-    }
-}
-
-void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const void * src0_d = src0->data;
-    void * dst_d = dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (src0->type == GGML_TYPE_F16) {
-        sqrt_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
-    } else {
-        sqrt_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
-    }
-}
-
-void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const void * src0_d = src0->data;
-    void * dst_d = dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (src0->type == GGML_TYPE_F16) {
-        sin_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
-    } else {
-        sin_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
-    }
-}
-
-void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const void * src0_d = src0->data;
-    void * dst_d = dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (src0->type == GGML_TYPE_F16) {
-        cos_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
-    } else {
-        cos_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
-    }
-}
-
-void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const void * src0_d = src0->data;
-    void * dst_d = dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (src0->type == GGML_TYPE_F16) {
-        log_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
-    } else {
-        log_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
-    }
-}
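The de-duplication in unary.cu hinges on one idea: each unary op is an ordinary __device__ float -> float function, and a single kernel template takes that op as a non-type template parameter, so one kernel, one launcher, and one ggml_cuda_op_unary wrapper serve every activation. Below is a minimal standalone sketch of the pattern; it is not code from the commit, and BLOCK_SIZE, the file name, and the main() harness are illustrative assumptions (ggml uses CUDA_NEG_BLOCK_SIZE and dispatches on the tensor type inside ggml_cuda_op_unary).

// unary_dedup_sketch.cu (hypothetical file, compile with: nvcc unary_dedup_sketch.cu)
#include <cstdio>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256   // stand-in for CUDA_NEG_BLOCK_SIZE

// two example ops; in the commit there are 18 of these one-liners
static __device__ __forceinline__ float op_relu(float x) { return fmaxf(x, 0.0f); }
static __device__ __forceinline__ float op_sqr (float x) { return x * x; }

// one kernel for all ops: the op is a compile-time template argument
template <float (*op)(float), typename T>
static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = (T)op((float)x[i]);   // compute in float, store as T
}

// one launcher for all ops
template <float (*op)(float), typename T>
static void unary_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unary_op_kernel<op><<<num_blocks, BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

int main() {
    const int k = 4;
    float h_in[k] = {-2.0f, -0.5f, 0.5f, 2.0f};
    float h_out[k];

    float *d_in, *d_out;
    cudaMalloc((void **)&d_in,  k*sizeof(float));
    cudaMalloc((void **)&d_out, k*sizeof(float));
    cudaMemcpy(d_in, h_in, k*sizeof(float), cudaMemcpyHostToDevice);

    unary_cuda<op_relu>(d_in, d_out, k, 0);   // default stream
    cudaMemcpy(h_out, d_out, k*sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < k; ++i) printf("relu(%g) = %g\n", h_in[i], h_out[i]);

    unary_cuda<op_sqr>(d_in, d_out, k, 0);
    cudaMemcpy(h_out, d_out, k*sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < k; ++i) printf("sqr(%g) = %g\n", h_in[i], h_out[i]);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}

Because the op is a template argument rather than a runtime function pointer, the compiler can inline it into each kernel instantiation, so the shared kernel should perform the same as the per-op kernels it replaces.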