Sigbjørn Skjæret committed
Commit 2237878 · 1 Parent(s): d41a4ec

cuda : add softcap fusion (llama/14907)

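For context: "softcapping" (used, e.g., on attention logits) is expressed in a ggml graph as scale → tanh → scale, which is exactly the three-node pattern this commit teaches the CUDA backend to recognize and run as one kernel. The sketch below is illustrative only and not part of the commit (the helper name and the `cap` parameter are made up here); it assumes the public ggml C API (`ggml_scale`, `ggml_tanh`).

    #include "ggml.h"

    // Illustrative only: the GGML_OP_SCALE -> GGML_UNARY_OP_TANH -> GGML_OP_SCALE chain
    // that ggml_cuda_can_fuse() matches and ggml_cuda_op_softcap() executes in a single
    // kernel, computing y = cap * tanh(x / cap).
    static struct ggml_tensor * build_softcap(struct ggml_context * ctx, struct ggml_tensor * x, float cap) {
        struct ggml_tensor * t = ggml_scale(ctx, x, 1.0f / cap); // first GGML_OP_SCALE (bias 0)
        t = ggml_tanh(ctx, t);                                    // GGML_UNARY_OP_TANH
        return ggml_scale(ctx, t, cap);                           // second GGML_OP_SCALE (bias 0)
    }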
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED

@@ -33,6 +33,7 @@
 #include "ggml-cuda/rope.cuh"
 #include "ggml-cuda/roll.cuh"
 #include "ggml-cuda/scale.cuh"
+#include "ggml-cuda/softcap.cuh"
 #include "ggml-cuda/softmax.cuh"
 #include "ggml-cuda/ssm-conv.cuh"
 #include "ggml-cuda/ssm-scan.cuh"
@@ -2770,7 +2771,12 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
 }
 #endif
 
-static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops) {
+static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops, std::initializer_list<enum ggml_unary_op> unary_ops) {
+#ifndef NDEBUG
+    const size_t num_unary = std::count(ops.begin(), ops.end(), GGML_OP_UNARY);
+    GGML_ASSERT(unary_ops.size() == num_unary);
+#endif
+
     if (!ggml_can_fuse(cgraph, node_idx, ops)) {
         return false;
     }
@@ -2798,9 +2804,32 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
         if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
             return false;
         }
+
+        return true;
     }
 
-    return true;
+    if (ops.size() == 3 && ops.begin()[0] == GGML_OP_SCALE && ops.begin()[1] == GGML_OP_UNARY && ops.begin()[2] == GGML_OP_SCALE
+            && unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_TANH) {
+        const ggml_tensor *scale  = cgraph->nodes[node_idx];
+        const ggml_tensor *tanh   = cgraph->nodes[node_idx+1];
+        const ggml_tensor *scale2 = cgraph->nodes[node_idx+2];
+
+        GGML_ASSERT(scale->src[0]->type == GGML_TYPE_F32);
+        GGML_ASSERT(scale->type == GGML_TYPE_F32);
+
+        if (ggml_get_unary_op(tanh) != GGML_UNARY_OP_TANH) {
+            return false;
+        }
+
+        // Check for bias
+        if (ggml_get_op_params_f32(scale, 1) != 0.0f || ggml_get_op_params_f32(scale2, 1) != 0.0f) {
+            return false;
+        }
+
+        return true;
+    }
+
+    return false;
 }
 
 static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
@@ -2821,10 +2850,18 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
         }
 
         static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
-        if (!disable_fusion && ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
-            ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]);
-            i++;
-            continue;
+        if (!disable_fusion) {
+            if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) {
+                ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]);
+                i++;
+                continue;
+            }
+
+            if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
+                i += 2;
+                ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node);
+                continue;
+            }
         }
 #ifndef NDEBUG
         assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
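A note on the fused dispatch above (annotation, not part of the commit): `node` is the first GGML_OP_SCALE, and after `i += 2`, `cgraph->nodes[i]` is the second GGML_OP_SCALE. The call therefore passes dst = second scale (which owns the output buffer and the cap factor) and src = first scale (which owns the input tensor and the pre-tanh factor), matching how softcap.cu below unpacks the op params. With s1 and s2 denoting the two scale factors:

    // unfused:  y = s2 * tanh(s1 * x)          (both scale biases are required to be 0)
    // fused:    dst[i] = tanhf(scale * x[i]) * softcap,  where scale = s1 and softcap = s2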
ggml/src/ggml-cuda/softcap.cu ADDED
@@ -0,0 +1,34 @@
+#include "softcap.cuh"
+
+static __global__ void softcap_f32(const float * x, float * dst, const float scale, const float softcap, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = tanhf(scale * x[i]) * softcap;
+}
+
+static void softcap_f32_cuda(const float * x, float * dst, const float scale, const float softcap, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SOFTCAP_BLOCK_SIZE - 1) / CUDA_SOFTCAP_BLOCK_SIZE;
+    softcap_f32<<<num_blocks, CUDA_SOFTCAP_BLOCK_SIZE, 0, stream>>>(x, dst, scale, softcap, k);
+}
+
+// fused GGML_OP_SCALE + GGML_UNARY_OP_TANH + GGML_OP_SCALE
+void ggml_cuda_op_softcap(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * src) {
+    const ggml_tensor * src0 = src->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float scale;
+    float softcap;
+    memcpy(&scale,   (float *) src->op_params + 0, sizeof(float));
+    memcpy(&softcap, (float *) dst->op_params + 0, sizeof(float));
+
+    softcap_f32_cuda(src0_d, dst_d, scale, softcap, ggml_nelements(src0), stream);
+}
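As a sanity check, a host-side reference for the fused kernel could look like the sketch below. It is illustrative only (not part of the commit); the helper name is hypothetical and it only assumes the standard C++ library.

    #include <cmath>

    // Illustrative CPU reference for softcap_f32 above: the same element-wise
    // softcap y = softcap * tanh(scale * x), applied on the host.
    static void softcap_f32_ref(const float * x, float * dst, float scale, float softcap, int k) {
        for (int i = 0; i < k; ++i) {
            dst[i] = std::tanh(scale * x[i]) * softcap;
        }
    }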
ggml/src/ggml-cuda/softcap.cuh ADDED
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_SOFTCAP_BLOCK_SIZE 256
+
+void ggml_cuda_op_softcap(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * src);