Spaces:
Running
Running
Sigbjørn Skjæret
committed on
Commit
·
2237878
1
Parent(s):
d41a4ec
cuda : add softcap fusion (llama/14907)
Browse files
ggml/src/ggml-cuda/ggml-cuda.cu
CHANGED
|
@@ -33,6 +33,7 @@
|
|
| 33 |
#include "ggml-cuda/rope.cuh"
|
| 34 |
#include "ggml-cuda/roll.cuh"
|
| 35 |
#include "ggml-cuda/scale.cuh"
|
|
|
|
| 36 |
#include "ggml-cuda/softmax.cuh"
|
| 37 |
#include "ggml-cuda/ssm-conv.cuh"
|
| 38 |
#include "ggml-cuda/ssm-scan.cuh"
|
|
@@ -2770,7 +2771,12 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
|
|
| 2770 |
}
|
| 2771 |
#endif
|
| 2772 |
|
| 2773 |
-
static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2774 |
if (!ggml_can_fuse(cgraph, node_idx, ops)) {
|
| 2775 |
return false;
|
| 2776 |
}
|
|
@@ -2798,9 +2804,32 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
|
|
| 2798 |
if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
|
| 2799 |
return false;
|
| 2800 |
}
|
|
|
|
|
|
|
| 2801 |
}
|
| 2802 |
|
| 2803 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2804 |
}
|
| 2805 |
|
| 2806 |
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
|
|
@@ -2821,10 +2850,18 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
|
| 2821 |
}
|
| 2822 |
|
| 2823 |
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
|
| 2824 |
-
if (!disable_fusion
|
| 2825 |
-
|
| 2826 |
-
|
| 2827 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2828 |
}
|
| 2829 |
#ifndef NDEBUG
|
| 2830 |
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
|
|
|
| 33 |
#include "ggml-cuda/rope.cuh"
|
| 34 |
#include "ggml-cuda/roll.cuh"
|
| 35 |
#include "ggml-cuda/scale.cuh"
|
| 36 |
+
#include "ggml-cuda/softcap.cuh"
|
| 37 |
#include "ggml-cuda/softmax.cuh"
|
| 38 |
#include "ggml-cuda/ssm-conv.cuh"
|
| 39 |
#include "ggml-cuda/ssm-scan.cuh"
|
|
|
|
| 2771 |
}
|
| 2772 |
#endif
|
| 2773 |
|
| 2774 |
+
static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops, std::initializer_list<enum ggml_unary_op> unary_ops) {
|
| 2775 |
+
#ifndef NDEBUG
|
| 2776 |
+
const size_t num_unary = std::count(ops.begin(), ops.end(), GGML_OP_UNARY);
|
| 2777 |
+
GGML_ASSERT(unary_ops.size() == num_unary);
|
| 2778 |
+
#endif
|
| 2779 |
+
|
| 2780 |
if (!ggml_can_fuse(cgraph, node_idx, ops)) {
|
| 2781 |
return false;
|
| 2782 |
}
|
|
|
|
| 2804 |
if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
|
| 2805 |
return false;
|
| 2806 |
}
|
| 2807 |
+
|
| 2808 |
+
return true;
|
| 2809 |
}
|
| 2810 |
|
| 2811 |
+
if (ops.size() == 3 && ops.begin()[0] == GGML_OP_SCALE && ops.begin()[1] == GGML_OP_UNARY && ops.begin()[2] == GGML_OP_SCALE
|
| 2812 |
+
&& unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_TANH) {
|
| 2813 |
+
const ggml_tensor *scale = cgraph->nodes[node_idx];
|
| 2814 |
+
const ggml_tensor *tanh = cgraph->nodes[node_idx+1];
|
| 2815 |
+
const ggml_tensor *scale2 = cgraph->nodes[node_idx+2];
|
| 2816 |
+
|
| 2817 |
+
GGML_ASSERT(scale->src[0]->type == GGML_TYPE_F32);
|
| 2818 |
+
GGML_ASSERT(scale->type == GGML_TYPE_F32);
|
| 2819 |
+
|
| 2820 |
+
if (ggml_get_unary_op(tanh) != GGML_UNARY_OP_TANH) {
|
| 2821 |
+
return false;
|
| 2822 |
+
}
|
| 2823 |
+
|
| 2824 |
+
// Check for bias
|
| 2825 |
+
if (ggml_get_op_params_f32(scale, 1) != 0.0f || ggml_get_op_params_f32(scale2, 1) != 0.0f) {
|
| 2826 |
+
return false;
|
| 2827 |
+
}
|
| 2828 |
+
|
| 2829 |
+
return true;
|
| 2830 |
+
}
|
| 2831 |
+
|
| 2832 |
+
return false;
|
| 2833 |
}
|
| 2834 |
|
| 2835 |
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
|
|
|
|
| 2850 |
}
|
| 2851 |
|
| 2852 |
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
|
| 2853 |
+
if (!disable_fusion) {
|
| 2854 |
+
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) {
|
| 2855 |
+
ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]);
|
| 2856 |
+
i++;
|
| 2857 |
+
continue;
|
| 2858 |
+
}
|
| 2859 |
+
|
| 2860 |
+
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
|
| 2861 |
+
i += 2;
|
| 2862 |
+
ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node);
|
| 2863 |
+
continue;
|
| 2864 |
+
}
|
| 2865 |
}
|
| 2866 |
#ifndef NDEBUG
|
| 2867 |
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
ggml/src/ggml-cuda/softcap.cu
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "softcap.cuh"
|
| 2 |
+
|
| 3 |
+
// Fused softcap kernel: dst[i] = softcap * tanh(scale * x[i]).
// One thread per element, 1-D launch; threads past the end of the
// k-element array exit immediately, so any grid size that covers k works.
static __global__ void softcap_f32(const float * x, float * dst, const float scale, const float softcap, const int k) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < k) {
        dst[idx] = softcap * tanhf(scale * x[idx]);
    }
}
|
| 12 |
+
|
| 13 |
+
// Host-side launcher for softcap_f32: one thread per element, rounding the
// grid size up so all k elements are covered. Runs asynchronously on `stream`.
static void softcap_f32_cuda(const float * x, float * dst, const float scale, const float softcap, const int k, cudaStream_t stream) {
    const int block_count = (k + CUDA_SOFTCAP_BLOCK_SIZE - 1) / CUDA_SOFTCAP_BLOCK_SIZE;
    softcap_f32<<<block_count, CUDA_SOFTCAP_BLOCK_SIZE, 0, stream>>>(x, dst, scale, softcap, k);
}
|
| 17 |
+
|
| 18 |
+
// fused GGML_OP_SCALE + GGML_UNARY_OP_TANH + GGML_OP_SCALE
//
// `src` is the first SCALE node of the fused chain: its src[0] is the input
// tensor and its op_params[0] holds the pre-tanh scale factor. `dst` is the
// final SCALE node: it provides the output buffer and its op_params[0] holds
// the softcap multiplier applied after tanh. Both tensors must be F32.
void ggml_cuda_op_softcap(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * src) {
    const ggml_tensor * input = src->src[0];

    GGML_ASSERT(input->type == GGML_TYPE_F32);
    GGML_ASSERT(  dst->type == GGML_TYPE_F32);

    // op_params is a raw word buffer; pull the two float factors out via memcpy
    // to avoid aliasing issues.
    float scale   = 0.0f;
    float softcap = 0.0f;
    memcpy(&scale,   (const float *) src->op_params + 0, sizeof(float));
    memcpy(&softcap, (const float *) dst->op_params + 0, sizeof(float));

    const float * input_d = (const float *) input->data;
    float       * dst_d   = (float *) dst->data;

    softcap_f32_cuda(input_d, dst_d, scale, softcap, ggml_nelements(input), ctx.stream());
}
|
ggml/src/ggml-cuda/softcap.cuh
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "common.cuh"
|
| 2 |
+
|
| 3 |
+
#define CUDA_SOFTCAP_BLOCK_SIZE 256
|
| 4 |
+
|
| 5 |
+
void ggml_cuda_op_softcap(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * src);
|