Commit 474cc59 · Parent: 91726b8
ggml: Add POOL2D OP for GPU acceleration to the Vulkan backend in the MobileVLM model (llama/9763)

* ggml: Add POOL2D OP for GPU acceleration to the Vulkan backend.

- The MobileVLM model now supports inference acceleration through the GPU by utilizing the Vulkan backend.
- A GGML_OP_POOL_2D compute shader (pooling) has been added.
- The encoding performance of the CLIP model improved from 2.8 s on the CPU to 0.7 s on the GPU.

Signed-off-by: Changyeon Kim <[email protected]>

* [fix] Correct the incorrect order of the parameters.

Fix casting to int.

Signed-off-by: Changyeon Kim <[email protected]>

---------

Signed-off-by: Changyeon Kim <[email protected]>
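For context before the diff: each invocation of the new shader reduces one pooling window of the input plane to a single output element, either tracking a running maximum or accumulating an average that is pre-scaled by the full kernel size. Below is a minimal CPU sketch of that semantics (illustrative only; the kw/kh/sw/sh/pw/ph names are mine, not ggml's, and this code is not part of the commit):

#include <algorithm>
#include <cfloat>

enum PoolOp { POOL_MAX = 0, POOL_AVG = 1 };

// One (batch, channel) plane: src is IH x IW, dst is OH x OW.
static void pool2d_plane(const float * src, float * dst,
                         int IW, int IH, int OW, int OH, PoolOp op,
                         int kw, int kh, int sw, int sh, int pw, int ph) {
    const float scale = 1.0f / float(kw * kh); // AVG divides by the full window, even when clipped at a border
    for (int oh = 0; oh < OH; ++oh) {
        for (int ow = 0; ow < OW; ++ow) {
            // kernel window clipped to the input extent
            const int h0 = std::max(oh * sh - ph, 0);
            const int h1 = std::min(oh * sh - ph + kh, IH);
            const int w0 = std::max(ow * sw - pw, 0);
            const int w1 = std::min(ow * sw - pw + kw, IW);
            float res = (op == POOL_MAX) ? -FLT_MAX : 0.0f;
            for (int i = h0; i < h1; ++i) {
                for (int j = w0; j < w1; ++j) {
                    const float cur = src[i * IW + j];
                    res = (op == POOL_MAX) ? std::max(res, cur) : res + cur * scale;
                }
            }
            dst[oh * OW + ow] = res;
        }
    }
}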
ggml/src/ggml-vulkan.cpp
CHANGED

@@ -213,6 +213,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_sum_rows_f32;
     vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
     vk_pipeline pipeline_timestep_embedding_f32;
+    vk_pipeline pipeline_pool2d_f32;
 
     std::unordered_map<std::string, vk_pipeline_ref> pipelines;
     std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;

@@ -403,6 +404,17 @@ struct vk_op_timestep_embedding_push_constants {
     uint32_t max_period;
 };
 
+struct vk_op_pool2d_push_constants {
+    uint32_t IW; uint32_t IH;
+    uint32_t OW; uint32_t OH;
+    uint32_t OC;
+    uint32_t pelements;
+    uint32_t op;
+    int32_t k0; int32_t k1;
+    int32_t s0; int32_t s1;
+    int32_t p0; int32_t p1;
+};
+
 // Allow pre-recording command buffers
 struct vk_staging_memcpy {
     vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}

@@ -1803,6 +1815,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
+
     for (auto &c : compiles) {
         c.wait();
     }

@@ -4234,6 +4248,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_timestep_embedding_f32;
         }
         return nullptr;
+    case GGML_OP_POOL_2D:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_pool2d_f32;
+        }
+        return nullptr;
     case GGML_OP_LEAKY_RELU:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_leaky_relu_f32;

@@ -4464,6 +4483,14 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
             uint32_t half_ceil = (dim + 1) / 2;
             elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
         } break;
+    case GGML_OP_POOL_2D:
+        {
+            const uint32_t N = dst->ne[3];
+            const uint32_t OC = dst->ne[2];
+            const uint32_t OH = dst->ne[1];
+            const uint32_t OW = dst->ne[0];
+            elements = { N * OC * OH * OW, 1, 1};
+        } break;
     case GGML_OP_ADD:
     case GGML_OP_DIV:
     case GGML_OP_MUL:

@@ -4914,6 +4941,34 @@ static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context
     }, dryrun);
 }
 
+static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
+    const int32_t k1 = dst->op_params[1];
+    const int32_t k0 = dst->op_params[2];
+    const int32_t s1 = dst->op_params[3];
+    const int32_t s0 = dst->op_params[4];
+    const int32_t p1 = dst->op_params[5];
+    const int32_t p0 = dst->op_params[6];
+
+    const uint32_t IH = src0->ne[1];
+    const uint32_t IW = src0->ne[0];
+
+    const uint32_t N = dst->ne[3];
+
+    const uint32_t OC = dst->ne[2];
+    const uint32_t OH = dst->ne[1];
+    const uint32_t OW = dst->ne[0];
+
+    const uint32_t parallel_elements = N * OC * OH * OW;
+
+    ggml_vk_op_f32<vk_op_pool2d_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_POOL_2D, {
+        IW, IH, OW, OH, OC,
+        parallel_elements,
+        op,
+        k0, k1, s0, s1, p0, p1,
+    }, dryrun);
+}
+
 static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     const float * op_params = (const float *)dst->op_params;
     ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }, dryrun);

@@ -5792,6 +5847,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_SUM_ROWS:
     case GGML_OP_IM2COL:
     case GGML_OP_TIMESTEP_EMBEDDING:
+    case GGML_OP_POOL_2D:
     case GGML_OP_LEAKY_RELU:
         break;
     default:

@@ -5927,6 +5983,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_TIMESTEP_EMBEDDING:
         ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);
 
+        break;
+    case GGML_OP_POOL_2D:
+        ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
+
         break;
     case GGML_OP_LEAKY_RELU:
         ggml_vk_leaky_relu(ctx, compute_ctx, src0, node, dryrun);

@@ -6018,6 +6078,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
     case GGML_OP_SUM_ROWS:
     case GGML_OP_IM2COL:
     case GGML_OP_TIMESTEP_EMBEDDING:
+    case GGML_OP_POOL_2D:
     case GGML_OP_LEAKY_RELU:
     case GGML_OP_REPEAT:
         buf = tensor->buffer;

@@ -6821,6 +6882,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
     case GGML_OP_SUM_ROWS:
     case GGML_OP_IM2COL:
     case GGML_OP_TIMESTEP_EMBEDDING:
+    case GGML_OP_POOL_2D:
     case GGML_OP_LEAKY_RELU:
         return true;
     default:

@@ -7334,6 +7396,16 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
         const int32_t dim = tensor->op_params[0];
         const int32_t max_period = tensor->op_params[1];
         tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period);
+    } else if (tensor->op == GGML_OP_POOL_2D) {
+        enum ggml_op_pool op = static_cast<ggml_op_pool>(dst->op_params[0]);
+        const int32_t k0 = tensor->op_params[1];
+        const int32_t k1 = tensor->op_params[2];
+        const int32_t s0 = tensor->op_params[3];
+        const int32_t s1 = tensor->op_params[4];
+        const int32_t p0 = tensor->op_params[5];
+        const int32_t p1 = tensor->op_params[6];
+
+        tensor_clone = ggml_pool_2d(ggml_ctx, src0_clone, op, k0, k1, s0, s1, p0, p1);
     } else if (tensor->op == GGML_OP_LEAKY_RELU) {
         const float * op_params = (const float *)tensor->op_params;
         tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false);
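From the host side, exercising the new path only requires placing a GGML_OP_POOL_2D node in a graph scheduled on the Vulkan backend; the elements/push-constant plumbing above does the rest. A hypothetical graph fragment using the standard ggml API (the shapes are made up for illustration and are not from this commit):

#include "ggml.h"

static struct ggml_cgraph * build_pool_graph(struct ggml_context * ctx) {
    // ggml convention: ne[0]=W, ne[1]=H, ne[2]=C, ne[3]=N
    struct ggml_tensor * inp = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 24, 24, 1024, 1);
    // 2x2 average pooling, stride 2, no padding
    struct ggml_tensor * out = ggml_pool_2d(ctx, inp, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    return gf;
}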
ggml/src/vulkan-shaders/pool2d.comp
ADDED

@@ -0,0 +1,74 @@
+#version 450
+
+#include "types.comp"
+
+#extension GL_EXT_shader_16bit_storage : require
+
+layout(push_constant) uniform parameter {
+    uint IW; uint IH;
+    uint OW; uint OH;
+    uint OC;
+    uint pelements;
+    uint op;
+    int k0; int k1;
+    int s0; int s1;
+    int p0; int p1;
+} p;
+
+#define BLOCK_SIZE 512
+#define FLT_MAX 3.402823466e+38F
+#define OP_POOL_MAX 0u
+#define OP_POOL_AVG 1u
+
+layout (local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout(binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout(binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const uint idx = gl_GlobalInvocationID.x;
+    if (idx >= p.pelements) {
+        return;
+    }
+
+    const uint O_HW = p.OW * p.OH;
+
+    const uint nc = idx / O_HW;
+    const uint cur_oh = (idx % O_HW) / p.OW;
+    const uint cur_ow = (idx % O_HW) % p.OW;
+
+    const int start_h = int(cur_oh) * p.s0 - p.p0;
+    const uint bh = max(start_h, 0);
+    const uint eh = min(start_h + p.k0, p.IH);
+
+    const int start_w = int(cur_ow) * p.s1 - p.p1;
+    const uint bw = max(start_w, 0);
+    const uint ew = min(start_w + p.k1, p.IW);
+
+    const float scale = 1.0 / float(p.k0 * p.k1);
+    float res;
+
+    if (p.op == OP_POOL_AVG) {
+        res = 0.0;
+    } else if (p.op == OP_POOL_MAX) {
+        res = -FLT_MAX;
+    } else {
+        return;
+    }
+
+    #pragma unroll
+    for (uint i = bh; i < eh; i++) {
+        #pragma unroll
+        for (uint j = bw; j < ew; j++) {
+            const float cur = D_TYPE(data_a[nc * p.IH * p.IW + i * p.IW + j]);
+
+            if (p.op == OP_POOL_AVG) {
+                res += cur * scale;
+            } else if (p.op == OP_POOL_MAX) {
+                res = max(res, cur);
+            }
+        }
+    }
+
+    data_d[nc * O_HW + cur_oh * p.OW + cur_ow] = res;
+}
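The shader is dispatched with one invocation per output element: ggml_vk_op_f32 sets elements = { N * OC * OH * OW, 1, 1 } and the pipeline's local size is BLOCK_SIZE = 512, so the workgroup count is a simple ceil-divide. A small sketch of that arithmetic (my helper function, not code from this commit):

#include <cstdint>

constexpr uint32_t BLOCK_SIZE = 512; // matches local_size_x in pool2d.comp

// Number of workgroups needed to cover every output element once.
inline uint32_t pool2d_workgroups(uint32_t N, uint32_t OC, uint32_t OH, uint32_t OW) {
    const uint32_t pelements = N * OC * OH * OW; // same value passed as p.pelements
    return (pelements + BLOCK_SIZE - 1) / BLOCK_SIZE;
}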
ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
CHANGED

@@ -493,6 +493,10 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
     tasks.push_back(std::async(std::launch::async, [=] {
         string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     }));
+
+    tasks.push_back(std::async(std::launch::async, [=] {
+        string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+    }));
 }
 
 void write_output_files() {
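The {"A_TYPE", "float"} / {"D_TYPE", "float"} map entries are turned into preprocessor definitions for the GLSL compile, which is how data_a[] and data_d[] in pool2d.comp resolve to float buffers. If a half-precision output variant were ever wanted, registration would presumably mirror the im2col pattern in this same process_shaders function (hypothetical sketch, not part of this commit):

    tasks.push_back(std::async(std::launch::async, [=] {
        string_to_spv("pool2d_f32_f16", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
    }));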