Spaces:
Running
Running
sync : ggml vulkan (ggml/0)
Browse files
- ggml/src/ggml-vulkan-shaders.hpp +0 -0
- ggml/src/vulkan-shaders/CMakeLists.txt +7 -0
- ggml/src/vulkan-shaders/add.comp +4 -2
- ggml/src/vulkan-shaders/clamp.comp +5 -3
- ggml/src/vulkan-shaders/concat.comp +35 -0
- ggml/src/vulkan-shaders/copy.comp +5 -3
- ggml/src/vulkan-shaders/cos.comp +15 -0
- ggml/src/vulkan-shaders/dequant_funcs.comp +8 -0
- ggml/src/vulkan-shaders/dequant_iq4_nl.comp +30 -0
- ggml/src/vulkan-shaders/dequant_q4_0.comp +4 -6
- ggml/src/vulkan-shaders/div.comp +4 -2
- ggml/src/vulkan-shaders/gelu.comp +1 -1
- ggml/src/vulkan-shaders/gelu_quick.comp +23 -0
- ggml/src/vulkan-shaders/generic_binary_head.comp +5 -1
- ggml/src/vulkan-shaders/generic_unary_head.comp +4 -0
- ggml/src/vulkan-shaders/group_norm.comp +66 -0
- ggml/src/vulkan-shaders/im2col.comp +57 -0
- ggml/src/vulkan-shaders/leaky_relu.comp +22 -0
- ggml/src/vulkan-shaders/mul.comp +4 -2
- ggml/src/vulkan-shaders/mul_mat_vec.comp +10 -3
- ggml/src/vulkan-shaders/mul_mm.comp +14 -1
- ggml/src/vulkan-shaders/norm.comp +1 -1
- ggml/src/vulkan-shaders/pad.comp +26 -0
- ggml/src/vulkan-shaders/relu.comp +1 -1
- ggml/src/vulkan-shaders/rms_norm.comp +1 -1
- ggml/src/vulkan-shaders/scale.comp +4 -2
- ggml/src/vulkan-shaders/silu.comp +1 -1
- ggml/src/vulkan-shaders/sin.comp +15 -0
- ggml/src/vulkan-shaders/soft_max.comp +1 -1
- ggml/src/vulkan-shaders/square.comp +5 -3
- ggml/src/vulkan-shaders/sum_rows.comp +1 -1
- ggml/src/vulkan-shaders/tanh.comp +21 -0
- ggml/src/vulkan-shaders/timestep_embedding.comp +41 -0
- ggml/src/vulkan-shaders/types.comp +23 -2
- ggml/src/vulkan-shaders/upscale.comp +36 -0
- ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +587 -0
- scripts/sync-ggml-am.sh +1 -0
- scripts/sync-ggml.last +1 -1
ggml/src/ggml-vulkan-shaders.hpp
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ggml/src/vulkan-shaders/CMakeLists.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
find_package (Threads REQUIRED)
|
| 2 |
+
|
| 3 |
+
set(TARGET vulkan-shaders-gen)
|
| 4 |
+
add_executable(${TARGET} vulkan-shaders-gen.cpp)
|
| 5 |
+
install(TARGETS ${TARGET} RUNTIME)
|
| 6 |
+
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
| 7 |
+
target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)
|
ggml/src/vulkan-shaders/add.comp
CHANGED
|
@@ -4,9 +4,11 @@
|
|
| 4 |
#include "generic_binary_head.comp"
|
| 5 |
|
| 6 |
void main() {
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
return;
|
| 9 |
}
|
| 10 |
|
| 11 |
-
data_d[p.d_offset + dst_idx(
|
| 12 |
}
|
|
|
|
| 4 |
#include "generic_binary_head.comp"
|
| 5 |
|
| 6 |
void main() {
|
| 7 |
+
const uint idx = get_idx();
|
| 8 |
+
|
| 9 |
+
if (idx >= p.ne) {
|
| 10 |
return;
|
| 11 |
}
|
| 12 |
|
| 13 |
+
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[src1_idx(idx)]));
|
| 14 |
}
|
ggml/src/vulkan-shaders/clamp.comp
CHANGED
|
@@ -4,10 +4,12 @@
|
|
| 4 |
#include "generic_unary_head.comp"
|
| 5 |
|
| 6 |
void main() {
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
return;
|
| 9 |
}
|
| 10 |
|
| 11 |
-
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(
|
| 12 |
-
data_d[p.d_offset + dst_idx(
|
| 13 |
}
|
|
|
|
| 4 |
#include "generic_unary_head.comp"
|
| 5 |
|
| 6 |
void main() {
|
| 7 |
+
const uint idx = get_idx();
|
| 8 |
+
|
| 9 |
+
if (idx >= p.ne) {
|
| 10 |
return;
|
| 11 |
}
|
| 12 |
|
| 13 |
+
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
|
| 14 |
+
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
|
| 15 |
}
|
ggml/src/vulkan-shaders/concat.comp
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450
|
| 2 |
+
|
| 3 |
+
#include "types.comp"
|
| 4 |
+
#include "generic_binary_head.comp"
|
| 5 |
+
|
| 6 |
+
void main() {
|
| 7 |
+
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
| 8 |
+
const int dim = p.param3;
|
| 9 |
+
|
| 10 |
+
if (idx >= p.ne) {
|
| 11 |
+
return;
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
const uint i3 = idx / (p.ne22*p.ne21*p.ne20);
|
| 15 |
+
const uint i3_offset = i3 * p.ne22*p.ne21*p.ne20;
|
| 16 |
+
const uint i2 = (idx - i3_offset) / (p.ne21*p.ne20);
|
| 17 |
+
const uint i2_offset = i2*p.ne21*p.ne20;
|
| 18 |
+
const uint i1 = (idx - i3_offset - i2_offset) / p.ne20;
|
| 19 |
+
const uint i0 = idx - i3_offset - i2_offset - i1*p.ne20;
|
| 20 |
+
|
| 21 |
+
uint o[4] = {0, 0, 0, 0};
|
| 22 |
+
o[dim] = dim == 0 ? p.ne00 : (dim == 1 ? p.ne01 : (dim == 2 ? p.ne02 : p.ne03));
|
| 23 |
+
|
| 24 |
+
const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
|
| 25 |
+
const uint src1_idx = (i3 - o[3])*p.nb13 + (i2 - o[2])*p.nb12 + (i1 - o[1])*p.nb11 + (i0 - o[0])*p.nb10;
|
| 26 |
+
const uint dst_idx = i3*p.nb23 + i2*p.nb22 + i1*p.nb21 + i0*p.nb20;
|
| 27 |
+
|
| 28 |
+
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
|
| 29 |
+
|
| 30 |
+
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
| 31 |
+
data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]);
|
| 32 |
+
#else
|
| 33 |
+
data_d[p.d_offset + dst_idx] = is_src0 ? data_a[src0_idx] : data_b[src1_idx];
|
| 34 |
+
#endif
|
| 35 |
+
}
|
ggml/src/vulkan-shaders/copy.comp
CHANGED
|
@@ -4,13 +4,15 @@
|
|
| 4 |
#include "generic_unary_head.comp"
|
| 5 |
|
| 6 |
void main() {
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
return;
|
| 9 |
}
|
| 10 |
|
| 11 |
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
| 12 |
-
data_d[p.d_offset + dst_idx(
|
| 13 |
#else
|
| 14 |
-
data_d[p.d_offset + dst_idx(
|
| 15 |
#endif
|
| 16 |
}
|
|
|
|
| 4 |
#include "generic_unary_head.comp"
|
| 5 |
|
| 6 |
void main() {
|
| 7 |
+
const uint idx = get_idx();
|
| 8 |
+
|
| 9 |
+
if (idx >= p.ne) {
|
| 10 |
return;
|
| 11 |
}
|
| 12 |
|
| 13 |
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
| 14 |
+
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]);
|
| 15 |
#else
|
| 16 |
+
data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)];
|
| 17 |
#endif
|
| 18 |
}
|
ggml/src/vulkan-shaders/cos.comp
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450
|
| 2 |
+
|
| 3 |
+
#include "types.comp"
|
| 4 |
+
#include "generic_unary_head.comp"
|
| 5 |
+
|
| 6 |
+
void main() {
|
| 7 |
+
const uint idx = get_idx();
|
| 8 |
+
|
| 9 |
+
if (idx >= p.ne) {
|
| 10 |
+
return;
|
| 11 |
+
}
|
| 12 |
+
|
| 13 |
+
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
|
| 14 |
+
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val));
|
| 15 |
+
}
|
ggml/src/vulkan-shaders/dequant_funcs.comp
CHANGED
|
@@ -58,3 +58,11 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
|
| 58 |
return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
|
| 59 |
}
|
| 60 |
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
|
| 59 |
}
|
| 60 |
#endif
|
| 61 |
+
|
| 62 |
+
#if defined(DATA_A_IQ4_NL)
|
| 63 |
+
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
| 64 |
+
const float d = float(data_a[a_offset + ib].d);
|
| 65 |
+
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
| 66 |
+
return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
|
| 67 |
+
}
|
| 68 |
+
#endif
|
ggml/src/vulkan-shaders/dequant_iq4_nl.comp
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450
|
| 2 |
+
|
| 3 |
+
#include "dequant_head.comp"
|
| 4 |
+
|
| 5 |
+
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
| 6 |
+
|
| 7 |
+
layout (binding = 0) readonly buffer A {block_iq4_nl data_a[];};
|
| 8 |
+
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
| 9 |
+
|
| 10 |
+
void main() {
|
| 11 |
+
const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
|
| 12 |
+
|
| 13 |
+
const uint tid = gl_LocalInvocationID.x % 64;
|
| 14 |
+
const uint il = tid/32;
|
| 15 |
+
const uint ir = tid%32;
|
| 16 |
+
const uint ib = 32*i + ir;
|
| 17 |
+
if (ib >= p.nel / 32) {
|
| 18 |
+
return;
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
const uint q_idx = 8*il;
|
| 22 |
+
const uint b_idx = 1024*i + 32*ir + q_idx;
|
| 23 |
+
|
| 24 |
+
const float d = float(data_a[ib].d);
|
| 25 |
+
|
| 26 |
+
[[unroll]] for (uint l = 0; l < 8; ++l) {
|
| 27 |
+
data_b[b_idx + l + 0] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]);
|
| 28 |
+
data_b[b_idx + l + 16] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >> 4]);
|
| 29 |
+
}
|
| 30 |
+
}
|
ggml/src/vulkan-shaders/dequant_q4_0.comp
CHANGED
|
@@ -18,15 +18,13 @@ void main() {
|
|
| 18 |
return;
|
| 19 |
}
|
| 20 |
|
| 21 |
-
const uint
|
|
|
|
| 22 |
|
| 23 |
const float d = float(data_a[ib].d);
|
| 24 |
-
const float dm = -8.0f * d;
|
| 25 |
-
|
| 26 |
-
const uint q_idx = 8*il;
|
| 27 |
|
| 28 |
[[unroll]] for (uint l = 0; l < 8; ++l) {
|
| 29 |
-
data_b[b_idx + l + 0] = D_TYPE(d * (data_a[ib].qs[q_idx + l] & 0xF)
|
| 30 |
-
data_b[b_idx + l + 16] = D_TYPE(d * (data_a[ib].qs[q_idx + l] >> 4)
|
| 31 |
}
|
| 32 |
}
|
|
|
|
| 18 |
return;
|
| 19 |
}
|
| 20 |
|
| 21 |
+
const uint q_idx = 8*il;
|
| 22 |
+
const uint b_idx = 1024*i + 32*ir + q_idx;
|
| 23 |
|
| 24 |
const float d = float(data_a[ib].d);
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
[[unroll]] for (uint l = 0; l < 8; ++l) {
|
| 27 |
+
data_b[b_idx + l + 0] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] & 0xF) - 8.0f));
|
| 28 |
+
data_b[b_idx + l + 16] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] >> 4) - 8.0f));
|
| 29 |
}
|
| 30 |
}
|
ggml/src/vulkan-shaders/div.comp
CHANGED
|
@@ -4,9 +4,11 @@
|
|
| 4 |
#include "generic_binary_head.comp"
|
| 5 |
|
| 6 |
void main() {
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
return;
|
| 9 |
}
|
| 10 |
|
| 11 |
-
data_d[p.d_offset + dst_idx(
|
| 12 |
}
|
|
|
|
| 4 |
#include "generic_binary_head.comp"
|
| 5 |
|
| 6 |
void main() {
|
| 7 |
+
const uint idx = get_idx();
|
| 8 |
+
|
| 9 |
+
if (idx >= p.ne) {
|
| 10 |
return;
|
| 11 |
}
|
| 12 |
|
| 13 |
+
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) / FLOAT_TYPE(data_b[src1_idx(idx)]));
|
| 14 |
}
|
ggml/src/vulkan-shaders/gelu.comp
CHANGED
|
@@ -13,7 +13,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
|
| 13 |
void main() {
|
| 14 |
const float GELU_COEF_A = 0.044715f;
|
| 15 |
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
| 16 |
-
const uint i = gl_GlobalInvocationID.x;
|
| 17 |
|
| 18 |
if (i >= p.KX) {
|
| 19 |
return;
|
|
|
|
| 13 |
void main() {
|
| 14 |
const float GELU_COEF_A = 0.044715f;
|
| 15 |
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
| 16 |
+
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
| 17 |
|
| 18 |
if (i >= p.KX) {
|
| 19 |
return;
|
ggml/src/vulkan-shaders/gelu_quick.comp
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450
|
| 2 |
+
|
| 3 |
+
#include "generic_head.comp"
|
| 4 |
+
#include "types.comp"
|
| 5 |
+
|
| 6 |
+
#extension GL_EXT_control_flow_attributes : enable
|
| 7 |
+
|
| 8 |
+
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
| 9 |
+
|
| 10 |
+
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
| 11 |
+
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
| 12 |
+
|
| 13 |
+
void main() {
|
| 14 |
+
const float GELU_QUICK_COEF = -1.702f;
|
| 15 |
+
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
| 16 |
+
|
| 17 |
+
if (i >= p.KX) {
|
| 18 |
+
return;
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
const float x = float(data_a[i]);
|
| 22 |
+
data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x))));
|
| 23 |
+
}
|
ggml/src/vulkan-shaders/generic_binary_head.comp
CHANGED
|
@@ -7,7 +7,7 @@ layout (push_constant) uniform parameter
|
|
| 7 |
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
|
| 8 |
uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
|
| 9 |
uint d_offset;
|
| 10 |
-
float param1; float param2;
|
| 11 |
} p;
|
| 12 |
|
| 13 |
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
|
@@ -16,6 +16,10 @@ layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
|
| 16 |
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
|
| 17 |
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
uint src0_idx(uint idx) {
|
| 20 |
const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
|
| 21 |
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
|
|
|
|
| 7 |
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
|
| 8 |
uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
|
| 9 |
uint d_offset;
|
| 10 |
+
float param1; float param2; int param3;
|
| 11 |
} p;
|
| 12 |
|
| 13 |
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
|
|
|
| 16 |
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
|
| 17 |
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
| 18 |
|
| 19 |
+
uint get_idx() {
|
| 20 |
+
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
uint src0_idx(uint idx) {
|
| 24 |
const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
|
| 25 |
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
|
ggml/src/vulkan-shaders/generic_unary_head.comp
CHANGED
|
@@ -14,6 +14,10 @@ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
|
| 14 |
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
| 15 |
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
uint src0_idx(uint idx) {
|
| 18 |
const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
|
| 19 |
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
|
|
|
|
| 14 |
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
| 15 |
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
| 16 |
|
| 17 |
+
uint get_idx() {
|
| 18 |
+
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
uint src0_idx(uint idx) {
|
| 22 |
const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
|
| 23 |
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
|
ggml/src/vulkan-shaders/group_norm.comp
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450
|
| 2 |
+
|
| 3 |
+
#include "generic_head.comp"
|
| 4 |
+
#include "types.comp"
|
| 5 |
+
|
| 6 |
+
#extension GL_EXT_control_flow_attributes : enable
|
| 7 |
+
#define BLOCK_SIZE 512
|
| 8 |
+
|
| 9 |
+
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
| 10 |
+
|
| 11 |
+
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
| 12 |
+
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
| 13 |
+
|
| 14 |
+
shared float tmp[BLOCK_SIZE];
|
| 15 |
+
|
| 16 |
+
void main() {
|
| 17 |
+
const uint group_size = p.KX;
|
| 18 |
+
const float eps = p.param1;
|
| 19 |
+
|
| 20 |
+
const uint tid = gl_LocalInvocationID.x;
|
| 21 |
+
const uint start = gl_WorkGroupID.x * group_size + tid;
|
| 22 |
+
const uint end = start + group_size;
|
| 23 |
+
|
| 24 |
+
tmp[tid] = 0.0f;
|
| 25 |
+
|
| 26 |
+
// Calculate mean
|
| 27 |
+
[[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
|
| 28 |
+
tmp[tid] += float(data_a[col]);
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
// sum up partial sums and write back result
|
| 32 |
+
barrier();
|
| 33 |
+
[[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
|
| 34 |
+
if (tid < s) {
|
| 35 |
+
tmp[tid] += tmp[tid + s];
|
| 36 |
+
}
|
| 37 |
+
barrier();
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
const float mean = tmp[0] / group_size;
|
| 41 |
+
barrier();
|
| 42 |
+
tmp[tid] = 0.0f;
|
| 43 |
+
|
| 44 |
+
// Calculate variance
|
| 45 |
+
[[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
|
| 46 |
+
const float xi = float(data_a[col]) - mean;
|
| 47 |
+
data_d[col] = D_TYPE(xi);
|
| 48 |
+
tmp[tid] += xi * xi;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
// sum up partial sums and write back result
|
| 52 |
+
barrier();
|
| 53 |
+
[[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
|
| 54 |
+
if (tid < s) {
|
| 55 |
+
tmp[tid] += tmp[tid + s];
|
| 56 |
+
}
|
| 57 |
+
barrier();
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
const float variance = tmp[0] / group_size;
|
| 61 |
+
const float scale = inversesqrt(variance + eps);
|
| 62 |
+
|
| 63 |
+
[[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
|
| 64 |
+
data_d[col] *= D_TYPE(scale);
|
| 65 |
+
}
|
| 66 |
+
}
|
ggml/src/vulkan-shaders/im2col.comp
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450
|
| 2 |
+
|
| 3 |
+
#extension GL_EXT_shader_16bit_storage : require
|
| 4 |
+
|
| 5 |
+
layout (push_constant) uniform parameter
|
| 6 |
+
{
|
| 7 |
+
uint batch_offset; uint offset_delta;
|
| 8 |
+
uint IC;
|
| 9 |
+
uint IW; uint IH;
|
| 10 |
+
uint OW; uint OH;
|
| 11 |
+
uint KW; uint KH;
|
| 12 |
+
uint pelements;
|
| 13 |
+
uint CHW;
|
| 14 |
+
int s0; int s1;
|
| 15 |
+
int p0; int p1;
|
| 16 |
+
int d0; int d1;
|
| 17 |
+
} p;
|
| 18 |
+
|
| 19 |
+
#include "types.comp"
|
| 20 |
+
|
| 21 |
+
#define BLOCK_SIZE 256
|
| 22 |
+
|
| 23 |
+
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
| 24 |
+
|
| 25 |
+
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
| 26 |
+
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
| 27 |
+
|
| 28 |
+
void main() {
|
| 29 |
+
const uint i = gl_GlobalInvocationID.x;
|
| 30 |
+
if (i >= p.pelements) {
|
| 31 |
+
return;
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
|
| 35 |
+
const uint kx = i / ksize;
|
| 36 |
+
const uint kd = kx * ksize;
|
| 37 |
+
const uint ky = (i - kd) / p.OW;
|
| 38 |
+
const uint ix = i % p.OW;
|
| 39 |
+
|
| 40 |
+
const uint oh = gl_GlobalInvocationID.y;
|
| 41 |
+
const uint batch = gl_GlobalInvocationID.z / p.IC;
|
| 42 |
+
const uint ic = gl_GlobalInvocationID.z % p.IC;
|
| 43 |
+
|
| 44 |
+
const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
|
| 45 |
+
const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
|
| 46 |
+
|
| 47 |
+
const uint offset_dst =
|
| 48 |
+
((batch * p.OH + oh) * p.OW + ix) * p.CHW +
|
| 49 |
+
(ic * (p.KW * p.KH) + ky * p.KW + kx);
|
| 50 |
+
|
| 51 |
+
if (iih < 0 || iih >= p.IH || iiw < 0 || iiw >= p.IW) {
|
| 52 |
+
data_d[offset_dst] = D_TYPE(0.0f);
|
| 53 |
+
} else {
|
| 54 |
+
const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
|
| 55 |
+
data_d[offset_dst] = D_TYPE(data_a[offset_src + iih * p.IW + iiw]);
|
| 56 |
+
}
|
| 57 |
+
}
|
ggml/src/vulkan-shaders/leaky_relu.comp
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450
|
| 2 |
+
|
| 3 |
+
#include "generic_head.comp"
|
| 4 |
+
#include "types.comp"
|
| 5 |
+
|
| 6 |
+
#extension GL_EXT_control_flow_attributes : enable
|
| 7 |
+
|
| 8 |
+
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
| 9 |
+
|
| 10 |
+
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
| 11 |
+
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
| 12 |
+
|
| 13 |
+
void main() {
|
| 14 |
+
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
| 15 |
+
|
| 16 |
+
if (i >= p.KX) {
|
| 17 |
+
return;
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
const float val = float(data_a[i]);
|
| 21 |
+
data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1);
|
| 22 |
+
}
|
ggml/src/vulkan-shaders/mul.comp
CHANGED
|
@@ -4,9 +4,11 @@
|
|
| 4 |
#include "generic_binary_head.comp"
|
| 5 |
|
| 6 |
void main() {
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
return;
|
| 9 |
}
|
| 10 |
|
| 11 |
-
data_d[p.d_offset + dst_idx(
|
| 12 |
}
|
|
|
|
| 4 |
#include "generic_binary_head.comp"
|
| 5 |
|
| 6 |
void main() {
|
| 7 |
+
const uint idx = get_idx();
|
| 8 |
+
|
| 9 |
+
if (idx >= p.ne) {
|
| 10 |
return;
|
| 11 |
}
|
| 12 |
|
| 13 |
+
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(data_b[src1_idx(idx)]));
|
| 14 |
}
|
ggml/src/vulkan-shaders/mul_mat_vec.comp
CHANGED
|
@@ -16,6 +16,13 @@ void main() {
|
|
| 16 |
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
| 17 |
const uint tid = gl_LocalInvocationID.x;
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
uint a_offset, b_offset, d_offset;
|
| 20 |
get_offsets(a_offset, b_offset, d_offset);
|
| 21 |
|
|
@@ -23,8 +30,8 @@ void main() {
|
|
| 23 |
|
| 24 |
tmp[tid] = FLOAT_TYPE(0.0f);
|
| 25 |
|
| 26 |
-
[[unroll]] for (uint i = 0; i < p.ncols/
|
| 27 |
-
const uint col = i*
|
| 28 |
const uint ib = (row*p.ncols + col)/QUANT_K; // block index
|
| 29 |
const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
|
| 30 |
const uint iybs = col - col%QUANT_K; // y block start index
|
|
@@ -38,7 +45,7 @@ void main() {
|
|
| 38 |
|
| 39 |
// sum up partial sums and write back result
|
| 40 |
barrier();
|
| 41 |
-
[[unroll]] for (uint s =
|
| 42 |
if (tid < s) {
|
| 43 |
tmp[tid] += tmp[tid + s];
|
| 44 |
}
|
|
|
|
| 16 |
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
| 17 |
const uint tid = gl_LocalInvocationID.x;
|
| 18 |
|
| 19 |
+
// There are not enough cols to use all threads
|
| 20 |
+
if (tid >= p.ncols) {
|
| 21 |
+
return;
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
const uint block_size = min(p.ncols, BLOCK_SIZE);
|
| 25 |
+
|
| 26 |
uint a_offset, b_offset, d_offset;
|
| 27 |
get_offsets(a_offset, b_offset, d_offset);
|
| 28 |
|
|
|
|
| 30 |
|
| 31 |
tmp[tid] = FLOAT_TYPE(0.0f);
|
| 32 |
|
| 33 |
+
[[unroll]] for (uint i = 0; i < p.ncols/block_size; i += 2) {
|
| 34 |
+
const uint col = i*block_size + 2*tid;
|
| 35 |
const uint ib = (row*p.ncols + col)/QUANT_K; // block index
|
| 36 |
const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
|
| 37 |
const uint iybs = col - col%QUANT_K; // y block start index
|
|
|
|
| 45 |
|
| 46 |
// sum up partial sums and write back result
|
| 47 |
barrier();
|
| 48 |
+
[[unroll]] for (uint s = block_size/2; s > 0; s >>= 1) {
|
| 49 |
if (tid < s) {
|
| 50 |
tmp[tid] += tmp[tid + s];
|
| 51 |
}
|
ggml/src/vulkan-shaders/mul_mm.comp
CHANGED
|
@@ -71,7 +71,7 @@ shared FLOAT_TYPE buf_a[BM * (BK+1)];
|
|
| 71 |
shared FLOAT_TYPE buf_b[BN * (BK+1)];
|
| 72 |
|
| 73 |
#ifdef MUL_MAT_ID
|
| 74 |
-
shared u16vec2 row_ids[
|
| 75 |
#endif
|
| 76 |
|
| 77 |
void main() {
|
|
@@ -380,6 +380,19 @@ void main() {
|
|
| 380 |
|
| 381 |
buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32));
|
| 382 |
buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
#endif
|
| 384 |
}
|
| 385 |
[[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
|
|
|
|
| 71 |
shared FLOAT_TYPE buf_b[BN * (BK+1)];
|
| 72 |
|
| 73 |
#ifdef MUL_MAT_ID
|
| 74 |
+
shared u16vec2 row_ids[3072];
|
| 75 |
#endif
|
| 76 |
|
| 77 |
void main() {
|
|
|
|
| 380 |
|
| 381 |
buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32));
|
| 382 |
buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
|
| 383 |
+
#elif defined(DATA_A_IQ4_NL)
|
| 384 |
+
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
| 385 |
+
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a;
|
| 386 |
+
|
| 387 |
+
const uint ib = idx / 16;
|
| 388 |
+
const uint iqs = idx & 0xF;
|
| 389 |
+
|
| 390 |
+
const float d = float(data_a[ib].d);
|
| 391 |
+
const uint vui = uint(data_a[ib].qs[iqs]);
|
| 392 |
+
const vec2 v = vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
|
| 393 |
+
|
| 394 |
+
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
| 395 |
+
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
|
| 396 |
#endif
|
| 397 |
}
|
| 398 |
[[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
|
ggml/src/vulkan-shaders/norm.comp
CHANGED
|
@@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
|
| 14 |
shared vec2 sum[BLOCK_SIZE];
|
| 15 |
|
| 16 |
void main() {
|
| 17 |
-
const uint row = gl_WorkGroupID.x;
|
| 18 |
const uint tid = gl_LocalInvocationID.x;
|
| 19 |
|
| 20 |
sum[tid] = vec2(0.0f, 0.0f);
|
|
|
|
| 14 |
shared vec2 sum[BLOCK_SIZE];
|
| 15 |
|
| 16 |
void main() {
|
| 17 |
+
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
|
| 18 |
const uint tid = gl_LocalInvocationID.x;
|
| 19 |
|
| 20 |
sum[tid] = vec2(0.0f, 0.0f);
|
ggml/src/vulkan-shaders/pad.comp
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450
|
| 2 |
+
|
| 3 |
+
#include "types.comp"
|
| 4 |
+
#include "generic_unary_head.comp"
|
| 5 |
+
|
| 6 |
+
void main() {
|
| 7 |
+
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
| 8 |
+
|
| 9 |
+
if (idx >= p.ne) {
|
| 10 |
+
return;
|
| 11 |
+
}
|
| 12 |
+
|
| 13 |
+
const uint i3 = idx / (p.ne12*p.ne11*p.ne10);
|
| 14 |
+
const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10;
|
| 15 |
+
const uint i2 = (idx - i3_offset) / (p.ne11*p.ne10);
|
| 16 |
+
const uint i2_offset = i2*p.ne11*p.ne10;
|
| 17 |
+
const uint i1 = (idx - i3_offset - i2_offset) / p.ne10;
|
| 18 |
+
const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10;
|
| 19 |
+
|
| 20 |
+
const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
|
| 21 |
+
const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10;
|
| 22 |
+
|
| 23 |
+
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
|
| 24 |
+
|
| 25 |
+
data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f);
|
| 26 |
+
}
|
ggml/src/vulkan-shaders/relu.comp
CHANGED
|
@@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
|
| 11 |
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
| 12 |
|
| 13 |
void main() {
|
| 14 |
-
const uint i = gl_GlobalInvocationID.x;
|
| 15 |
|
| 16 |
if (i >= p.KX) {
|
| 17 |
return;
|
|
|
|
| 11 |
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
| 12 |
|
| 13 |
void main() {
|
| 14 |
+
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
| 15 |
|
| 16 |
if (i >= p.KX) {
|
| 17 |
return;
|
ggml/src/vulkan-shaders/rms_norm.comp
CHANGED
|
@@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
|
| 14 |
shared FLOAT_TYPE sum[BLOCK_SIZE];
|
| 15 |
|
| 16 |
void main() {
|
| 17 |
-
const uint row = gl_WorkGroupID.x;
|
| 18 |
const uint tid = gl_LocalInvocationID.x;
|
| 19 |
|
| 20 |
sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp
|
|
|
|
| 14 |
shared FLOAT_TYPE sum[BLOCK_SIZE];
|
| 15 |
|
| 16 |
void main() {
|
| 17 |
+
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
|
| 18 |
const uint tid = gl_LocalInvocationID.x;
|
| 19 |
|
| 20 |
sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp
|
ggml/src/vulkan-shaders/scale.comp
CHANGED
|
@@ -4,9 +4,11 @@
|
|
| 4 |
#include "generic_unary_head.comp"
|
| 5 |
|
| 6 |
void main() {
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
return;
|
| 9 |
}
|
| 10 |
|
| 11 |
-
data_d[p.d_offset + dst_idx(
|
| 12 |
}
|
|
|
|
| 4 |
#include "generic_unary_head.comp"
|
| 5 |
|
| 6 |
void main() {
|
| 7 |
+
const uint idx = get_idx();
|
| 8 |
+
|
| 9 |
+
if (idx >= p.ne) {
|
| 10 |
return;
|
| 11 |
}
|
| 12 |
|
| 13 |
+
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1));
|
| 14 |
}
|
ggml/src/vulkan-shaders/silu.comp
CHANGED
|
@@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
|
| 11 |
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
| 12 |
|
| 13 |
void main() {
|
| 14 |
-
const uint i = gl_GlobalInvocationID.x;
|
| 15 |
|
| 16 |
if (i >= p.KX) {
|
| 17 |
return;
|
|
|
|
| 11 |
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
| 12 |
|
| 13 |
void main() {
|
| 14 |
+
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
| 15 |
|
| 16 |
if (i >= p.KX) {
|
| 17 |
return;
|
ggml/src/vulkan-shaders/sin.comp
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450
|
| 2 |
+
|
| 3 |
+
#include "types.comp"
|
| 4 |
+
#include "generic_unary_head.comp"
|
| 5 |
+
|
| 6 |
+
void main() {
|
| 7 |
+
const uint idx = get_idx();
|
| 8 |
+
|
| 9 |
+
if (idx >= p.ne) {
|
| 10 |
+
return;
|
| 11 |
+
}
|
| 12 |
+
|
| 13 |
+
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
|
| 14 |
+
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val));
|
| 15 |
+
}
|
ggml/src/vulkan-shaders/soft_max.comp
CHANGED
|
@@ -28,7 +28,7 @@ shared FLOAT_TYPE vals[BLOCK_SIZE];
|
|
| 28 |
|
| 29 |
void main() {
|
| 30 |
const uint tid = gl_LocalInvocationID.x;
|
| 31 |
-
const uint rowx = gl_WorkGroupID.x;
|
| 32 |
const uint rowy = rowx % p.KY;
|
| 33 |
|
| 34 |
float slope = 1.0f;
|
|
|
|
| 28 |
|
| 29 |
void main() {
|
| 30 |
const uint tid = gl_LocalInvocationID.x;
|
| 31 |
+
const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
|
| 32 |
const uint rowy = rowx % p.KY;
|
| 33 |
|
| 34 |
float slope = 1.0f;
|
ggml/src/vulkan-shaders/square.comp
CHANGED
|
@@ -4,10 +4,12 @@
|
|
| 4 |
#include "generic_unary_head.comp"
|
| 5 |
|
| 6 |
void main() {
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
return;
|
| 9 |
}
|
| 10 |
|
| 11 |
-
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(
|
| 12 |
-
data_d[p.d_offset + dst_idx(
|
| 13 |
}
|
|
|
|
| 4 |
#include "generic_unary_head.comp"
|
| 5 |
|
| 6 |
void main() {
|
| 7 |
+
const uint idx = get_idx();
|
| 8 |
+
|
| 9 |
+
if (idx >= p.ne) {
|
| 10 |
return;
|
| 11 |
}
|
| 12 |
|
| 13 |
+
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
|
| 14 |
+
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val);
|
| 15 |
}
|
ggml/src/vulkan-shaders/sum_rows.comp
CHANGED
|
@@ -14,7 +14,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
|
| 14 |
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
| 15 |
|
| 16 |
void main() {
|
| 17 |
-
const uint row = gl_WorkGroupID.x;
|
| 18 |
const uint col = gl_LocalInvocationID.x;
|
| 19 |
|
| 20 |
tmp[col] = FLOAT_TYPE(0.0f);
|
|
|
|
| 14 |
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
| 15 |
|
| 16 |
void main() {
|
| 17 |
+
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
|
| 18 |
const uint col = gl_LocalInvocationID.x;
|
| 19 |
|
| 20 |
tmp[col] = FLOAT_TYPE(0.0f);
|
ggml/src/vulkan-shaders/tanh.comp
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450

#include "generic_head.comp"
#include "types.comp"

#extension GL_EXT_control_flow_attributes : enable

layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

// Element-wise hyperbolic tangent over the first p.KX elements of data_a.
void main() {
    // Flatten the 3D invocation id into one linear element index
    // (y advances by 512 elements, z by 262144).
    const uint elem = gl_GlobalInvocationID.x
                    + gl_GlobalInvocationID.y * 512
                    + gl_GlobalInvocationID.z * 262144;

    if (elem < p.KX) {
        data_d[elem] = D_TYPE(tanh(data_a[elem]));
    }
}
|
ggml/src/vulkan-shaders/timestep_embedding.comp
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450

#extension GL_EXT_shader_16bit_storage : require

layout (push_constant) uniform parameter
{
    uint nb1;         // offset (in data_d elements) between consecutive output rows
    uint dim;         // embedding width requested per timestep
    uint max_period;  // base of the frequency schedule
} p;

#include "types.comp"

#extension GL_EXT_control_flow_attributes : enable
#define BLOCK_SIZE 256

layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

// Sinusoidal timestep embedding.
// Row `row` of the output (one per workgroup in y) holds, for timestep data_a[row]:
//   [0, dim/2)       cos(timestep * freq_k)
//   [dim/2, 2*dim/2) sin(timestep * freq_k)
// with freq_k = exp(-log(max_period) * k / (dim/2)).
void main() {
    const uint row      = gl_WorkGroupID.y;
    const uint k        = gl_GlobalInvocationID.x;
    const uint row_base = row * p.nb1;

    // For an odd dim, the invocation with k == (dim + 1) / 2 zeroes the slot at index dim.
    const bool odd_dim = (p.dim % 2) != 0;
    if (odd_dim && k == ((p.dim + 1) / 2)) {
        data_d[row_base + p.dim] = 0.f;
    }

    const uint half_dim = p.dim / 2;
    if (k < half_dim) {
        const float timestep = float(data_a[row]);
        const float freq     = float(exp(-log(p.max_period) * k / half_dim));
        const float angle    = timestep * freq;

        data_d[row_base + k]            = D_TYPE(cos(angle));
        data_d[row_base + k + half_dim] = D_TYPE(sin(angle));
    }
}
|
ggml/src/vulkan-shaders/types.comp
CHANGED
|
@@ -6,7 +6,7 @@
|
|
| 6 |
#define QUANT_K 1
|
| 7 |
#define QUANT_R 1
|
| 8 |
|
| 9 |
-
#
|
| 10 |
#define A_TYPE float
|
| 11 |
#elif LOAD_VEC_A == 4
|
| 12 |
#define A_TYPE vec4
|
|
@@ -19,7 +19,7 @@
|
|
| 19 |
#define QUANT_K 1
|
| 20 |
#define QUANT_R 1
|
| 21 |
|
| 22 |
-
#
|
| 23 |
#define A_TYPE float16_t
|
| 24 |
#elif LOAD_VEC_A == 4
|
| 25 |
#define A_TYPE f16vec4
|
|
@@ -177,3 +177,24 @@ struct block_q6_K
|
|
| 177 |
|
| 178 |
#define A_TYPE block_q6_K
|
| 179 |
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
#define QUANT_K 1
|
| 7 |
#define QUANT_R 1
|
| 8 |
|
| 9 |
+
#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
|
| 10 |
#define A_TYPE float
|
| 11 |
#elif LOAD_VEC_A == 4
|
| 12 |
#define A_TYPE vec4
|
|
|
|
| 19 |
#define QUANT_K 1
|
| 20 |
#define QUANT_R 1
|
| 21 |
|
| 22 |
+
#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
|
| 23 |
#define A_TYPE float16_t
|
| 24 |
#elif LOAD_VEC_A == 4
|
| 25 |
#define A_TYPE f16vec4
|
|
|
|
| 177 |
|
| 178 |
#define A_TYPE block_q6_K
|
| 179 |
#endif
|
| 180 |
+
|
| 181 |
+
// IQuants
|
| 182 |
+
|
| 183 |
+
#if defined(DATA_A_IQ4_NL)
#extension GL_EXT_shader_16bit_storage : require

// Block geometry used by shaders compiled with DATA_A_IQ4_NL.
#define QUANT_K 32
#define QUANT_R 2

// One IQ4_NL block: a half-precision value `d` followed by QUANT_K/2 bytes.
// NOTE(review): presumably each byte of `qs` packs two 4-bit indices into
// kvalues_iq4nl and `d` is the block scale - confirm against the dequant shader.
struct block_iq4_nl
{
    float16_t d;
    uint8_t   qs[QUANT_K/2];
};

#define A_TYPE block_iq4_nl

// 16-entry signed lookup table for IQ4_NL.
const int8_t kvalues_iq4nl[16] = {
    int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65),
    int8_t(-49),  int8_t(-35),  int8_t(-22), int8_t(-10),
    int8_t(1),    int8_t(13),   int8_t(25),  int8_t(38),
    int8_t(53),   int8_t(69),   int8_t(89),  int8_t(113)
};
#endif
|
ggml/src/vulkan-shaders/upscale.comp
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450

layout (push_constant) uniform parameter
{
    uint ne; uint d_offset;
    uint nb00; uint nb01; uint nb02; uint nb03;
    uint ne10; uint ne11; uint ne12; uint ne13;
    float sf0; float sf1; float sf2; float sf3;
} p;

#include "types.comp"

layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

// Upscale by index division: each destination element copies the source
// element whose coordinate on every axis is the destination coordinate divided
// by that axis' scale factor (sf0..sf3), truncated to an unsigned integer.
void main() {
    // Flatten the 3D invocation id into one linear destination index.
    const uint dst = gl_GlobalInvocationID.x
                   + gl_GlobalInvocationID.y * 512
                   + gl_GlobalInvocationID.z * 262144;

    if (dst >= p.ne) {
        return;
    }

    // Split the linear index into 4D destination coordinates (ne10 is the fastest axis).
    const uint plane  = p.ne10 * p.ne11;
    const uint volume = p.ne10 * p.ne11 * p.ne12;

    const uint d0 = dst % p.ne10;
    const uint d1 = (dst / p.ne10) % p.ne11;
    const uint d2 = (dst / plane) % p.ne12;
    const uint d3 = (dst / volume) % p.ne13;

    // Map each destination coordinate back to its source coordinate.
    const uint s0 = uint(d0 / p.sf0);
    const uint s1 = uint(d1 / p.sf1);
    const uint s2 = uint(d2 / p.sf2);
    const uint s3 = uint(d3 / p.sf3);

    const uint src = s3 * p.nb03 + s2 * p.nb02 + s1 * p.nb01 + s0 * p.nb00;
    data_d[p.d_offset + dst] = D_TYPE(data_a[src]);
}
|
ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
ADDED
|
@@ -0,0 +1,587 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
#include <iostream>
|
| 4 |
+
#include <fstream>
|
| 5 |
+
#include <sstream>
|
| 6 |
+
#include <string>
|
| 7 |
+
#include <stdexcept>
|
| 8 |
+
#include <array>
|
| 9 |
+
#include <vector>
|
| 10 |
+
#include <map>
|
| 11 |
+
#include <thread>
|
| 12 |
+
#include <mutex>
|
| 13 |
+
#include <future>
|
| 14 |
+
#include <queue>
|
| 15 |
+
#include <condition_variable>
|
| 16 |
+
#include <cstdio>
|
| 17 |
+
#include <cstring>
|
| 18 |
+
#include <cstdlib>
|
| 19 |
+
#include <sys/stat.h>
|
| 20 |
+
#include <sys/types.h>
|
| 21 |
+
|
| 22 |
+
#ifdef _WIN32
|
| 23 |
+
#include <windows.h>
|
| 24 |
+
#include <direct.h> // For _mkdir on Windows
|
| 25 |
+
#include <algorithm> // For std::replace on w64devkit
|
| 26 |
+
#else
|
| 27 |
+
#include <unistd.h>
|
| 28 |
+
#include <sys/wait.h>
|
| 29 |
+
#include <fcntl.h>
|
| 30 |
+
#endif
|
| 31 |
+
|
| 32 |
+
// NOTE(review): not referenced by any code visible in this part of the file - confirm it is still needed.
#define ASYNCIO_CONCURRENCY 64

// Guards shader_fnames, which is appended to from the asynchronous compile tasks.
std::mutex lock;
// (shader name, path of its compiled .spv file) for every shader that compiled.
std::vector<std::pair<std::string, std::string>> shader_fnames;

// Tool configuration with its default values.
std::string GLSLC{"glslc"};                         // compiler executable
std::string input_dir{"vulkan-shaders"};            // directory holding the .comp sources
std::string output_dir{"/tmp"};                     // directory receiving the .spv files
std::string target_hpp{"ggml-vulkan-shaders.hpp"};  // generated header
std::string target_cpp{"ggml-vulkan-shaders.cpp"};  // generated source
bool no_clean{false};                               // keep the .spv files after embedding them

// Tensor data types for which per-type shader variants are generated.
const std::vector<std::string> type_names{
    "f32",  "f16",
    "q4_0", "q4_1", "q5_0", "q5_1", "q8_0",
    "q2_k", "q3_k", "q4_k", "q5_k", "q6_k",
    "iq4_nl",
};
|
| 59 |
+
|
| 60 |
+
// Runs `command` to completion and appends everything it wrote to its standard
// output and standard error to `stdout_str` and `stderr_str` respectively.
//
// On Windows the command line is handed to CreateProcessA; elsewhere it is run
// through `/bin/sh -c`. The exit status of the child is not reported.
//
// Both streams are drained concurrently (stderr on a helper thread). Reading
// them one after the other can deadlock: a child that fills the stderr pipe
// blocks in write() while the parent is still waiting for EOF on stdout.
//
// Throws std::runtime_error if a pipe or the child process cannot be created;
// every pipe end opened so far is closed before throwing.
void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str) {
#ifdef _WIN32
    HANDLE stdout_read, stdout_write;
    HANDLE stderr_read, stderr_write;
    SECURITY_ATTRIBUTES sa = { sizeof(SECURITY_ATTRIBUTES), NULL, TRUE };

    if (!CreatePipe(&stdout_read, &stdout_write, &sa, 0)) {
        throw std::runtime_error("Failed to create stdout pipe");
    }
    // The read ends stay in the parent only; the child must not inherit them.
    if (!SetHandleInformation(stdout_read, HANDLE_FLAG_INHERIT, 0)) {
        CloseHandle(stdout_read);
        CloseHandle(stdout_write);
        throw std::runtime_error("Failed to create stdout pipe");
    }

    if (!CreatePipe(&stderr_read, &stderr_write, &sa, 0)) {
        CloseHandle(stdout_read);
        CloseHandle(stdout_write);
        throw std::runtime_error("Failed to create stderr pipe");
    }
    if (!SetHandleInformation(stderr_read, HANDLE_FLAG_INHERIT, 0)) {
        CloseHandle(stdout_read);
        CloseHandle(stdout_write);
        CloseHandle(stderr_read);
        CloseHandle(stderr_write);
        throw std::runtime_error("Failed to create stderr pipe");
    }

    PROCESS_INFORMATION pi;
    STARTUPINFOA si = { sizeof(STARTUPINFOA) };
    si.dwFlags = STARTF_USESTDHANDLES;
    si.hStdOutput = stdout_write;
    si.hStdError = stderr_write;

    // CreateProcessA needs a mutable, NUL-terminated command line.
    std::vector<char> cmd(command.begin(), command.end());
    cmd.push_back('\0');

    if (!CreateProcessA(NULL, cmd.data(), NULL, NULL, TRUE, 0, NULL, NULL, &si, &pi)) {
        CloseHandle(stdout_read);
        CloseHandle(stdout_write);
        CloseHandle(stderr_read);
        CloseHandle(stderr_write);
        throw std::runtime_error("Failed to create process");
    }

    // Drop the parent's copies of the write ends so the reads below see EOF
    // once the child exits.
    CloseHandle(stdout_write);
    CloseHandle(stderr_write);

    const auto drain = [](HANDLE pipe, std::string& sink) {
        std::array<char, 128> buffer;
        DWORD bytes_read;
        while (ReadFile(pipe, buffer.data(), static_cast<DWORD>(buffer.size()), &bytes_read, NULL) && bytes_read > 0) {
            sink.append(buffer.data(), bytes_read);
        }
    };

    std::thread stderr_reader([&] { drain(stderr_read, stderr_str); });
    drain(stdout_read, stdout_str);
    stderr_reader.join();

    CloseHandle(stdout_read);
    CloseHandle(stderr_read);
    WaitForSingleObject(pi.hProcess, INFINITE);
    CloseHandle(pi.hProcess);
    CloseHandle(pi.hThread);
#else
    int stdout_pipe[2];
    int stderr_pipe[2];

    if (pipe(stdout_pipe) != 0) {
        throw std::runtime_error("Failed to create pipes");
    }
    if (pipe(stderr_pipe) != 0) {
        close(stdout_pipe[0]);
        close(stdout_pipe[1]);
        throw std::runtime_error("Failed to create pipes");
    }

    pid_t pid = fork();
    if (pid < 0) {
        close(stdout_pipe[0]);
        close(stdout_pipe[1]);
        close(stderr_pipe[0]);
        close(stderr_pipe[1]);
        throw std::runtime_error("Failed to fork process");
    }

    if (pid == 0) {
        // Child: route stdout/stderr into the pipes, then become the shell.
        close(stdout_pipe[0]);
        close(stderr_pipe[0]);
        dup2(stdout_pipe[1], STDOUT_FILENO);
        dup2(stderr_pipe[1], STDERR_FILENO);
        close(stdout_pipe[1]);
        close(stderr_pipe[1]);
        execl("/bin/sh", "sh", "-c", command.c_str(), (char*) nullptr);
        // Only reached when execl itself failed.
        _exit(EXIT_FAILURE);
    }

    // Parent: close the write ends so the reads below see EOF once the child exits.
    close(stdout_pipe[1]);
    close(stderr_pipe[1]);

    const auto drain = [](int fd, std::string& sink) {
        std::array<char, 128> buffer;
        ssize_t bytes_read;
        while ((bytes_read = read(fd, buffer.data(), buffer.size())) > 0) {
            sink.append(buffer.data(), bytes_read);
        }
    };

    std::thread stderr_reader([&] { drain(stderr_pipe[0], stderr_str); });
    drain(stdout_pipe[0], stdout_str);
    stderr_reader.join();

    close(stdout_pipe[0]);
    close(stderr_pipe[0]);
    waitpid(pid, nullptr, 0);
#endif
}
|
| 151 |
+
|
| 152 |
+
// Returns true when `path` can be stat()ed and refers to a directory.
// Returns false when stat() fails (missing path, no access) or when the path
// is any other kind of file.
bool directory_exists(const std::string& path) {
    struct stat info;
    if (stat(path.c_str(), &info) != 0) {
        return false; // Path doesn't exist or can't be accessed
    }
    // Compare the whole file-type field: testing the S_IFDIR bit alone also
    // matches file types whose code contains that bit (e.g. sockets and block
    // devices on Linux).
    return (info.st_mode & S_IFMT) == S_IFDIR;
}
|
| 159 |
+
|
| 160 |
+
// Creates the single directory `path` (parents are not created).
// Returns true when the directory was created or the call failed with EEXIST,
// false for every other failure.
bool create_directory(const std::string& path) {
#ifdef _WIN32
    const int rc = _mkdir(path.c_str());
#else
    const int rc = mkdir(path.c_str(), 0755); // rwxr-xr-x
#endif
    if (rc == 0) {
        return true;
    }
    // An already existing path is treated as success.
    return errno == EEXIST;
}
|
| 167 |
+
|
| 168 |
+
// Returns a copy of `input` with every character mapped through std::toupper.
std::string to_uppercase(const std::string& input) {
    std::string result = input;
    for (char& c : result) {
        // std::toupper is only defined for values representable as unsigned
        // char (or EOF); a negative plain char must be converted first.
        c = static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
    }
    return result;
}
|
| 175 |
+
|
| 176 |
+
// Returns true when `str` ends with `suffix`. An empty suffix always matches.
bool string_ends_with(const std::string& str, const std::string& suffix) {
    if (suffix.size() > str.size()) {
        return false;
    }
    // std::string::compare keeps this independent of <algorithm>, which this
    // file only includes on Windows.
    return str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
}
|
| 182 |
+
|
| 183 |
+
// Separator inserted between path components ('/' on every platform).
static const char path_separator = '/';

// Concatenates `path1`, the separator and `path2`. No normalisation is done:
// the separator is always inserted, even if `path1` is empty or already ends
// with one.
std::string join_paths(const std::string& path1, const std::string& path2) {
    std::string joined;
    joined.reserve(path1.size() + 1 + path2.size());
    joined += path1;
    joined += path_separator;
    joined += path2;
    return joined;
}
|
| 188 |
+
|
| 189 |
+
// Returns the part of `path` after its last '/' or '\\'; the whole string when
// it contains neither, and an empty string when it ends with a separator.
std::string basename(const std::string &path) {
    const std::string::size_type last_sep = path.find_last_of("/\\");
    if (last_sep == std::string::npos) {
        return path;
    }
    return path.substr(last_sep + 1);
}
|
| 192 |
+
|
| 193 |
+
void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true) {
|
| 194 |
+
std::string name = _name + (fp16 ? "" : "_fp32");
|
| 195 |
+
std::string out_fname = join_paths(output_dir, name + ".spv");
|
| 196 |
+
std::string in_path = join_paths(input_dir, in_fname);
|
| 197 |
+
|
| 198 |
+
#ifdef _WIN32
|
| 199 |
+
std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""};
|
| 200 |
+
#else
|
| 201 |
+
std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname};
|
| 202 |
+
#endif
|
| 203 |
+
for (const auto& define : defines) {
|
| 204 |
+
cmd.push_back("-D" + define.first + "=" + define.second);
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
std::string command;
|
| 208 |
+
for (const auto& part : cmd) {
|
| 209 |
+
command += part + " ";
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
std::string stdout_str, stderr_str;
|
| 213 |
+
try {
|
| 214 |
+
// std::cout << "Executing command: ";
|
| 215 |
+
// for (const auto& part : cmd) {
|
| 216 |
+
// std::cout << part << " ";
|
| 217 |
+
// }
|
| 218 |
+
// std::cout << std::endl;
|
| 219 |
+
|
| 220 |
+
execute_command(command, stdout_str, stderr_str);
|
| 221 |
+
if (!stderr_str.empty()) {
|
| 222 |
+
std::cerr << "cannot compile " << name << "\n\n" << command << "\n\n" << stderr_str << std::endl;
|
| 223 |
+
return;
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
std::lock_guard<std::mutex> guard(lock);
|
| 227 |
+
shader_fnames.push_back(std::make_pair(name, out_fname));
|
| 228 |
+
} catch (const std::exception& e) {
|
| 229 |
+
std::cerr << "Error executing command for " << name << ": " << e.what() << std::endl;
|
| 230 |
+
}
|
| 231 |
+
}
|
| 232 |
+
|
| 233 |
+
// Returns the union of `a` and `b`. When a key is present in both maps the
// value from `a` is kept.
std::map<std::string, std::string> merge_maps(const std::map<std::string, std::string>& a, const std::map<std::string, std::string>& b) {
    std::map<std::string, std::string> merged(a);
    for (const auto& entry : b) {
        // emplace leaves an existing key untouched, so `a` takes precedence.
        merged.emplace(entry.first, entry.second);
    }
    return merged;
}
|
| 238 |
+
|
| 239 |
+
void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmul_id) {
|
| 240 |
+
std::string load_vec = fp16 ? "8" : "4";
|
| 241 |
+
std::string aligned_b_type_f32 = fp16 ? "mat2x4" : "vec4";
|
| 242 |
+
std::string aligned_b_type_f16 = fp16 ? "f16mat2x4" : "f16vec4";
|
| 243 |
+
|
| 244 |
+
std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", fp16 ? "float16_t" : "float"}};
|
| 245 |
+
std::string shader_name = "matmul";
|
| 246 |
+
|
| 247 |
+
if (matmul_id) {
|
| 248 |
+
base_dict["MUL_MAT_ID"] = "1";
|
| 249 |
+
shader_name = "matmul_id";
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
if (fp16) {
|
| 253 |
+
base_dict["FLOAT16"] = "1";
|
| 254 |
+
}
|
| 255 |
+
|
| 256 |
+
// Shaders with f16 B_TYPE
|
| 257 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 258 |
+
string_to_spv(shader_name + "_f32_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
|
| 259 |
+
}));
|
| 260 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 261 |
+
string_to_spv(shader_name + "_f32_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
|
| 262 |
+
}));
|
| 263 |
+
|
| 264 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 265 |
+
string_to_spv(shader_name + "_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
|
| 266 |
+
}));
|
| 267 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 268 |
+
string_to_spv(shader_name + "_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
|
| 269 |
+
}));
|
| 270 |
+
|
| 271 |
+
for (const auto& tname : type_names) {
|
| 272 |
+
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
| 273 |
+
// For unaligned, load one at a time for f32/f16, or two at a time for quants
|
| 274 |
+
std::string load_vec_a_unaligned = (tname == "f32" || tname == "f16") ? "1" : "2";
|
| 275 |
+
// For aligned matmul loads
|
| 276 |
+
std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2";
|
| 277 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 278 |
+
string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
|
| 279 |
+
}));
|
| 280 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 281 |
+
string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16);
|
| 282 |
+
}));
|
| 283 |
+
}
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
void process_shaders(std::vector<std::future<void>>& tasks) {
|
| 287 |
+
std::cout << "ggml_vulkan: Generating and compiling shaders to SPIR-V" << std::endl;
|
| 288 |
+
std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}};
|
| 289 |
+
|
| 290 |
+
for (const auto& fp16 : {false, true}) {
|
| 291 |
+
matmul_shaders(tasks, fp16, false);
|
| 292 |
+
matmul_shaders(tasks, fp16, true);
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
for (const auto& tname : type_names) {
|
| 296 |
+
// mul mat vec
|
| 297 |
+
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
| 298 |
+
std::string shader = (string_ends_with(tname, "_k")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
|
| 299 |
+
|
| 300 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 301 |
+
string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
| 302 |
+
}));
|
| 303 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 304 |
+
string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
|
| 305 |
+
}));
|
| 306 |
+
|
| 307 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 308 |
+
string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
| 309 |
+
}));
|
| 310 |
+
|
| 311 |
+
// Dequant shaders
|
| 312 |
+
if (tname != "f16") {
|
| 313 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 314 |
+
string_to_spv("dequant_" + tname, "dequant_" + tname + ".comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float16_t"}}));
|
| 315 |
+
}));
|
| 316 |
+
}
|
| 317 |
+
|
| 318 |
+
if (!string_ends_with(tname, "_k")) {
|
| 319 |
+
shader = (tname == "f32" || tname == "f16") ? "get_rows.comp" : "get_rows_quant.comp";
|
| 320 |
+
|
| 321 |
+
if (tname == "f16") {
|
| 322 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 323 |
+
string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
| 324 |
+
}));
|
| 325 |
+
} else {
|
| 326 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 327 |
+
string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}});
|
| 328 |
+
}));
|
| 329 |
+
}
|
| 330 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 331 |
+
string_to_spv("get_rows_" + tname + "_f32", shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}});
|
| 332 |
+
}));
|
| 333 |
+
}
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 337 |
+
string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
| 338 |
+
}));
|
| 339 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 340 |
+
string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
| 341 |
+
}));
|
| 342 |
+
|
| 343 |
+
// Norms
|
| 344 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 345 |
+
string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
| 346 |
+
}));
|
| 347 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 348 |
+
string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
| 349 |
+
}));
|
| 350 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 351 |
+
string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
| 352 |
+
}));
|
| 353 |
+
|
| 354 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 355 |
+
string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
| 356 |
+
}));
|
| 357 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 358 |
+
string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
|
| 359 |
+
}));
|
| 360 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 361 |
+
string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
| 362 |
+
}));
|
| 363 |
+
|
| 364 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 365 |
+
string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
| 366 |
+
}));
|
| 367 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 368 |
+
string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
|
| 369 |
+
}));
|
| 370 |
+
|
| 371 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 372 |
+
string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
|
| 373 |
+
}));
|
| 374 |
+
|
| 375 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 376 |
+
string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
| 377 |
+
}));
|
| 378 |
+
|
| 379 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 380 |
+
string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
| 381 |
+
}));
|
| 382 |
+
|
| 383 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 384 |
+
string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
| 385 |
+
}));
|
| 386 |
+
|
| 387 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 388 |
+
string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
| 389 |
+
}));
|
| 390 |
+
|
| 391 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 392 |
+
string_to_spv("sin_f32", "sin.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
| 393 |
+
}));
|
| 394 |
+
|
| 395 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 396 |
+
string_to_spv("cos_f32", "cos.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
| 397 |
+
}));
|
| 398 |
+
|
| 399 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 400 |
+
string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
| 401 |
+
}));
|
| 402 |
+
|
| 403 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 404 |
+
string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
| 405 |
+
}));
|
| 406 |
+
|
| 407 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 408 |
+
string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
| 409 |
+
}));
|
| 410 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 411 |
+
string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
| 412 |
+
}));
|
| 413 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 414 |
+
string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}});
|
| 415 |
+
}));
|
| 416 |
+
|
| 417 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 418 |
+
string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
| 419 |
+
}));
|
| 420 |
+
|
| 421 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 422 |
+
string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
| 423 |
+
}));
|
| 424 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 425 |
+
string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
| 426 |
+
}));
|
| 427 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 428 |
+
string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
| 429 |
+
}));
|
| 430 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 431 |
+
string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
| 432 |
+
}));
|
| 433 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 434 |
+
string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
| 435 |
+
}));
|
| 436 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 437 |
+
string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
| 438 |
+
}));
|
| 439 |
+
|
| 440 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 441 |
+
string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
| 442 |
+
}));
|
| 443 |
+
|
| 444 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 445 |
+
string_to_spv("soft_max_f32", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
| 446 |
+
}));
|
| 447 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 448 |
+
string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
|
| 449 |
+
}));
|
| 450 |
+
|
| 451 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 452 |
+
string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
| 453 |
+
}));
|
| 454 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 455 |
+
string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
| 456 |
+
}));
|
| 457 |
+
|
| 458 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 459 |
+
string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
| 460 |
+
}));
|
| 461 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 462 |
+
string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
| 463 |
+
}));
|
| 464 |
+
|
| 465 |
+
tasks.push_back(std::async(std::launch::async, [] {
|
| 466 |
+
string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}});
|
| 467 |
+
}));
|
| 468 |
+
|
| 469 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 470 |
+
string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
| 471 |
+
}));
|
| 472 |
+
|
| 473 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 474 |
+
string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
| 475 |
+
}));
|
| 476 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 477 |
+
string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
|
| 478 |
+
}));
|
| 479 |
+
|
| 480 |
+
tasks.push_back(std::async(std::launch::async, [=] {
|
| 481 |
+
string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
| 482 |
+
}));
|
| 483 |
+
}
|
| 484 |
+
|
| 485 |
+
void write_output_files() {
|
| 486 |
+
FILE* hdr = fopen(target_hpp.c_str(), "w");
|
| 487 |
+
FILE* src = fopen(target_cpp.c_str(), "w");
|
| 488 |
+
|
| 489 |
+
fprintf(hdr, "#include <cstdint>\n\n");
|
| 490 |
+
fprintf(src, "#include \"%s\"\n\n", basename(target_hpp).c_str());
|
| 491 |
+
|
| 492 |
+
for (const auto& pair : shader_fnames) {
|
| 493 |
+
const std::string& name = pair.first;
|
| 494 |
+
#ifdef _WIN32
|
| 495 |
+
std::string path = pair.second;
|
| 496 |
+
std::replace(path.begin(), path.end(), '/', '\\' );
|
| 497 |
+
#else
|
| 498 |
+
const std::string& path = pair.second;
|
| 499 |
+
#endif
|
| 500 |
+
|
| 501 |
+
FILE* spv = fopen(path.c_str(), "rb");
|
| 502 |
+
if (!spv) {
|
| 503 |
+
std::cerr << "Error opening SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
|
| 504 |
+
continue;
|
| 505 |
+
}
|
| 506 |
+
|
| 507 |
+
fseek(spv, 0, SEEK_END);
|
| 508 |
+
size_t size = ftell(spv);
|
| 509 |
+
fseek(spv, 0, SEEK_SET);
|
| 510 |
+
|
| 511 |
+
std::vector<unsigned char> data(size);
|
| 512 |
+
size_t read_size = fread(data.data(), 1, size, spv);
|
| 513 |
+
fclose(spv);
|
| 514 |
+
if (read_size != size) {
|
| 515 |
+
std::cerr << "Error reading SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
|
| 516 |
+
continue;
|
| 517 |
+
}
|
| 518 |
+
|
| 519 |
+
fprintf(hdr, "extern unsigned char %s_data[%zu];\n", name.c_str(), size);
|
| 520 |
+
fprintf(hdr, "const uint64_t %s_len = %zu;\n\n", name.c_str(), size);
|
| 521 |
+
|
| 522 |
+
fprintf(src, "unsigned char %s_data[%zu] = {\n", name.c_str(), size);
|
| 523 |
+
for (size_t i = 0; i < size; ++i) {
|
| 524 |
+
fprintf(src, "0x%02x,", data[i]);
|
| 525 |
+
if ((i + 1) % 12 == 0) fprintf(src, "\n");
|
| 526 |
+
}
|
| 527 |
+
fprintf(src, "\n};\n\n");
|
| 528 |
+
|
| 529 |
+
if (!no_clean) {
|
| 530 |
+
std::remove(path.c_str());
|
| 531 |
+
}
|
| 532 |
+
}
|
| 533 |
+
|
| 534 |
+
fclose(hdr);
|
| 535 |
+
fclose(src);
|
| 536 |
+
}
|
| 537 |
+
|
| 538 |
+
int main(int argc, char** argv) {
|
| 539 |
+
std::map<std::string, std::string> args;
|
| 540 |
+
for (int i = 1; i < argc; i += 2) {
|
| 541 |
+
if (i + 1 < argc) {
|
| 542 |
+
args[argv[i]] = argv[i + 1];
|
| 543 |
+
}
|
| 544 |
+
}
|
| 545 |
+
|
| 546 |
+
if (args.find("--glslc") != args.end()) {
|
| 547 |
+
GLSLC = args["--glslc"]; // Path to glslc
|
| 548 |
+
}
|
| 549 |
+
if (args.find("--input-dir") != args.end()) {
|
| 550 |
+
input_dir = args["--input-dir"]; // Directory containing shader sources
|
| 551 |
+
}
|
| 552 |
+
if (args.find("--output-dir") != args.end()) {
|
| 553 |
+
output_dir = args["--output-dir"]; // Directory for containing SPIR-V output
|
| 554 |
+
}
|
| 555 |
+
if (args.find("--target-hpp") != args.end()) {
|
| 556 |
+
target_hpp = args["--target-hpp"]; // Path to generated header file
|
| 557 |
+
}
|
| 558 |
+
if (args.find("--target-cpp") != args.end()) {
|
| 559 |
+
target_cpp = args["--target-cpp"]; // Path to generated cpp file
|
| 560 |
+
}
|
| 561 |
+
if (args.find("--no-clean") != args.end()) {
|
| 562 |
+
no_clean = true; // Keep temporary SPIR-V files in output-dir after build
|
| 563 |
+
}
|
| 564 |
+
|
| 565 |
+
if (!directory_exists(input_dir)) {
|
| 566 |
+
std::cerr << "\"" << input_dir << "\" must be a valid directory containing shader sources" << std::endl;
|
| 567 |
+
return EXIT_FAILURE;
|
| 568 |
+
}
|
| 569 |
+
|
| 570 |
+
if (!directory_exists(output_dir)) {
|
| 571 |
+
if (!create_directory(output_dir)) {
|
| 572 |
+
std::cerr << "Error creating output directory: " << output_dir << "\n";
|
| 573 |
+
return EXIT_FAILURE;
|
| 574 |
+
}
|
| 575 |
+
}
|
| 576 |
+
|
| 577 |
+
std::vector<std::future<void>> tasks;
|
| 578 |
+
process_shaders(tasks);
|
| 579 |
+
|
| 580 |
+
for (auto& task : tasks) {
|
| 581 |
+
task.get();
|
| 582 |
+
}
|
| 583 |
+
|
| 584 |
+
write_output_files();
|
| 585 |
+
|
| 586 |
+
return EXIT_SUCCESS;
|
| 587 |
+
}
|
scripts/sync-ggml-am.sh
CHANGED
|
@@ -65,6 +65,7 @@ while read c; do
|
|
| 65 |
src/ggml-cann/* \
|
| 66 |
src/ggml-cuda/* \
|
| 67 |
src/ggml-sycl/* \
|
|
|
|
| 68 |
include/ggml*.h \
|
| 69 |
examples/common.h \
|
| 70 |
examples/common.cpp \
|
|
|
|
| 65 |
src/ggml-cann/* \
|
| 66 |
src/ggml-cuda/* \
|
| 67 |
src/ggml-sycl/* \
|
| 68 |
+
src/vulkan-shaders/* \
|
| 69 |
include/ggml*.h \
|
| 70 |
examples/common.h \
|
| 71 |
examples/common.cpp \
|
scripts/sync-ggml.last
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
|
|
|
|
| 1 |
+
46e22f59eaf0aaa38a8e525fd89ba95e39ba7435
|