ggerganov committed on
Commit
c4c7e49
·
1 Parent(s): 630d713

sync : ggml vulkan (ggml/0)

Browse files
Files changed (38) hide show
  1. ggml/src/ggml-vulkan-shaders.hpp +0 -0
  2. ggml/src/vulkan-shaders/CMakeLists.txt +7 -0
  3. ggml/src/vulkan-shaders/add.comp +4 -2
  4. ggml/src/vulkan-shaders/clamp.comp +5 -3
  5. ggml/src/vulkan-shaders/concat.comp +35 -0
  6. ggml/src/vulkan-shaders/copy.comp +5 -3
  7. ggml/src/vulkan-shaders/cos.comp +15 -0
  8. ggml/src/vulkan-shaders/dequant_funcs.comp +8 -0
  9. ggml/src/vulkan-shaders/dequant_iq4_nl.comp +30 -0
  10. ggml/src/vulkan-shaders/dequant_q4_0.comp +4 -6
  11. ggml/src/vulkan-shaders/div.comp +4 -2
  12. ggml/src/vulkan-shaders/gelu.comp +1 -1
  13. ggml/src/vulkan-shaders/gelu_quick.comp +23 -0
  14. ggml/src/vulkan-shaders/generic_binary_head.comp +5 -1
  15. ggml/src/vulkan-shaders/generic_unary_head.comp +4 -0
  16. ggml/src/vulkan-shaders/group_norm.comp +66 -0
  17. ggml/src/vulkan-shaders/im2col.comp +57 -0
  18. ggml/src/vulkan-shaders/leaky_relu.comp +22 -0
  19. ggml/src/vulkan-shaders/mul.comp +4 -2
  20. ggml/src/vulkan-shaders/mul_mat_vec.comp +10 -3
  21. ggml/src/vulkan-shaders/mul_mm.comp +14 -1
  22. ggml/src/vulkan-shaders/norm.comp +1 -1
  23. ggml/src/vulkan-shaders/pad.comp +26 -0
  24. ggml/src/vulkan-shaders/relu.comp +1 -1
  25. ggml/src/vulkan-shaders/rms_norm.comp +1 -1
  26. ggml/src/vulkan-shaders/scale.comp +4 -2
  27. ggml/src/vulkan-shaders/silu.comp +1 -1
  28. ggml/src/vulkan-shaders/sin.comp +15 -0
  29. ggml/src/vulkan-shaders/soft_max.comp +1 -1
  30. ggml/src/vulkan-shaders/square.comp +5 -3
  31. ggml/src/vulkan-shaders/sum_rows.comp +1 -1
  32. ggml/src/vulkan-shaders/tanh.comp +21 -0
  33. ggml/src/vulkan-shaders/timestep_embedding.comp +41 -0
  34. ggml/src/vulkan-shaders/types.comp +23 -2
  35. ggml/src/vulkan-shaders/upscale.comp +36 -0
  36. ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +587 -0
  37. scripts/sync-ggml-am.sh +1 -0
  38. scripts/sync-ggml.last +1 -1
ggml/src/ggml-vulkan-shaders.hpp DELETED
The diff for this file is too large to render. See raw diff
 
ggml/src/vulkan-shaders/CMakeLists.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ find_package (Threads REQUIRED)
2
+
3
+ set(TARGET vulkan-shaders-gen)
4
+ add_executable(${TARGET} vulkan-shaders-gen.cpp)
5
+ install(TARGETS ${TARGET} RUNTIME)
6
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
7
+ target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)
ggml/src/vulkan-shaders/add.comp CHANGED
@@ -4,9 +4,11 @@
4
  #include "generic_binary_head.comp"
5
 
6
  void main() {
7
- if (gl_GlobalInvocationID.x >= p.ne) {
 
 
8
  return;
9
  }
10
 
11
- data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) + FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
12
  }
 
4
  #include "generic_binary_head.comp"
5
 
6
  void main() {
7
+ const uint idx = get_idx();
8
+
9
+ if (idx >= p.ne) {
10
  return;
11
  }
12
 
13
+ data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[src1_idx(idx)]));
14
  }
ggml/src/vulkan-shaders/clamp.comp CHANGED
@@ -4,10 +4,12 @@
4
  #include "generic_unary_head.comp"
5
 
6
  void main() {
7
- if (gl_GlobalInvocationID.x >= p.ne) {
 
 
8
  return;
9
  }
10
 
11
- const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
12
- data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
13
  }
 
4
  #include "generic_unary_head.comp"
5
 
6
  void main() {
7
+ const uint idx = get_idx();
8
+
9
+ if (idx >= p.ne) {
10
  return;
11
  }
12
 
13
+ const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
14
+ data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
15
  }
ggml/src/vulkan-shaders/concat.comp ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #version 450
2
+
3
+ #include "types.comp"
4
+ #include "generic_binary_head.comp"
5
+
6
+ void main() {
7
+ const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
8
+ const int dim = p.param3;
9
+
10
+ if (idx >= p.ne) {
11
+ return;
12
+ }
13
+
14
+ const uint i3 = idx / (p.ne22*p.ne21*p.ne20);
15
+ const uint i3_offset = i3 * p.ne22*p.ne21*p.ne20;
16
+ const uint i2 = (idx - i3_offset) / (p.ne21*p.ne20);
17
+ const uint i2_offset = i2*p.ne21*p.ne20;
18
+ const uint i1 = (idx - i3_offset - i2_offset) / p.ne20;
19
+ const uint i0 = idx - i3_offset - i2_offset - i1*p.ne20;
20
+
21
+ uint o[4] = {0, 0, 0, 0};
22
+ o[dim] = dim == 0 ? p.ne00 : (dim == 1 ? p.ne01 : (dim == 2 ? p.ne02 : p.ne03));
23
+
24
+ const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
25
+ const uint src1_idx = (i3 - o[3])*p.nb13 + (i2 - o[2])*p.nb12 + (i1 - o[1])*p.nb11 + (i0 - o[0])*p.nb10;
26
+ const uint dst_idx = i3*p.nb23 + i2*p.nb22 + i1*p.nb21 + i0*p.nb20;
27
+
28
+ const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
29
+
30
+ #ifndef OPTIMIZATION_ERROR_WORKAROUND
31
+ data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]);
32
+ #else
33
+ data_d[p.d_offset + dst_idx] = is_src0 ? data_a[src0_idx] : data_b[src1_idx];
34
+ #endif
35
+ }
ggml/src/vulkan-shaders/copy.comp CHANGED
@@ -4,13 +4,15 @@
4
  #include "generic_unary_head.comp"
5
 
6
  void main() {
7
- if (gl_GlobalInvocationID.x >= p.ne) {
 
 
8
  return;
9
  }
10
 
11
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
12
- data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
13
  #else
14
- data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = data_a[src0_idx(gl_GlobalInvocationID.x)];
15
  #endif
16
  }
 
4
  #include "generic_unary_head.comp"
5
 
6
  void main() {
7
+ const uint idx = get_idx();
8
+
9
+ if (idx >= p.ne) {
10
  return;
11
  }
12
 
13
  #ifndef OPTIMIZATION_ERROR_WORKAROUND
14
+ data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]);
15
  #else
16
+ data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)];
17
  #endif
18
  }
ggml/src/vulkan-shaders/cos.comp ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #version 450
2
+
3
+ #include "types.comp"
4
+ #include "generic_unary_head.comp"
5
+
6
+ void main() {
7
+ const uint idx = get_idx();
8
+
9
+ if (idx >= p.ne) {
10
+ return;
11
+ }
12
+
13
+ const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
14
+ data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val));
15
+ }
ggml/src/vulkan-shaders/dequant_funcs.comp CHANGED
@@ -58,3 +58,11 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
58
  return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
59
  }
60
  #endif
 
 
 
 
 
 
 
 
 
58
  return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
59
  }
60
  #endif
61
+
62
+ #if defined(DATA_A_IQ4_NL)
63
+ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
64
+ const float d = float(data_a[a_offset + ib].d);
65
+ const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
66
+ return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
67
+ }
68
+ #endif
ggml/src/vulkan-shaders/dequant_iq4_nl.comp ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #version 450
2
+
3
+ #include "dequant_head.comp"
4
+
5
+ layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
6
+
7
+ layout (binding = 0) readonly buffer A {block_iq4_nl data_a[];};
8
+ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
9
+
10
+ void main() {
11
+ const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
12
+
13
+ const uint tid = gl_LocalInvocationID.x % 64;
14
+ const uint il = tid/32;
15
+ const uint ir = tid%32;
16
+ const uint ib = 32*i + ir;
17
+ if (ib >= p.nel / 32) {
18
+ return;
19
+ }
20
+
21
+ const uint q_idx = 8*il;
22
+ const uint b_idx = 1024*i + 32*ir + q_idx;
23
+
24
+ const float d = float(data_a[ib].d);
25
+
26
+ [[unroll]] for (uint l = 0; l < 8; ++l) {
27
+ data_b[b_idx + l + 0] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]);
28
+ data_b[b_idx + l + 16] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >> 4]);
29
+ }
30
+ }
ggml/src/vulkan-shaders/dequant_q4_0.comp CHANGED
@@ -18,15 +18,13 @@ void main() {
18
  return;
19
  }
20
 
21
- const uint b_idx = 1024*i + 32*ir + 8*il;
 
22
 
23
  const float d = float(data_a[ib].d);
24
- const float dm = -8.0f * d;
25
-
26
- const uint q_idx = 8*il;
27
 
28
  [[unroll]] for (uint l = 0; l < 8; ++l) {
29
- data_b[b_idx + l + 0] = D_TYPE(d * (data_a[ib].qs[q_idx + l] & 0xF) + dm);
30
- data_b[b_idx + l + 16] = D_TYPE(d * (data_a[ib].qs[q_idx + l] >> 4) + dm);
31
  }
32
  }
 
18
  return;
19
  }
20
 
21
+ const uint q_idx = 8*il;
22
+ const uint b_idx = 1024*i + 32*ir + q_idx;
23
 
24
  const float d = float(data_a[ib].d);
 
 
 
25
 
26
  [[unroll]] for (uint l = 0; l < 8; ++l) {
27
+ data_b[b_idx + l + 0] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] & 0xF) - 8.0f));
28
+ data_b[b_idx + l + 16] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] >> 4) - 8.0f));
29
  }
30
  }
ggml/src/vulkan-shaders/div.comp CHANGED
@@ -4,9 +4,11 @@
4
  #include "generic_binary_head.comp"
5
 
6
  void main() {
7
- if (gl_GlobalInvocationID.x >= p.ne) {
 
 
8
  return;
9
  }
10
 
11
- data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) / FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
12
  }
 
4
  #include "generic_binary_head.comp"
5
 
6
  void main() {
7
+ const uint idx = get_idx();
8
+
9
+ if (idx >= p.ne) {
10
  return;
11
  }
12
 
13
+ data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) / FLOAT_TYPE(data_b[src1_idx(idx)]));
14
  }
ggml/src/vulkan-shaders/gelu.comp CHANGED
@@ -13,7 +13,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
13
  void main() {
14
  const float GELU_COEF_A = 0.044715f;
15
  const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
16
- const uint i = gl_GlobalInvocationID.x;
17
 
18
  if (i >= p.KX) {
19
  return;
 
13
  void main() {
14
  const float GELU_COEF_A = 0.044715f;
15
  const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
16
+ const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
17
 
18
  if (i >= p.KX) {
19
  return;
ggml/src/vulkan-shaders/gelu_quick.comp ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #version 450
2
+
3
+ #include "generic_head.comp"
4
+ #include "types.comp"
5
+
6
+ #extension GL_EXT_control_flow_attributes : enable
7
+
8
+ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
9
+
10
+ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
11
+ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
12
+
13
+ void main() {
14
+ const float GELU_QUICK_COEF = -1.702f;
15
+ const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
16
+
17
+ if (i >= p.KX) {
18
+ return;
19
+ }
20
+
21
+ const float x = float(data_a[i]);
22
+ data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x))));
23
+ }
ggml/src/vulkan-shaders/generic_binary_head.comp CHANGED
@@ -7,7 +7,7 @@ layout (push_constant) uniform parameter
7
  uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
8
  uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
9
  uint d_offset;
10
- float param1; float param2;
11
  } p;
12
 
13
  layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
@@ -16,6 +16,10 @@ layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
16
  layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
17
  layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
18
 
 
 
 
 
19
  uint src0_idx(uint idx) {
20
  const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
21
  const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
 
7
  uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
8
  uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
9
  uint d_offset;
10
+ float param1; float param2; int param3;
11
  } p;
12
 
13
  layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
 
16
  layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
17
  layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
18
 
19
+ uint get_idx() {
20
+ return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
21
+ }
22
+
23
  uint src0_idx(uint idx) {
24
  const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
25
  const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
ggml/src/vulkan-shaders/generic_unary_head.comp CHANGED
@@ -14,6 +14,10 @@ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
14
  layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
15
  layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
16
 
 
 
 
 
17
  uint src0_idx(uint idx) {
18
  const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
19
  const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
 
14
  layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
15
  layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
16
 
17
+ uint get_idx() {
18
+ return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
19
+ }
20
+
21
  uint src0_idx(uint idx) {
22
  const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
23
  const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
ggml/src/vulkan-shaders/group_norm.comp ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #version 450
2
+
3
+ #include "generic_head.comp"
4
+ #include "types.comp"
5
+
6
+ #extension GL_EXT_control_flow_attributes : enable
7
+ #define BLOCK_SIZE 512
8
+
9
+ layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
10
+
11
+ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
12
+ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
13
+
14
+ shared float tmp[BLOCK_SIZE];
15
+
16
+ void main() {
17
+ const uint group_size = p.KX;
18
+ const float eps = p.param1;
19
+
20
+ const uint tid = gl_LocalInvocationID.x;
21
+ const uint start = gl_WorkGroupID.x * group_size + tid;
22
+ const uint end = start + group_size;
23
+
24
+ tmp[tid] = 0.0f;
25
+
26
+ // Calculate mean
27
+ [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
28
+ tmp[tid] += float(data_a[col]);
29
+ }
30
+
31
+ // sum up partial sums and write back result
32
+ barrier();
33
+ [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
34
+ if (tid < s) {
35
+ tmp[tid] += tmp[tid + s];
36
+ }
37
+ barrier();
38
+ }
39
+
40
+ const float mean = tmp[0] / group_size;
41
+ barrier();
42
+ tmp[tid] = 0.0f;
43
+
44
+ // Calculate variance
45
+ [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
46
+ const float xi = float(data_a[col]) - mean;
47
+ data_d[col] = D_TYPE(xi);
48
+ tmp[tid] += xi * xi;
49
+ }
50
+
51
+ // sum up partial sums and write back result
52
+ barrier();
53
+ [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
54
+ if (tid < s) {
55
+ tmp[tid] += tmp[tid + s];
56
+ }
57
+ barrier();
58
+ }
59
+
60
+ const float variance = tmp[0] / group_size;
61
+ const float scale = inversesqrt(variance + eps);
62
+
63
+ [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
64
+ data_d[col] *= D_TYPE(scale);
65
+ }
66
+ }
ggml/src/vulkan-shaders/im2col.comp ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #version 450
2
+
3
+ #extension GL_EXT_shader_16bit_storage : require
4
+
5
+ layout (push_constant) uniform parameter
6
+ {
7
+ uint batch_offset; uint offset_delta;
8
+ uint IC;
9
+ uint IW; uint IH;
10
+ uint OW; uint OH;
11
+ uint KW; uint KH;
12
+ uint pelements;
13
+ uint CHW;
14
+ int s0; int s1;
15
+ int p0; int p1;
16
+ int d0; int d1;
17
+ } p;
18
+
19
+ #include "types.comp"
20
+
21
+ #define BLOCK_SIZE 256
22
+
23
+ layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
24
+
25
+ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
26
+ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
27
+
28
+ void main() {
29
+ const uint i = gl_GlobalInvocationID.x;
30
+ if (i >= p.pelements) {
31
+ return;
32
+ }
33
+
34
+ const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
35
+ const uint kx = i / ksize;
36
+ const uint kd = kx * ksize;
37
+ const uint ky = (i - kd) / p.OW;
38
+ const uint ix = i % p.OW;
39
+
40
+ const uint oh = gl_GlobalInvocationID.y;
41
+ const uint batch = gl_GlobalInvocationID.z / p.IC;
42
+ const uint ic = gl_GlobalInvocationID.z % p.IC;
43
+
44
+ const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
45
+ const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
46
+
47
+ const uint offset_dst =
48
+ ((batch * p.OH + oh) * p.OW + ix) * p.CHW +
49
+ (ic * (p.KW * p.KH) + ky * p.KW + kx);
50
+
51
+ if (iih < 0 || iih >= p.IH || iiw < 0 || iiw >= p.IW) {
52
+ data_d[offset_dst] = D_TYPE(0.0f);
53
+ } else {
54
+ const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
55
+ data_d[offset_dst] = D_TYPE(data_a[offset_src + iih * p.IW + iiw]);
56
+ }
57
+ }
ggml/src/vulkan-shaders/leaky_relu.comp ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #version 450
2
+
3
+ #include "generic_head.comp"
4
+ #include "types.comp"
5
+
6
+ #extension GL_EXT_control_flow_attributes : enable
7
+
8
+ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
9
+
10
+ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
11
+ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
12
+
13
+ void main() {
14
+ const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
15
+
16
+ if (i >= p.KX) {
17
+ return;
18
+ }
19
+
20
+ const float val = float(data_a[i]);
21
+ data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1);
22
+ }
ggml/src/vulkan-shaders/mul.comp CHANGED
@@ -4,9 +4,11 @@
4
  #include "generic_binary_head.comp"
5
 
6
  void main() {
7
- if (gl_GlobalInvocationID.x >= p.ne) {
 
 
8
  return;
9
  }
10
 
11
- data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
12
  }
 
4
  #include "generic_binary_head.comp"
5
 
6
  void main() {
7
+ const uint idx = get_idx();
8
+
9
+ if (idx >= p.ne) {
10
  return;
11
  }
12
 
13
+ data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(data_b[src1_idx(idx)]));
14
  }
ggml/src/vulkan-shaders/mul_mat_vec.comp CHANGED
@@ -16,6 +16,13 @@ void main() {
16
  const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
17
  const uint tid = gl_LocalInvocationID.x;
18
 
 
 
 
 
 
 
 
19
  uint a_offset, b_offset, d_offset;
20
  get_offsets(a_offset, b_offset, d_offset);
21
 
@@ -23,8 +30,8 @@ void main() {
23
 
24
  tmp[tid] = FLOAT_TYPE(0.0f);
25
 
26
- [[unroll]] for (uint i = 0; i < p.ncols/BLOCK_SIZE; i += 2) {
27
- const uint col = i*BLOCK_SIZE + 2*tid;
28
  const uint ib = (row*p.ncols + col)/QUANT_K; // block index
29
  const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
30
  const uint iybs = col - col%QUANT_K; // y block start index
@@ -38,7 +45,7 @@ void main() {
38
 
39
  // sum up partial sums and write back result
40
  barrier();
41
- [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
42
  if (tid < s) {
43
  tmp[tid] += tmp[tid + s];
44
  }
 
16
  const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
17
  const uint tid = gl_LocalInvocationID.x;
18
 
19
+ // There are not enough cols to use all threads
20
+ if (tid >= p.ncols) {
21
+ return;
22
+ }
23
+
24
+ const uint block_size = min(p.ncols, BLOCK_SIZE);
25
+
26
  uint a_offset, b_offset, d_offset;
27
  get_offsets(a_offset, b_offset, d_offset);
28
 
 
30
 
31
  tmp[tid] = FLOAT_TYPE(0.0f);
32
 
33
+ [[unroll]] for (uint i = 0; i < p.ncols/block_size; i += 2) {
34
+ const uint col = i*block_size + 2*tid;
35
  const uint ib = (row*p.ncols + col)/QUANT_K; // block index
36
  const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
37
  const uint iybs = col - col%QUANT_K; // y block start index
 
45
 
46
  // sum up partial sums and write back result
47
  barrier();
48
+ [[unroll]] for (uint s = block_size/2; s > 0; s >>= 1) {
49
  if (tid < s) {
50
  tmp[tid] += tmp[tid + s];
51
  }
ggml/src/vulkan-shaders/mul_mm.comp CHANGED
@@ -71,7 +71,7 @@ shared FLOAT_TYPE buf_a[BM * (BK+1)];
71
  shared FLOAT_TYPE buf_b[BN * (BK+1)];
72
 
73
  #ifdef MUL_MAT_ID
74
- shared u16vec2 row_ids[2048];
75
  #endif
76
 
77
  void main() {
@@ -380,6 +380,19 @@ void main() {
380
 
381
  buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32));
382
  buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
 
 
 
 
 
 
 
 
 
 
 
 
 
383
  #endif
384
  }
385
  [[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
 
71
  shared FLOAT_TYPE buf_b[BN * (BK+1)];
72
 
73
  #ifdef MUL_MAT_ID
74
+ shared u16vec2 row_ids[3072];
75
  #endif
76
 
77
  void main() {
 
380
 
381
  buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32));
382
  buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
383
+ #elif defined(DATA_A_IQ4_NL)
384
+ const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
385
+ const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a;
386
+
387
+ const uint ib = idx / 16;
388
+ const uint iqs = idx & 0xF;
389
+
390
+ const float d = float(data_a[ib].d);
391
+ const uint vui = uint(data_a[ib].qs[iqs]);
392
+ const vec2 v = vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
393
+
394
+ buf_a[buf_idx ] = FLOAT_TYPE(v.x);
395
+ buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
396
  #endif
397
  }
398
  [[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
ggml/src/vulkan-shaders/norm.comp CHANGED
@@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
14
  shared vec2 sum[BLOCK_SIZE];
15
 
16
  void main() {
17
- const uint row = gl_WorkGroupID.x;
18
  const uint tid = gl_LocalInvocationID.x;
19
 
20
  sum[tid] = vec2(0.0f, 0.0f);
 
14
  shared vec2 sum[BLOCK_SIZE];
15
 
16
  void main() {
17
+ const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
18
  const uint tid = gl_LocalInvocationID.x;
19
 
20
  sum[tid] = vec2(0.0f, 0.0f);
ggml/src/vulkan-shaders/pad.comp ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #version 450
2
+
3
+ #include "types.comp"
4
+ #include "generic_unary_head.comp"
5
+
6
+ void main() {
7
+ const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
8
+
9
+ if (idx >= p.ne) {
10
+ return;
11
+ }
12
+
13
+ const uint i3 = idx / (p.ne12*p.ne11*p.ne10);
14
+ const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10;
15
+ const uint i2 = (idx - i3_offset) / (p.ne11*p.ne10);
16
+ const uint i2_offset = i2*p.ne11*p.ne10;
17
+ const uint i1 = (idx - i3_offset - i2_offset) / p.ne10;
18
+ const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10;
19
+
20
+ const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
21
+ const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10;
22
+
23
+ const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
24
+
25
+ data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f);
26
+ }
ggml/src/vulkan-shaders/relu.comp CHANGED
@@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
11
  layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
12
 
13
  void main() {
14
- const uint i = gl_GlobalInvocationID.x;
15
 
16
  if (i >= p.KX) {
17
  return;
 
11
  layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
12
 
13
  void main() {
14
+ const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
15
 
16
  if (i >= p.KX) {
17
  return;
ggml/src/vulkan-shaders/rms_norm.comp CHANGED
@@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
14
  shared FLOAT_TYPE sum[BLOCK_SIZE];
15
 
16
  void main() {
17
- const uint row = gl_WorkGroupID.x;
18
  const uint tid = gl_LocalInvocationID.x;
19
 
20
  sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp
 
14
  shared FLOAT_TYPE sum[BLOCK_SIZE];
15
 
16
  void main() {
17
+ const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
18
  const uint tid = gl_LocalInvocationID.x;
19
 
20
  sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp
ggml/src/vulkan-shaders/scale.comp CHANGED
@@ -4,9 +4,11 @@
4
  #include "generic_unary_head.comp"
5
 
6
  void main() {
7
- if (gl_GlobalInvocationID.x >= p.ne) {
 
 
8
  return;
9
  }
10
 
11
- data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(p.param1));
12
  }
 
4
  #include "generic_unary_head.comp"
5
 
6
  void main() {
7
+ const uint idx = get_idx();
8
+
9
+ if (idx >= p.ne) {
10
  return;
11
  }
12
 
13
+ data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1));
14
  }
ggml/src/vulkan-shaders/silu.comp CHANGED
@@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
11
  layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
12
 
13
  void main() {
14
- const uint i = gl_GlobalInvocationID.x;
15
 
16
  if (i >= p.KX) {
17
  return;
 
11
  layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
12
 
13
  void main() {
14
+ const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
15
 
16
  if (i >= p.KX) {
17
  return;
ggml/src/vulkan-shaders/sin.comp ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #version 450
2
+
3
+ #include "types.comp"
4
+ #include "generic_unary_head.comp"
5
+
6
+ void main() {
7
+ const uint idx = get_idx();
8
+
9
+ if (idx >= p.ne) {
10
+ return;
11
+ }
12
+
13
+ const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
14
+ data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val));
15
+ }
ggml/src/vulkan-shaders/soft_max.comp CHANGED
@@ -28,7 +28,7 @@ shared FLOAT_TYPE vals[BLOCK_SIZE];
28
 
29
  void main() {
30
  const uint tid = gl_LocalInvocationID.x;
31
- const uint rowx = gl_WorkGroupID.x;
32
  const uint rowy = rowx % p.KY;
33
 
34
  float slope = 1.0f;
 
28
 
29
  void main() {
30
  const uint tid = gl_LocalInvocationID.x;
31
+ const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
32
  const uint rowy = rowx % p.KY;
33
 
34
  float slope = 1.0f;
ggml/src/vulkan-shaders/square.comp CHANGED
@@ -4,10 +4,12 @@
4
  #include "generic_unary_head.comp"
5
 
6
  void main() {
7
- if (gl_GlobalInvocationID.x >= p.ne) {
 
 
8
  return;
9
  }
10
 
11
- const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
12
- data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val * val);
13
  }
 
4
  #include "generic_unary_head.comp"
5
 
6
  void main() {
7
+ const uint idx = get_idx();
8
+
9
+ if (idx >= p.ne) {
10
  return;
11
  }
12
 
13
+ const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
14
+ data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val);
15
  }
ggml/src/vulkan-shaders/sum_rows.comp CHANGED
@@ -14,7 +14,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32;
14
  shared FLOAT_TYPE tmp[BLOCK_SIZE];
15
 
16
  void main() {
17
- const uint row = gl_WorkGroupID.x;
18
  const uint col = gl_LocalInvocationID.x;
19
 
20
  tmp[col] = FLOAT_TYPE(0.0f);
 
14
  shared FLOAT_TYPE tmp[BLOCK_SIZE];
15
 
16
  void main() {
17
+ const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
18
  const uint col = gl_LocalInvocationID.x;
19
 
20
  tmp[col] = FLOAT_TYPE(0.0f);
ggml/src/vulkan-shaders/tanh.comp ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #version 450
2
+
3
+ #include "generic_head.comp"
4
+ #include "types.comp"
5
+
6
+ #extension GL_EXT_control_flow_attributes : enable
7
+
8
+ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
9
+
10
+ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
11
+ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
12
+
13
+ void main() {
14
+ const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
15
+
16
+ if (i >= p.KX) {
17
+ return;
18
+ }
19
+
20
+ data_d[i] = D_TYPE(tanh(data_a[i]));
21
+ }
ggml/src/vulkan-shaders/timestep_embedding.comp ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #version 450
2
+
3
+ #extension GL_EXT_shader_16bit_storage : require
4
+
5
+ layout (push_constant) uniform parameter
6
+ {
7
+ uint nb1;
8
+ uint dim;
9
+ uint max_period;
10
+ } p;
11
+
12
+ #include "types.comp"
13
+
14
+ #extension GL_EXT_control_flow_attributes : enable
15
+ #define BLOCK_SIZE 256
16
+
17
+ layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
18
+
19
+ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
20
+ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
21
+
22
+ void main() {
23
+ const uint i = gl_WorkGroupID.y;
24
+ const uint j = gl_GlobalInvocationID.x;
25
+ const uint d_offset = i * p.nb1;
26
+
27
+ if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) {
28
+ data_d[d_offset + p.dim] = 0.f;
29
+ }
30
+
31
+ const uint half_dim = p.dim / 2;
32
+ if (j >= half_dim) {
33
+ return;
34
+ }
35
+
36
+ const float timestep = float(data_a[i]);
37
+ const float freq = float(exp(-log(p.max_period) * j / half_dim));
38
+ const float arg = timestep * freq;
39
+ data_d[d_offset + j] = D_TYPE(cos(arg));
40
+ data_d[d_offset + j + half_dim] = D_TYPE(sin(arg));
41
+ }
ggml/src/vulkan-shaders/types.comp CHANGED
@@ -6,7 +6,7 @@
6
  #define QUANT_K 1
7
  #define QUANT_R 1
8
 
9
- #ifndef LOAD_VEC_A
10
  #define A_TYPE float
11
  #elif LOAD_VEC_A == 4
12
  #define A_TYPE vec4
@@ -19,7 +19,7 @@
19
  #define QUANT_K 1
20
  #define QUANT_R 1
21
 
22
- #ifndef LOAD_VEC_A
23
  #define A_TYPE float16_t
24
  #elif LOAD_VEC_A == 4
25
  #define A_TYPE f16vec4
@@ -177,3 +177,24 @@ struct block_q6_K
177
 
178
  #define A_TYPE block_q6_K
179
  #endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  #define QUANT_K 1
7
  #define QUANT_R 1
8
 
9
+ #if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
10
  #define A_TYPE float
11
  #elif LOAD_VEC_A == 4
12
  #define A_TYPE vec4
 
19
  #define QUANT_K 1
20
  #define QUANT_R 1
21
 
22
+ #if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
23
  #define A_TYPE float16_t
24
  #elif LOAD_VEC_A == 4
25
  #define A_TYPE f16vec4
 
177
 
178
  #define A_TYPE block_q6_K
179
  #endif
180
+
181
+ // IQuants
182
+
183
+ #if defined(DATA_A_IQ4_NL)
184
+ #extension GL_EXT_shader_16bit_storage : require
185
+ #define QUANT_K 32
186
+ #define QUANT_R 2
187
+
188
+ struct block_iq4_nl
189
+ {
190
+ float16_t d;
191
+ uint8_t qs[QUANT_K/2];
192
+ };
193
+
194
+ #define A_TYPE block_iq4_nl
195
+
196
+ const int8_t kvalues_iq4nl[16] = {
197
+ int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
198
+ int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
199
+ };
200
+ #endif
ggml/src/vulkan-shaders/upscale.comp ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#version 450

// Push constants.
// NOTE(review): nb0x look like per-dimension source strides (in elements) and
// sf0..sf3 like per-dimension scale factors - confirm against the host code.
layout (push_constant) uniform parameter
{
    uint ne; uint d_offset;
    uint nb00; uint nb01; uint nb02; uint nb03;
    uint ne10; uint ne11; uint ne12; uint ne13;
    float sf0; float sf1; float sf2; float sf3;
} p;

#include "types.comp"

layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

// Writes one destination element per invocation: the destination index is
// split into four coordinates, each coordinate is divided by its sf value
// (truncating), and the source element found there is copied.
void main() {
    // Flatten the 3D invocation ID into a single destination element index.
    const uint dst = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;

    // Invocations past the element count have nothing to do.
    if (dst >= p.ne) {
        return;
    }

    // Destination coordinates, innermost dimension first.
    const uint plane = p.ne10 * p.ne11;
    const uint d0 = dst % p.ne10;
    const uint d1 = (dst / p.ne10) % p.ne11;
    const uint d2 = (dst / plane) % p.ne12;
    const uint d3 = (dst / (plane * p.ne12)) % p.ne13;

    // Matching source coordinates.
    const uint s0 = uint(d0 / p.sf0);
    const uint s1 = uint(d1 / p.sf1);
    const uint s2 = uint(d2 / p.sf2);
    const uint s3 = uint(d3 / p.sf3);

    const uint src = s3 * p.nb03 + s2 * p.nb02 + s1 * p.nb01 + s0 * p.nb00;
    data_d[p.d_offset + dst] = D_TYPE(data_a[src]);
}
ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp ADDED
@@ -0,0 +1,587 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ #include <iostream>
4
+ #include <fstream>
5
+ #include <sstream>
6
+ #include <string>
7
+ #include <stdexcept>
8
+ #include <array>
9
+ #include <vector>
10
+ #include <map>
11
+ #include <thread>
12
+ #include <mutex>
13
+ #include <future>
14
+ #include <queue>
15
+ #include <condition_variable>
16
+ #include <cstdio>
17
+ #include <cstring>
18
+ #include <cstdlib>
19
+ #include <sys/stat.h>
20
+ #include <sys/types.h>
21
+
22
+ #ifdef _WIN32
23
+ #include <windows.h>
24
+ #include <direct.h> // For _mkdir on Windows
25
+ #include <algorithm> // For std::replace on w64devkit
26
+ #else
27
+ #include <unistd.h>
28
+ #include <sys/wait.h>
29
+ #include <fcntl.h>
30
+ #endif
31
+
32
// NOTE(review): not referenced by any code visible in this file - confirm before removing.
#define ASYNCIO_CONCURRENCY 64

// Serializes appends to shader_fnames, which compile jobs perform from worker threads.
std::mutex lock;
// One (shader name, SPIR-V file path) entry per successfully compiled shader.
std::vector<std::pair<std::string, std::string>> shader_fnames;

// Tool configuration. These are the defaults; main() replaces them with the
// values given on the command line.
std::string GLSLC      = "glslc";                    // compiler executable
std::string input_dir  = "vulkan-shaders";           // shader sources
std::string output_dir = "/tmp";                     // intermediate .spv files
std::string target_hpp = "ggml-vulkan-shaders.hpp";  // generated header
std::string target_cpp = "ggml-vulkan-shaders.cpp";  // generated source
bool no_clean = false;                               // keep .spv files when true

// Data type suffixes for which per-type shader variants are generated.
const std::vector<std::string> type_names = {
    "f32", "f16",
    "q4_0", "q4_1", "q5_0", "q5_1", "q8_0",
    "q2_k", "q3_k", "q4_k", "q5_k", "q6_k",
    "iq4_nl"
};
59
+
60
// Runs `command` and captures everything it writes to stdout and stderr.
//
// On Windows the command line is passed to CreateProcessA; elsewhere it is run
// through `/bin/sh -c`. Captured output is appended to stdout_str / stderr_str
// (the strings are not cleared first). The call blocks until the child has
// exited; the child's exit status is not reported.
//
// The two streams are drained concurrently. Reading one stream to the end
// before touching the other can deadlock: the child blocks once the unread
// pipe is full while this process is still waiting for the first one to close.
//
// Throws std::runtime_error when the pipes or the child process cannot be
// created; pipe ends opened up to that point are released first.
void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str) {
#ifdef _WIN32
    HANDLE stdout_read = NULL, stdout_write = NULL;
    HANDLE stderr_read = NULL, stderr_write = NULL;
    SECURITY_ATTRIBUTES sa = { sizeof(SECURITY_ATTRIBUTES), NULL, TRUE };

    // Releases whichever pipe ends exist so far; used on the error paths.
    const auto close_pipes = [&] {
        if (stdout_read  != NULL) { CloseHandle(stdout_read); }
        if (stdout_write != NULL) { CloseHandle(stdout_write); }
        if (stderr_read  != NULL) { CloseHandle(stderr_read); }
        if (stderr_write != NULL) { CloseHandle(stderr_write); }
    };

    // The read ends stay in this process, so they are made non-inheritable.
    if (!CreatePipe(&stdout_read, &stdout_write, &sa, 0) ||
        !SetHandleInformation(stdout_read, HANDLE_FLAG_INHERIT, 0)) {
        close_pipes();
        throw std::runtime_error("Failed to create stdout pipe");
    }

    if (!CreatePipe(&stderr_read, &stderr_write, &sa, 0) ||
        !SetHandleInformation(stderr_read, HANDLE_FLAG_INHERIT, 0)) {
        close_pipes();
        throw std::runtime_error("Failed to create stderr pipe");
    }

    PROCESS_INFORMATION pi;
    STARTUPINFOA si = { sizeof(STARTUPINFOA) };
    si.dwFlags = STARTF_USESTDHANDLES;
    si.hStdOutput = stdout_write;
    si.hStdError = stderr_write;

    // CreateProcessA takes a non-const command-line buffer, so pass a mutable copy.
    std::vector<char> cmd(command.begin(), command.end());
    cmd.push_back('\0');

    if (!CreateProcessA(NULL, cmd.data(), NULL, NULL, TRUE, 0, NULL, NULL, &si, &pi)) {
        close_pipes();
        throw std::runtime_error("Failed to create process");
    }

    // Drop our copies of the write ends so the reads below can reach end-of-file.
    CloseHandle(stdout_write);
    CloseHandle(stderr_write);

    const auto drain = [](HANDLE pipe, std::string& out) {
        std::array<char, 128> buffer;
        DWORD bytes_read = 0;
        while (ReadFile(pipe, buffer.data(), static_cast<DWORD>(buffer.size()), &bytes_read, NULL) && bytes_read > 0) {
            out.append(buffer.data(), bytes_read);
        }
    };

    {
        // stderr is drained on a helper thread while this thread drains stdout.
        auto stderr_reader = std::async(std::launch::async, [&] { drain(stderr_read, stderr_str); });
        drain(stdout_read, stdout_str);
        stderr_reader.get();
    }

    CloseHandle(stdout_read);
    CloseHandle(stderr_read);
    WaitForSingleObject(pi.hProcess, INFINITE);
    CloseHandle(pi.hProcess);
    CloseHandle(pi.hThread);
#else
    int stdout_pipe[2];
    int stderr_pipe[2];

    if (pipe(stdout_pipe) != 0) {
        throw std::runtime_error("Failed to create pipes");
    }
    if (pipe(stderr_pipe) != 0) {
        close(stdout_pipe[0]);
        close(stdout_pipe[1]);
        throw std::runtime_error("Failed to create pipes");
    }

    const pid_t pid = fork();
    if (pid < 0) {
        close(stdout_pipe[0]);
        close(stdout_pipe[1]);
        close(stderr_pipe[0]);
        close(stderr_pipe[1]);
        throw std::runtime_error("Failed to fork process");
    }

    if (pid == 0) {
        // Child: route stdout/stderr into the pipes, then become the shell.
        close(stdout_pipe[0]);
        close(stderr_pipe[0]);
        dup2(stdout_pipe[1], STDOUT_FILENO);
        dup2(stderr_pipe[1], STDERR_FILENO);
        close(stdout_pipe[1]);
        close(stderr_pipe[1]);
        execl("/bin/sh", "sh", "-c", command.c_str(), (char*) nullptr);
        _exit(EXIT_FAILURE); // only reached when execl itself failed
    }

    // Parent: drop the write ends so the reads below can reach end-of-file.
    close(stdout_pipe[1]);
    close(stderr_pipe[1]);

    const auto drain = [](int fd, std::string& out) {
        std::array<char, 128> buffer;
        ssize_t bytes_read;
        while ((bytes_read = read(fd, buffer.data(), buffer.size())) > 0) {
            out.append(buffer.data(), bytes_read);
        }
    };

    {
        // stderr is drained on a helper thread while this thread drains stdout.
        auto stderr_reader = std::async(std::launch::async, [&] { drain(stderr_pipe[0], stderr_str); });
        drain(stdout_pipe[0], stdout_str);
        stderr_reader.get();
    }

    close(stdout_pipe[0]);
    close(stderr_pipe[0]);
    waitpid(pid, nullptr, 0);
#endif
}
151
+
152
// Returns true only when `path` can be stat()ed and refers to a directory.
bool directory_exists(const std::string& path) {
    struct stat info;
    if (stat(path.c_str(), &info) != 0) {
        return false; // path is missing or cannot be inspected
    }
    // Compare the whole file-type field. Testing the S_IFDIR bit on its own
    // also matches other file types whose encoding contains that bit
    // (block devices on POSIX, for example).
    return (info.st_mode & S_IFMT) == S_IFDIR;
}
159
+
160
// Creates the directory `path` (a single level; parents are not created).
// Returns true when the directory was created or already exists as a
// directory, false otherwise.
bool create_directory(const std::string& path) {
#ifdef _WIN32
    const bool created = _mkdir(path.c_str()) == 0;
#else
    const bool created = mkdir(path.c_str(), 0755) == 0; // 0755 is the directory permissions
#endif
    if (created) {
        return true;
    }
    if (errno != EEXIST) {
        return false;
    }
    // EEXIST is also reported when the name is taken by something that is not
    // a directory (e.g. a regular file); only an existing directory counts.
    struct stat info;
    return stat(path.c_str(), &info) == 0 && (info.st_mode & S_IFMT) == S_IFDIR;
}
167
+
168
// Returns a copy of `input` with every character passed through std::toupper.
std::string to_uppercase(const std::string& input) {
    std::string result = input;
    for (char& c : result) {
        // std::toupper requires a value representable as unsigned char (or EOF);
        // passing a negative plain char is undefined behaviour.
        c = static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
    }
    return result;
}
175
+
176
// Returns true when `str` ends with `suffix`. An empty suffix always matches.
bool string_ends_with(const std::string& str, const std::string& suffix) {
    if (suffix.size() > str.size()) {
        return false;
    }
    // std::string::compare keeps this independent of <algorithm>, which this
    // file only includes on Windows.
    return str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
}
182
+
183
static const char path_separator = '/';

// Joins a directory and a file name with a single path_separator.
// An empty `path1` yields `path2` unchanged (rather than a path rooted at the
// separator), and a separator already ending `path1` is not doubled.
std::string join_paths(const std::string& path1, const std::string& path2) {
    if (path1.empty()) {
        return path2;
    }
    if (path1.back() == path_separator) {
        return path1 + path2;
    }
    return path1 + path_separator + path2;
}
188
+
189
// Returns the part of `path` after its last '/' or '\\' separator, or the
// whole string when it contains neither.
std::string basename(const std::string &path) {
    const std::string::size_type last_separator = path.find_last_of("/\\");
    if (last_separator == std::string::npos) {
        return path;
    }
    return path.substr(last_separator + 1);
}
192
+
193
+ void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true) {
194
+ std::string name = _name + (fp16 ? "" : "_fp32");
195
+ std::string out_fname = join_paths(output_dir, name + ".spv");
196
+ std::string in_path = join_paths(input_dir, in_fname);
197
+
198
+ #ifdef _WIN32
199
+ std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""};
200
+ #else
201
+ std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname};
202
+ #endif
203
+ for (const auto& define : defines) {
204
+ cmd.push_back("-D" + define.first + "=" + define.second);
205
+ }
206
+
207
+ std::string command;
208
+ for (const auto& part : cmd) {
209
+ command += part + " ";
210
+ }
211
+
212
+ std::string stdout_str, stderr_str;
213
+ try {
214
+ // std::cout << "Executing command: ";
215
+ // for (const auto& part : cmd) {
216
+ // std::cout << part << " ";
217
+ // }
218
+ // std::cout << std::endl;
219
+
220
+ execute_command(command, stdout_str, stderr_str);
221
+ if (!stderr_str.empty()) {
222
+ std::cerr << "cannot compile " << name << "\n\n" << command << "\n\n" << stderr_str << std::endl;
223
+ return;
224
+ }
225
+
226
+ std::lock_guard<std::mutex> guard(lock);
227
+ shader_fnames.push_back(std::make_pair(name, out_fname));
228
+ } catch (const std::exception& e) {
229
+ std::cerr << "Error executing command for " << name << ": " << e.what() << std::endl;
230
+ }
231
+ }
232
+
233
// Returns the union of `a` and `b`. When a key appears in both maps the value
// from `a` is kept.
std::map<std::string, std::string> merge_maps(const std::map<std::string, std::string>& a, const std::map<std::string, std::string>& b) {
    std::map<std::string, std::string> merged(a);
    for (const auto& entry : b) {
        merged.insert(entry); // insert() leaves an existing key untouched
    }
    return merged;
}
238
+
239
+ void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmul_id) {
240
+ std::string load_vec = fp16 ? "8" : "4";
241
+ std::string aligned_b_type_f32 = fp16 ? "mat2x4" : "vec4";
242
+ std::string aligned_b_type_f16 = fp16 ? "f16mat2x4" : "f16vec4";
243
+
244
+ std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", fp16 ? "float16_t" : "float"}};
245
+ std::string shader_name = "matmul";
246
+
247
+ if (matmul_id) {
248
+ base_dict["MUL_MAT_ID"] = "1";
249
+ shader_name = "matmul_id";
250
+ }
251
+
252
+ if (fp16) {
253
+ base_dict["FLOAT16"] = "1";
254
+ }
255
+
256
+ // Shaders with f16 B_TYPE
257
+ tasks.push_back(std::async(std::launch::async, [=] {
258
+ string_to_spv(shader_name + "_f32_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
259
+ }));
260
+ tasks.push_back(std::async(std::launch::async, [=] {
261
+ string_to_spv(shader_name + "_f32_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
262
+ }));
263
+
264
+ tasks.push_back(std::async(std::launch::async, [=] {
265
+ string_to_spv(shader_name + "_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
266
+ }));
267
+ tasks.push_back(std::async(std::launch::async, [=] {
268
+ string_to_spv(shader_name + "_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
269
+ }));
270
+
271
+ for (const auto& tname : type_names) {
272
+ std::string data_a_key = "DATA_A_" + to_uppercase(tname);
273
+ // For unaligned, load one at a time for f32/f16, or two at a time for quants
274
+ std::string load_vec_a_unaligned = (tname == "f32" || tname == "f16") ? "1" : "2";
275
+ // For aligned matmul loads
276
+ std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2";
277
+ tasks.push_back(std::async(std::launch::async, [=] {
278
+ string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
279
+ }));
280
+ tasks.push_back(std::async(std::launch::async, [=] {
281
+ string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16);
282
+ }));
283
+ }
284
+ }
285
+
286
+ void process_shaders(std::vector<std::future<void>>& tasks) {
287
+ std::cout << "ggml_vulkan: Generating and compiling shaders to SPIR-V" << std::endl;
288
+ std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}};
289
+
290
+ for (const auto& fp16 : {false, true}) {
291
+ matmul_shaders(tasks, fp16, false);
292
+ matmul_shaders(tasks, fp16, true);
293
+ }
294
+
295
+ for (const auto& tname : type_names) {
296
+ // mul mat vec
297
+ std::string data_a_key = "DATA_A_" + to_uppercase(tname);
298
+ std::string shader = (string_ends_with(tname, "_k")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
299
+
300
+ tasks.push_back(std::async(std::launch::async, [=] {
301
+ string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
302
+ }));
303
+ tasks.push_back(std::async(std::launch::async, [=] {
304
+ string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
305
+ }));
306
+
307
+ tasks.push_back(std::async(std::launch::async, [=] {
308
+ string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
309
+ }));
310
+
311
+ // Dequant shaders
312
+ if (tname != "f16") {
313
+ tasks.push_back(std::async(std::launch::async, [=] {
314
+ string_to_spv("dequant_" + tname, "dequant_" + tname + ".comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float16_t"}}));
315
+ }));
316
+ }
317
+
318
+ if (!string_ends_with(tname, "_k")) {
319
+ shader = (tname == "f32" || tname == "f16") ? "get_rows.comp" : "get_rows_quant.comp";
320
+
321
+ if (tname == "f16") {
322
+ tasks.push_back(std::async(std::launch::async, [=] {
323
+ string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
324
+ }));
325
+ } else {
326
+ tasks.push_back(std::async(std::launch::async, [=] {
327
+ string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}});
328
+ }));
329
+ }
330
+ tasks.push_back(std::async(std::launch::async, [=] {
331
+ string_to_spv("get_rows_" + tname + "_f32", shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}});
332
+ }));
333
+ }
334
+ }
335
+
336
+ tasks.push_back(std::async(std::launch::async, [] {
337
+ string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
338
+ }));
339
+ tasks.push_back(std::async(std::launch::async, [] {
340
+ string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
341
+ }));
342
+
343
+ // Norms
344
+ tasks.push_back(std::async(std::launch::async, [=] {
345
+ string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
346
+ }));
347
+ tasks.push_back(std::async(std::launch::async, [=] {
348
+ string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
349
+ }));
350
+ tasks.push_back(std::async(std::launch::async, [=] {
351
+ string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
352
+ }));
353
+
354
+ tasks.push_back(std::async(std::launch::async, [] {
355
+ string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
356
+ }));
357
+ tasks.push_back(std::async(std::launch::async, [] {
358
+ string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
359
+ }));
360
+ tasks.push_back(std::async(std::launch::async, [] {
361
+ string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
362
+ }));
363
+
364
+ tasks.push_back(std::async(std::launch::async, [] {
365
+ string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
366
+ }));
367
+ tasks.push_back(std::async(std::launch::async, [] {
368
+ string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
369
+ }));
370
+
371
+ tasks.push_back(std::async(std::launch::async, [] {
372
+ string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
373
+ }));
374
+
375
+ tasks.push_back(std::async(std::launch::async, [] {
376
+ string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
377
+ }));
378
+
379
+ tasks.push_back(std::async(std::launch::async, [] {
380
+ string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
381
+ }));
382
+
383
+ tasks.push_back(std::async(std::launch::async, [] {
384
+ string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
385
+ }));
386
+
387
+ tasks.push_back(std::async(std::launch::async, [] {
388
+ string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
389
+ }));
390
+
391
+ tasks.push_back(std::async(std::launch::async, [] {
392
+ string_to_spv("sin_f32", "sin.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
393
+ }));
394
+
395
+ tasks.push_back(std::async(std::launch::async, [] {
396
+ string_to_spv("cos_f32", "cos.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
397
+ }));
398
+
399
+ tasks.push_back(std::async(std::launch::async, [] {
400
+ string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
401
+ }));
402
+
403
+ tasks.push_back(std::async(std::launch::async, [] {
404
+ string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
405
+ }));
406
+
407
+ tasks.push_back(std::async(std::launch::async, [] {
408
+ string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
409
+ }));
410
+ tasks.push_back(std::async(std::launch::async, [] {
411
+ string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
412
+ }));
413
+ tasks.push_back(std::async(std::launch::async, [] {
414
+ string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}});
415
+ }));
416
+
417
+ tasks.push_back(std::async(std::launch::async, [] {
418
+ string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
419
+ }));
420
+
421
+ tasks.push_back(std::async(std::launch::async, [] {
422
+ string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
423
+ }));
424
+ tasks.push_back(std::async(std::launch::async, [] {
425
+ string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
426
+ }));
427
+ tasks.push_back(std::async(std::launch::async, [] {
428
+ string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
429
+ }));
430
+ tasks.push_back(std::async(std::launch::async, [] {
431
+ string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
432
+ }));
433
+ tasks.push_back(std::async(std::launch::async, [] {
434
+ string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
435
+ }));
436
+ tasks.push_back(std::async(std::launch::async, [] {
437
+ string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
438
+ }));
439
+
440
+ tasks.push_back(std::async(std::launch::async, [] {
441
+ string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
442
+ }));
443
+
444
+ tasks.push_back(std::async(std::launch::async, [=] {
445
+ string_to_spv("soft_max_f32", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
446
+ }));
447
+ tasks.push_back(std::async(std::launch::async, [=] {
448
+ string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
449
+ }));
450
+
451
+ tasks.push_back(std::async(std::launch::async, [] {
452
+ string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
453
+ }));
454
+ tasks.push_back(std::async(std::launch::async, [] {
455
+ string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
456
+ }));
457
+
458
+ tasks.push_back(std::async(std::launch::async, [] {
459
+ string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
460
+ }));
461
+ tasks.push_back(std::async(std::launch::async, [] {
462
+ string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
463
+ }));
464
+
465
+ tasks.push_back(std::async(std::launch::async, [] {
466
+ string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}});
467
+ }));
468
+
469
+ tasks.push_back(std::async(std::launch::async, [=] {
470
+ string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
471
+ }));
472
+
473
+ tasks.push_back(std::async(std::launch::async, [=] {
474
+ string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
475
+ }));
476
+ tasks.push_back(std::async(std::launch::async, [=] {
477
+ string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
478
+ }));
479
+
480
+ tasks.push_back(std::async(std::launch::async, [=] {
481
+ string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
482
+ }));
483
+ }
484
+
485
+ void write_output_files() {
486
+ FILE* hdr = fopen(target_hpp.c_str(), "w");
487
+ FILE* src = fopen(target_cpp.c_str(), "w");
488
+
489
+ fprintf(hdr, "#include <cstdint>\n\n");
490
+ fprintf(src, "#include \"%s\"\n\n", basename(target_hpp).c_str());
491
+
492
+ for (const auto& pair : shader_fnames) {
493
+ const std::string& name = pair.first;
494
+ #ifdef _WIN32
495
+ std::string path = pair.second;
496
+ std::replace(path.begin(), path.end(), '/', '\\' );
497
+ #else
498
+ const std::string& path = pair.second;
499
+ #endif
500
+
501
+ FILE* spv = fopen(path.c_str(), "rb");
502
+ if (!spv) {
503
+ std::cerr << "Error opening SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
504
+ continue;
505
+ }
506
+
507
+ fseek(spv, 0, SEEK_END);
508
+ size_t size = ftell(spv);
509
+ fseek(spv, 0, SEEK_SET);
510
+
511
+ std::vector<unsigned char> data(size);
512
+ size_t read_size = fread(data.data(), 1, size, spv);
513
+ fclose(spv);
514
+ if (read_size != size) {
515
+ std::cerr << "Error reading SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
516
+ continue;
517
+ }
518
+
519
+ fprintf(hdr, "extern unsigned char %s_data[%zu];\n", name.c_str(), size);
520
+ fprintf(hdr, "const uint64_t %s_len = %zu;\n\n", name.c_str(), size);
521
+
522
+ fprintf(src, "unsigned char %s_data[%zu] = {\n", name.c_str(), size);
523
+ for (size_t i = 0; i < size; ++i) {
524
+ fprintf(src, "0x%02x,", data[i]);
525
+ if ((i + 1) % 12 == 0) fprintf(src, "\n");
526
+ }
527
+ fprintf(src, "\n};\n\n");
528
+
529
+ if (!no_clean) {
530
+ std::remove(path.c_str());
531
+ }
532
+ }
533
+
534
+ fclose(hdr);
535
+ fclose(src);
536
+ }
537
+
538
+ int main(int argc, char** argv) {
539
+ std::map<std::string, std::string> args;
540
+ for (int i = 1; i < argc; i += 2) {
541
+ if (i + 1 < argc) {
542
+ args[argv[i]] = argv[i + 1];
543
+ }
544
+ }
545
+
546
+ if (args.find("--glslc") != args.end()) {
547
+ GLSLC = args["--glslc"]; // Path to glslc
548
+ }
549
+ if (args.find("--input-dir") != args.end()) {
550
+ input_dir = args["--input-dir"]; // Directory containing shader sources
551
+ }
552
+ if (args.find("--output-dir") != args.end()) {
553
+ output_dir = args["--output-dir"]; // Directory for containing SPIR-V output
554
+ }
555
+ if (args.find("--target-hpp") != args.end()) {
556
+ target_hpp = args["--target-hpp"]; // Path to generated header file
557
+ }
558
+ if (args.find("--target-cpp") != args.end()) {
559
+ target_cpp = args["--target-cpp"]; // Path to generated cpp file
560
+ }
561
+ if (args.find("--no-clean") != args.end()) {
562
+ no_clean = true; // Keep temporary SPIR-V files in output-dir after build
563
+ }
564
+
565
+ if (!directory_exists(input_dir)) {
566
+ std::cerr << "\"" << input_dir << "\" must be a valid directory containing shader sources" << std::endl;
567
+ return EXIT_FAILURE;
568
+ }
569
+
570
+ if (!directory_exists(output_dir)) {
571
+ if (!create_directory(output_dir)) {
572
+ std::cerr << "Error creating output directory: " << output_dir << "\n";
573
+ return EXIT_FAILURE;
574
+ }
575
+ }
576
+
577
+ std::vector<std::future<void>> tasks;
578
+ process_shaders(tasks);
579
+
580
+ for (auto& task : tasks) {
581
+ task.get();
582
+ }
583
+
584
+ write_output_files();
585
+
586
+ return EXIT_SUCCESS;
587
+ }
scripts/sync-ggml-am.sh CHANGED
@@ -65,6 +65,7 @@ while read c; do
65
  src/ggml-cann/* \
66
  src/ggml-cuda/* \
67
  src/ggml-sycl/* \
 
68
  include/ggml*.h \
69
  examples/common.h \
70
  examples/common.cpp \
 
65
  src/ggml-cann/* \
66
  src/ggml-cuda/* \
67
  src/ggml-sycl/* \
68
+ src/vulkan-shaders/* \
69
  include/ggml*.h \
70
  examples/common.h \
71
  examples/common.cpp \
scripts/sync-ggml.last CHANGED
@@ -1 +1 @@
1
- a735a7b5fce27d23c2a6b0b3ccbb47b2c51e83e7
 
1
+ 46e22f59eaf0aaa38a8e525fd89ba95e39ba7435