Spaces:
Sleeping
Sleeping
sync : vulkan (skip) (llama/0)
Browse files- ggml/CMakeLists.txt +1 -0
- ggml/src/CMakeLists.txt +4 -0
- ggml/src/ggml-vulkan.cpp +0 -0
- ggml/src/vulkan-shaders/acc.comp +24 -0
- ggml/src/vulkan-shaders/concat.comp +5 -1
- ggml/src/vulkan-shaders/mul_mat_vec.comp +1 -2
- ggml/src/vulkan-shaders/mul_mat_vec_nc.comp +1 -1
- ggml/src/vulkan-shaders/mul_mat_vec_p021.comp +1 -1
- ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp +18 -17
- ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp +10 -9
- ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp +24 -21
- ggml/src/vulkan-shaders/mul_mat_vec_q5_k.comp +27 -29
- ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp +13 -13
- ggml/src/vulkan-shaders/mul_mm.comp +8 -7
- ggml/src/vulkan-shaders/repeat.comp +24 -0
- ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +7 -7
ggml/CMakeLists.txt
CHANGED
|
@@ -135,6 +135,7 @@ option(GGML_VULKAN "ggml: use Vulkan"
|
|
| 135 |
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
| 136 |
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
|
| 137 |
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
|
|
|
|
| 138 |
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
|
| 139 |
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
|
| 140 |
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
|
|
|
|
| 135 |
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
| 136 |
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
|
| 137 |
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
|
| 138 |
+
option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF)
|
| 139 |
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
|
| 140 |
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
|
| 141 |
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
|
ggml/src/CMakeLists.txt
CHANGED
|
@@ -612,6 +612,10 @@ if (GGML_VULKAN)
|
|
| 612 |
add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
|
| 613 |
endif()
|
| 614 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 615 |
if (GGML_VULKAN_VALIDATE)
|
| 616 |
add_compile_definitions(GGML_VULKAN_VALIDATE)
|
| 617 |
endif()
|
|
|
|
| 612 |
add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
|
| 613 |
endif()
|
| 614 |
|
| 615 |
+
if (GGML_VULKAN_PERF)
|
| 616 |
+
add_compile_definitions(GGML_VULKAN_PERF)
|
| 617 |
+
endif()
|
| 618 |
+
|
| 619 |
if (GGML_VULKAN_VALIDATE)
|
| 620 |
add_compile_definitions(GGML_VULKAN_VALIDATE)
|
| 621 |
endif()
|
ggml/src/ggml-vulkan.cpp
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ggml/src/vulkan-shaders/acc.comp
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450
|
| 2 |
+
|
| 3 |
+
#include "types.comp"
|
| 4 |
+
#include "generic_binary_head.comp"
|
| 5 |
+
|
| 6 |
+
void main() { // One invocation per element: writes src0[idx], plus a src1 element when the shifted index falls inside src1's extents.
|
| 7 |
+
const uint idx = gl_GlobalInvocationID.x; // flat element index handled by this invocation
|
| 8 |
+
if (idx >= p.ne) { // invocations at or past p.ne do nothing
|
| 9 |
+
return;
|
| 10 |
+
}
|
| 11 |
+
|
| 12 |
+
const uint offset = p.param3; // shift that is subtracted from idx on the next line
|
| 13 |
+
const uint src1_i = idx - offset; // uint arithmetic: wraps around when idx < offset. NOTE(review): wrapped values are presumably rejected by the extent check below - confirm
|
| 14 |
+
const uint oz = src1_i / p.nb02; // split src1_i into (ox, oy, oz) using p.nb02 and p.nb01 as divisors
|
| 15 |
+
const uint oy = (src1_i - (oz * p.nb02)) / p.nb01;
|
| 16 |
+
const uint ox = src1_i % p.nb01;
|
| 17 |
+
|
| 18 |
+
if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) { // coordinate lies inside the ne10 x ne11 x ne12 extents
|
| 19 |
+
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11])); // src0 + src1, with src1 indexed densely by (ox, oy, oz)
|
| 20 |
+
} else {
|
| 21 |
+
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)])); // outside the extents: pass the src0 element through unchanged
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
|
ggml/src/vulkan-shaders/concat.comp
CHANGED
|
@@ -30,6 +30,10 @@ void main() {
|
|
| 30 |
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
| 31 |
data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]);
|
| 32 |
#else
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
#endif
|
| 35 |
}
|
|
|
|
| 30 |
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
| 31 |
data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]);
|
| 32 |
#else
|
| 33 |
+
if (is_src0) {
|
| 34 |
+
data_d[p.d_offset + dst_idx] = data_a[src0_idx];
|
| 35 |
+
} else {
|
| 36 |
+
data_d[p.d_offset + dst_idx] = data_b[src1_idx];
|
| 37 |
+
}
|
| 38 |
#endif
|
| 39 |
}
|
ggml/src/vulkan-shaders/mul_mat_vec.comp
CHANGED
|
@@ -39,8 +39,7 @@ void main() {
|
|
| 39 |
vec2 v = dequantize(ib, iqs, a_offset / QUANT_K);
|
| 40 |
|
| 41 |
// matrix multiplication
|
| 42 |
-
tmp[tid]
|
| 43 |
-
FLOAT_TYPE(v.y) * FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]);
|
| 44 |
}
|
| 45 |
|
| 46 |
// sum up partial sums and write back result
|
|
|
|
| 39 |
vec2 v = dequantize(ib, iqs, a_offset / QUANT_K);
|
| 40 |
|
| 41 |
// matrix multiplication
|
| 42 |
+
tmp[tid] = fma(FLOAT_TYPE(v.x), FLOAT_TYPE(data_b[b_offset + iybs + iqs]), fma(FLOAT_TYPE(v.y), FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]), tmp[tid]));
|
|
|
|
| 43 |
}
|
| 44 |
|
| 45 |
// sum up partial sums and write back result
|
ggml/src/vulkan-shaders/mul_mat_vec_nc.comp
CHANGED
|
@@ -53,7 +53,7 @@ void main() {
|
|
| 53 |
|
| 54 |
const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]);
|
| 55 |
|
| 56 |
-
tmp[tid]
|
| 57 |
}
|
| 58 |
|
| 59 |
// sum up partial sums and write back result
|
|
|
|
| 53 |
|
| 54 |
const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]);
|
| 55 |
|
| 56 |
+
tmp[tid] = fma(xi, FLOAT_TYPE(data_b[iy]), tmp[tid]);
|
| 57 |
}
|
| 58 |
|
| 59 |
// sum up partial sums and write back result
|
ggml/src/vulkan-shaders/mul_mat_vec_p021.comp
CHANGED
|
@@ -52,7 +52,7 @@ void main() {
|
|
| 52 |
// y is not transposed but permuted
|
| 53 |
const uint iy = channel*nrows_y + row_y;
|
| 54 |
|
| 55 |
-
tmp[tid]
|
| 56 |
}
|
| 57 |
|
| 58 |
// dst is not transposed and not permuted
|
|
|
|
| 52 |
// y is not transposed but permuted
|
| 53 |
const uint iy = channel*nrows_y + row_y;
|
| 54 |
|
| 55 |
+
tmp[tid] = fma(xi, FLOAT_TYPE(data_b[iy]), tmp[tid]);
|
| 56 |
}
|
| 57 |
|
| 58 |
// dst is not transposed and not permuted
|
ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp
CHANGED
|
@@ -39,24 +39,25 @@ void main() {
|
|
| 39 |
FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
|
| 40 |
FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
|
| 41 |
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
| 42 |
-
sum1
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
sum2
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
}
|
| 59 |
-
|
|
|
|
| 60 |
}
|
| 61 |
|
| 62 |
// sum up partial sums and write back result
|
|
|
|
| 39 |
FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
|
| 40 |
FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
|
| 41 |
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
| 42 |
+
sum1 = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 0) & 3),
|
| 43 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 0) & 3),
|
| 44 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 2) & 3),
|
| 45 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 2) & 3),
|
| 46 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 4) & 3),
|
| 47 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 4) & 3),
|
| 48 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 6) & 3),
|
| 49 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 6) & 3), sum1))))))));
|
| 50 |
+
sum2 = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 0] >> 4) & 0xF),
|
| 51 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 1] >> 4) & 0xF),
|
| 52 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 2] >> 4) & 0xF),
|
| 53 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 3] >> 4) & 0xF),
|
| 54 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 4] >> 4) & 0xF),
|
| 55 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 5] >> 4) & 0xF),
|
| 56 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 6] >> 4) & 0xF),
|
| 57 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 7] >> 4) & 0xF), sum2))))))));
|
| 58 |
}
|
| 59 |
+
const uint tmp_idx = 16 * ix + tid;
|
| 60 |
+
tmp[tmp_idx] = fma(dall, sum1, fma(-dmin, sum2, tmp[tmp_idx]));
|
| 61 |
}
|
| 62 |
|
| 63 |
// sum up partial sums and write back result
|
ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp
CHANGED
|
@@ -40,16 +40,17 @@ void main() {
|
|
| 40 |
|
| 41 |
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
| 42 |
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
| 43 |
-
sum
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
}
|
| 52 |
-
|
|
|
|
| 53 |
}
|
| 54 |
|
| 55 |
// sum up partial sums and write back result
|
|
|
|
| 40 |
|
| 41 |
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
| 42 |
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
| 43 |
+
sum = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[0] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)),
|
| 44 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[2] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)),
|
| 45 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[4] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)),
|
| 46 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[6] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4)),
|
| 47 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[1] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)),
|
| 48 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[3] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)),
|
| 49 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[5] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
|
| 50 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[7] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
|
| 51 |
}
|
| 52 |
+
const uint tmp_idx = 16 * ix + tid;
|
| 53 |
+
tmp[tmp_idx] = fma(d, sum, tmp[tmp_idx]);
|
| 54 |
}
|
| 55 |
|
| 56 |
// sum up partial sums and write back result
|
ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp
CHANGED
|
@@ -67,17 +67,17 @@ void main() {
|
|
| 67 |
const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 66] >> 4);
|
| 68 |
const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 67] >> 4);
|
| 69 |
|
| 70 |
-
const FLOAT_TYPE sx =
|
| 71 |
-
const FLOAT_TYPE sy =
|
| 72 |
-
const FLOAT_TYPE sz =
|
| 73 |
-
const FLOAT_TYPE sw =
|
| 74 |
-
const FLOAT_TYPE smin =
|
| 75 |
-
FLOAT_TYPE(data_b[b_offset + y1_idx ])
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
tmp[
|
| 81 |
#else
|
| 82 |
const uint8_t q4_0 = uint8_t(data_a[ib0 + i].qs[q_offset ] & 0xf);
|
| 83 |
const uint8_t q4_1 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] & 0xf);
|
|
@@ -88,16 +88,19 @@ void main() {
|
|
| 88 |
const uint8_t q4_6 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] >> 4);
|
| 89 |
const uint8_t q4_7 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] >> 4);
|
| 90 |
|
| 91 |
-
const FLOAT_TYPE sx =
|
| 92 |
-
const FLOAT_TYPE sy =
|
| 93 |
-
const FLOAT_TYPE sz =
|
| 94 |
-
const FLOAT_TYPE sw =
|
| 95 |
-
const FLOAT_TYPE smin =
|
| 96 |
-
FLOAT_TYPE(data_b[b_offset + y1_idx])
|
| 97 |
-
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 1])
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
| 101 |
#endif
|
| 102 |
}
|
| 103 |
|
|
|
|
| 67 |
const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 66] >> 4);
|
| 68 |
const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 67] >> 4);
|
| 69 |
|
| 70 |
+
const FLOAT_TYPE sx = fma(FLOAT_TYPE(data_b[b_offset + y1_idx]), q4_0, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), q4_1, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 2]), q4_2, FLOAT_TYPE(data_b[b_offset + y1_idx + 3]) * q4_3)));
|
| 71 |
+
const FLOAT_TYPE sy = fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), q4_4, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), q4_5, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 34]), q4_6, FLOAT_TYPE(data_b[b_offset + y1_idx + 35]) * q4_7)));
|
| 72 |
+
const FLOAT_TYPE sz = fma(FLOAT_TYPE(data_b[b_offset + y2_idx]), q4_8, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), q4_9, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 2]), q4_10, FLOAT_TYPE(data_b[b_offset + y2_idx + 3]) * q4_11)));
|
| 73 |
+
const FLOAT_TYPE sw = fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), q4_12, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 33]), q4_13, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 34]), q4_14, FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * q4_15)));
|
| 74 |
+
const FLOAT_TYPE smin =
|
| 75 |
+
fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), sc7,
|
| 76 |
+
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 33]), sc7,
|
| 77 |
+
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 2]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 34]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 2]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 34]), sc7,
|
| 78 |
+
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 3]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 35]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 3]), sc6, FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * sc7)))))))))))))));
|
| 79 |
+
const uint tmp_idx = 16 * ix + tid;
|
| 80 |
+
tmp[tmp_idx] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, tmp[tmp_idx]));
|
| 81 |
#else
|
| 82 |
const uint8_t q4_0 = uint8_t(data_a[ib0 + i].qs[q_offset ] & 0xf);
|
| 83 |
const uint8_t q4_1 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] & 0xf);
|
|
|
|
| 88 |
const uint8_t q4_6 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] >> 4);
|
| 89 |
const uint8_t q4_7 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] >> 4);
|
| 90 |
|
| 91 |
+
const FLOAT_TYPE sx = fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), q4_0, FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * q4_1);
|
| 92 |
+
const FLOAT_TYPE sy = fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), q4_2, FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * q4_3);
|
| 93 |
+
const FLOAT_TYPE sz = fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), q4_4, FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * q4_5);
|
| 94 |
+
const FLOAT_TYPE sw = fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), q4_6, FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * q4_7);
|
| 95 |
+
const FLOAT_TYPE smin =
|
| 96 |
+
fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), sc7,
|
| 97 |
+
+ fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), sc6, FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * sc7)))))));
|
| 98 |
+
|
| 99 |
+
// #else branch: fold this block's contribution, dall * (sx*s0 + sy*s1 + sz*s4 + sw*s5) - dmin * smin, into tmp exactly once.
|
| 100 |
+
// The legacy `tmp[16 * ix + tid] += ...` statement was dropped here: keeping it next to the fma form below accumulated the term twice.
|
| 101 |
+
const uint tmp_idx = 16 * ix + tid;
|
| 102 |
+
tmp[tmp_idx] = fma(dall, (fma(sx, FLOAT_TYPE(data_a[ib0 + i].scales[v_im] & 0x3f), fma(sy, FLOAT_TYPE(data_a[ib0 + i].scales[v_im + 1] & 0x3f),
|
| 103 |
+
fma(sz, FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 4] & 0x0f) | ((data_a[ib0 + i].scales[v_im] & 0xc0) >> 2)), sw * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 5] & 0x0f) | ((data_a[ib0 + i].scales[v_im + 1] & 0xc0) >> 2)))))), fma(-dmin, smin, tmp[tmp_idx]));
|
| 104 |
#endif
|
| 105 |
}
|
| 106 |
|
ggml/src/vulkan-shaders/mul_mat_vec_q5_k.comp
CHANGED
|
@@ -66,35 +66,33 @@ void main() {
|
|
| 66 |
const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 80] >> 4);
|
| 67 |
const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 81] >> 4);
|
| 68 |
|
| 69 |
-
const FLOAT_TYPE sx =
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
+ FLOAT_TYPE(data_b[b_offset +
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
);
|
| 97 |
-
tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * sc0 + sy * sc1 + sz * sc4 + sw * sc5) - dmin * smin);
|
| 98 |
}
|
| 99 |
|
| 100 |
// sum up partial sums and write back result
|
|
|
|
| 66 |
const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 80] >> 4);
|
| 67 |
const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 81] >> 4);
|
| 68 |
|
| 69 |
+
const FLOAT_TYPE sx =
|
| 70 |
+
fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), (q4_0 + (((data_a[ib0 + i].qh[l0 ] & hm1) != 0) ? 16 : 0)),
|
| 71 |
+
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), (q4_1 + (((data_a[ib0 + i].qh[l0 + 1] & hm1) != 0) ? 16 : 0)),
|
| 72 |
+
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 16]), (q4_2 + (((data_a[ib0 + i].qh[l0 + 16] & hm1) != 0) ? 16 : 0)),
|
| 73 |
+
FLOAT_TYPE(data_b[b_offset + y1_idx + 17]) * (q4_3 + (((data_a[ib0 + i].qh[l0 + 17] & hm1) != 0) ? 16 : 0)))));
|
| 74 |
+
const FLOAT_TYPE sy =
|
| 75 |
+
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), (q4_4 + (((data_a[ib0 + i].qh[l0 ] & (hm1 << 1)) != 0) ? 16 : 0)),
|
| 76 |
+
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), (q4_5 + (((data_a[ib0 + i].qh[l0 + 1] & (hm1 << 1)) != 0) ? 16 : 0)),
|
| 77 |
+
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 48]), (q4_6 + (((data_a[ib0 + i].qh[l0 + 16] & (hm1 << 1)) != 0) ? 16 : 0)),
|
| 78 |
+
FLOAT_TYPE(data_b[b_offset + y1_idx + 49]) * (q4_7 + (((data_a[ib0 + i].qh[l0 + 17] & (hm1 << 1)) != 0) ? 16 : 0)))));
|
| 79 |
+
const FLOAT_TYPE sz =
|
| 80 |
+
fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), (q4_8 + (((data_a[ib0 + i].qh[l0 ] & hm2) != 0) ? 16 : 0)),
|
| 81 |
+
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), (q4_9 + (((data_a[ib0 + i].qh[l0 + 1] & hm2) != 0) ? 16 : 0)),
|
| 82 |
+
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 16]), (q4_10 + (((data_a[ib0 + i].qh[l0 + 16] & hm2) != 0) ? 16 : 0)),
|
| 83 |
+
FLOAT_TYPE(data_b[b_offset + y2_idx + 17]) * (q4_11 + (((data_a[ib0 + i].qh[l0 + 17] & hm2) != 0) ? 16 : 0)))));
|
| 84 |
+
const FLOAT_TYPE sw =
|
| 85 |
+
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), (q4_12 + (((data_a[ib0 + i].qh[l0 ] & (hm2 << 1)) != 0) ? 16 : 0)),
|
| 86 |
+
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 33]), (q4_13 + (((data_a[ib0 + i].qh[l0 + 1] & (hm2 << 1)) != 0) ? 16 : 0)),
|
| 87 |
+
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 48]), (q4_14 + (((data_a[ib0 + i].qh[l0 + 16] & (hm2 << 1)) != 0) ? 16 : 0)),
|
| 88 |
+
FLOAT_TYPE(data_b[b_offset + y2_idx + 49]) * (q4_15 + (((data_a[ib0 + i].qh[l0 + 17] & (hm2 << 1)) != 0) ? 16 : 0)))));
|
| 89 |
+
const FLOAT_TYPE smin =
|
| 90 |
+
fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 1 ]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 17]), sc2,
|
| 91 |
+
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 49]), sc3,
|
| 92 |
+
fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 1 ]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 17]), sc6,
|
| 93 |
+
(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 49])) * sc7)));
|
| 94 |
+
const uint tmp_idx = 16 * ix + tid;
|
| 95 |
+
tmp[tmp_idx] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, tmp[tmp_idx]));
|
|
|
|
|
|
|
| 96 |
}
|
| 97 |
|
| 98 |
// sum up partial sums and write back result
|
ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp
CHANGED
|
@@ -44,22 +44,22 @@ void main() {
|
|
| 44 |
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
|
| 45 |
|
| 46 |
#if K_QUANTS_PER_ITERATION == 1
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
#else
|
| 57 |
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
| 58 |
[[unroll]] for (int l = 0; l < 4; ++l) {
|
| 59 |
-
sum
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
}
|
| 64 |
tmp[16 * ix + tid] += sum;
|
| 65 |
#endif
|
|
|
|
| 44 |
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
|
| 45 |
|
| 46 |
#if K_QUANTS_PER_ITERATION == 1
|
| 47 |
+
const uint tmp_idx = 16 * ix + tid;
|
| 48 |
+
tmp[tmp_idx] = fma(FLOAT_TYPE(data_b[b_offset + y_idx + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x03) << 4)) - 32),
|
| 49 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x03) << 4)) - 32),
|
| 50 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x0c) << 2)) - 32),
|
| 51 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x0c) << 2)) - 32),
|
| 52 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x30) >> 0)) - 32),
|
| 53 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x30) >> 0)) - 32),
|
| 54 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0xc0) >> 2)) - 32),
|
| 55 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0xc0) >> 2)) - 32), tmp[tmp_idx]))))))));
|
| 56 |
#else
|
| 57 |
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
| 58 |
[[unroll]] for (int l = 0; l < 4; ++l) {
|
| 59 |
+
sum = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+ 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 0) & 3) << 4)) - 32),
|
| 60 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 2) & 3) << 4)) - 32),
|
| 61 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 4) & 3) << 4)) - 32),
|
| 62 |
+
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 6) & 3) << 4)) - 32), sum))));
|
| 63 |
}
|
| 64 |
tmp[16 * ix + tid] += sum;
|
| 65 |
#endif
|
ggml/src/vulkan-shaders/mul_mm.comp
CHANGED
|
@@ -326,10 +326,10 @@ void main() {
|
|
| 326 |
mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4));
|
| 327 |
}
|
| 328 |
const float d = loadd.x * sc;
|
| 329 |
-
const float m = loadd.y * mbyte;
|
| 330 |
|
| 331 |
-
buf_a[buf_idx ] = FLOAT_TYPE(d
|
| 332 |
-
buf_a[buf_idx + 1] = FLOAT_TYPE(d
|
| 333 |
#elif defined(DATA_A_Q5_K)
|
| 334 |
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
| 335 |
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
|
|
@@ -357,10 +357,10 @@ void main() {
|
|
| 357 |
mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4));
|
| 358 |
}
|
| 359 |
const float d = loadd.x * sc;
|
| 360 |
-
const float m = loadd.y * mbyte;
|
| 361 |
|
| 362 |
-
buf_a[buf_idx ] = FLOAT_TYPE(d
|
| 363 |
-
buf_a[buf_idx + 1] = FLOAT_TYPE(d
|
| 364 |
#elif defined(DATA_A_Q6_K)
|
| 365 |
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
| 366 |
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
|
|
@@ -463,7 +463,8 @@ void main() {
|
|
| 463 |
[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
|
| 464 |
[[unroll]] for (uint cc = 0; cc < TN; cc++) {
|
| 465 |
[[unroll]] for (uint cr = 0; cr < TM; cr++) {
|
| 466 |
-
|
|
|
|
| 467 |
}
|
| 468 |
}
|
| 469 |
}
|
|
|
|
| 326 |
mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4));
|
| 327 |
}
|
| 328 |
const float d = loadd.x * sc;
|
| 329 |
+
const float m = -loadd.y * mbyte;
|
| 330 |
|
| 331 |
+
buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF), m));
|
| 332 |
+
buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF), m));
|
| 333 |
#elif defined(DATA_A_Q5_K)
|
| 334 |
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
| 335 |
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
|
|
|
|
| 357 |
mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4));
|
| 358 |
}
|
| 359 |
const float d = loadd.x * sc;
|
| 360 |
+
const float m = -loadd.y * mbyte;
|
| 361 |
|
| 362 |
+
buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi ] & hm) != 0 ? 16 : 0), m));
|
| 363 |
+
buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0), m));
|
| 364 |
#elif defined(DATA_A_Q6_K)
|
| 365 |
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
| 366 |
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
|
|
|
|
| 463 |
[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
|
| 464 |
[[unroll]] for (uint cc = 0; cc < TN; cc++) {
|
| 465 |
[[unroll]] for (uint cr = 0; cr < TM; cr++) {
|
| 466 |
+
const uint sums_idx = (wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr;
|
| 467 |
+
sums[sums_idx] = fma(float(cache_a[wsir * TM + cr]), float(cache_b[wsic * TN + cc]), sums[sums_idx]);
|
| 468 |
}
|
| 469 |
}
|
| 470 |
}
|
ggml/src/vulkan-shaders/repeat.comp
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version 450
|
| 2 |
+
|
| 3 |
+
#include "types.comp"
|
| 4 |
+
#include "generic_unary_head.comp"
|
| 5 |
+
|
| 6 |
+
uint src0_idx_mod(uint idx) { // Maps flat index idx to a src0 offset: split idx by ne10/ne11/ne12, wrap each coordinate modulo ne00..ne03, then scale by nb00..nb03.
|
| 7 |
+
const uint i13 = idx / (p.ne12*p.ne11*p.ne10); // outermost coordinate
|
| 8 |
+
const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; // elements consumed by i13, removed before the next division
|
| 9 |
+
const uint i12 = (idx - i13_offset) / (p.ne11*p.ne10);
|
| 10 |
+
const uint i12_offset = i12*p.ne11*p.ne10;
|
| 11 |
+
const uint i11 = (idx - i13_offset - i12_offset) / p.ne10;
|
| 12 |
+
const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; // remainder is the innermost coordinate
|
| 13 |
+
return (i13 % p.ne03)*p.nb03 + (i12 % p.ne02)*p.nb02 + (i11 % p.ne01)*p.nb01 + (i10 % p.ne00)*p.nb00; // modulo makes coordinates beyond src0's extents wrap back into src0
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
void main() { // One invocation per element: copies the src0 element selected by src0_idx_mod into the destination.
|
| 17 |
+
const uint idx = get_idx(); // flat element index for this invocation
|
| 18 |
+
|
| 19 |
+
if (idx >= p.ne) { // invocations at or past p.ne do nothing
|
| 20 |
+
return;
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx_mod(idx)]); // read goes through the wrapping index, write through dst_idx
|
| 24 |
+
}
|
ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
CHANGED
|
@@ -369,31 +369,31 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
|
|
| 369 |
}));
|
| 370 |
|
| 371 |
tasks.push_back(std::async(std::launch::async, [] {
|
| 372 |
-
string_to_spv("
|
| 373 |
}));
|
| 374 |
|
| 375 |
tasks.push_back(std::async(std::launch::async, [] {
|
| 376 |
-
string_to_spv("
|
| 377 |
}));
|
| 378 |
|
| 379 |
tasks.push_back(std::async(std::launch::async, [] {
|
| 380 |
-
string_to_spv("
|
| 381 |
}));
|
| 382 |
|
| 383 |
tasks.push_back(std::async(std::launch::async, [] {
|
| 384 |
-
string_to_spv("
|
| 385 |
}));
|
| 386 |
|
| 387 |
tasks.push_back(std::async(std::launch::async, [] {
|
| 388 |
-
string_to_spv("
|
| 389 |
}));
|
| 390 |
|
| 391 |
tasks.push_back(std::async(std::launch::async, [] {
|
| 392 |
-
string_to_spv("
|
| 393 |
}));
|
| 394 |
|
| 395 |
tasks.push_back(std::async(std::launch::async, [] {
|
| 396 |
-
string_to_spv("
|
| 397 |
}));
|
| 398 |
|
| 399 |
tasks.push_back(std::async(std::launch::async, [] {
|
|
|
|
| 369 |
}));
|
| 370 |
|
| 371 |
tasks.push_back(std::async(std::launch::async, [] {
|
| 372 |
+
string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
| 373 |
}));
|
| 374 |
|
| 375 |
tasks.push_back(std::async(std::launch::async, [] {
|
| 376 |
+
string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
|
| 377 |
}));
|
| 378 |
|
| 379 |
tasks.push_back(std::async(std::launch::async, [] {
|
| 380 |
+
string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
| 381 |
}));
|
| 382 |
|
| 383 |
tasks.push_back(std::async(std::launch::async, [] {
|
| 384 |
+
string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
| 385 |
}));
|
| 386 |
|
| 387 |
tasks.push_back(std::async(std::launch::async, [] {
|
| 388 |
+
string_to_spv("repeat_f32", "repeat.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
| 389 |
}));
|
| 390 |
|
| 391 |
tasks.push_back(std::async(std::launch::async, [] {
|
| 392 |
+
string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
| 393 |
}));
|
| 394 |
|
| 395 |
tasks.push_back(std::async(std::launch::async, [] {
|
| 396 |
+
string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
| 397 |
}));
|
| 398 |
|
| 399 |
tasks.push_back(std::async(std::launch::async, [] {
|