R0CKSTAR committed
commit 12bb60d · 1 parent: 22dfdf6
musa: fix all warnings, re-enable `-DLLAMA_FATAL_WARNINGS=ON` in ci and update doc (llama/12611)
* musa: fix all warnings
Signed-off-by: Xiaodong Ye <[email protected]>
* musa: enable -DLLAMA_FATAL_WARNINGS=ON in run.sh
Signed-off-by: Xiaodong Ye <[email protected]>
* musa: update ci doc (install ccache)
Signed-off-by: Xiaodong Ye <[email protected]>
* fix Windows build issue
Signed-off-by: Xiaodong Ye <[email protected]>
* Address review comments
Signed-off-by: Xiaodong Ye <[email protected]>
* Address review comments
Signed-off-by: Xiaodong Ye <[email protected]>
---------
Signed-off-by: Xiaodong Ye <[email protected]>
- ggml/src/ggml-common.h +12 -6
- ggml/src/ggml-cuda/common.cuh +4 -0
- ggml/src/ggml-cuda/concat.cu +2 -2
- ggml/src/ggml-cuda/conv-transpose-1d.cu +4 -2
- ggml/src/ggml-cuda/convert.cu +1 -1
- ggml/src/ggml-cuda/fattn-common.cuh +5 -4
- ggml/src/ggml-cuda/fattn-mma-f16.cuh +58 -33
- ggml/src/ggml-cuda/fattn-tile-f16.cu +13 -1
- ggml/src/ggml-cuda/fattn-tile-f32.cu +12 -0
- ggml/src/ggml-cuda/fattn-vec-f16.cuh +13 -1
- ggml/src/ggml-cuda/fattn-vec-f32.cuh +10 -0
- ggml/src/ggml-cuda/fattn-wmma-f16.cu +11 -1
- ggml/src/ggml-cuda/mma.cuh +2 -0
- ggml/src/ggml-cuda/mmq.cuh +38 -22
- ggml/src/ggml-cuda/mmv.cu +1 -1
- ggml/src/ggml-cuda/mmvq.cu +4 -2
- ggml/src/ggml-cuda/pad.cu +1 -1
- ggml/src/ggml-cuda/upscale.cu +1 -1
ggml/src/ggml-common.h
CHANGED
@@ -158,6 +158,12 @@ typedef sycl::half2 ggml_half2;
 
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
+#ifdef _MSC_VER
+#define GGML_EXTENSION
+#else // _MSC_VER
+#define GGML_EXTENSION __extension__
+#endif // _MSC_VER
+
 #define QK4_0 32
 typedef struct {
     ggml_half d; // delta
@@ -167,7 +173,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_half) + QK4_0 / 2, "wrong q4_0 b
 
 #define QK4_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
@@ -188,7 +194,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_half) + sizeof(uint32_t) + QK5_0
 
 #define QK5_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
@@ -209,7 +215,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + QK8_0, "wrong q8_0 block
 
 #define QK8_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d; // delta
             ggml_half s; // d * sum(qs[i])
@@ -250,7 +256,7 @@ static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4];      // quants
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
@@ -277,7 +283,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
@@ -294,7 +300,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2,
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
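Note (context, not part of the commit itself): the anonymous struct nested inside these unions is a compiler extension in C++, so clang-based device compilers such as MUSA's flag it under -Wpedantic, and -DLLAMA_FATAL_WARNINGS=ON turns that into an error. Prefixing the declaration with __extension__ suppresses the pedantic diagnostic; MSVC has no __extension__ keyword, so GGML_EXTENSION expands to nothing there. A minimal sketch of the pattern (the field names below are illustrative, not the real ggml block layouts):

// Minimal sketch of the GGML_EXTENSION pattern (illustrative field names).
#ifdef _MSC_VER
#define GGML_EXTENSION                 // MSVC: no __extension__ keyword needed
#else
#define GGML_EXTENSION __extension__   // GCC/Clang/MUSA: silence pedantic warnings for this declaration
#endif

typedef struct {
    GGML_EXTENSION union {
        struct {
            float d; // scale
            float m; // min
        };
        float dm[2]; // the same two values viewed as an array
    };
    unsigned char qs[16]; // quantized values
} example_block;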
ggml/src/ggml-cuda/common.cuh
CHANGED
@@ -288,6 +288,10 @@ static __device__ void no_device_code(
     __trap();
 
     GGML_UNUSED(no_device_code); // suppress unused function warning
+
+#if defined(GGML_USE_MUSA)
+    __builtin_unreachable();
+#endif // defined(GGML_USE_MUSA)
 }
 
 #ifdef __CUDA_ARCH__
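A guess at the motivation here (the commit message does not spell it out): __trap() aborts device execution, but the MUSA toolchain apparently does not treat it as noreturn, so code paths that end in no_device_code() can look to the compiler like they fall off the end of the function. The __builtin_unreachable() behind GGML_USE_MUSA makes the "never returns" property explicit. Sketch:

// Illustrative only; assumes MUSA's __trap() is not declared noreturn.
static __device__ void unsupported_path() {
    __trap();                  // aborts the kernel at runtime
#if defined(GGML_USE_MUSA)
    __builtin_unreachable();   // compiler-visible "control never gets here"
#endif
}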
ggml/src/ggml-cuda/concat.cu
CHANGED
@@ -38,7 +38,7 @@ static __global__ void concat_f32_dim1(const float * x, const float * y, float *
         blockIdx.y * ne0 +
         blockIdx.z * ne0 * gridDim.y;
 
-    if (blockIdx.y < ne01) { // src0
+    if (blockIdx.y < (unsigned)ne01) { // src0
         int offset_src =
             nidx +
             blockIdx.y * ne0 +
@@ -64,7 +64,7 @@ static __global__ void concat_f32_dim2(const float * x, const float * y, float *
         blockIdx.y * ne0 +
         blockIdx.z * ne0 * gridDim.y;
 
-    if (blockIdx.z < ne02) { // src0
+    if (blockIdx.z < (unsigned)ne02) { // src0
         int offset_src =
             nidx +
             blockIdx.y * ne0 +
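Note on the (unsigned) casts (the same fix recurs in pad.cu, mmq.cuh and mmvq.cu below): blockIdx.* and gridDim.* are unsigned, so comparing them against signed int tensor extents raises -Wsign-compare, which becomes an error once warnings are fatal. Casting the signed side to unsigned is safe here because tensor extents are never negative. A tiny illustration:

// Illustrative kernel; ne01 stands in for a non-negative tensor extent.
__global__ void guard_example(const float * src, float * dst, const int ne01) {
    if (blockIdx.y < (unsigned)ne01) { // cast avoids -Wsign-compare without changing behavior
        dst[blockIdx.y] = src[blockIdx.y];
    }
}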
ggml/src/ggml-cuda/conv-transpose-1d.cu
CHANGED
@@ -34,6 +34,10 @@ static __global__ void conv_transpose_1d_kernel(
         }
     }
     dst[global_index] = accumulator;
+    GGML_UNUSED(p0); GGML_UNUSED(d0); GGML_UNUSED(src0_ne3);
+    GGML_UNUSED(src1_ne3); GGML_UNUSED(dst_ne3);
+    GGML_UNUSED(src1_ne1); GGML_UNUSED(dst_ne1);
+    GGML_UNUSED(src1_ne2); GGML_UNUSED(dst_ne2);
 }
 
 static void conv_transpose_1d_f32_f32_cuda(
@@ -75,8 +79,6 @@ void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor
     const int p0 = 0;//opts[3];
     const int d0 = 1;//opts[4];
 
-    const int64_t kernel_size = ggml_nelements(src0);
-    const int64_t input_size  = ggml_nelements(src1);
     const int64_t output_size = ggml_nelements(dst);
 
     conv_transpose_1d_f32_f32_cuda(s0, p0, d0, output_size,
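Note on the GGML_UNUSED(...) additions here and in the fattn-* files below: several kernels keep parameters for a uniform signature or only read them on some compile paths, and each such parameter now trips an unused-parameter warning. GGML_UNUSED evaluates the name as a discarded expression, which counts as a use but generates no code. A sketch, assuming the macro is essentially a void cast:

// Sketch of the unused-parameter suppression pattern.
#define GGML_UNUSED(x) (void)(x)   // assumption: mirrors ggml's actual definition

__global__ void copy_kernel(const float * src, float * dst, const int n, const int stride_unused) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) {
        dst[i] = src[i];
    }
    GGML_UNUSED(stride_unused); // parameter kept for signature compatibility; no codegen impact
}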
ggml/src/ggml-cuda/convert.cu
CHANGED
@@ -577,7 +577,7 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
         return;
     }
 
-    const src_t * x = (src_t *) vx;
+    const src_t * x = (const src_t *) vx;
 
     y[i] = x[i];
 }
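Note on the cast above (the upscale.cu hunk at the bottom is the same idea): vx is a const void *, so converting it to a non-const src_t * silently drops the const qualifier and draws a cast-qual style warning; casting to const src_t * keeps the pointer const-correct. Minimal illustration:

// Const-correct reinterpretation of an opaque device pointer (illustrative).
static __device__ float load_as_float(const void * vx, const int i) {
    const float * x = (const float *) vx; // keep the qualifier instead of casting it away
    return x[i];
}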
ggml/src/ggml-cuda/fattn-common.cuh
CHANGED
@@ -315,14 +315,14 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(
 
     float vals[sizeof(int)] = {0.0f};
 #pragma unroll
-    for (int l = 0; l < sizeof(int); ++l) {
+    for (int l = 0; l < int(sizeof(int)); ++l) {
         vals[l] = scale * x[4*threadIdx.x + l];
     }
 
     float amax = fabsf(vals[0]);
     float sum  = vals[0];
 #pragma unroll
-    for (int l = 1; l < sizeof(int); ++l) {
+    for (int l = 1; l < int(sizeof(int)); ++l) {
         amax = fmaxf(amax, fabsf(vals[l]));
         sum += vals[l];
     }
@@ -338,7 +338,7 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(
 
     if (d != 0.0f) {
 #pragma unroll
-        for (int l = 0; l < sizeof(int); ++l) {
+        for (int l = 0; l < int(sizeof(int)); ++l) {
             q8[l] = roundf(vals[l] / d);
         }
     }
@@ -638,7 +638,7 @@ static __global__ void flash_attn_combine_results(
     float VKQ_denominator = 0.0f;
     for (int l = 0; l < parallel_blocks; ++l) {
         const float diff = meta[l].x - kqmax;
-        const float KQ_max_scale = expf(diff);
+        float KQ_max_scale = expf(diff);
         const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
         *((uint32_t *) &KQ_max_scale) &= ftz_mask;
 
@@ -649,6 +649,7 @@ static __global__ void flash_attn_combine_results(
     dst[blockIdx.z*D + tid] = VKQ_numerator / VKQ_denominator;
 }
 
+[[noreturn]]
 static void on_no_fattn_vec_case(const int D) {
     if (D == 64) {
         fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
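Two details in this file are worth spelling out. The loop bounds become int(sizeof(int)) because sizeof yields an unsigned size_t, and comparing it against the signed int counter l raises -Wsign-compare. And on_no_fattn_vec_case gains [[noreturn]]: it never returns to its caller, so marking it lets the compiler stop warning about paths that end in a call to it. A sketch of the [[noreturn]] effect (body and names below are illustrative):

#include <cstdio>
#include <cstdlib>

[[noreturn]] static void fail_unsupported(const int D) {
    fprintf(stderr, "unsupported head size %d\n", D);
    abort(); // a [[noreturn]] function must never return normally
}

static int pick_head_size(const int D) {
    if (D == 64 || D == 128) {
        return D;
    }
    fail_unsupported(D); // compiler knows control never comes back, so no missing-return warning
}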
ggml/src/ggml-cuda/fattn-mma-f16.cuh
CHANGED
@@ -406,6 +406,15 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
 #endif // CP_ASYNC_AVAILABLE
 
 #else
+    GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2);
+    GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
+    GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_KV);
+    GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
+    GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
+    GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B);
+    GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
+    GGML_UNUSED(kb0);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -797,6 +806,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
         __syncthreads();
     }
 #else
+    GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2);
+    GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
+    GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_Q1);
+    GGML_UNUSED(stride_Q2); GGML_UNUSED(stride_KV); GGML_UNUSED(stride_mask);
+    GGML_UNUSED(jt); GGML_UNUSED(kb0_start); GGML_UNUSED(kb0_stop);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -931,6 +946,16 @@ static __global__ void flash_attn_ext_f16(
         (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
          ne01, ne02, stride_Q1, stride_Q2, stride_KV, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
+    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
+    GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
     NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
 }
@@ -985,38 +1010,38 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
     extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/4, 4); \
     extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/8, 8); \
 
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 8)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 8)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 8)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 8)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 8)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 8)
-
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 16)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 16)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 16)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 16)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 16)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 16)
-
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 32)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 32)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 32)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 32)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 32)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 32)
-
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 64)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 64)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 64)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 64)
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 8)
+
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 16)
+
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 32)
+
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 64)
 
 // Kernels with ncols == 128 are only 4% faster due to register pressure.
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 128)
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 128)
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 128)
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 128)
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128)
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 128) // Needs too much shared memory.
ggml/src/ggml-cuda/fattn-tile-f16.cu
CHANGED
@@ -282,7 +282,19 @@ static __global__ void flash_attn_tile_ext_f16(
         }
     }
 #else
-    NO_DEVICE_CODE;
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }
 
ggml/src/ggml-cuda/fattn-tile-f32.cu
CHANGED
@@ -281,6 +281,18 @@ static __global__ void flash_attn_tile_ext_f32(
         }
     }
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
     NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }
ggml/src/ggml-cuda/fattn-vec-f16.cuh
CHANGED
@@ -292,7 +292,19 @@ static __global__ void flash_attn_vec_ext_f16(
         dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
     }
 #else
-    NO_DEVICE_CODE;
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }
 
ggml/src/ggml-cuda/fattn-vec-f32.cuh
CHANGED
@@ -277,6 +277,16 @@ static __global__ void flash_attn_vec_ext_f32(
         dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
     }
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
+    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
+    GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
     NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }
ggml/src/ggml-cuda/fattn-wmma-f16.cu
CHANGED
@@ -430,7 +430,17 @@ static __global__ void flash_attn_ext_f16(
         dst_meta[((ic0 + j_VKQ)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = dst_meta_val;
     }
 #else
-    NO_DEVICE_CODE;
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
+    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
+    GGML_UNUSED(ne31); GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
+    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
+    GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
 }
 
ggml/src/ggml-cuda/mma.cuh
CHANGED
@@ -26,6 +26,7 @@ static __device__ __forceinline__ int ggml_cuda_movmatrix(const int x) {
     asm("movmatrix.sync.aligned.m8n8.trans.b16 %0, %1;"
         : "=r"(ret) : "r"(x));
 #else
+    GGML_UNUSED(x);
     NO_DEVICE_CODE;
 #endif // defined(NEW_MMA_AVAILABLE)
     return ret;
@@ -178,6 +179,7 @@ namespace ggml_cuda_mma {
             : "l"(xs));
 #else
         load_generic(xs0, stride);
+        GGML_UNUSED(t);
 #endif // NEW_MMA_AVAILABLE
     }
 
ggml/src/ggml-cuda/mmq.cuh
CHANGED
@@ -945,7 +945,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
         }
     }
 #else
-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
+    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -1024,7 +1024,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
     }
 
 #pragma unroll
-    for (int k01 = 0; k01 < WARP_SIZE; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
+    for (int k01 = 0; k01 < WARP_SIZE/2; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
         const int k0 = k00 + k01;
 
 #pragma unroll
@@ -1035,19 +1035,34 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
             for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
                 const int i = i0 + threadIdx.x;
 
-                if (k01 < WARP_SIZE/2) {
-                    constexpr int ns = 2;
-                    sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
-                        &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
-                        &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
-                        &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
-                } else {
-                    constexpr int ns = 1;
-                    sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
-                        &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
-                        &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
-                        &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
-                }
+                constexpr int ns = 2;
+                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
+                    &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
+                    &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
+                    &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
+            }
+        }
+    }
+
+    // Some compilers fail to unroll the loop over k01 if there is a conditional statement for ns in the inner loop.
+    // As a workaround 2 separate loops are used instead.
+#pragma unroll
+    for (int k01 = WARP_SIZE/2; k01 < WARP_SIZE; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
+        const int k0 = k00 + k01;
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+#pragma unroll
+            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                constexpr int ns = 1;
+                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
+                    &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
+                    &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
+                    &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
             }
         }
     }
@@ -1176,7 +1191,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
         }
     }
 #else
-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
+    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -1253,7 +1268,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
         const float d = bxi->d;
 
 #pragma unroll
-        for (int l = 0; l < sizeof(int); ++l) {
+        for (int l = 0; l < int(sizeof(int)); ++l) {
             x_df[i*MMQ_MMA_TILE_X_K_Q3_K + sizeof(int)*(threadIdx.x % (WARP_SIZE/8)) + l] = d*sc8[l];
         }
 #else
@@ -1376,7 +1391,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
         const half2 dm = bxi->dm * make_half2(1.0f, -1.0f);
 
 #pragma unroll
-        for (int l = 0; l < sizeof(int); ++l) {
+        for (int l = 0; l < int(sizeof(int)); ++l) {
             x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]);
         }
     }
@@ -1517,7 +1532,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
         const half2 dm = bxi->dm * make_half2(1.0f, -1.0f);
 
 #pragma unroll
-        for (int l = 0; l < sizeof(int); ++l) {
+        for (int l = 0; l < int(sizeof(int)); ++l) {
             x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]);
         }
     }
@@ -1810,7 +1825,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
         }
     }
 #else
-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
+    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -2570,6 +2585,8 @@ static __device__ void mul_mat_q_process_tile(
     } else {
         write_back(sum, dst + jt*mmq_x*ne0 + it*mmq_y, ne0, tile_x_max_i, tile_y_max_j);
     }
+
+    GGML_UNUSED(ne00); GGML_UNUSED(ne10);
 }
 
 
@@ -2695,7 +2712,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
         const int it = (kbc_stop - jt*(blocks_per_ne00*nty)) / blocks_per_ne00;
 
         // Skip fixup tile if it's unrelated to the output tile assigned to this CUDA block:
-        if (it != blockIdx.x || jt != blockIdx.y) {
+        if ((unsigned)it != blockIdx.x || (unsigned)jt != blockIdx.y) {
             continue;
         }
 
@@ -2825,7 +2842,6 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
 template <ggml_type type>
 void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
     const int id = ggml_cuda_get_device();
-    const int nsm = ggml_cuda_info().devices[id].nsm;
     const int cc = ggml_cuda_info().devices[id].cc;
     const int smpbo = ggml_cuda_info().devices[id].smpbo;
 
ggml/src/ggml-cuda/mmv.cu
CHANGED
@@ -29,7 +29,7 @@ static __global__ void mul_mat_vec(
         __syncthreads();
     }
 
-    float sumf;
+    float sumf = 0.0f;
 
     if constexpr (std::is_same<T, half>::value) {
         const half2 * x2 = (const half2 *) x;
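Note on float sumf = 0.0f;: sumf is assigned inside the if constexpr branches, but the compiler cannot always prove that every instantiation writes it before it is read, so it emits a maybe-uninitialized warning; the zero initializer is the cheap, warning-free fix. A sketch of the shape of the fix:

// Illustrative only: an accumulator initialized up front so no branch can leave it undefined.
static __device__ float reduce_example(const float * x, const int n, const bool scaled) {
    float sumf = 0.0f;
    if (scaled) {
        for (int i = 0; i < n; ++i) { sumf += 2.0f*x[i]; }
    } else {
        for (int i = 0; i < n; ++i) { sumf += x[i]; }
    }
    return sumf;
}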
ggml/src/ggml-cuda/mmvq.cu
CHANGED
@@ -151,7 +151,7 @@ static __global__ void mul_mat_vec_q(
     constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi;
 
     // partial sum for each thread
-    float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
+    float tmp[ncols_y][rows_per_cuda_block] = {{0.0f}};
 
     const block_q8_1 * y = (const block_q8_1 *) vy;
 
@@ -197,10 +197,12 @@ static __global__ void mul_mat_vec_q(
             tmp[j][i] = warp_reduce_sum<warp_size>(tmp[j][i]);
         }
 
-        if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
+        if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < (unsigned)nrows_dst)) {
             dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
         }
     }
+
+    GGML_UNUSED(nrows_x);
 }
 
 static std::pair<dim3, dim3> calc_launch_params(const int ncols_y, const int nrows_x, const int warp_size, const mmvq_parameter_table_id table_id) {
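Note on {{0.0f}}: tmp is a two-dimensional array, so its initializer needs nested braces; the flat {0.0f} form is accepted but draws a missing-braces warning from clang-style compilers. Either way the unlisted elements are zero-initialized. Sketch:

// Nested-brace initialization of a 2D accumulator (sizes are illustrative).
constexpr int NCOLS = 4;
constexpr int NROWS = 2;

static __device__ void accumulate_example(float out[NCOLS][NROWS]) {
    float tmp[NCOLS][NROWS] = {{0.0f}}; // braces per dimension; remaining elements become 0
    for (int j = 0; j < NCOLS; ++j) {
        for (int i = 0; i < NROWS; ++i) {
            out[j][i] = tmp[j][i];
        }
    }
}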
ggml/src/ggml-cuda/pad.cu
CHANGED
@@ -14,7 +14,7 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
         nidx +
         blockIdx.y * ne0 +
         blockIdx.z * ne0 * gridDim.y;
-    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
+    if (nidx < ne00 && blockIdx.y < (unsigned)ne01 && blockIdx.z < (unsigned)(ne02*ne03)) {
         int offset_src =
             nidx +
             blockIdx.y * ne00 +
ggml/src/ggml-cuda/upscale.cu
CHANGED
@@ -19,7 +19,7 @@ static __global__ void upscale_f32(const float * x, float * dst,
     int i02 = i12 / sf2;
     int i03 = i13 / sf3;
 
-    dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
+    dst[index] = *( (const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00) );
 }
 
 static void upscale_f32_cuda(const float * x, float * dst,