R0CKSTAR committed
Commit 12bb60d · 1 Parent(s): 22dfdf6

musa: fix all warnings, re-enable `-DLLAMA_FATAL_WARNINGS=ON` in ci and update doc (llama/12611)


* musa: fix all warnings

Signed-off-by: Xiaodong Ye <[email protected]>

* musa: enable -DLLAMA_FATAL_WARNINGS=ON in run.sh

Signed-off-by: Xiaodong Ye <[email protected]>

* musa: update ci doc (install ccache)

Signed-off-by: Xiaodong Ye <[email protected]>

* fix Windows build issue

Signed-off-by: Xiaodong Ye <[email protected]>

* Address review comments

Signed-off-by: Xiaodong Ye <[email protected]>

* Address review comments

Signed-off-by: Xiaodong Ye <[email protected]>

---------

Signed-off-by: Xiaodong Ye <[email protected]>
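Note: most of the hunks below silence unused-parameter warnings on stub code paths (the NO_DEVICE_CODE fallbacks) by marking arguments with GGML_UNUSED. A minimal sketch of the pattern, assuming the usual (void)-cast definition from ggml.h:

#define GGML_UNUSED(x) (void)(x)   // assumed stand-in for the ggml.h macro

static int stub_kernel(int ne00, int ne01) {
    GGML_UNUSED(ne00); GGML_UNUSED(ne01); // parameters only needed on the real code path
    return 0;                             // fallback does nothing
}

int main() { return stub_kernel(1, 2); }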

ggml/src/ggml-common.h CHANGED
@@ -158,6 +158,12 @@ typedef sycl::half2 ggml_half2;
 
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
+#ifdef _MSC_VER
+#define GGML_EXTENSION
+#else // _MSC_VER
+#define GGML_EXTENSION __extension__
+#endif // _MSC_VER
+
 #define QK4_0 32
 typedef struct {
     ggml_half d; // delta
@@ -167,7 +173,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_half) + QK4_0 / 2, "wrong q4_0 b
 
 #define QK4_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
@@ -188,7 +194,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_half) + sizeof(uint32_t) + QK5_0
 
 #define QK5_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
@@ -209,7 +215,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + QK8_0, "wrong q8_0 block
 
 #define QK8_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d; // delta
             ggml_half s; // d * sum(qs[i])
@@ -250,7 +256,7 @@ static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4];      // quants
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
@@ -277,7 +283,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
@@ -294,7 +300,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2,
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
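Note: GGML_EXTENSION maps to GCC/Clang's __extension__ keyword, which keeps -Wpedantic quiet about the anonymous struct nested inside these unions; MSVC has no equivalent keyword, so the macro expands to nothing there. A small self-contained sketch of the pattern (the ggml_half stand-in and block layout are illustrative, not the exact ggml types):

#include <cstdint>

#ifdef _MSC_VER
#define GGML_EXTENSION
#else
#define GGML_EXTENSION __extension__
#endif

typedef uint16_t ggml_half; // stand-in for ggml's half type

typedef struct {
    GGML_EXTENSION union {
        struct {
            ggml_half d; // delta
            ggml_half m; // min
        };
        uint32_t dm;     // the same two halves viewed as one 32-bit value
    };
    uint8_t qs[16];      // quants (illustrative size)
} block_example;

int main() {
    block_example b = {};
    b.d = 1;             // members of the anonymous union/struct stay directly accessible
    b.m = 2;
    return (b.d == 1 && b.m == 2) ? 0 : 1;
}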
ggml/src/ggml-cuda/common.cuh CHANGED
@@ -288,6 +288,10 @@ static __device__ void no_device_code(
     __trap();
 
     GGML_UNUSED(no_device_code); // suppress unused function warning
+
+#if defined(GGML_USE_MUSA)
+    __builtin_unreachable();
+#endif // defined(GGML_USE_MUSA)
 }
 
 #ifdef __CUDA_ARCH__
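Note: the MUSA-only __builtin_unreachable() added above tells the compiler that control never runs past the trap, which is the usual way to silence unreachable-path / missing-return diagnostics on GCC/Clang-style frontends. A generic host-side sketch of that use (plain C++, not ggml code):

enum class Op { Add, Mul };

static int apply(Op op, int a, int b) {
    switch (op) {
        case Op::Add: return a + b;
        case Op::Mul: return a * b;
    }
#if defined(__GNUC__)
    __builtin_unreachable(); // every enumerator is handled above, so this silences -Wreturn-type
#else
    return 0;                // conservative fallback for compilers without the builtin
#endif
}

int main() { return apply(Op::Add, 2, 3) == 5 ? 0 : 1; }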
ggml/src/ggml-cuda/concat.cu CHANGED
@@ -38,7 +38,7 @@ static __global__ void concat_f32_dim1(const float * x, const float * y, float *
         blockIdx.y * ne0 +
         blockIdx.z * ne0 * gridDim.y;
 
-    if (blockIdx.y < ne01) { // src0
+    if (blockIdx.y < (unsigned)ne01) { // src0
         int offset_src =
             nidx +
             blockIdx.y * ne0 +
@@ -64,7 +64,7 @@ static __global__ void concat_f32_dim2(const float * x, const float * y, float *
         blockIdx.y * ne0 +
         blockIdx.z * ne0 * gridDim.y;
 
-    if (blockIdx.z < ne02) { // src0
+    if (blockIdx.z < (unsigned)ne02) { // src0
         int offset_src =
             nidx +
             blockIdx.y * ne0 +
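Note: the (unsigned) casts here and in several files below address -Wsign-compare, since blockIdx/threadIdx fields are unsigned while the ne* tensor dimensions are signed ints. A tiny host-side sketch of the warning and the fix:

static bool in_bounds(unsigned block_idx_y, int ne01) {
    // return block_idx_y < ne01;        // -Wsign-compare: operands have different signedness
    return block_idx_y < (unsigned)ne01; // the cast pattern used in the diff
}

int main() { return in_bounds(3u, 8) ? 0 : 1; }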
ggml/src/ggml-cuda/conv-transpose-1d.cu CHANGED
@@ -34,6 +34,10 @@ static __global__ void conv_transpose_1d_kernel(
         }
     }
     dst[global_index] = accumulator;
+    GGML_UNUSED(p0); GGML_UNUSED(d0); GGML_UNUSED(src0_ne3);
+    GGML_UNUSED(src1_ne3); GGML_UNUSED(dst_ne3);
+    GGML_UNUSED(src1_ne1); GGML_UNUSED(dst_ne1);
+    GGML_UNUSED(src1_ne2); GGML_UNUSED(dst_ne2);
 }
 
 static void conv_transpose_1d_f32_f32_cuda(
@@ -75,8 +79,6 @@ void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor
     const int p0 = 0;//opts[3];
     const int d0 = 1;//opts[4];
 
-    const int64_t kernel_size = ggml_nelements(src0);
-    const int64_t input_size = ggml_nelements(src1);
     const int64_t output_size = ggml_nelements(dst);
 
     conv_transpose_1d_f32_f32_cuda(s0, p0, d0, output_size,
ggml/src/ggml-cuda/convert.cu CHANGED
@@ -577,7 +577,7 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
         return;
     }
 
-    const src_t * x = (src_t *) vx;
+    const src_t * x = (const src_t *) vx;
 
     y[i] = x[i];
 }
ggml/src/ggml-cuda/fattn-common.cuh CHANGED
@@ -315,14 +315,14 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(
 
     float vals[sizeof(int)] = {0.0f};
 #pragma unroll
-    for (int l = 0; l < sizeof(int); ++l) {
+    for (int l = 0; l < int(sizeof(int)); ++l) {
         vals[l] = scale * x[4*threadIdx.x + l];
     }
 
     float amax = fabsf(vals[0]);
     float sum = vals[0];
 #pragma unroll
-    for (int l = 1; l < sizeof(int); ++l) {
+    for (int l = 1; l < int(sizeof(int)); ++l) {
         amax = fmaxf(amax, fabsf(vals[l]));
         sum += vals[l];
     }
@@ -338,7 +338,7 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(
 
     if (d != 0.0f) {
 #pragma unroll
-        for (int l = 0; l < sizeof(int); ++l) {
+        for (int l = 0; l < int(sizeof(int)); ++l) {
             q8[l] = roundf(vals[l] / d);
         }
     }
@@ -638,7 +638,7 @@ static __global__ void flash_attn_combine_results(
     float VKQ_denominator = 0.0f;
     for (int l = 0; l < parallel_blocks; ++l) {
         const float diff = meta[l].x - kqmax;
-        const float KQ_max_scale = expf(diff);
+        float KQ_max_scale = expf(diff);
         const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
         *((uint32_t *) &KQ_max_scale) &= ftz_mask;
 
@@ -649,6 +649,7 @@ static __global__ void flash_attn_combine_results(
     dst[blockIdx.z*D + tid] = VKQ_numerator / VKQ_denominator;
 }
 
+[[noreturn]]
 static void on_no_fattn_vec_case(const int D) {
     if (D == 64) {
         fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
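Note: adding [[noreturn]] to the error helper lets callers that end with a call to it compile cleanly under -Werror, because the compiler no longer expects a return afterwards. A hedged sketch with a hypothetical helper (the exact abort mechanism of the real function is not shown in this diff):

#include <cstdio>
#include <cstdlib>

[[noreturn]] static void fail_unsupported(int head_size) {
    std::fprintf(stderr, "Unsupported head size %d.\n", head_size);
    std::abort();
}

static int pick_kernel(int head_size) {
    if (head_size == 64 || head_size == 128) {
        return head_size;
    }
    fail_unsupported(head_size); // no return statement needed after a [[noreturn]] call
}

int main() { return pick_kernel(64) == 64 ? 0 : 1; }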
ggml/src/ggml-cuda/fattn-mma-f16.cuh CHANGED
@@ -406,6 +406,15 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
 #endif // CP_ASYNC_AVAILABLE
 
 #else
+    GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2);
+    GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
+    GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_KV);
+    GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
+    GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
+    GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B);
+    GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
+    GGML_UNUSED(kb0);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -797,6 +806,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
         __syncthreads();
     }
 #else
+    GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2);
+    GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
+    GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_Q1);
+    GGML_UNUSED(stride_Q2); GGML_UNUSED(stride_KV); GGML_UNUSED(stride_mask);
+    GGML_UNUSED(jt); GGML_UNUSED(kb0_start); GGML_UNUSED(kb0_stop);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -931,6 +946,16 @@ static __global__ void flash_attn_ext_f16(
         (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
          ne01, ne02, stride_Q1, stride_Q2, stride_KV, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
+    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
+    GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
     NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
 }
@@ -985,38 +1010,38 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
     extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/4, 4); \
     extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/8, 8); \
 
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 8);
-
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 16);
-
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 32);
-
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 64);
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 8)
+
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 16)
+
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 32)
+
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 64)
 
 // Kernels with ncols == 128 are only 4% faster due to register pressure.
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 128); // Needs too much shared memory.
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 128) // Needs too much shared memory.
ggml/src/ggml-cuda/fattn-tile-f16.cu CHANGED
@@ -282,7 +282,19 @@ static __global__ void flash_attn_tile_ext_f16(
         }
     }
 #else
-    NO_DEVICE_CODE;
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }
 
ggml/src/ggml-cuda/fattn-tile-f32.cu CHANGED
@@ -281,6 +281,18 @@ static __global__ void flash_attn_tile_ext_f32(
         }
     }
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
     NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }
ggml/src/ggml-cuda/fattn-vec-f16.cuh CHANGED
@@ -292,7 +292,19 @@ static __global__ void flash_attn_vec_ext_f16(
         dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
     }
 #else
-    NO_DEVICE_CODE;
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }
 
ggml/src/ggml-cuda/fattn-vec-f32.cuh CHANGED
@@ -277,6 +277,16 @@ static __global__ void flash_attn_vec_ext_f32(
         dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
     }
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
+    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
+    GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
     NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }
ggml/src/ggml-cuda/fattn-wmma-f16.cu CHANGED
@@ -430,7 +430,17 @@ static __global__ void flash_attn_ext_f16(
         dst_meta[((ic0 + j_VKQ)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = dst_meta_val;
     }
 #else
-    NO_DEVICE_CODE;
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
+    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
+    GGML_UNUSED(ne31); GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
+    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
+    GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
 }
 
ggml/src/ggml-cuda/mma.cuh CHANGED
@@ -26,6 +26,7 @@ static __device__ __forceinline__ int ggml_cuda_movmatrix(const int x) {
     asm("movmatrix.sync.aligned.m8n8.trans.b16 %0, %1;"
         : "=r"(ret) : "r"(x));
 #else
+    GGML_UNUSED(x);
     NO_DEVICE_CODE;
 #endif // defined(NEW_MMA_AVAILABLE)
     return ret;
@@ -178,6 +179,7 @@ namespace ggml_cuda_mma {
                 : "l"(xs));
 #else
             load_generic(xs0, stride);
+            GGML_UNUSED(t);
 #endif // NEW_MMA_AVAILABLE
         }
 
ggml/src/ggml-cuda/mmq.cuh CHANGED
@@ -945,7 +945,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
         }
     }
 #else
-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
+    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -1024,7 +1024,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
     }
 
 #pragma unroll
-    for (int k01 = 0; k01 < WARP_SIZE; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
+    for (int k01 = 0; k01 < WARP_SIZE/2; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
        const int k0 = k00 + k01;
 
 #pragma unroll
@@ -1035,19 +1035,34 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
             for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
                 const int i = i0 + threadIdx.x;
 
-                if (k01 < WARP_SIZE/2) {
-                    constexpr int ns = 2;
-                    sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
-                        &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
-                        &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
-                        &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
-                } else {
-                    constexpr int ns = 1;
-                    sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
-                        &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
-                        &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
-                        &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
-                }
+                constexpr int ns = 2;
+                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
+                    &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
+                    &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
+                    &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
+            }
+        }
+    }
+
+    // Some compilers fail to unroll the loop over k01 if there is a conditional statement for ns in the inner loop.
+    // As a workaround 2 separate loops are used instead.
+#pragma unroll
+    for (int k01 = WARP_SIZE/2; k01 < WARP_SIZE; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
+        const int k0 = k00 + k01;
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+#pragma unroll
+            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                constexpr int ns = 1;
+                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
+                    &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
+                    &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
+                    &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
             }
         }
     }
@@ -1176,7 +1191,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
         }
     }
 #else
-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
+    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -1253,7 +1268,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
         const float d = bxi->d;
 
 #pragma unroll
-        for (int l = 0; l < sizeof(int); ++l) {
+        for (int l = 0; l < int(sizeof(int)); ++l) {
             x_df[i*MMQ_MMA_TILE_X_K_Q3_K + sizeof(int)*(threadIdx.x % (WARP_SIZE/8)) + l] = d*sc8[l];
         }
 #else
@@ -1376,7 +1391,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
         const half2 dm = bxi->dm * make_half2(1.0f, -1.0f);
 
 #pragma unroll
-        for (int l = 0; l < sizeof(int); ++l) {
+        for (int l = 0; l < int(sizeof(int)); ++l) {
             x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]);
         }
     }
@@ -1517,7 +1532,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
         const half2 dm = bxi->dm * make_half2(1.0f, -1.0f);
 
 #pragma unroll
-        for (int l = 0; l < sizeof(int); ++l) {
+        for (int l = 0; l < int(sizeof(int)); ++l) {
            x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]);
         }
     }
@@ -2570,6 +2585,8 @@ static __device__ void mul_mat_q_process_tile(
     } else {
         write_back(sum, dst + jt*mmq_x*ne0 + it*mmq_y, ne0, tile_x_max_i, tile_y_max_j);
     }
+
+    GGML_UNUSED(ne00); GGML_UNUSED(ne10);
 }
 
 
@@ -2695,7 +2712,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
         const int it = (kbc_stop - jt*(blocks_per_ne00*nty)) / blocks_per_ne00;
 
         // Skip fixup tile if it's unrelated to the output tile assigned to this CUDA block:
-        if (it != blockIdx.x || jt != blockIdx.y) {
+        if ((unsigned)it != blockIdx.x || (unsigned)jt != blockIdx.y) {
            continue;
         }
 
@@ -2825,7 +2842,6 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
 template <ggml_type type>
 void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
     const int id = ggml_cuda_get_device();
-    const int nsm = ggml_cuda_info().devices[id].nsm;
     const int cc = ggml_cuda_info().devices[id].cc;
     const int smpbo = ggml_cuda_info().devices[id].smpbo;
 
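Note: the q2_K dp4a change follows the comment added in the hunk above: some compilers refuse to fully unroll a loop whose body selects the template argument ns with a branch, so the k01 range is split into two loops that each use a constant. A toy illustration of the same transformation (not the ggml kernel):

template <int ns>
static float partial_dot(const float * v, int k) {
    return ns * v[k]; // stand-in for vec_dot_q2_K_q8_1_impl_mmq<ns>
}

static float sum_all(const float * v) {
    float sum = 0.0f;
    for (int k = 0; k < 16; ++k) {  // first half: ns == 2 for every iteration
        sum += partial_dot<2>(v, k);
    }
    for (int k = 16; k < 32; ++k) { // second half: ns == 1, no per-iteration branch
        sum += partial_dot<1>(v, k);
    }
    return sum;
}

int main() {
    float v[32] = {};
    return sum_all(v) == 0.0f ? 0 : 1;
}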
ggml/src/ggml-cuda/mmv.cu CHANGED
@@ -29,7 +29,7 @@ static __global__ void mul_mat_vec(
         __syncthreads();
     }
 
-    float sumf;
+    float sumf = 0.0f;
 
     if constexpr (std::is_same<T, half>::value) {
         const half2 * x2 = (const half2 *) x;
ggml/src/ggml-cuda/mmvq.cu CHANGED
@@ -151,7 +151,7 @@ static __global__ void mul_mat_vec_q(
     constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi;
 
     // partial sum for each thread
-    float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
+    float tmp[ncols_y][rows_per_cuda_block] = {{0.0f}};
 
     const block_q8_1 * y = (const block_q8_1 *) vy;
 
@@ -197,10 +197,12 @@ static __global__ void mul_mat_vec_q(
             tmp[j][i] = warp_reduce_sum<warp_size>(tmp[j][i]);
         }
 
-        if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
+        if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < (unsigned)nrows_dst)) {
             dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
         }
     }
+
+    GGML_UNUSED(nrows_x);
 }
 
 static std::pair<dim3, dim3> calc_launch_params(const int ncols_y, const int nrows_x, const int warp_size, const mmvq_parameter_table_id table_id) {
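Note: the {{0.0f}} change addresses -Wmissing-braces, which some compilers emit when a 2-D array is zero-initialized with a single brace level. A minimal sketch of the warning and the fix:

int main() {
    // float tmp_bad[2][4] = {0.0f}; // -Wmissing-braces: suggest braces around subobject initialization
    float tmp[2][4] = {{0.0f}};      // explicit inner braces; every element is zero-initialized
    return tmp[1][3] == 0.0f ? 0 : 1;
}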
ggml/src/ggml-cuda/pad.cu CHANGED
@@ -14,7 +14,7 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
         nidx +
         blockIdx.y * ne0 +
         blockIdx.z * ne0 * gridDim.y;
-    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
+    if (nidx < ne00 && blockIdx.y < (unsigned)ne01 && blockIdx.z < (unsigned)(ne02*ne03)) {
         int offset_src =
             nidx +
             blockIdx.y * ne00 +
ggml/src/ggml-cuda/upscale.cu CHANGED
@@ -19,7 +19,7 @@ static __global__ void upscale_f32(const float * x, float * dst,
     int i02 = i12 / sf2;
     int i03 = i13 / sf3;
 
-    dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
+    dst[index] = *( (const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00) );
 }
 
 static void upscale_f32_cuda(const float * x, float * dst,