R0CKSTAR committed
Commit 090ad80 · 1 Parent(s): 77d7613

musa: fix compilation warnings in mp_22/31 (llama/12780)


Signed-off-by: Xiaodong Ye <[email protected]>

ggml/src/ggml-cuda/cpy.cu CHANGED
@@ -360,6 +360,9 @@ void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_des
     // copy destination pointers to GPU
     CUDA_CHECK(cudaMemcpyAsync(cuda_graph->dest_ptrs_d, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice, stream));
     cuda_graph->graph_cpynode_index = 0; // reset index
+#else
+    GGML_UNUSED(cuda_graph); GGML_UNUSED(host_dest_ptrs);
+    GGML_UNUSED(host_dest_ptrs_size); GGML_UNUSED(stream);
 #endif
 }
 
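For context: when CUDA graph support is compiled out (the #else path of the surrounding #if, not shown in this hunk), the function body is empty, so every parameter would otherwise trigger an unused-parameter warning; the added branch marks them as used. Below is a minimal standalone sketch of that pattern, assuming GGML_UNUSED expands to a plain (void) cast; the EXAMPLE_* names and copy_ptrs_example function are illustrative, not part of the commit.

    #include <cstddef>
    #include <cstdio>

    // Illustrative stand-in for GGML_UNUSED, assuming it expands to a (void) cast.
    #define EXAMPLE_UNUSED(x) (void)(x)

    // Hypothetical function mirroring the shape of the patched code: the real
    // work only exists when EXAMPLE_USE_GRAPHS is defined.
    static void copy_ptrs_example(char ** host_ptrs, size_t n) {
    #ifdef EXAMPLE_USE_GRAPHS
        std::printf("would copy %zu pointers starting at %p\n", n, (void *) host_ptrs);
    #else
        // Without these casts the fallback body never touches its parameters,
        // which is what produces the unused-parameter diagnostics.
        EXAMPLE_UNUSED(host_ptrs);
        EXAMPLE_UNUSED(n);
    #endif
    }

    int main() {
        copy_ptrs_example(nullptr, 0);
        return 0;
    }

Removing the two EXAMPLE_UNUSED lines and building with -Wextra (and EXAMPLE_USE_GRAPHS undefined) on a GCC/Clang-style compiler reproduces the class of warning this hunk silences.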
 
ggml/src/ggml-cuda/fattn-common.cuh CHANGED
@@ -62,7 +62,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
     T sum = 0.0f;
 
 #pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
+    for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
         const int k_KQ = k_KQ_0 + threadIdx.x;
 
         const int ib = k_KQ / QI8_1;
@@ -102,7 +102,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
     T sum = 0.0f;
 
 #pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
+    for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
         const int k_KQ = k_KQ_0 + threadIdx.x;
 
         const int ib = k_KQ / QI8_1;
@@ -146,7 +146,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
     T sum = 0.0f;
 
 #pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
+    for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
         const int k_KQ = k_KQ_0 + threadIdx.x;
 
         const int ib = k_KQ / QI8_1;
@@ -193,7 +193,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
     T sum = 0.0f;
 
 #pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
+    for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
         const int k_KQ = k_KQ_0 + threadIdx.x;
 
         const int ib = k_KQ / QI8_1;
@@ -244,7 +244,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
     T sum = 0.0f;
 
 #pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
+    for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
         const int k_KQ = k_KQ_0 + threadIdx.x;
 
         const int ib = k_KQ / QI8_0;
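The int(...) casts above address a second kind of warning: sizeof(int) has type size_t, so D/sizeof(int) is unsigned, and comparing the signed loop counter against it typically triggers -Wsign-compare under stricter settings. Casting the bound back to int keeps the comparison signed, which is safe here because D is a small positive compile-time constant. A minimal sketch, with D standing in for the head-size template parameter (illustrative value only):

    #include <cstdio>

    // Illustrative stand-in for the head-size template parameter used above.
    constexpr int D = 128;

    int main() {
        int iterations = 0;
        // "i < D/sizeof(int)" would compare int with size_t and typically warn
        // under -Wsign-compare; the int(...) cast restores a signed comparison
        // without changing the trip count for positive D.
        for (int i = 0; i < int(D/sizeof(int)); ++i) {
            ++iterations;
        }
        std::printf("iterations: %d\n", iterations); // 32 when int is 4 bytes
        return 0;
    }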
ggml/src/ggml-cuda/fattn-tile-f32.cu CHANGED
@@ -52,6 +52,18 @@ static __global__ void flash_attn_tile_ext_f32(
         return;
 #endif // FP16_MMA_AVAILABLE
     if (use_logit_softcap && !(D == 128 || D == 256)) {
+        GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+        GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+        GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+        GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+        GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+        GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+        GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+        GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+        GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+        GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+        GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+        GGML_UNUSED(ne2); GGML_UNUSED(ne3);
         NO_DEVICE_CODE;
         return;
     }
ggml/src/ggml-cuda/fattn-vec-f32.cuh CHANGED
@@ -45,6 +45,18 @@ static __global__ void flash_attn_vec_ext_f32(
 
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
+        GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+        GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+        GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+        GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+        GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+        GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+        GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+        GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+        GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+        GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+        GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+        GGML_UNUSED(ne2); GGML_UNUSED(ne3);
         NO_DEVICE_CODE;
         return;
     }
@@ -114,7 +126,7 @@ static __global__ void flash_attn_vec_ext_f32(
         // Set memory to zero if out of bounds:
         if (ncols > 2 && ic0 + j >= ne01) {
 #pragma unroll
-            for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
+            for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
                 const int i = i0 + threadIdx.x;
 
                 tmp_q_i32[i] = 0;
@@ -127,7 +139,7 @@ static __global__ void flash_attn_vec_ext_f32(
 
         const float * Q_f = (const float *) (Q + j*nb01);
 #pragma unroll
-        for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
+        for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
             quantize_q8_1_to_shared<float2>(Q_f + 4*i0, scale, tmp_q_i32, tmp_q_ds);
         }
     }
@@ -140,7 +152,7 @@ static __global__ void flash_attn_vec_ext_f32(
         float2 * tmp_q_ds = (float2 *) (tmp_q_i32 + D/sizeof(int));
 
 #pragma unroll
-        for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
+        for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
             const int i = i0 + threadIdx.x;
 
             Q_i32[j][i0/WARP_SIZE] = tmp_q_i32[i];