Commit 090ad80 by R0CKSTAR (1 parent: 77d7613)

musa: fix compilation warnings in mp_22/31 (llama/12780)

Signed-off-by: Xiaodong Ye <[email protected]>
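The commit silences two kinds of compiler warnings seen when building the MUSA backend for the mp_22/31 targets: unused-parameter warnings on code paths that the preprocessor compiles out (fixed with GGML_UNUSED), and signed/unsigned comparisons between an int loop counter and a size_t bound of the form D/sizeof(int) (fixed by casting the bound to int). Four files change, shown below.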
ggml/src/ggml-cuda/cpy.cu CHANGED

@@ -360,6 +360,9 @@ void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_des
     // copy destination pointers to GPU
     CUDA_CHECK(cudaMemcpyAsync(cuda_graph->dest_ptrs_d, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice, stream));
     cuda_graph->graph_cpynode_index = 0; // reset index
+#else
+    GGML_UNUSED(cuda_graph); GGML_UNUSED(host_dest_ptrs);
+    GGML_UNUSED(host_dest_ptrs_size); GGML_UNUSED(stream);
 #endif
 }
 
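For context (not part of the diff): GGML_UNUSED is ggml's wrapper for the standard cast-to-void idiom that marks a value as intentionally unreferenced. Below is a minimal standalone sketch of the pattern the hunk above applies; MY_UNUSED, USE_GRAPHS, and copy_dest_ptrs are hypothetical names for illustration, not ggml identifiers.

#include <cstdio>

#define MY_UNUSED(x) (void)(x)  // same idiom as ggml's GGML_UNUSED

static void copy_dest_ptrs(char ** host_ptrs, int n) {
#ifdef USE_GRAPHS
    // on this path the parameters are genuinely used
    for (int i = 0; i < n; ++i) {
        printf("%p\n", (void *) host_ptrs[i]);
    }
#else
    // with USE_GRAPHS undefined the parameters would otherwise go
    // unreferenced and trip -Wunused-parameter; the casts silence it
    MY_UNUSED(host_ptrs); MY_UNUSED(n);
#endif
}

int main() {
    copy_dest_ptrs(nullptr, 0);
    return 0;
}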
ggml/src/ggml-cuda/fattn-common.cuh CHANGED

@@ -62,7 +62,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
     T sum = 0.0f;
 
 #pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
+    for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
         const int k_KQ = k_KQ_0 + threadIdx.x;
 
         const int ib = k_KQ / QI8_1;
@@ -102,7 +102,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
     T sum = 0.0f;
 
 #pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
+    for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
         const int k_KQ = k_KQ_0 + threadIdx.x;
 
         const int ib = k_KQ / QI8_1;
@@ -146,7 +146,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
     T sum = 0.0f;
 
 #pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
+    for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
         const int k_KQ = k_KQ_0 + threadIdx.x;
 
         const int ib = k_KQ / QI8_1;
@@ -193,7 +193,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
     T sum = 0.0f;
 
 #pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
+    for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
         const int k_KQ = k_KQ_0 + threadIdx.x;
 
         const int ib = k_KQ / QI8_1;
@@ -244,7 +244,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
     T sum = 0.0f;
 
 #pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
+    for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
         const int k_KQ = k_KQ_0 + threadIdx.x;
 
         const int ib = k_KQ / QI8_0;
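Why the cast fixes a warning: D is an int template parameter, while sizeof(int) has type size_t, so D/sizeof(int) is an unsigned value, and comparing the signed counter k_KQ_0 against it triggers -Wsign-compare. A standalone sketch of the fix follows (illustrative code, not the ggml kernel; walk is a hypothetical name).

#include <cstdio>

template <int D>
static void walk() {
    // D/sizeof(int) is size_t, so "i < D/sizeof(int)" would compare
    // signed against unsigned and warn; casting the bound back to int
    // is safe because D is a small compile-time constant
    for (int i = 0; i < int(D/sizeof(int)); i += 32) {
        printf("i = %d\n", i);
    }
}

int main() {
    walk<128>();  // bound is 128/4 = 32, so the loop runs once with i = 0
    return 0;
}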
ggml/src/ggml-cuda/fattn-tile-f32.cu CHANGED

@@ -52,6 +52,18 @@ static __global__ void flash_attn_tile_ext_f32(
     return;
 #endif // FP16_MMA_AVAILABLE
     if (use_logit_softcap && !(D == 128 || D == 256)) {
+        GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+        GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+        GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+        GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+        GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+        GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+        GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+        GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+        GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+        GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+        GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+        GGML_UNUSED(ne2); GGML_UNUSED(ne3);
         NO_DEVICE_CODE;
         return;
     }
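A note on these hunks: for the skipped kernel variants the early return means the kernel arguments are never actually used, and the MUSA toolchain evidently warns about them; the GGML_UNUSED lines mark each argument as intentionally unreferenced. The same treatment is applied to flash_attn_vec_ext_f32 below.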
ggml/src/ggml-cuda/fattn-vec-f32.cuh CHANGED

@@ -45,6 +45,18 @@ static __global__ void flash_attn_vec_ext_f32(
 
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
+        GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+        GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+        GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+        GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+        GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+        GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+        GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+        GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+        GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+        GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+        GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+        GGML_UNUSED(ne2); GGML_UNUSED(ne3);
         NO_DEVICE_CODE;
         return;
     }
@@ -114,7 +126,7 @@ static __global__ void flash_attn_vec_ext_f32(
         // Set memory to zero if out of bounds:
         if (ncols > 2 && ic0 + j >= ne01) {
 #pragma unroll
-            for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
+            for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
                 const int i = i0 + threadIdx.x;
 
                 tmp_q_i32[i] = 0;
@@ -127,7 +139,7 @@ static __global__ void flash_attn_vec_ext_f32(
 
         const float * Q_f = (const float *) (Q + j*nb01);
 #pragma unroll
-        for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
+        for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
             quantize_q8_1_to_shared<float2>(Q_f + 4*i0, scale, tmp_q_i32, tmp_q_ds);
         }
     }
@@ -140,7 +152,7 @@ static __global__ void flash_attn_vec_ext_f32(
         float2 * tmp_q_ds = (float2 *) (tmp_q_i32 + D/sizeof(int));
 
 #pragma unroll
-        for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
+        for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
             const int i = i0 + threadIdx.x;
 
             Q_i32[j][i0/WARP_SIZE] = tmp_q_i32[i];