Spaces:
Running
Running
chen fan
committed on
Commit
·
0274100
1
Parent(s):
79bc58c
CANN: weight format to NZ for Ascend310P3 (llama/14407)
Browse files
* weight format to nz for 310p
* remove quant weight format to nz
* clean code
* fix
* make the conditions for converting weights to NZ format consistent
* clean code
ggml/src/ggml-cann/aclnn_ops.cpp
CHANGED
|
@@ -1785,8 +1785,27 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
|
|
| 1785 |
size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
|
| 1786 |
bcast_weight_nb[2], bcast_weight_nb[3],
|
| 1787 |
bcast_weight_nb[4], bcast_weight_nb[5]};
|
| 1788 |
-
aclTensor* acl_weight_tensor
|
| 1789 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1790 |
aclTensor* acl_dst =
|
| 1791 |
ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
|
| 1792 |
|
|
|
|
| 1785 |
size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
|
| 1786 |
bcast_weight_nb[2], bcast_weight_nb[3],
|
| 1787 |
bcast_weight_nb[4], bcast_weight_nb[5]};
|
| 1788 |
+
aclTensor* acl_weight_tensor;
|
| 1789 |
+
|
| 1790 |
+
bool weightToNZ = false;
|
| 1791 |
+
#ifdef ASCEND_310P
|
| 1792 |
+
weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
|
| 1793 |
+
#endif
|
| 1794 |
+
if (weightToNZ && is_matmul_weight(weight)) {
|
| 1795 |
+
int64_t acl_stride[2] = {1, transpose_ne[1]};
|
| 1796 |
+
|
| 1797 |
+
// Reverse ne.
|
| 1798 |
+
std::reverse(transpose_ne, transpose_ne + n_dims);
|
| 1799 |
+
|
| 1800 |
+
std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};
|
| 1801 |
+
|
| 1802 |
+
acl_weight_tensor = aclCreateTensor(
|
| 1803 |
+
transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
|
| 1804 |
+
0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
|
| 1805 |
+
} else {
|
| 1806 |
+
acl_weight_tensor =
|
| 1807 |
+
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
|
| 1808 |
+
}
|
| 1809 |
aclTensor* acl_dst =
|
| 1810 |
ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
|
| 1811 |
|
ggml/src/ggml-cann/aclnn_ops.h
CHANGED
|
@@ -23,6 +23,7 @@
|
|
| 23 |
#ifndef CANN_ACLNN_OPS
|
| 24 |
#define CANN_ACLNN_OPS
|
| 25 |
|
|
|
|
| 26 |
#include <functional>
|
| 27 |
#include <aclnnop/aclnn_abs.h>
|
| 28 |
#include <aclnnop/aclnn_neg.h>
|
|
@@ -1020,6 +1021,37 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
|
|
| 1020 |
*/
|
| 1021 |
void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
| 1022 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1023 |
/**
|
| 1024 |
* @brief Applies a element-wise operation to two input tensors using the CANN
|
| 1025 |
* backend.
|
|
|
|
| 23 |
#ifndef CANN_ACLNN_OPS
|
| 24 |
#define CANN_ACLNN_OPS
|
| 25 |
|
| 26 |
+
#include <unordered_set>
|
| 27 |
#include <functional>
|
| 28 |
#include <aclnnop/aclnn_abs.h>
|
| 29 |
#include <aclnnop/aclnn_neg.h>
|
|
|
|
| 1021 |
*/
|
| 1022 |
void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
| 1023 |
|
| 1024 |
+
/**
|
| 1025 |
+
* @brief Check whether a tensor is a weight tensor for matrix multiplication.
|
| 1026 |
+
*
|
| 1027 |
+
* @details Checks whether the given tensor serves as weight parameters in matrix multiplication operations,
|
| 1028 |
+
* typically within neural network layers. The function maintains a static set of canonical weight
|
| 1029 |
+
* naming suffixes from Transformer-based architectures. Uses substring matching to identify weight
|
| 1030 |
+
* tensors even with hierarchical naming patterns.
|
| 1031 |
+
*
|
| 1032 |
+
* @param tensor Pointer to the target ggml_tensor object (const-qualified).
|
| 1033 |
+
*/
|
| 1034 |
+
static bool is_matmul_weight(const ggml_tensor* tensor) {
|
| 1035 |
+
std::string name = ggml_get_name(tensor);
|
| 1036 |
+
static const std::unordered_set<std::string> weight_suffixes{
|
| 1037 |
+
"output.weight",
|
| 1038 |
+
"attn_q.weight",
|
| 1039 |
+
"attn_k.weight",
|
| 1040 |
+
"attn_v.weight",
|
| 1041 |
+
"attn_output.weight",
|
| 1042 |
+
"ffn_gate.weight",
|
| 1043 |
+
"ffn_up.weight",
|
| 1044 |
+
"ffn_down.weight"
|
| 1045 |
+
};
|
| 1046 |
+
|
| 1047 |
+
for (const auto& suffix : weight_suffixes) {
|
| 1048 |
+
if (name.find(suffix) != std::string::npos) {
|
| 1049 |
+
return true;
|
| 1050 |
+
}
|
| 1051 |
+
}
|
| 1052 |
+
return false;
|
| 1053 |
+
}
|
| 1054 |
+
|
| 1055 |
/**
|
| 1056 |
* @brief Applies a element-wise operation to two input tensors using the CANN
|
| 1057 |
* backend.
|
ggml/src/ggml-cann/ggml-cann.cpp
CHANGED
|
@@ -24,6 +24,7 @@
|
|
| 24 |
|
| 25 |
#include <acl/acl.h>
|
| 26 |
#include <stdarg.h>
|
|
|
|
| 27 |
|
| 28 |
#include <cmath>
|
| 29 |
#include <cstdio>
|
|
@@ -1115,6 +1116,63 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
|
|
| 1115 |
return GGML_STATUS_SUCCESS;
|
| 1116 |
}
|
| 1117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1118 |
// TODO: need handle tensor which has paddings.
|
| 1119 |
/**
|
| 1120 |
* @brief Set tensor data in a CANN buffer.
|
|
@@ -1139,9 +1197,16 @@ static void ggml_backend_cann_buffer_set_tensor(
|
|
| 1139 |
// For acl, synchronous functions use this default stream.
|
| 1140 |
// Why aclrtSynchronizeDevice?
|
| 1141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1142 |
if (!need_transform(tensor->type)) {
|
| 1143 |
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
|
| 1144 |
ACL_MEMCPY_HOST_TO_DEVICE));
|
|
|
|
|
|
|
|
|
|
| 1145 |
} else {
|
| 1146 |
void *transform_buffer = malloc(size);
|
| 1147 |
ggml_backend_cann_transform(tensor, data, transform_buffer);
|
|
|
|
| 24 |
|
| 25 |
#include <acl/acl.h>
|
| 26 |
#include <stdarg.h>
|
| 27 |
+
#include <aclnnop/aclnn_trans_matmul_weight.h>
|
| 28 |
|
| 29 |
#include <cmath>
|
| 30 |
#include <cstdio>
|
|
|
|
| 1116 |
return GGML_STATUS_SUCCESS;
|
| 1117 |
}
|
| 1118 |
|
| 1119 |
+
static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
|
| 1120 |
+
aclDataType dataType, aclTensor **tensor)
|
| 1121 |
+
{
|
| 1122 |
+
uint64_t size = 1;
|
| 1123 |
+
for (auto i : shape) {
|
| 1124 |
+
size *= i;
|
| 1125 |
+
}
|
| 1126 |
+
|
| 1127 |
+
const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
|
| 1128 |
+
ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));
|
| 1129 |
+
|
| 1130 |
+
size *= sizeof(int16_t);
|
| 1131 |
+
|
| 1132 |
+
ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
|
| 1133 |
+
aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);
|
| 1134 |
+
|
| 1135 |
+
std::vector<int64_t> strides(shape.size(), 1);
|
| 1136 |
+
for (int64_t i = shape.size() - 2; i >= 0; i--) {
|
| 1137 |
+
strides[i] = shape[i + 1] * strides[i + 1];
|
| 1138 |
+
}
|
| 1139 |
+
|
| 1140 |
+
*tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
|
| 1141 |
+
shape.data(), shape.size(), *deviceAddr);
|
| 1142 |
+
return 0;
|
| 1143 |
+
}
|
| 1144 |
+
|
| 1145 |
+
/**
 * @brief Convert a matmul weight with aclnnTransMatmulWeight and store the
 *        result in the tensor's device buffer.
 *
 * @details The host data is uploaded to a temporary device buffer, converted
 *          there on a private stream, and then copied device-to-device into
 *          @p tensor->data + @p offset. The stream is synchronized before the
 *          result is read and before any scratch memory is released.
 *
 * @param tensor Destination tensor; ne[0]/ne[1] give the weight shape and
 *               tensor->data is the destination device buffer.
 * @param data   Host pointer to the source weight data.
 * @param offset Byte offset into tensor->data at which the result is written.
 */
static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
    aclrtStream stream = nullptr;
    ACL_CHECK(aclrtCreateStream(&stream));

    std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
    void *weightTransposedDeviceAddr = nullptr;
    aclTensor *weightTransposed = nullptr;
    CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
                          ggml_cann_type_mapping(tensor->type), &weightTransposed);

    uint64_t workspaceSize = 0;
    aclOpExecutor *executor = nullptr;
    void *workspaceAddr = nullptr;

    // TransMatmulWeight
    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
    // Owns the workspace so it is freed on every exit path.
    std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
    if (workspaceSize > 0) {
        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
        workspaceAddrPtrTrans.reset(workspaceAddr);
    }
    ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));

    // The conversion is enqueued on `stream`; wait for it before reading the
    // result or releasing the buffers it works on.
    ACL_CHECK(aclrtSynchronizeStream(stream));

    size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);

    // Source and destination both live on the device.
    ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
                          weightTransposedDeviceAddr, size, ACL_MEMCPY_DEVICE_TO_DEVICE));
    ACL_CHECK(aclDestroyTensor(weightTransposed));
    ACL_CHECK(aclrtFree(weightTransposedDeviceAddr));
    ACL_CHECK(aclrtDestroyStream(stream));
}
|
| 1175 |
+
|
| 1176 |
// TODO: need handle tensor which has paddings.
|
| 1177 |
/**
|
| 1178 |
* @brief Set tensor data in a CANN buffer.
|
|
|
|
| 1197 |
// For acl, synchronous functions use this default stream.
|
| 1198 |
// Why aclrtSynchronizeDevice?
|
| 1199 |
|
| 1200 |
+
bool weightToNZ = false;
|
| 1201 |
+
#ifdef ASCEND_310P
|
| 1202 |
+
weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
|
| 1203 |
+
#endif
|
| 1204 |
if (!need_transform(tensor->type)) {
|
| 1205 |
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
|
| 1206 |
ACL_MEMCPY_HOST_TO_DEVICE));
|
| 1207 |
+
if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
|
| 1208 |
+
weight_format_to_nz(tensor, data, offset);
|
| 1209 |
+
}
|
| 1210 |
} else {
|
| 1211 |
void *transform_buffer = malloc(size);
|
| 1212 |
ggml_backend_cann_transform(tensor, data, transform_buffer);
|