chen fan committed on
Commit
0274100
·
1 Parent(s): 79bc58c

CANN: weight format to NZ for Ascend310P3 (llama/14407)

Browse files

* weight format to nz for 310p

* remove quant weight format to nz

* clean code

* fix

* make the conditions for converting weights to NZ format consistent

* clean code

ggml/src/ggml-cann/aclnn_ops.cpp CHANGED
@@ -1785,8 +1785,27 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
1785
  size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
1786
  bcast_weight_nb[2], bcast_weight_nb[3],
1787
  bcast_weight_nb[4], bcast_weight_nb[5]};
1788
- aclTensor* acl_weight_tensor =
1789
- ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1790
  aclTensor* acl_dst =
1791
  ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
1792
 
 
1785
  size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
1786
  bcast_weight_nb[2], bcast_weight_nb[3],
1787
  bcast_weight_nb[4], bcast_weight_nb[5]};
1788
+ aclTensor* acl_weight_tensor;
1789
+
1790
+ bool weightToNZ = false;
1791
+ #ifdef ASCEND_310P
1792
+ weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
1793
+ #endif
1794
+ if (weightToNZ && is_matmul_weight(weight)) {
1795
+ int64_t acl_stride[2] = {1, transpose_ne[1]};
1796
+
1797
+ // Reverse ne.
1798
+ std::reverse(transpose_ne, transpose_ne + n_dims);
1799
+
1800
+ std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};
1801
+
1802
+ acl_weight_tensor = aclCreateTensor(
1803
+ transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
1804
+ 0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
1805
+ } else {
1806
+ acl_weight_tensor =
1807
+ ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
1808
+ }
1809
  aclTensor* acl_dst =
1810
  ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
1811
 
ggml/src/ggml-cann/aclnn_ops.h CHANGED
@@ -23,6 +23,7 @@
23
  #ifndef CANN_ACLNN_OPS
24
  #define CANN_ACLNN_OPS
25
 
 
26
  #include <functional>
27
  #include <aclnnop/aclnn_abs.h>
28
  #include <aclnnop/aclnn_neg.h>
@@ -1020,6 +1021,37 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
1020
  */
1021
  void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
1022
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1023
  /**
1024
  * @brief Applies a element-wise operation to two input tensors using the CANN
1025
  * backend.
 
23
  #ifndef CANN_ACLNN_OPS
24
  #define CANN_ACLNN_OPS
25
 
26
+ #include <unordered_set>
27
  #include <functional>
28
  #include <aclnnop/aclnn_abs.h>
29
  #include <aclnnop/aclnn_neg.h>
 
1021
  */
1022
  void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
1023
 
1024
+ /**
1025
+ * @brief Check whether a tensor is a weight tensor for matrix multiplication.
1026
+ *
1027
+ * @details Checks whether the given tensor serves as weight parameters in matrix multiplication operations,
1028
+ * typically within neural network layers. The function maintains a static set of canonical weight
1029
+ * naming suffixes from Transformer-based architectures. Uses substring matching to identify weight
1030
+ * tensors even with hierarchical naming patterns.
1031
+ *
1032
+ * @param tensor Pointer to the target ggml_tensor object (const-qualified).
1033
+ */
1034
+ static bool is_matmul_weight(const ggml_tensor* tensor) {
1035
+ std::string name = ggml_get_name(tensor);
1036
+ static const std::unordered_set<std::string> weight_suffixes{
1037
+ "output.weight",
1038
+ "attn_q.weight",
1039
+ "attn_k.weight",
1040
+ "attn_v.weight",
1041
+ "attn_output.weight",
1042
+ "ffn_gate.weight",
1043
+ "ffn_up.weight",
1044
+ "ffn_down.weight"
1045
+ };
1046
+
1047
+ for (const auto& suffix : weight_suffixes) {
1048
+ if (name.find(suffix) != std::string::npos) {
1049
+ return true;
1050
+ }
1051
+ }
1052
+ return false;
1053
+ }
1054
+
1055
  /**
1056
  * @brief Applies a element-wise operation to two input tensors using the CANN
1057
  * backend.
ggml/src/ggml-cann/ggml-cann.cpp CHANGED
@@ -24,6 +24,7 @@
24
 
25
  #include <acl/acl.h>
26
  #include <stdarg.h>
 
27
 
28
  #include <cmath>
29
  #include <cstdio>
@@ -1115,6 +1116,63 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
1115
  return GGML_STATUS_SUCCESS;
1116
  }
1117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1118
  // TODO: need handle tensor which has paddings.
1119
  /**
1120
  * @brief Set tensor data in a CANN buffer.
@@ -1139,9 +1197,16 @@ static void ggml_backend_cann_buffer_set_tensor(
1139
  // For acl, synchronous functions use this default stream.
1140
  // Why aclrtSynchronizeDevice?
1141
 
 
 
 
 
1142
  if (!need_transform(tensor->type)) {
1143
  ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
1144
  ACL_MEMCPY_HOST_TO_DEVICE));
 
 
 
1145
  } else {
1146
  void *transform_buffer = malloc(size);
1147
  ggml_backend_cann_transform(tensor, data, transform_buffer);
 
24
 
25
  #include <acl/acl.h>
26
  #include <stdarg.h>
27
+ #include <aclnnop/aclnn_trans_matmul_weight.h>
28
 
29
  #include <cmath>
30
  #include <cstdio>
 
1116
  return GGML_STATUS_SUCCESS;
1117
  }
1118
 
1119
+ static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
1120
+ aclDataType dataType, aclTensor **tensor)
1121
+ {
1122
+ uint64_t size = 1;
1123
+ for (auto i : shape) {
1124
+ size *= i;
1125
+ }
1126
+
1127
+ const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
1128
+ ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));
1129
+
1130
+ size *= sizeof(int16_t);
1131
+
1132
+ ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
1133
+ aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);
1134
+
1135
+ std::vector<int64_t> strides(shape.size(), 1);
1136
+ for (int64_t i = shape.size() - 2; i >= 0; i--) {
1137
+ strides[i] = shape[i + 1] * strides[i + 1];
1138
+ }
1139
+
1140
+ *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
1141
+ shape.data(), shape.size(), *deviceAddr);
1142
+ return 0;
1143
+ }
1144
+
1145
+ static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
1146
+ aclrtStream stream;
1147
+ ACL_CHECK(aclrtCreateStream(&stream));
1148
+
1149
+ std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
1150
+ void *weightTransposedDeviceAddr = nullptr;
1151
+ aclTensor *weightTransposed = nullptr;
1152
+ CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
1153
+ ggml_cann_type_mapping(tensor->type), &weightTransposed);
1154
+
1155
+ uint64_t workspaceSize = 0;
1156
+ aclOpExecutor *executor;
1157
+ void *workspaceAddr = nullptr;
1158
+
1159
+ // TransMatmulWeight
1160
+ ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
1161
+ std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
1162
+ if (workspaceSize > 0) {
1163
+ ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
1164
+ workspaceAddrPtrTrans.reset(workspaceAddr);
1165
+ }
1166
+ ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));
1167
+
1168
+ size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);
1169
+
1170
+ aclrtMemcpy((char *)tensor->data + offset, size,
1171
+ weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE);
1172
+ ACL_CHECK(aclDestroyTensor(weightTransposed));
1173
+ aclrtFree(weightTransposedDeviceAddr);
1174
+ }
1175
+
1176
  // TODO: need handle tensor which has paddings.
1177
  /**
1178
  * @brief Set tensor data in a CANN buffer.
 
1197
  // For acl, synchronous functions use this default stream.
1198
  // Why aclrtSynchronizeDevice?
1199
 
1200
+ bool weightToNZ = false;
1201
+ #ifdef ASCEND_310P
1202
+ weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
1203
+ #endif
1204
  if (!need_transform(tensor->type)) {
1205
  ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
1206
  ACL_MEMCPY_HOST_TO_DEVICE));
1207
+ if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
1208
+ weight_format_to_nz(tensor, data, offset);
1209
+ }
1210
  } else {
1211
  void *transform_buffer = malloc(size);
1212
  ggml_backend_cann_transform(tensor, data, transform_buffer);