talk-llama : sync llama.cpp
- examples/talk-llama/llama-arch.cpp +20 -0
- examples/talk-llama/llama-arch.h +2 -0
- examples/talk-llama/llama-chat.cpp +7 -15
- examples/talk-llama/llama-chat.h +2 -2
- examples/talk-llama/llama-context.cpp +4 -17
- examples/talk-llama/llama-context.h +1 -2
- examples/talk-llama/llama-graph.cpp +42 -16
- examples/talk-llama/llama-graph.h +5 -7
- examples/talk-llama/llama-hparams.h +1 -0
- examples/talk-llama/llama-model.cpp +65 -12
- examples/talk-llama/llama-model.h +7 -2
- examples/talk-llama/llama-sampling.cpp +2 -1
- examples/talk-llama/llama.h +1 -0
examples/talk-llama/llama-arch.cpp
CHANGED
@@ -19,6 +19,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_REFACT,           "refact"         },
     { LLM_ARCH_BERT,             "bert"           },
     { LLM_ARCH_NOMIC_BERT,       "nomic-bert"     },
+    { LLM_ARCH_NOMIC_BERT_MOE,   "nomic-bert-moe" },
     { LLM_ARCH_JINA_BERT_V2,     "jina-bert-v2"   },
     { LLM_ARCH_BLOOM,            "bloom"          },
     { LLM_ARCH_STABLELM,         "stablelm"       },
@@ -106,6 +107,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_SCALE,        "%s.expert_weights_scale"        },
     { LLM_KV_EXPERT_WEIGHTS_NORM,         "%s.expert_weights_norm"         },
     { LLM_KV_EXPERT_GATING_FUNC,          "%s.expert_gating_func"          },
+    { LLM_KV_MOE_EVERY_N_LAYERS,          "%s.moe_every_n_layers"          },
     { LLM_KV_POOLING_TYPE,                "%s.pooling_type"                },
     { LLM_KV_LOGIT_SCALE,                 "%s.logit_scale"                 },
     { LLM_KV_DECODER_START_TOKEN_ID,      "%s.decoder_start_token_id"      },
@@ -472,6 +474,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_NOMIC_BERT_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_JINA_BERT_V2,
         {
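Note: the per-layer strings in this table are printf-style patterns; entries such as "blk.%d.ffn_up_exps" take the block index, and the loader appends the usual "weight"/"bias" suffix when creating tensors. A small illustrative sketch of that resolution (the helper below is hypothetical; the real code goes through its own tn()/LLM_TN machinery):

    #include <cstdio>
    #include <string>

    // Resolve a per-layer tensor-name pattern into a GGUF tensor name,
    // e.g. ("blk.%d.ffn_up_exps", 3, "weight") -> "blk.3.ffn_up_exps.weight".
    static std::string resolve_tensor_name(const char * pattern, int il, const char * suffix) {
        char base[128];
        std::snprintf(base, sizeof(base), pattern, il);
        return std::string(base) + "." + suffix;
    }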
examples/talk-llama/llama-arch.h
CHANGED
@@ -23,6 +23,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_NOMIC_BERT_MOE,
     LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
@@ -110,6 +111,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
+    LLM_KV_MOE_EVERY_N_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
examples/talk-llama/llama-chat.cpp
CHANGED
@@ -50,8 +50,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "deepseek3",          LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
     { "command-r",          LLM_CHAT_TEMPLATE_COMMAND_R  },
     { "llama3",             LLM_CHAT_TEMPLATE_LLAMA_3    },
-    { "chatglm3",           LLM_CHAT_TEMPLATE_CHATGML_3  },
-    { "chatglm4",           LLM_CHAT_TEMPLATE_CHATGML_4  },
+    { "chatglm3",           LLM_CHAT_TEMPLATE_CHATGLM_3  },
+    { "chatglm4",           LLM_CHAT_TEMPLATE_CHATGLM_4  },
     { "glmedge",            LLM_CHAT_TEMPLATE_GLMEDGE    },
     { "minicpm",            LLM_CHAT_TEMPLATE_MINICPM    },
     { "exaone3",            LLM_CHAT_TEMPLATE_EXAONE_3   },
@@ -122,6 +122,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("[gMASK]<sop>")) {
+        return LLM_CHAT_TEMPLATE_CHATGLM_4;
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
         return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
     } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
@@ -154,9 +156,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_LLAMA_3;
     } else if (tmpl_contains("[gMASK]sop")) {
         // chatglm3-6b
-        return LLM_CHAT_TEMPLATE_CHATGML_3;
-    } else if (tmpl_contains("[gMASK]<sop>")) {
-        return LLM_CHAT_TEMPLATE_CHATGML_4;
+        return LLM_CHAT_TEMPLATE_CHATGLM_3;
     } else if (tmpl_contains(LU8("<用户>"))) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         return LLM_CHAT_TEMPLATE_MINICPM;
@@ -437,7 +437,7 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
         // chatglm3-6b
         ss << "[gMASK]" << "sop";
         for (auto message : chat) {
@@ -447,7 +447,7 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
            ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
         ss << "[gMASK]" << "<sop>";
         for (auto message : chat) {
             std::string role(message->role);
@@ -456,14 +456,6 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
-        for (auto message : chat) {
-            std::string role(message->role);
-            ss << "<|" << role << "|>" << "\n" << message->content;
-        }
-        if (add_ass) {
-            ss << "<|assistant|>";
-        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         for (auto message : chat) {
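As a usage note, the short names in LLM_CHAT_TEMPLATES can be passed directly as the template string, so the renamed ChatGLM templates are reachable through the public API. A hedged sketch; the llama_chat_apply_template signature is assumed from current llama.h and is not part of this diff:

    #include "llama.h"
    #include <cstdio>
    #include <vector>

    int main() {
        const llama_chat_message chat[] = {
            { "user", "Hello" },
        };
        std::vector<char> buf(1024);
        // "chatglm4" maps to LLM_CHAT_TEMPLATE_CHATGLM_4; per the template code above,
        // the formatted prompt starts with "[gMASK]<sop>" and, with add_ass, ends with "<|assistant|>".
        const int32_t n = llama_chat_apply_template("chatglm4", chat, 1, /*add_ass=*/true,
                                                    buf.data(), (int32_t) buf.size());
        if (n > 0 && n <= (int32_t) buf.size()) {
            std::printf("%.*s\n", n, buf.data());
        }
        return 0;
    }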
examples/talk-llama/llama-chat.h
CHANGED
@@ -29,8 +29,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_DEEPSEEK_3,
     LLM_CHAT_TEMPLATE_COMMAND_R,
     LLM_CHAT_TEMPLATE_LLAMA_3,
-    LLM_CHAT_TEMPLATE_CHATGML_3,
-    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_CHATGLM_3,
+    LLM_CHAT_TEMPLATE_CHATGLM_4,
     LLM_CHAT_TEMPLATE_GLMEDGE,
     LLM_CHAT_TEMPLATE_MINICPM,
     LLM_CHAT_TEMPLATE_EXAONE_3,
examples/talk-llama/llama-context.cpp
CHANGED
@@ -114,7 +114,7 @@ llama_context::llama_context(
     }

     if (n_ctx_per_seq > hparams.n_ctx_train) {
-        LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }

@@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift(
         ggml_tensor * shift,
         ggml_tensor * factors,
               float   freq_base,
-              float   freq_scale,
-        ggml_backend_buffer * bbuf) const {
+              float   freq_scale) const {
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

     const auto & yarn_ext_factor  = cparams.yarn_ext_factor;
@@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift(
         // dequantize to f32 -> RoPE -> quantize back
         tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);

-        if (bbuf) {
-            for (const auto & backend : backends) {
-                // Figure out which backend KV cache belongs to
-                if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
-                    ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
-                    break;
-                }
-            }
-        }
-
-        tmp = ggml_rope_ext_inplace(ctx0, tmp,
+        tmp = ggml_rope_ext(ctx0, tmp,
                 shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);

@@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
                 ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
                 0);

-        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer);
+        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);

         ggml_build_forward_expand(gf, cur);
     }
@@ -1547,8 +1536,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
     // set all ids as invalid (negative)
     std::fill(output_ids.begin(), output_ids.end(), -1);

-    ggml_backend_buffer_clear(buf_output.get(), 0);
-
     this->n_outputs     = 0;
     this->n_outputs_max = n_outputs_max;

examples/talk-llama/llama-context.h
CHANGED
@@ -170,8 +170,7 @@ private:
             ggml_tensor * shift,
             ggml_tensor * factors,
                   float   freq_base,
-                  float   freq_scale,
-            ggml_backend_buffer * bbuf) const;
+                  float   freq_scale) const;

     llm_graph_result_ptr build_kv_self_shift(
             ggml_context * ctx0,
examples/talk-llama/llama-graph.cpp
CHANGED
@@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && pos) {
         const int64_t n_tokens = ubatch->n_tokens;

-        ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
+        if (ubatch->token && n_pos_per_embd == 4) {
+            // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
+            // the 3 first dims are the same, and 4th dim is all 0
+            std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
+            // copy the first dimension
+            for (int i = 0; i < n_tokens; ++i) {
+                pos_data[               i] = ubatch->pos[i];
+                pos_data[    n_tokens + i] = ubatch->pos[i];
+                pos_data[2 * n_tokens + i] = ubatch->pos[i];
+                pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
+            }
+            ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
+        } else {
+            ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
+        }
     }
 }

@@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
                     ) * f_attn_temp_scale + 1.0;
         }

-        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
+        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
     }
 }

@@ -592,7 +606,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     res (std::make_unique<llm_graph_result>()) {
 }

-int64_t llm_graph_context::n_pos_per_token() const {
+int64_t llm_graph_context::n_pos_per_embd() const {
     return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
 }

@@ -803,6 +817,10 @@ ggml_tensor * llm_graph_context::build_ffn(

     if (down) {
         cur = build_lora_mm(down, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
     }

     if (down_b) {
@@ -910,28 +928,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);

-    ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(gate, "ffn_moe_gate", il);
+    ggml_tensor * experts = nullptr;
+    if (gate_exps) {
+        cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate", il);
+    } else {
+        cur = up;
+    }

     switch (type_op) {
         case LLM_FFN_SILU:
             {
-                gate = ggml_silu(ctx0, gate);
-                cb(gate, "ffn_moe_silu", il);
+                cur = ggml_silu(ctx0, cur);
+                cb(cur, "ffn_moe_silu", il);
             } break;
         case LLM_FFN_GELU:
             {
-                gate = ggml_gelu(ctx0, gate);
-                cb(gate, "ffn_moe_gelu", il);
+                cur = ggml_gelu(ctx0, cur);
+                cb(cur, "ffn_moe_gelu", il);
             } break;
         default:
             GGML_ABORT("fatal error");
     }

-    ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
-    cb(par, "ffn_moe_gate_par", il);
+    if (gate_exps) {
+        cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate_par", il);
+    }

-    ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);

     if (!weight_before_ffn) {
@@ -1014,11 +1039,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 }

 ggml_tensor * llm_graph_context::build_inp_pos() const {
-    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
+    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());

     auto & cur = inp->pos;

-    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
+    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
     ggml_set_input(cur);

     res->add_input(std::move(inp));
@@ -1027,11 +1052,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }

 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);

     auto & cur = inp->attn_scale;

-    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
+    // this need to be 1x1xN for broadcasting
+    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
     ggml_set_input(cur);

     res->add_input(std::move(inp));
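The new text-token branch in llm_graph_input_pos::set_input lays positions out as four consecutive sections of n_tokens values (three copies of the 1-D position, then zeros), which keeps plain text batches compatible with M-RoPE. A minimal standalone sketch of that layout, independent of the llama.cpp types:

    #include <cstdint>
    #include <vector>

    // Expand 1-D text positions into the 4-section M-RoPE layout:
    // sections 0..2 repeat the original position, section 3 is all zeros.
    static std::vector<int32_t> expand_mrope_positions(const std::vector<int32_t> & pos) {
        const size_t n_tokens = pos.size();
        std::vector<int32_t> out(4 * n_tokens, 0);
        for (size_t i = 0; i < n_tokens; ++i) {
            out[0 * n_tokens + i] = pos[i];
            out[1 * n_tokens + i] = pos[i];
            out[2 * n_tokens + i] = pos[i];
            // out[3 * n_tokens + i] stays 0
        }
        return out;
    }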
examples/talk-llama/llama-graph.h
CHANGED
@@ -90,29 +90,27 @@ public:

 class llm_graph_input_pos : public llm_graph_input_i {
 public:
-    llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
+    llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
     virtual ~llm_graph_input_pos() = default;

     void set_input(const llama_ubatch * ubatch) override;

     ggml_tensor * pos = nullptr; // I32 [n_batch]

-    const int64_t n_pos_per_token = 1;
+    const int64_t n_pos_per_embd = 1;
 };

 // temperature tuning, used by llama4
 class llm_graph_input_attn_temp : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-        : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
+        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
     virtual ~llm_graph_input_attn_temp() = default;

     void set_input(const llama_ubatch * ubatch) override;

     ggml_tensor * attn_scale = nullptr; // F32 [n_batch]

-    const int64_t n_pos_per_token = 1;
-
     const uint32_t n_attn_temp_floor_scale;
     const float    f_attn_temp_scale;
 };
@@ -419,7 +417,7 @@ struct llm_graph_context {

     llm_graph_context(const llm_graph_params & params);

-    int64_t n_pos_per_token() const;
+    int64_t n_pos_per_embd() const;

     void cb(ggml_tensor * cur, const char * name, int il) const;

examples/talk-llama/llama-hparams.h
CHANGED
@@ -66,6 +66,7 @@ struct llama_hparams {
     float    expert_weights_scale = 0.0;
     bool     expert_weights_norm  = false;
     uint32_t expert_gating_func   = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
+    uint32_t moe_every_n_layers   = 0;

     float f_norm_eps;
     float f_norm_rms_eps;
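The new moe_every_n_layers field is read from the %s.moe_every_n_layers GGUF key and decides which layers take the MoE feed-forward path. The predicate used in llama-model.cpp below boils down to the following sketch; with moe_every_n_layers == 2 it selects layers 1, 3, 5, and so on:

    #include <cstdint>

    // A layer uses the MoE FFN when the key is set (> 0) and the layer index
    // falls on the MoE cadence, matching the checks in llama-model.cpp.
    static bool layer_uses_moe(uint32_t moe_every_n_layers, uint32_t il) {
        return moe_every_n_layers > 0 && il % moe_every_n_layers == 1;
    }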
examples/talk-llama/llama-model.cpp
CHANGED
@@ -40,14 +40,17 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_335M:          return "335M";
         case LLM_TYPE_410M:          return "410M";
         case LLM_TYPE_450M:          return "450M";
+        case LLM_TYPE_475M:          return "475M";
         case LLM_TYPE_770M:          return "770M";
         case LLM_TYPE_780M:          return "780M";
         case LLM_TYPE_0_5B:          return "0.5B";
+        case LLM_TYPE_0_6B:          return "0.6B";
        case LLM_TYPE_1B:            return "1B";
         case LLM_TYPE_1_3B:          return "1.3B";
         case LLM_TYPE_1_4B:          return "1.4B";
         case LLM_TYPE_1_5B:          return "1.5B";
         case LLM_TYPE_1_6B:          return "1.6B";
+        case LLM_TYPE_1_7B:          return "1.7B";
         case LLM_TYPE_1_8B:          return "1.8B";
         case LLM_TYPE_2B:            return "2B";
         case LLM_TYPE_2_8B:          return "2.8B";
@@ -66,6 +69,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_15B:           return "15B";
         case LLM_TYPE_16B:           return "16B";
         case LLM_TYPE_20B:           return "20B";
+        case LLM_TYPE_27B:           return "27B";
         case LLM_TYPE_30B:           return "30B";
         case LLM_TYPE_32B:           return "32B";
         case LLM_TYPE_34B:           return "34B";
@@ -74,6 +78,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_65B:           return "65B";
         case LLM_TYPE_70B:           return "70B";
         case LLM_TYPE_236B:          return "236B";
+        case LLM_TYPE_290B:          return "290B";
         case LLM_TYPE_314B:          return "314B";
         case LLM_TYPE_671B:          return "671B";
         case LLM_TYPE_SMALL:         return "0.1B";
@@ -88,10 +93,10 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_16x3_8B:       return "16x3.8B";
         case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
         case LLM_TYPE_57B_A14B:      return "57B.A14B";
-        case LLM_TYPE_27B:           return "27B";
-        case LLM_TYPE_290B:          return "290B";
         case LLM_TYPE_17B_16E:       return "17Bx16E (Scout)";
         case LLM_TYPE_17B_128E:      return "17Bx128E (Maverick)";
+        case LLM_TYPE_30B_A3B:       return "30B.A3B";
+        case LLM_TYPE_235B_A22B:     return "235B.A22B";
         default:                     return "?B";
     }
 }
@@ -695,13 +700,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL,        hparams.causal_attn);
                 ml.get_key(LLM_KV_POOLING_TYPE,            hparams.pooling_type);
+                ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS,      hparams.moe_every_n_layers, 0);

                 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
-                    type = LLM_TYPE_137M;
+                    if (arch == LLM_ARCH_NOMIC_BERT) {
+                        type = LLM_TYPE_137M;
+                    } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
+                        type = LLM_TYPE_475M;
+                    }
                 }
             } break;
         case LLM_ARCH_BLOOM:
@@ -791,6 +802,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
+                    case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
+                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
+                    case 40: type = LLM_TYPE_14B; break;
+                    case 64: type = LLM_TYPE_32B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -800,6 +815,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {

                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
+                    case 48: type = LLM_TYPE_30B_A3B; break;
+                    case 94: type = LLM_TYPE_235B_A22B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -2057,6 +2074,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
            {
                 tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
                 type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
@@ -2090,20 +2108,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
                 }

+                if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                    layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+                }
+
                 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                 layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
                 layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd}, 0);

-                layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-
-                if (arch == LLM_ARCH_BERT) {
+                if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
                     layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-                    layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, 0);
-                    layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+                    layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
+                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
+                    layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
                 } else {
-                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+
+                    if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                        layer.bo         = create_tensor(tn(LLM_TENSOR_ATTN_OUT,  "bias", i), {n_embd}, 0);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,    "bias", i), {n_ff}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN,  "bias", i), {n_embd}, 0);
+                    } else {
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                    }
                 }

                 layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
@@ -5730,6 +5759,11 @@ struct llm_build_bert : public llm_graph_context {
             cur = build_lora_mm(model.layers[il].wqkv, cur);
             cb(cur, "wqkv", il);

+            if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+            }
+
             Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
             Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
             Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
@@ -5782,13 +5816,29 @@ struct llm_build_bert : public llm_graph_context {
         cb(ffn_inp, "ffn_inp", il);

         // feed-forward network
-        if (model.arch == LLM_ARCH_BERT) {
+        if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
+            // MoE branch
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    nullptr,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    hparams.n_expert,
+                    hparams.n_expert_used,
+                    LLM_FFN_GELU,
+                    false, false,
+                    0.0f,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+            cb(cur, "ffn_moe_out", il);
+        } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
             cur = build_ffn(cur,
                     model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
                     NULL,                      NULL,                        NULL,
                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                     NULL,
                     LLM_FFN_GELU, LLM_FFN_SEQ, il);
+            cb(cur, "ffn_out", il);
         } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
             cur = build_ffn(cur,
                     model.layers[il].ffn_up,   NULL, NULL,
@@ -5796,6 +5846,7 @@ struct llm_build_bert : public llm_graph_context {
                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                     NULL,
                     LLM_FFN_GELU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
         } else {
             cur = build_ffn(cur,
                     model.layers[il].ffn_up,   NULL, NULL,
@@ -5803,8 +5854,8 @@ struct llm_build_bert : public llm_graph_context {
                     model.layers[il].ffn_down, NULL, NULL,
                     NULL,
                     LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
         }
-        cb(cur, "ffn_out", il);

         // attentions bypass the intermediate layer
         cur = ggml_add(ctx0, cur, ffn_inp);
@@ -12842,6 +12893,7 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_BERT:
         case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
             {
                 llm = std::make_unique<llm_build_bert>(*this, params, gf);
             } break;
@@ -13200,6 +13252,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DBRX:
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_STABLELM:
         case LLM_ARCH_BITNET:
         case LLM_ARCH_QWEN:
examples/talk-llama/llama-model.h
CHANGED
@@ -36,14 +36,17 @@ enum llm_type {
     LLM_TYPE_335M,
     LLM_TYPE_410M,
     LLM_TYPE_450M,
+    LLM_TYPE_475M,
     LLM_TYPE_770M,
     LLM_TYPE_780M,
     LLM_TYPE_0_5B,
+    LLM_TYPE_0_6B,
     LLM_TYPE_1B,
     LLM_TYPE_1_3B,
     LLM_TYPE_1_4B,
     LLM_TYPE_1_5B,
     LLM_TYPE_1_6B,
+    LLM_TYPE_1_7B,
     LLM_TYPE_1_8B,
     LLM_TYPE_2B,
     LLM_TYPE_2_8B,
@@ -62,6 +65,7 @@ enum llm_type {
     LLM_TYPE_15B,
     LLM_TYPE_16B,
     LLM_TYPE_20B,
+    LLM_TYPE_27B,
     LLM_TYPE_30B,
     LLM_TYPE_32B,
     LLM_TYPE_34B,
@@ -70,6 +74,7 @@ enum llm_type {
     LLM_TYPE_65B,
     LLM_TYPE_70B,
     LLM_TYPE_236B,
+    LLM_TYPE_290B,
     LLM_TYPE_314B,
     LLM_TYPE_671B,
     LLM_TYPE_SMALL,
@@ -84,10 +89,10 @@ enum llm_type {
     LLM_TYPE_16x3_8B,
     LLM_TYPE_10B_128x3_66B,
     LLM_TYPE_57B_A14B,
-    LLM_TYPE_27B,
-    LLM_TYPE_290B,
     LLM_TYPE_17B_16E, // llama4 Scout
     LLM_TYPE_17B_128E, // llama4 Maverick
+    LLM_TYPE_30B_A3B,
+    LLM_TYPE_235B_A22B,
 };

 struct llama_layer_posnet {
examples/talk-llama/llama-sampling.cpp
CHANGED
@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     // }

     if (k <= 0) {
-        k = cur_p->size;
+        return;
     }

     k = std::min(k, (int) cur_p->size);
@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
         }
         cur_p->sorted = true;
     }
+
     cur_p->size = k;
 }

examples/talk-llama/llama.h
CHANGED
@@ -1232,6 +1232,7 @@ extern "C" {
                "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");

     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    /// Setting k <= 0 makes this a noop
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);

     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751