talk-llama : fix build + sync latest llama.cpp
- examples/talk-llama/llama.cpp +265 -170
- examples/talk-llama/llama.h +20 -18
- examples/talk-llama/talk-llama.cpp +4 -4
examples/talk-llama/llama.cpp
CHANGED
@@ -9,6 +9,9 @@
 #include "llama.h"

 #include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif

 #include <array>
 #include <ctime>
@@ -50,49 +53,49 @@ static const size_t MB = 1024*1024;

 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
-    static std::map<e_model, size_t>
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B,   512ull * MB },
         { MODEL_13B,  512ull * MB },
         { MODEL_30B,  512ull * MB },
         { MODEL_65B, 1024ull * MB },
     };
-    return
+    return k_sizes;
 }

 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
-    static std::map<e_model, size_t>
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B,   512ull * MB },
         { MODEL_13B,  512ull * MB },
         { MODEL_30B,  512ull * MB },
         { MODEL_65B, 1024ull * MB },
     };
-    return
+    return k_sizes;
 }

 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
-    static std::map<e_model, size_t>
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B,  1026ull * MB },
         { MODEL_13B, 1608ull * MB },
         { MODEL_30B, 3124ull * MB },
         { MODEL_65B, 5120ull * MB },
     };
-    return
+    return k_sizes;
 }

 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
-    static std::map<e_model, size_t>
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B,   768ull * MB },
         { MODEL_13B, 1024ull * MB },
         { MODEL_30B, 1280ull * MB },
         { MODEL_65B, 1536ull * MB },
     };
-    return
+    return k_sizes;
 }

 // default hparams (LLaMA 7B)
@@ -402,6 +405,7 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGML,
     LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
+    LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
 };

 struct llama_file_loader {
@@ -432,6 +436,8 @@ struct llama_file_loader {
             file_version = LLAMA_FILE_VERSION_GGMF_V1;
         } else if (magic == 'ggjt' && version == 1) {
             file_version = LLAMA_FILE_VERSION_GGJT_V1;
+        } else if (magic == 'ggjt' && version == 2) {
+            file_version = LLAMA_FILE_VERSION_GGJT_V2;
         } else {
             throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
                          magic, version);
@@ -482,7 +488,6 @@ struct llama_file_loader {
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
-                case GGML_TYPE_Q4_2:
                 case GGML_TYPE_Q5_0:
                 case GGML_TYPE_Q5_1:
                 case GGML_TYPE_Q8_0:
@@ -527,8 +532,8 @@ struct llama_file_saver {
         write_vocab();
     }
     void write_magic() {
-        file.write_u32(
-        file.write_u32(
+        file.write_u32(LLAMA_FILE_MAGIC);   // magic
+        file.write_u32(LLAMA_FILE_VERSION); // version
     }
     void write_hparams(enum llama_ftype new_ftype) {
         const llama_hparams & hparams = any_file_loader->hparams;
@@ -558,7 +563,6 @@ struct llama_file_saver {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
-            case GGML_TYPE_Q4_2:
            case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
@@ -585,12 +589,12 @@ struct llama_model_loader {
     std::unique_ptr<llama_mmap> mapping;

     llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
         file_loaders.emplace_back(first_file);
         uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
         for (uint32_t i = 1; i < n_parts; i++) {
             std::string fname = fname_base + "." + std::to_string(i);
-            auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
             file_loaders.emplace_back(ith_file);
             if (ith_file->hparams != first_file->hparams) {
                 throw format("llama.cpp: hparams inconsistent between files");
@@ -637,7 +641,7 @@ struct llama_model_loader {
         }
     }

-    struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
             throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -659,13 +663,14 @@ struct llama_model_loader {
             LLAMA_ASSERT(lt.ne.size() == 1);
             tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
         }
+        ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
         return tensor;
     }

-    void done_getting_tensors() {
+    void done_getting_tensors() const {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
             throw std::string("llama.cpp: file contained more tensors than expected");
         }
@@ -727,8 +732,7 @@ struct llama_model_loader {
             LLAMA_ASSERT(offset == lt.size);
         } else if (lt.split_type == SPLIT_BY_COLUMNS) {
             // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs;
-            tmp_bufs.resize(lt.shards.size());
+            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
             for (size_t i = 0; i < lt.shards.size(); i++) {
                 llama_load_tensor_shard & shard = lt.shards.at(i);
                 llama_file & file = file_loaders.at(shard.file_idx)->file;
@@ -799,6 +803,8 @@ static bool kv_cache_init(

     cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
     cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    ggml_set_name(cache.k, "cache_k");
+    ggml_set_name(cache.v, "cache_v");

     return true;
 }
@@ -807,7 +813,8 @@ struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx =*/ 512,
         /*.n_parts =*/ -1,
-        /*.
+        /*.gpu_layers =*/ 0,
+        /*.seed =*/ -1,
         /*.f16_kv =*/ false,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,
@@ -837,9 +844,11 @@ static const char *llama_file_version_name(llama_file_version version) {
     switch (version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
-        case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (
-
+        case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
+        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
     }
+
+    return "unknown";
 }

 static const char *llama_ftype_name(enum llama_ftype ftype) {
@@ -850,7 +859,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
-        case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -872,6 +880,7 @@ static void llama_model_load_internal(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_gpu_layers,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -916,13 +925,22 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }

+    if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+        if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
+            hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
+            hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+        }
+    }
+
     if (vocab_only) {
         return;
     }

     auto & ctx = model.ctx;

-    size_t ctx_size
+    size_t ctx_size;
+    size_t mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
     fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);

@@ -968,8 +986,6 @@ static void llama_model_load_internal(

     // prepare memory for the weights
     {
-        const auto & hparams = model.hparams;
-
         const uint32_t n_embd  = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
         const uint32_t n_vocab = hparams.n_vocab;
@@ -1011,6 +1027,35 @@ static void llama_model_load_internal(
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

     model.mapping = std::move(ml->mapping);
+#ifdef GGML_USE_CUBLAS
+    {
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+
+        size_t vram_total = 0;
+
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+
+            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        }
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        }
+
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+#else
+    (void) n_gpu_layers;
+#endif

     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
@@ -1021,6 +1066,7 @@ static bool llama_model_load(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_gpu_layers,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1028,7 +1074,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
                                   vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::string & err) {
@@ -1050,6 +1096,13 @@ static bool llama_eval_internal(
         const int   n_tokens,
         const int   n_past,
         const int   n_threads) {
+
+    // enforce that the first token is BOS
+    if (n_past == 0 && tokens[0] != llama_token_bos()) {
+        fprintf(stderr, "%s: first token must be BOS\n", __func__);
+        return false;
+    }
+
     const int64_t t_start_us = ggml_time_us();

     const int N = n_tokens;
@@ -1057,7 +1110,7 @@ static bool llama_eval_internal(
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;

-    auto & kv_self = model.kv_self;
+    const auto & kv_self = model.kv_self;

     LLAMA_ASSERT(!!kv_self.ctx);

@@ -1085,6 +1138,7 @@ static bool llama_eval_internal(
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));

     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
@@ -1109,8 +1163,10 @@ static bool llama_eval_internal(
         // self-attention
         {
             // compute Q and K and RoPE them
-            struct ggml_tensor * Qcur =
-            struct ggml_tensor * Kcur =
+            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            ggml_set_name(Qcur, "Qcur");
+            ggml_set_name(Kcur, "Kcur");

             // store key and value to memory
             {
@@ -1131,6 +1187,7 @@ static bool llama_eval_internal(
                 ggml_permute(ctx0,
                         Qcur,
                         0, 2, 1, 3);
+            ggml_set_name(Q, "Q");

             struct ggml_tensor * K =
                 ggml_permute(ctx0,
@@ -1138,21 +1195,28 @@ static bool llama_eval_internal(
                             ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
                             n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);
+            ggml_set_name(K, "K");

             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            ggml_set_name(KQ, "KQ");

             // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            struct ggml_tensor *
-
-
-
+            struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
+            ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
+
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            ggml_set_name(KQ_scaled, "KQ_scaled");

             // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked =
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            ggml_set_name(KQ_masked, "KQ_masked");

             // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max =
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+

             // split cached V into n_head heads
             struct ggml_tensor * V =
@@ -1161,9 +1225,11 @@ static bool llama_eval_internal(
                         n_ctx*ggml_element_size(kv_self.v),
                         n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
                         il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+            ggml_set_name(V, "V");

 #if 1
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            ggml_set_name(KQV, "KQV");
 #else
             // make V contiguous in memory to speed up the matmul, however we waste time on the copy
             // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
@@ -1174,11 +1240,13 @@ static bool llama_eval_internal(

             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            ggml_set_name(KQV_merged, "KQV_merged");

             // cur = KQV_merged.contiguous().view(n_embd, N)
             cur = ggml_cpy(ctx0,
                     KQV_merged,
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            ggml_set_name(cur, "KQV_merged_contiguous");

             // projection (no bias)
             cur = ggml_mul_mat(ctx0,
@@ -1250,7 +1318,7 @@ static bool llama_eval_internal(
     lctx.use_buf(ctx0, -1);

     // logits -> probs
-    //inpL =
+    //inpL = ggml_soft_max_inplace(ctx0, inpL);

     // run the computation
     ggml_build_forward_expand(&gf, inpL);
@@ -1288,7 +1356,7 @@ static bool llama_eval_internal(
     }

     // extract embeddings
-    if (lctx.embedding.
+    if (!lctx.embedding.empty()) {
         auto & embedding_out = lctx.embedding;

         embedding_out.resize(n_embd);
@@ -1339,6 +1407,8 @@ struct llama_sp_symbol {
     size_t n;
 };

+static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
+
 struct llama_sp_bigram {
     struct comparator {
         bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
@@ -1371,7 +1441,7 @@ struct llama_tokenizer {
             sym.prev = index - 1;
             sym.next = offs == text.size() ? -1 : index + 1;
             index++;
-            symbols_.emplace_back(
+            symbols_.emplace_back(sym);
         }

         // seed the work queue with all possible 2-character tokens.
@@ -1462,12 +1532,12 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
     llama_tokenizer tokenizer(vocab);
     std::vector<llama_vocab::id> output;

-    if (text.
+    if (text.empty()) {
         return output;
     }

     if (bos) {
-        output.push_back(
+        output.push_back(llama_token_bos());
     }

     tokenizer.tokenize(text, output);
@@ -1690,7 +1760,7 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
     }
 }

-void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty) {
+void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
     if (last_tokens_size == 0 || penalty == 1.0f) {
         return;
     }
@@ -1698,7 +1768,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
     const int64_t t_start_sample_us = ggml_time_us();

     for (size_t i = 0; i < candidates->size; ++i) {
-        auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+        const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
         if (token_iter == last_tokens + last_tokens_size) {
             continue;
         }
@@ -1719,7 +1789,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
     }
 }

-void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
+void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
     if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
         return;
     }
@@ -1776,7 +1846,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
     float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);

     // Sample the next word X using top-k sampling
-    llama_sample_top_k(nullptr, candidates, int(k));
+    llama_sample_top_k(nullptr, candidates, int(k), 1);
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     }
@@ -1842,7 +1912,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
     const int64_t t_start_sample_us = ggml_time_us();

     // Find max element
-    auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+    auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
         return a.logit < b.logit;
     });

@@ -1885,7 +1955,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
        case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
@@ -1896,7 +1965,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }

-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp
+    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);

@@ -1950,7 +2019,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else if (tensor.type == GGML_TYPE_F16) {
             f32_conv_buf.resize(nelements * sizeof(float));
             f32_data = (float *) f32_conv_buf.addr;
-            auto f16_data = (const ggml_fp16_t *) tensor.data;
+            const auto * f16_data = (const ggml_fp16_t *) tensor.data;
             for (size_t i = 0; i < nelements; i++) {
                 f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
             }
@@ -1981,21 +2050,31 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     size_t first = counter; counter += chunk_size;
                     if (first >= nelements) {
                         if (!local_hist.empty()) {
-                            for (int j=0; j<int(local_hist.size()); ++j)
+                            for (int j=0; j<int(local_hist.size()); ++j) {
+                                hist_cur[j] += local_hist[j];
+                            }
                             new_size += local_size;
                         }
                         break;
                     }
                     lock.unlock();
                     size_t last = std::min(nelements, first + chunk_size);
-                    if (local_hist.empty())
+                    if (local_hist.empty()) {
+                        local_hist.resize(hist_cur.size(), 0);
+                    }
                     local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                 }
             };
-            if (int
-
+            if ((int) workers.size() < nthread_use - 1) {
+                workers.resize(nthread_use - 1);
+            }
+            for (int it = 0; it < nthread_use - 1; ++it) {
+                workers[it] = std::thread(compute);
+            }
             compute();
-            for (int it = 0; it < nthread_use - 1; ++it)
+            for (int it = 0; it < nthread_use - 1; ++it) {
+                workers[it].join();
+            }
         }

         printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -2041,7 +2120,7 @@ struct llama_context * llama_init_from_file(

     llama_context * ctx = new llama_context;

-    if (params.seed
+    if (params.seed < 0) {
         params.seed = time(NULL);
     }

@@ -2067,7 +2146,7 @@ struct llama_context * llama_init_from_file(

     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

-    if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
                           params.use_mmap, params.use_mlock, params.vocab_only,
                           params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
@@ -2193,7 +2272,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
     fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
     model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));

-    size_t ctx_size
+    size_t ctx_size;
+    size_t mmapped_size;
     model_loader->calc_sizes(&ctx_size, &mmapped_size);
     base_buf.resize(ctx_size);

@@ -2232,8 +2312,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
         }

-        std::string name
-
+        std::string name;
+        {
+            char buf[1024];
+            fin.read(buf, length);
+            name = std::string(buf, length);
+        }

         // check for lora suffix and get the type of tensor
         const std::string lora_suffix = ".lora";
@@ -2248,7 +2332,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         base_name.erase(pos);
         // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());

-        if (model_tensors.find(base_name
+        if (model_tensors.find(base_name) == model_tensors.end()) {
             fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
             return 1;
         }
@@ -2328,7 +2412,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

         if (scaling != 1.0f) {
             ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
-            BA =
+            BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
         }

         ggml_tensor * r;
@@ -2350,8 +2434,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             lora_tensors.clear();

             n_tensors++;
-            if (n_tensors % 4 == 0)
+            if (n_tensors % 4 == 0) {
                 fprintf(stderr, ".");
+            }
         }
     }

@@ -2376,21 +2461,21 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
     }
 }

-int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
     return ctx->model.kv_self.n;
 }

-#define LLAMA_MAX_RNG_STATE 64*1024
+#define LLAMA_MAX_RNG_STATE (64*1024)

 void llama_set_rng_seed(struct llama_context * ctx, int seed) {
-    if (seed
+    if (seed < 0) {
         seed = time(NULL);
     }
     ctx->rng.seed(seed);
 }

 // Returns the *maximum* size of the state
-size_t llama_get_state_size(struct llama_context * ctx) {
+size_t llama_get_state_size(const struct llama_context * ctx) {
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
     // for reference, std::mt19937(1337) serializes to 6701 bytes.
     const size_t s_rng_size = sizeof(size_t);
@@ -2421,8 +2506,8 @@ size_t llama_get_state_size(struct llama_context * ctx) {
 }

 // Copies the state to the specified destination address
-size_t llama_copy_state_data(struct llama_context * ctx, uint8_t *
-    uint8_t * out =
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+    uint8_t * out = dst;

     // copy rng
     {
@@ -2482,7 +2567,9 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {

     if (kv_size) {
         const size_t elt_size = ggml_element_size(kv_self.k);
+
         char buffer[4096];
+
         ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
         ggml_cgraph gf{};
         gf.n_threads = 1;
@@ -2506,10 +2593,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
         ggml_graph_compute(cpy_ctx, &gf);
+
+        ggml_free(cpy_ctx);
     }
     }

-    const size_t written = out -
+    const size_t written = out - dst;
     const size_t max_size = llama_get_state_size(ctx);

     LLAMA_ASSERT(written <= max_size);
@@ -2519,15 +2608,15 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {

 // Sets the state reading from the specified source address
 size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
-    const uint8_t *
+    const uint8_t * inp = src;

     // set rng
     {
         size_t rng_size;
         char rng_buf[LLAMA_MAX_RNG_STATE];

-        memcpy(&rng_size,
-        memcpy(&rng_buf[0],
+        memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
+        memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;

         std::stringstream rng_ss;
         rng_ss.str(std::string(&rng_buf[0], rng_size));
@@ -2541,30 +2630,30 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
         size_t logits_cap;
         size_t logits_size;

-        memcpy(&logits_cap,
-        memcpy(&logits_size,
+        memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
+        memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);

         LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);

         if (logits_size) {
             ctx->logits.resize(logits_size);
-            memcpy(ctx->logits.data(),
+            memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
         }

-
+        inp += logits_cap * sizeof(float);
     }

     // set embeddings
     {
         size_t embedding_size;

-        memcpy(&embedding_size,
+        memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);

         LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);

         if (embedding_size) {
-            memcpy(ctx->embedding.data(),
-
+            memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
+            inp += embedding_size * sizeof(float);
         }
     }

@@ -2579,25 +2668,27 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
         size_t kv_size;
         int kv_ntok;

-        memcpy(&kv_size,
-        memcpy(&kv_ntok,
+        memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+        memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);

         if (kv_size) {
             LLAMA_ASSERT(kv_self.buf.size == kv_size);

             const size_t elt_size = ggml_element_size(kv_self.k);
+
             char buffer[4096];
+
             ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
             ggml_cgraph gf{};
             gf.n_threads = 1;

             ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
-            kin3d->data = (void *)
-
+            kin3d->data = (void *) inp;
+            inp += ggml_nbytes(kin3d);

             ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
-            vin3d->data = (void *)
-
+            vin3d->data = (void *) inp;
+            inp += ggml_nbytes(vin3d);

             ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
                 n_embd, kv_ntok, n_layer,
@@ -2611,12 +2702,13 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
             ggml_graph_compute(cpy_ctx, &gf);
         }

         ctx->model.kv_self.n = kv_ntok;
     }

-    const size_t nread =
     const size_t max_size = llama_get_state_size(ctx);

     LLAMA_ASSERT(nread <= max_size);
@@ -2624,6 +2716,85 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
     return nread;
 }

 int llama_eval(
         struct llama_context * ctx,
            const llama_token * tokens,
@@ -2634,11 +2805,14 @@ int llama_eval(
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
     // get a more accurate load time, upon first eval
     if (!ctx->has_evaluated_once) {
         ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
         ctx->has_evaluated_once = true;
     }
     return 0;
 }

@@ -2662,15 +2836,15 @@ int llama_tokenize(
     return res.size();
 }

-int llama_n_vocab(struct llama_context * ctx) {
     return ctx->vocab.id_to_token.size();
 }

-int llama_n_ctx(struct llama_context * ctx) {
     return ctx->model.hparams.n_ctx;
 }

-int llama_n_embd(struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }

@@ -2682,7 +2856,7 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }

-const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
     if (token >= llama_n_vocab(ctx)) {
         return nullptr;
     }
@@ -2712,9 +2886,9 @@ void llama_print_timings(struct llama_context * ctx) {

     fprintf(stderr, "\n");
     fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
-    fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per
     fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
-    fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per
     fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
 }

@@ -2751,82 +2925,3 @@ const char * llama_print_system_info(void) {
 std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }
-
-bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
-    llama_file file(path_session, "rb");
-
-    // sanity checks
-    {
-        const uint32_t magic = file.read_u32();
-        const uint32_t version = file.read_u32();
-
-        if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
-            fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
-            return false;
-        }
-
-        llama_hparams session_hparams;
-        file.read_raw(&session_hparams, sizeof(llama_hparams));
-
-        if (session_hparams != ctx->model.hparams) {
-            fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
-            return false;
-        }
-    }
-
-    // load the prompt
-    {
-        const uint32_t n_token_count = file.read_u32();
-
-        if (n_token_count > n_token_capacity) {
-            fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
-            return false;
-        }
-
-        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
-        *n_token_count_out = n_token_count;
-    }
-
-    // restore the context state
-    {
-        const size_t n_state_size_cur = file.size - file.tell();
-        const size_t n_state_size_max = llama_get_state_size(ctx);
-
-        if (n_state_size_cur > n_state_size_max) {
-            fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
-            return false;
-        }
-
-        std::vector<uint8_t> state_data(n_state_size_max);
-        file.read_raw(state_data.data(), n_state_size_cur);
-
-        llama_set_state_data(ctx, state_data.data());
-    }
-
-    return true;
-}
-
-bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
-    llama_file file(path_session, "wb");
-
-    file.write_u32(LLAMA_SESSION_MAGIC);
-    file.write_u32(LLAMA_SESSION_VERSION);
-
-    file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
-
-    // save the prompt
-    file.write_u32((uint32_t) n_token_count);
-    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
-
-    // save the context state
-    {
-        const size_t n_state_size_max = llama_get_state_size(ctx);
-
-        std::vector<uint8_t> state_data(n_state_size_max);
-        const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
-
-        file.write_raw(state_data.data(), n_state_size_cur);
-    }
-
-    return true;
-}
| 9 |
#include "llama.h"
|
| 10 |
|
| 11 |
#include "ggml.h"
|
| 12 |
+
#ifdef GGML_USE_CUBLAS
|
| 13 |
+
#include "ggml-cuda.h"
|
| 14 |
+
#endif
|
| 15 |
|
| 16 |
#include <array>
|
| 17 |
#include <ctime>
|
|
|
|
| 53 |
|
| 54 |
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
| 55 |
{
|
| 56 |
+
static std::map<e_model, size_t> k_sizes = {
|
| 57 |
{ MODEL_7B, 512ull * MB },
|
| 58 |
{ MODEL_13B, 512ull * MB },
|
| 59 |
{ MODEL_30B, 512ull * MB },
|
| 60 |
{ MODEL_65B, 1024ull * MB },
|
| 61 |
};
|
| 62 |
+
return k_sizes;
|
| 63 |
}
|
| 64 |
|
| 65 |
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
| 66 |
{
|
| 67 |
+
static std::map<e_model, size_t> k_sizes = {
|
| 68 |
{ MODEL_7B, 512ull * MB },
|
| 69 |
{ MODEL_13B, 512ull * MB },
|
| 70 |
{ MODEL_30B, 512ull * MB },
|
| 71 |
{ MODEL_65B, 1024ull * MB },
|
| 72 |
};
|
| 73 |
+
return k_sizes;
|
| 74 |
}
|
| 75 |
|
| 76 |
// 2*n_embd*n_ctx*n_layer*sizeof(float16)
|
| 77 |
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
| 78 |
{
|
| 79 |
+
static std::map<e_model, size_t> k_sizes = {
|
| 80 |
{ MODEL_7B, 1026ull * MB },
|
| 81 |
{ MODEL_13B, 1608ull * MB },
|
| 82 |
{ MODEL_30B, 3124ull * MB },
|
| 83 |
{ MODEL_65B, 5120ull * MB },
|
| 84 |
};
|
| 85 |
+
return k_sizes;
|
| 86 |
}
|
| 87 |
|
| 88 |
// this is mostly needed for temporary mul_mat buffers to dequantize the data
|
| 89 |
// not actually needed if BLAS is disabled
|
| 90 |
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
| 91 |
{
|
| 92 |
+
static std::map<e_model, size_t> k_sizes = {
|
| 93 |
{ MODEL_7B, 768ull * MB },
|
| 94 |
{ MODEL_13B, 1024ull * MB },
|
| 95 |
{ MODEL_30B, 1280ull * MB },
|
| 96 |
{ MODEL_65B, 1536ull * MB },
|
| 97 |
};
|
| 98 |
+
return k_sizes;
|
| 99 |
}
|
| 100 |
|
| 101 |
// default hparams (LLaMA 7B)
|
|
|
|
| 405 |
LLAMA_FILE_VERSION_GGML,
|
| 406 |
LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
|
| 407 |
LLAMA_FILE_VERSION_GGJT_V1, // added padding
|
| 408 |
+
LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
|
| 409 |
};
|
| 410 |
|
| 411 |
struct llama_file_loader {
|
|
|
|
| 436 |
file_version = LLAMA_FILE_VERSION_GGMF_V1;
|
| 437 |
} else if (magic == 'ggjt' && version == 1) {
|
| 438 |
file_version = LLAMA_FILE_VERSION_GGJT_V1;
|
| 439 |
+
} else if (magic == 'ggjt' && version == 2) {
|
| 440 |
+
file_version = LLAMA_FILE_VERSION_GGJT_V2;
|
| 441 |
} else {
|
| 442 |
throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
|
| 443 |
magic, version);
|
|
|
|
| 488 |
case GGML_TYPE_F16:
|
| 489 |
case GGML_TYPE_Q4_0:
|
| 490 |
case GGML_TYPE_Q4_1:
|
|
|
|
| 491 |
case GGML_TYPE_Q5_0:
|
| 492 |
case GGML_TYPE_Q5_1:
|
| 493 |
case GGML_TYPE_Q8_0:
|
|
|
|
| 532 |
write_vocab();
|
| 533 |
}
|
| 534 |
void write_magic() {
|
| 535 |
+
file.write_u32(LLAMA_FILE_MAGIC); // magic
|
| 536 |
+
file.write_u32(LLAMA_FILE_VERSION); // version
|
| 537 |
}
|
| 538 |
void write_hparams(enum llama_ftype new_ftype) {
|
| 539 |
const llama_hparams & hparams = any_file_loader->hparams;
|
|
|
|
| 563 |
case GGML_TYPE_F16:
|
| 564 |
case GGML_TYPE_Q4_0:
|
| 565 |
case GGML_TYPE_Q4_1:
|
|
|
|
| 566 |
case GGML_TYPE_Q5_0:
|
| 567 |
case GGML_TYPE_Q5_1:
|
| 568 |
case GGML_TYPE_Q8_0:
|
|
|
|
| 589 |
std::unique_ptr<llama_mmap> mapping;
|
| 590 |
|
| 591 |
llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
|
| 592 |
+
auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
|
| 593 |
file_loaders.emplace_back(first_file);
|
| 594 |
uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
|
| 595 |
for (uint32_t i = 1; i < n_parts; i++) {
|
| 596 |
std::string fname = fname_base + "." + std::to_string(i);
|
| 597 |
+
auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
|
| 598 |
file_loaders.emplace_back(ith_file);
|
| 599 |
if (ith_file->hparams != first_file->hparams) {
|
| 600 |
throw format("llama.cpp: hparams inconsistent between files");
|
|
|
|
| 641 |
}
|
| 642 |
}
|
| 643 |
|
| 644 |
+
struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
|
| 645 |
auto it = tensors_map.name_to_idx.find(name);
|
| 646 |
if (it == tensors_map.name_to_idx.end()) {
|
| 647 |
throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
|
|
|
|
| 663 |
LLAMA_ASSERT(lt.ne.size() == 1);
|
| 664 |
tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
|
| 665 |
}
|
| 666 |
+
ggml_set_name(tensor, lt.name.c_str());
|
| 667 |
LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
|
| 668 |
lt.ggml_tensor = tensor;
|
| 669 |
num_ggml_tensors_created++;
|
| 670 |
return tensor;
|
| 671 |
}
|
| 672 |
|
| 673 |
+
void done_getting_tensors() const {
|
| 674 |
if (num_ggml_tensors_created != tensors_map.tensors.size()) {
|
| 675 |
throw std::string("llama.cpp: file contained more tensors than expected");
|
| 676 |
}
|
|
|
|
| 732 |
LLAMA_ASSERT(offset == lt.size);
|
| 733 |
} else if (lt.split_type == SPLIT_BY_COLUMNS) {
|
| 734 |
// Let's load the data into temporary buffers to ensure the OS performs large loads.
|
| 735 |
+
std::vector<llama_buffer> tmp_bufs(lt.shards.size());
|
|
|
|
| 736 |
for (size_t i = 0; i < lt.shards.size(); i++) {
|
| 737 |
llama_load_tensor_shard & shard = lt.shards.at(i);
|
| 738 |
llama_file & file = file_loaders.at(shard.file_idx)->file;
|
|
|
|
| 803 |
|
| 804 |
cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
| 805 |
cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
| 806 |
+
ggml_set_name(cache.k, "cache_k");
|
| 807 |
+
ggml_set_name(cache.v, "cache_v");
|
| 808 |
|
| 809 |
return true;
|
| 810 |
}
|
|
|
|
| 813 |
struct llama_context_params result = {
|
| 814 |
/*.n_ctx =*/ 512,
|
| 815 |
/*.n_parts =*/ -1,
|
| 816 |
+
/*.gpu_layers =*/ 0,
|
| 817 |
+
/*.seed =*/ -1,
|
| 818 |
/*.f16_kv =*/ false,
|
| 819 |
/*.logits_all =*/ false,
|
| 820 |
/*.vocab_only =*/ false,
|
|
|
|
| 844 |
switch (version) {
|
| 845 |
case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
|
| 846 |
case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
|
| 847 |
+
case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
|
| 848 |
+
case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
|
| 849 |
}
|
| 850 |
+
|
| 851 |
+
return "unknown";
|
| 852 |
}
|
| 853 |
|
| 854 |
static const char *llama_ftype_name(enum llama_ftype ftype) {
|
|
|
|
| 859 |
case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
|
| 860 |
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
| 861 |
return "mostly Q4_1, some F16";
|
|
|
|
| 862 |
case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
|
| 863 |
case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
|
| 864 |
case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
|
|
|
|
| 880 |
const std::string & fname,
|
| 881 |
llama_context & lctx,
|
| 882 |
int n_ctx,
|
| 883 |
+
int n_gpu_layers,
|
| 884 |
ggml_type memory_type,
|
| 885 |
bool use_mmap,
|
| 886 |
bool use_mlock,
|
|
|
|
| 925 |
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
| 926 |
}
|
| 927 |
|
| 928 |
+
if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
|
| 929 |
+
if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
|
| 930 |
+
hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
|
| 931 |
+
hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
|
| 932 |
+
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
|
| 933 |
+
}
|
| 934 |
+
}
|
| 935 |
+
|
| 936 |
if (vocab_only) {
|
| 937 |
return;
|
| 938 |
}
|
| 939 |
|
| 940 |
auto & ctx = model.ctx;
|
| 941 |
|
| 942 |
+
size_t ctx_size;
|
| 943 |
+
size_t mmapped_size;
|
| 944 |
ml->calc_sizes(&ctx_size, &mmapped_size);
|
| 945 |
fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
|
| 946 |
|
|
|
|
| 986 |
|
| 987 |
// prepare memory for the weights
|
| 988 |
{
|
|
|
|
|
|
|
| 989 |
const uint32_t n_embd = hparams.n_embd;
|
| 990 |
const uint32_t n_layer = hparams.n_layer;
|
| 991 |
const uint32_t n_vocab = hparams.n_vocab;
|
|
|
|
| 1027 |
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
|
| 1028 |
|
| 1029 |
model.mapping = std::move(ml->mapping);
|
| 1030 |
+
#ifdef GGML_USE_CUBLAS
|
| 1031 |
+
{
|
| 1032 |
+
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
| 1033 |
+
|
| 1034 |
+
fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
|
| 1035 |
+
|
| 1036 |
+
size_t vram_total = 0;
|
| 1037 |
+
|
| 1038 |
+
for (int i = 0; i < n_gpu; ++i) {
|
| 1039 |
+
const auto & layer = model.layers[i];
|
| 1040 |
+
|
| 1041 |
+
ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
|
| 1042 |
+
ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
|
| 1043 |
+
ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
|
| 1044 |
+
ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
|
| 1045 |
+
ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
|
| 1046 |
+
ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
|
| 1047 |
+
ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
|
| 1048 |
+
}
|
| 1049 |
+
if (n_gpu_layers > (int) hparams.n_layer) {
|
| 1050 |
+
fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
|
| 1051 |
+
ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
|
| 1052 |
+
}
|
| 1053 |
+
|
| 1054 |
+
fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
|
| 1055 |
+
}
|
| 1056 |
+
#else
|
| 1057 |
+
(void) n_gpu_layers;
|
| 1058 |
+
#endif
|
| 1059 |
|
| 1060 |
// loading time will be recalculate after the first eval, so
|
| 1061 |
// we take page faults deferred by mmap() into consideration
|
|
|
|
| 1066 |
const std::string & fname,
|
| 1067 |
llama_context & lctx,
|
| 1068 |
int n_ctx,
|
| 1069 |
+
int n_gpu_layers,
|
| 1070 |
ggml_type memory_type,
|
| 1071 |
bool use_mmap,
|
| 1072 |
bool use_mlock,
|
|
|
|
| 1074 |
llama_progress_callback progress_callback,
|
| 1075 |
void *progress_callback_user_data) {
|
| 1076 |
try {
|
| 1077 |
+
llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
|
| 1078 |
vocab_only, progress_callback, progress_callback_user_data);
|
| 1079 |
return true;
|
| 1080 |
} catch (const std::string & err) {
|
|
|
|
| 1096 |
const int n_tokens,
|
| 1097 |
const int n_past,
|
| 1098 |
const int n_threads) {
|
| 1099 |
+
|
| 1100 |
+
// enforce that the first token is BOS
|
| 1101 |
+
if (n_past == 0 && tokens[0] != llama_token_bos()) {
|
| 1102 |
+
fprintf(stderr, "%s: first token must be BOS\n", __func__);
|
| 1103 |
+
return false;
|
| 1104 |
+
}
|
| 1105 |
+
|
| 1106 |
const int64_t t_start_us = ggml_time_us();
|
| 1107 |
|
| 1108 |
const int N = n_tokens;
|
|
|
|
| 1110 |
const auto & model = lctx.model;
|
| 1111 |
const auto & hparams = model.hparams;
|
| 1112 |
|
| 1113 |
+
const auto & kv_self = model.kv_self;
|
| 1114 |
|
| 1115 |
LLAMA_ASSERT(!!kv_self.ctx);
|
| 1116 |
|
|
|
|
| 1138 |
gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
|
| 1139 |
|
| 1140 |
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
| 1141 |
+
ggml_set_name(embd, "embd");
|
| 1142 |
memcpy(embd->data, tokens, N*ggml_element_size(embd));
|
| 1143 |
|
| 1144 |
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
|
|
|
|
| 1163 |
// self-attention
|
| 1164 |
{
|
| 1165 |
// compute Q and K and RoPE them
|
| 1166 |
+
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
| 1167 |
+
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
| 1168 |
+
ggml_set_name(Qcur, "Qcur");
|
| 1169 |
+
ggml_set_name(Kcur, "Kcur");
|
| 1170 |
|
| 1171 |
// store key and value to memory
|
| 1172 |
{
|
|
|
|
| 1187 |
ggml_permute(ctx0,
|
| 1188 |
Qcur,
|
| 1189 |
0, 2, 1, 3);
|
| 1190 |
+
ggml_set_name(Q, "Q");
|
| 1191 |
|
| 1192 |
struct ggml_tensor * K =
|
| 1193 |
ggml_permute(ctx0,
|
|
|
|
| 1195 |
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
|
| 1196 |
n_embd/n_head, n_head, n_past + N),
|
| 1197 |
0, 2, 1, 3);
|
| 1198 |
+
ggml_set_name(K, "K");
|
| 1199 |
|
| 1200 |
// K * Q
|
| 1201 |
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
| 1202 |
+
ggml_set_name(KQ, "KQ");
|
| 1203 |
|
| 1204 |
// KQ_scaled = KQ / sqrt(n_embd/n_head)
|
| 1205 |
+
struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
|
| 1206 |
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
|
| 1207 |
+
|
| 1208 |
+
// KQ_scaled shape [n_past + N, N, n_head, 1]
|
| 1209 |
+
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
| 1210 |
+
ggml_set_name(KQ_scaled, "KQ_scaled");
|
| 1211 |
|
| 1212 |
// KQ_masked = mask_past(KQ_scaled)
|
| 1213 |
+
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
| 1214 |
+
ggml_set_name(KQ_masked, "KQ_masked");
|
| 1215 |
|
| 1216 |
// KQ = soft_max(KQ_masked)
|
| 1217 |
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
| 1218 |
+
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
| 1219 |
+
|
| 1220 |
|
| 1221 |
// split cached V into n_head heads
|
| 1222 |
struct ggml_tensor * V =
|
|
|
|
| 1225 |
n_ctx*ggml_element_size(kv_self.v),
|
| 1226 |
n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
|
| 1227 |
il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
|
| 1228 |
+
ggml_set_name(V, "V");
|
| 1229 |
|
| 1230 |
#if 1
|
| 1231 |
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
| 1232 |
+
ggml_set_name(KQV, "KQV");
|
| 1233 |
#else
|
| 1234 |
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
| 1235 |
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
|
|
|
|
| 1240 |
|
| 1241 |
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
| 1242 |
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
| 1243 |
+
ggml_set_name(KQV_merged, "KQV_merged");
|
| 1244 |
|
| 1245 |
// cur = KQV_merged.contiguous().view(n_embd, N)
|
| 1246 |
cur = ggml_cpy(ctx0,
|
| 1247 |
KQV_merged,
|
| 1248 |
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
| 1249 |
+
ggml_set_name(cur, "KQV_merged_contiguous");
|
| 1250 |
|
| 1251 |
// projection (no bias)
|
| 1252 |
cur = ggml_mul_mat(ctx0,
|
|
|
|
| 1318 |
lctx.use_buf(ctx0, -1);
|
| 1319 |
|
| 1320 |
// logits -> probs
|
| 1321 |
+
//inpL = ggml_soft_max_inplace(ctx0, inpL);
|
| 1322 |
|
| 1323 |
// run the computation
|
| 1324 |
ggml_build_forward_expand(&gf, inpL);
|
|
|
|
| 1356 |
}
|
| 1357 |
|
| 1358 |
// extract embeddings
|
| 1359 |
+
if (!lctx.embedding.empty()) {
|
| 1360 |
auto & embedding_out = lctx.embedding;
|
| 1361 |
|
| 1362 |
embedding_out.resize(n_embd);
|
|
|
|
| 1407 |
size_t n;
|
| 1408 |
};
|
| 1409 |
|
| 1410 |
+
static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
|
| 1411 |
+
|
| 1412 |
struct llama_sp_bigram {
|
| 1413 |
struct comparator {
|
| 1414 |
bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
|
|
|
|
| 1441 |
sym.prev = index - 1;
|
| 1442 |
sym.next = offs == text.size() ? -1 : index + 1;
|
| 1443 |
index++;
|
| 1444 |
+
symbols_.emplace_back(sym);
|
| 1445 |
}
|
| 1446 |
|
| 1447 |
// seed the work queue with all possible 2-character tokens.
|
|
|
|
| 1532 |
llama_tokenizer tokenizer(vocab);
|
| 1533 |
std::vector<llama_vocab::id> output;
|
| 1534 |
|
| 1535 |
+
if (text.empty()) {
|
| 1536 |
return output;
|
| 1537 |
}
|
| 1538 |
|
| 1539 |
if (bos) {
|
| 1540 |
+
output.push_back(llama_token_bos());
|
| 1541 |
}
|
| 1542 |
|
| 1543 |
tokenizer.tokenize(text, output);
|
|
|
|
| 1760 |
}
|
| 1761 |
}
|
| 1762 |
|
| 1763 |
+
void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
|
| 1764 |
if (last_tokens_size == 0 || penalty == 1.0f) {
|
| 1765 |
return;
|
| 1766 |
}
|
|
|
|
| 1768 |
const int64_t t_start_sample_us = ggml_time_us();
|
| 1769 |
|
| 1770 |
for (size_t i = 0; i < candidates->size; ++i) {
|
| 1771 |
+
const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
|
| 1772 |
if (token_iter == last_tokens + last_tokens_size) {
|
| 1773 |
continue;
|
| 1774 |
}
|
|
|
|
| 1789 |
}
|
| 1790 |
}
|
| 1791 |
|
| 1792 |
+
void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
|
| 1793 |
if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
|
| 1794 |
return;
|
| 1795 |
}
|
|
|
|
| 1846 |
float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
|
| 1847 |
|
| 1848 |
// Sample the next word X using top-k sampling
|
| 1849 |
+
llama_sample_top_k(nullptr, candidates, int(k), 1);
|
| 1850 |
if (ctx) {
|
| 1851 |
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
| 1852 |
}
|
|
|
|
| 1912 |
const int64_t t_start_sample_us = ggml_time_us();
|
| 1913 |
|
| 1914 |
// Find max element
|
| 1915 |
+
auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
|
| 1916 |
return a.logit < b.logit;
|
| 1917 |
});
|
| 1918 |
|
|
|
|
| 1955 |
switch (ftype) {
|
| 1956 |
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
|
| 1957 |
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
|
|
|
|
| 1958 |
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
|
| 1959 |
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
|
| 1960 |
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
|
|
|
| 1965 |
nthread = std::thread::hardware_concurrency();
|
| 1966 |
}
|
| 1967 |
|
| 1968 |
+
std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
|
| 1969 |
/*vocab_only*/ false));
|
| 1970 |
llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
|
| 1971 |
|
|
|
|
| 2019 |
} else if (tensor.type == GGML_TYPE_F16) {
|
| 2020 |
f32_conv_buf.resize(nelements * sizeof(float));
|
| 2021 |
f32_data = (float *) f32_conv_buf.addr;
|
| 2022 |
+
const auto * f16_data = (const ggml_fp16_t *) tensor.data;
|
| 2023 |
for (size_t i = 0; i < nelements; i++) {
|
| 2024 |
f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
|
| 2025 |
}
|
|
|
|
| 2050 |
size_t first = counter; counter += chunk_size;
|
| 2051 |
if (first >= nelements) {
|
| 2052 |
if (!local_hist.empty()) {
|
| 2053 |
+
for (int j=0; j<int(local_hist.size()); ++j) {
|
| 2054 |
+
hist_cur[j] += local_hist[j];
|
| 2055 |
+
}
|
| 2056 |
new_size += local_size;
|
| 2057 |
}
|
| 2058 |
break;
|
| 2059 |
}
|
| 2060 |
lock.unlock();
|
| 2061 |
size_t last = std::min(nelements, first + chunk_size);
|
| 2062 |
+
if (local_hist.empty()) {
|
| 2063 |
+
local_hist.resize(hist_cur.size(), 0);
|
| 2064 |
+
}
|
| 2065 |
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
|
| 2066 |
}
|
| 2067 |
};
|
| 2068 |
+
if ((int) workers.size() < nthread_use - 1) {
|
| 2069 |
+
workers.resize(nthread_use - 1);
|
| 2070 |
+
}
|
| 2071 |
+
for (int it = 0; it < nthread_use - 1; ++it) {
|
| 2072 |
+
workers[it] = std::thread(compute);
|
| 2073 |
+
}
|
| 2074 |
compute();
|
| 2075 |
+
for (int it = 0; it < nthread_use - 1; ++it) {
|
| 2076 |
+
workers[it].join();
|
| 2077 |
+
}
|
| 2078 |
}
|
| 2079 |
|
| 2080 |
printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
|
|
|
|
| 2120 |
|
| 2121 |
llama_context * ctx = new llama_context;
|
| 2122 |
|
| 2123 |
+
if (params.seed < 0) {
|
| 2124 |
params.seed = time(NULL);
|
| 2125 |
}
|
| 2126 |
|
|
|
|
| 2146 |
|
| 2147 |
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
| 2148 |
|
| 2149 |
+
if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
|
| 2150 |
params.use_mmap, params.use_mlock, params.vocab_only,
|
| 2151 |
params.progress_callback, params.progress_callback_user_data)) {
|
| 2152 |
fprintf(stderr, "%s: failed to load model\n", __func__);
|
|
|
|
| 2272 |
fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
|
| 2273 |
model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
|
| 2274 |
|
| 2275 |
+
size_t ctx_size;
|
| 2276 |
+
size_t mmapped_size;
|
| 2277 |
model_loader->calc_sizes(&ctx_size, &mmapped_size);
|
| 2278 |
base_buf.resize(ctx_size);
|
| 2279 |
|
|
|
|
| 2312 |
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
| 2313 |
}
|
| 2314 |
|
| 2315 |
+
std::string name;
|
| 2316 |
+
{
|
| 2317 |
+
char buf[1024];
|
| 2318 |
+
fin.read(buf, length);
|
| 2319 |
+
name = std::string(buf, length);
|
| 2320 |
+
}
|
| 2321 |
|
| 2322 |
// check for lora suffix and get the type of tensor
|
| 2323 |
const std::string lora_suffix = ".lora";
|
|
|
|
| 2332 |
base_name.erase(pos);
|
| 2333 |
// fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
|
| 2334 |
|
| 2335 |
+
if (model_tensors.find(base_name) == model_tensors.end()) {
|
| 2336 |
fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
|
| 2337 |
return 1;
|
| 2338 |
}
|
|
|
|
| 2412 |
|
| 2413 |
if (scaling != 1.0f) {
|
| 2414 |
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
|
| 2415 |
+
BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
|
| 2416 |
}
|
| 2417 |
|
| 2418 |
ggml_tensor * r;
|
|
|
|
| 2434 |
lora_tensors.clear();
|
| 2435 |
|
| 2436 |
n_tensors++;
|
| 2437 |
+
if (n_tensors % 4 == 0) {
|
| 2438 |
fprintf(stderr, ".");
|
| 2439 |
+
}
|
| 2440 |
}
|
| 2441 |
}
|
| 2442 |
|
|
|
        }
    }

+int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
    return ctx->model.kv_self.n;
}

+#define LLAMA_MAX_RNG_STATE (64*1024)

void llama_set_rng_seed(struct llama_context * ctx, int seed) {
+   if (seed < 0) {
        seed = time(NULL);
    }
    ctx->rng.seed(seed);
}

// Returns the *maximum* size of the state
+size_t llama_get_state_size(const struct llama_context * ctx) {
    // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
    // for reference, std::mt19937(1337) serializes to 6701 bytes.
    const size_t s_rng_size = sizeof(size_t);

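These small additions make the KV-cache token count and the maximum state size queryable on a const context, and let llama_set_rng_seed accept -1 for a time-based seed. A short usage sketch (report_state is an illustrative helper, not part of the API):

#include "llama.h"
#include <cstdio>

static void report_state(llama_context * lctx) {
    llama_set_rng_seed(lctx, -1);   // -1 -> reseed from the current time

    const int    n_kv_tokens = llama_get_kv_cache_token_count(lctx);
    const size_t n_state_max = llama_get_state_size(lctx);   // upper bound; a copy may use less
    fprintf(stderr, "kv tokens: %d, state size <= %zu bytes\n", n_kv_tokens, n_state_max);
}
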
}

// Copies the state to the specified destination address
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+   uint8_t * out = dst;

    // copy rng
    {

    if (kv_size) {
        const size_t elt_size = ggml_element_size(kv_self.k);
+
        char buffer[4096];
+
        ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
        ggml_cgraph gf{};
        gf.n_threads = 1;

        ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
        ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
        ggml_graph_compute(cpy_ctx, &gf);
+
+       ggml_free(cpy_ctx);
        }
    }

+   const size_t written = out - dst;
    const size_t max_size = llama_get_state_size(ctx);

    LLAMA_ASSERT(written <= max_size);

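llama_copy_state_data serializes the RNG, the logits, the embeddings and the KV cache into a caller-provided buffer and returns the number of bytes actually written, which may be smaller than llama_get_state_size. A sketch of taking an in-memory snapshot (snapshot_state is an illustrative helper):

#include "llama.h"
#include <cstdint>
#include <vector>

// take an in-memory snapshot of the full context state
static std::vector<uint8_t> snapshot_state(llama_context * lctx) {
    std::vector<uint8_t> buf(llama_get_state_size(lctx));
    const size_t n_written = llama_copy_state_data(lctx, buf.data());
    buf.resize(n_written);   // the actual size may be smaller than the maximum
    return buf;
}
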

// Sets the state reading from the specified source address
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+   const uint8_t * inp = src;

    // set rng
    {
        size_t rng_size;
        char rng_buf[LLAMA_MAX_RNG_STATE];

+       memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
+       memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;

        std::stringstream rng_ss;
        rng_ss.str(std::string(&rng_buf[0], rng_size));

        size_t logits_cap;
        size_t logits_size;

+       memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
+       memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);

        LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);

        if (logits_size) {
            ctx->logits.resize(logits_size);
+           memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
        }

+       inp += logits_cap * sizeof(float);
    }

    // set embeddings
    {
        size_t embedding_size;

+       memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);

        LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);

        if (embedding_size) {
+           memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
+           inp += embedding_size * sizeof(float);
        }
    }

        size_t kv_size;
        int kv_ntok;

+       memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+       memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);

        if (kv_size) {
            LLAMA_ASSERT(kv_self.buf.size == kv_size);

            const size_t elt_size = ggml_element_size(kv_self.k);
+
            char buffer[4096];
+
            ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
            ggml_cgraph gf{};
            gf.n_threads = 1;

            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+           kin3d->data = (void *) inp;
+           inp += ggml_nbytes(kin3d);

            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+           vin3d->data = (void *) inp;
+           inp += ggml_nbytes(vin3d);

            ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
                n_embd, kv_ntok, n_layer,

            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
            ggml_graph_compute(cpy_ctx, &gf);

+           ggml_free(cpy_ctx);
        }

        ctx->model.kv_self.n = kv_ntok;
    }

+   const size_t nread = inp - src;
    const size_t max_size = llama_get_state_size(ctx);

    LLAMA_ASSERT(nread <= max_size);

    return nread;
}

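llama_set_state_data is the mirror operation: it reads the same layout back and restores the RNG, logits, embeddings and KV cache, so a snapshot taken with llama_copy_state_data can be used to rewind the context. Continuing the sketch above (restore_state is an illustrative helper):

// restore a snapshot produced by snapshot_state() above
static void restore_state(llama_context * lctx, const std::vector<uint8_t> & buf) {
    llama_set_state_data(lctx, buf.data());
}
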
+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+   llama_file file(path_session, "rb");
+
+   // sanity checks
+   {
+       const uint32_t magic = file.read_u32();
+       const uint32_t version = file.read_u32();
+
+       if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
+           fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+           return false;
+       }
+
+       llama_hparams session_hparams;
+       file.read_raw(&session_hparams, sizeof(llama_hparams));
+
+       if (session_hparams != ctx->model.hparams) {
+           fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+           return false;
+       }
+   }
+
+   // load the prompt
+   {
+       const uint32_t n_token_count = file.read_u32();
+
+       if (n_token_count > n_token_capacity) {
+           fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+           return false;
+       }
+
+       file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+       *n_token_count_out = n_token_count;
+   }
+
+   // restore the context state
+   {
+       const size_t n_state_size_cur = file.size - file.tell();
+       const size_t n_state_size_max = llama_get_state_size(ctx);
+
+       if (n_state_size_cur > n_state_size_max) {
+           fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
+           return false;
+       }
+
+       std::vector<uint8_t> state_data(n_state_size_max);
+       file.read_raw(state_data.data(), n_state_size_cur);
+
+       llama_set_state_data(ctx, state_data.data());
+   }
+
+   return true;
+}
+
+bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+   llama_file file(path_session, "wb");
+
+   file.write_u32(LLAMA_SESSION_MAGIC);
+   file.write_u32(LLAMA_SESSION_VERSION);
+
+   file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
+
+   // save the prompt
+   file.write_u32((uint32_t) n_token_count);
+   file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+   // save the context state
+   {
+       const size_t n_state_size_max = llama_get_state_size(ctx);
+
+       std::vector<uint8_t> state_data(n_state_size_max);
+       const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
+
+       file.write_raw(state_data.data(), n_state_size_cur);
+   }
+
+   return true;
+}
+
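The two new session functions persist the evaluated prompt tokens plus the full context state behind a magic/version header, so a long prompt only needs to be evaluated once across runs. A usage sketch with a placeholder file name (try_resume is an illustrative helper):

#include "llama.h"
#include <vector>

// try to resume from a saved session; on failure the caller evaluates the prompt
// normally and saves the session afterwards
static bool try_resume(llama_context * lctx, const char * path, std::vector<llama_token> & tokens) {
    tokens.resize(llama_n_ctx(lctx));
    size_t n_loaded = 0;
    if (!llama_load_session_file(lctx, path, tokens.data(), tokens.size(), &n_loaded)) {
        return false;   // missing file, magic/version mismatch, or different hparams
    }
    tokens.resize(n_loaded);
    return true;
}

// ... after evaluating a fresh prompt:
// llama_save_session_file(lctx, path, tokens.data(), tokens.size());
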
int llama_eval(
        struct llama_context * ctx,
        const llama_token * tokens,

        fprintf(stderr, "%s: failed to eval\n", __func__);
        return 1;
    }
+
    // get a more accurate load time, upon first eval
+   // TODO: fix this
    if (!ctx->has_evaluated_once) {
        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
        ctx->has_evaluated_once = true;
    }
+
    return 0;
}

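llama_eval plus the const-correct accessors that follow (llama_n_vocab, llama_get_logits, llama_token_to_str) are enough for a basic greedy decoding step. A sketch with error handling trimmed (greedy_step is an illustrative helper):

#include "llama.h"
#include <algorithm>
#include <cstdio>
#include <vector>

static llama_token greedy_step(llama_context * lctx, const std::vector<llama_token> & embd, int n_past, int n_threads) {
    if (llama_eval(lctx, embd.data(), (int) embd.size(), n_past, n_threads) != 0) {
        fprintf(stderr, "llama_eval failed\n");
        return llama_token_eos();   // use EOS as an error sentinel in this sketch
    }
    const int     n_vocab = llama_n_vocab(lctx);
    const float * logits  = llama_get_logits(lctx);   // logits of the last evaluated token
    const llama_token id  = (llama_token) (std::max_element(logits, logits + n_vocab) - logits);
    fprintf(stderr, "%s", llama_token_to_str(lctx, id));
    return id;
}
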
    return res.size();
}

+int llama_n_vocab(const struct llama_context * ctx) {
    return ctx->vocab.id_to_token.size();
}

+int llama_n_ctx(const struct llama_context * ctx) {
    return ctx->model.hparams.n_ctx;
}

+int llama_n_embd(const struct llama_context * ctx) {
    return ctx->model.hparams.n_embd;
}

    return ctx->embedding.data();
}

+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
    if (token >= llama_n_vocab(ctx)) {
        return nullptr;
    }

    fprintf(stderr, "\n");
    fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
+   fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
    fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
+   fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
    fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
}

std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
    return ctx->model.tensors_by_name;
}
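The timing report a few lines up gains separate per-run lines for sampling and eval; an application obtains and resets it through the existing helpers, for example (lctx is an initialized context):

llama_print_timings(lctx);   // prints the block shown above to stderr
llama_reset_timings(lctx);   // start a fresh measurement window
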
examples/talk-llama/llama.h
CHANGED
@@ -19,7 +19,7 @@
 # define LLAMA_API
 #endif

-#define LLAMA_FILE_VERSION
 #define LLAMA_FILE_MAGIC 'ggjt'
 #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
 #define LLAMA_SESSION_MAGIC 'ggsn'

@@ -54,9 +54,10 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);

     struct llama_context_params {
-        int n_ctx;
-        int n_parts;
-        int

         bool f16_kv; // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one

@@ -78,7 +79,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        LLAMA_FTYPE_MOSTLY_Q4_2 = 5, //
         // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors

@@ -122,19 +123,19 @@ extern "C" {
         int n_threads);

     // Returns the number of tokens in the KV cache
-    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);

     // Sets the current rng seed.
     LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);

     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
-    LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);

     // Copies the state to the specified destination address.
     // Destination needs to have allocated enough memory.
     // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t *

     // Set the state reading from the specified address
     // Returns the number of bytes read

@@ -143,6 +144,7 @@ extern "C" {
     // Save/load session file
     LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
     LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
     // n_past is the number of tokens to use from previous eval calls

@@ -166,9 +168,9 @@ extern "C" {
         int n_max_tokens,
         bool add_bos);

-    LLAMA_API int llama_n_vocab(struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx (struct llama_context * ctx);
-    LLAMA_API int llama_n_embd (struct llama_context * ctx);

     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row

@@ -182,7 +184,7 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);

     // Special tokens
     LLAMA_API llama_token llama_token_bos();

@@ -192,25 +194,25 @@ extern "C" {
     // Sampling functions

     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty);

     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);

     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);

     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep

     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep

     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep

     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep
     LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);

     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.

 # define LLAMA_API
 #endif

+#define LLAMA_FILE_VERSION 2
 #define LLAMA_FILE_MAGIC 'ggjt'
 #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
 #define LLAMA_SESSION_MAGIC 'ggsn'

     typedef void (*llama_progress_callback)(float progress, void *ctx);

     struct llama_context_params {
+        int n_ctx; // text context
+        int n_parts; // -1 for default
+        int n_gpu_layers; // number of layers to store in VRAM
+        int seed; // RNG seed, -1 for random

         bool f16_kv; // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one

         LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
         // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors

         int n_threads);

     // Returns the number of tokens in the KV cache
+    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);

     // Sets the current rng seed.
     LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);

     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
+    LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);

     // Copies the state to the specified destination address.
     // Destination needs to have allocated enough memory.
     // Returns the number of bytes copied
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);

     // Set the state reading from the specified address
     // Returns the number of bytes read

     // Save/load session file
     LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
     LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
     // n_past is the number of tokens to use from previous eval calls

         int n_max_tokens,
         bool add_bos);

+    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
+    LLAMA_API int llama_n_embd (const struct llama_context * ctx);

     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row

     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

     // Token Id -> String. Uses the vocabulary in the provided context
+    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);

     // Special tokens
     LLAMA_API llama_token llama_token_bos();

     // Sampling functions

     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
+    LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);

     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);

     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);

     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);

     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);

     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);

     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
     LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);

     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.

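The sampling entry points now take const last-token buffers and an explicit min_keep argument (the old declarations appear to have carried a default), which is why the talk-llama call sites below pass a trailing 1. A sketch of the usual top-k/top-p/temperature chain under the updated signatures (sample_next is an illustrative helper):

#include "llama.h"
#include <vector>

static llama_token sample_next(llama_context * lctx, int top_k, float top_p, float temp) {
    const int     n_vocab = llama_n_vocab(lctx);
    const float * logits  = llama_get_logits(lctx);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        candidates.push_back({ id, logits[id], 0.0f });
    }
    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

    llama_sample_top_k(lctx, &candidates_p, top_k, 1);       // keep at least 1 candidate
    llama_sample_top_p(lctx, &candidates_p, top_p, 1);
    llama_sample_temperature(lctx, &candidates_p, temp);
    return llama_sample_token(lctx, &candidates_p);
}
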
examples/talk-llama/talk-llama.cpp
CHANGED
@@ -560,7 +560,7 @@ int main(int argc, char ** argv) {

            embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
            n_past += embd.size();
-
            embd.clear();

            if (done) break;

@@ -577,7 +577,7 @@ int main(int argc, char ** argv) {
        if (!path_session.empty() && need_to_save_session) {
            need_to_save_session = false;
            llama_save_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
-       }

        llama_token id = 0;

@@ -609,8 +609,8 @@ int main(int argc, char ** argv) {
            id = llama_sample_token_greedy(ctx_llama, &candidates_p);
        } else {
            // Temperature sampling
-           llama_sample_top_k(ctx_llama, &candidates_p, top_k);
-           llama_sample_top_p(ctx_llama, &candidates_p, top_p);
            llama_sample_temperature(ctx_llama, &candidates_p, temp);
            id = llama_sample_token(ctx_llama, &candidates_p);
        }


            embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
            n_past += embd.size();
+
            embd.clear();

            if (done) break;

        if (!path_session.empty() && need_to_save_session) {
            need_to_save_session = false;
            llama_save_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
+       }

        llama_token id = 0;

            id = llama_sample_token_greedy(ctx_llama, &candidates_p);
        } else {
            // Temperature sampling
+           llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
+           llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
            llama_sample_temperature(ctx_llama, &candidates_p, temp);
            id = llama_sample_token(ctx_llama, &candidates_p);
        }