ggerganov committed · Commit ef85c02 (unverified) · 1 parent: dd59682

talk-llama : fix build + sync latest llama.cpp

examples/talk-llama/llama.cpp CHANGED
@@ -9,6 +9,9 @@
9
  #include "llama.h"
10
 
11
  #include "ggml.h"
 
 
 
12
 
13
  #include <array>
14
  #include <ctime>
@@ -50,49 +53,49 @@ static const size_t MB = 1024*1024;
50
 
51
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
52
  {
53
- static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
54
  { MODEL_7B, 512ull * MB },
55
  { MODEL_13B, 512ull * MB },
56
  { MODEL_30B, 512ull * MB },
57
  { MODEL_65B, 1024ull * MB },
58
  };
59
- return _MEM_REQ_SCRATCH0;
60
  }
61
 
62
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
63
  {
64
- static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
65
  { MODEL_7B, 512ull * MB },
66
  { MODEL_13B, 512ull * MB },
67
  { MODEL_30B, 512ull * MB },
68
  { MODEL_65B, 1024ull * MB },
69
  };
70
- return _MEM_REQ_SCRATCH1;
71
  }
72
 
73
  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
74
  static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
75
  {
76
- static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
77
  { MODEL_7B, 1026ull * MB },
78
  { MODEL_13B, 1608ull * MB },
79
  { MODEL_30B, 3124ull * MB },
80
  { MODEL_65B, 5120ull * MB },
81
  };
82
- return _MEM_REQ_KV_SELF;
83
  }
84
 
85
  // this is mostly needed for temporary mul_mat buffers to dequantize the data
86
  // not actually needed if BLAS is disabled
87
  static const std::map<e_model, size_t> & MEM_REQ_EVAL()
88
  {
89
- static std::map<e_model, size_t> _MEM_REQ_EVAL = {
90
  { MODEL_7B, 768ull * MB },
91
  { MODEL_13B, 1024ull * MB },
92
  { MODEL_30B, 1280ull * MB },
93
  { MODEL_65B, 1536ull * MB },
94
  };
95
- return _MEM_REQ_EVAL;
96
  }
97
 
98
  // default hparams (LLaMA 7B)
@@ -402,6 +405,7 @@ enum llama_file_version {
402
  LLAMA_FILE_VERSION_GGML,
403
  LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
404
  LLAMA_FILE_VERSION_GGJT_V1, // added padding
 
405
  };
406
 
407
  struct llama_file_loader {
@@ -432,6 +436,8 @@ struct llama_file_loader {
432
  file_version = LLAMA_FILE_VERSION_GGMF_V1;
433
  } else if (magic == 'ggjt' && version == 1) {
434
  file_version = LLAMA_FILE_VERSION_GGJT_V1;
 
 
435
  } else {
436
  throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
437
  magic, version);
@@ -482,7 +488,6 @@ struct llama_file_loader {
482
  case GGML_TYPE_F16:
483
  case GGML_TYPE_Q4_0:
484
  case GGML_TYPE_Q4_1:
485
- case GGML_TYPE_Q4_2:
486
  case GGML_TYPE_Q5_0:
487
  case GGML_TYPE_Q5_1:
488
  case GGML_TYPE_Q8_0:
@@ -527,8 +532,8 @@ struct llama_file_saver {
527
  write_vocab();
528
  }
529
  void write_magic() {
530
- file.write_u32('ggjt'); // magic
531
- file.write_u32(1); // version
532
  }
533
  void write_hparams(enum llama_ftype new_ftype) {
534
  const llama_hparams & hparams = any_file_loader->hparams;
@@ -558,7 +563,6 @@ struct llama_file_saver {
558
  case GGML_TYPE_F16:
559
  case GGML_TYPE_Q4_0:
560
  case GGML_TYPE_Q4_1:
561
- case GGML_TYPE_Q4_2:
562
  case GGML_TYPE_Q5_0:
563
  case GGML_TYPE_Q5_1:
564
  case GGML_TYPE_Q8_0:
@@ -585,12 +589,12 @@ struct llama_model_loader {
585
  std::unique_ptr<llama_mmap> mapping;
586
 
587
  llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
588
- auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
589
  file_loaders.emplace_back(first_file);
590
  uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
591
  for (uint32_t i = 1; i < n_parts; i++) {
592
  std::string fname = fname_base + "." + std::to_string(i);
593
- auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
594
  file_loaders.emplace_back(ith_file);
595
  if (ith_file->hparams != first_file->hparams) {
596
  throw format("llama.cpp: hparams inconsistent between files");
@@ -637,7 +641,7 @@ struct llama_model_loader {
637
  }
638
  }
639
 
640
- struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
641
  auto it = tensors_map.name_to_idx.find(name);
642
  if (it == tensors_map.name_to_idx.end()) {
643
  throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -659,13 +663,14 @@ struct llama_model_loader {
659
  LLAMA_ASSERT(lt.ne.size() == 1);
660
  tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
661
  }
 
662
  LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
663
  lt.ggml_tensor = tensor;
664
  num_ggml_tensors_created++;
665
  return tensor;
666
  }
667
 
668
- void done_getting_tensors() {
669
  if (num_ggml_tensors_created != tensors_map.tensors.size()) {
670
  throw std::string("llama.cpp: file contained more tensors than expected");
671
  }
@@ -727,8 +732,7 @@ struct llama_model_loader {
727
  LLAMA_ASSERT(offset == lt.size);
728
  } else if (lt.split_type == SPLIT_BY_COLUMNS) {
729
  // Let's load the data into temporary buffers to ensure the OS performs large loads.
730
- std::vector<llama_buffer> tmp_bufs;
731
- tmp_bufs.resize(lt.shards.size());
732
  for (size_t i = 0; i < lt.shards.size(); i++) {
733
  llama_load_tensor_shard & shard = lt.shards.at(i);
734
  llama_file & file = file_loaders.at(shard.file_idx)->file;
@@ -799,6 +803,8 @@ static bool kv_cache_init(
799
 
800
  cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
801
  cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
 
 
802
 
803
  return true;
804
  }
@@ -807,7 +813,8 @@ struct llama_context_params llama_context_default_params() {
807
  struct llama_context_params result = {
808
  /*.n_ctx =*/ 512,
809
  /*.n_parts =*/ -1,
810
- /*.seed =*/ 0,
 
811
  /*.f16_kv =*/ false,
812
  /*.logits_all =*/ false,
813
  /*.vocab_only =*/ false,
@@ -837,9 +844,11 @@ static const char *llama_file_version_name(llama_file_version version) {
837
  switch (version) {
838
  case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
839
  case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
840
- case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
841
- default: LLAMA_ASSERT(false);
842
  }
 
 
843
  }
844
 
845
  static const char *llama_ftype_name(enum llama_ftype ftype) {
@@ -850,7 +859,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
850
  case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
851
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
852
  return "mostly Q4_1, some F16";
853
- case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
854
  case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
855
  case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
856
  case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -872,6 +880,7 @@ static void llama_model_load_internal(
872
  const std::string & fname,
873
  llama_context & lctx,
874
  int n_ctx,
 
875
  ggml_type memory_type,
876
  bool use_mmap,
877
  bool use_mlock,
@@ -916,13 +925,22 @@ static void llama_model_load_internal(
916
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
917
  }
918
 
919
  if (vocab_only) {
920
  return;
921
  }
922
 
923
  auto & ctx = model.ctx;
924
 
925
- size_t ctx_size, mmapped_size;
 
926
  ml->calc_sizes(&ctx_size, &mmapped_size);
927
  fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
928
 
@@ -968,8 +986,6 @@ static void llama_model_load_internal(
968
 
969
  // prepare memory for the weights
970
  {
971
- const auto & hparams = model.hparams;
972
-
973
  const uint32_t n_embd = hparams.n_embd;
974
  const uint32_t n_layer = hparams.n_layer;
975
  const uint32_t n_vocab = hparams.n_vocab;
@@ -1011,6 +1027,35 @@ static void llama_model_load_internal(
1011
  ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
1012
 
1013
  model.mapping = std::move(ml->mapping);
1014
 
1015
  // loading time will be recalculate after the first eval, so
1016
  // we take page faults deferred by mmap() into consideration
@@ -1021,6 +1066,7 @@ static bool llama_model_load(
1021
  const std::string & fname,
1022
  llama_context & lctx,
1023
  int n_ctx,
 
1024
  ggml_type memory_type,
1025
  bool use_mmap,
1026
  bool use_mlock,
@@ -1028,7 +1074,7 @@ static bool llama_model_load(
1028
  llama_progress_callback progress_callback,
1029
  void *progress_callback_user_data) {
1030
  try {
1031
- llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
1032
  vocab_only, progress_callback, progress_callback_user_data);
1033
  return true;
1034
  } catch (const std::string & err) {
@@ -1050,6 +1096,13 @@ static bool llama_eval_internal(
1050
  const int n_tokens,
1051
  const int n_past,
1052
  const int n_threads) {
1053
  const int64_t t_start_us = ggml_time_us();
1054
 
1055
  const int N = n_tokens;
@@ -1057,7 +1110,7 @@ static bool llama_eval_internal(
1057
  const auto & model = lctx.model;
1058
  const auto & hparams = model.hparams;
1059
 
1060
- auto & kv_self = model.kv_self;
1061
 
1062
  LLAMA_ASSERT(!!kv_self.ctx);
1063
 
@@ -1085,6 +1138,7 @@ static bool llama_eval_internal(
1085
  gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
1086
 
1087
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
 
1088
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
1089
 
1090
  struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
@@ -1109,8 +1163,10 @@ static bool llama_eval_internal(
1109
  // self-attention
1110
  {
1111
  // compute Q and K and RoPE them
1112
- struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
1113
- struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
 
 
1114
 
1115
  // store key and value to memory
1116
  {
@@ -1131,6 +1187,7 @@ static bool llama_eval_internal(
1131
  ggml_permute(ctx0,
1132
  Qcur,
1133
  0, 2, 1, 3);
 
1134
 
1135
  struct ggml_tensor * K =
1136
  ggml_permute(ctx0,
@@ -1138,21 +1195,28 @@ static bool llama_eval_internal(
1138
  ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
1139
  n_embd/n_head, n_head, n_past + N),
1140
  0, 2, 1, 3);
 
1141
 
1142
  // K * Q
1143
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
1144
 
1145
  // KQ_scaled = KQ / sqrt(n_embd/n_head)
1146
- struct ggml_tensor * KQ_scaled =
1147
- ggml_scale(ctx0,
1148
- KQ,
1149
- ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
 
 
1150
 
1151
  // KQ_masked = mask_past(KQ_scaled)
1152
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
 
1153
 
1154
  // KQ = soft_max(KQ_masked)
1155
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
 
 
1156
 
1157
  // split cached V into n_head heads
1158
  struct ggml_tensor * V =
@@ -1161,9 +1225,11 @@ static bool llama_eval_internal(
1161
  n_ctx*ggml_element_size(kv_self.v),
1162
  n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
1163
  il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
 
1164
 
1165
  #if 1
1166
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
 
1167
  #else
1168
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
1169
  // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
@@ -1174,11 +1240,13 @@ static bool llama_eval_internal(
1174
 
1175
  // KQV_merged = KQV.permute(0, 2, 1, 3)
1176
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
1177
 
1178
  // cur = KQV_merged.contiguous().view(n_embd, N)
1179
  cur = ggml_cpy(ctx0,
1180
  KQV_merged,
1181
  ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
 
1182
 
1183
  // projection (no bias)
1184
  cur = ggml_mul_mat(ctx0,
@@ -1250,7 +1318,7 @@ static bool llama_eval_internal(
1250
  lctx.use_buf(ctx0, -1);
1251
 
1252
  // logits -> probs
1253
- //inpL = ggml_soft_max(ctx0, inpL);
1254
 
1255
  // run the computation
1256
  ggml_build_forward_expand(&gf, inpL);
@@ -1288,7 +1356,7 @@ static bool llama_eval_internal(
1288
  }
1289
 
1290
  // extract embeddings
1291
- if (lctx.embedding.size()) {
1292
  auto & embedding_out = lctx.embedding;
1293
 
1294
  embedding_out.resize(n_embd);
@@ -1339,6 +1407,8 @@ struct llama_sp_symbol {
1339
  size_t n;
1340
  };
1341
 
 
 
1342
  struct llama_sp_bigram {
1343
  struct comparator {
1344
  bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
@@ -1371,7 +1441,7 @@ struct llama_tokenizer {
1371
  sym.prev = index - 1;
1372
  sym.next = offs == text.size() ? -1 : index + 1;
1373
  index++;
1374
- symbols_.emplace_back(std::move(sym));
1375
  }
1376
 
1377
  // seed the work queue with all possible 2-character tokens.
@@ -1462,12 +1532,12 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
1462
  llama_tokenizer tokenizer(vocab);
1463
  std::vector<llama_vocab::id> output;
1464
 
1465
- if (text.size() == 0) {
1466
  return output;
1467
  }
1468
 
1469
  if (bos) {
1470
- output.push_back(1);
1471
  }
1472
 
1473
  tokenizer.tokenize(text, output);
@@ -1690,7 +1760,7 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
1690
  }
1691
  }
1692
 
1693
- void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty) {
1694
  if (last_tokens_size == 0 || penalty == 1.0f) {
1695
  return;
1696
  }
@@ -1698,7 +1768,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
1698
  const int64_t t_start_sample_us = ggml_time_us();
1699
 
1700
  for (size_t i = 0; i < candidates->size; ++i) {
1701
- auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
1702
  if (token_iter == last_tokens + last_tokens_size) {
1703
  continue;
1704
  }
@@ -1719,7 +1789,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
1719
  }
1720
  }
1721
 
1722
- void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
1723
  if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
1724
  return;
1725
  }
@@ -1776,7 +1846,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
1776
  float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
1777
 
1778
  // Sample the next word X using top-k sampling
1779
- llama_sample_top_k(nullptr, candidates, int(k));
1780
  if (ctx) {
1781
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1782
  }
@@ -1842,7 +1912,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
1842
  const int64_t t_start_sample_us = ggml_time_us();
1843
 
1844
  // Find max element
1845
- auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
1846
  return a.logit < b.logit;
1847
  });
1848
 
@@ -1885,7 +1955,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1885
  switch (ftype) {
1886
  case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
1887
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
1888
- case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
1889
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
1890
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
1891
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
@@ -1896,7 +1965,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1896
  nthread = std::thread::hardware_concurrency();
1897
  }
1898
 
1899
- std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
1900
  /*vocab_only*/ false));
1901
  llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
1902
 
@@ -1950,7 +2019,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1950
  } else if (tensor.type == GGML_TYPE_F16) {
1951
  f32_conv_buf.resize(nelements * sizeof(float));
1952
  f32_data = (float *) f32_conv_buf.addr;
1953
- auto f16_data = (const ggml_fp16_t *) tensor.data;
1954
  for (size_t i = 0; i < nelements; i++) {
1955
  f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
1956
  }
@@ -1981,21 +2050,31 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1981
  size_t first = counter; counter += chunk_size;
1982
  if (first >= nelements) {
1983
  if (!local_hist.empty()) {
1984
- for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
 
 
1985
  new_size += local_size;
1986
  }
1987
  break;
1988
  }
1989
  lock.unlock();
1990
  size_t last = std::min(nelements, first + chunk_size);
1991
- if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
 
 
1992
  local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
1993
  }
1994
  };
1995
- if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
1996
- for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
 
 
 
 
1997
  compute();
1998
- for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
 
 
1999
  }
2000
 
2001
  printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -2041,7 +2120,7 @@ struct llama_context * llama_init_from_file(
2041
 
2042
  llama_context * ctx = new llama_context;
2043
 
2044
- if (params.seed <= 0) {
2045
  params.seed = time(NULL);
2046
  }
2047
 
@@ -2067,7 +2146,7 @@ struct llama_context * llama_init_from_file(
2067
 
2068
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
2069
 
2070
- if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
2071
  params.use_mmap, params.use_mlock, params.vocab_only,
2072
  params.progress_callback, params.progress_callback_user_data)) {
2073
  fprintf(stderr, "%s: failed to load model\n", __func__);
@@ -2193,7 +2272,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
2193
  fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
2194
  model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
2195
 
2196
- size_t ctx_size, mmapped_size;
 
2197
  model_loader->calc_sizes(&ctx_size, &mmapped_size);
2198
  base_buf.resize(ctx_size);
2199
 
@@ -2232,8 +2312,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
2232
  fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
2233
  }
2234
 
2235
- std::string name(length, 0);
2236
- fin.read(&name[0], length);
 
 
 
 
2237
 
2238
  // check for lora suffix and get the type of tensor
2239
  const std::string lora_suffix = ".lora";
@@ -2248,7 +2332,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
2248
  base_name.erase(pos);
2249
  // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
2250
 
2251
- if (model_tensors.find(base_name.data()) == model_tensors.end()) {
2252
  fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
2253
  return 1;
2254
  }
@@ -2328,7 +2412,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
2328
 
2329
  if (scaling != 1.0f) {
2330
  ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
2331
- BA = ggml_scale(lora_ctx, BA, scale_tensor);
2332
  }
2333
 
2334
  ggml_tensor * r;
@@ -2350,8 +2434,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
2350
  lora_tensors.clear();
2351
 
2352
  n_tensors++;
2353
- if (n_tensors % 4 == 0)
2354
  fprintf(stderr, ".");
 
2355
  }
2356
  }
2357
 
@@ -2376,21 +2461,21 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
2376
  }
2377
  }
2378
 
2379
- int llama_get_kv_cache_token_count(struct llama_context * ctx) {
2380
  return ctx->model.kv_self.n;
2381
  }
2382
 
2383
- #define LLAMA_MAX_RNG_STATE 64*1024
2384
 
2385
  void llama_set_rng_seed(struct llama_context * ctx, int seed) {
2386
- if (seed <= 0) {
2387
  seed = time(NULL);
2388
  }
2389
  ctx->rng.seed(seed);
2390
  }
2391
 
2392
  // Returns the *maximum* size of the state
2393
- size_t llama_get_state_size(struct llama_context * ctx) {
2394
  // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
2395
  // for reference, std::mt19937(1337) serializes to 6701 bytes.
2396
  const size_t s_rng_size = sizeof(size_t);
@@ -2421,8 +2506,8 @@ size_t llama_get_state_size(struct llama_context * ctx) {
2421
  }
2422
 
2423
  // Copies the state to the specified destination address
2424
- size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
2425
- uint8_t * out = dest;
2426
 
2427
  // copy rng
2428
  {
@@ -2482,7 +2567,9 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
2482
 
2483
  if (kv_size) {
2484
  const size_t elt_size = ggml_element_size(kv_self.k);
 
2485
  char buffer[4096];
 
2486
  ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
2487
  ggml_cgraph gf{};
2488
  gf.n_threads = 1;
@@ -2506,10 +2593,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
2506
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
2507
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
2508
  ggml_graph_compute(cpy_ctx, &gf);
 
 
2509
  }
2510
  }
2511
 
2512
- const size_t written = out - dest;
2513
  const size_t max_size = llama_get_state_size(ctx);
2514
 
2515
  LLAMA_ASSERT(written <= max_size);
@@ -2519,15 +2608,15 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
2519
 
2520
  // Sets the state reading from the specified source address
2521
  size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
2522
- const uint8_t * in = src;
2523
 
2524
  // set rng
2525
  {
2526
  size_t rng_size;
2527
  char rng_buf[LLAMA_MAX_RNG_STATE];
2528
 
2529
- memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
2530
- memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
2531
 
2532
  std::stringstream rng_ss;
2533
  rng_ss.str(std::string(&rng_buf[0], rng_size));
@@ -2541,30 +2630,30 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
2541
  size_t logits_cap;
2542
  size_t logits_size;
2543
 
2544
- memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
2545
- memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
2546
 
2547
  LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
2548
 
2549
  if (logits_size) {
2550
  ctx->logits.resize(logits_size);
2551
- memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
2552
  }
2553
 
2554
- in += logits_cap * sizeof(float);
2555
  }
2556
 
2557
  // set embeddings
2558
  {
2559
  size_t embedding_size;
2560
 
2561
- memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
2562
 
2563
  LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
2564
 
2565
  if (embedding_size) {
2566
- memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
2567
- in += embedding_size * sizeof(float);
2568
  }
2569
  }
2570
 
@@ -2579,25 +2668,27 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
2579
  size_t kv_size;
2580
  int kv_ntok;
2581
 
2582
- memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
2583
- memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
2584
 
2585
  if (kv_size) {
2586
  LLAMA_ASSERT(kv_self.buf.size == kv_size);
2587
 
2588
  const size_t elt_size = ggml_element_size(kv_self.k);
 
2589
  char buffer[4096];
 
2590
  ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
2591
  ggml_cgraph gf{};
2592
  gf.n_threads = 1;
2593
 
2594
  ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
2595
- kin3d->data = (void *) in;
2596
- in += ggml_nbytes(kin3d);
2597
 
2598
  ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
2599
- vin3d->data = (void *) in;
2600
- in += ggml_nbytes(vin3d);
2601
 
2602
  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
2603
  n_embd, kv_ntok, n_layer,
@@ -2611,12 +2702,13 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
2611
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
2612
  ggml_graph_compute(cpy_ctx, &gf);
2613
 
 
2614
  }
2615
 
2616
  ctx->model.kv_self.n = kv_ntok;
2617
  }
2618
 
2619
- const size_t nread = in - src;
2620
  const size_t max_size = llama_get_state_size(ctx);
2621
 
2622
  LLAMA_ASSERT(nread <= max_size);
@@ -2624,6 +2716,85 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
2624
  return nread;
2625
  }
2626
 
2627
  int llama_eval(
2628
  struct llama_context * ctx,
2629
  const llama_token * tokens,
@@ -2634,11 +2805,14 @@ int llama_eval(
2634
  fprintf(stderr, "%s: failed to eval\n", __func__);
2635
  return 1;
2636
  }
 
2637
  // get a more accurate load time, upon first eval
 
2638
  if (!ctx->has_evaluated_once) {
2639
  ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
2640
  ctx->has_evaluated_once = true;
2641
  }
 
2642
  return 0;
2643
  }
2644
 
@@ -2662,15 +2836,15 @@ int llama_tokenize(
2662
  return res.size();
2663
  }
2664
 
2665
- int llama_n_vocab(struct llama_context * ctx) {
2666
  return ctx->vocab.id_to_token.size();
2667
  }
2668
 
2669
- int llama_n_ctx(struct llama_context * ctx) {
2670
  return ctx->model.hparams.n_ctx;
2671
  }
2672
 
2673
- int llama_n_embd(struct llama_context * ctx) {
2674
  return ctx->model.hparams.n_embd;
2675
  }
2676
 
@@ -2682,7 +2856,7 @@ float * llama_get_embeddings(struct llama_context * ctx) {
2682
  return ctx->embedding.data();
2683
  }
2684
 
2685
- const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
2686
  if (token >= llama_n_vocab(ctx)) {
2687
  return nullptr;
2688
  }
@@ -2712,9 +2886,9 @@ void llama_print_timings(struct llama_context * ctx) {
2712
 
2713
  fprintf(stderr, "\n");
2714
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
2715
- fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
2716
  fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
2717
- fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
2718
  fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
2719
  }
2720
 
@@ -2751,82 +2925,3 @@ const char * llama_print_system_info(void) {
2751
  std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
2752
  return ctx->model.tensors_by_name;
2753
  }
2754
-
2755
- bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
2756
- llama_file file(path_session, "rb");
2757
-
2758
- // sanity checks
2759
- {
2760
- const uint32_t magic = file.read_u32();
2761
- const uint32_t version = file.read_u32();
2762
-
2763
- if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
2764
- fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
2765
- return false;
2766
- }
2767
-
2768
- llama_hparams session_hparams;
2769
- file.read_raw(&session_hparams, sizeof(llama_hparams));
2770
-
2771
- if (session_hparams != ctx->model.hparams) {
2772
- fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
2773
- return false;
2774
- }
2775
- }
2776
-
2777
- // load the prompt
2778
- {
2779
- const uint32_t n_token_count = file.read_u32();
2780
-
2781
- if (n_token_count > n_token_capacity) {
2782
- fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
2783
- return false;
2784
- }
2785
-
2786
- file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
2787
- *n_token_count_out = n_token_count;
2788
- }
2789
-
2790
- // restore the context state
2791
- {
2792
- const size_t n_state_size_cur = file.size - file.tell();
2793
- const size_t n_state_size_max = llama_get_state_size(ctx);
2794
-
2795
- if (n_state_size_cur > n_state_size_max) {
2796
- fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
2797
- return false;
2798
- }
2799
-
2800
- std::vector<uint8_t> state_data(n_state_size_max);
2801
- file.read_raw(state_data.data(), n_state_size_cur);
2802
-
2803
- llama_set_state_data(ctx, state_data.data());
2804
- }
2805
-
2806
- return true;
2807
- }
2808
-
2809
- bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
2810
- llama_file file(path_session, "wb");
2811
-
2812
- file.write_u32(LLAMA_SESSION_MAGIC);
2813
- file.write_u32(LLAMA_SESSION_VERSION);
2814
-
2815
- file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
2816
-
2817
- // save the prompt
2818
- file.write_u32((uint32_t) n_token_count);
2819
- file.write_raw(tokens, sizeof(llama_token) * n_token_count);
2820
-
2821
- // save the context state
2822
- {
2823
- const size_t n_state_size_max = llama_get_state_size(ctx);
2824
-
2825
- std::vector<uint8_t> state_data(n_state_size_max);
2826
- const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
2827
-
2828
- file.write_raw(state_data.data(), n_state_size_cur);
2829
- }
2830
-
2831
- return true;
2832
- }
 
9
  #include "llama.h"
10
 
11
  #include "ggml.h"
12
+ #ifdef GGML_USE_CUBLAS
13
+ #include "ggml-cuda.h"
14
+ #endif
15
 
16
  #include <array>
17
  #include <ctime>
 
53
 
54
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
55
  {
56
+ static std::map<e_model, size_t> k_sizes = {
57
  { MODEL_7B, 512ull * MB },
58
  { MODEL_13B, 512ull * MB },
59
  { MODEL_30B, 512ull * MB },
60
  { MODEL_65B, 1024ull * MB },
61
  };
62
+ return k_sizes;
63
  }
64
 
65
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
66
  {
67
+ static std::map<e_model, size_t> k_sizes = {
68
  { MODEL_7B, 512ull * MB },
69
  { MODEL_13B, 512ull * MB },
70
  { MODEL_30B, 512ull * MB },
71
  { MODEL_65B, 1024ull * MB },
72
  };
73
+ return k_sizes;
74
  }
75
 
76
  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
77
  static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
78
  {
79
+ static std::map<e_model, size_t> k_sizes = {
80
  { MODEL_7B, 1026ull * MB },
81
  { MODEL_13B, 1608ull * MB },
82
  { MODEL_30B, 3124ull * MB },
83
  { MODEL_65B, 5120ull * MB },
84
  };
85
+ return k_sizes;
86
  }
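A minimal sketch of the arithmetic behind the MODEL_7B entry above, assuming the reference LLaMA-7B shapes (n_embd = 4096, n_layer = 32) and a 2048-token context; these shapes are assumptions for illustration, not values read from this commit:

#include <cstddef>
#include <cstdio>

int main() {
    // assumed LLaMA-7B shapes (illustration only, not from this diff)
    const size_t n_embd  = 4096;
    const size_t n_ctx   = 2048;
    const size_t n_layer = 32;

    // 2*n_embd*n_ctx*n_layer*sizeof(float16), as in the comment above
    const size_t kv_bytes = 2*n_embd*n_ctx*n_layer*2;

    // prints 1024, within a couple of MB of the 1026 MB table entry
    printf("%zu MB\n", kv_bytes/(1024*1024));
    return 0;
}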
87
 
88
  // this is mostly needed for temporary mul_mat buffers to dequantize the data
89
  // not actually needed if BLAS is disabled
90
  static const std::map<e_model, size_t> & MEM_REQ_EVAL()
91
  {
92
+ static std::map<e_model, size_t> k_sizes = {
93
  { MODEL_7B, 768ull * MB },
94
  { MODEL_13B, 1024ull * MB },
95
  { MODEL_30B, 1280ull * MB },
96
  { MODEL_65B, 1536ull * MB },
97
  };
98
+ return k_sizes;
99
  }
100
 
101
  // default hparams (LLaMA 7B)
 
405
  LLAMA_FILE_VERSION_GGML,
406
  LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
407
  LLAMA_FILE_VERSION_GGJT_V1, // added padding
408
+ LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
409
  };
410
 
411
  struct llama_file_loader {
 
436
  file_version = LLAMA_FILE_VERSION_GGMF_V1;
437
  } else if (magic == 'ggjt' && version == 1) {
438
  file_version = LLAMA_FILE_VERSION_GGJT_V1;
439
+ } else if (magic == 'ggjt' && version == 2) {
440
+ file_version = LLAMA_FILE_VERSION_GGJT_V2;
441
  } else {
442
  throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
443
  magic, version);
 
488
  case GGML_TYPE_F16:
489
  case GGML_TYPE_Q4_0:
490
  case GGML_TYPE_Q4_1:
 
491
  case GGML_TYPE_Q5_0:
492
  case GGML_TYPE_Q5_1:
493
  case GGML_TYPE_Q8_0:
 
532
  write_vocab();
533
  }
534
  void write_magic() {
535
+ file.write_u32(LLAMA_FILE_MAGIC); // magic
536
+ file.write_u32(LLAMA_FILE_VERSION); // version
537
  }
538
  void write_hparams(enum llama_ftype new_ftype) {
539
  const llama_hparams & hparams = any_file_loader->hparams;
 
563
  case GGML_TYPE_F16:
564
  case GGML_TYPE_Q4_0:
565
  case GGML_TYPE_Q4_1:
 
566
  case GGML_TYPE_Q5_0:
567
  case GGML_TYPE_Q5_1:
568
  case GGML_TYPE_Q8_0:
 
589
  std::unique_ptr<llama_mmap> mapping;
590
 
591
  llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
592
+ auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
593
  file_loaders.emplace_back(first_file);
594
  uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
595
  for (uint32_t i = 1; i < n_parts; i++) {
596
  std::string fname = fname_base + "." + std::to_string(i);
597
+ auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
598
  file_loaders.emplace_back(ith_file);
599
  if (ith_file->hparams != first_file->hparams) {
600
  throw format("llama.cpp: hparams inconsistent between files");
 
641
  }
642
  }
643
 
644
+ struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
645
  auto it = tensors_map.name_to_idx.find(name);
646
  if (it == tensors_map.name_to_idx.end()) {
647
  throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
 
663
  LLAMA_ASSERT(lt.ne.size() == 1);
664
  tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
665
  }
666
+ ggml_set_name(tensor, lt.name.c_str());
667
  LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
668
  lt.ggml_tensor = tensor;
669
  num_ggml_tensors_created++;
670
  return tensor;
671
  }
672
 
673
+ void done_getting_tensors() const {
674
  if (num_ggml_tensors_created != tensors_map.tensors.size()) {
675
  throw std::string("llama.cpp: file contained more tensors than expected");
676
  }
 
732
  LLAMA_ASSERT(offset == lt.size);
733
  } else if (lt.split_type == SPLIT_BY_COLUMNS) {
734
  // Let's load the data into temporary buffers to ensure the OS performs large loads.
735
+ std::vector<llama_buffer> tmp_bufs(lt.shards.size());
 
736
  for (size_t i = 0; i < lt.shards.size(); i++) {
737
  llama_load_tensor_shard & shard = lt.shards.at(i);
738
  llama_file & file = file_loaders.at(shard.file_idx)->file;
 
803
 
804
  cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
805
  cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
806
+ ggml_set_name(cache.k, "cache_k");
807
+ ggml_set_name(cache.v, "cache_v");
808
 
809
  return true;
810
  }
 
813
  struct llama_context_params result = {
814
  /*.n_ctx =*/ 512,
815
  /*.n_parts =*/ -1,
816
+ /*.gpu_layers =*/ 0,
817
+ /*.seed =*/ -1,
818
  /*.f16_kv =*/ false,
819
  /*.logits_all =*/ false,
820
  /*.vocab_only =*/ false,
 
844
  switch (version) {
845
  case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
846
  case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
847
+ case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
848
+ case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
849
  }
850
+
851
+ return "unknown";
852
  }
853
 
854
  static const char *llama_ftype_name(enum llama_ftype ftype) {
 
859
  case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
860
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
861
  return "mostly Q4_1, some F16";
 
862
  case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
863
  case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
864
  case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
 
880
  const std::string & fname,
881
  llama_context & lctx,
882
  int n_ctx,
883
+ int n_gpu_layers,
884
  ggml_type memory_type,
885
  bool use_mmap,
886
  bool use_mlock,
 
925
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
926
  }
927
 
928
+ if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
929
+ if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
930
+ hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
931
+ hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
932
+ throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
933
+ }
934
+ }
935
+
936
  if (vocab_only) {
937
  return;
938
  }
939
 
940
  auto & ctx = model.ctx;
941
 
942
+ size_t ctx_size;
943
+ size_t mmapped_size;
944
  ml->calc_sizes(&ctx_size, &mmapped_size);
945
  fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
946
 
 
986
 
987
  // prepare memory for the weights
988
  {
 
 
989
  const uint32_t n_embd = hparams.n_embd;
990
  const uint32_t n_layer = hparams.n_layer;
991
  const uint32_t n_vocab = hparams.n_vocab;
 
1027
  ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
1028
 
1029
  model.mapping = std::move(ml->mapping);
1030
+ #ifdef GGML_USE_CUBLAS
1031
+ {
1032
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
1033
+
1034
+ fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
1035
+
1036
+ size_t vram_total = 0;
1037
+
1038
+ for (int i = 0; i < n_gpu; ++i) {
1039
+ const auto & layer = model.layers[i];
1040
+
1041
+ ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
1042
+ ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
1043
+ ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
1044
+ ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
1045
+ ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
1046
+ ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
1047
+ ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
1048
+ }
1049
+ if (n_gpu_layers > (int) hparams.n_layer) {
1050
+ fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
1051
+ ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
1052
+ }
1053
+
1054
+ fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
1055
+ }
1056
+ #else
1057
+ (void) n_gpu_layers;
1058
+ #endif
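A hedged caller-side sketch of how the new GPU-layer count reaches this code path: llama_init_from_file passes params.n_gpu_layers into llama_model_load in this diff; the helper name and the layer count below are arbitrary examples, not part of the commit.

#include "llama.h"

struct llama_context * load_with_offload(const char * path_model) {
    struct llama_context_params params = llama_context_default_params();
    // with GGML_USE_CUBLAS, up to this many layers are transformed to VRAM above
    params.n_gpu_layers = 20; // example value
    return llama_init_from_file(path_model, params);
}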
1059
 
1060
  // loading time will be recalculate after the first eval, so
1061
  // we take page faults deferred by mmap() into consideration
 
1066
  const std::string & fname,
1067
  llama_context & lctx,
1068
  int n_ctx,
1069
+ int n_gpu_layers,
1070
  ggml_type memory_type,
1071
  bool use_mmap,
1072
  bool use_mlock,
 
1074
  llama_progress_callback progress_callback,
1075
  void *progress_callback_user_data) {
1076
  try {
1077
+ llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
1078
  vocab_only, progress_callback, progress_callback_user_data);
1079
  return true;
1080
  } catch (const std::string & err) {
 
1096
  const int n_tokens,
1097
  const int n_past,
1098
  const int n_threads) {
1099
+
1100
+ // enforce that the first token is BOS
1101
+ if (n_past == 0 && tokens[0] != llama_token_bos()) {
1102
+ fprintf(stderr, "%s: first token must be BOS\n", __func__);
1103
+ return false;
1104
+ }
1105
+
1106
  const int64_t t_start_us = ggml_time_us();
1107
 
1108
  const int N = n_tokens;
 
1110
  const auto & model = lctx.model;
1111
  const auto & hparams = model.hparams;
1112
 
1113
+ const auto & kv_self = model.kv_self;
1114
 
1115
  LLAMA_ASSERT(!!kv_self.ctx);
1116
 
 
1138
  gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
1139
 
1140
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1141
+ ggml_set_name(embd, "embd");
1142
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
1143
 
1144
  struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
 
1163
  // self-attention
1164
  {
1165
  // compute Q and K and RoPE them
1166
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
1167
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
1168
+ ggml_set_name(Qcur, "Qcur");
1169
+ ggml_set_name(Kcur, "Kcur");
1170
 
1171
  // store key and value to memory
1172
  {
 
1187
  ggml_permute(ctx0,
1188
  Qcur,
1189
  0, 2, 1, 3);
1190
+ ggml_set_name(Q, "Q");
1191
 
1192
  struct ggml_tensor * K =
1193
  ggml_permute(ctx0,
 
1195
  ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
1196
  n_embd/n_head, n_head, n_past + N),
1197
  0, 2, 1, 3);
1198
+ ggml_set_name(K, "K");
1199
 
1200
  // K * Q
1201
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
1202
+ ggml_set_name(KQ, "KQ");
1203
 
1204
  // KQ_scaled = KQ / sqrt(n_embd/n_head)
1205
+ struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
1206
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
1207
+
1208
+ // KQ_scaled shape [n_past + N, N, n_head, 1]
1209
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
1210
+ ggml_set_name(KQ_scaled, "KQ_scaled");
1211
 
1212
  // KQ_masked = mask_past(KQ_scaled)
1213
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
1214
+ ggml_set_name(KQ_masked, "KQ_masked");
1215
 
1216
  // KQ = soft_max(KQ_masked)
1217
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
1218
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
1219
+
1220
 
1221
  // split cached V into n_head heads
1222
  struct ggml_tensor * V =
 
1225
  n_ctx*ggml_element_size(kv_self.v),
1226
  n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
1227
  il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
1228
+ ggml_set_name(V, "V");
1229
 
1230
  #if 1
1231
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
1232
+ ggml_set_name(KQV, "KQV");
1233
  #else
1234
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
1235
  // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
 
1240
 
1241
  // KQV_merged = KQV.permute(0, 2, 1, 3)
1242
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
1243
+ ggml_set_name(KQV_merged, "KQV_merged");
1244
 
1245
  // cur = KQV_merged.contiguous().view(n_embd, N)
1246
  cur = ggml_cpy(ctx0,
1247
  KQV_merged,
1248
  ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
1249
+ ggml_set_name(cur, "KQV_merged_contiguous");
1250
 
1251
  // projection (no bias)
1252
  cur = ggml_mul_mat(ctx0,
 
1318
  lctx.use_buf(ctx0, -1);
1319
 
1320
  // logits -> probs
1321
+ //inpL = ggml_soft_max_inplace(ctx0, inpL);
1322
 
1323
  // run the computation
1324
  ggml_build_forward_expand(&gf, inpL);
 
1356
  }
1357
 
1358
  // extract embeddings
1359
+ if (!lctx.embedding.empty()) {
1360
  auto & embedding_out = lctx.embedding;
1361
 
1362
  embedding_out.resize(n_embd);
 
1407
  size_t n;
1408
  };
1409
 
1410
+ static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
1411
+
1412
  struct llama_sp_bigram {
1413
  struct comparator {
1414
  bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
 
1441
  sym.prev = index - 1;
1442
  sym.next = offs == text.size() ? -1 : index + 1;
1443
  index++;
1444
+ symbols_.emplace_back(sym);
1445
  }
1446
 
1447
  // seed the work queue with all possible 2-character tokens.
 
1532
  llama_tokenizer tokenizer(vocab);
1533
  std::vector<llama_vocab::id> output;
1534
 
1535
+ if (text.empty()) {
1536
  return output;
1537
  }
1538
 
1539
  if (bos) {
1540
+ output.push_back(llama_token_bos());
1541
  }
1542
 
1543
  tokenizer.tokenize(text, output);
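A hedged sketch of prompt tokenization under the BOS enforcement added in llama_eval_internal above; the helper name and the buffer-sizing heuristic are assumptions, while llama_tokenize with its add_bos flag is the public API used here.

#include <string>
#include <vector>
#include "llama.h"

std::vector<llama_token> tokenize_prompt(struct llama_context * ctx, const std::string & text) {
    // one token per byte is an upper bound; +1 leaves room for the BOS token
    std::vector<llama_token> tokens(text.size() + 1);
    const int n = llama_tokenize(ctx, text.c_str(), tokens.data(), (int) tokens.size(), /*add_bos*/ true);
    tokens.resize(n > 0 ? n : 0); // a negative return signals an undersized buffer
    return tokens;
}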
 
1760
  }
1761
  }
1762
 
1763
+ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
1764
  if (last_tokens_size == 0 || penalty == 1.0f) {
1765
  return;
1766
  }
 
1768
  const int64_t t_start_sample_us = ggml_time_us();
1769
 
1770
  for (size_t i = 0; i < candidates->size; ++i) {
1771
+ const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
1772
  if (token_iter == last_tokens + last_tokens_size) {
1773
  continue;
1774
  }
 
1789
  }
1790
  }
1791
 
1792
+ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
1793
  if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
1794
  return;
1795
  }
 
1846
  float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
1847
 
1848
  // Sample the next word X using top-k sampling
1849
+ llama_sample_top_k(nullptr, candidates, int(k), 1);
1850
  if (ctx) {
1851
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1852
  }
 
1912
  const int64_t t_start_sample_us = ggml_time_us();
1913
 
1914
  // Find max element
1915
+ auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
1916
  return a.logit < b.logit;
1917
  });
1918
 
 
1955
  switch (ftype) {
1956
  case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
1957
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
 
1958
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
1959
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
1960
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
 
1965
  nthread = std::thread::hardware_concurrency();
1966
  }
1967
 
1968
+ std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
1969
  /*vocab_only*/ false));
1970
  llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
1971
 
 
2019
  } else if (tensor.type == GGML_TYPE_F16) {
2020
  f32_conv_buf.resize(nelements * sizeof(float));
2021
  f32_data = (float *) f32_conv_buf.addr;
2022
+ const auto * f16_data = (const ggml_fp16_t *) tensor.data;
2023
  for (size_t i = 0; i < nelements; i++) {
2024
  f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
2025
  }
 
2050
  size_t first = counter; counter += chunk_size;
2051
  if (first >= nelements) {
2052
  if (!local_hist.empty()) {
2053
+ for (int j=0; j<int(local_hist.size()); ++j) {
2054
+ hist_cur[j] += local_hist[j];
2055
+ }
2056
  new_size += local_size;
2057
  }
2058
  break;
2059
  }
2060
  lock.unlock();
2061
  size_t last = std::min(nelements, first + chunk_size);
2062
+ if (local_hist.empty()) {
2063
+ local_hist.resize(hist_cur.size(), 0);
2064
+ }
2065
  local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
2066
  }
2067
  };
2068
+ if ((int) workers.size() < nthread_use - 1) {
2069
+ workers.resize(nthread_use - 1);
2070
+ }
2071
+ for (int it = 0; it < nthread_use - 1; ++it) {
2072
+ workers[it] = std::thread(compute);
2073
+ }
2074
  compute();
2075
+ for (int it = 0; it < nthread_use - 1; ++it) {
2076
+ workers[it].join();
2077
+ }
2078
  }
2079
 
2080
  printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
 
2120
 
2121
  llama_context * ctx = new llama_context;
2122
 
2123
+ if (params.seed < 0) {
2124
  params.seed = time(NULL);
2125
  }
2126
 
 
2146
 
2147
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
2148
 
2149
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
2150
  params.use_mmap, params.use_mlock, params.vocab_only,
2151
  params.progress_callback, params.progress_callback_user_data)) {
2152
  fprintf(stderr, "%s: failed to load model\n", __func__);
 
2272
  fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
2273
  model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
2274
 
2275
+ size_t ctx_size;
2276
+ size_t mmapped_size;
2277
  model_loader->calc_sizes(&ctx_size, &mmapped_size);
2278
  base_buf.resize(ctx_size);
2279
 
 
2312
  fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
2313
  }
2314
 
2315
+ std::string name;
2316
+ {
2317
+ char buf[1024];
2318
+ fin.read(buf, length);
2319
+ name = std::string(buf, length);
2320
+ }
2321
 
2322
  // check for lora suffix and get the type of tensor
2323
  const std::string lora_suffix = ".lora";
 
2332
  base_name.erase(pos);
2333
  // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
2334
 
2335
+ if (model_tensors.find(base_name) == model_tensors.end()) {
2336
  fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
2337
  return 1;
2338
  }
 
2412
 
2413
  if (scaling != 1.0f) {
2414
  ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
2415
+ BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
2416
  }
2417
 
2418
  ggml_tensor * r;
 
2434
  lora_tensors.clear();
2435
 
2436
  n_tensors++;
2437
+ if (n_tensors % 4 == 0) {
2438
  fprintf(stderr, ".");
2439
+ }
2440
  }
2441
  }
2442
 
 
2461
  }
2462
  }
2463
 
2464
+ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
2465
  return ctx->model.kv_self.n;
2466
  }
2467
 
2468
+ #define LLAMA_MAX_RNG_STATE (64*1024)
2469
 
2470
  void llama_set_rng_seed(struct llama_context * ctx, int seed) {
2471
+ if (seed < 0) {
2472
  seed = time(NULL);
2473
  }
2474
  ctx->rng.seed(seed);
2475
  }
2476
 
2477
  // Returns the *maximum* size of the state
2478
+ size_t llama_get_state_size(const struct llama_context * ctx) {
2479
  // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
2480
  // for reference, std::mt19937(1337) serializes to 6701 bytes.
2481
  const size_t s_rng_size = sizeof(size_t);
 
2506
  }
2507
 
2508
  // Copies the state to the specified destination address
2509
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
2510
+ uint8_t * out = dst;
2511
 
2512
  // copy rng
2513
  {
 
2567
 
2568
  if (kv_size) {
2569
  const size_t elt_size = ggml_element_size(kv_self.k);
2570
+
2571
  char buffer[4096];
2572
+
2573
  ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
2574
  ggml_cgraph gf{};
2575
  gf.n_threads = 1;
 
2593
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
2594
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
2595
  ggml_graph_compute(cpy_ctx, &gf);
2596
+
2597
+ ggml_free(cpy_ctx);
2598
  }
2599
  }
2600
 
2601
+ const size_t written = out - dst;
2602
  const size_t max_size = llama_get_state_size(ctx);
2603
 
2604
  LLAMA_ASSERT(written <= max_size);
 
2608
 
2609
  // Sets the state reading from the specified source address
2610
  size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
2611
+ const uint8_t * inp = src;
2612
 
2613
  // set rng
2614
  {
2615
  size_t rng_size;
2616
  char rng_buf[LLAMA_MAX_RNG_STATE];
2617
 
2618
+ memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
2619
+ memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;
2620
 
2621
  std::stringstream rng_ss;
2622
  rng_ss.str(std::string(&rng_buf[0], rng_size));
 
2630
  size_t logits_cap;
2631
  size_t logits_size;
2632
 
2633
+ memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
+ memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
 
  LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
 
  if (logits_size) {
  ctx->logits.resize(logits_size);
+ memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
  }
 
+ inp += logits_cap * sizeof(float);
  }
 
  // set embeddings
  {
  size_t embedding_size;
 
+ memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
 
  LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
 
  if (embedding_size) {
+ memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
+ inp += embedding_size * sizeof(float);
  }
  }
 
  size_t kv_size;
  int kv_ntok;
 
+ memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+ memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);
 
  if (kv_size) {
  LLAMA_ASSERT(kv_self.buf.size == kv_size);
 
  const size_t elt_size = ggml_element_size(kv_self.k);
+
  char buffer[4096];
+
  ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
  ggml_cgraph gf{};
  gf.n_threads = 1;
 
  ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+ kin3d->data = (void *) inp;
+ inp += ggml_nbytes(kin3d);
 
  ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+ vin3d->data = (void *) inp;
+ inp += ggml_nbytes(vin3d);
 
  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
  n_embd, kv_ntok, n_layer,
 
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
  ggml_graph_compute(cpy_ctx, &gf);
 
+ ggml_free(cpy_ctx);
  }
 
  ctx->model.kv_self.n = kv_ntok;
  }
 
+ const size_t nread = inp - src;
  const size_t max_size = llama_get_state_size(ctx);
 
  LLAMA_ASSERT(nread <= max_size);
 
  return nread;
  }
 
+ bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ llama_file file(path_session, "rb");
+
+ // sanity checks
+ {
+ const uint32_t magic = file.read_u32();
+ const uint32_t version = file.read_u32();
+
+ if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
+ fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+ return false;
+ }
+
+ llama_hparams session_hparams;
+ file.read_raw(&session_hparams, sizeof(llama_hparams));
+
+ if (session_hparams != ctx->model.hparams) {
+ fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+ return false;
+ }
+ }
+
+ // load the prompt
+ {
+ const uint32_t n_token_count = file.read_u32();
+
+ if (n_token_count > n_token_capacity) {
+ fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+ return false;
+ }
+
+ file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+ *n_token_count_out = n_token_count;
+ }
+
+ // restore the context state
+ {
+ const size_t n_state_size_cur = file.size - file.tell();
+ const size_t n_state_size_max = llama_get_state_size(ctx);
+
+ if (n_state_size_cur > n_state_size_max) {
+ fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
+ return false;
+ }
+
+ std::vector<uint8_t> state_data(n_state_size_max);
+ file.read_raw(state_data.data(), n_state_size_cur);
+
+ llama_set_state_data(ctx, state_data.data());
+ }
+
+ return true;
+ }
+
+ bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+ llama_file file(path_session, "wb");
+
+ file.write_u32(LLAMA_SESSION_MAGIC);
+ file.write_u32(LLAMA_SESSION_VERSION);
+
+ file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
+
+ // save the prompt
+ file.write_u32((uint32_t) n_token_count);
+ file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+ // save the context state
+ {
+ const size_t n_state_size_max = llama_get_state_size(ctx);
+
+ std::vector<uint8_t> state_data(n_state_size_max);
+ const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
+
+ file.write_raw(state_data.data(), n_state_size_cur);
+ }
+
+ return true;
+ }
+
  int llama_eval(
  struct llama_context * ctx,
  const llama_token * tokens,
 
  fprintf(stderr, "%s: failed to eval\n", __func__);
  return 1;
  }
+
  // get a more accurate load time, upon first eval
+ // TODO: fix this
  if (!ctx->has_evaluated_once) {
  ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
  ctx->has_evaluated_once = true;
  }
+
  return 0;
  }
 
  return res.size();
  }
 
+ int llama_n_vocab(const struct llama_context * ctx) {
  return ctx->vocab.id_to_token.size();
  }
 
+ int llama_n_ctx(const struct llama_context * ctx) {
  return ctx->model.hparams.n_ctx;
  }
 
+ int llama_n_embd(const struct llama_context * ctx) {
  return ctx->model.hparams.n_embd;
  }
 
  return ctx->embedding.data();
  }
 
+ const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
  if (token >= llama_n_vocab(ctx)) {
  return nullptr;
  }
 
  fprintf(stderr, "\n");
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
+ fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
  fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
+ fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
  fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
  }
 
  std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
  }
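
A minimal usage sketch of the session API added above, assuming an already-initialized `llama_context` and eliding the tokenize/eval loop; `run_with_session` and the session path are illustrative assumptions, not code from this commit:

#include "llama.h"
#include <vector>

// Hypothetical helper: restore a previous session (if any) before evaluating,
// and persist the prompt tokens plus the full context state afterwards.
static void run_with_session(struct llama_context * ctx, const char * path_session) {
    std::vector<llama_token> session_tokens(llama_n_ctx(ctx));
    size_t n_token_count = 0;

    // callers usually make sure the file exists before attempting to load it
    if (llama_load_session_file(ctx, path_session,
                                session_tokens.data(), session_tokens.size(), &n_token_count)) {
        session_tokens.resize(n_token_count); // cached prompt; the matching rng/logits/KV state is restored too
    } else {
        session_tokens.clear();               // no usable session - start from scratch
    }

    // ... tokenize the prompt, call llama_eval(), append the evaluated tokens to session_tokens ...

    // write the prompt tokens and the full context state back out for the next run
    llama_save_session_file(ctx, path_session, session_tokens.data(), session_tokens.size());
}
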
examples/talk-llama/llama.h CHANGED
@@ -19,7 +19,7 @@
  # define LLAMA_API
  #endif
 
- #define LLAMA_FILE_VERSION 1
  #define LLAMA_FILE_MAGIC 'ggjt'
  #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
  #define LLAMA_SESSION_MAGIC 'ggsn'
@@ -54,9 +54,10 @@ extern "C" {
  typedef void (*llama_progress_callback)(float progress, void *ctx);
 
  struct llama_context_params {
- int n_ctx; // text context
- int n_parts; // -1 for default
- int seed; // RNG seed, 0 for random
 
  bool f16_kv; // use fp16 for KV cache
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
@@ -78,7 +79,7 @@ extern "C" {
  LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
- LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
  // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
  LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
@@ -122,19 +123,19 @@ extern "C" {
  int n_threads);
 
  // Returns the number of tokens in the KV cache
- LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
 
  // Sets the current rng seed.
  LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
 
  // Returns the maximum size in bytes of the state (rng, logits, embedding
  // and kv_cache) - will often be smaller after compacting tokens
- LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
 
  // Copies the state to the specified destination address.
  // Destination needs to have allocated enough memory.
  // Returns the number of bytes copied
- LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
 
  // Set the state reading from the specified address
  // Returns the number of bytes read
@@ -143,6 +144,7 @@ extern "C" {
  // Save/load session file
  LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
  LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
  // Run the llama inference to obtain the logits and probabilities for the next token.
  // tokens + n_tokens is the provided batch of new tokens to process
  // n_past is the number of tokens to use from previous eval calls
@@ -166,9 +168,9 @@ extern "C" {
  int n_max_tokens,
  bool add_bos);
 
- LLAMA_API int llama_n_vocab(struct llama_context * ctx);
- LLAMA_API int llama_n_ctx (struct llama_context * ctx);
- LLAMA_API int llama_n_embd (struct llama_context * ctx);
 
  // Token logits obtained from the last call to llama_eval()
  // The logits for the last token are stored in the last row
@@ -182,7 +184,7 @@ extern "C" {
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
  // Token Id -> String. Uses the vocabulary in the provided context
- LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
 
  // Special tokens
  LLAMA_API llama_token llama_token_bos();
@@ -192,25 +194,25 @@ extern "C" {
  // Sampling functions
 
  /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
- LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty);
 
  /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
- LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
 
  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
  LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 
  /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
- LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
 
  /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
- LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
 
  /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
- LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
 
  /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
- LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
  LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
  /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
 
  # define LLAMA_API
  #endif
 
+ #define LLAMA_FILE_VERSION 2
  #define LLAMA_FILE_MAGIC 'ggjt'
  #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
  #define LLAMA_SESSION_MAGIC 'ggsn'
 
  typedef void (*llama_progress_callback)(float progress, void *ctx);
 
  struct llama_context_params {
+ int n_ctx; // text context
+ int n_parts; // -1 for default
+ int n_gpu_layers; // number of layers to store in VRAM
+ int seed; // RNG seed, -1 for random
 
  bool f16_kv; // use fp16 for KV cache
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
 
  LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+ // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
  // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
  LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
 
  int n_threads);
 
  // Returns the number of tokens in the KV cache
+ LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
 
  // Sets the current rng seed.
  LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
 
  // Returns the maximum size in bytes of the state (rng, logits, embedding
  // and kv_cache) - will often be smaller after compacting tokens
+ LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
 
  // Copies the state to the specified destination address.
  // Destination needs to have allocated enough memory.
  // Returns the number of bytes copied
+ LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
 
  // Set the state reading from the specified address
  // Returns the number of bytes read
 
  // Save/load session file
  LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
  LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+
  // Run the llama inference to obtain the logits and probabilities for the next token.
  // tokens + n_tokens is the provided batch of new tokens to process
  // n_past is the number of tokens to use from previous eval calls
 
  int n_max_tokens,
  bool add_bos);
 
+ LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+ LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
+ LLAMA_API int llama_n_embd (const struct llama_context * ctx);
 
  // Token logits obtained from the last call to llama_eval()
  // The logits for the last token are stored in the last row
 
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
  // Token Id -> String. Uses the vocabulary in the provided context
+ LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
 
  // Special tokens
  LLAMA_API llama_token llama_token_bos();
 
  // Sampling functions
 
  /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
+ LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
 
  /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+ LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
 
  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
  LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 
  /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+ LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
 
  /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+ LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
 
  /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+ LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
 
  /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+ LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
  LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
  /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
examples/talk-llama/talk-llama.cpp CHANGED
@@ -560,7 +560,7 @@ int main(int argc, char ** argv) {
 
  embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
  n_past += embd.size();
-
  embd.clear();
 
  if (done) break;
@@ -577,7 +577,7 @@ int main(int argc, char ** argv) {
  if (!path_session.empty() && need_to_save_session) {
  need_to_save_session = false;
  llama_save_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
- }
 
  llama_token id = 0;
 
@@ -609,8 +609,8 @@ int main(int argc, char ** argv) {
  id = llama_sample_token_greedy(ctx_llama, &candidates_p);
  } else {
  // Temperature sampling
- llama_sample_top_k(ctx_llama, &candidates_p, top_k);
- llama_sample_top_p(ctx_llama, &candidates_p, top_p);
  llama_sample_temperature(ctx_llama, &candidates_p, temp);
  id = llama_sample_token(ctx_llama, &candidates_p);
  }
 
  embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
  n_past += embd.size();
+
  embd.clear();
 
  if (done) break;
 
  if (!path_session.empty() && need_to_save_session) {
  need_to_save_session = false;
  llama_save_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
+ }
 
  llama_token id = 0;
 
  id = llama_sample_token_greedy(ctx_llama, &candidates_p);
  } else {
  // Temperature sampling
+ llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
+ llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
  llama_sample_temperature(ctx_llama, &candidates_p, temp);
  id = llama_sample_token(ctx_llama, &candidates_p);
  }
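
With the `min_keep` default arguments removed from the sampling declarations, callers now pass the value explicitly, as the talk-llama.cpp hunk above does. A short sketch of the updated sampling chain; the `llama_token_data`/`llama_token_data_array` layout and `llama_get_logits()` are assumed from the rest of llama.h (not shown in this diff), and `sample_next` is a hypothetical helper:

#include "llama.h"
#include <vector>

// Build the candidate array from the last logits, then apply the filters
// with an explicit min_keep of 1 and sample one token.
static llama_token sample_next(struct llama_context * ctx, int top_k, float top_p, float temp) {
    const int     n_vocab = llama_n_vocab(ctx);
    const float * logits  = llama_get_logits(ctx);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
        candidates.push_back({ token_id, logits[token_id], 0.0f });
    }
    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

    llama_sample_top_k      (ctx, &candidates_p, top_k, 1); // min_keep is now explicit
    llama_sample_top_p      (ctx, &candidates_p, top_p, 1);
    llama_sample_temperature(ctx, &candidates_p, temp);

    return llama_sample_token(ctx, &candidates_p);
}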