Spaces:

Verias
/

testing_space

Paused

App Files Files Community

testing_space / modules /llamacpp_model.py

Verias

Upload 94 files

fd63d2f verified almost 2 years ago

raw

history blame contribute delete

6.39 kB

	import re
	from functools import partial

	import numpy as np
	import torch

	from modules import RoPE, shared
	from modules.callbacks import Iteratorize
	from modules.logging_colors import logger
	from modules.text_generation import get_max_prompt_length

	try:
	import llama_cpp
	except:
	llama_cpp = None

	try:
	import llama_cpp_cuda
	except:
	llama_cpp_cuda = None

	try:
	import llama_cpp_cuda_tensorcores
	except:
	llama_cpp_cuda_tensorcores = None


	def llama_cpp_lib():
	if shared.args.cpu and llama_cpp is not None:
	return llama_cpp
	elif shared.args.tensorcores and llama_cpp_cuda_tensorcores is not None:
	return llama_cpp_cuda_tensorcores
	elif llama_cpp_cuda is not None:
	return llama_cpp_cuda
	else:
	return llama_cpp


	def ban_eos_logits_processor(eos_token, input_ids, logits):
	logits[eos_token] = -float('inf')
	return logits


	def custom_token_ban_logits_processor(token_ids, input_ids, logits):
	for token_id in token_ids:
	logits[token_id] = -float('inf')

	return logits


	class LlamaCppModel:
	def __init__(self):
	self.initialized = False
	self.grammar_string = ''
	self.grammar = None

	def __del__(self):
	del self.model

	@classmethod
	def from_pretrained(self, path):

	Llama = llama_cpp_lib().Llama
	LlamaCache = llama_cpp_lib().LlamaCache

	result = self()
	cache_capacity = 0
	if shared.args.cache_capacity is not None:
	if 'GiB' in shared.args.cache_capacity:
	cache_capacity = int(re.sub('[a-zA-Z]', '', shared.args.cache_capacity)) * 1000 * 1000 * 1000
	elif 'MiB' in shared.args.cache_capacity:
	cache_capacity = int(re.sub('[a-zA-Z]', '', shared.args.cache_capacity)) * 1000 * 1000
	else:
	cache_capacity = int(shared.args.cache_capacity)

	if cache_capacity > 0:
	logger.info("Cache capacity is " + str(cache_capacity) + " bytes")

	if shared.args.tensor_split is None or shared.args.tensor_split.strip() == '':
	tensor_split_list = None
	else:
	tensor_split_list = [float(x) for x in shared.args.tensor_split.strip().split(",")]

	params = {
	'model_path': str(path),
	'n_ctx': shared.args.n_ctx,
	'n_threads': shared.args.threads or None,
	'n_threads_batch': shared.args.threads_batch or None,
	'n_batch': shared.args.n_batch,
	'use_mmap': not shared.args.no_mmap,
	'use_mlock': shared.args.mlock,
	'mul_mat_q': not shared.args.no_mul_mat_q,
	'numa': shared.args.numa,
	'n_gpu_layers': shared.args.n_gpu_layers,
	'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base),
	'tensor_split': tensor_split_list,
	'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
	'offload_kqv': not shared.args.no_offload_kqv,
	'split_mode': 1 if not shared.args.row_split else 2
	}

	result.model = Llama(**params)
	if cache_capacity > 0:
	result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity))

	# This is ugly, but the model and the tokenizer are the same object in this library.
	return result, result

	def encode(self, string):
	if type(string) is str:
	string = string.encode()

	return self.model.tokenize(string)

	def decode(self, ids, **kwargs):
	return self.model.detokenize(ids).decode('utf-8')

	def get_logits(self, tokens):
	self.model.reset()
	self.model.eval(tokens)
	logits = self.model._scores
	logits = np.expand_dims(logits, 0) # batch dim is expected
	return torch.tensor(logits, dtype=torch.float32)

	def load_grammar(self, string):
	if string != self.grammar_string:
	self.grammar_string = string
	if string.strip() != '':
	self.grammar = llama_cpp_lib().LlamaGrammar.from_string(string)
	else:
	self.grammar = None

	def generate(self, prompt, state, callback=None):
	LogitsProcessorList = llama_cpp_lib().LogitsProcessorList
	prompt = prompt if type(prompt) is str else prompt.decode()

	# Handle truncation
	prompt = self.encode(prompt)
	prompt = prompt[-get_max_prompt_length(state):]
	prompt = self.decode(prompt)

	self.load_grammar(state['grammar_string'])
	logit_processors = LogitsProcessorList()
	if state['ban_eos_token']:
	logit_processors.append(partial(ban_eos_logits_processor, self.model.token_eos()))

	if state['custom_token_bans']:
	to_ban = [int(x) for x in state['custom_token_bans'].split(',')]
	if len(to_ban) > 0:
	logit_processors.append(partial(custom_token_ban_logits_processor, to_ban))

	completion_chunks = self.model.create_completion(
	prompt=prompt,
	max_tokens=state['max_new_tokens'],
	temperature=state['temperature'],
	top_p=state['top_p'],
	min_p=state['min_p'],
	typical_p=state['typical_p'],
	frequency_penalty=state['frequency_penalty'],
	presence_penalty=state['presence_penalty'],
	repeat_penalty=state['repetition_penalty'],
	top_k=state['top_k'],
	stream=True,
	seed=int(state['seed']) if state['seed'] != -1 else None,
	tfs_z=state['tfs'],
	mirostat_mode=int(state['mirostat_mode']),
	mirostat_tau=state['mirostat_tau'],
	mirostat_eta=state['mirostat_eta'],
	logits_processor=logit_processors,
	grammar=self.grammar
	)

	output = ""
	for completion_chunk in completion_chunks:
	if shared.stop_everything:
	break

	text = completion_chunk['choices'][0]['text']
	output += text
	if callback:
	callback(text)

	return output

	def generate_with_streaming(self, args, *kwargs):
	with Iteratorize(self.generate, args, kwargs, callback=None) as generator:
	reply = ''
	for token in generator:
	reply += token
	yield reply