Vivek16 committed on
Commit cc46459 · verified · 1 Parent(s): 9da1c7a

Update app.py

Files changed (1)
  1. app.py +44 -33
app.py CHANGED
@@ -1,45 +1,58 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
+from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
+from peft import PeftModel
+import torch
 
 # -----------------------------
-# Respond function using text-generation
+# Load 4-bit Qwen model locally
 # -----------------------------
-async def respond(
-    message,
-    history: list[dict[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-    hf_token: gr.OAuthToken,
-):
-    """
-    Non-chat model version using text-generation.
-    Works for LoRA / 4-bit Qwen models.
-    """
-    client = InferenceClient(
-        token=hf_token.token,
-        model="unsloth/qwen2.5-math-1.5b-bnb-4bit"
-    )
+MODEL_NAME = "unsloth/qwen2.5-math-1.5b-bnb-4bit"
+
+print("Loading tokenizer...")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
+
+print("Loading model in 4-bit...")
+base_model = AutoModelForCausalLM.from_pretrained(
+    MODEL_NAME,
+    device_map="auto",
+    torch_dtype=torch.float16,
+    low_cpu_mem_usage=True
+)
+
+# Check if LoRA adapter exists
+try:
+    model = PeftModel.from_pretrained(base_model, MODEL_NAME, device_map="auto")
+except:
+    model = base_model
 
-    # Build prompt manually
+model.eval()
+
+# -----------------------------
+# Respond function
+# -----------------------------
+def respond(message, history, system_message, max_tokens, temperature, top_p):
+    # Build chat prompt
     prompt = system_message + "\n"
     for h in history:
         prompt += f"User: {h['content']}\n"
     prompt += f"User: {message}\nBot:"
 
-    try:
-        # Generate text (non-streaming)
-        output = client.text_generation(
-            prompt,
-            max_new_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-        )
-        yield output.text
+    # Tokenize
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+    # Generation config
+    gen_config = GenerationConfig(
+        max_new_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        do_sample=True
+    )
 
-    except Exception as e:
-        yield f"[Error] {str(e)}"
+    # Generate output
+    with torch.no_grad():
+        output_ids = model.generate(**inputs, **gen_config.to_dict())
+    output = tokenizer.decode(output_ids[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
+    return output
 
 # -----------------------------
 # Gradio Chat Interface
@@ -56,8 +69,6 @@ chatbot = gr.ChatInterface(
 )
 
 with gr.Blocks() as demo:
-    with gr.Sidebar():
-        gr.LoginButton()
     chatbot.render()
 
 # -----------------------------
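
Note on the unchanged part of the file: the second hunk header shows that app.py also defines `chatbot = gr.ChatInterface(`, which is not included in this diff. The commit drops the `hf_token: gr.OAuthToken` argument and the `gr.LoginButton()` sidebar because generation now runs locally instead of through the Inference API, so the interface only has to pass the six parameters of the new `respond`. A minimal sketch of that wiring is below; the labels, ranges, and default values are assumptions rather than values taken from the commit.

# Hypothetical wiring for the gr.ChatInterface block not shown in this diff.
# `respond` is the function defined earlier in app.py; the additional inputs
# must match its (system_message, max_tokens, temperature, top_p) parameters in order.
import gradio as gr

chatbot = gr.ChatInterface(
    respond,
    type="messages",  # history arrives as [{"role": ..., "content": ...}] dicts, matching h['content'] above
    additional_inputs=[
        gr.Textbox(value="You are a helpful math assistant.", label="System message"),   # assumed default
        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),   # assumed range
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),   # assumed range
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),       # assumed range
    ],
)

with gr.Blocks() as demo:
    chatbot.render()

if __name__ == "__main__":
    demo.launch()  # assumed entry point; not visible in the diff

Since the new `respond` returns a single string rather than yielding chunks, the reply is shown only after generation finishes, which matches the non-streaming behavior of the InferenceClient version it replaces.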