Vivek16 committed
Commit aa75746 · verified
1 Parent(s): bde5f3c

Update app.py

Files changed (1)
  1. app.py +23 -23
app.py CHANGED
@@ -4,7 +4,7 @@ from peft import PeftModel
 import torch
 
 # -----------------------------
-# Load 4-bit Qwen model locally
+# Model config
 # -----------------------------
 MODEL_NAME = "unsloth/qwen2.5-math-1.5b-bnb-4bit"
 
@@ -12,35 +12,40 @@ print("Loading tokenizer...")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
 
 print("Loading model in 4-bit...")
-base_model = AutoModelForCausalLM.from_pretrained(
-    MODEL_NAME,
-    device_map="auto",
-    torch_dtype=torch.float16,
-    low_cpu_mem_usage=True
-)
-
-# Check if LoRA adapter exists
 try:
-    model = PeftModel.from_pretrained(base_model, MODEL_NAME, device_map="auto")
-except:
-    model = base_model
+    base_model = AutoModelForCausalLM.from_pretrained(
+        MODEL_NAME,
+        device_map="auto",
+        dtype=torch.float16,  # newer transformers prefers dtype
+        low_cpu_mem_usage=True
+    )
+    # Load LoRA adapter if exists
+    try:
+        model = PeftModel.from_pretrained(base_model, MODEL_NAME, device_map="auto")
+    except:
+        model = base_model
+except Exception as e:
+    print("Error loading model:", e)
+    raise e
 
 model.eval()
 
 # -----------------------------
-# Respond function
+# Response function
 # -----------------------------
 def respond(message, history, system_message, max_tokens, temperature, top_p):
-    # Build chat prompt
+    # Limit max tokens for safety
+    if max_tokens > 128:
+        max_tokens = 128
+
+    # Build prompt
     prompt = system_message + "\n"
     for h in history:
         prompt += f"User: {h['content']}\n"
     prompt += f"User: {message}\nBot:"
 
-    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 
-    # Generation config
    gen_config = GenerationConfig(
        max_new_tokens=max_tokens,
        temperature=temperature,
@@ -48,7 +53,6 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
        do_sample=True
    )
 
-    # Generate output
    with torch.no_grad():
        output_ids = model.generate(**inputs, **gen_config.to_dict())
    output = tokenizer.decode(output_ids[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
@@ -62,8 +66,8 @@ chatbot = gr.ChatInterface(
    type="messages",
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(minimum=1, maximum=2048, value=64, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
 )
@@ -71,9 +75,5 @@ chatbot = gr.ChatInterface(
 with gr.Blocks() as demo:
    chatbot.render()
 
-# -----------------------------
-# Launch Gradio app
-# -----------------------------
 if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
-
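
For reference, a minimal smoke test of the updated respond() flow might look like the sketch below. It is not part of this commit: it assumes app.py is importable from the Space's working directory (so the 4-bit model loads on import) and that respond() returns the decoded reply, whose return statement falls outside the hunks shown above.

# Hypothetical smoke test for the updated app.py (not part of this commit).
# Assumes app.py is on PYTHONPATH and the 4-bit model finishes loading on import.
from app import respond

# gr.ChatInterface(type="messages") supplies history as {"role", "content"} dicts.
history = [{"role": "user", "content": "Hi"}]

reply = respond(
    message="What is 12 * 7?",
    history=history,
    system_message="You are a friendly Chatbot.",
    max_tokens=512,      # the new guard clamps this to 128
    temperature=0.7,
    top_p=0.95,
)
print(reply)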