Vivek16 committed
Commit 275aa80 · verified · 1 Parent(s): 7c94cae

Update app.py

Files changed (1)
  1. app.py +17 -20
app.py CHANGED
@@ -4,29 +4,25 @@ from peft import PeftModel
 import torch
 
 # -----------------------------
-# Model config
+# CPU-friendly model
 # -----------------------------
-MODEL_NAME = "unsloth/qwen2.5-math-1.5b-bnb-4bit"
+MODEL_NAME = "tiiuae/falcon-7b-instruct"  # smaller CPU-friendly model
 
 print("Loading tokenizer...")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
 
-print("Loading model in 4-bit...")
+print("Loading model...")
+base_model = AutoModelForCausalLM.from_pretrained(
+    MODEL_NAME,
+    device_map=None,  # CPU only
+    torch_dtype=torch.float32
+)
+
+# Load LoRA if exists (optional)
 try:
-    base_model = AutoModelForCausalLM.from_pretrained(
-        MODEL_NAME,
-        device_map="auto",
-        dtype=torch.float16,  # newer transformers prefers dtype
-        low_cpu_mem_usage=True
-    )
-    # Load LoRA adapter if exists
-    try:
-        model = PeftModel.from_pretrained(base_model, MODEL_NAME, device_map="auto")
-    except:
-        model = base_model
-except Exception as e:
-    print("Error loading model:", e)
-    raise e
+    model = PeftModel.from_pretrained(base_model, MODEL_NAME, device_map=None)
+except:
+    model = base_model
 
 model.eval()
 
@@ -34,7 +30,7 @@ model.eval()
 # Response function
 # -----------------------------
 def respond(message, history, system_message, max_tokens, temperature, top_p):
-    # Limit max tokens for safety
+    # Limit max tokens for CPU safety
     if max_tokens > 128:
        max_tokens = 128
 
@@ -44,7 +40,7 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         prompt += f"User: {h['content']}\n"
     prompt += f"User: {message}\nBot:"
 
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    inputs = tokenizer(prompt, return_tensors="pt")
 
     gen_config = GenerationConfig(
         max_new_tokens=max_tokens,
@@ -66,7 +62,7 @@ chatbot = gr.ChatInterface(
     type="messages",
     additional_inputs=[
         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=64, step=1, label="Max new tokens"),
+        gr.Slider(minimum=1, maximum=128, value=64, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
     ],
@@ -77,3 +73,4 @@ with gr.Blocks() as demo:
 
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)
+
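
The hunks above stop inside respond(), before the model call that consumes gen_config, so the generation step itself is not shown in this commit. As a minimal sketch only, assuming the inputs and gen_config objects visible in the diff, the elided step could look roughly like this on CPU; the helper name _generate_reply and anything else not shown in the hunks is illustrative, not taken from the commit.

import torch

# Hypothetical sketch (not part of the commit): finishes a respond()-style call
# using the tokenized `inputs` and the `gen_config` built in the diff above.
def _generate_reply(model, tokenizer, inputs, gen_config):
    # Inference only, so no gradient tracking is needed on CPU.
    with torch.no_grad():
        output_ids = model.generate(**inputs, generation_config=gen_config)
    # Drop the prompt tokens so only the newly generated reply is decoded.
    new_tokens = output_ids[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

A call such as reply = _generate_reply(model, tokenizer, inputs, gen_config) at the end of respond() would return only the bot's new text rather than echoing the prompt; together with the 128-token cap introduced in the diff, that bounds how much work each request does on CPU-only hardware.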