Smllm2

Sleeping

App Files Community

ghosthets committed 15 days ago

Commit

48cf71a

verified ·

1 Parent(s): 620d411

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -14

app.py CHANGED Viewed

@@ -1,38 +1,47 @@
 import flask
 from flask import request, jsonify
-from transformers import pipeline
 import torch
-import warnings # warning suppress करने के लिए
-# warnings को suppress करें, वर्ना CPU पर warnings आ सकती हैं
 warnings.filterwarnings("ignore")
 app = flask.Flask(__name__)
 # ===========================
-# LOAD MODEL (StableLM-3B-Chat)
 # ===========================
 model_id = "HuggingFaceTB/SmolLM-1.7B"
 print("🔄 Loading model...")
 # CPU/GPU device set
-# हम CPU पर लोड करते समय 'torch.bfloat16' का उपयोग करके मेमोरी को कम करने की कोशिश करेंगे।
 device = 0 if torch.cuda.is_available() else -1
-dtype = torch.float32 if device == -1 else torch.bfloat16 # CPU के लिए float32
 try:
     ai = pipeline(
         "text-generation",
         model=model_id,
         max_new_tokens=200,
         device=device,
-        torch_dtype=dtype, # CPU/Memory optimization
-        trust_remote_code=True # StableLM के लिए आवश्यक
     )
     print("✅ Model loaded!")
 except Exception as e:
     print(f"❌ Error loading model: {e}")
-    ai = None # If load fails, prevent later API errors
 # ===========================
 # CHAT API
@@ -48,14 +57,24 @@ def chat():
         if not msg:
             return jsonify({"error": "No message sent"}), 400
-        # StableLM Instruction Format:
-        prompt = f"<|user|>\n{msg}<|end|>\n<|assistant|>"
         output = ai(prompt)[0]["generated_text"]
-        # Output को clean करें ताकि सिर्फ assistant का जवाब मिले
-        reply = output.split("<|assistant|>")[-1].strip()
         return jsonify({"reply": reply})
     except Exception as e:
         return jsonify({"error": str(e)}), 500

 import flask
 from flask import request, jsonify
+from transformers import pipeline, AutoTokenizer # Added AutoTokenizer
 import torch
+import warnings
+# Suppress minor warnings that occur on CPU runs
 warnings.filterwarnings("ignore")
 app = flask.Flask(__name__)
 # ===========================
+# LOAD MODEL (SmolLM-1.7B-Chat)
+# This model is small (1.7B) and fully open-access.
 # ===========================
 model_id = "HuggingFaceTB/SmolLM-1.7B"
 print("🔄 Loading model...")
 # CPU/GPU device set
 device = 0 if torch.cuda.is_available() else -1
+# Use float32 for CPU (or bfloat16 for GPU)
+dtype = torch.float32 if device == -1 else torch.bfloat16
 try:
+    # 1. Load Tokenizer and set pad_token for stability
+    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    if tokenizer.pad_token is None:
+        # Set pad_token to eos_token to fix generation warning/error
+        tokenizer.pad_token = tokenizer.eos_token
+    # 2. Load Pipeline with the fixed tokenizer
     ai = pipeline(
         "text-generation",
         model=model_id,
+        tokenizer=tokenizer, # Passing the configured tokenizer here
         max_new_tokens=200,
         device=device,
+        torch_dtype=dtype,
+        trust_remote_code=True
     )
     print("✅ Model loaded!")
 except Exception as e:
     print(f"❌ Error loading model: {e}")
+    ai = None
 # ===========================
 # CHAT API
         if not msg:
             return jsonify({"error": "No message sent"}), 400
+        # Instruction Format: Using a simple template for this model
+        prompt = f"User: {msg}\nAssistant:"
         output = ai(prompt)[0]["generated_text"]
+        # Clean the output to extract only the model's reply
+        # We split based on the 'Assistant:' tag in the prompt template
+        if "Assistant:" in output:
+             reply = output.split("Assistant:")[-1].strip()
+        elif "User:" in output: # Sometimes the model repeats the prompt
+             reply = output.split("User:")[0].strip()
+        else:
+             reply = output.strip()
+        # Remove any remaining instruction markers from the start
+        if reply.startswith(msg):
+            reply = reply[len(msg):].strip()
         return jsonify({"reply": reply})
     except Exception as e:
         return jsonify({"error": str(e)}), 500