bezzam HF Staff committed on
Commit
0de3b6e
·
verified ·
1 Parent(s): 3940b49

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -22
app.py CHANGED
@@ -1,6 +1,10 @@
1
  import gradio as gr
2
  from huggingface_hub import InferenceClient
3
 
 
 
 
 
4
 
5
  def respond(
6
  message,
@@ -9,41 +13,32 @@ def respond(
9
  max_tokens,
10
  temperature,
11
  top_p,
12
- hf_token: gr.OAuthToken,
13
  ):
14
  """
15
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 
16
  """
17
- # client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b") # bills personal account
18
- client = InferenceClient(model="meta-llama/Llama-3.1-8B-Instruct") # models that HF hosts natively can be used without token
19
-
20
  messages = [{"role": "system", "content": system_message}]
21
-
22
  messages.extend(history)
23
-
24
  messages.append({"role": "user", "content": message})
25
 
26
  response = ""
27
-
28
- for message in client.chat_completion(
29
- messages,
30
  max_tokens=max_tokens,
31
- stream=True,
32
  temperature=temperature,
33
  top_p=top_p,
 
34
  ):
35
- choices = message.choices
36
- token = ""
37
- if len(choices) and choices[0].delta.content:
38
- token = choices[0].delta.content
39
-
40
- response += token
41
- yield response
42
 
43
 
44
- """
45
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
46
- """
47
  chatbot = gr.ChatInterface(
48
  respond,
49
  type="messages",
@@ -61,11 +56,11 @@ chatbot = gr.ChatInterface(
61
  ],
62
  )
63
 
 
64
  with gr.Blocks() as demo:
65
  with gr.Sidebar():
66
  gr.LoginButton()
67
  chatbot.render()
68
 
69
-
70
  if __name__ == "__main__":
71
  demo.launch()
 
1
  import gradio as gr
2
  from huggingface_hub import InferenceClient
3
 
4
+ # Initialize the client for a HF-hosted model
5
+ # No token needed when running inside a Space owned by a Team org
6
+ client = InferenceClient(model="meta-llama/Llama-3.1-8B-Instruct")
7
+
8
 
9
  def respond(
10
  message,
 
13
  max_tokens,
14
  temperature,
15
  top_p,
 
16
  ):
17
  """
18
+ Generate responses using HF-hosted Llama 3.1 model.
19
+ This version avoids Novita/Groq routing and does not require tokens.
20
  """
 
 
 
21
  messages = [{"role": "system", "content": system_message}]
 
22
  messages.extend(history)
 
23
  messages.append({"role": "user", "content": message})
24
 
25
  response = ""
26
+ # Stream responses using the new chat.completions API
27
+ for message_chunk in client.chat.completions.create(
28
+ messages=messages,
29
  max_tokens=max_tokens,
 
30
  temperature=temperature,
31
  top_p=top_p,
32
+ stream=True
33
  ):
34
+ delta = message_chunk.choices[0].delta
35
+ if delta and delta.content:
36
+ token = delta.content
37
+ response += token
38
+ yield response
 
 
39
 
40
 
41
+ # Define the Gradio Chat Interface
 
 
42
  chatbot = gr.ChatInterface(
43
  respond,
44
  type="messages",
 
56
  ],
57
  )
58
 
59
+ # Build the Gradio Blocks interface with optional login button
60
  with gr.Blocks() as demo:
61
  with gr.Sidebar():
62
  gr.LoginButton()
63
  chatbot.render()
64
 
 
65
  if __name__ == "__main__":
66
  demo.launch()