Spaces:
Running
Running
File size: 4,670 Bytes
6317d4b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
import base64
import json
import os
import threading
import time
from queue import Empty, Full, Queue
from typing import Iterator, Tuple

import numpy as np
import websocket
# ========== CONFIG ==========
# Credentials and voice selection are read from the environment once, at import time.
ELEVEN_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
REALTIME_VOICE_ID = os.getenv("ELEVEN_REALTIME_VOICE_ID")  # MUST be a Realtime voice
REALTIME_MODEL_ID = os.getenv("ELEVEN_REALTIME_MODEL_ID", "eleven_multilingual_v2")
# Sample rate (Hz) reported to consumers alongside every PCM chunk. It must equal
# the rate of the audio actually requested from the API (the
# ``output_format=pcm_<rate>`` query parameter used when connecting). The previous
# value of 44100 disagreed with the 24 kHz stream being requested, which makes
# playback too fast and too high-pitched.
REALTIME_SAMPLE_RATE = 24000  # PCM 24 kHz audio
# ========== REALTIME CLIENT ==========
class ElevenLabsRealtimeTTS:
    """Low-latency ElevenLabs realtime narration that yields raw PCM audio.

    A websocket to the ElevenLabs ``stream-input`` endpoint is opened lazily on
    a daemon thread. Audio received on that thread is converted to ``int16``
    NumPy arrays and handed to the caller through a bounded queue.

    The PCM rate requested from the API is taken from the module-level
    ``REALTIME_SAMPLE_RATE``, which is also the rate reported by
    ``stream_text``, so the label always matches the audio data.
    """

    def __init__(self):
        self.ws: websocket.WebSocketApp | None = None
        self._ws_thread: threading.Thread | None = None
        # Set while the websocket is open; cleared again when it closes.
        self._connected = threading.Event()
        # Bounded so a stalled consumer cannot grow memory without limit.
        self._queue: Queue[np.ndarray] = Queue(maxsize=256)
        self._lock = threading.Lock()
        # Odd trailing byte of the last payload, waiting for its second half.
        self._pcm_tail = b""
        self.running = False

    # ---------- WebSocket Connection ----------
    def connect(self):
        """Open the streaming websocket on a background daemon thread.

        Returns immediately; ``_connected`` is set once the socket is open.

        Raises:
            RuntimeError: if the API key or the voice id is not configured.
        """
        if not ELEVEN_API_KEY or not REALTIME_VOICE_ID:
            raise RuntimeError("Missing ELEVEN_LABS_API_KEY or ELEVEN_REALTIME_VOICE_ID")
        url = (
            "wss://api.elevenlabs.io/v1/text-to-speech/"
            f"{REALTIME_VOICE_ID}/stream-input?"
            f"model_id={REALTIME_MODEL_ID}"
            # Derived from the constant that stream_text() reports, so the
            # requested format and the advertised rate cannot drift apart.
            f"&output_format=pcm_{REALTIME_SAMPLE_RATE}"
            "&optimize_streaming_latency=3"
        )
        # Try passing key in headers (standard)
        headers = {
            "xi-api-key": ELEVEN_API_KEY,
            "Accept": "audio/wav",
            "Content-Type": "application/json",
        }
        print(f"Connecting to ElevenLabs Realtime... VoiceID: {REALTIME_VOICE_ID}, Model: {REALTIME_MODEL_ID}")
        # Start from a clean state so speak() waits for *this* socket to open.
        self._connected.clear()
        self._pcm_tail = b""
        self.ws = websocket.WebSocketApp(
            url,
            header=headers,
            on_open=self._on_open,
            on_message=self._on_message,
            on_close=self._on_close,
            on_error=self._on_error,
        )
        self._ws_thread = threading.Thread(target=self.ws.run_forever, daemon=True)
        # Mark as running *before* the thread starts: if the connection fails
        # immediately, _on_close must be able to reset the flag without this
        # method overwriting it afterwards.
        self.running = True
        self._ws_thread.start()

    # ---------- WebSocket Callbacks ----------
    def _on_open(self, ws):
        """Signal waiters that the socket is ready for sending."""
        self._connected.set()

    def _on_close(self, ws, *args):
        """Mark the client as disconnected so the next speak() reconnects."""
        self.running = False
        self._connected.clear()

    def _on_error(self, ws, error):
        """Report a websocket error; the client itself takes no other action."""
        print("⚠ ElevenLabs realtime error:", error)

    def _on_message(self, ws, message):
        """Extract PCM audio from an incoming frame and queue it.

        Two frame shapes are accepted:

        * text frames holding a JSON object whose ``audio`` field is
          base64-encoded PCM (the shape documented for the stream-input
          websocket); objects without audio are ignored;
        * binary frames, treated as raw PCM bytes.

        Malformed frames are skipped rather than raised, because an exception
        here would surface on the websocket thread instead of the caller.
        """
        if isinstance(message, (bytes, bytearray)):
            self._enqueue_pcm(bytes(message))
            return
        if not isinstance(message, str):
            return
        try:
            event = json.loads(message)
        except ValueError:
            return
        if not isinstance(event, dict):
            return
        encoded = event.get("audio")
        if not encoded:
            return  # metadata-only message (no audio payload)
        try:
            raw = base64.b64decode(encoded)
        except (ValueError, TypeError):
            return
        self._enqueue_pcm(raw)

    def _enqueue_pcm(self, raw: bytes) -> None:
        """Convert raw little-endian 16-bit PCM bytes to samples and queue them.

        A payload with an odd number of bytes ends in half a sample; that byte
        is kept and prepended to the next payload instead of failing the
        conversion. Chunks are dropped when the queue is full, because this
        runs on the websocket thread and must never block.
        """
        data = self._pcm_tail + raw
        usable = len(data) - (len(data) % 2)
        self._pcm_tail = data[usable:]
        if not usable:
            return
        samples = np.frombuffer(data[:usable], dtype=np.int16)
        try:
            self._queue.put_nowait(samples)
        except Full:
            pass  # Drop if queue full—we never block here.

    # ---------- Send Text ----------
    def speak(self, text: str):
        """Send ``text`` for synthesis, connecting first if necessary.

        Empty text is a no-op. Audio left over from a previous request is
        discarded before the new text is sent.

        Raises:
            RuntimeError: if the configuration is incomplete or the websocket
                does not open within 5 seconds.
        """
        if not text:
            return
        # The lock is not reentrant, so the two guarded sections below are
        # deliberately sequential and never nested.
        with self._lock:
            if not self.running or not self.ws:
                self.connect()
        if not self._connected.wait(timeout=5):
            raise RuntimeError("Failed to open ElevenLabs realtime websocket")
        payload = {
            "text": text,
            "voice_settings": {"stability": 0.5, "similarity_boost": 0.8, "use_speaker_boost": False},
            "generation_config": {
                "chunk_length_schedule": [120, 160, 250, 290]
            },
            "try_trigger_generation": True,
        }
        # Clear queue of any stale audio from previous runs
        with self._lock:
            while True:
                try:
                    self._queue.get_nowait()
                except Empty:
                    break
        self.ws.send(json.dumps(payload))

    # ---------- Stream PCM Audio ----------
    def stream_text(
        self, text: str, idle_timeout: float = 0.5, first_chunk_timeout: float = 5.0
    ) -> Iterator[Tuple[int, np.ndarray]]:
        """Speak ``text`` and yield ``(sample_rate, pcm_chunk)`` as audio arrives.

        Args:
            text: Text to synthesize.
            idle_timeout: Seconds without new audio, once audio has started,
                after which the stream is considered finished.
            first_chunk_timeout: Seconds to wait for the first chunk. It is
                separate from ``idle_timeout`` because time-to-first-audio is
                normally much longer than the gap between chunks; the larger
                of the two values is used.

        Yields:
            ``(REALTIME_SAMPLE_RATE, chunk)`` where ``chunk`` is an ``int16``
            array. No silence is ever injected.

        Raises:
            RuntimeError: propagated from ``speak``.
        """
        self.speak(text)
        # Monotonic clock: wall-clock adjustments must not affect the timeouts.
        last_received = time.monotonic()
        got_audio = False
        while True:
            try:
                chunk = self._queue.get(timeout=0.1)
            except Empty:
                # Only stop on a closed connection once everything that was
                # already received has been handed to the caller.
                if not self.running:
                    break
                limit = idle_timeout if got_audio else max(idle_timeout, first_chunk_timeout)
                if time.monotonic() - last_received > limit:
                    break
                continue  # DO NOT inject silence; just wait
            got_audio = True
            last_received = time.monotonic()
            yield REALTIME_SAMPLE_RATE, chunk
|