# EastSync-AI / third_party_tools / text_to_audio_stream.py
import os
import time
import wave
from pathlib import Path
from typing import Iterator, Tuple
import numpy as np
from elevenlabs.client import ElevenLabs
from .eleven_labs_realtime_tts import (
ElevenLabsRealtimeTTS,
REALTIME_SAMPLE_RATE,
)

# Defaults for the ElevenLabs voice pipeline; the env vars below can override them.
SAMPLE_RATE = 24000  # matches the "pcm_24000" output format requested from the API
DEFAULT_VOICE_ID = "fjnwTZkKtQOJaYzGLa6n"
DEFAULT_MODEL_ID = "eleven_flash_v2"
LOCAL_AUDIO_ENV = "EASTSYNC_AUDIO_FILE"
USE_REALTIME_TTS = False  # flip to route narration through the realtime TTS client
EFFECTIVE_SAMPLE_RATE = REALTIME_SAMPLE_RATE if USE_REALTIME_TTS else SAMPLE_RATE

api_key = os.getenv("ELEVEN_LABS_API_KEY")
voice_id = os.getenv("ELEVEN_LABS_VOICE_ID", DEFAULT_VOICE_ID)
model_id = os.getenv("ELEVEN_LABS_MODEL_ID", DEFAULT_MODEL_ID)
# Both clients are optional: without an API key this module yields no audio.
client = ElevenLabs(api_key=api_key) if api_key else None
realtime_client = ElevenLabsRealtimeTTS() if USE_REALTIME_TTS else None
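
# Configuration is read entirely from environment variables; for example
# (placeholder values, shell syntax assumed):
#
#     export ELEVEN_LABS_API_KEY="..."    # enables the ElevenLabs client
#     export ELEVEN_LABS_VOICE_ID="..."   # optional, overrides DEFAULT_VOICE_ID
#     export ELEVEN_LABS_MODEL_ID="..."   # optional, overrides DEFAULT_MODEL_ID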


def resample_chunk(chunk: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Resample an audio chunk from orig_sr to target_sr using linear interpolation."""
    if orig_sr == target_sr:
        return chunk
    # Work out how many samples the chunk should have at the target rate.
    duration = len(chunk) / orig_sr
    new_length = int(duration * target_sr)
    # Simple linear interpolation. The query points must end at len(chunk) - 1,
    # not len(chunk), so the last output sample maps onto the last input sample.
    return np.interp(
        np.linspace(0, len(chunk) - 1, new_length),
        np.arange(len(chunk)),
        chunk,
    ).astype(np.int16)
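
# A quick sanity check (hypothetical numbers): 100 ms of 48 kHz audio is 4800
# samples, so resampling to 24 kHz should yield 2400 samples.
#
#     chunk = np.zeros(4800, dtype=np.int16)
#     out = resample_chunk(chunk, orig_sr=48000, target_sr=24000)
#     assert len(out) == 2400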


def local_file_audio_stream(path: str, chunk_ms: int = 100) -> Iterator[Tuple[int, np.ndarray]]:
    """Stream a local 16-bit PCM WAV file as (sample_rate, mono chunk) tuples in real time."""
    wav_path = Path(path).expanduser().resolve()
    with wave.open(str(wav_path), "rb") as wf:
        # Relaxed check: only enforce 16-bit samples, since we cast to int16 below.
        if wf.getsampwidth() != 2:
            raise ValueError(
                f"Audio format mismatch. Expected 16-bit PCM, got {wf.getsampwidth() * 8}-bit."
            )
        sr = wf.getframerate()
        channels = wf.getnchannels()
        # chunk_ms is in milliseconds, so divide by 1000 to get frames per chunk.
        frames_per_chunk = int(sr * chunk_ms / 1000)
        while True:
            data = wf.readframes(frames_per_chunk)
            if not data:
                break
            audio_data = np.frombuffer(data, dtype=np.int16)
            if channels > 1:
                # Keep only the first channel to produce mono output.
                audio_data = audio_data.reshape(-1, channels)[:, 0]
            resampled_chunk = resample_chunk(audio_data, sr, EFFECTIVE_SAMPLE_RATE)
            yield EFFECTIVE_SAMPLE_RATE, resampled_chunk
            # Sleep one chunk's duration so the generator is paced in real time.
            time.sleep(chunk_ms / 1000)
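
# Example (hypothetical consumer): stream a prerecorded narration named by the
# EASTSYNC_AUDIO_FILE environment variable instead of calling the API.
#
#     audio_file = os.getenv(LOCAL_AUDIO_ENV)
#     if audio_file:
#         for sr, chunk in local_file_audio_stream(audio_file):
#             ...  # hand (sr, chunk) to the audio sink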


def _stream_with_elevenlabs(text: str) -> Iterator[np.ndarray]:
    """Yield int16 PCM chunks for the given text from the ElevenLabs streaming API."""
    if not client:
        return
    audio_stream = client.text_to_speech.stream(
        text=text,
        voice_id=voice_id,
        model_id=model_id,
        output_format="pcm_24000",
        optimize_streaming_latency=0,
        request_options={
            "chunk_size": 120_000,
        },
    )
    for chunk in audio_stream:
        # Skip empty chunks before decoding the raw PCM bytes.
        if not chunk:
            continue
        yield np.frombuffer(chunk, dtype=np.int16)


def _collect_text(text_stream: Iterator[str]) -> str:
    """Drain the incoming text stream and join the fragments into one narration string."""
    parts: list[str] = []
    for new_text in text_stream:
        if new_text:
            parts.append(new_text)
    return "".join(parts).strip()


def text_to_audio_stream(text_stream: Iterator[str]) -> Iterator[Tuple[int, np.ndarray]]:
    """Turn a stream of text fragments into (sample_rate, int16 chunk) audio tuples."""
    narration = _collect_text(text_stream)
    if not narration:
        return
    # Prefer the realtime client when enabled; it yields (sample_rate, chunk) tuples itself.
    if realtime_client:
        yield from realtime_client.stream_text(narration)
        return
    if not client:
        return
    for chunk in _stream_with_elevenlabs(narration):
        yield EFFECTIVE_SAMPLE_RATE, chunk
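

# Minimal smoke test, assuming ELEVEN_LABS_API_KEY is set; the output filename is
# illustrative only. Renders one line of narration and writes it to a WAV file.
# Without an API key, text_to_audio_stream yields nothing and no file is written.
if __name__ == "__main__":
    rendered = [chunk for _, chunk in text_to_audio_stream(iter(["Hello from EastSync."]))]
    if rendered:
        with wave.open("narration_demo.wav", "wb") as out:
            out.setnchannels(1)
            out.setsampwidth(2)  # 16-bit PCM
            out.setframerate(EFFECTIVE_SAMPLE_RATE)
            out.writeframes(np.concatenate(rendered).tobytes())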