Spaces:

werz
/

kokoro

Sleeping

kokoro / examples /phoneme_example.py

0xWerz

init

1b6c34a 23 days ago

2.29 kB

	from kokoro import KPipeline, KModel
	import torch
	from scipy.io import wavfile

	def save_audio(audio: torch.Tensor, filename: str) -> None:
	    """Write an audio tensor to ``filename`` as a 24 kHz WAV file.

	    Args:
	        audio: Waveform samples to save. ``None`` is tolerated: nothing is
	            written and a notice is printed instead, so callers can pass a
	            result's audio straight through without checking it first.
	        filename: Destination path handed to ``scipy.io.wavfile.write``.
	    """
	    if audio is None:
	        print("No audio was generated")
	        return

	    # detach() first: Tensor.numpy() refuses tensors that still track
	    # gradients. The samples must also be on the CPU before conversion.
	    samples = audio.detach().cpu().numpy()

	    # Kokoro uses 24kHz sample rate
	    wavfile.write(filename, 24000, samples)
	    print(f"Audio saved as '{filename}'")

	def main():
	    """Run two ``generate_from_tokens`` demos and save the audio as WAV files.

	    Example 1 feeds a hand-written phoneme string to the pipeline and saves
	    the first result as ``phoneme_output_new.wav``.

	    Example 2 runs G2P on the equivalent text, generates from the resulting
	    tokens, prints per-token timestamps where they are set, and saves each
	    result as ``token_output_<index>.wav`` (index = position in the stream).

	    Any exception raised by either example is caught and printed rather than
	    propagated.
	    """
	    # Initialize pipeline with American English
	    pipeline = KPipeline(lang_code='a')

	    # The phoneme string for:
	    # "How are you today? I am doing reasonably well, thank you for asking"
	    phonemes = "hˌW ɑɹ ju tədˈA? ˌI ɐm dˈuɪŋ ɹˈizənəbli wˈɛl, θˈæŋk ju fɔɹ ˈæskɪŋ"

	    try:
	        print("\nExample 1: Using generate_from_tokens with raw phonemes")
	        results = list(pipeline.generate_from_tokens(
	            tokens=phonemes,
	            voice="af_bella",
	            speed=1.0,
	        ))
	        if results:
	            save_audio(results[0].audio, 'phoneme_output_new.wav')

	        # Example 2: Using generate_from_tokens with pre-processed tokens
	        print("\nExample 2: Using generate_from_tokens with pre-processed tokens")
	        # get the tokens through G2P or any other method
	        text = "How are you today? I am doing reasonably well, thank you for asking"
	        _, tokens = pipeline.g2p(text)

	        # Then generate from tokens
	        chunks = pipeline.generate_from_tokens(
	            tokens=tokens,
	            voice="af_bella",
	            speed=1.0,
	        )
	        for index, result in enumerate(chunks):
	            # Each result may contain timestamps if available
	            for token in result.tokens or ():
	                start_ts = getattr(token, 'start_ts', None)
	                end_ts = getattr(token, 'end_ts', None)
	                # Presumably the attributes can exist but be left unset
	                # (None); formatting None with ".2f" would raise TypeError,
	                # so such tokens are skipped. TODO confirm against the
	                # pipeline's token type.
	                if start_ts is None or end_ts is None:
	                    continue
	                print(f"Token: {token.text} ({start_ts:.2f}s - {end_ts:.2f}s)")
	            # Name files by position: the built-in hash() of a str is salted
	            # per process (and may be negative), so it gives unpredictable
	            # names and collides for chunks with identical phonemes.
	            save_audio(result.audio, f'token_output_{index}.wav')

	    except Exception as e:
	        print(f"An error occurred: {e}")

	# Script entry point: run the demo only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()