kokoro / examples /phoneme_example.py
0xWerz
init
1b6c34a
from kokoro import KPipeline, KModel
import torch
from scipy.io import wavfile
def save_audio(audio: torch.Tensor, filename: str) -> None:
    """Write an audio tensor to ``filename`` as a 24 kHz WAV file.

    Args:
        audio: Tensor holding the samples to write, or ``None`` when nothing
            was generated (in which case only a notice is printed).
        filename: Destination path handed to ``scipy.io.wavfile.write``.

    A status line is printed to stdout in both cases.
    """
    if audio is None:
        print("No audio was generated")
        return

    # ``Tensor.numpy()`` refuses tensors that are still attached to an
    # autograd graph, so detach first; then make sure the data lives on CPU.
    samples = audio.detach().cpu().numpy()

    # Save using scipy.io.wavfile
    wavfile.write(
        filename,
        24000,  # Kokoro uses 24kHz sample rate
        samples,
    )
    print(f"Audio saved as '{filename}'")
def main():
    """Run two demonstrations of ``KPipeline.generate_from_tokens``.

    Example 1 passes a hand-written phoneme string and saves the audio of the
    first result as ``phoneme_output_new.wav``.

    Example 2 runs ``pipeline.g2p`` on plain text, passes the resulting
    tokens, prints per-token timestamps when they are present, and saves each
    result as ``token_output_<index>.wav``.

    Any exception raised by either example is caught and reported on stdout
    rather than propagated.
    """
    # Initialize pipeline with American English
    pipeline = KPipeline(lang_code='a')

    # The phoneme string for:
    # "How are you today? I am doing reasonably well, thank you for asking"
    phonemes = "hˌW ɑɹ ju tədˈA? ˌI ɐm dˈuɪŋ ɹˈizənəbli wˈɛl, θˈæŋk ju fɔɹ ˈæskɪŋ"

    try:
        print("\nExample 1: Using generate_from_tokens with raw phonemes")
        results = list(pipeline.generate_from_tokens(
            tokens=phonemes,
            voice="af_bella",
            speed=1.0
        ))
        if results:
            save_audio(results[0].audio, 'phoneme_output_new.wav')

        # Example 2: Using generate_from_tokens with pre-processed tokens
        print("\nExample 2: Using generate_from_tokens with pre-processed tokens")
        # get the tokens through G2P or any other method
        text = "How are you today? I am doing reasonably well, thank you for asking"
        _, tokens = pipeline.g2p(text)

        # Then generate from tokens
        generated = pipeline.generate_from_tokens(
            tokens=tokens,
            voice="af_bella",
            speed=1.0
        )
        for index, result in enumerate(generated):
            # Each result may contain timestamps if available
            for token in result.tokens or ():
                # The attributes may exist but be unset (presumably None when
                # no timing was computed — confirm against the token type);
                # formatting None with ':.2f' would raise, so test the values
                # rather than mere attribute presence.
                start_ts = getattr(token, 'start_ts', None)
                end_ts = getattr(token, 'end_ts', None)
                if start_ts is not None and end_ts is not None:
                    print(f"Token: {token.text} ({start_ts:.2f}s - {end_ts:.2f}s)")

            # Name files by chunk index: str hashes are salted per process,
            # so hash()-based names would differ on every run.
            save_audio(result.audio, f'token_output_{index}.wav')
    except Exception as e:
        # Top-level boundary of an example script: report and exit normally.
        print(f"An error occurred: {str(e)}")
# Run the demo only when executed as a script, not when imported.
if "__main__" == __name__:
    main()