"""Demonstrate Kokoro's KPipeline.generate_from_tokens, first with a raw phoneme
string and then with pre-processed tokens from the pipeline's G2P step."""

from kokoro import KPipeline
from scipy.io import wavfile
import torch

def save_audio(audio: torch.Tensor, filename: str):
    """Save an audio tensor as a WAV file."""
    if audio is not None:
        # Move the tensor to the CPU and convert it to a NumPy array for scipy.
        audio_cpu = audio.cpu().numpy()
        # Kokoro outputs audio at a 24 kHz sample rate.
        wavfile.write(filename, 24000, audio_cpu)
        print(f"Audio saved as '{filename}'")
    else:
        print("No audio was generated")

def main():
    # 'a' selects the American English pipeline.
    pipeline = KPipeline(lang_code='a')

    # Raw phoneme string (Kokoro/misaki phoneme alphabet) for:
    # "How are you today? I am doing reasonably well, thank you for asking"
    phonemes = "hˌW ɑɹ ju tədˈA? ˌI ɐm dˈuɪŋ ɹˈizənəbli wˈɛl, θˈæŋk ju fɔɹ ˈæskɪŋ"

    try:
        print("\nExample 1: Using generate_from_tokens with raw phonemes")
        results = list(pipeline.generate_from_tokens(
            tokens=phonemes,
            voice="af_bella",
            speed=1.0
        ))
        if results:
            save_audio(results[0].audio, 'phoneme_output_new.wav')

        print("\nExample 2: Using generate_from_tokens with pre-processed tokens")
        text = "How are you today? I am doing reasonably well, thank you for asking"
        # Run the pipeline's G2P step to get the phoneme string and token list.
        _, tokens = pipeline.g2p(text)

        for result in pipeline.generate_from_tokens(
            tokens=tokens,
            voice="af_bella",
            speed=1.0
        ):
            # Print per-token timestamps where the model provides them
            # (start_ts/end_ts can be None, e.g. for punctuation).
            if result.tokens:
                for token in result.tokens:
                    if getattr(token, 'start_ts', None) is not None and getattr(token, 'end_ts', None) is not None:
                        print(f"Token: {token.text} ({token.start_ts:.2f}s - {token.end_ts:.2f}s)")
            save_audio(result.audio, f'token_output_{hash(result.phonemes)}.wav')

    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    main()
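
# For comparison, a hedged sketch of the simpler text-in path: calling the
# pipeline directly on plain text, which runs G2P internally. 'af_heart' is an
# assumed voice name here; any installed Kokoro voice should work. Left
# commented out so the phoneme/token examples above remain the focus.
#
#     pipeline = KPipeline(lang_code='a')
#     for i, result in enumerate(pipeline("Hello there!", voice='af_heart', speed=1.0)):
#         save_audio(result.audio, f'text_output_{i}.wav')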