# (removed: non-Python web-scrape residue — file-size banner, git short hashes,
#  and a copied line-number gutter — which made this file invalid Python)
# app.py
import gradio as gr
import torch
import soundfile as sf
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from snac import SNAC

# -----------------------------
# CONFIG
# -----------------------------
MODEL_NAME = "rahul7star/nava1.0"    # base causal-LM checkpoint (HF hub id)
LORA_NAME = "rahul7star/nava-audio"  # LoRA adapter applied on top of the base model
# Generation cap passed as max_new_tokens.
# NOTE(review): 240000 new tokens is extremely large for one utterance — confirm intended.
SEQ_LEN = 240000
# Output WAV sample rate in Hz. SNAC vocoders decode 24 kHz audio; the previous
# value 240000 (240 kHz) made the written file play roughly 10x too fast.
TARGET_SR = 24000
OUT_ROOT = Path("/tmp/data")         # directory where generated WAV files are written
OUT_ROOT.mkdir(parents=True, exist_ok=True)

DEFAULT_TEXT = (
    "राजनीतिज्ञों ने कहा कि उन्होंने निर्णायक मत को अनावश्यक रूप से "
    "निर्धारित करने के लिए अफ़गान संविधान में काफी अस्पष्टता पाई थी"
)

# -----------------------------
# GENERATE AUDIO (LoRA)
# -----------------------------
def generate_audio_cpu_lora(text: str):
    """Generate speech for *text* on CPU via the base LM + LoRA adapter.

    Pipeline: build a special-token prompt, sample audio-codec tokens from the
    language model, regroup them into 3-level SNAC codes (7 tokens per frame),
    then decode the codes to a waveform and write it as a WAV file.

    Returns:
        tuple: ``(audio_path, audio_path, logs)`` — the file path twice (fed to
        both the Audio player and the File download component) plus the joined
        log text, or ``(None, None, logs)`` if anything fails.
    """
    logs = []
    try:
        DEVICE_CPU = "cpu"
        logs.append(f"[i] Input text: {text}")
        print(text)

        # Loaded fresh on every call — simple but slow; caching these at
        # module level would avoid repeated hub downloads.
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
        base_model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            device_map={"": DEVICE_CPU},
            torch_dtype=torch.float32,
            trust_remote_code=True
        )
        logs.append("[i] Base model loaded")

        # Attach the LoRA adapter and switch to inference mode.
        model = PeftModel.from_pretrained(base_model, LORA_NAME, device_map={"": DEVICE_CPU})
        model.eval()
        logs.append("[i] LoRA adapter attached")

        # Model-specific control tokens framing the prompt:
        # start/end-of-human, start-of-ai, start-of-speech, end-of-text.
        # NOTE(review): the hard-coded IDs assume this tokenizer's vocab layout — confirm.
        soh_token = tokenizer.decode([128259])
        eoh_token = tokenizer.decode([128260])
        soa_token = tokenizer.decode([128261])
        sos_token = tokenizer.decode([128257])
        eot_token = tokenizer.decode([128009])
        bos_token = tokenizer.bos_token
        prompt = soh_token + bos_token + text + eot_token + eoh_token + soa_token + sos_token

        inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE_CPU)

        # Sample codec tokens; 128258 is the end-of-speech id.
        with torch.inference_mode():
            outputs = model.generate(
                **inputs,
                max_new_tokens=SEQ_LEN,
                temperature=0.4,
                top_p=0.9,
                repetition_penalty=1.1,
                do_sample=True,
                eos_token_id=128258,
                pad_token_id=tokenizer.pad_token_id
            )
        # Keep only the newly generated tail (drop the prompt tokens).
        generated_ids = outputs[0, inputs['input_ids'].shape[1]:].tolist()
        logs.append(f"[i] Generated {len(generated_ids)} tokens")

        # Keep tokens in the SNAC codebook range, up to the first end-of-speech.
        snac_min, snac_max = 128266, 156937
        eos_id = 128258
        try:
            eos_idx = generated_ids.index(eos_id)
        except ValueError:
            eos_idx = len(generated_ids)
        snac_tokens = [t for t in generated_ids[:eos_idx] if snac_min <= t <= snac_max]
        if not snac_tokens:
            # Fail early with a clear message instead of crashing inside the decoder.
            raise RuntimeError("Model produced no SNAC audio tokens for this input")

        # De-interleave: each 7-token frame holds 1 coarse, 2 mid, 4 fine codes.
        l1, l2, l3 = [], [], []
        frames = len(snac_tokens) // 7
        snac_tokens = snac_tokens[:frames*7]
        for i in range(frames):
            slots = snac_tokens[i*7:(i+1)*7]
            l1.append((slots[0]-128266)%4096)
            l2.extend([(slots[1]-128266)%4096, (slots[4]-128266)%4096])
            l3.extend([(slots[2]-128266)%4096, (slots[3]-128266)%4096, (slots[5]-128266)%4096, (slots[6]-128266)%4096])
        logs.append(f"[i] Decoded {frames} SNAC frames")

        # Decode codes to a waveform with the SNAC vocoder.
        snac_model = SNAC.from_pretrained("rahul7star/nava-snac").eval().to(DEVICE_CPU)
        codes_tensor = [torch.tensor(level, dtype=torch.long, device=DEVICE_CPU).unsqueeze(0) for level in [l1, l2, l3]]
        with torch.inference_mode():
            z_q = snac_model.quantizer.from_codes(codes_tensor)
            audio = snac_model.decoder(z_q)[0, 0].cpu().numpy()
        # Trim the decoder warm-up transient at the start, when long enough.
        if len(audio) > 2048:
            audio = audio[2048:]

        audio_path = OUT_ROOT / "tts_output_cpu_lora.wav"
        sf.write(audio_path, audio, TARGET_SR)
        logs.append(f"[✓] Wrote {audio_path}")

        return str(audio_path), str(audio_path), "\n".join(logs)

    except Exception as e:
        import traceback
        logs.append(f"[❌] CPU LoRA TTS error: {e}\n{traceback.format_exc()}")
        print(e)
        return None, None, "\n".join(logs)

# -----------------------------
# GRADIO UI
# -----------------------------
# Gradio UI: component creation order inside the Blocks context defines the
# top-to-bottom page layout.
with gr.Blocks() as demo:
    gr.Markdown("# Maya LoRA TTS (CPU)- 10 mins gen time else switch to GPU ")
    gr.Markdown("# Full Credit to Maya Team members ")

    # Input text (pre-filled with the Hindi default sentence)
    input_text = gr.Textbox(label="Enter text", lines=2, value=DEFAULT_TEXT)

    # Generate button
    run_button = gr.Button("🔊 Generate Audio")

    # Outputs: playable audio, downloadable file, and pipeline logs.
    audio_output = gr.Audio(label="Play Generated Audio", type="filepath")
    download_output = gr.File(label="Download Audio")
    logs_output = gr.Textbox(label="Logs", lines=12)

    # Wire the button to the TTS pipeline; the function returns
    # (path, path, logs) matching the three outputs in order.
    run_button.click(
        fn=generate_audio_cpu_lora,
        inputs=[input_text],
        outputs=[audio_output, download_output, logs_output]
    )

    # -----------------------------
    # Example section (static, non-interactive showcase)
    # -----------------------------
    gr.Markdown("### Example")
    example_text = DEFAULT_TEXT
    # NOTE(review): relative path — assumes audio.wav ships alongside app.py; confirm it exists.
    example_audio_path = "audio.wav" 

    gr.Textbox(label="Example Text", value=example_text, lines=2, interactive=False)
    gr.Audio(label="Example Audio", value=example_audio_path, type="filepath", interactive=False)

# Script entry point: start the Gradio server only when run directly.
if __name__ == "__main__":
    demo.launch()