rahul7star committed on
Commit
822ab8c
·
verified ·
1 Parent(s): 702232e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -13
app.py CHANGED
@@ -29,8 +29,9 @@ def generate_audio_cpu_lora(text: str):
29
  logs = []
30
  try:
31
  DEVICE_CPU = "cpu"
 
32
 
33
- # Load tokenizer and base model
34
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
35
  base_model = AutoModelForCausalLM.from_pretrained(
36
  MODEL_NAME,
@@ -38,14 +39,14 @@ def generate_audio_cpu_lora(text: str):
38
  torch_dtype=torch.float32,
39
  trust_remote_code=True
40
  )
41
- logs.append("✅ Loaded base Maya model")
42
 
43
- # Load LoRA adapter
44
  model = PeftModel.from_pretrained(base_model, LORA_NAME, device_map={"": DEVICE_CPU})
45
  model.eval()
46
- logs.append(f"✅ Applied LoRA adapter from {LORA_NAME}")
47
 
48
- # Build prompt
49
  soh_token = tokenizer.decode([128259])
50
  eoh_token = tokenizer.decode([128260])
51
  soa_token = tokenizer.decode([128261])
@@ -56,7 +57,7 @@ def generate_audio_cpu_lora(text: str):
56
 
57
  inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE_CPU)
58
 
59
- # Generate tokens
60
  with torch.inference_mode():
61
  outputs = model.generate(
62
  **inputs,
@@ -69,9 +70,9 @@ def generate_audio_cpu_lora(text: str):
69
  pad_token_id=tokenizer.pad_token_id
70
  )
71
  generated_ids = outputs[0, inputs['input_ids'].shape[1]:].tolist()
72
- logs.append(f"✅ Generated {len(generated_ids)} token IDs")
73
 
74
- # Extract SNAC codes
75
  snac_min, snac_max = 128266, 156937
76
  eos_id = 128258
77
  try:
@@ -80,7 +81,7 @@ def generate_audio_cpu_lora(text: str):
80
  eos_idx = len(generated_ids)
81
  snac_tokens = [t for t in generated_ids[:eos_idx] if snac_min <= t <= snac_max]
82
 
83
- # Unpack 7-token SNAC frames
84
  l1, l2, l3 = [], [], []
85
  frames = len(snac_tokens) // 7
86
  snac_tokens = snac_tokens[:frames*7]
@@ -89,9 +90,8 @@ def generate_audio_cpu_lora(text: str):
89
  l1.append((slots[0]-128266)%4096)
90
  l2.extend([(slots[1]-128266)%4096, (slots[4]-128266)%4096])
91
  l3.extend([(slots[2]-128266)%4096, (slots[3]-128266)%4096, (slots[5]-128266)%4096, (slots[6]-128266)%4096])
92
- logs.append(f"✅ Unpacked to {len(l1)} L1 frames, {len(l2)} L2 codes, {len(l3)} L3 codes")
93
-
94
- # SNAC decoder
95
  snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(DEVICE_CPU)
96
  codes_tensor = [torch.tensor(level, dtype=torch.long, device=DEVICE_CPU).unsqueeze(0) for level in [l1,l2,l3]]
97
  with torch.inference_mode():
@@ -102,7 +102,7 @@ def generate_audio_cpu_lora(text: str):
102
 
103
  audio_path = OUT_ROOT / "tts_output_cpu_lora.wav"
104
  sf.write(audio_path, audio, TARGET_SR)
105
- logs.append(f"✅ Audio saved: {audio_path}, duration: {len(audio)/TARGET_SR:.2f}s")
106
 
107
  return str(audio_path), str(audio_path), "\n".join(logs)
108
 
 
29
  logs = []
30
  try:
31
  DEVICE_CPU = "cpu"
32
+ print(text)
33
 
34
+
35
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
36
  base_model = AutoModelForCausalLM.from_pretrained(
37
  MODEL_NAME,
 
39
  torch_dtype=torch.float32,
40
  trust_remote_code=True
41
  )
42
+
43
 
44
+
45
  model = PeftModel.from_pretrained(base_model, LORA_NAME, device_map={"": DEVICE_CPU})
46
  model.eval()
47
+
48
 
49
+
50
  soh_token = tokenizer.decode([128259])
51
  eoh_token = tokenizer.decode([128260])
52
  soa_token = tokenizer.decode([128261])
 
57
 
58
  inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE_CPU)
59
 
60
+
61
  with torch.inference_mode():
62
  outputs = model.generate(
63
  **inputs,
 
70
  pad_token_id=tokenizer.pad_token_id
71
  )
72
  generated_ids = outputs[0, inputs['input_ids'].shape[1]:].tolist()
73
+
74
 
75
+
76
  snac_min, snac_max = 128266, 156937
77
  eos_id = 128258
78
  try:
 
81
  eos_idx = len(generated_ids)
82
  snac_tokens = [t for t in generated_ids[:eos_idx] if snac_min <= t <= snac_max]
83
 
84
+
85
  l1, l2, l3 = [], [], []
86
  frames = len(snac_tokens) // 7
87
  snac_tokens = snac_tokens[:frames*7]
 
90
  l1.append((slots[0]-128266)%4096)
91
  l2.extend([(slots[1]-128266)%4096, (slots[4]-128266)%4096])
92
  l3.extend([(slots[2]-128266)%4096, (slots[3]-128266)%4096, (slots[5]-128266)%4096, (slots[6]-128266)%4096])
93
+
94
+
 
95
  snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(DEVICE_CPU)
96
  codes_tensor = [torch.tensor(level, dtype=torch.long, device=DEVICE_CPU).unsqueeze(0) for level in [l1,l2,l3]]
97
  with torch.inference_mode():
 
102
 
103
  audio_path = OUT_ROOT / "tts_output_cpu_lora.wav"
104
  sf.write(audio_path, audio, TARGET_SR)
105
+
106
 
107
  return str(audio_path), str(audio_path), "\n".join(logs)
108