Commit 117486a by CSH-1220 (parent: 55f08a9)
File updates regarding memory-saving

Files changed:
- APadapter/ap_adapter/attention_processor.py +5 -2
- app.py +134 -28
- pipeline/morph_pipeline_successed_ver1.py +82 -17
- utils/lora_utils_successed_ver1.py +2 -0
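The memory-saving changes mostly follow one pattern: drop references to large tensors or models, run the Python garbage collector, and release cached CUDA blocks. A minimal sketch of that pattern, assuming a CUDA-capable PyTorch install (the helper name free_cuda_memory is illustrative; the commit inlines these calls in app.py and utils/lora_utils_successed_ver1.py rather than defining a helper):

import gc
import torch

def free_cuda_memory() -> None:
    # Reclaim Python objects whose references were just dropped,
    # then return cached CUDA blocks to the driver.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Typical use, mirroring the additions in this commit:
#   del waveform1, waveform2, _
#   free_cuda_memory()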
APadapter/ap_adapter/attention_processor.py CHANGED

@@ -309,7 +309,7 @@ class IPAttnProcessor2_0(torch.nn.Module):
             the weight scale of image prompt.
     """

-    def __init__(self, hidden_size, name, cross_attention_dim=None, num_tokens=4, scale=1.0, do_copy = False):
+    def __init__(self, hidden_size, name, flag = 'normal', cross_attention_dim=None, num_tokens=4, text_scale = 1.0 , scale=1.0, do_copy = False):
         super().__init__()

         if not hasattr(F, "scaled_dot_product_attention"):
@@ -320,10 +320,12 @@ class IPAttnProcessor2_0(torch.nn.Module):
         self.hidden_size = hidden_size
         self.cross_attention_dim = cross_attention_dim
         self.num_tokens = num_tokens
+        self.text_scale = text_scale
         self.scale = scale
         self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
         self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
         self.name = name
+        self.flag = flag
         # Below is for copying the weight of the original weight to the \
         if do_copy:
             print("do copy")
@@ -451,7 +453,8 @@ class IPAttnProcessor2_0(torch.nn.Module):
         ip_hidden_states = ip_hidden_states.to(query.dtype)
         # print("hidden_states",hidden_states)
         # print("ip_hidden_states",ip_hidden_states)
-
+        # print(f'{self.flag} Hello, I pass here!')
+        hidden_states = self.text_scale * hidden_states + self.scale * ip_hidden_states
         # print("ip_hidden_states",ip_hidden_states.shape)
         # linear proj
         hidden_states = attn.to_out[0](hidden_states)
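The behavioral change in IPAttnProcessor2_0 is the final mixing step: the text-conditioned attention output and the audio-prompt (IP-adapter) branch are now combined with separate weights. A standalone sketch of that combination (the function name blend_attention_outputs is illustrative, not part of the repo):

import torch

def blend_attention_outputs(hidden_states: torch.Tensor,
                            ip_hidden_states: torch.Tensor,
                            text_scale: float = 1.0,
                            scale: float = 1.0) -> torch.Tensor:
    # Same weighted sum the processor now applies before the output projection:
    # text branch scaled by text_scale, audio-prompt branch by scale.
    return text_scale * hidden_states + scale * ip_hidden_states

# Example on dummy activations (batch of 2, 8 tokens, 768 channels):
out = blend_attention_outputs(torch.randn(2, 8, 768), torch.randn(2, 8, 768))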
app.py CHANGED

@@ -1,26 +1,39 @@
 import os
+import gc
 import torch
+import shutil
+import atexit
 import torchaudio
 import numpy as np
 import gradio as gr
 from pipeline.morph_pipeline_successed_ver1 import AudioLDM2MorphPipeline
+os.environ["CUDA_VISIBLE_DEVICES"] = "6"
 # Initialize AudioLDM2 Pipeline
-
+torch.cuda.set_device(0)
+dtype = torch.float32
+pipeline = AudioLDM2MorphPipeline.from_pretrained("cvssp/audioldm2-large", torch_dtype=dtype)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 pipeline.to(device)

-
-def morph_audio(audio_file1, audio_file2, prompt1, prompt2, negative_prompt1="Low quality", negative_prompt2="Low quality"):
+
+def morph_audio(audio_file1, audio_file2, num_inference_steps, prompt1='', prompt2='', negative_prompt1="Low quality", negative_prompt2="Low quality"):
     save_lora_dir = "output"
+    if os.path.exists(save_lora_dir):
+        shutil.rmtree(save_lora_dir)
     os.makedirs(save_lora_dir, exist_ok=True)

     # Load audio and compute duration
-
-
-
+    waveform1, sample_rate1 = torchaudio.load(audio_file1)
+    duration1 = waveform1.shape[1] / sample_rate1
+    waveform2, sample_rate2 = torchaudio.load(audio_file2)
+    duration2 = waveform2.shape[1] / sample_rate2
+
+    # Compare durations and take the shorter one
+    duration = int(min(duration1, duration2))

     # Perform morphing using the pipeline
     _ = pipeline(
+        dtype = dtype,
         audio_file=audio_file1,
         audio_file2=audio_file2,
         audio_length_in_s=duration,
@@ -33,13 +46,13 @@ def morph_audio(audio_file1, audio_file2, prompt1, prompt2, negative_prompt1="Lo
         save_lora_dir=save_lora_dir,
         use_adain=True,
         use_reschedule=False,
-        num_inference_steps=
+        num_inference_steps=num_inference_steps,
         lamd=0.6,
         output_path=save_lora_dir,
         num_frames=5,
         fix_lora=None,
         use_lora=True,
-        lora_steps=
+        lora_steps=2,
         noisy_latent_with_lora=True,
         morphing_with_lora=True,
         use_morph_prompt=True,
@@ -51,32 +64,125 @@ def morph_audio(audio_file1, audio_file2, prompt1, prompt2, negative_prompt1="Lo
         [os.path.join(save_lora_dir, file) for file in os.listdir(save_lora_dir) if file.endswith(".wav")],
         key=lambda x: int(os.path.splitext(os.path.basename(x))[0])
     )
+    del waveform1, waveform2, _
+    torch.cuda.empty_cache()
+    gc.collect()
+
+
     return output_paths

+def morph_audio_with_morphing_factor(audio_file1, audio_file2, alpha, num_inference_steps, prompt1='', prompt2='', negative_prompt1="Low quality", negative_prompt2="Low quality"):
+    save_lora_dir = "output"
+    if os.path.exists(save_lora_dir):
+        shutil.rmtree(save_lora_dir)
+    os.makedirs(save_lora_dir, exist_ok=True)
+
+    # Load audio and compute duration
+    waveform1, sample_rate1 = torchaudio.load(audio_file1)
+    duration1 = waveform1.shape[1] / sample_rate1
+    waveform2, sample_rate2 = torchaudio.load(audio_file2)
+    duration2 = waveform2.shape[1] / sample_rate2
+
+    # Compare durations and take the shorter one
+    duration = int(min(duration1, duration2))
+    try:
+        # Perform morphing using the pipeline
+        _ = pipeline(
+            dtype = dtype,
+            morphing_factor = alpha,
+            audio_file=audio_file1,
+            audio_file2=audio_file2,
+            audio_length_in_s=duration,
+            time_pooling=2,
+            freq_pooling=2,
+            prompt_1=prompt1,
+            prompt_2=prompt2,
+            negative_prompt_1=negative_prompt1,
+            negative_prompt_2=negative_prompt2,
+            save_lora_dir=save_lora_dir,
+            use_adain=True,
+            use_reschedule=False,
+            num_inference_steps=num_inference_steps,
+            lamd=0.6,
+            output_path=save_lora_dir,
+            num_frames=5,
+            fix_lora=None,
+            use_lora=True,
+            lora_steps=2,
+            noisy_latent_with_lora=True,
+            morphing_with_lora=True,
+            use_morph_prompt=True,
+            guidance_scale=7.5,
+        )
+        output_paths = os.path.join(save_lora_dir, 'interpolated.wav')
+
+    except RuntimeError as e:
+        if "CUDA out of memory" in str(e):
+            print("CUDA out of memory. Releasing unused memory...")
+            torch.cuda.empty_cache()
+            gc.collect()
+        raise e
+    # # Collect the output file paths
+    # del waveform1, waveform2, _
+    # torch.cuda.empty_cache()
+    # gc.collect()
+
+    return output_paths
+
+def cleanup_output_dir():
+    save_lora_dir = "output"
+    if os.path.exists(save_lora_dir):
+        shutil.rmtree(save_lora_dir)
+        print(f"Cleaned up directory: {save_lora_dir}")
+atexit.register(cleanup_output_dir)

 # Gradio interface function
-def interface(audio1, audio2,
-    output_paths =
+def interface(audio1, audio2, alpha, num_inference_steps):
+    output_paths = morph_audio_with_morphing_factor(audio1, audio2, alpha, num_inference_steps)
     return output_paths

 # Gradio Interface
-demo = gr.Interface(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-)
+# demo = gr.Interface(
+#     fn=interface,
+#     inputs=[
+#         gr.Audio(label="Upload Audio File 1", type="filepath"),
+#         gr.Audio(label="Upload Audio File 2", type="filepath"),
+#         gr.Slider(0, 1, step=0.01, label="Interpolation Alpha"),
+#         gr.Slider(10, 50, step=1, label="Inference Steps"),
+#         # gr.Textbox(label="Prompt for Audio File 1"),
+#         # gr.Textbox(label="Prompt for Audio File 2"),
+#     ],
+#     outputs=gr.Audio(label="Interpolated Audio")
+# )
+
+
+with gr.Blocks() as demo:
+    with gr.Tab("Sound Morphing with fixed frames."):
+        gr.Markdown("### Upload two audio files for morphing")
+        with gr.Row():
+            audio1 = gr.Audio(label="Upload Audio File 1", type="filepath")
+            audio2 = gr.Audio(label="Upload Audio File 2", type="filepath")
+        num_inference_steps = gr.Slider(10, 50, step=1, label="Inference Steps", value=50)
+        outputs = [
+            gr.Audio(label="Morphing audio 1"),
+            gr.Audio(label="Morphing audio 2"),
+            gr.Audio(label="Morphing audio 3"),
+            gr.Audio(label="Morphing audio 4"),
+            gr.Audio(label="Morphing audio 5"),
+        ]
+        submit_btn1 = gr.Button("Submit")
+        submit_btn1.click(morph_audio, inputs=[audio1, audio2, num_inference_steps], outputs=outputs)
+
+    with gr.Tab("Sound Morphing with specified morphing factor."):
+        gr.Markdown("### Upload two audio files for morphing")
+        with gr.Row():
+            audio1 = gr.Audio(label="Upload Audio File 1", type="filepath")
+            audio2 = gr.Audio(label="Upload Audio File 2", type="filepath")
+        alpha = gr.Slider(0, 1, step=0.01, label="Interpolation Alpha")
+        num_inference_steps = gr.Slider(10, 50, step=1, label="Inference Steps", value=50)
+        outputs=gr.Audio(label="Interpolated Audio")
+        submit_btn2 = gr.Button("Submit")
+        submit_btn2.click(morph_audio_with_morphing_factor, inputs=[audio1, audio2, alpha, num_inference_steps], outputs=outputs)

 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(share=True)
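For context, the new single-alpha path can be exercised without the Gradio UI. A hypothetical direct call (the audio paths are placeholders; this assumes app.py has been imported so its module-level pipeline is already loaded):

from app import morph_audio_with_morphing_factor

result = morph_audio_with_morphing_factor(
    "sound_a.wav", "sound_b.wav",   # placeholder input paths
    alpha=0.5,                      # morphing factor: 0 keeps sound A, 1 keeps sound B
    num_inference_steps=50,
)
print(result)  # expected to be "output/interpolated.wav"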
pipeline/morph_pipeline_successed_ver1.py CHANGED

@@ -227,6 +227,10 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.aud1_dict = dict()
         self.aud2_dict = dict()
+        ap_adapter_path = 'pytorch_model.bin'
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        dtype = next(self.vae.parameters()).dtype
+        self.pipeline_trained = self.init_trained_pipeline(ap_adapter_path, device, dtype, ap_scale=1.0, text_ap_scale=1.0)

     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
     def enable_vae_slicing(self):
@@ -928,8 +932,11 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
         DEVICE = torch.device(
             "cuda") if torch.cuda.is_available() else torch.device("cpu")
         mel_spect_tensor = wav_to_mel(audio_path, duration=audio_length_in_s).unsqueeze(0)
-
-
+        # if audio_path.endswith('.wav'):
+        #     output_path = audio_path.replace('.wav', '_fbank.png')
+        # elif audio_path.endswith('.mp3'):
+        #     output_path = audio_path.replace('.mp3', '_fbank.png')
+        # visualize_mel_spectrogram(mel_spect_tensor, output_path)
         mel_spect_tensor = mel_spect_tensor.to(next(self.vae.parameters()).dtype)
         # print(f'mel_spect_tensor dtype: {mel_spect_tensor.dtype}')
         # print(f'self.vae dtype: {next(self.vae.parameters()).dtype}')
@@ -1062,6 +1069,7 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
     def __call__(
         self,
         dtype,
+        morphing_factor = None,
         audio_file = None,
         audio_file2 = None,
         ap_scale = 1.0,
@@ -1118,11 +1126,11 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
             cross_attention_dim = cross[layer_num % 8]
             layer_num += 1
             if cross_attention_dim == 768:
-                attn_procs[name]
+                attn_procs[name] = IPAttnProcessor2_0(
                     hidden_size=hidden_size,
                     name=name,
                     cross_attention_dim=cross_attention_dim,
-                    text_scale=
+                    text_scale=text_ap_scale,
                     scale=ap_scale,
                     num_tokens=8,
                     do_copy=False
@@ -1141,7 +1149,6 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
             processor.to_v_ip.weight = torch.nn.Parameter(state_dict[weight_name_v].half())
             processor.to_k_ip.weight = torch.nn.Parameter(state_dict[weight_name_k].half())
         self.unet.set_attn_processor(attn_procs)
-        self.pipeline_trained = self.init_trained_pipeline(ap_adapter_path, device, dtype, ap_scale, text_ap_scale)

         # 1. Pre-check
         height, original_waveform_length = self.pre_check(audio_length_in_s, prompt_1, callback_steps, negative_prompt_1)
@@ -1200,7 +1207,7 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
         # ------- For the first audio file -------
         original_processor = list(self.unet.attn_processors.values())[0]
         if noisy_latent_with_lora:
-            self.unet = load_lora(self.unet, lora_1, lora_2, 0)
+            self.unet = load_lora(self.unet, lora_1, lora_2, 0, dtype=dtype)
         # We directly use the latent representation of the audio file for VAE's decoder as the 1st ground truth
         audio_latent = self.aud2latent(audio_file, audio_length_in_s).to(device)
         # aud_noise_1 is the noisy latent representation of the audio file 1
@@ -1211,7 +1218,7 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):

         # ------- For the second audio file -------
         if noisy_latent_with_lora:
-            self.unet = load_lora(self.unet, lora_1, lora_2, 1)
+            self.unet = load_lora(self.unet, lora_1, lora_2, 1, dtype=dtype)
         # We directly use the latent representation of the audio file for VAE's decoder as the 1st ground truth
         audio_latent = self.aud2latent(audio_file2, audio_length_in_s)
         # aud_noise_2 is the noisy latent representation of the audio file 2
@@ -1220,12 +1227,13 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
         self.unet.set_attn_processor(original_processor)
         # After reconstructed the audio file 1, we set the original processor back
         original_processor = list(self.unet.attn_processors.values())[0]
+
         def morph(alpha_list, desc):
             audios = []
             # if attn_beta is not None:
             if self.use_lora:
                 self.unet = load_lora(
-                    self.unet, lora_1, lora_2, 0 if fix_lora is None else fix_lora)
+                    self.unet, lora_1, lora_2, 0 if fix_lora is None else fix_lora, dtype=dtype)
             attn_processor_dict = {}
             for k in self.unet.attn_processors.keys():
                 # print(k)
@@ -1266,7 +1274,7 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
             scipy.io.wavfile.write(file_path, rate=16000, data=first_audio)
             if self.use_lora:
                 self.unet = load_lora(
-                    self.unet, lora_1, lora_2, 1 if fix_lora is None else fix_lora)
+                    self.unet, lora_1, lora_2, 1 if fix_lora is None else fix_lora, dtype=dtype)
             attn_processor_dict = {}
             for k in self.unet.attn_processors.keys():
                 if do_replace_attn(k):
@@ -1304,12 +1312,24 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
             scipy.io.wavfile.write(file_path, rate=16000, data=last_audio)

             self.unet.set_attn_processor(original_processor)
-
-
-            alpha =
+
+            if morphing_factor is not None:
+                alpha = morphing_factor
+                if alpha==0:
+                    file_path = os.path.join(self.output_path, f"interpolated.wav")
+                    scipy.io.wavfile.write(file_path, rate=16000, data=first_audio)
+                    self.unet.set_attn_processor(original_processor)
+                    audios.append(audio)
+                    return audios
+                elif alpha==1:
+                    file_path = os.path.join(self.output_path, f"interpolated.wav")
+                    scipy.io.wavfile.write(file_path, rate=16000, data=last_audio)
+                    self.unet.set_attn_processor(original_processor)
+                    audios.append(audio)
+                    return audios
             if self.use_lora:
                 self.unet = load_lora(
-                    self.unet, lora_1, lora_2, alpha if fix_lora is None else fix_lora)
+                    self.unet, lora_1, lora_2, alpha if fix_lora is None else fix_lora, dtype=dtype)

             attn_processor_dict = {}
             for k in self.unet.attn_processors.keys():
@@ -1338,25 +1358,70 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
                     prompt_embeds_2,
                     attention_mask_2,
                     generated_prompt_embeds_2,
-
+                    alpha,
                     original_processor,
                     attn_processor_dict,
                     use_morph_prompt,
                     morphing_with_lora
                 )
-                file_path = os.path.join(self.output_path, f"
+                file_path = os.path.join(self.output_path, f"interpolated.wav")
                 scipy.io.wavfile.write(file_path, rate=16000, data=audio)
                 self.unet.set_attn_processor(original_processor)
                 audios.append(audio)
-
+            else:
+                for i in tqdm(range(1, num_frames - 1), desc=desc):
+                    alpha = alpha_list[i]
+                    if self.use_lora:
+                        self.unet = load_lora(
+                            self.unet, lora_1, lora_2, alpha if fix_lora is None else fix_lora, dtype=dtype)
+
+                    attn_processor_dict = {}
+                    for k in self.unet.attn_processors.keys():
+                        if do_replace_attn(k):
+                            if self.use_lora:
+                                attn_processor_dict[k] = LoadProcessor(
+                                    self.unet.attn_processors[k], k, self.aud1_dict, self.aud2_dict, alpha, attn_beta, lamd)
+                            else:
+                                attn_processor_dict[k] = LoadProcessor(
+                                    original_processor, k, self.aud1_dict, self.aud2_dict, alpha, attn_beta, lamd)
+                        else:
+                            attn_processor_dict[k] = self.unet.attn_processors[k]
+                    audio, latents = self.cal_latent(
+                        audio_length_in_s,
+                        time_pooling,
+                        freq_pooling,
+                        num_inference_steps,
+                        guidance_scale,
+                        aud_noise_1,
+                        aud_noise_2,
+                        prompt_1,
+                        prompt_2,
+                        prompt_embeds_1,
+                        attention_mask_1,
+                        generated_prompt_embeds_1,
+                        prompt_embeds_2,
+                        attention_mask_2,
+                        generated_prompt_embeds_2,
+                        alpha_list[i],
+                        original_processor,
+                        attn_processor_dict,
+                        use_morph_prompt,
+                        morphing_with_lora
+                    )
+                    file_path = os.path.join(self.output_path, f"{i:02d}.wav")
+                    scipy.io.wavfile.write(file_path, rate=16000, data=audio)
+                    self.unet.set_attn_processor(original_processor)
+                    audios.append(audio)
+                audios = [first_audio] + audios + [last_audio]
             return audios
+
         with torch.no_grad():
             if self.use_reschedule:
                 alpha_scheduler = AlphaScheduler()
                 alpha_list = list(torch.linspace(0, 1, num_frames))
                 audios_pt = morph(alpha_list, "Sampling...")
                 audios_pt = [torch.tensor(aud).unsqueeze(0)
-
+                             for aud in audios_pt]
                 alpha_scheduler.from_imgs(audios_pt)
                 alpha_list = alpha_scheduler.get_list()
                 audios = morph(alpha_list, "Reschedule...")
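In short, the pipeline's morph() helper now has two modes selected by the new morphing_factor argument. A simplified sketch of that control flow (illustrative pseudocode only, not the pipeline code; the function name plan_outputs is hypothetical):

def plan_outputs(morphing_factor, num_frames):
    # Which files morph() writes under the new control flow.
    if morphing_factor is not None:
        # Single-alpha mode: one file, reusing the endpoint reconstructions
        # when alpha is exactly 0 or 1, otherwise one interpolation pass.
        source = ("first audio" if morphing_factor == 0 else
                  "last audio" if morphing_factor == 1 else
                  f"interpolation at alpha={morphing_factor}")
        return [("interpolated.wav", source)]
    # Frame mode (previous behaviour): one interpolation per intermediate alpha.
    return [(f"{i:02d}.wav", f"interpolation at alpha_list[{i}]")
            for i in range(1, num_frames - 1)]

# Example: plan_outputs(0.5, 5) -> [("interpolated.wav", "interpolation at alpha=0.5")]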
utils/lora_utils_successed_ver1.py CHANGED

@@ -664,6 +664,8 @@ def train_lora(audio_path ,dtype ,time_pooling ,freq_pooling ,prompt, negative_p
         weight_name=weight_name,
         safe_serialization=safe_serialization
     )
+
+    del loss_history, unet_lora_layers, unet, vae, text_encoder, text_encoder_2, GPT2, projection_model, vocoder, noise_scheduler, optimizer, lr_scheduler, model

 def load_lora(unet, lora_0, lora_1, alpha, dtype):
     attn_procs = unet.attn_processors