# Spaces: krislette/bach-or-bot (Running) - File size: 7,082 Bytes

import numpy as np
import time
import torch
from openunmix import predict
from src.musiclime.print_utils import green_bold


class OpenUnmixFactorization:
    """
    Audio factorization using OpenUnmix source separation with temporal segmentation.

    Decomposes audio into interpretable components by separating sources
    (vocals, bass, drums, other) and segmenting each across time windows.
    Creates temporal-source combinations for fine-grained audio explanations.

    Attributes
    ----------
    audio : ndarray
        Original audio waveform
    target_sr : int
        Sample rate (Hz) passed to OpenUnmix for separation
    temporal_segments : list of tuple
        Time window boundaries, as (start, end) sample indices
    original_components : list of ndarray
        Raw separated audio sources
    component_names : list of str
        Names of separated sources
    components : list of ndarray
        Final temporal-source component combinations
    final_component_names : list of str
        Names of temporal-source combinations
    """

    def __init__(self, audio, temporal_segmentation_params=10, composition_fn=None):
        """
        Initialize audio factorization using OpenUnmix source separation with temporal segmentation.

        Runs three timed stages in order: temporal segmentation, source
        separation, and construction of the temporal-source components.
        Progress is reported on stdout.

        Parameters
        ----------
        audio : array-like
            Raw audio waveform data at 44.1kHz sample rate
        temporal_segmentation_params : int, default=10
            Number of temporal segments to divide the audio into
        composition_fn : callable, optional
            Custom function for composing separated sources (unused for now)

        Raises
        ------
        ValueError
            If ``temporal_segmentation_params`` is smaller than 1.
        """
        print("[MusicLIME] Initializing OpenUnmix factorization...")
        self.audio = audio
        self.target_sr = 44100

        start_time = time.time()
        print(
            f"[MusicLIME] Computing {temporal_segmentation_params} temporal segments..."
        )
        self.temporal_segments = self._compute_segments(
            audio, temporal_segmentation_params
        )
        segmentation_time = time.time() - start_time
        print(
            green_bold(
                f"[MusicLIME] Temporal segmentation completed in {segmentation_time:.2f}s"
            )
        )

        # Initialize source separation
        start_time = time.time()
        print("[MusicLIME] Separating audio sources...")
        self.original_components, self.component_names = self._separate_sources()
        print(f"[MusicLIME] Found components: {self.component_names}")
        separation_time = time.time() - start_time
        print(
            green_bold(
                f"[MusicLIME] Source separation completed in {separation_time:.2f}s"
            )
        )

        start_time = time.time()
        print("[MusicLIME] Preparing temporal-source combinations...")
        self._prepare_temporal_components()
        print(f"[MusicLIME] Created {len(self.components)} total components")
        preparation_time = time.time() - start_time
        print(
            green_bold(
                f"[MusicLIME] Component preparation completed in {preparation_time:.2f}s"
            )
        )

    def _compute_segments(self, signal, n_segments):
        """
        Divide audio signal into contiguous temporal segments for factorization.

        Every segment spans ``len(signal) // n_segments`` samples, except the
        last one, which is extended to the end of the signal so that the
        remainder samples are not dropped.

        Parameters
        ----------
        signal : array-like
            Input audio waveform
        n_segments : int
            Number of temporal segments to create (must be >= 1)

        Returns
        -------
        list of tuple
            List of (start, end) sample indices for each segment

        Raises
        ------
        ValueError
            If ``n_segments`` is smaller than 1.
        """
        if n_segments < 1:
            raise ValueError(
                f"n_segments must be a positive integer, got {n_segments}"
            )

        audio_length = len(signal)
        samples_per_segment = audio_length // n_segments

        segments = [
            (i * samples_per_segment, (i + 1) * samples_per_segment)
            for i in range(n_segments)
        ]

        # Floor division leaves up to n_segments - 1 trailing samples that
        # belong to no window; fold them into the last segment so the
        # segments together cover the whole signal.
        last_start, _ = segments[-1]
        segments[-1] = (last_start, audio_length)
        return segments

    def _separate_sources(self):
        """
        Perform source separation using OpenUnmix to extract instrument components.

        Returns
        -------
        components : list of ndarray
            Separated audio sources (vocals, bass, drums, other)
        names : list of str
            Names of the separated source components, in the same order
            as ``components``
        """
        # Insert an axis at position 1 before handing the audio to OpenUnmix.
        # NOTE(review): assumes self.audio is a 1-D mono waveform, giving
        # shape (n_samples, 1) - confirm against callers.
        waveform = np.expand_dims(self.audio, axis=1)

        # Load openunmix .pth files from local dir (relative path)
        model_path = "models/musiclime"

        # Specify targets
        targets = ["vocals", "bass", "drums", "other"]

        # Specify device based on availability
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"[MusicLIME] Using device for source separation: {device}")

        # Then load openunmix files to openunmix' method
        prediction = predict.separate(
            torch.as_tensor(waveform).float(),
            rate=self.target_sr,
            model_str_or_path=model_path,
            targets=targets,
            device=device,
        )

        # For each estimate: take the first entry along the leading axis,
        # average over the next axis, and move the result to a NumPy array.
        # NOTE(review): presumably the estimates are (batch, channels, samples),
        # so this is a mono downmix of the single batch item - confirm.
        names = []
        components = []
        for name, estimate in prediction.items():
            names.append(name)
            components.append(estimate[0].mean(dim=0).cpu().numpy())

        return components, names

    def _prepare_temporal_components(self):
        """
        Create temporal-source combinations by applying each source to each time segment.

        Creates components like 'vocals_T0', 'drums_T5' representing specific
        instruments active only in specific temporal windows. Each component
        is an array shaped like ``self.audio`` that is zero outside its window.
        Results are stored in ``self.components`` and
        ``self.final_component_names``, ordered segment-major.
        """
        # Create temporal-source combinations
        self.components = []
        self.final_component_names = []

        for s, (start, end) in enumerate(self.temporal_segments):
            for name, component in zip(
                self.component_names, self.original_components
            ):
                temp_component = np.zeros_like(self.audio)
                window = component[start:end]
                # Copy only the samples that exist, so a separated source
                # shorter than the window does not raise a broadcast error.
                temp_component[start : start + len(window)] = window
                self.components.append(temp_component)
                self.final_component_names.append(f"{name}_T{s}")

    def get_number_components(self):
        """
        Get total number of factorized components (sources x temporal segments).

        Returns
        -------
        int
            Total number of temporal-source component combinations
        """
        return len(self.components)

    def get_ordered_component_names(self):
        """
        Get ordered list of component names for explanation display.

        Returns
        -------
        list of str
            Component names in format '{source}_T{segment}' (e.g., 'vocals_T3')
        """
        return self.final_component_names

    def compose_model_input(self, component_indices):
        """
        Reconstruct audio by summing selected temporal-source components.

        Parameters
        ----------
        component_indices : array-like
            Indices of components to include in reconstruction

        Returns
        -------
        ndarray
            Reconstructed audio waveform from selected components; an
            all-zero array shaped like ``self.audio`` when no index is given
        """
        # len() rather than truthiness: the indices may be a NumPy array,
        # whose truth value is ambiguous.
        if len(component_indices) == 0:
            return np.zeros_like(self.audio)

        return sum(self.components[i] for i in component_indices)