bach-or-bot / src /musiclime /text_utils.py
krislette's picture
Auto-deploy from GitHub: 7bbe4e79d2cd5e035a2fc8cda464b3cd867300d5
7633e2f
import re
import numpy as np
from lime.lime_text import IndexedString
class LineIndexedString(IndexedString):
"""
Line-based text indexing for lyrics perturbation in MusicLIME.
Extends LIME's IndexedString to work with lyrics lines instead of words,
to enable more meaningful perturbations for song lyrics. Filters out
metadata and focuses on actual lyrical content.
Attributes
----------
raw : str
Original raw lyrics text
as_list : list of str
Processed lyrics lines without metadata
as_np : ndarray
NumPy array of lyrics lines
positions : list of int
Line position indices for LIME compatibility
"""
def __init__(self, raw_string, bow=True, mask_string=None):
"""
Initialize line-based text indexing for lyrics perturbation in MusicLIME.
Parameters
----------
raw_string : str
Raw lyrics text to be processed
bow : bool, default=True
Bag-of-words flag (maintained for LIME compatibility)
mask_string : str, optional
String to use for masking removed lines
"""
self.raw = raw_string
self.mask_string = mask_string
self.bow = bow
# Split by lines instead of words
self.as_list = self._split_by_lines(raw_string)
self.as_np = np.array(self.as_list)
# Create word positions mapping (for compatibility)
self.positions = list(range(len(self.as_list)))
self.string_start = [0] * len(self.as_list)
def _split_by_lines(self, text):
"""
Split lyrics text into meaningful lines, filtering out metadata.
Parameters
----------
text : str
Raw lyrics text with potential metadata
Returns
-------
list of str
Processed lyrics lines with metadata removed
"""
lines = text.split("\n")
processed_lines = []
for line in lines:
line = line.strip()
# Skip metadata lines
if not line or re.match(r"^\[.*\]$", line) or re.match(r"^\(.*\)$", line):
continue
processed_lines.append(line)
return processed_lines
def inverse_removing(self, words_to_remove):
"""
Reconstruct lyrics text by removing specified line indices.
Parameters
----------
words_to_remove : array-like
Indices of lyrics lines to remove from reconstruction
Returns
-------
str
Reconstructed lyrics text with specified lines removed
"""
# Keep lines not in words_to_remove
kept_lines = [
self.as_list[i]
for i in range(len(self.as_list))
if i not in words_to_remove
]
return "\n".join(kept_lines)
def num_words(self):
"""
Get total number of lyrics lines (called 'words' for LIME compatibility).
Returns
-------
int
Number of lyrics lines available for perturbation
"""
return len(self.as_list)
def word(self, id_):
"""
Get lyrics line content by index.
Parameters
----------
id_ : int
Index of the lyrics line to retrieve
Returns
-------
str
Content of the specified lyrics line
"""
return self.as_list[id_]