Spaces:
Running
Running
| import re | |
| import numpy as np | |
| from lime.lime_text import IndexedString | |
| class LineIndexedString(IndexedString): | |
| """ | |
| Line-based text indexing for lyrics perturbation in MusicLIME. | |
| Extends LIME's IndexedString to work with lyrics lines instead of words, | |
| to enable more meaningful perturbations for song lyrics. Filters out | |
| metadata and focuses on actual lyrical content. | |
| Attributes | |
| ---------- | |
| raw : str | |
| Original raw lyrics text | |
| as_list : list of str | |
| Processed lyrics lines without metadata | |
| as_np : ndarray | |
| NumPy array of lyrics lines | |
| positions : list of int | |
| Line position indices for LIME compatibility | |
| """ | |
| def __init__(self, raw_string, bow=True, mask_string=None): | |
| """ | |
| Initialize line-based text indexing for lyrics perturbation in MusicLIME. | |
| Parameters | |
| ---------- | |
| raw_string : str | |
| Raw lyrics text to be processed | |
| bow : bool, default=True | |
| Bag-of-words flag (maintained for LIME compatibility) | |
| mask_string : str, optional | |
| String to use for masking removed lines | |
| """ | |
| self.raw = raw_string | |
| self.mask_string = mask_string | |
| self.bow = bow | |
| # Split by lines instead of words | |
| self.as_list = self._split_by_lines(raw_string) | |
| self.as_np = np.array(self.as_list) | |
| # Create word positions mapping (for compatibility) | |
| self.positions = list(range(len(self.as_list))) | |
| self.string_start = [0] * len(self.as_list) | |
| def _split_by_lines(self, text): | |
| """ | |
| Split lyrics text into meaningful lines, filtering out metadata. | |
| Parameters | |
| ---------- | |
| text : str | |
| Raw lyrics text with potential metadata | |
| Returns | |
| ------- | |
| list of str | |
| Processed lyrics lines with metadata removed | |
| """ | |
| lines = text.split("\n") | |
| processed_lines = [] | |
| for line in lines: | |
| line = line.strip() | |
| # Skip metadata lines | |
| if not line or re.match(r"^\[.*\]$", line) or re.match(r"^\(.*\)$", line): | |
| continue | |
| processed_lines.append(line) | |
| return processed_lines | |
| def inverse_removing(self, words_to_remove): | |
| """ | |
| Reconstruct lyrics text by removing specified line indices. | |
| Parameters | |
| ---------- | |
| words_to_remove : array-like | |
| Indices of lyrics lines to remove from reconstruction | |
| Returns | |
| ------- | |
| str | |
| Reconstructed lyrics text with specified lines removed | |
| """ | |
| # Keep lines not in words_to_remove | |
| kept_lines = [ | |
| self.as_list[i] | |
| for i in range(len(self.as_list)) | |
| if i not in words_to_remove | |
| ] | |
| return "\n".join(kept_lines) | |
| def num_words(self): | |
| """ | |
| Get total number of lyrics lines (called 'words' for LIME compatibility). | |
| Returns | |
| ------- | |
| int | |
| Number of lyrics lines available for perturbation | |
| """ | |
| return len(self.as_list) | |
| def word(self, id_): | |
| """ | |
| Get lyrics line content by index. | |
| Parameters | |
| ---------- | |
| id_ : int | |
| Index of the lyrics line to retrieve | |
| Returns | |
| ------- | |
| str | |
| Content of the specified lyrics line | |
| """ | |
| return self.as_list[id_] | |