File size: 3,474 Bytes
fc7b4a9
 
 
 
 
 
7633e2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc7b4a9
7633e2f
 
 
 
 
 
 
 
 
 
 
 
fc7b4a9
 
 
 
 
 
 
 
 
 
 
 
 
7633e2f
 
 
 
 
 
 
 
 
 
 
 
 
fc7b4a9
 
 
 
 
 
 
 
 
 
 
 
 
7633e2f
 
 
 
 
 
 
 
 
 
 
 
 
fc7b4a9
 
 
 
 
 
 
 
 
7633e2f
 
 
 
 
 
 
 
fc7b4a9
 
 
7633e2f
 
 
 
 
 
 
 
 
 
 
 
 
fc7b4a9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import re
import numpy as np
from lime.lime_text import IndexedString


class LineIndexedString(IndexedString):
    """
    Line-based text indexing for lyrics perturbation in MusicLIME.

    Extends LIME's IndexedString so that the perturbation unit is a lyrics
    line rather than a word. Blank lines and lines that are entirely wrapped
    in square brackets or parentheses are treated as metadata and dropped.

    The parent constructor is intentionally not called: every attribute used
    by this class is set up directly in ``__init__``.

    Attributes
    ----------
    raw : str
        Original raw lyrics text
    mask_string : str or None
        Stored as given; not applied by ``inverse_removing`` in this class
    bow : bool
        Stored as given; not consulted by the methods of this class
    as_list : list of str
        Processed lyrics lines without metadata
    as_np : ndarray
        NumPy string array of the lyrics lines
    positions : list of int
        Line position indices (``0 .. n-1``) for LIME compatibility
    string_start : list of int
        One zero per lyrics line
    """

    # A stripped line that is wholly enclosed in [...] or (...).
    # NOTE: a line such as "(ooh) yeah (ooh)" also starts with "(" and ends
    # with ")" and is therefore classified as metadata as well.
    _METADATA_LINE = re.compile(r"\[.*\]|\(.*\)")

    def __init__(self, raw_string, bow=True, mask_string=None):
        """
        Initialize line-based text indexing for lyrics perturbation in MusicLIME.

        Parameters
        ----------
        raw_string : str
            Raw lyrics text to be processed
        bow : bool, default=True
            Bag-of-words flag (kept for LIME signature compatibility; it is
            stored but does not alter how lines are removed)
        mask_string : str, optional
            Mask string (kept for LIME signature compatibility; it is stored
            but removed lines are dropped rather than replaced by it)
        """
        self.raw = raw_string
        self.mask_string = mask_string
        self.bow = bow

        # Split by lines instead of words
        self.as_list = self._split_by_lines(raw_string)
        # dtype=str keeps this a string array even when no lines survive
        # filtering (np.array([]) would otherwise default to float64).
        self.as_np = np.array(self.as_list, dtype=str)

        # Create positions mapping (for compatibility)
        line_count = len(self.as_list)
        self.positions = list(range(line_count))
        self.string_start = [0] * line_count

    def _split_by_lines(self, text):
        """
        Split lyrics text into meaningful lines, filtering out metadata.

        The text is split on ``"\\n"``; each piece is stripped of surrounding
        whitespace. Empty pieces and pieces fully enclosed in ``[...]`` or
        ``(...)`` are discarded.

        Parameters
        ----------
        text : str
            Raw lyrics text with potential metadata

        Returns
        -------
        list of str
            Stripped lyrics lines, in original order, with metadata removed
        """
        is_metadata = self._METADATA_LINE.fullmatch
        stripped_lines = (piece.strip() for piece in text.split("\n"))
        return [
            line
            for line in stripped_lines
            if line and not is_metadata(line)
        ]

    def inverse_removing(self, words_to_remove):
        """
        Reconstruct lyrics text by removing specified line indices.

        Indices that do not correspond to an existing line are ignored.

        Parameters
        ----------
        words_to_remove : array-like
            Indices of lyrics lines to remove from reconstruction

        Returns
        -------
        str
            Remaining lyrics lines joined with ``"\\n"``
        """
        # Build the lookup once so each membership test is O(1) instead of a
        # linear scan of the argument for every line.
        try:
            removed = set(words_to_remove)
        except TypeError:
            # Unhashable elements (e.g. a multi-dimensional array): keep the
            # container as passed and rely on its own ``in`` semantics.
            removed = words_to_remove

        kept_lines = [
            line
            for index, line in enumerate(self.as_list)
            if index not in removed
        ]
        return "\n".join(kept_lines)

    def num_words(self):
        """
        Get total number of lyrics lines (called 'words' for LIME compatibility).

        Returns
        -------
        int
            Number of lyrics lines available for perturbation
        """
        return len(self.as_list)

    def word(self, id_):
        """
        Get lyrics line content by index.

        Parameters
        ----------
        id_ : int
            Index of the lyrics line to retrieve

        Returns
        -------
        str
            Content of the specified lyrics line

        Raises
        ------
        IndexError
            If ``id_`` is outside the range of available lines
        """
        return self.as_list[id_]