import os
import struct
import argparse
import torch
import numpy as np
from silero_vad import load_silero_vad, __version__ as silero_version

def convert_silero_vad(output_path, print_tensors=True):
    model = load_silero_vad()
    state_dict = model.state_dict()

    # Clean up state dict keys: drop the 8 kHz model and normalize every key
    # to the "_model." prefix expected by the tensor name lookups below
    cleaned_dict = {}
    for key, value in state_dict.items():
        # Skip 8k model
        if "_8k" not in key:
            clean_key = key
            if not key.startswith("_model."):
                clean_key = "_model." + key
            cleaned_dict[clean_key] = value

    base, ext = os.path.splitext(output_path)
    output_file = f"{base}-v{silero_version}-ggml{ext}"
    print(f"Saving GGML Silero-VAD model to {output_file}")

    print("\nTensor info for debugging:")
    for key, tensor in cleaned_dict.items():
        print(f"  - {key}: {tensor.shape} ({tensor.dtype})")
    print()

    with open(output_file, "wb") as fout:
        # Write the GGML magic number
        fout.write(struct.pack("i", 0x67676d6c))

        model_type = "silero-16k"
        str_len = len(model_type)
        fout.write(struct.pack("i", str_len))
        fout.write(model_type.encode('utf-8'))

        version_parts = silero_version.split('.')
        major, minor, patch = map(int, version_parts)
        print(f"Version: {major}.{minor}.{patch}")
        fout.write(struct.pack("i", major))
        fout.write(struct.pack("i", minor))
        fout.write(struct.pack("i", patch))

        # Write model architecture parameters
        window_size = 512
        fout.write(struct.pack("i", window_size))
        context_size = 64
        fout.write(struct.pack("i", context_size))

        n_encoder_layers = 4
        fout.write(struct.pack("i", n_encoder_layers))

        # Write encoder dimensions
        input_channels = 129
        encoder_in_channels = [input_channels, 128, 64, 64]
        encoder_out_channels = [128, 64, 64, 128]
        kernel_size = 3
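        # Note: these channel counts are hard-coded to mirror the published
        # Silero-VAD 16 kHz architecture rather than read from the checkpoint;
        # the 129 input channels presumably correspond to the magnitude bins
        # of the model's 256-point STFT.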

        for i in range(n_encoder_layers):
            fout.write(struct.pack("i", encoder_in_channels[i]))
            fout.write(struct.pack("i", encoder_out_channels[i]))
            fout.write(struct.pack("i", kernel_size))

        # Write LSTM dimensions
        lstm_input_size = 128
        lstm_hidden_size = 128
        fout.write(struct.pack("i", lstm_input_size))
        fout.write(struct.pack("i", lstm_hidden_size))

        # Write final conv dimensions
        final_conv_in = 128
        final_conv_out = 1
        fout.write(struct.pack("i", final_conv_in))
        fout.write(struct.pack("i", final_conv_out))

        # Define tensor keys to write
        tensor_keys = []

        # Encoder weights
        for i in range(n_encoder_layers):
            weight_key = f"_model.encoder.{i}.reparam_conv.weight"
            bias_key = f"_model.encoder.{i}.reparam_conv.bias"
            if weight_key in cleaned_dict and bias_key in cleaned_dict:
                tensor_keys.append(weight_key)
                tensor_keys.append(bias_key)

        # LSTM weights
        lstm_keys = [
            "_model.decoder.rnn.weight_ih",
            "_model.decoder.rnn.weight_hh",
            "_model.decoder.rnn.bias_ih",
            "_model.decoder.rnn.bias_hh"
        ]
        tensor_keys.extend([k for k in lstm_keys if k in cleaned_dict])

        # Final conv weights
        final_keys = [
            "_model.decoder.decoder.2.weight",
            "_model.decoder.decoder.2.bias"
        ]
        tensor_keys.extend([k for k in final_keys if k in cleaned_dict])

        # STFT basis - add this last
        stft_tensor = "_model.stft.forward_basis_buffer"
        tensor_keys.append(stft_tensor)

        print(f"Writing {len(tensor_keys)} tensors:")
        for key in tensor_keys:
            if key in cleaned_dict:
                print(f"  - {key}: {cleaned_dict[key].shape}")
            else:
                print(f"  - {key}: MISSING")

        # Process each tensor
        for key in tensor_keys:
            if key not in cleaned_dict:
                print(f"Warning: Missing tensor {key}, skipping")
                continue

            tensor = cleaned_dict[key]

            # Special handling for STFT tensor
            if key == "_model.stft.forward_basis_buffer":
                # Get the original numpy array without squeezing
                data = tensor.detach().cpu().numpy()
                # Ensure it has the expected shape
                print(f"STFT tensor original shape: {data.shape}")
                n_dims = 3
                tensor_shape = [data.shape[2], data.shape[1], data.shape[0]]
                is_conv_weight = True
            else:
                # For other tensors, we can use standard processing
                data = tensor.detach().cpu().squeeze().numpy()
                tensor_shape = list(data.shape)

                # Ensure we have at most 4 dimensions for GGML
                n_dims = min(len(tensor_shape), 4)

                # Reverse dimensions for GGML
                tensor_shape = tensor_shape[:n_dims]
                tensor_shape.reverse()

                # Check if this is a convolution weight tensor
                is_conv_weight = "weight" in key and ("encoder" in key or "_model.decoder.decoder.2" in key)

            # Convert to float16 for convolution weights
            if is_conv_weight:
                data = data.astype(np.float16)
                ftype = 1  # float16
            else:
                ftype = 0  # float32

            # Debug printing of tensor info
            print(f"\nWriting tensor: {key}")
            print(f"  Original shape: {tensor.shape}")
            print(f"  Processed shape: {data.shape}")
            print(f"  GGML dimensions: {n_dims}")
            print(f"  GGML shape: {tensor_shape}")
            print(f"  Type: {'float16' if ftype == 1 else 'float32'}")

            # Convert tensor name to bytes
            name_bytes = key.encode('utf-8')
            name_length = len(name_bytes)

            # Write tensor header
            fout.write(struct.pack("i", n_dims))
            fout.write(struct.pack("i", name_length))
            fout.write(struct.pack("i", ftype))

            # Write tensor dimensions
            for i in range(n_dims):
                size = tensor_shape[i] if i < len(tensor_shape) else 1
                fout.write(struct.pack("i", size))
                print(f"  Writing dimension {i}: {size}")

            # Write tensor name
            fout.write(name_bytes)

            # Write tensor data
            data.tofile(fout)

            print(f"  Wrote {data.size * (2 if ftype==1 else 4)} bytes")

    print(f"\nDone! Model has been converted to GGML format: {output_file}")
    print(f"File size: {os.path.getsize(output_file)} bytes")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert Silero-VAD PyTorch model to GGML format")
    parser.add_argument("--output", type=str, required=True, help="Path to output GGML model file")
    parser.add_argument("--print-tensors", action="store_true", help="Print tensor values", default=True)
    args = parser.parse_args()

    convert_silero_vad(args.output, args.print_tensors)
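
# Example usage (the script name shown is illustrative; run the file from
# wherever it lives in the repository):
#   python convert-silero-vad-to-ggml.py --output silero-vad.bin
# The converted model is written next to the requested path as
# silero-vad-v<silero_version>-ggml.bin, with the installed silero-vad
# package version embedded in the file name.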