File size: 4,505 Bytes
2bbfbb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0393dfa
2bbfbb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
//! Text processing module for IndexTTS
//!
//! Provides text normalization, tokenization, and phoneme conversion.

mod normalizer;
mod phoneme;
mod tokenizer;

pub use normalizer::{Language, TextNormalizer};
pub use phoneme::{g2p_english, pinyin_to_phones};
pub use tokenizer::{TextTokenizer, TokenizerConfig};

use crate::Result;

/// Run raw text through the full frontend: normalization, then tokenization.
///
/// A fresh [`TextNormalizer`] is constructed for every call; its output is
/// handed to `tokenizer.encode`, and the resulting token ids are returned.
///
/// # Errors
///
/// Propagates any error reported by the normalizer or by the tokenizer.
pub fn process_text(text: &str, tokenizer: &TextTokenizer) -> Result<Vec<i64>> {
    // Stage 1: clean up the raw input.
    let frontend = TextNormalizer::new();
    let cleaned = frontend.normalize(text)?;

    // Stage 2: turn the cleaned text into token ids.
    Ok(tokenizer.encode(&cleaned)?)
}

/// Classify `text` as Chinese, English, or a mix of both.
///
/// Only two kinds of characters influence the decision: those accepted by
/// [`is_chinese_char`] and ASCII letters. Everything else (digits,
/// punctuation, whitespace, other scripts) is ignored.
///
/// * Chinese characters only -> `Language::Chinese`
/// * ASCII letters only -> `Language::English`
/// * both present -> `Language::Mixed`
/// * neither present (empty or purely neutral text) -> `Language::English`
pub fn detect_language(text: &str) -> Language {
    // Single pass; only the presence of each script matters, not the count.
    let (has_chinese, has_english) = text.chars().fold((false, false), |(zh, en), ch| {
        (zh || is_chinese_char(ch), en || ch.is_ascii_alphabetic())
    });

    match (has_chinese, has_english) {
        (true, false) => Language::Chinese,
        (true, true) => Language::Mixed,
        // Covers English-only text as well as the "no letters at all" default.
        (false, _) => Language::English,
    }
}

/// Check whether a character is a Chinese (Han) ideograph.
///
/// Returns `true` when `ch` falls inside one of the Unicode CJK ideograph
/// blocks listed in the body: the base CJK Unified Ideographs block, its
/// extensions A through I, and the two compatibility-ideograph blocks.
/// Kana, Hangul, CJK punctuation and fullwidth forms are outside these
/// ranges and therefore yield `false`.
pub fn is_chinese_char(ch: char) -> bool {
    matches!(
        ch,
        '\u{4E00}'..='\u{9FFF}'         // CJK Unified Ideographs
        | '\u{3400}'..='\u{4DBF}'       // CJK Unified Ideographs Extension A
        | '\u{20000}'..='\u{2A6DF}'     // CJK Unified Ideographs Extension B
        | '\u{2A700}'..='\u{2B73F}'     // CJK Unified Ideographs Extension C
        | '\u{2B740}'..='\u{2B81F}'     // CJK Unified Ideographs Extension D
        | '\u{2B820}'..='\u{2CEAF}'     // CJK Unified Ideographs Extension E
        | '\u{2CEB0}'..='\u{2EBEF}'     // CJK Unified Ideographs Extension F
        | '\u{2EBF0}'..='\u{2EE5F}'     // CJK Unified Ideographs Extension I
        | '\u{30000}'..='\u{3134F}'     // CJK Unified Ideographs Extension G
        | '\u{31350}'..='\u{323AF}'     // CJK Unified Ideographs Extension H
        | '\u{F900}'..='\u{FAFF}'       // CJK Compatibility Ideographs
        | '\u{2F800}'..='\u{2FA1F}'     // CJK Compatibility Ideographs Supplement
    )
}

/// Check if text contains Chinese characters
pub fn contains_chinese(text: &str) -> bool {
    text.chars().any(is_chinese_char)
}

/// Report whether every byte of `text` is in the ASCII range.
///
/// An empty string is considered ASCII-only.
pub fn is_ascii_only(text: &str) -> bool {
    text.bytes().all(|byte| byte.is_ascii())
}

/// Break `text` into consecutive runs that share one language.
///
/// Each character is classified as Chinese (via [`is_chinese_char`]),
/// English (ASCII letter), or neutral (anything else: punctuation, digits,
/// whitespace, other scripts). A new segment begins whenever the
/// classification flips between Chinese and English. Neutral characters are
/// appended to the segment currently being built.
///
/// Neutral characters that appear before the first Chinese or English
/// character are discarded, so text made up solely of neutral characters
/// produces an empty vector.
///
/// Returns the segments in input order, each paired with its language.
pub fn split_by_language(text: &str) -> Vec<(String, Language)> {
    let mut pieces: Vec<(String, Language)> = Vec::new();
    let mut buffer = String::new();
    let mut active: Option<Language> = None;

    for ch in text.chars() {
        let detected = if is_chinese_char(ch) {
            Some(Language::Chinese)
        } else if ch.is_ascii_alphabetic() {
            Some(Language::English)
        } else {
            None
        };

        match (active, detected) {
            // Neutral character before any segment has started: dropped.
            (None, None) => {}
            // Neutral character inside a segment: stays with that segment.
            (Some(_), None) => buffer.push(ch),
            // Very first letter: it decides the language of the first segment.
            (None, Some(lang)) => {
                active = Some(lang);
                buffer.push(ch);
            }
            (Some(previous), Some(lang)) => {
                if previous != lang {
                    // Language flipped: close the finished segment and start
                    // a new one, reusing `buffer` as the fresh accumulator.
                    let finished = std::mem::take(&mut buffer);
                    if !finished.trim().is_empty() {
                        pieces.push((finished, previous));
                    }
                    active = Some(lang);
                }
                buffer.push(ch);
            }
        }
    }

    // Flush whatever is still being accumulated.
    if let Some(lang) = active {
        if !buffer.trim().is_empty() {
            pieces.push((buffer, lang));
        }
    }

    pieces
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Ideographs are accepted; ASCII letters and digits are rejected.
    #[test]
    fn test_is_chinese_char() {
        for ch in ['中', '文'] {
            assert!(is_chinese_char(ch), "expected {:?} to be Chinese", ch);
        }
        for ch in ['a', '1'] {
            assert!(!is_chinese_char(ch), "expected {:?} not to be Chinese", ch);
        }
    }

    /// Pure English, pure Chinese, and mixed inputs map to the matching variant.
    #[test]
    fn test_detect_language() {
        let cases = [
            ("Hello world", Language::English),
            ("你好世界", Language::Chinese),
            ("Hello 世界", Language::Mixed),
        ];
        for (input, expected) in cases {
            assert_eq!(detect_language(input), expected, "input: {:?}", input);
        }
    }

    /// Any ideograph in the text is enough; ASCII-only text has none.
    #[test]
    fn test_contains_chinese() {
        for input in ["Hello 世界", "你好"] {
            assert!(contains_chinese(input), "input: {:?}", input);
        }
        assert!(!contains_chinese("Hello world"));
    }
}