File size: 4,505 Bytes
2bbfbb7 0393dfa 2bbfbb7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
//! Text processing module for IndexTTS
//!
//! Provides text normalization, tokenization, and phoneme conversion.
mod normalizer;
mod phoneme;
mod tokenizer;
pub use normalizer::{Language, TextNormalizer};
pub use phoneme::{g2p_english, pinyin_to_phones};
pub use tokenizer::{TextTokenizer, TokenizerConfig};
use crate::Result;
/// Run raw text through the frontend: normalization followed by tokenization.
///
/// A fresh [`TextNormalizer`] is created for each call; its output is then
/// encoded with the supplied `tokenizer`.
///
/// # Errors
///
/// Propagates any error returned by the normalizer or by the tokenizer.
pub fn process_text(text: &str, tokenizer: &TextTokenizer) -> Result<Vec<i64>> {
    // Step 1: clean up the raw input.
    let frontend = TextNormalizer::new();
    let cleaned = frontend.normalize(text)?;

    // Step 2: turn the cleaned text into token ids.
    let token_ids = tokenizer.encode(&cleaned)?;
    Ok(token_ids)
}
/// Classify text as Chinese, English, or a mixture of both.
///
/// Only two kinds of characters influence the result: Chinese ideographs (as
/// decided by [`is_chinese_char`]) and ASCII letters. Everything else
/// (digits, punctuation, whitespace, other scripts) is ignored.
///
/// Returns [`Language::Chinese`] when only ideographs were seen,
/// [`Language::Mixed`] when both kinds were seen, and [`Language::English`]
/// otherwise, which includes empty input and input without any letters.
pub fn detect_language(text: &str) -> Language {
    // Track presence only; how many of each kind occur does not matter.
    let (saw_chinese, saw_english) =
        text.chars()
            .fold((false, false), |(zh, en), c| {
                if is_chinese_char(c) {
                    (true, en)
                } else {
                    (zh, en || c.is_ascii_alphabetic())
                }
            });

    match (saw_chinese, saw_english) {
        (true, false) => Language::Chinese,
        (true, true) => Language::Mixed,
        // English letters only, or nothing classifiable at all: default to English.
        (false, _) => Language::English,
    }
}
/// Check whether a character is a Chinese (CJK Han) ideograph.
///
/// Returns `true` when the code point of `ch` lies in one of the Unicode CJK
/// ideograph blocks: the Unified Ideographs block, its Extensions A through I,
/// or the two Compatibility Ideographs blocks.
///
/// CJK punctuation (for example `。`), kana, hangul, digits and Latin letters
/// are not ideographs and yield `false`.
pub fn is_chinese_char(ch: char) -> bool {
    // `u32::from` is the lossless char -> code point conversion.
    matches!(
        u32::from(ch),
        0x3400..=0x4DBF         // CJK Unified Ideographs Extension A
            | 0x4E00..=0x9FFF   // CJK Unified Ideographs
            | 0xF900..=0xFAFF   // CJK Compatibility Ideographs
            | 0x20000..=0x2A6DF // CJK Unified Ideographs Extension B
            | 0x2A700..=0x2B73F // CJK Unified Ideographs Extension C
            | 0x2B740..=0x2B81F // CJK Unified Ideographs Extension D
            | 0x2B820..=0x2CEAF // CJK Unified Ideographs Extension E
            | 0x2CEB0..=0x2EBEF // CJK Unified Ideographs Extension F
            | 0x2EBF0..=0x2EE5F // CJK Unified Ideographs Extension I
            | 0x2F800..=0x2FA1F // CJK Compatibility Ideographs Supplement
            | 0x30000..=0x3134F // CJK Unified Ideographs Extension G
            | 0x31350..=0x323AF // CJK Unified Ideographs Extension H
    )
}
/// Check if text contains Chinese characters
pub fn contains_chinese(text: &str) -> bool {
text.chars().any(is_chinese_char)
}
/// Report whether every byte of `text` is in the ASCII range.
///
/// The empty string counts as ASCII-only.
pub fn is_ascii_only(text: &str) -> bool {
    text.bytes().all(|byte| byte.is_ascii())
}
/// Split text into consecutive single-language segments.
///
/// Each character is classified as Chinese (per [`is_chinese_char`]), English
/// (an ASCII letter), or neutral (anything else: digits, punctuation,
/// whitespace, other scripts). A new segment starts whenever the language
/// switches between Chinese and English.
///
/// Neutral characters never start or end a segment:
/// * neutral characters following a language character are appended to the
///   segment currently being built;
/// * neutral characters appearing before the first language character are
///   dropped.
///
/// Segments are returned untrimmed, in input order, each paired with its
/// language. Text containing no Chinese or English characters yields an empty
/// vector.
pub fn split_by_language(text: &str) -> Vec<(String, Language)> {
    let mut segments = Vec::new();
    let mut current_segment = String::new();
    // `None` until the first Chinese/English character has been seen.
    let mut current_lang: Option<Language> = None;

    for ch in text.chars() {
        let char_lang = if is_chinese_char(ch) {
            Some(Language::Chinese)
        } else if ch.is_ascii_alphabetic() {
            Some(Language::English)
        } else {
            None // Punctuation or other
        };

        match (current_lang, char_lang) {
            // Language switch: close the running segment and start a new one
            // with `ch`, taking the old buffer instead of cloning it.
            (Some(curr), Some(lang)) if curr != lang => {
                let finished = std::mem::replace(&mut current_segment, ch.to_string());
                if !finished.trim().is_empty() {
                    segments.push((finished, curr));
                }
                current_lang = Some(lang);
            }
            // First language character, or another one in the same language.
            (_, Some(lang)) => {
                current_lang = Some(lang);
                current_segment.push(ch);
            }
            // Neutral character inside a segment: keep it with that segment.
            (Some(_), None) => current_segment.push(ch),
            // Neutral character before any language character: the buffer is
            // necessarily empty here, and leading neutral text is discarded.
            (None, None) => {}
        }
    }

    // Flush the trailing segment, if any language character was ever seen.
    if let Some(lang) = current_lang {
        if !current_segment.trim().is_empty() {
            segments.push((current_segment, lang));
        }
    }

    segments
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Ideographs are accepted; ASCII letters and digits are rejected.
    #[test]
    fn test_is_chinese_char() {
        for ideograph in "中文".chars() {
            assert!(is_chinese_char(ideograph));
        }
        for other in "a1".chars() {
            assert!(!is_chinese_char(other));
        }
    }

    /// Pure English, pure Chinese, and mixed inputs map to the matching variant.
    #[test]
    fn test_detect_language() {
        let cases = [
            ("Hello world", Language::English),
            ("你好世界", Language::Chinese),
            ("Hello 世界", Language::Mixed),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(&detect_language(input), expected);
        }
    }

    /// Any ideograph in the text is detected; plain English text has none.
    #[test]
    fn test_contains_chinese() {
        for with_chinese in ["Hello 世界", "你好"].iter() {
            assert!(contains_chinese(with_chinese));
        }
        assert!(!contains_chinese("Hello world"));
    }
}
|