import re

import numpy as np
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def hybrid_split(text: str, max_len: int = 1024) -> list[str]:
    """
    Split text into chunks, respecting sentence boundaries when possible.

    Args:
        text: The text to split
        max_len: Maximum length of each chunk, in characters

    Returns:
        List of text chunks
    """
    # Normalize whitespace
    text = text.replace("\r", "").replace("\n", " ").strip()

    # Split into sentences at terminal punctuation followed by whitespace
    sentences = re.split(r"(?<=[.!?])\s+", text)

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(sentence) > max_len:
            # Flush the current chunk first, then emit the oversized
            # sentence as a chunk of its own
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""
            chunks.append(sentence)
        elif current_chunk and len(current_chunk) + len(sentence) + 1 > max_len:
            # Adding this sentence would exceed max_len: close the
            # current chunk and start a new one with this sentence
            chunks.append(current_chunk)
            current_chunk = sentence
        else:
            # Append the sentence to the current chunk
            if current_chunk:
                current_chunk += " " + sentence
            else:
                current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)
    return chunks
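
# A minimal usage sketch (the sample text is an assumed illustration, not
# from the original): with max_len=60, hybrid_split packs whole sentences
# into chunks of at most 60 characters.
#
#     hybrid_split(
#         "Transformers changed NLP. They rely on attention. "
#         "Chunking long documents helps retrieval.",
#         max_len=60,
#     )
#     # -> ['Transformers changed NLP. They rely on attention.',
#     #     'Chunking long documents helps retrieval.']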

def cosine_similarity(vec1, vec2):
    """Calculate the cosine similarity between two vectors."""
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)
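
# Quick sanity check (assumed toy vectors): parallel vectors score 1.0,
# orthogonal vectors score 0.0.
#
#     cosine_similarity(np.array([1.0, 0.0]), np.array([2.0, 0.0]))  # 1.0
#     cosine_similarity(np.array([1.0, 0.0]), np.array([0.0, 1.0]))  # 0.0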

def get_embedding(text):
    """Generate an embedding using SBERT."""
    return embedding_model.encode(text, convert_to_numpy=True)
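
# all-MiniLM-L6-v2 maps each input string to a 384-dimensional vector, so
# the outputs of get_embedding can be compared with cosine_similarity
# directly:
#
#     vec = get_embedding("A short test sentence.")
#     vec.shape  # (384,)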

def semantic_chunking(text, threshold=0.75, max_chunk_size=8191):
    """
    Split text into semantic chunks by merging adjacent sentence-bounded
    segments (from hybrid_split) while their embeddings stay similar.

    - threshold: lower = more splits, higher = fewer splits
    - max_chunk_size: maximum size of each chunk, in characters
    """
    text = text.replace("\n", " ").replace("\r", " ").strip()
    segments = hybrid_split(text)
    if not segments:
        return []
    embeddings = [get_embedding(seg) for seg in segments]

    chunks = []
    current_chunk = [segments[0]]

    for i in range(1, len(segments)):
        # A drop in similarity between neighboring segments marks a
        # topic boundary
        sim = cosine_similarity(embeddings[i - 1], embeddings[i])
        if (
            sim < threshold
            or len(" ".join(current_chunk + [segments[i]])) > max_chunk_size
        ):
            chunks.append(" ".join(current_chunk))
            current_chunk = [segments[i]]
        else:
            current_chunk.append(segments[i])

    chunks.append(" ".join(current_chunk))
    return chunks
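
# End-to-end usage sketch (the sample document and threshold are assumed
# illustrations, not from the original). Note that semantic_chunking compares
# the up-to-1024-character segments produced by hybrid_split, so short inputs
# come back as a single chunk.
if __name__ == "__main__":
    sample_doc = " ".join(
        [f"Sentence {i} discusses stock markets and earnings." for i in range(60)]
        + [f"Sentence {i} describes rainforest frogs and biology." for i in range(60)]
    )
    for i, chunk in enumerate(semantic_chunking(sample_doc, threshold=0.8)):
        print(f"--- chunk {i} ({len(chunk)} chars) ---")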