import re

import numpy as np
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def hybrid_split(text: str, max_len: int = 1024) -> list[str]:
    """
    Split text into chunks, respecting sentence boundaries when possible.

    Args:
        text: The text to split
        max_len: Maximum length of each chunk, in characters

    Returns:
        List of text chunks
    """
    # Normalize whitespace
    text = text.replace("\r", "").replace("\n", " ").strip()

    # Split into sentences at terminal punctuation followed by whitespace
    sentences = re.split(r"(?<=[.!?])\s+", text)

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(sentence) > max_len:
            # Flush the current chunk first, then emit the oversized
            # sentence as a chunk of its own
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""
            chunks.append(sentence)
        elif current_chunk and len(current_chunk) + len(sentence) + 1 > max_len:
            # Adding this sentence would exceed max_len: close the
            # current chunk and start a new one with this sentence
            chunks.append(current_chunk)
            current_chunk = sentence
        else:
            # Append the sentence to the current chunk
            if current_chunk:
                current_chunk += " " + sentence
            else:
                current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)
    return chunks
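
# A minimal usage sketch (the sample text is an assumed illustration, not
# from the original): with max_len=60, hybrid_split packs whole sentences
# into chunks of at most 60 characters.
#
#     hybrid_split(
#         "Transformers changed NLP. They rely on attention. "
#         "Chunking long documents helps retrieval.",
#         max_len=60,
#     )
#     # -> ['Transformers changed NLP. They rely on attention.',
#     #     'Chunking long documents helps retrieval.']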

def cosine_similarity(vec1, vec2):
    """Calculate the cosine similarity between two vectors."""
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)
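
# Quick sanity check (assumed toy vectors): parallel vectors score 1.0,
# orthogonal vectors score 0.0.
#
#     cosine_similarity(np.array([1.0, 0.0]), np.array([2.0, 0.0]))  # 1.0
#     cosine_similarity(np.array([1.0, 0.0]), np.array([0.0, 1.0]))  # 0.0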

def get_embedding(text):
    """Generate an embedding using SBERT."""
    return embedding_model.encode(text, convert_to_numpy=True)
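
# all-MiniLM-L6-v2 maps each input string to a 384-dimensional vector, so
# the outputs of get_embedding can be compared with cosine_similarity
# directly:
#
#     vec = get_embedding("A short test sentence.")
#     vec.shape  # (384,)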

def semantic_chunking(text, threshold=0.75, max_chunk_size=8191):
    """
    Split text into semantic chunks by merging adjacent sentence-bounded
    segments (from hybrid_split) while their embeddings stay similar.

    - threshold: lower = more splits, higher = fewer splits
    - max_chunk_size: maximum size of each chunk, in characters
    """
    text = text.replace("\n", " ").replace("\r", " ").strip()
    segments = hybrid_split(text)
    if not segments:
        return []
    embeddings = [get_embedding(seg) for seg in segments]

    chunks = []
    current_chunk = [segments[0]]

    for i in range(1, len(segments)):
        # A drop in similarity between neighboring segments marks a
        # topic boundary
        sim = cosine_similarity(embeddings[i - 1], embeddings[i])
        if (
            sim < threshold
            or len(" ".join(current_chunk + [segments[i]])) > max_chunk_size
        ):
            chunks.append(" ".join(current_chunk))
            current_chunk = [segments[i]]
        else:
            current_chunk.append(segments[i])

    chunks.append(" ".join(current_chunk))
    return chunks
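
# End-to-end usage sketch (the sample document and threshold are assumed
# illustrations, not from the original). Note that semantic_chunking compares
# the up-to-1024-character segments produced by hybrid_split, so short inputs
# come back as a single chunk.
if __name__ == "__main__":
    sample_doc = " ".join(
        [f"Sentence {i} discusses stock markets and earnings." for i in range(60)]
        + [f"Sentence {i} describes rainforest frogs and biology." for i in range(60)]
    )
    for i, chunk in enumerate(semantic_chunking(sample_doc, threshold=0.8)):
        print(f"--- chunk {i} ({len(chunk)} chars) ---")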