Commit 3df14a5
Parent(s): 7807a49
Add 899999999
- rag_service.py +49 -2
rag_service.py CHANGED
@@ -205,8 +205,55 @@ async def search_docs(query: str, config: RunnableConfig) -> str:
     # Search in the specified knowledge base
     docs = await lancedb_service.similarity_search(query, userid, knowledge_base)
     if docs:
-
-
+        # Advanced extractive summarization using NLTK
+        try:
+            import nltk
+            nltk.download('punkt', quiet=True)
+            from nltk.tokenize import sent_tokenize
+        except ImportError:
+            sent_tokenize = lambda x: x.split('.')
+
+        # Embedding-based chunk selection
+        try:
+            from sentence_transformers import SentenceTransformer
+            embedder = SentenceTransformer('all-MiniLM-L6-v2')
+        except ImportError:
+            embedder = None
+
+        def select_best_chunk(chunks, query):
+            if not embedder or not chunks:
+                return chunks[0] if chunks else ""
+            chunk_embeddings = embedder.encode(chunks)
+            query_embedding = embedder.encode([query])[0]
+            # Cosine similarity
+            import numpy as np
+            scores = [np.dot(chunk_emb, query_embedding)/(np.linalg.norm(chunk_emb)*np.linalg.norm(query_embedding)) for chunk_emb in chunk_embeddings]
+            best_idx = int(np.argmax(scores))
+            return chunks[best_idx]
+
+        def extractive_summary(text, max_sentences=3):
+            sentences = sent_tokenize(text)
+            keywords = query.lower().split()
+            scored = [s for s in sentences if any(k in s.lower() for k in keywords)]
+            if scored:
+                return ' '.join(scored[:max_sentences])
+            return ' '.join(sentences[:max_sentences])
+
+        compressed_contexts = []
+        for doc in docs:
+            # Chunking: split by paragraphs or headings
+            if hasattr(doc, 'chunks') and doc.chunks:
+                best_chunk = select_best_chunk(doc.chunks, query)
+                summary = extractive_summary(best_chunk)
+            else:
+                # Fallback: split by paragraphs
+                paragraphs = doc.page_content.split('\n\n')
+                best_chunk = select_best_chunk(paragraphs, query) if paragraphs else doc.page_content
+                summary = extractive_summary(best_chunk)
+            compressed_contexts.append(summary)
+
+        context = "\n\n".join(compressed_contexts)
+        return f"📄 Found {len(docs)} relevant documents (chunked & summarized):\n\n{context}"
     else:
         context = ""
         return "No relevant documents found in the knowledge base."
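As a sanity check on the embedding-based chunk selection this commit adds, here is a minimal standalone sketch: it keeps the cosine-similarity ranking from `select_best_chunk` but swaps the `all-MiniLM-L6-v2` embedder for a toy bag-of-words encoder (`fake_encode`, purely hypothetical) so the scoring can be exercised without downloading a model.

```python
# Sketch only: verifies the cosine-similarity ranking used by select_best_chunk,
# with a toy bag-of-words encoder standing in for SentenceTransformer.
import numpy as np

def fake_encode(texts, vocab=("lancedb", "similarity", "search", "summary")):
    # Hypothetical stand-in for SentenceTransformer.encode(): counts vocab words per text.
    return np.array([[t.lower().count(w) for w in vocab] for t in texts], dtype=float)

def select_best_chunk(chunks, query, encode=fake_encode):
    # Same cosine-similarity ranking as in the diff, plus a zero-norm guard
    # so the toy encoder cannot divide by zero.
    if not chunks:
        return ""
    chunk_embeddings = encode(chunks)
    query_embedding = encode([query])[0]
    scores = [
        float(np.dot(c, query_embedding)) /
        (float(np.linalg.norm(c) * np.linalg.norm(query_embedding)) or 1.0)
        for c in chunk_embeddings
    ]
    return chunks[int(np.argmax(scores))]

chunks = [
    "LanceDB stores vectors for similarity search.",
    "An unrelated paragraph about deployment.",
]
print(select_best_chunk(chunks, "similarity search"))  # -> the first chunk
```

Unlike the committed version, this sketch guards against zero-norm vectors, which only matters for the toy encoder.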
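Similarly, a quick check of the keyword-overlap summarizer, exercising the `ImportError` fallback path (naive split on `.`) rather than NLTK's punkt tokenizer. The sample text and query are invented, and `query` is passed as a parameter here instead of being closed over as it is inside `search_docs`.

```python
# Sketch only: the extractive_summary logic from the diff, using the
# ImportError fallback tokenizer instead of NLTK punkt.
sent_tokenize = lambda text: [s.strip() for s in text.split('.') if s.strip()]

def extractive_summary(text, query, max_sentences=3):
    # Keep sentences that mention any query keyword; otherwise fall back
    # to the leading sentences, as in the committed version.
    sentences = sent_tokenize(text)
    keywords = query.lower().split()
    scored = [s for s in sentences if any(k in s.lower() for k in keywords)]
    picked = scored if scored else sentences
    return ' '.join(picked[:max_sentences])

text = ("LanceDB is an embedded vector database. "
        "It supports similarity search over document embeddings. "
        "The weather was pleasant that day.")
print(extractive_summary(text, "similarity search"))
# -> "It supports similarity search over document embeddings"
```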
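Finally, a rough end-to-end pass over the fallback branch (documents without a `chunks` attribute), reusing the two helpers sketched above. `SimpleNamespace` is only a stand-in for whatever document objects `lancedb_service.similarity_search` actually returns; the assumption here is just that they expose `page_content`, as the diff relies on.

```python
# Sketch only: the per-document compression loop from the diff, run against
# minimal stand-in documents. Reuses select_best_chunk and extractive_summary
# from the sketches above.
from types import SimpleNamespace

docs = [
    SimpleNamespace(page_content=(
        "LanceDB is an embedded vector database.\n\n"
        "It supports similarity search over document embeddings. "
        "Results are returned ranked by distance."
    )),
]

query = "similarity search"
compressed_contexts = []
for doc in docs:
    # Fallback branch of the diff: no doc.chunks, so split on blank lines.
    paragraphs = [p for p in doc.page_content.split('\n\n') if p.strip()]
    best_chunk = select_best_chunk(paragraphs, query) if paragraphs else doc.page_content
    compressed_contexts.append(extractive_summary(best_chunk, query))

context = "\n\n".join(compressed_contexts)
print(f"Found {len(docs)} relevant documents (chunked & summarized):\n\n{context}")
```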