ChAbhishek28 commited on
Commit
3df14a5
·
1 Parent(s): 7807a49

Add 899999999

Browse files
Files changed (1) hide show
  1. rag_service.py +49 -2
rag_service.py CHANGED
@@ -205,8 +205,55 @@ async def search_docs(query: str, config: RunnableConfig) -> str:
205
  # Search in the specified knowledge base
206
  docs = await lancedb_service.similarity_search(query, userid, knowledge_base)
207
  if docs:
208
- context = "\n\n".join([doc.page_content[:500] for doc in docs]) # Only first 500 chars per doc
209
- return f"📄 Found {len(docs)} relevant documents:\n\n{context}"
210
  else:
211
  context = ""
212
  return "No relevant documents found in the knowledge base."
 
205
  # Search in the specified knowledge base
206
  docs = await lancedb_service.similarity_search(query, userid, knowledge_base)
207
  if docs:
208
+ # Advanced extractive summarization using NLTK
209
+ try:
210
+ import nltk
211
+ nltk.download('punkt', quiet=True)
212
+ from nltk.tokenize import sent_tokenize
213
+ except ImportError:
214
+ sent_tokenize = lambda x: x.split('.')
215
+
216
+ # Embedding-based chunk selection
217
+ try:
218
+ from sentence_transformers import SentenceTransformer
219
+ embedder = SentenceTransformer('all-MiniLM-L6-v2')
220
+ except ImportError:
221
+ embedder = None
222
+
223
+ def select_best_chunk(chunks, query):
224
+ if not embedder or not chunks:
225
+ return chunks[0] if chunks else ""
226
+ chunk_embeddings = embedder.encode(chunks)
227
+ query_embedding = embedder.encode([query])[0]
228
+ # Cosine similarity
229
+ import numpy as np
230
+ scores = [np.dot(chunk_emb, query_embedding)/(np.linalg.norm(chunk_emb)*np.linalg.norm(query_embedding)) for chunk_emb in chunk_embeddings]
231
+ best_idx = int(np.argmax(scores))
232
+ return chunks[best_idx]
233
+
234
+ def extractive_summary(text, max_sentences=3):
235
+ sentences = sent_tokenize(text)
236
+ keywords = query.lower().split()
237
+ scored = [s for s in sentences if any(k in s.lower() for k in keywords)]
238
+ if scored:
239
+ return ' '.join(scored[:max_sentences])
240
+ return ' '.join(sentences[:max_sentences])
241
+
242
+ compressed_contexts = []
243
+ for doc in docs:
244
+ # Chunking: split by paragraphs or headings
245
+ if hasattr(doc, 'chunks') and doc.chunks:
246
+ best_chunk = select_best_chunk(doc.chunks, query)
247
+ summary = extractive_summary(best_chunk)
248
+ else:
249
+ # Fallback: split by paragraphs
250
+ paragraphs = doc.page_content.split('\n\n')
251
+ best_chunk = select_best_chunk(paragraphs, query) if paragraphs else doc.page_content
252
+ summary = extractive_summary(best_chunk)
253
+ compressed_contexts.append(summary)
254
+
255
+ context = "\n\n".join(compressed_contexts)
256
+ return f"📄 Found {len(docs)} relevant documents (chunked & summarized):\n\n{context}"
257
  else:
258
  context = ""
259
  return "No relevant documents found in the knowledge base."