Spaces:
Sleeping
Sleeping
Commit
·
a678780
1
Parent(s):
60ce103
Implement comprehensive multi-category query handling - Add relevance scoring and query enhancement for all document types (pension, leave, allowance, procurement, medical, etc.)
Browse files - rag_service.py +77 -29
rag_service.py
CHANGED
|
@@ -196,14 +196,35 @@ async def search_documents_async(query: str, limit: int = 5) -> List[Dict[str, A
|
|
| 196 |
knowledge_bases = ["government_docs"] # Default
|
| 197 |
query_lower = query.lower()
|
| 198 |
|
| 199 |
-
# Enhance query for better relevance
|
| 200 |
enhanced_query = query
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
|
| 208 |
logger.info(f"🔍 Enhanced query: '{enhanced_query}' (original: '{query}')")
|
| 209 |
|
|
@@ -242,32 +263,59 @@ async def search_documents_async(query: str, limit: int = 5) -> List[Dict[str, A
|
|
| 242 |
title = ''
|
| 243 |
|
| 244 |
# Calculate relevance score based on query intent
|
| 245 |
-
relevance_score = 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
|
| 247 |
-
#
|
| 248 |
-
|
| 249 |
-
|
|
|
|
|
|
|
|
|
|
| 250 |
|
| 251 |
-
if
|
| 252 |
-
# Boost
|
| 253 |
-
|
|
|
|
| 254 |
relevance_score += 1.0
|
| 255 |
-
if any(keyword in content for keyword in ['pension rules', 'pension regulation', 'pension scheme', 'retirement']):
|
| 256 |
-
relevance_score += 0.8
|
| 257 |
-
if any(keyword in content for keyword in ['changes', 'modifications', 'amendments', 'updates', 'revised', 'impact']):
|
| 258 |
-
relevance_score += 0.3
|
| 259 |
|
| 260 |
-
#
|
| 261 |
-
if
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
|
| 272 |
if relevance_score > 0.3: # Only include relevant documents
|
| 273 |
# Add relevance score to document (create dict if needed)
|
|
|
|
| 196 |
knowledge_bases = ["government_docs"] # Default
|
| 197 |
query_lower = query.lower()
|
| 198 |
|
| 199 |
+
# Enhance query for better relevance based on category
|
| 200 |
enhanced_query = query
|
| 201 |
+
|
| 202 |
+
# Pension queries
|
| 203 |
+
if "pension" in query_lower:
|
| 204 |
+
if any(word in query_lower for word in ["changes", "impact", "rules"]):
|
| 205 |
+
enhanced_query = f"{query} pension rules retirement benefits modifications"
|
| 206 |
+
elif "calculation" in query_lower or "formula" in query_lower:
|
| 207 |
+
enhanced_query = f"{query} pension calculation retirement benefits formula"
|
| 208 |
+
|
| 209 |
+
# Leave queries
|
| 210 |
+
elif any(word in query_lower for word in ["leave", "casual", "earned"]):
|
| 211 |
+
enhanced_query = f"{query} leave rules entitlement policy"
|
| 212 |
+
|
| 213 |
+
# Allowance queries
|
| 214 |
+
elif any(word in query_lower for word in ["allowance", "da", "dearness"]):
|
| 215 |
+
enhanced_query = f"{query} allowance rates dearness increment"
|
| 216 |
+
|
| 217 |
+
# Procurement queries
|
| 218 |
+
elif any(word in query_lower for word in ["tender", "procurement", "bid"]):
|
| 219 |
+
enhanced_query = f"{query} procurement tender bidding process"
|
| 220 |
+
|
| 221 |
+
# Medical queries
|
| 222 |
+
elif any(word in query_lower for word in ["medical", "health", "reimbursement"]):
|
| 223 |
+
enhanced_query = f"{query} medical health reimbursement cghs"
|
| 224 |
+
|
| 225 |
+
# Transfer queries
|
| 226 |
+
elif any(word in query_lower for word in ["transfer", "posting"]):
|
| 227 |
+
enhanced_query = f"{query} transfer posting policy rules"
|
| 228 |
|
| 229 |
logger.info(f"🔍 Enhanced query: '{enhanced_query}' (original: '{query}')")
|
| 230 |
|
|
|
|
| 263 |
title = ''
|
| 264 |
|
| 265 |
# Calculate relevance score based on query intent
|
| 266 |
+
relevance_score = getattr(doc, 'score', 0.5) # Base score
|
| 267 |
+
|
| 268 |
+
# Define query categories and their keywords
|
| 269 |
+
query_categories = {
|
| 270 |
+
'pension': ['pension', 'retirement', 'gratuity', 'provident fund', 'gpf', 'cpf', 'superannuation'],
|
| 271 |
+
'leave': ['leave', 'casual leave', 'earned leave', 'medical leave', 'maternity', 'paternity'],
|
| 272 |
+
'allowance': ['allowance', 'dearness allowance', 'da', 'hra', 'house rent', 'travel allowance'],
|
| 273 |
+
'procurement': ['procurement', 'tender', 'bid', 'contract', 'purchase', 'vendor'],
|
| 274 |
+
'medical': ['medical', 'health', 'treatment', 'reimbursement', 'cghs', 'hospital'],
|
| 275 |
+
'transfer': ['transfer', 'posting', 'deputation', 'cadre'],
|
| 276 |
+
'promotion': ['promotion', 'seniority', 'grade', 'advancement', 'career progression'],
|
| 277 |
+
'service': ['service', 'conduct', 'discipline', 'rules', 'regulation']
|
| 278 |
+
}
|
| 279 |
|
| 280 |
+
# Determine query category
|
| 281 |
+
query_category = None
|
| 282 |
+
for category, keywords in query_categories.items():
|
| 283 |
+
if any(keyword in query_lower for keyword in keywords):
|
| 284 |
+
query_category = category
|
| 285 |
+
break
|
| 286 |
|
| 287 |
+
if query_category:
|
| 288 |
+
# Boost content matching the query category
|
| 289 |
+
category_keywords = query_categories[query_category]
|
| 290 |
+
if any(keyword in content for keyword in category_keywords):
|
| 291 |
relevance_score += 1.0
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
|
| 293 |
+
# Add specific boosts for each category
|
| 294 |
+
if query_category == 'pension':
|
| 295 |
+
if any(keyword in content for keyword in ['pension rules', 'retirement benefit', 'pension scheme']):
|
| 296 |
+
relevance_score += 0.8
|
| 297 |
+
elif query_category == 'leave':
|
| 298 |
+
if any(keyword in content for keyword in ['leave rules', 'leave entitlement', 'leave policy']):
|
| 299 |
+
relevance_score += 0.8
|
| 300 |
+
elif query_category == 'allowance':
|
| 301 |
+
if any(keyword in content for keyword in ['allowance rates', 'da revision', 'allowance rules']):
|
| 302 |
+
relevance_score += 0.8
|
| 303 |
+
elif query_category == 'procurement':
|
| 304 |
+
if any(keyword in content for keyword in ['tender process', 'procurement rules', 'bidding']):
|
| 305 |
+
relevance_score += 0.8
|
| 306 |
+
elif query_category == 'medical':
|
| 307 |
+
if any(keyword in content for keyword in ['medical rules', 'reimbursement policy', 'cghs']):
|
| 308 |
+
relevance_score += 0.8
|
| 309 |
+
elif query_category == 'transfer':
|
| 310 |
+
if any(keyword in content for keyword in ['transfer policy', 'posting rules', 'cadre']):
|
| 311 |
+
relevance_score += 0.8
|
| 312 |
+
|
| 313 |
+
# Penalize irrelevant content for specific queries
|
| 314 |
+
other_categories = [cat for cat in query_categories.keys() if cat != query_category]
|
| 315 |
+
for other_cat in other_categories:
|
| 316 |
+
other_keywords = query_categories[other_cat]
|
| 317 |
+
if any(keyword in content for keyword in other_keywords[:3]): # Check top 3 keywords
|
| 318 |
+
relevance_score -= 0.7
|
| 319 |
|
| 320 |
if relevance_score > 0.3: # Only include relevant documents
|
| 321 |
# Add relevance score to document (create dict if needed)
|