Spaces:
Sleeping
Sleeping
Commit
·
e0ad8eb
1
Parent(s):
a678780
Implement comprehensive query-document matching system
- Add category-based relevance scoring for all query types (pension, leave, allowance, procurement, medical, etc.)
- Apply heavy penalties for mismatched content to prevent wrong results
Browse files- rag_service.py +29 -35
rag_service.py
CHANGED
|
@@ -269,53 +269,47 @@ async def search_documents_async(query: str, limit: int = 5) -> List[Dict[str, A
|
|
| 269 |
query_categories = {
|
| 270 |
'pension': ['pension', 'retirement', 'gratuity', 'provident fund', 'gpf', 'cpf', 'superannuation'],
|
| 271 |
'leave': ['leave', 'casual leave', 'earned leave', 'medical leave', 'maternity', 'paternity'],
|
| 272 |
-
'allowance': ['allowance', 'dearness allowance', 'da', 'hra', 'house rent', 'travel allowance'],
|
| 273 |
-
'procurement': ['procurement', 'tender', 'bid', 'contract', 'purchase', 'vendor'],
|
| 274 |
'medical': ['medical', 'health', 'treatment', 'reimbursement', 'cghs', 'hospital'],
|
| 275 |
'transfer': ['transfer', 'posting', 'deputation', 'cadre'],
|
| 276 |
'promotion': ['promotion', 'seniority', 'grade', 'advancement', 'career progression'],
|
| 277 |
'service': ['service', 'conduct', 'discipline', 'rules', 'regulation']
|
| 278 |
}
|
| 279 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
# Determine query category
|
| 281 |
-
|
| 282 |
for category, keywords in query_categories.items():
|
| 283 |
if any(keyword in query_lower for keyword in keywords):
|
| 284 |
-
|
| 285 |
break
|
| 286 |
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
# Add specific boosts for each category
|
| 294 |
-
if query_category == 'pension':
|
| 295 |
-
if any(keyword in content for keyword in ['pension rules', 'retirement benefit', 'pension scheme']):
|
| 296 |
-
relevance_score += 0.8
|
| 297 |
-
elif query_category == 'leave':
|
| 298 |
-
if any(keyword in content for keyword in ['leave rules', 'leave entitlement', 'leave policy']):
|
| 299 |
-
relevance_score += 0.8
|
| 300 |
-
elif query_category == 'allowance':
|
| 301 |
-
if any(keyword in content for keyword in ['allowance rates', 'da revision', 'allowance rules']):
|
| 302 |
-
relevance_score += 0.8
|
| 303 |
-
elif query_category == 'procurement':
|
| 304 |
-
if any(keyword in content for keyword in ['tender process', 'procurement rules', 'bidding']):
|
| 305 |
-
relevance_score += 0.8
|
| 306 |
-
elif query_category == 'medical':
|
| 307 |
-
if any(keyword in content for keyword in ['medical rules', 'reimbursement policy', 'cghs']):
|
| 308 |
-
relevance_score += 0.8
|
| 309 |
-
elif query_category == 'transfer':
|
| 310 |
-
if any(keyword in content for keyword in ['transfer policy', 'posting rules', 'cadre']):
|
| 311 |
-
relevance_score += 0.8
|
| 312 |
|
| 313 |
-
# Penalize
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
|
|
|
| 319 |
|
| 320 |
if relevance_score > 0.3: # Only include relevant documents
|
| 321 |
# Add relevance score to document (create dict if needed)
|
|
|
|
| 269 |
query_categories = {
|
| 270 |
'pension': ['pension', 'retirement', 'gratuity', 'provident fund', 'gpf', 'cpf', 'superannuation'],
|
| 271 |
'leave': ['leave', 'casual leave', 'earned leave', 'medical leave', 'maternity', 'paternity'],
|
| 272 |
+
'allowance': ['allowance', 'dearness allowance', 'da', 'hra', 'house rent', 'travel allowance', 'increment'],
|
| 273 |
+
'procurement': ['procurement', 'tender', 'bid', 'contract', 'purchase', 'vendor', 'gem'],
|
| 274 |
'medical': ['medical', 'health', 'treatment', 'reimbursement', 'cghs', 'hospital'],
|
| 275 |
'transfer': ['transfer', 'posting', 'deputation', 'cadre'],
|
| 276 |
'promotion': ['promotion', 'seniority', 'grade', 'advancement', 'career progression'],
|
| 277 |
'service': ['service', 'conduct', 'discipline', 'rules', 'regulation']
|
| 278 |
}
|
| 279 |
|
| 280 |
+
# Content categories - what each document type contains
|
| 281 |
+
content_categories = {
|
| 282 |
+
'pension': ['pension', 'retirement benefit', 'gratuity', 'provident fund', 'superannuation'],
|
| 283 |
+
'leave': ['leave rules', 'casual leave', 'earned leave', 'medical leave', 'maternity leave'],
|
| 284 |
+
'allowance': ['dearness allowance', 'house rent allowance', 'travel allowance', 'da', 'hra', 'increment'],
|
| 285 |
+
'procurement': ['procurement', 'tender', 'bidding', 'contract', 'vendor', 'gem', 'purchase'],
|
| 286 |
+
'medical': ['medical', 'health', 'cghs', 'reimbursement', 'treatment'],
|
| 287 |
+
'transfer': ['transfer', 'posting', 'deputation', 'cadre'],
|
| 288 |
+
'promotion': ['promotion', 'seniority', 'grade pay', 'advancement'],
|
| 289 |
+
'service': ['service rules', 'conduct', 'discipline', 'misconduct']
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
# Determine query category
|
| 293 |
+
detected_query_category = None
|
| 294 |
for category, keywords in query_categories.items():
|
| 295 |
if any(keyword in query_lower for keyword in keywords):
|
| 296 |
+
detected_query_category = category
|
| 297 |
break
|
| 298 |
|
| 299 |
+
# If query category detected, apply targeted scoring
|
| 300 |
+
if detected_query_category:
|
| 301 |
+
# Boost score if document content matches query category
|
| 302 |
+
matching_content_keywords = content_categories.get(detected_query_category, [])
|
| 303 |
+
if any(keyword in content for keyword in matching_content_keywords):
|
| 304 |
+
relevance_score += 1.5 # Strong boost for matching content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
|
| 306 |
+
# Penalize documents from different categories
|
| 307 |
+
for other_category, other_keywords in content_categories.items():
|
| 308 |
+
if other_category != detected_query_category:
|
| 309 |
+
if any(keyword in content for keyword in other_keywords):
|
| 310 |
+
relevance_score -= 1.2 # Heavy penalty for non-matching content
|
| 311 |
+
|
| 312 |
+
logger.debug(f"Query category: {detected_query_category}, Relevance: {relevance_score:.2f} for content: {content[:50]}...")
|
| 313 |
|
| 314 |
if relevance_score > 0.3: # Only include relevant documents
|
| 315 |
# Add relevance score to document (create dict if needed)
|