ChAbhishek28 committed on
Commit
e0ad8eb
·
1 Parent(s): a678780

Implement a comprehensive query-document matching system: add category-based relevance scoring for all query types (pension, leave, allowance, procurement, medical, etc.) and apply heavy penalties to mismatched content to prevent wrong results.

Browse files
Files changed (1) hide show
  1. rag_service.py +29 -35
rag_service.py CHANGED
@@ -269,53 +269,47 @@ async def search_documents_async(query: str, limit: int = 5) -> List[Dict[str, A
269
  query_categories = {
270
  'pension': ['pension', 'retirement', 'gratuity', 'provident fund', 'gpf', 'cpf', 'superannuation'],
271
  'leave': ['leave', 'casual leave', 'earned leave', 'medical leave', 'maternity', 'paternity'],
272
- 'allowance': ['allowance', 'dearness allowance', 'da', 'hra', 'house rent', 'travel allowance'],
273
- 'procurement': ['procurement', 'tender', 'bid', 'contract', 'purchase', 'vendor'],
274
  'medical': ['medical', 'health', 'treatment', 'reimbursement', 'cghs', 'hospital'],
275
  'transfer': ['transfer', 'posting', 'deputation', 'cadre'],
276
  'promotion': ['promotion', 'seniority', 'grade', 'advancement', 'career progression'],
277
  'service': ['service', 'conduct', 'discipline', 'rules', 'regulation']
278
  }
279
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  # Determine query category
281
- query_category = None
282
  for category, keywords in query_categories.items():
283
  if any(keyword in query_lower for keyword in keywords):
284
- query_category = category
285
  break
286
 
287
- if query_category:
288
- # Boost content matching the query category
289
- category_keywords = query_categories[query_category]
290
- if any(keyword in content for keyword in category_keywords):
291
- relevance_score += 1.0
292
-
293
- # Add specific boosts for each category
294
- if query_category == 'pension':
295
- if any(keyword in content for keyword in ['pension rules', 'retirement benefit', 'pension scheme']):
296
- relevance_score += 0.8
297
- elif query_category == 'leave':
298
- if any(keyword in content for keyword in ['leave rules', 'leave entitlement', 'leave policy']):
299
- relevance_score += 0.8
300
- elif query_category == 'allowance':
301
- if any(keyword in content for keyword in ['allowance rates', 'da revision', 'allowance rules']):
302
- relevance_score += 0.8
303
- elif query_category == 'procurement':
304
- if any(keyword in content for keyword in ['tender process', 'procurement rules', 'bidding']):
305
- relevance_score += 0.8
306
- elif query_category == 'medical':
307
- if any(keyword in content for keyword in ['medical rules', 'reimbursement policy', 'cghs']):
308
- relevance_score += 0.8
309
- elif query_category == 'transfer':
310
- if any(keyword in content for keyword in ['transfer policy', 'posting rules', 'cadre']):
311
- relevance_score += 0.8
312
 
313
- # Penalize irrelevant content for specific queries
314
- other_categories = [cat for cat in query_categories.keys() if cat != query_category]
315
- for other_cat in other_categories:
316
- other_keywords = query_categories[other_cat]
317
- if any(keyword in content for keyword in other_keywords[:3]): # Check top 3 keywords
318
- relevance_score -= 0.7
 
319
 
320
  if relevance_score > 0.3: # Only include relevant documents
321
  # Add relevance score to document (create dict if needed)
 
269
  query_categories = {
270
  'pension': ['pension', 'retirement', 'gratuity', 'provident fund', 'gpf', 'cpf', 'superannuation'],
271
  'leave': ['leave', 'casual leave', 'earned leave', 'medical leave', 'maternity', 'paternity'],
272
+ 'allowance': ['allowance', 'dearness allowance', 'da', 'hra', 'house rent', 'travel allowance', 'increment'],
273
+ 'procurement': ['procurement', 'tender', 'bid', 'contract', 'purchase', 'vendor', 'gem'],
274
  'medical': ['medical', 'health', 'treatment', 'reimbursement', 'cghs', 'hospital'],
275
  'transfer': ['transfer', 'posting', 'deputation', 'cadre'],
276
  'promotion': ['promotion', 'seniority', 'grade', 'advancement', 'career progression'],
277
  'service': ['service', 'conduct', 'discipline', 'rules', 'regulation']
278
  }
279
 
280
+ # Content categories - what each document type contains
281
+ content_categories = {
282
+ 'pension': ['pension', 'retirement benefit', 'gratuity', 'provident fund', 'superannuation'],
283
+ 'leave': ['leave rules', 'casual leave', 'earned leave', 'medical leave', 'maternity leave'],
284
+ 'allowance': ['dearness allowance', 'house rent allowance', 'travel allowance', 'da', 'hra', 'increment'],
285
+ 'procurement': ['procurement', 'tender', 'bidding', 'contract', 'vendor', 'gem', 'purchase'],
286
+ 'medical': ['medical', 'health', 'cghs', 'reimbursement', 'treatment'],
287
+ 'transfer': ['transfer', 'posting', 'deputation', 'cadre'],
288
+ 'promotion': ['promotion', 'seniority', 'grade pay', 'advancement'],
289
+ 'service': ['service rules', 'conduct', 'discipline', 'misconduct']
290
+ }
291
+
292
  # Determine query category
293
+ detected_query_category = None
294
  for category, keywords in query_categories.items():
295
  if any(keyword in query_lower for keyword in keywords):
296
+ detected_query_category = category
297
  break
298
 
299
+ # If query category detected, apply targeted scoring
300
+ if detected_query_category:
301
+ # Boost score if document content matches query category
302
+ matching_content_keywords = content_categories.get(detected_query_category, [])
303
+ if any(keyword in content for keyword in matching_content_keywords):
304
+ relevance_score += 1.5 # Strong boost for matching content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
+ # Penalize documents from different categories
307
+ for other_category, other_keywords in content_categories.items():
308
+ if other_category != detected_query_category:
309
+ if any(keyword in content for keyword in other_keywords):
310
+ relevance_score -= 1.2 # Heavy penalty for non-matching content
311
+
312
+ logger.debug(f"Query category: {detected_query_category}, Relevance: {relevance_score:.2f} for content: {content[:50]}...")
313
 
314
  if relevance_score > 0.3: # Only include relevant documents
315
  # Add relevance score to document (create dict if needed)