ChAbhishek28 committed on
Commit
a678780
·
1 Parent(s): 60ce103

Implement comprehensive multi-category query handling - Add relevance scoring and query enhancement for all document types (pension, leave, allowance, procurement, medical, etc.)

Browse files
Files changed (1) hide show
  1. rag_service.py +77 -29
rag_service.py CHANGED
@@ -196,14 +196,35 @@ async def search_documents_async(query: str, limit: int = 5) -> List[Dict[str, A
196
  knowledge_bases = ["government_docs"] # Default
197
  query_lower = query.lower()
198
 
199
- # Enhance query for better relevance
200
  enhanced_query = query
201
- if "pension rules changes" in query_lower or "pension rule changes" in query_lower:
202
- enhanced_query = "pension rules modifications amendments updates changes revisions"
203
- elif "pension rules impact" in query_lower or "pension impact" in query_lower:
204
- enhanced_query = "pension rules impact analysis effects benefits retirement financial"
205
- elif "pension" in query_lower and any(word in query_lower for word in ["rules", "changes", "updates", "modifications", "impact"]):
206
- enhanced_query = f"{query} pension rules retirement benefits"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
  logger.info(f"🔍 Enhanced query: '{enhanced_query}' (original: '{query}')")
209
 
@@ -242,32 +263,59 @@ async def search_documents_async(query: str, limit: int = 5) -> List[Dict[str, A
242
  title = ''
243
 
244
  # Calculate relevance score based on query intent
245
- relevance_score = 0
 
 
 
 
 
 
 
 
 
 
 
 
246
 
247
- # Check if query is pension-related
248
- pension_keywords = ['pension', 'retirement', 'gratuity', 'provident fund', 'gpf', 'cpf']
249
- is_pension_query = any(keyword in query_lower for keyword in pension_keywords)
 
 
 
250
 
251
- if is_pension_query:
252
- # Boost pension-related content
253
- if any(keyword in content for keyword in ['pension', 'retirement benefit', 'gratuity', 'provident fund', 'superannuation']):
 
254
  relevance_score += 1.0
255
- if any(keyword in content for keyword in ['pension rules', 'pension regulation', 'pension scheme', 'retirement']):
256
- relevance_score += 0.8
257
- if any(keyword in content for keyword in ['changes', 'modifications', 'amendments', 'updates', 'revised', 'impact']):
258
- relevance_score += 0.3
259
 
260
- # Heavily penalize non-pension documents for pension queries
261
- if any(keyword in content for keyword in ['leave rules', 'casual leave', 'earned leave', 'medical leave']):
262
- relevance_score -= 1.5
263
- if any(keyword in content for keyword in ['procurement', 'tender', 'bidding', 'contract']):
264
- relevance_score -= 1.5
265
- if any(keyword in content for keyword in ['transfer policy', 'posting policy', 'transfer rules']):
266
- relevance_score -= 1.5
267
-
268
- else:
269
- # For non-pension queries, use default scoring
270
- relevance_score = getattr(doc, 'score', 0.5)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
  if relevance_score > 0.3: # Only include relevant documents
273
  # Add relevance score to document (create dict if needed)
 
196
  knowledge_bases = ["government_docs"] # Default
197
  query_lower = query.lower()
198
 
199
+ # Enhance query for better relevance based on category
200
  enhanced_query = query
201
+
202
+ # Pension queries
203
+ if "pension" in query_lower:
204
+ if any(word in query_lower for word in ["changes", "impact", "rules"]):
205
+ enhanced_query = f"{query} pension rules retirement benefits modifications"
206
+ elif "calculation" in query_lower or "formula" in query_lower:
207
+ enhanced_query = f"{query} pension calculation retirement benefits formula"
208
+
209
+ # Leave queries
210
+ elif any(word in query_lower for word in ["leave", "casual", "earned"]):
211
+ enhanced_query = f"{query} leave rules entitlement policy"
212
+
213
+ # Allowance queries
214
+ elif any(word in query_lower for word in ["allowance", "da", "dearness"]):
215
+ enhanced_query = f"{query} allowance rates dearness increment"
216
+
217
+ # Procurement queries
218
+ elif any(word in query_lower for word in ["tender", "procurement", "bid"]):
219
+ enhanced_query = f"{query} procurement tender bidding process"
220
+
221
+ # Medical queries
222
+ elif any(word in query_lower for word in ["medical", "health", "reimbursement"]):
223
+ enhanced_query = f"{query} medical health reimbursement cghs"
224
+
225
+ # Transfer queries
226
+ elif any(word in query_lower for word in ["transfer", "posting"]):
227
+ enhanced_query = f"{query} transfer posting policy rules"
228
 
229
  logger.info(f"🔍 Enhanced query: '{enhanced_query}' (original: '{query}')")
230
 
 
263
  title = ''
264
 
265
  # Calculate relevance score based on query intent
266
+ relevance_score = getattr(doc, 'score', 0.5) # Base score
267
+
268
+ # Define query categories and their keywords
269
+ query_categories = {
270
+ 'pension': ['pension', 'retirement', 'gratuity', 'provident fund', 'gpf', 'cpf', 'superannuation'],
271
+ 'leave': ['leave', 'casual leave', 'earned leave', 'medical leave', 'maternity', 'paternity'],
272
+ 'allowance': ['allowance', 'dearness allowance', 'da', 'hra', 'house rent', 'travel allowance'],
273
+ 'procurement': ['procurement', 'tender', 'bid', 'contract', 'purchase', 'vendor'],
274
+ 'medical': ['medical', 'health', 'treatment', 'reimbursement', 'cghs', 'hospital'],
275
+ 'transfer': ['transfer', 'posting', 'deputation', 'cadre'],
276
+ 'promotion': ['promotion', 'seniority', 'grade', 'advancement', 'career progression'],
277
+ 'service': ['service', 'conduct', 'discipline', 'rules', 'regulation']
278
+ }
279
 
280
+ # Determine query category
281
+ query_category = None
282
+ for category, keywords in query_categories.items():
283
+ if any(keyword in query_lower for keyword in keywords):
284
+ query_category = category
285
+ break
286
 
287
+ if query_category:
288
+ # Boost content matching the query category
289
+ category_keywords = query_categories[query_category]
290
+ if any(keyword in content for keyword in category_keywords):
291
  relevance_score += 1.0
 
 
 
 
292
 
293
+ # Add specific boosts for each category
294
+ if query_category == 'pension':
295
+ if any(keyword in content for keyword in ['pension rules', 'retirement benefit', 'pension scheme']):
296
+ relevance_score += 0.8
297
+ elif query_category == 'leave':
298
+ if any(keyword in content for keyword in ['leave rules', 'leave entitlement', 'leave policy']):
299
+ relevance_score += 0.8
300
+ elif query_category == 'allowance':
301
+ if any(keyword in content for keyword in ['allowance rates', 'da revision', 'allowance rules']):
302
+ relevance_score += 0.8
303
+ elif query_category == 'procurement':
304
+ if any(keyword in content for keyword in ['tender process', 'procurement rules', 'bidding']):
305
+ relevance_score += 0.8
306
+ elif query_category == 'medical':
307
+ if any(keyword in content for keyword in ['medical rules', 'reimbursement policy', 'cghs']):
308
+ relevance_score += 0.8
309
+ elif query_category == 'transfer':
310
+ if any(keyword in content for keyword in ['transfer policy', 'posting rules', 'cadre']):
311
+ relevance_score += 0.8
312
+
313
+ # Penalize irrelevant content for specific queries
314
+ other_categories = [cat for cat in query_categories.keys() if cat != query_category]
315
+ for other_cat in other_categories:
316
+ other_keywords = query_categories[other_cat]
317
+ if any(keyword in content for keyword in other_keywords[:3]): # Check top 3 keywords
318
+ relevance_score -= 0.7
319
 
320
  if relevance_score > 0.3: # Only include relevant documents
321
  # Add relevance score to document (create dict if needed)