# contact_search_service.py
"""
Contact search service with intelligent matching:
- Name-based search (exact and fuzzy matching)
- Division-based search
- Combined search (name + division)
- Confidence scoring
"""

import logging
import re
from collections import Counter
from difflib import SequenceMatcher
from typing import List, Dict, Optional, Tuple

from contacts_data import (
    get_all_contacts,
    get_contacts_by_division,
    get_contact_by_name
)
from name_extraction_service import NameExtractor
from embedding_service import EmbeddingService

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ContactSearchService:
    """
    Service for searching contacts with intelligent matching.

    Features:
    - Exact name matching (100% confidence)
    - Fuzzy name matching (partial names, typos)
    - Division-based matching
    - Combined search (name + division)
    - Multi-language support (English and Arabic)
    """

    # Top division confidence at/above which the query is treated as having
    # explicitly asked for a division.  Per the original tuning notes this
    # avoids false division matches from words like "Find" (~0.56) while still
    # catching abbreviations like "App Dev" (~0.59).
    _STRONG_DIVISION_THRESHOLD = 0.58

    # Multiplier applied to a name match that is outside every requested
    # division when the division request is strong (i.e. a 70% penalty).
    _WRONG_DIVISION_FACTOR = 0.3

    # Boost added when a contact matches on both name and division.
    _COMBINED_BOOST = 0.15

    # Fuzzy similarity is mapped linearly onto a confidence:
    # confidence = base + similarity * scale.
    _FUZZY_CONFIDENCE_BASE = 0.5
    _FUZZY_CONFIDENCE_SCALE = 0.45

    # An English candidate whose first letter differs from the query must
    # reach at least this similarity to be considered.
    _FIRST_LETTER_OVERRIDE_SIMILARITY = 0.85

    # Contact fields compared against the query during fuzzy matching.
    _NAME_FIELDS = (
        "full_name_en",
        "full_name_ar",
        "first_name_en",
        "first_name_ar",
        "last_name_en",
        "last_name_ar",
    )

    def __init__(
        self,
        name_extractor: NameExtractor,
        embedding_service: EmbeddingService
    ):
        """
        Initialize the contact search service.

        Args:
            name_extractor: NameExtractor service for extracting names from queries
            embedding_service: EmbeddingService for division matching
        """
        self.name_extractor = name_extractor
        self.embedding_service = embedding_service
        self.all_contacts = get_all_contacts()

        logger.info(f"ContactSearchService initialized with {len(self.all_contacts)} contacts")

    def search_contacts(
        self,
        query: str,
        top_k: int = 10,
        min_confidence: float = 0.3
    ) -> List[Dict]:
        """
        Search for contacts based on query.

        Process:
        1. Extract names from query
        2. Find matching divisions
        3. Match contacts by:
           - Exact name match (if name found) → confidence = 1.0
           - Fuzzy name match → confidence based on similarity
           - Division match → confidence from embedding service
           - Combined match (name + division) → boosted confidence
        4. Filter by ``min_confidence``, sort by confidence (descending) and
           keep the first ``top_k`` results

        Contacts with equal confidence keep a deterministic order: name hits
        in discovery order first, then division hits.

        Args:
            query: Search query (English or Arabic)
            top_k: Maximum number of results to return (values below 0 are
                treated as 0)
            min_confidence: Minimum confidence threshold (0.0-1.0)

        Returns:
            List of contact dict copies, each extended with ``confidence`` and
            ``match_reason`` plus, depending on the match, ``name_confidence``,
            ``division_confidence`` and ``requested_division``.
        """
        logger.info(f"Searching contacts for query: '{query}'")

        # Step 1: Extract names from query
        extracted_names = self.name_extractor.extract_names(query)
        logger.info(f"Extracted names: {extracted_names}")

        # Step 2: Find matching divisions
        division_matches = self.embedding_service.find_division(query, top_k=3)
        logger.info(f"Found {len(division_matches)} division matches")

        # Step 3: Match contacts by name, by division, then merge both views
        name_matches = self._collect_name_matches(extracted_names or [])
        division_hits = self._collect_division_matches(division_matches)
        matched_contacts = self._combine_matches(
            name_matches, division_hits, division_matches
        )

        # Step 4: Filter by minimum confidence
        matched_contacts = [
            c for c in matched_contacts
            if c["confidence"] >= min_confidence
        ]

        # Step 5: Sort by confidence (descending); the sort is stable, so ties
        # keep the deterministic order produced by _combine_matches
        matched_contacts.sort(key=lambda x: x["confidence"], reverse=True)

        # Step 6: Limit to top_k (a negative slice bound would drop results
        # from the tail instead of limiting, so clamp it)
        matched_contacts = matched_contacts[:max(top_k, 0)]

        logger.info(f"✓ Returning {len(matched_contacts)} matched contacts")

        return matched_contacts

    def _collect_name_matches(self, extracted_names: List[str]) -> Dict:
        """Map contact id -> best name match (exact first, otherwise fuzzy)."""
        name_matches = {}

        for name in extracted_names:
            # Try exact match first
            exact_match = get_contact_by_name(name)

            if exact_match:
                name_matches[exact_match["id"]] = {
                    "contact": exact_match,
                    "confidence": 1.0,
                    "similarity": 1.0,
                    "match_type": "exact"
                }
                logger.info(f"✓ Exact name match: {exact_match['full_name_en']}")
                continue

            # Fuzzy name matching
            for contact, similarity in self._fuzzy_name_search(name, top_k=10):
                contact_id = contact["id"]
                previous = name_matches.get(contact_id)

                # Only keep best match for each contact
                if previous is not None and similarity <= previous["similarity"]:
                    continue

                name_matches[contact_id] = {
                    "contact": contact,
                    "confidence": round(
                        self._FUZZY_CONFIDENCE_BASE
                        + (similarity * self._FUZZY_CONFIDENCE_SCALE),
                        2
                    ),
                    "similarity": round(similarity, 2),
                    "match_type": "fuzzy"
                }
                logger.info(
                    f"Fuzzy name match: {contact['full_name_en']} "
                    f"(similarity: {similarity:.2f})"
                )

        return name_matches

    def _collect_division_matches(self, division_matches) -> Dict:
        """Map contact id -> highest-confidence division match containing it."""
        division_hits = {}

        for div_match in division_matches:
            division = div_match.division
            division_confidence = div_match.confidence

            # Get contacts in this division
            for contact in get_contacts_by_division(division):
                contact_id = contact["id"]
                previous = division_hits.get(contact_id)

                # Only keep best division match for each contact
                if previous is None or division_confidence > previous["confidence"]:
                    division_hits[contact_id] = {
                        "contact": contact,
                        "confidence": division_confidence,
                        "division": division
                    }

        return division_hits

    def _combine_matches(
        self,
        name_matches: Dict,
        division_hits: Dict,
        division_matches
    ) -> List[Dict]:
        """
        Merge name and division matches into one result entry per contact.

        Priority when BOTH name and division are specified:
        1. Name + Correct Division = HIGHEST (both match)
        2. Correct Division only = HIGH (division is most important)
        3. Name + Wrong Division = LOW (penalize wrong division)
        """
        requested_divisions = [dm.division for dm in division_matches]

        # Loop-invariant: only the first division match decides whether the
        # query is treated as having asked for a division.
        # NOTE(review): assumes find_division returns best match first - confirm.
        has_strong_division_match = (
            bool(division_matches)
            and division_matches[0].confidence >= self._STRONG_DIVISION_THRESHOLD
        )

        # Insertion-ordered union of ids (name hits first).  A set union would
        # make the order of equal-confidence results depend on hashing.
        ordered_ids = dict.fromkeys([*name_matches, *division_hits])

        results = []
        for contact_id in ordered_ids:
            name_data = name_matches.get(contact_id)
            div_data = division_hits.get(contact_id)

            if name_data is not None and div_data is not None:
                # BOTH name and division match - BEST CASE
                contact = name_data["contact"]

                # Take MAX of the two confidences and add a boost (capped at
                # 1.0) so that name + division ranks at least as high as
                # division alone
                combined_confidence = min(
                    1.0,
                    max(name_data["confidence"], div_data["confidence"])
                    + self._COMBINED_BOOST
                )

                contact_result = contact.copy()
                contact_result["confidence"] = round(combined_confidence, 2)
                contact_result["match_reason"] = "name_and_division_match"
                contact_result["name_confidence"] = name_data["confidence"]
                contact_result["division_confidence"] = div_data["confidence"]
                results.append(contact_result)

                logger.info(
                    f"✓ BOTH match: {contact['full_name_en']} in {div_data['division']} "
                    f"(final confidence: {contact_result['confidence']})"
                )

            elif div_data is not None:
                # Division match only (no name specified, or name didn't match this person)
                contact_result = div_data["contact"].copy()
                contact_result["confidence"] = div_data["confidence"]
                contact_result["match_reason"] = "division_match"
                contact_result["division_confidence"] = div_data["confidence"]
                results.append(contact_result)

            elif has_strong_division_match:
                # Name match but WRONG division: heavy penalty because a
                # division was requested with reasonable confidence
                contact = name_data["contact"]
                penalized_confidence = (
                    name_data["confidence"] * self._WRONG_DIVISION_FACTOR
                )

                contact_result = contact.copy()
                contact_result["confidence"] = round(penalized_confidence, 2)
                contact_result["match_reason"] = "name_match_wrong_division"
                contact_result["name_confidence"] = name_data["confidence"]
                contact_result["requested_division"] = ", ".join(requested_divisions[:2])
                results.append(contact_result)

                logger.info(
                    f"Name match with WRONG division: {contact['full_name_en']} "
                    f"in {contact['division']} (wanted: {requested_divisions[0]}, "
                    f"confidence: {contact_result['confidence']})"
                )

            else:
                # No division specified OR weak division match - use name confidence as-is
                contact_result = name_data["contact"].copy()
                contact_result["confidence"] = name_data["confidence"]
                contact_result["match_reason"] = f"{name_data['match_type']}_name_match"
                contact_result["name_confidence"] = name_data["confidence"]
                results.append(contact_result)

        return results

    def _fuzzy_name_search(
        self,
        name: str,
        top_k: int = 5,
        min_similarity: float = 0.75  # Increased from 0.6 to avoid false matches
    ) -> List[Tuple[Dict, float]]:
        """
        Fuzzy name matching using string similarity with stricter rules.

        Each contact is scored by its best-matching name field (full, first
        and last name, English and Arabic).  Missing or empty fields are
        skipped.

        Args:
            name: Name to search for
            top_k: Number of top matches to return
            min_similarity: Minimum similarity threshold (0.0-1.0)

        Returns:
            List of (contact, similarity_score) tuples, best match first
        """
        # Normalize name for comparison
        name_normalized = self._normalize_name(name)
        if not name_normalized:
            # Nothing to compare against
            return []

        # First letter is used to reject unlikely English candidates
        name_first_letter = name_normalized[0]

        matches = []
        for contact in self.all_contacts:
            best_similarity = 0.0

            # Check against both Arabic and English names
            for field in self._NAME_FIELDS:
                candidate_name = self._normalize_name(contact.get(field))
                if not candidate_name:
                    continue

                similarity = self._string_similarity(name_normalized, candidate_name)

                # Stricter rule for English names: require the same first
                # letter OR very high similarity
                if (
                    field.endswith("_en")
                    and candidate_name[0] != name_first_letter
                    and similarity < self._FIRST_LETTER_OVERRIDE_SIMILARITY
                ):
                    continue

                best_similarity = max(best_similarity, similarity)

            if best_similarity >= min_similarity:
                matches.append((contact, best_similarity))

        # Sort by similarity (descending)
        matches.sort(key=lambda x: x[1], reverse=True)

        return matches[:top_k]

    def _normalize_name(self, name: Optional[str]) -> str:
        """Normalize name for comparison (lowercase, remove extra spaces).

        ``None`` or an empty value normalizes to an empty string.
        """
        if not name:
            return ""
        return re.sub(r'\s+', ' ', name.strip().lower())

    def _string_similarity(self, s1: str, s2: str) -> float:
        """
        Calculate string similarity using SequenceMatcher.

        Returns:
            Similarity score between 0.0 and 1.0
        """
        return SequenceMatcher(None, s1, s2).ratio()

    def get_contact_stats(self) -> Dict:
        """Get statistics about the contact database"""
        dept_counts = Counter(contact["department"] for contact in self.all_contacts)
        div_counts = Counter(contact["division"] for contact in self.all_contacts)

        return {
            "total_contacts": len(self.all_contacts),
            "departments": len(dept_counts),
            "divisions": len(div_counts),
            "contacts_by_department": dict(dept_counts),
            "contacts_by_division": dict(div_counts),
        }


if __name__ == "__main__":
    # Test the contact search service
    print("Initializing services...")
    name_extractor = NameExtractor()
    embedding_service = EmbeddingService()
    search_service = ContactSearchService(name_extractor, embedding_service)

    print("\nContact Database Stats:")
    stats = search_service.get_contact_stats()
    print(f"Total contacts: {stats['total_contacts']}")
    print(f"Departments: {stats['departments']}")
    print(f"Divisions: {stats['divisions']}")

    # Test queries
    test_queries = [
        "Find Ahmed in IT",
        "I need to talk to someone in HR",
        "محمد في المالية",  # "Mohammed in Finance" in Arabic
        "Finance accounting help",
    ]

    print("\n" + "="*80)
    print("Testing Contact Search")
    print("="*80)

    for query in test_queries:
        print(f"\nQuery: '{query}'")
        print("-" * 80)

        results = search_service.search_contacts(query, top_k=3)

        if results:
            for i, contact in enumerate(results, 1):
                print(f"{i}. {contact['full_name_en']} ({contact['full_name_ar']})")
                print(f" {contact['title_en']} - {contact['division']}")
                print(f" {contact['email']} | Ext: {contact['extension']}")
                print(f" Confidence: {contact['confidence']:.2f} | Reason: {contact['match_reason']}")
        else:
            print("No matches found.")