# Spaces: MEssamOrg/ContactSearchAssistant (Sleeping)
# File size: 17,093 Bytes
# Ref: 8ef276c

# contact_search_service.py
"""
Contact search service with intelligent matching:
- Name-based search (exact and fuzzy matching)
- Division-based search
- Combined search (name + division)
- Confidence scoring
"""

import logging
from typing import List, Dict, Optional, Tuple
from difflib import SequenceMatcher
import re

from contacts_data import (
    get_all_contacts,
    get_contacts_by_division,
    get_contact_by_name
)
from name_extraction_service import NameExtractor
from embedding_service import EmbeddingService

# Set up logging
# NOTE(review): basicConfig() runs at import time, so importing this module
# configures the root logger at INFO level when the process has not installed
# any logging handlers yet. Presumably intended for this app - confirm before
# reusing the module as a library.
logging.basicConfig(level=logging.INFO)
# Module-level logger (named after this module) used by ContactSearchService.
logger = logging.getLogger(__name__)


class ContactSearchService:
    """
    Service for searching contacts with intelligent matching.

    Features:
    - Exact name matching (100% confidence)
    - Fuzzy name matching (partial names, typos)
    - Division-based matching
    - Combined search (name + division)
    - Multi-language support (English and Arabic)
    """

    def __init__(
        self,
        name_extractor: NameExtractor,
        embedding_service: EmbeddingService
    ):
        """
        Store the collaborating services and load the contact list.

        Args:
            name_extractor: Service whose extract_names() is used by
                search_contacts() to pull names out of a query
            embedding_service: Service whose find_division() is used by
                search_contacts() to map a query to divisions
        """
        # Loaded once here; fuzzy name search and the stats method read this list.
        self.all_contacts = get_all_contacts()

        self.embedding_service = embedding_service
        self.name_extractor = name_extractor

        logger.info(f"ContactSearchService initialized with {len(self.all_contacts)} contacts")

    def search_contacts(
        self,
        query: str,
        top_k: int = 10,
        min_confidence: float = 0.3
    ) -> List[Dict]:
        """
        Search for contacts based on query.

        Process:
        1. Extract names from the query (name extractor).
        2. Find up to three candidate divisions (embedding service).
        3. Score every contact reached by either route:
           - name AND division match → max(name, division) + 0.15, capped at 1.0
           - division match only → the division confidence as-is
           - name match only while the first division hit is strong (>= 0.58)
             → name confidence * 0.3 ("wrong division" penalty)
           - name match only otherwise → the name confidence as-is
           Name confidence is 1.0 for an exact match and
           0.5 + 0.45 * similarity for a fuzzy match.
        4. Drop results below min_confidence, sort by confidence (descending)
           and keep the first top_k.

        Results with equal confidence are returned in a stable order:
        name-matched contacts first (in the order they were found), then
        division-matched contacts.

        Args:
            query: Search query (English or Arabic)
            top_k: Maximum number of results to return
            min_confidence: Minimum confidence threshold (0.0-1.0)

        Returns:
            List of contact dict copies, each extended with "confidence" and
            "match_reason", plus "name_confidence", "division_confidence"
            and/or "requested_division" depending on how the contact matched.
        """
        # Scoring knobs, named so the rules below read without magic numbers.
        both_match_boost = 0.15
        wrong_division_factor = 0.3  # 70% penalty
        # Threshold helps avoid false division matches from words like "Find"
        # (which scores ~0.56) while still catching abbreviations like
        # "App Dev" (which scores ~0.59).
        strong_division_threshold = 0.58

        logger.info(f"Searching contacts for query: '{query}'")

        # Step 1: Extract names from query
        extracted_names = self.name_extractor.extract_names(query)
        logger.info(f"Extracted names: {extracted_names}")

        # Step 2: Find matching divisions
        division_matches = self.embedding_service.find_division(query, top_k=3)
        logger.info(f"Found {len(division_matches)} division matches")

        requested_divisions = [dm.division for dm in division_matches]
        # Loop-invariant, so computed once. Only the FIRST division hit is
        # inspected; this assumes find_division() returns its best hit first.
        has_strong_division_match = (
            bool(division_matches)
            and division_matches[0].confidence >= strong_division_threshold
        )

        # Step 3: Gather candidates by name and by division
        name_matches = (
            self._collect_name_matches(extracted_names) if extracted_names else {}
        )
        division_matches_dict = self._collect_division_matches(division_matches)

        # Iterate ids in insertion order rather than via a set: set order is
        # arbitrary, which made the order of equal-confidence results (and
        # which of them survive the top_k cut) unpredictable.
        candidate_ids = list(dict.fromkeys([*name_matches, *division_matches_dict]))

        matched_contacts = []
        for contact_id in candidate_ids:
            name_data = name_matches.get(contact_id)
            div_data = division_matches_dict.get(contact_id)

            if name_data is not None and div_data is not None:
                # BOTH name and division match - BEST CASE.
                # MAX of the two confidences plus a boost, so that
                # division + name always beats division alone.
                contact = name_data["contact"]
                combined_confidence = min(
                    1.0,
                    max(name_data["confidence"], div_data["confidence"]) + both_match_boost
                )

                contact_result = contact.copy()
                contact_result["confidence"] = round(combined_confidence, 2)
                contact_result["match_reason"] = "name_and_division_match"
                contact_result["name_confidence"] = name_data["confidence"]
                contact_result["division_confidence"] = div_data["confidence"]

                logger.info(
                    f"✓ BOTH match: {contact['full_name_en']} in {div_data['division']} "
                    f"(final confidence: {contact_result['confidence']})"
                )

            elif div_data is not None:
                # Division match only (no name specified, or name didn't match this person)
                contact_result = div_data["contact"].copy()
                contact_result["confidence"] = div_data["confidence"]
                contact_result["match_reason"] = "division_match"
                contact_result["division_confidence"] = div_data["confidence"]

            elif has_strong_division_match:
                # Name match, but the query confidently asked for another division:
                # keep the contact with a heavy penalty.
                contact = name_data["contact"]

                contact_result = contact.copy()
                contact_result["confidence"] = round(
                    name_data["confidence"] * wrong_division_factor, 2
                )
                contact_result["match_reason"] = "name_match_wrong_division"
                contact_result["name_confidence"] = name_data["confidence"]
                contact_result["requested_division"] = ", ".join(requested_divisions[:2])

                logger.info(
                    f"Name match with WRONG division: {contact['full_name_en']} "
                    f"in {contact['division']} (wanted: {requested_divisions[0]}, "
                    f"confidence: {contact_result['confidence']})"
                )

            else:
                # No division specified OR weak division match - use name confidence as-is
                contact_result = name_data["contact"].copy()
                contact_result["confidence"] = name_data["confidence"]
                contact_result["match_reason"] = f"{name_data['match_type']}_name_match"
                contact_result["name_confidence"] = name_data["confidence"]

            matched_contacts.append(contact_result)

        # Each id was visited exactly once above, so there are no duplicates to remove.

        # Step 4: Filter by minimum confidence
        matched_contacts = [
            c for c in matched_contacts if c["confidence"] >= min_confidence
        ]

        # Sort by confidence (descending); the sort is stable, so ties keep
        # the insertion order established above.
        matched_contacts.sort(key=lambda x: x["confidence"], reverse=True)

        # Limit to top_k
        matched_contacts = matched_contacts[:top_k]

        logger.info(f"✓ Returning {len(matched_contacts)} matched contacts")

        return matched_contacts

    def _collect_name_matches(self, extracted_names: List[str]) -> Dict:
        """
        Map contact id -> best name match for the given extracted names.

        An exact hit from get_contact_by_name() scores 1.0; otherwise the name
        is fuzzy-matched and each hit scores 0.5 + 0.45 * similarity.

        Args:
            extracted_names: Names pulled out of the query

        Returns:
            Dict keyed by contact id with "contact", "confidence",
            "similarity" and "match_type" ("exact" or "fuzzy") entries.
        """
        name_matches = {}
        for name in extracted_names:
            # Try exact match first
            exact_match = get_contact_by_name(name)
            if exact_match:
                name_matches[exact_match["id"]] = {
                    "contact": exact_match,
                    "confidence": 1.0,
                    "similarity": 1.0,
                    "match_type": "exact"
                }
                logger.info(f"✓ Exact name match: {exact_match['full_name_en']}")
                continue

            # Fuzzy name matching
            for contact, similarity in self._fuzzy_name_search(name, top_k=10):
                contact_id = contact["id"]
                previous = name_matches.get(contact_id)
                # Only keep the best match for each contact.
                if previous is not None and similarity <= previous["similarity"]:
                    continue
                name_matches[contact_id] = {
                    "contact": contact,
                    "confidence": round(0.5 + (similarity * 0.45), 2),
                    # Stored unrounded so a later, slightly worse match cannot
                    # win the comparison above against a rounded-down value.
                    "similarity": similarity,
                    "match_type": "fuzzy"
                }
                logger.info(
                    f"Fuzzy name match: {contact['full_name_en']} "
                    f"(similarity: {similarity:.2f})"
                )
        return name_matches

    def _collect_division_matches(self, division_matches) -> Dict:
        """
        Map contact id -> best division match.

        Args:
            division_matches: Division hits from the embedding service; each
                is read through its .division and .confidence attributes

        Returns:
            Dict keyed by contact id with "contact", "confidence" and
            "division" entries, keeping the highest-confidence division for
            a contact reachable through several hits.
        """
        best_by_contact = {}
        for div_match in division_matches:
            division = div_match.division
            division_confidence = div_match.confidence

            for contact in get_contacts_by_division(division):
                contact_id = contact["id"]
                previous = best_by_contact.get(contact_id)
                if previous is None or division_confidence > previous["confidence"]:
                    best_by_contact[contact_id] = {
                        "contact": contact,
                        "confidence": division_confidence,
                        "division": division
                    }
        return best_by_contact

    def _fuzzy_name_search(
        self,
        name: str,
        top_k: int = 5,
        min_similarity: float = 0.75  # Increased from 0.6 to avoid false matches
    ) -> List[Tuple[Dict, float]]:
        """
        Fuzzy name matching using string similarity with stricter rules.

        Args:
            name: Name to search for
            top_k: Number of top matches to return
            min_similarity: Minimum similarity threshold (0.0-1.0)

        Returns:
            List of (contact, similarity_score) tuples
        """
        matches = []

        # Normalize name for comparison
        name_normalized = self._normalize_name(name)

        # Get first letter for initial matching (helps avoid false positives)
        name_first_letter = name_normalized[0] if name_normalized else ''

        for contact in self.all_contacts:
            # Check against both Arabic and English names
            full_name_en_normalized = self._normalize_name(contact["full_name_en"])
            full_name_ar_normalized = self._normalize_name(contact["full_name_ar"])
            first_name_en_normalized = self._normalize_name(contact["first_name_en"])
            first_name_ar_normalized = self._normalize_name(contact["first_name_ar"])
            last_name_en_normalized = self._normalize_name(contact["last_name_en"])
            last_name_ar_normalized = self._normalize_name(contact["last_name_ar"])

            # Calculate similarity against various name combinations
            name_candidates = [
                (full_name_en_normalized, "full_en"),
                (full_name_ar_normalized, "full_ar"),
                (first_name_en_normalized, "first_en"),
                (first_name_ar_normalized, "first_ar"),
                (last_name_en_normalized, "last_en"),
                (last_name_ar_normalized, "last_ar"),
            ]

            best_similarity = 0
            best_match_type = None

            for candidate_name, match_type in name_candidates:
                if not candidate_name:
                    continue

                similarity = self._string_similarity(name_normalized, candidate_name)

                # Apply stricter rules for fuzzy matching:
                # 1. Names should start with the same letter (for English names)
                # 2. Or have very high similarity (>= 0.85)
                if match_type.endswith('_en'):
                    candidate_first_letter = candidate_name[0] if candidate_name else ''
                    # Require same first letter OR very high similarity
                    if candidate_first_letter != name_first_letter and similarity < 0.85:
                        continue  # Skip this match

                if similarity > best_similarity:
                    best_similarity = similarity
                    best_match_type = match_type

            if best_similarity >= min_similarity:
                matches.append((contact, best_similarity))

        # Sort by similarity (descending)
        matches.sort(key=lambda x: x[1], reverse=True)

        return matches[:top_k]

    def _normalize_name(self, name: str) -> str:
        """Normalize name for comparison (lowercase, remove extra spaces)"""
        return re.sub(r'\s+', ' ', name.strip().lower())

    def _string_similarity(self, s1: str, s2: str) -> float:
        """
        Calculate string similarity using SequenceMatcher.

        Returns:
            Similarity score between 0.0 and 1.0
        """
        return SequenceMatcher(None, s1, s2).ratio()

    def get_contact_stats(self) -> Dict:
        """Get statistics about the contact database"""
        from collections import Counter

        dept_counts = Counter(contact["department"] for contact in self.all_contacts)
        div_counts = Counter(contact["division"] for contact in self.all_contacts)

        return {
            "total_contacts": len(self.all_contacts),
            "departments": len(dept_counts),
            "divisions": len(div_counts),
            "contacts_by_department": dict(dept_counts),
            "contacts_by_division": dict(div_counts),
        }


if __name__ == "__main__":
    # Manual smoke test: build the real services, print database stats,
    # then run a handful of sample queries and print the top hits.
    from name_extraction_service import NameExtractor
    from embedding_service import EmbeddingService

    print("Initializing services...")
    search_service = ContactSearchService(NameExtractor(), EmbeddingService())

    print("\nContact Database Stats:")
    stats = search_service.get_contact_stats()
    print(f"Total contacts: {stats['total_contacts']}")
    print(f"Departments: {stats['departments']}")
    print(f"Divisions: {stats['divisions']}")

    # Sample queries: name + division, division only, Arabic, keywords only.
    test_queries = [
        "Find Ahmed in IT",
        "I need to talk to someone in HR",
        "محمد في المالية",  # "Mohammed in Finance" in Arabic
        "Finance accounting help",
    ]

    heavy_rule = "="*80
    light_rule = "-" * 80

    print("\n" + heavy_rule)
    print("Testing Contact Search")
    print(heavy_rule)

    for query in test_queries:
        print(f"\nQuery: '{query}'")
        print(light_rule)

        results = search_service.search_contacts(query, top_k=3)

        if not results:
            print("No matches found.")
            continue

        for i, contact in enumerate(results, 1):
            print(f"{i}. {contact['full_name_en']} ({contact['full_name_ar']})")
            print(f"   {contact['title_en']} - {contact['division']}")
            print(f"   {contact['email']} | Ext: {contact['extension']}")
            print(f"   Confidence: {contact['confidence']:.2f} | Reason: {contact['match_reason']}")