File size: 17,093 Bytes
8ef276c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
# contact_search_service.py
"""
Contact search service with intelligent matching:
- Name-based search (exact and fuzzy matching)
- Division-based search
- Combined search (name + division)
- Confidence scoring
"""

import logging
from typing import List, Dict, Optional, Tuple
from difflib import SequenceMatcher
import re

from contacts_data import (
    get_all_contacts,
    get_contacts_by_division,
    get_contact_by_name
)
from name_extraction_service import NameExtractor
from embedding_service import EmbeddingService

# Set up logging
# NOTE(review): basicConfig() is called at import time, so merely importing
# this module configures the root logger as a side effect. Confirm this is
# intended; applications usually configure logging at their entry point.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by ContactSearchService for progress messages.
logger = logging.getLogger(__name__)


class ContactSearchService:
    """
    Service for searching contacts with intelligent matching.

    Features:
    - Exact name matching (100% confidence)
    - Fuzzy name matching (partial names, typos)
    - Division-based matching
    - Combined search (name + division)
    - Multi-language support (English and Arabic)
    """

    def __init__(
        self,
        name_extractor: NameExtractor,
        embedding_service: EmbeddingService
    ) -> None:
        """
        Store the collaborating services and load the contact list.

        The contact list is fetched once via ``get_all_contacts()`` and kept
        on the instance as ``all_contacts``; the number of loaded contacts is
        logged at INFO level.

        Args:
            name_extractor: Service whose ``extract_names(query)`` is used to
                pull person names out of a search query.
            embedding_service: Service whose ``find_division(query, top_k=...)``
                is used to match a query against divisions.
        """
        self.name_extractor = name_extractor
        self.embedding_service = embedding_service

        # Snapshot of the contact list, reused by fuzzy search and stats.
        loaded_contacts = get_all_contacts()
        self.all_contacts = loaded_contacts

        logger.info(f"ContactSearchService initialized with {len(self.all_contacts)} contacts")

    def search_contacts(
        self,
        query: str,
        top_k: int = 10,
        min_confidence: float = 0.3
    ) -> List[Dict]:
        """
        Search for contacts based on query.

        Process:
        1. Extract names from query
        2. Find matching divisions
        3. Match contacts by:
           - Exact name match (if name found) โ†’ confidence = 1.0
           - Fuzzy name match โ†’ confidence based on similarity
           - Division match โ†’ confidence from embedding service
           - Combined match (name + division) โ†’ boosted confidence
        4. Sort by confidence (exact matches first)

        Args:
            query: Search query (English or Arabic)
            top_k: Maximum number of results to return
            min_confidence: Minimum confidence threshold (0.0-1.0)

        Returns:
            List of matched contacts with confidence scores
        """
        logger.info(f"Searching contacts for query: '{query}'")

        # Step 1: Extract names from query
        extracted_names = self.name_extractor.extract_names(query)
        logger.info(f"Extracted names: {extracted_names}")

        # Step 2: Find matching divisions
        division_matches = self.embedding_service.find_division(query, top_k=3)
        logger.info(f"Found {len(division_matches)} division matches")

        # Step 3: Match contacts
        matched_contacts = []
        has_names = len(extracted_names) > 0
        has_divisions = len(division_matches) > 0
        requested_divisions = [dm.division for dm in division_matches] if has_divisions else []

        # Strategy A: If we have names, search by name
        name_matches = {}  # Track name matches by contact ID
        if extracted_names:
            for name in extracted_names:
                # Try exact match first
                exact_match = get_contact_by_name(name)
                if exact_match:
                    contact_id = exact_match["id"]
                    name_matches[contact_id] = {
                        "contact": exact_match,
                        "confidence": 1.0,
                        "similarity": 1.0,
                        "match_type": "exact"
                    }
                    logger.info(f"โœ“ Exact name match: {exact_match['full_name_en']}")
                else:
                    # Fuzzy name matching
                    fuzzy_matches = self._fuzzy_name_search(name, top_k=10)
                    for contact, similarity in fuzzy_matches:
                        contact_id = contact["id"]
                        # Only keep best match for each contact
                        if contact_id not in name_matches or similarity > name_matches[contact_id]["similarity"]:
                            name_matches[contact_id] = {
                                "contact": contact,
                                "confidence": round(0.5 + (similarity * 0.45), 2),
                                "similarity": round(similarity, 2),
                                "match_type": "fuzzy"
                            }
                            logger.info(
                                f"Fuzzy name match: {contact['full_name_en']} "
                                f"(similarity: {similarity:.2f})"
                            )

        # Strategy B: Division-based search
        division_matches_dict = {}  # Track division matches by contact ID
        for div_match in division_matches:
            division = div_match.division
            division_confidence = div_match.confidence

            # Get contacts in this division
            division_contacts = get_contacts_by_division(division)

            for contact in division_contacts:
                contact_id = contact["id"]
                # Only keep best division match for each contact
                if contact_id not in division_matches_dict or division_confidence > division_matches_dict[contact_id]["confidence"]:
                    division_matches_dict[contact_id] = {
                        "contact": contact,
                        "confidence": division_confidence,
                        "division": division
                    }

        # Strategy C: Combine matches intelligently
        # Priority when BOTH name and division are specified:
        # 1. Name + Correct Division = HIGHEST (both match)
        # 2. Correct Division only = HIGH (division is most important)
        # 3. Name + Wrong Division = LOW (penalize wrong division)

        all_contact_ids = set(name_matches.keys()) | set(division_matches_dict.keys())

        for contact_id in all_contact_ids:
            has_name_match = contact_id in name_matches
            has_division_match = contact_id in division_matches_dict

            if has_name_match and has_division_match:
                # BOTH name and division match - BEST CASE
                name_data = name_matches[contact_id]
                div_data = division_matches_dict[contact_id]
                contact = name_data["contact"]

                # When both match: take MAX of the two confidences and add a boost
                # This ensures division + name is always better than division alone
                combined_confidence = max(name_data["confidence"], div_data["confidence"]) + 0.15
                combined_confidence = min(1.0, combined_confidence)

                contact_result = contact.copy()
                contact_result["confidence"] = round(combined_confidence, 2)
                contact_result["match_reason"] = "name_and_division_match"
                contact_result["name_confidence"] = name_data["confidence"]
                contact_result["division_confidence"] = div_data["confidence"]
                matched_contacts.append(contact_result)

                logger.info(
                    f"โœ“ BOTH match: {contact['full_name_en']} in {div_data['division']} "
                    f"(final confidence: {contact_result['confidence']})"
                )

            elif has_division_match:
                # Division match only (no name specified, or name didn't match this person)
                div_data = division_matches_dict[contact_id]
                contact = div_data["contact"]

                contact_result = contact.copy()
                contact_result["confidence"] = div_data["confidence"]
                contact_result["match_reason"] = "division_match"
                contact_result["division_confidence"] = div_data["confidence"]
                matched_contacts.append(contact_result)

            elif has_name_match:
                # Name match but WRONG division (or no division specified)
                name_data = name_matches[contact_id]
                contact = name_data["contact"]

                # If division was specified in query AND has reasonable confidence (>= 0.58)
                # Apply penalty for being in wrong division
                # If division confidence is very low (< 0.58), treat as name-only search
                # This threshold helps avoid false division matches from words like "Find" (which scores ~0.56)
                # while still catching abbreviations like "App Dev" (which scores ~0.59)
                has_strong_division_match = has_divisions and division_matches[0].confidence >= 0.58

                if has_strong_division_match:
                    # Heavy penalty for wrong division when division was specified with confidence
                    penalized_confidence = name_data["confidence"] * 0.3  # 70% penalty
                    contact_result = contact.copy()
                    contact_result["confidence"] = round(penalized_confidence, 2)
                    contact_result["match_reason"] = "name_match_wrong_division"
                    contact_result["name_confidence"] = name_data["confidence"]
                    contact_result["requested_division"] = ", ".join(requested_divisions[:2])
                    matched_contacts.append(contact_result)

                    logger.info(
                        f"Name match with WRONG division: {contact['full_name_en']} "
                        f"in {contact['division']} (wanted: {requested_divisions[0]}, "
                        f"confidence: {contact_result['confidence']})"
                    )
                else:
                    # No division specified OR weak division match - use name confidence as-is
                    contact_result = contact.copy()
                    contact_result["confidence"] = name_data["confidence"]
                    contact_result["match_reason"] = f"{name_data['match_type']}_name_match"
                    contact_result["name_confidence"] = name_data["confidence"]
                    matched_contacts.append(contact_result)

        # Step 4: Remove duplicates (keep highest confidence)
        unique_contacts = {}
        for contact in matched_contacts:
            contact_id = contact["id"]
            if contact_id not in unique_contacts:
                unique_contacts[contact_id] = contact
            else:
                # Keep the one with higher confidence
                if contact["confidence"] > unique_contacts[contact_id]["confidence"]:
                    unique_contacts[contact_id] = contact

        # Convert back to list
        matched_contacts = list(unique_contacts.values())

        # Step 5: Filter by minimum confidence
        matched_contacts = [
            c for c in matched_contacts if c["confidence"] >= min_confidence
        ]

        # Step 6: Sort by confidence (descending) - exact matches will be first
        matched_contacts.sort(key=lambda x: x["confidence"], reverse=True)

        # Step 7: Limit to top_k
        matched_contacts = matched_contacts[:top_k]

        logger.info(f"โœ“ Returning {len(matched_contacts)} matched contacts")

        return matched_contacts

    def _fuzzy_name_search(
        self,
        name: str,
        top_k: int = 5,
        min_similarity: float = 0.75  # Increased from 0.6 to avoid false matches
    ) -> List[Tuple[Dict, float]]:
        """
        Fuzzy name matching using string similarity with stricter rules.

        Args:
            name: Name to search for
            top_k: Number of top matches to return
            min_similarity: Minimum similarity threshold (0.0-1.0)

        Returns:
            List of (contact, similarity_score) tuples
        """
        matches = []

        # Normalize name for comparison
        name_normalized = self._normalize_name(name)

        # Get first letter for initial matching (helps avoid false positives)
        name_first_letter = name_normalized[0] if name_normalized else ''

        for contact in self.all_contacts:
            # Check against both Arabic and English names
            full_name_en_normalized = self._normalize_name(contact["full_name_en"])
            full_name_ar_normalized = self._normalize_name(contact["full_name_ar"])
            first_name_en_normalized = self._normalize_name(contact["first_name_en"])
            first_name_ar_normalized = self._normalize_name(contact["first_name_ar"])
            last_name_en_normalized = self._normalize_name(contact["last_name_en"])
            last_name_ar_normalized = self._normalize_name(contact["last_name_ar"])

            # Calculate similarity against various name combinations
            name_candidates = [
                (full_name_en_normalized, "full_en"),
                (full_name_ar_normalized, "full_ar"),
                (first_name_en_normalized, "first_en"),
                (first_name_ar_normalized, "first_ar"),
                (last_name_en_normalized, "last_en"),
                (last_name_ar_normalized, "last_ar"),
            ]

            best_similarity = 0
            best_match_type = None

            for candidate_name, match_type in name_candidates:
                if not candidate_name:
                    continue

                similarity = self._string_similarity(name_normalized, candidate_name)

                # Apply stricter rules for fuzzy matching:
                # 1. Names should start with the same letter (for English names)
                # 2. Or have very high similarity (>= 0.85)
                if match_type.endswith('_en'):
                    candidate_first_letter = candidate_name[0] if candidate_name else ''
                    # Require same first letter OR very high similarity
                    if candidate_first_letter != name_first_letter and similarity < 0.85:
                        continue  # Skip this match

                if similarity > best_similarity:
                    best_similarity = similarity
                    best_match_type = match_type

            if best_similarity >= min_similarity:
                matches.append((contact, best_similarity))

        # Sort by similarity (descending)
        matches.sort(key=lambda x: x[1], reverse=True)

        return matches[:top_k]

    def _normalize_name(self, name: str) -> str:
        """Normalize name for comparison (lowercase, remove extra spaces)"""
        return re.sub(r'\s+', ' ', name.strip().lower())

    def _string_similarity(self, s1: str, s2: str) -> float:
        """
        Calculate string similarity using SequenceMatcher.

        Returns:
            Similarity score between 0.0 and 1.0
        """
        return SequenceMatcher(None, s1, s2).ratio()

    def get_contact_stats(self) -> Dict:
        """Get statistics about the contact database"""
        from collections import Counter

        dept_counts = Counter(contact["department"] for contact in self.all_contacts)
        div_counts = Counter(contact["division"] for contact in self.all_contacts)

        return {
            "total_contacts": len(self.all_contacts),
            "departments": len(dept_counts),
            "divisions": len(div_counts),
            "contacts_by_department": dict(dept_counts),
            "contacts_by_division": dict(div_counts),
        }


def main() -> None:
    """
    Manual smoke test for ContactSearchService.

    Builds the real NameExtractor and EmbeddingService, prints database
    statistics, then runs a few sample English and Arabic queries and prints
    the top three matches for each.
    """
    # NameExtractor and EmbeddingService are already imported at module level.
    print("Initializing services...")
    name_extractor = NameExtractor()
    embedding_service = EmbeddingService()
    search_service = ContactSearchService(name_extractor, embedding_service)

    print("\nContact Database Stats:")
    stats = search_service.get_contact_stats()
    print(f"Total contacts: {stats['total_contacts']}")
    print(f"Departments: {stats['departments']}")
    print(f"Divisions: {stats['divisions']}")

    # Test queries
    test_queries = [
        "Find Ahmed in IT",
        "I need to talk to someone in HR",
        "محمد في المالية",  # "Mohammed in Finance" in Arabic
        "Finance accounting help",
    ]

    print("\n" + "="*80)
    print("Testing Contact Search")
    print("="*80)

    for query in test_queries:
        print(f"\nQuery: '{query}'")
        print("-" * 80)

        results = search_service.search_contacts(query, top_k=3)

        if not results:
            print("No matches found.")
            continue

        for i, contact in enumerate(results, 1):
            print(f"{i}. {contact['full_name_en']} ({contact['full_name_ar']})")
            print(f"   {contact['title_en']} - {contact['division']}")
            print(f"   {contact['email']} | Ext: {contact['extension']}")
            print(f"   Confidence: {contact['confidence']:.2f} | Reason: {contact['match_reason']}")


if __name__ == "__main__":
    # Test the contact search service
    main()