#!/usr/bin/env python3
"""

Build FAISS index from Congressional biography database.



This script:

1. Loads all biographies from the SQLite database

2. Generates embeddings using sentence transformers

3. Builds a FAISS index for fast similarity search

4. Saves the index and bio ID mapping to disk



Run this script whenever:

- The database is first created

- You want to rebuild the semantic search index

- After updating to a compatible Python version



Requires Python 3.9-3.12 (Python 3.14+ may have compatibility issues)

"""

import sqlite3
import faiss
import numpy as np
import pickle
import time
import os
import sys
from pathlib import Path
from sentence_transformers import SentenceTransformer

# Paths
SCRIPT_DIR = Path(__file__).parent.absolute()
DB_PATH = str(SCRIPT_DIR / "congress.db")
INDEX_PATH = str(SCRIPT_DIR / "congress_faiss.index")
MAPPING_PATH = str(SCRIPT_DIR / "congress_bio_ids.pkl")

def build_faiss_index():
    """Build FAISS index from database biographies."""
    print("=" * 60)
    print("BUILDING FAISS INDEX FOR CONGRESSIONAL BIOGUIDE")
    print("=" * 60)

    # Check database exists
    if not Path(DB_PATH).exists():
        print(f"\n❌ ERROR: Database not found at {DB_PATH}")
        print("   Run ingest_data.py first to create the database.")
        return False

    # Load sentence transformer model
    print("\n1. Loading sentence transformer model...")
    start = time.time()

    # Disable all parallelism to avoid Python 3.14 issues
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['MKL_NUM_THREADS'] = '1'
    os.environ['OPENBLAS_NUM_THREADS'] = '1'

    import torch
    torch.set_num_threads(1)

    model = SentenceTransformer('all-MiniLM-L6-v2')  # small CPU-friendly model; 384-dim embeddings
    print(f"   ✓ Model loaded in {time.time() - start:.3f}s")

    # Load biographies from database
    print("\n2. Loading biographies from database...")
    start = time.time()
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    cursor.execute("""

        SELECT bio_id, profile_text

        FROM members

        WHERE profile_text IS NOT NULL AND profile_text != ''

    """)
    rows = cursor.fetchall()
    conn.close()

    elapsed = time.time() - start
    print(f"   βœ“ Loaded {len(rows):,} biographies in {elapsed:.3f}s")

    if len(rows) == 0:
        print("\n❌ ERROR: No biographies found in database!")
        return False

    # Prepare data
    print("\n3. Preparing data for encoding...")
    start = time.time()
    bio_ids = [row[0] for row in rows]
    texts = [row[1] for row in rows]
    print(f"   βœ“ Prepared {len(bio_ids):,} texts")
    print(f"   βœ“ Time: {time.time() - start:.3f}s")

    # Generate embeddings in batches
    print("\n4. Generating embeddings...")
    print("   (This may take several minutes...)")
    start = time.time()
    batch_size = 32  # encode 32 texts at a time to keep memory use modest
    embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        batch_embeddings = model.encode(
            batch,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=False,  # normalization is applied once for all vectors below, via faiss.normalize_L2
            device='cpu'  # Explicit CPU to avoid issues
        )
        embeddings.extend(batch_embeddings)

        # Progress update every 100 batches (~3200 texts)
        if (i // batch_size + 1) % 100 == 0:
            elapsed = time.time() - start
            rate = (i + len(batch)) / elapsed
            remaining = (len(texts) - i - len(batch)) / rate if rate > 0 else 0
            print(f"   Encoded {i + len(batch):,}/{len(texts):,} " +
                  f"({rate:.0f} texts/sec, ~{remaining:.0f}s remaining)")

    embeddings = np.array(embeddings, dtype=np.float32)
    elapsed = time.time() - start
    print(f"   βœ“ Generated {len(embeddings):,} embeddings in {elapsed:.1f}s")
    print(f"   βœ“ Shape: {embeddings.shape}")

    # Build FAISS index
    print("\n5. Building FAISS index...")
    start = time.time()
    dimension = embeddings.shape[1]
    print(f"   Dimension: {dimension}")

    # Use IndexFlatIP for exact cosine similarity search
    # (Inner Product is equivalent to cosine similarity for normalized vectors)
    index = faiss.IndexFlatIP(dimension)

    # Normalize embeddings for cosine similarity
    faiss.normalize_L2(embeddings)
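    # NOTE: query vectors must be L2-normalized the same way at search time so that
    # inner-product scores equal cosine similarity.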

    # Add embeddings to index
    index.add(embeddings)

    elapsed = time.time() - start
    print(f"   βœ“ Index built in {elapsed:.3f}s")
    print(f"   βœ“ Total vectors in index: {index.ntotal:,}")

    # Save FAISS index
    print("\n6. Saving FAISS index to disk...")
    start = time.time()
    faiss.write_index(index, INDEX_PATH)
    elapsed = time.time() - start
    print(f"   βœ“ Index saved to: {INDEX_PATH}")
    print(f"   βœ“ Time: {elapsed:.3f}s")

    # Save bio ID mapping
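    # (List position i in bio_ids corresponds to vector i in the FAISS index,
    # so search results map back to bio IDs by position.)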
    print("\n7. Saving bio ID mapping...")
    start = time.time()
    with open(MAPPING_PATH, "wb") as f:
        pickle.dump(bio_ids, f)
    elapsed = time.time() - start
    print(f"   βœ“ Mapping saved to: {MAPPING_PATH}")
    print(f"   βœ“ Time: {elapsed:.3f}s")

    # Get file sizes
    index_size_mb = Path(INDEX_PATH).stat().st_size / (1024**2)
    mapping_size_mb = Path(MAPPING_PATH).stat().st_size / (1024**2)

    print("\n" + "=" * 60)
    print("FAISS INDEX BUILD COMPLETE")
    print("=" * 60)
    print(f"Total biographies indexed: {len(bio_ids):,}")
    print(f"Index file size: {index_size_mb:.2f} MB")
    print(f"Mapping file size: {mapping_size_mb:.2f} MB")
    print(f"Total size: {index_size_mb + mapping_size_mb:.2f} MB")
    print("\nThe MCP server will now load this index on startup for semantic search.")
    print("You can now use the 'semantic_search_biography' tool!")

    return True


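# --- Example: querying the saved index ---------------------------------------
# A minimal sketch (not called by this script) of how the saved index and
# mapping could be loaded and searched at query time. The function name
# `search_bios` and the k=5 default are illustrative assumptions, not part of
# the MCP server's actual API.
def search_bios(query: str, k: int = 5):
    """Return the top-k (bio_id, score) pairs for a free-text query."""
    model = SentenceTransformer('all-MiniLM-L6-v2')
    index = faiss.read_index(INDEX_PATH)
    with open(MAPPING_PATH, "rb") as f:
        bio_ids = pickle.load(f)
    query_vec = model.encode([query], convert_to_numpy=True).astype(np.float32)
    faiss.normalize_L2(query_vec)  # match the normalization applied at build time
    scores, indices = index.search(query_vec, k)
    return [(bio_ids[i], float(s)) for i, s in zip(indices[0], scores[0])]

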
def main():
    """Main entry point."""
    try:
        success = build_faiss_index()
        if not success:
            sys.exit(1)
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        print("\nThis may be due to Python version incompatibility.")
        print("FAISS and sentence-transformers work best with Python 3.9-3.12")
        print(f"Current Python version: {os.sys.version}")
        print("\nThe database is still usable without semantic search.")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()