#!/usr/bin/env python3
"""
Build FAISS index from Congressional biography database.
This script:
1. Loads all biographies from the SQLite database
2. Generates embeddings using sentence transformers
3. Builds a FAISS index for fast similarity search
4. Saves the index and bio ID mapping to disk
Run this script whenever:
- The database is first created
- You want to rebuild the semantic search index
- After updating to a compatible Python version
Requires Python 3.9-3.12 (Python 3.14+ may have compatibility issues)
"""
import sqlite3
import faiss
import numpy as np
import pickle
import time
import os
import sys
from pathlib import Path
from sentence_transformers import SentenceTransformer

# Paths
SCRIPT_DIR = Path(__file__).parent.absolute()
DB_PATH = str(SCRIPT_DIR / "congress.db")
INDEX_PATH = str(SCRIPT_DIR / "congress_faiss.index")
MAPPING_PATH = str(SCRIPT_DIR / "congress_bio_ids.pkl")
def build_faiss_index():
    """Build FAISS index from database biographies."""
    print("=" * 60)
    print("BUILDING FAISS INDEX FOR CONGRESSIONAL BIOGUIDE")
    print("=" * 60)

    # Check database exists
    if not Path(DB_PATH).exists():
        print(f"\n❌ ERROR: Database not found at {DB_PATH}")
        print("   Run ingest_data.py first to create the database.")
        return False

    # Load sentence transformer model
    print("\n1. Loading sentence transformer model...")
    start = time.time()

    # Disable all parallelism to avoid Python 3.14 issues
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['MKL_NUM_THREADS'] = '1'
    os.environ['OPENBLAS_NUM_THREADS'] = '1'

    import torch
    torch.set_num_threads(1)

    model = SentenceTransformer('all-MiniLM-L6-v2')
    print(f"   ✓ Model loaded in {time.time() - start:.3f}s")
    # Load biographies from database
    print("\n2. Loading biographies from database...")
    start = time.time()

    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute("""
        SELECT bio_id, profile_text
        FROM members
        WHERE profile_text IS NOT NULL AND profile_text != ''
    """)
    rows = cursor.fetchall()
    conn.close()

    elapsed = time.time() - start
    print(f"   ✓ Loaded {len(rows):,} biographies in {elapsed:.3f}s")

    if len(rows) == 0:
        print("\n❌ ERROR: No biographies found in database!")
        return False

    # Prepare data
    print("\n3. Preparing data for encoding...")
    start = time.time()

    bio_ids = [row[0] for row in rows]
    texts = [row[1] for row in rows]

    print(f"   ✓ Prepared {len(bio_ids):,} texts")
    print(f"   ✓ Time: {time.time() - start:.3f}s")
    # Generate embeddings in batches
    print("\n4. Generating embeddings...")
    print("   (This may take several minutes...)")
    start = time.time()

    batch_size = 32
    embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        batch_embeddings = model.encode(
            batch,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=False,
            device='cpu'  # Explicit CPU to avoid issues
        )
        embeddings.extend(batch_embeddings)

        # Progress update every 100 batches (~3200 texts)
        if (i // batch_size + 1) % 100 == 0:
            elapsed = time.time() - start
            rate = (i + len(batch)) / elapsed
            remaining = (len(texts) - i - len(batch)) / rate if rate > 0 else 0
            print(f"   Encoded {i + len(batch):,}/{len(texts):,} " +
                  f"({rate:.0f} texts/sec, ~{remaining:.0f}s remaining)")

    embeddings = np.array(embeddings, dtype=np.float32)

    elapsed = time.time() - start
    print(f"   ✓ Generated {len(embeddings):,} embeddings in {elapsed:.1f}s")
    print(f"   ✓ Shape: {embeddings.shape}")
    # Build FAISS index
    print("\n5. Building FAISS index...")
    start = time.time()

    dimension = embeddings.shape[1]
    print(f"   Dimension: {dimension}")

    # Use IndexFlatIP for exact cosine similarity search
    # (Inner Product is equivalent to cosine similarity for normalized vectors)
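    # (For unit vectors u and v, u · v = |u||v| cos θ = cos θ, so ranking by
    # inner product over L2-normalized embeddings is exactly cosine ranking.)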
    index = faiss.IndexFlatIP(dimension)

    # Normalize embeddings for cosine similarity
    faiss.normalize_L2(embeddings)

    # Add embeddings to index
    index.add(embeddings)

    elapsed = time.time() - start
    print(f"   ✓ Index built in {elapsed:.3f}s")
    print(f"   ✓ Total vectors in index: {index.ntotal:,}")
    # Save FAISS index
    print("\n6. Saving FAISS index to disk...")
    start = time.time()

    faiss.write_index(index, INDEX_PATH)

    elapsed = time.time() - start
    print(f"   ✓ Index saved to: {INDEX_PATH}")
    print(f"   ✓ Time: {elapsed:.3f}s")

    # Save bio ID mapping
    print("\n7. Saving bio ID mapping...")
    start = time.time()

    with open(MAPPING_PATH, "wb") as f:
        pickle.dump(bio_ids, f)

    elapsed = time.time() - start
    print(f"   ✓ Mapping saved to: {MAPPING_PATH}")
    print(f"   ✓ Time: {elapsed:.3f}s")

    # Get file sizes
    index_size_mb = Path(INDEX_PATH).stat().st_size / (1024**2)
    mapping_size_mb = Path(MAPPING_PATH).stat().st_size / (1024**2)

    print("\n" + "=" * 60)
    print("FAISS INDEX BUILD COMPLETE")
    print("=" * 60)
    print(f"Total biographies indexed: {len(bio_ids):,}")
    print(f"Index file size: {index_size_mb:.2f} MB")
    print(f"Mapping file size: {mapping_size_mb:.2f} MB")
    print(f"Total size: {index_size_mb + mapping_size_mb:.2f} MB")
    print("\nThe MCP server will now load this index on startup for semantic search.")
    print("You can now use the 'semantic_search_biography' tool!")

    return True
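
# For reference, a minimal sketch of how a consumer (such as the MCP server
# mentioned above) might load and query the artifacts this script writes,
# with a SentenceTransformer loaded as `model` the same way as above. The
# query text and k value are illustrative assumptions, not part of this script:
#
#   index = faiss.read_index(INDEX_PATH)
#   with open(MAPPING_PATH, "rb") as f:
#       bio_ids = pickle.load(f)
#   query = model.encode(["senators who served during the Civil War"],
#                        convert_to_numpy=True).astype(np.float32)
#   faiss.normalize_L2(query)              # match the normalization used at build time
#   scores, idxs = index.search(query, 5)  # top-5 nearest biographies
#   matches = [bio_ids[i] for i in idxs[0]]
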
def main():
    """Main entry point."""
    try:
        success = build_faiss_index()
        if not success:
            sys.exit(1)
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        print("\nThis may be due to Python version incompatibility.")
        print("FAISS and sentence-transformers work best with Python 3.9-3.12.")
        print(f"Current Python version: {sys.version}")
        print("\nThe database is still usable without semantic search.")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()