#!/usr/bin/env python3
"""
Build FAISS index from Congressional biography database.
This script:
1. Loads all biographies from the SQLite database
2. Generates embeddings using sentence transformers
3. Builds a FAISS index for fast similarity search
4. Saves the index and bio ID mapping to disk
Run this script whenever:
- The database is first created
- You want to rebuild the semantic search index
- After updating to a compatible Python version
Requires Python 3.9-3.12 (Python 3.14+ may have compatibility issues)
"""
import sqlite3
import faiss
import numpy as np
import pickle
import time
import os
import sys
from pathlib import Path
from sentence_transformers import SentenceTransformer

# Paths
SCRIPT_DIR = Path(__file__).parent.absolute()
DB_PATH = str(SCRIPT_DIR / "congress.db")
INDEX_PATH = str(SCRIPT_DIR / "congress_faiss.index")
MAPPING_PATH = str(SCRIPT_DIR / "congress_bio_ids.pkl")
def build_faiss_index():
    """Build FAISS index from database biographies."""
    print("=" * 60)
    print("BUILDING FAISS INDEX FOR CONGRESSIONAL BIOGUIDE")
    print("=" * 60)

    # Check database exists
    if not Path(DB_PATH).exists():
        print(f"\n❌ ERROR: Database not found at {DB_PATH}")
        print("   Run ingest_data.py first to create the database.")
        return False

    # Load sentence transformer model
    print("\n1. Loading sentence transformer model...")
    start = time.time()

    # Disable all parallelism to avoid Python 3.14 issues
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['MKL_NUM_THREADS'] = '1'
    os.environ['OPENBLAS_NUM_THREADS'] = '1'

    import torch
    torch.set_num_threads(1)

    model = SentenceTransformer('all-MiniLM-L6-v2')
    print(f"   ✓ Model loaded in {time.time() - start:.3f}s")
    # Load biographies from database
    print("\n2. Loading biographies from database...")
    start = time.time()

    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute("""
        SELECT bio_id, profile_text
        FROM members
        WHERE profile_text IS NOT NULL AND profile_text != ''
    """)
    rows = cursor.fetchall()
    conn.close()

    elapsed = time.time() - start
    print(f"   ✓ Loaded {len(rows):,} biographies in {elapsed:.3f}s")

    if len(rows) == 0:
        print("\n❌ ERROR: No biographies found in database!")
        return False

    # Prepare data
    print("\n3. Preparing data for encoding...")
    start = time.time()

    bio_ids = [row[0] for row in rows]
    texts = [row[1] for row in rows]

    print(f"   ✓ Prepared {len(bio_ids):,} texts")
    print(f"   ✓ Time: {time.time() - start:.3f}s")
    # Generate embeddings in batches
    print("\n4. Generating embeddings...")
    print("   (This may take several minutes...)")
    start = time.time()

    batch_size = 32
    embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        batch_embeddings = model.encode(
            batch,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=False,
            device='cpu'  # Explicit CPU to avoid issues
        )
        embeddings.extend(batch_embeddings)

        # Progress update every 100 batches (~3200 texts)
        if (i // batch_size + 1) % 100 == 0:
            elapsed = time.time() - start
            rate = (i + len(batch)) / elapsed
            remaining = (len(texts) - i - len(batch)) / rate if rate > 0 else 0
            print(f"   Encoded {i + len(batch):,}/{len(texts):,} " +
                  f"({rate:.0f} texts/sec, ~{remaining:.0f}s remaining)")

    embeddings = np.array(embeddings, dtype=np.float32)

    elapsed = time.time() - start
    print(f"   ✓ Generated {len(embeddings):,} embeddings in {elapsed:.1f}s")
    print(f"   ✓ Shape: {embeddings.shape}")
    # Build FAISS index
    print("\n5. Building FAISS index...")
    start = time.time()

    dimension = embeddings.shape[1]
    print(f"   Dimension: {dimension}")

    # Use IndexFlatIP for exact cosine similarity search
    # (Inner Product is equivalent to cosine similarity for normalized vectors)
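    # (For unit vectors u and v, u · v = |u||v| cos θ = cos θ, so ranking by
    # inner product over L2-normalized embeddings is exactly cosine ranking.)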
    index = faiss.IndexFlatIP(dimension)

    # Normalize embeddings for cosine similarity
    faiss.normalize_L2(embeddings)

    # Add embeddings to index
    index.add(embeddings)

    elapsed = time.time() - start
    print(f"   ✓ Index built in {elapsed:.3f}s")
    print(f"   ✓ Total vectors in index: {index.ntotal:,}")
    # Save FAISS index
    print("\n6. Saving FAISS index to disk...")
    start = time.time()

    faiss.write_index(index, INDEX_PATH)

    elapsed = time.time() - start
    print(f"   ✓ Index saved to: {INDEX_PATH}")
    print(f"   ✓ Time: {elapsed:.3f}s")

    # Save bio ID mapping
    print("\n7. Saving bio ID mapping...")
    start = time.time()

    with open(MAPPING_PATH, "wb") as f:
        pickle.dump(bio_ids, f)

    elapsed = time.time() - start
    print(f"   ✓ Mapping saved to: {MAPPING_PATH}")
    print(f"   ✓ Time: {elapsed:.3f}s")

    # Get file sizes
    index_size_mb = Path(INDEX_PATH).stat().st_size / (1024**2)
    mapping_size_mb = Path(MAPPING_PATH).stat().st_size / (1024**2)

    print("\n" + "=" * 60)
    print("FAISS INDEX BUILD COMPLETE")
    print("=" * 60)
    print(f"Total biographies indexed: {len(bio_ids):,}")
    print(f"Index file size: {index_size_mb:.2f} MB")
    print(f"Mapping file size: {mapping_size_mb:.2f} MB")
    print(f"Total size: {index_size_mb + mapping_size_mb:.2f} MB")
    print("\nThe MCP server will now load this index on startup for semantic search.")
    print("You can now use the 'semantic_search_biography' tool!")

    return True
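
# For reference, a minimal sketch of how a consumer (such as the MCP server
# mentioned above) might load and query the artifacts this script writes,
# with a SentenceTransformer loaded as `model` the same way as above. The
# query text and k value are illustrative assumptions, not part of this script:
#
#   index = faiss.read_index(INDEX_PATH)
#   with open(MAPPING_PATH, "rb") as f:
#       bio_ids = pickle.load(f)
#   query = model.encode(["senators who served during the Civil War"],
#                        convert_to_numpy=True).astype(np.float32)
#   faiss.normalize_L2(query)              # match the normalization used at build time
#   scores, idxs = index.search(query, 5)  # top-5 nearest biographies
#   matches = [bio_ids[i] for i in idxs[0]]
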
def main():
    """Main entry point."""
    try:
        success = build_faiss_index()
        if not success:
            sys.exit(1)
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        print("\nThis may be due to Python version incompatibility.")
        print("FAISS and sentence-transformers work best with Python 3.9-3.12.")
        print(f"Current Python version: {sys.version}")
        print("\nThe database is still usable without semantic search.")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()