# BioGuideMCP / test_sentence_transformers.py
# From stefanjwojcik's repository, commit 15de73a:
# "Add setup script and comprehensive tests for Congressional Bioguide MCP Server"
#!/usr/bin/env python3
"""
Test sentence-transformers in progressively heavier steps to isolate a segfault.

Each probe builds on the previous one (import -> model load -> single encode ->
batch encode -> explicit-parameter encode -> 100-text batch -> realistic
biography text) so the first failing step pinpoints where the crash occurs.
The script prints a result line per step and exits with status 1 on the first
failure, after printing the exception (with a traceback for every step except
the bare import).
"""
import os
import sys
import traceback

# Must be set BEFORE sentence_transformers (and thus tokenizers) is imported:
# the tokenizers library reads this at import/fork time, and its fork-based
# parallelism is a known source of deadlocks and crashes.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


def _abort(exc: Exception, *, with_traceback: bool = True) -> None:
    """Report a failed step and terminate the run with exit status 1.

    The bare-import step (Test 1) historically printed no traceback, so
    callers can opt out via ``with_traceback=False``.
    """
    print(f"  ❌ Failed: {exc}")
    if with_traceback:
        traceback.print_exc()
    sys.exit(1)


def main() -> None:
    """Run all seven probes in order, stopping at the first failure."""
    print("=" * 60)
    print("SENTENCE TRANSFORMERS TEST")
    print("=" * 60)
    print(f"Python version: {sys.version}")
    print()

    # Test 1: can the package be imported at all?
    print("Test 1: Import sentence_transformers...")
    try:
        from sentence_transformers import SentenceTransformer
    except Exception as e:
        _abort(e, with_traceback=False)
    print("  ✓ sentence_transformers imported")

    # Test 2: model construction (downloads weights on first run).
    print("\nTest 2: Load model (this downloads ~90MB on first run)...")
    try:
        model = SentenceTransformer("all-MiniLM-L6-v2")
    except Exception as e:
        _abort(e)
    print("  ✓ Model loaded")

    # Test 3: the smallest possible encode call.
    print("\nTest 3: Encode simple text...")
    try:
        embedding = model.encode(["This is a test sentence."])
        print(f"  ✓ Encoded text, embedding shape: {embedding.shape}")
    except Exception as e:
        _abort(e)

    # Test 4: a small batch.
    print("\nTest 4: Encode batch of texts...")
    texts = ["First sentence", "Second sentence", "Third sentence"]
    try:
        embeddings = model.encode(texts, show_progress_bar=False)
        print(f"  ✓ Encoded {len(texts)} texts, shape: {embeddings.shape}")
    except Exception as e:
        _abort(e)

    # Test 5: the exact keyword arguments used by the production script,
    # to rule out a parameter-specific crash.
    print("\nTest 5: Encode with explicit parameters (like in our script)...")
    try:
        embeddings = model.encode(
            texts,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=False,
            device="cpu",
        )
        print(f"  ✓ Encoded with explicit params, shape: {embeddings.shape}")
    except Exception as e:
        _abort(e)

    # Test 6: a larger batch, probing batch-size-dependent failures.
    print("\nTest 6: Encode larger batch (100 texts)...")
    try:
        large_texts = [f"This is test sentence number {i}" for i in range(100)]
        embeddings = model.encode(
            large_texts,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=False,
            device="cpu",
        )
        print(f"  ✓ Encoded {len(large_texts)} texts, shape: {embeddings.shape}")
    except Exception as e:
        _abort(e)

    # Test 7: realistic input — a multi-line Congressional biography excerpt.
    print("\nTest 7: Encode biography-like text...")
    try:
        bio = """A Representative from Illinois and 16th President of the United States;
born in Hardin County, Ky., February 12, 1809; moved with his parents to a tract
on Little Pigeon Creek, Ind., in 1816; attended a log-cabin school at short intervals
and was self-instructed in elementary branches."""
        embedding = model.encode([bio], show_progress_bar=False, device="cpu")
        print(f"  ✓ Encoded biography, shape: {embedding.shape}")
    except Exception as e:
        _abort(e)

    print("\n" + "=" * 60)
    print("✅ ALL TESTS PASSED!")
    print("=" * 60)
    print("\nSentence transformers is working correctly.")
    print("The issue may be with the combination of:")
    print("  - Very large batch processing")
    print("  - Integration with FAISS normalize")
    print("  - Memory management with 13k+ texts")


if __name__ == "__main__":
    main()