# BioGuideMCP / test_sentence_transformers.py
# From stefanjwojcik's repository, commit 15de73a:
# "Add setup script and comprehensive tests for Congressional Bioguide MCP Server"
#!/usr/bin/env python3
"""
Test sentence-transformers in progressively heavier steps to isolate a segfault.

Each probe builds on the previous one (import -> model load -> single encode ->
batch encode -> explicit-parameter encode -> 100-text batch -> realistic
biography text) so the first failing step pinpoints where the crash occurs.
The script prints a result line per step and exits with status 1 on the first
failure, after printing the exception (with a traceback for every step except
the bare import).
"""
import os
import sys
import traceback

# Must be set BEFORE sentence_transformers (and thus tokenizers) is imported:
# the tokenizers library reads this at import/fork time, and its fork-based
# parallelism is a known source of deadlocks and crashes.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


def _abort(exc: Exception, *, with_traceback: bool = True) -> None:
    """Report a failed step and terminate the run with exit status 1.

    The bare-import step (Test 1) historically printed no traceback, so
    callers can opt out via ``with_traceback=False``.
    """
    print(f"  ❌ Failed: {exc}")
    if with_traceback:
        traceback.print_exc()
    sys.exit(1)


def main() -> None:
    """Run all seven probes in order, stopping at the first failure."""
    print("=" * 60)
    print("SENTENCE TRANSFORMERS TEST")
    print("=" * 60)
    print(f"Python version: {sys.version}")
    print()

    # Test 1: can the package be imported at all?
    print("Test 1: Import sentence_transformers...")
    try:
        from sentence_transformers import SentenceTransformer
    except Exception as e:
        _abort(e, with_traceback=False)
    print("  ✓ sentence_transformers imported")

    # Test 2: model construction (downloads weights on first run).
    print("\nTest 2: Load model (this downloads ~90MB on first run)...")
    try:
        model = SentenceTransformer("all-MiniLM-L6-v2")
    except Exception as e:
        _abort(e)
    print("  ✓ Model loaded")

    # Test 3: the smallest possible encode call.
    print("\nTest 3: Encode simple text...")
    try:
        embedding = model.encode(["This is a test sentence."])
        print(f"  ✓ Encoded text, embedding shape: {embedding.shape}")
    except Exception as e:
        _abort(e)

    # Test 4: a small batch.
    print("\nTest 4: Encode batch of texts...")
    texts = ["First sentence", "Second sentence", "Third sentence"]
    try:
        embeddings = model.encode(texts, show_progress_bar=False)
        print(f"  ✓ Encoded {len(texts)} texts, shape: {embeddings.shape}")
    except Exception as e:
        _abort(e)

    # Test 5: the exact keyword arguments used by the production script,
    # to rule out a parameter-specific crash.
    print("\nTest 5: Encode with explicit parameters (like in our script)...")
    try:
        embeddings = model.encode(
            texts,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=False,
            device="cpu",
        )
        print(f"  ✓ Encoded with explicit params, shape: {embeddings.shape}")
    except Exception as e:
        _abort(e)

    # Test 6: a larger batch, probing batch-size-dependent failures.
    print("\nTest 6: Encode larger batch (100 texts)...")
    try:
        large_texts = [f"This is test sentence number {i}" for i in range(100)]
        embeddings = model.encode(
            large_texts,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=False,
            device="cpu",
        )
        print(f"  ✓ Encoded {len(large_texts)} texts, shape: {embeddings.shape}")
    except Exception as e:
        _abort(e)

    # Test 7: realistic input — a multi-line Congressional biography excerpt.
    print("\nTest 7: Encode biography-like text...")
    try:
        bio = """A Representative from Illinois and 16th President of the United States;
born in Hardin County, Ky., February 12, 1809; moved with his parents to a tract
on Little Pigeon Creek, Ind., in 1816; attended a log-cabin school at short intervals
and was self-instructed in elementary branches."""
        embedding = model.encode([bio], show_progress_bar=False, device="cpu")
        print(f"  ✓ Encoded biography, shape: {embedding.shape}")
    except Exception as e:
        _abort(e)

    print("\n" + "=" * 60)
    print("✅ ALL TESTS PASSED!")
    print("=" * 60)
    print("\nSentence transformers is working correctly.")
    print("The issue may be with the combination of:")
    print("  - Very large batch processing")
    print("  - Integration with FAISS normalize")
    print("  - Memory management with 13k+ texts")


if __name__ == "__main__":
    main()