Spaces:
Running
Running
#!/usr/bin/env python3
"""
Test sentence-transformers to isolate the segfault.

The script runs seven increasingly demanding steps (import, model load,
several ``encode`` calls) and stops at the first one that fails, exiting
with status 1.

A segfault is not a Python exception, so ``try``/``except`` cannot report
it. Two measures make a hard crash diagnosable anyway:

* ``faulthandler`` is enabled so the interpreter dumps the Python stack
  to stderr when it receives a fatal signal;
* every progress line is flushed immediately, so the log always shows
  which step was running when the process died, even if stdout is a pipe.
"""
import faulthandler
import os
import sys
import traceback
from typing import Any, Callable, Sequence

BANNER = "=" * 60
PASS_MARK = "✓"
FAIL_MARK = "✗"
MODEL_NAME = 'all-MiniLM-L6-v2'

# Keyword arguments shared by the "explicit parameters" steps (tests 5 and 6).
EXPLICIT_ENCODE_KWARGS = {
    "show_progress_bar": False,
    "convert_to_numpy": True,
    "normalize_embeddings": False,
    "device": 'cpu',
}

BIO_TEXT = """A Representative from Illinois and 16th President of the United States;
born in Hardin County, Ky., February 12, 1809; moved with his parents to a tract
on Little Pigeon Creek, Ind., in 1816; attended a log-cabin school at short intervals
and was self-instructed in elementary branches."""


def log(message: str = "") -> None:
    """Print *message* and flush, so it survives a crash in the next call."""
    print(message, flush=True)


def run_step(title: str, action: Callable[[], Any], show_traceback: bool = True) -> Any:
    """Announce a test step, run it, and abort the script if it raises.

    Args:
        title: Line printed before the step starts.
        action: Zero-argument callable performing the step. It is expected
            to print its own success line.
        show_traceback: When true, print the full traceback on failure in
            addition to the one-line failure message.

    Returns:
        Whatever *action* returns.

    Raises:
        SystemExit: With code 1 when *action* raises an ``Exception``.
    """
    log(title)
    try:
        return action()
    except Exception as e:
        log(f" {FAIL_MARK} Failed: {e}")
        if show_traceback:
            traceback.print_exc()
        sys.exit(1)


def import_library() -> Any:
    """Import sentence_transformers and return the ``SentenceTransformer`` class."""
    from sentence_transformers import SentenceTransformer
    log(f" {PASS_MARK} sentence_transformers imported")
    return SentenceTransformer


def load_model(transformer_cls: Any) -> Any:
    """Instantiate the model named by ``MODEL_NAME`` and return it."""
    # Set just before the model (and its tokenizer) is created, as in the
    # original flow of the script.
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    model = transformer_cls(MODEL_NAME)
    log(f" {PASS_MARK} Model loaded")
    return model


def encode_and_report(model: Any, texts: Sequence[str], label: str, **encode_kwargs: Any) -> Any:
    """Encode *texts* with *model* and print ``<label>: <embedding shape>``.

    Args:
        model: Object exposing ``encode(texts, **kwargs)`` whose result has
            a ``shape`` attribute.
        texts: Sentences passed to ``model.encode``.
        label: Text printed before the shape on success.
        **encode_kwargs: Forwarded unchanged to ``model.encode``.

    Returns:
        The value returned by ``model.encode``.
    """
    embeddings = model.encode(texts, **encode_kwargs)
    log(f" {PASS_MARK} {label}: {embeddings.shape}")
    return embeddings


def main() -> None:
    """Run all steps in order; exit with status 1 at the first failure."""
    try:
        # Dump the Python stack on SIGSEGV and other fatal signals.
        faulthandler.enable()
    except (RuntimeError, OSError, ValueError, AttributeError):
        # stderr is missing or has no file descriptor; continue without it.
        pass

    log(BANNER)
    log("SENTENCE TRANSFORMERS TEST")
    log(BANNER)
    log(f"Python version: {sys.version}")
    log()

    # Test 1: Import sentence_transformers (no traceback on failure)
    transformer_cls = run_step(
        "Test 1: Import sentence_transformers...",
        import_library,
        show_traceback=False,
    )

    # Test 2: Load model
    model = run_step(
        "\nTest 2: Load model (this downloads ~90MB on first run)...",
        lambda: load_model(transformer_cls),
    )

    # Test 3: Encode simple text
    text = "This is a test sentence."
    run_step(
        "\nTest 3: Encode simple text...",
        lambda: encode_and_report(model, [text], "Encoded text, embedding shape"),
    )

    # Test 4: Encode batch
    texts = ["First sentence", "Second sentence", "Third sentence"]
    run_step(
        "\nTest 4: Encode batch of texts...",
        lambda: encode_and_report(
            model,
            texts,
            f"Encoded {len(texts)} texts, shape",
            show_progress_bar=False,
        ),
    )

    # Test 5: Encode with explicit parameters
    run_step(
        "\nTest 5: Encode with explicit parameters (like in our script)...",
        lambda: encode_and_report(
            model,
            texts,
            "Encoded with explicit params, shape",
            **EXPLICIT_ENCODE_KWARGS,
        ),
    )

    # Test 6: Encode larger batch
    large_texts = [f"This is test sentence number {i}" for i in range(100)]
    run_step(
        "\nTest 6: Encode larger batch (100 texts)...",
        lambda: encode_and_report(
            model,
            large_texts,
            f"Encoded {len(large_texts)} texts, shape",
            **EXPLICIT_ENCODE_KWARGS,
        ),
    )

    # Test 7: Test with actual biography-like text
    run_step(
        "\nTest 7: Encode biography-like text...",
        lambda: encode_and_report(
            model,
            [BIO_TEXT],
            "Encoded biography, shape",
            show_progress_bar=False,
            device='cpu',
        ),
    )

    log("\n" + BANNER)
    log(f"{PASS_MARK} ALL TESTS PASSED!")
    log(BANNER)
    log("\nSentence transformers is working correctly.")
    log("The issue may be with the combination of:")
    log(" - Very large batch processing")
    log(" - Integration with FAISS normalize")
    log(" - Memory management with 13k+ texts")


if __name__ == "__main__":
    main()