Spaces:
Sleeping
Sleeping
Enhanced startup logging to show actual document count (23K+ docs) instead of just 7 sample docs
67a99cd
#!/usr/bin/env python3
"""
Report how many documents are actually stored in the LanceDB database.
"""
import logging
import os
import sys

# Put this script's own directory on the import path so the sibling
# ``lancedb_service`` module can be imported regardless of the current
# working directory. This must run before the import below.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from lancedb_service import lancedb_service  # noqa: E402 - needs the path tweak above

# INFO-level root logging; ``logger`` is the module-level logger used by the
# report function for unexpected failures.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def _content_preview(row, limit=100):
    """Return a display preview of ``row['content']``, at most ``limit`` chars.

    Yields ``'No content'`` when the key is missing or its value is ``None``;
    ``"..."`` is appended only when the text was actually truncated.
    """
    content = row.get('content')
    if content is None:
        return 'No content'
    if len(content) > limit:
        return content[:limit] + "..."
    return content


def _print_table_report(table, table_name):
    """Print the row count of ``table`` plus up to three sample rows.

    Samples are shown only for non-empty tables whose name contains
    ``document`` (case-insensitive).
    """
    count = table.count_rows()
    print(f"π {table_name}: {count} documents")

    if count > 0 and 'document' in table_name.lower():
        print(f" Sample documents from {table_name}:")
        for index, row in enumerate(table.head(3).to_pylist(), start=1):
            filename = row.get('filename', 'No filename')
            print(f" - Document {index}: {filename}")
            print(f" Content: {_content_preview(row)}")
        # Blank line separates the sample listing from the next table.
        print()


def check_document_count():
    """Check how many documents are in each table and print a report.

    Lists every table exposed by ``lancedb_service.db`` with its row count,
    prints sample rows for document tables, then summarises the
    ``rajasthan_documents`` table, which this script treats as the voice
    bot's document source.

    Returns:
        None. The report goes to stdout. Unexpected failures are logged via
        the module-level ``logger`` rather than raised.
    """
    # NOTE(review): the status glyphs inside the printed strings look
    # mis-encoded (probably emoji damaged in transit). They are kept verbatim
    # here - confirm against the original file before changing them.
    try:
        db = lancedb_service.db

        print("π Document Count Analysis")
        print("=" * 50)

        # Check all tables
        table_names = db.table_names()
        print(f"Available tables: {table_names}")
        print()

        for table_name in table_names:
            # A failure on one table must not abort the report for the rest.
            try:
                _print_table_report(db.open_table(table_name), table_name)
            except Exception as e:
                print(f"β Error checking {table_name}: {e}")

        print("\nπ Voice Bot Document Usage Analysis:")
        print("-" * 40)

        # Default to 0 so the summary below still works when the table is
        # missing (previously this path hit a NameError on ``raj_count``).
        raj_count = 0

        # Check if voice bot is using documents
        if 'rajasthan_documents' in table_names:
            raj_table = db.open_table('rajasthan_documents')
            raj_count = raj_table.count_rows()
            print(f"β Voice Bot has access to {raj_count} Rajasthan documents")

            if raj_count > 0:
                print("π Document topics include:")
                for doc in raj_table.head(10).to_pylist():
                    filename = doc.get('filename', 'Unknown')
                    # Only the first 200 characters are scanned for keywords;
                    # ``or ''`` guards against a stored None.
                    snippet = (doc.get('content') or '')[:200].lower()
                    print(f" β’ {filename}")
                    for topic in ('pension', 'leave', 'salary'):
                        if topic in snippet:
                            print(f" - Contains {topic} information β ")
        else:
            print("β No rajasthan_documents table found!")

        # Check regular documents table
        if 'documents' in table_names:
            doc_count = db.open_table('documents').count_rows()
            print(f"π General documents table: {doc_count} documents")

        print("\nπ Summary:")
        print(f"- The voice bot is {'β USING' if raj_count > 0 else 'β NOT USING'} the document database")
        print(f"- Total accessible documents: {raj_count}")
        print(f"- This is {'β GOOD' if raj_count >= 5 else 'β οΈ LIMITED'} for comprehensive responses")

        if raj_count < 100:
            print(f"\nπ‘ Note: You mentioned 1000+ documents, but only {raj_count} are currently loaded.")
            print(" Consider adding more documents to improve response quality.")

    except Exception as e:
        # Top-level boundary for a diagnostic script: log with traceback
        # instead of crashing.
        logger.exception("β Error checking document count: %s", e)
if __name__ == "__main__":
    # Script entry point: print the report only when run directly, so
    # importing this module has no output side effects.
    check_document_count()