#!/usr/bin/env python3
"""
Check how many documents are actually in the LanceDB database
"""

import logging
import os
import sys

# Make the sibling ``lancedb_service`` module importable when this script is
# run directly from another working directory.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from lancedb_service import lancedb_service  # noqa: E402  (needs sys.path tweak above)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Table names the report looks for explicitly.
_RAJASTHAN_TABLE = 'rajasthan_documents'
_GENERAL_TABLE = 'documents'

# Keywords searched for in the leading part of each document's content.
_TOPIC_KEYWORDS = ('pension', 'leave', 'salary')


def _preview(content, limit=100):
    """Return *content* truncated to *limit* characters for display.

    A missing (``None``) value yields ``'No content'``; text longer than
    *limit* is cut and suffixed with ``'...'``.
    """
    if content is None:
        return 'No content'
    text = str(content)
    return text[:limit] + "..." if len(text) > limit else text


def _print_table_overview(db, table_names):
    """Print the row count of every table, with samples for document tables."""
    for table_name in table_names:
        # Best-effort per table: one unreadable table must not abort the report.
        try:
            table = db.open_table(table_name)
            count = table.count_rows()
            print(f"📋 {table_name}: {count} documents")

            # Show sample data for document tables
            if count > 0 and 'document' in table_name.lower():
                print(f" Sample documents from {table_name}:")
                for i, row in enumerate(table.head(3).to_pylist(), start=1):
                    filename = row.get('filename', 'No filename')
                    print(f" - Document {i}: {filename}")
                    print(f" Content: {_preview(row.get('content'))}")
                    print()
        except Exception as e:
            print(f"❌ Error checking {table_name}: {e}")


def _print_rajasthan_topics(raj_table):
    """Print the first ten Rajasthan documents and the topics they mention."""
    print("📄 Document topics include:")
    for doc in raj_table.head(10).to_pylist():
        filename = doc.get('filename', 'Unknown')
        # Only the first 200 characters are scanned for keywords; a None
        # content value is treated as empty text.
        snippet = str(doc.get('content') or '')[:200].lower()
        print(f" • {filename}")
        for keyword in _TOPIC_KEYWORDS:
            if keyword in snippet:
                print(f" - Contains {keyword} information ✅")


def check_document_count() -> None:
    """Check how many documents are in each table.

    Prints a per-table row count (with a few sample rows for tables whose
    name contains ``document``), then a usage analysis of the
    ``rajasthan_documents`` table and a short summary.

    Any unexpected error is logged (with traceback) rather than raised, so
    the function always returns ``None``.
    """
    try:
        db = lancedb_service.db

        print("📊 Document Count Analysis")
        print("=" * 50)

        # Check all tables
        table_names = db.table_names()
        print(f"Available tables: {table_names}")
        print()

        _print_table_overview(db, table_names)

        print("\n🔍 Voice Bot Document Usage Analysis:")
        print("-" * 40)

        # Default to 0 so the summary below still works when the table is
        # missing (previously this raised UnboundLocalError).
        raj_count = 0

        # Check if voice bot is using documents
        if _RAJASTHAN_TABLE in table_names:
            raj_table = db.open_table(_RAJASTHAN_TABLE)
            raj_count = raj_table.count_rows()
            print(f"✅ Voice Bot has access to {raj_count} Rajasthan documents")

            if raj_count > 0:
                _print_rajasthan_topics(raj_table)
        else:
            print("❌ No rajasthan_documents table found!")

        # Check regular documents table
        if _GENERAL_TABLE in table_names:
            doc_table = db.open_table(_GENERAL_TABLE)
            doc_count = doc_table.count_rows()
            print(f"📚 General documents table: {doc_count} documents")

        print("\n📝 Summary:")
        print(f"- The voice bot is {'✅ USING' if raj_count > 0 else '❌ NOT USING'} the document database")
        print(f"- Total accessible documents: {raj_count}")
        print(f"- This is {'✅ GOOD' if raj_count >= 5 else '⚠️ LIMITED'} for comprehensive responses")

        if raj_count < 100:
            print(f"\n💡 Note: You mentioned 1000+ documents, but only {raj_count} are currently loaded.")
            print(" Consider adding more documents to improve response quality.")

    except Exception as e:
        # Top-level boundary of a diagnostic script: log with traceback
        # instead of crashing.
        logger.exception("❌ Error checking document count: %s", e)


if __name__ == "__main__":
    check_document_count()