PensionBot / check_document_count.py
ChAbhishek28's picture
Enhanced startup logging to show actual document count (23K+ docs) instead of just 7 sample docs
67a99cd
#!/usr/bin/env python3
"""
Check how many documents are actually in the LanceDB database.

Running this script prints the row count of every table reachable through
``lancedb_service.db`` and a short usage summary for the
``rajasthan_documents`` table.
"""
import sys
import os
# Append this file's directory to sys.path before the lancedb_service import
# below. NOTE(review): presumably lancedb_service.py sits next to this script
# and this makes it importable from any working directory -- confirm.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from lancedb_service import lancedb_service
import logging
# Configure the root logger at INFO level; `logger` is used by the report
# function to record unexpected failures.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def _content_preview(content, limit=100):
    """Return ``content`` cut to ``limit`` characters, adding "..." if truncated.

    A missing (``None``) value yields the placeholder ``"No content"`` instead
    of raising, because nullable columns come back as ``None``.
    """
    if content is None:
        return "No content"
    text = content if isinstance(content, str) else str(content)
    if len(text) > limit:
        return text[:limit] + "..."
    return text


def _print_table_samples(table, table_name, sample_size=3):
    """Print the filename and a content preview of the first rows of ``table``."""
    print(f" Sample documents from {table_name}:")
    rows = table.head(sample_size).to_pylist()
    for position, row in enumerate(rows, start=1):
        filename = row.get('filename', 'No filename')
        print(f" - Document {position}: {filename}")
        print(f" Content: {_content_preview(row.get('content'))}")
    print()


def check_document_count(db=None):
    """Print a document-count report for every table in the database.

    The report lists each table with its row count, shows a few sample rows
    for tables whose name contains "document", inspects the
    ``rajasthan_documents`` table for pension/leave/salary keywords, and ends
    with a summary based on the size of that table.

    Args:
        db: Optional database connection exposing ``table_names()`` and
            ``open_table(name)``; the opened tables must provide
            ``count_rows()`` and ``head(n).to_pylist()``. When omitted,
            ``lancedb_service.db`` is used.

    Returns:
        None. The report goes to stdout. A failure on a single table is
        printed and skipped; any other unexpected failure is logged with its
        traceback instead of being raised.
    """
    try:
        if db is None:
            db = lancedb_service.db

        print("📊 Document Count Analysis")
        print("=" * 50)

        table_names = db.table_names()
        print(f"Available tables: {table_names}")
        print()

        for table_name in table_names:
            # Best effort: one unreadable table must not abort the whole report.
            try:
                table = db.open_table(table_name)
                count = table.count_rows()
                print(f"📋 {table_name}: {count} documents")
                if count > 0 and 'document' in table_name.lower():
                    _print_table_samples(table, table_name)
            except Exception as e:
                print(f"❌ Error checking {table_name}: {e}")

        print("\n🔍 Voice Bot Document Usage Analysis:")
        print("-" * 40)

        # Default to 0 so the summary below still works when the table is
        # absent (previously this path hit an unbound local variable).
        raj_count = 0
        if 'rajasthan_documents' in table_names:
            raj_table = db.open_table('rajasthan_documents')
            raj_count = raj_table.count_rows()
            print(f"✅ Voice Bot has access to {raj_count} Rajasthan documents")
            if raj_count > 0:
                print("📄 Document topics include:")
                for doc in raj_table.head(10).to_pylist():
                    filename = doc.get('filename', 'Unknown')
                    print(f" • {filename}")
                    # Only the first 200 characters are scanned for keywords;
                    # a null content column is treated as empty text.
                    snippet = str(doc.get('content') or '')[:200].lower()
                    for topic in ('pension', 'leave', 'salary'):
                        if topic in snippet:
                            print(f" - Contains {topic} information ✅")
        else:
            print("❌ No rajasthan_documents table found!")

        if 'documents' in table_names:
            doc_count = db.open_table('documents').count_rows()
            print(f"📚 General documents table: {doc_count} documents")

        usage = '✅ USING' if raj_count > 0 else '❌ NOT USING'
        coverage = '✅ GOOD' if raj_count >= 5 else '⚠️ LIMITED'
        print("\n📝 Summary:")
        print(f"- The voice bot is {usage} the document database")
        print(f"- Total accessible documents: {raj_count}")
        print(f"- This is {coverage} for comprehensive responses")
        if raj_count < 100:
            print(f"\n💡 Note: You mentioned 1000+ documents, but only {raj_count} are currently loaded.")
            print(" Consider adding more documents to improve response quality.")
    except Exception as e:
        # Top-level boundary of a diagnostic script: record the traceback and
        # return normally instead of propagating.
        logger.exception("❌ Error checking document count: %s", e)
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    check_document_count()