Spaces:

ChAbhishek28
/

PensionBot

Sleeping

App Files Files Community

PensionBot / check_document_count.py

ChAbhishek28

Enhanced startup logging to show actual document count (23K+ docs) instead of just 7 sample docs

67a99cd 2 months ago

raw

history blame contribute delete

3.97 kB

	#!/usr/bin/env python3
	"""
	Check how many documents are actually in the LanceDB database
	"""

	import sys
	import os
	# Add the directory containing this script to sys.path so the
	# `lancedb_service` import below resolves regardless of the current
	# working directory (presumably lancedb_service.py lives next to this
	# file — confirm). This must run before that import.
	sys.path.append(os.path.dirname(os.path.abspath(__file__)))

	from lancedb_service import lancedb_service
	import logging

	# Configure logging at INFO level; `logger` is what
	# check_document_count() uses to report an unexpected failure.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	def _as_text(value, default=""):
	    """Return *value* as a string, or *default* when it is None.

	    A column that exists but holds a null comes back from ``dict.get`` as
	    ``None``; normalising here keeps ``len()`` and slicing from raising
	    ``TypeError`` on such rows.
	    """
	    return default if value is None else str(value)


	def check_document_count():
	    """Print a report of how many rows each LanceDB table contains.

	    Reads the database handle from ``lancedb_service.db`` and writes to
	    stdout:

	    1. every table name with its row count, plus up to three sample rows
	       (filename and a 100-character content preview) for tables whose
	       name contains "document";
	    2. whether the ``rajasthan_documents`` table exists and, for up to ten
	       of its rows, which of the topics pension / leave / salary appear in
	       the first 200 characters of the content;
	    3. the row count of the ``documents`` table, if present;
	    4. a short summary based on the ``rajasthan_documents`` row count
	       (treated as 0 when that table does not exist).

	    Returns:
	        None. The report is printed, not returned.

	    Raises:
	        Nothing for ordinary errors: a failure while inspecting a single
	        table is printed and the loop continues; any other exception is
	        logged (with traceback) through the module ``logger``.
	    """
	    try:
	        db = lancedb_service.db

	        print("📊 Document Count Analysis")
	        print("=" * 50)

	        # Check all tables
	        table_names = db.table_names()
	        print(f"Available tables: {table_names}")
	        print()

	        for table_name in table_names:
	            # One broken table must not abort the report for the others.
	            try:
	                table = db.open_table(table_name)
	                count = table.count_rows()
	                print(f"📋 {table_name}: {count} documents")

	                # Show sample data for document tables
	                if count > 0 and 'document' in table_name.lower():
	                    print(f" Sample documents from {table_name}:")
	                    sample = table.head(3)
	                    for i, row in enumerate(sample.to_pylist(), start=1):
	                        content = _as_text(row.get('content'), 'No content')
	                        if len(content) > 100:
	                            content_preview = content[:100] + "..."
	                        else:
	                            content_preview = content
	                        filename = row.get('filename', 'No filename')
	                        print(f" - Document {i}: {filename}")
	                        print(f" Content: {content_preview}")
	                    print()

	            except Exception as e:
	                print(f"❌ Error checking {table_name}: {e}")

	        print("\n🔍 Voice Bot Document Usage Analysis:")
	        print("-" * 40)

	        # Default to 0 so the summary below still works when the table is
	        # missing (previously this path hit a NameError on raj_count).
	        raj_count = 0

	        # Check if voice bot is using documents
	        if 'rajasthan_documents' in table_names:
	            raj_table = db.open_table('rajasthan_documents')
	            raj_count = raj_table.count_rows()
	            print(f"✅ Voice Bot has access to {raj_count} Rajasthan documents")

	            if raj_count > 0:
	                print("📄 Document topics include:")
	                for doc in raj_table.head(10).to_pylist():
	                    filename = doc.get('filename', 'Unknown')
	                    # Only the first 200 characters are scanned for topics.
	                    snippet = _as_text(doc.get('content'))[:200].lower()
	                    print(f" • {filename}")
	                    for topic in ('pension', 'leave', 'salary'):
	                        if topic in snippet:
	                            print(f" - Contains {topic} information ✅")
	        else:
	            print("❌ No rajasthan_documents table found!")

	        # Check regular documents table
	        if 'documents' in table_names:
	            doc_table = db.open_table('documents')
	            doc_count = doc_table.count_rows()
	            print(f"📚 General documents table: {doc_count} documents")

	        print("\n📝 Summary:")
	        print(f"- The voice bot is {'✅ USING' if raj_count > 0 else '❌ NOT USING'} the document database")
	        print(f"- Total accessible documents: {raj_count}")
	        print(f"- This is {'✅ GOOD' if raj_count >= 5 else '⚠️ LIMITED'} for comprehensive responses")

	        if raj_count < 100:
	            print(f"\n💡 Note: You mentioned 1000+ documents, but only {raj_count} are currently loaded.")
	            print(" Consider adding more documents to improve response quality.")

	    except Exception as e:
	        # Top-level boundary of a diagnostic script: log with traceback
	        # instead of crashing.
	        logger.exception(f"❌ Error checking document count: {e}")

	# Run the report only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    check_document_count()