Spaces:
Sleeping
Sleeping
File size: 3,967 Bytes
67a99cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
#!/usr/bin/env python3
"""
Check how many documents are actually in the LanceDB database.

Running this file as a script prints the row count of every table plus a
short summary of the ``rajasthan_documents`` table.
"""
import sys
import os
# Add this script's own directory to the module search path. Presumably this
# is so `lancedb_service` resolves no matter where the script is launched
# from -- keep this line ahead of the import below.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from lancedb_service import lancedb_service
import logging
# Configure the root logger to emit INFO and above.
logging.basicConfig(level=logging.INFO)
# Module-level logger, used to report a failure of the whole check.
logger = logging.getLogger(__name__)
def _preview_text(content, limit=100):
    """Return *content* shortened to at most *limit* characters for display.

    A missing/null value becomes ``'No content'``; text longer than *limit*
    is cut and suffixed with ``...``.
    """
    if content is None:
        return 'No content'
    text = str(content)
    return text[:limit] + "..." if len(text) > limit else text


def check_document_count(db=None):
    """Print per-table row counts and a voice-bot document usage summary.

    Every table is listed with its row count; tables whose name contains
    ``document`` also get a preview of their first three rows. The
    ``rajasthan_documents`` table is then inspected for a few keywords
    (first 200 characters of up to ten rows) and a summary is printed.

    Args:
        db: Optional connection-like object providing ``table_names()`` and
            ``open_table(name)``; the tables it returns must provide
            ``count_rows()`` and ``head(n).to_pylist()``. Defaults to
            ``lancedb_service.db``.

    Returns:
        None. Output goes to stdout. A failure on a single table is printed
        and skipped; any other failure is logged instead of raised, because
        this is a best-effort diagnostic.
    """
    # NOTE(review): the lone 'π' in several messages is what survives of an
    # emoji whose original code point cannot be recovered from this copy of
    # the file; it is left untouched rather than guessed.
    try:
        if db is None:
            db = lancedb_service.db

        print("π Document Count Analysis")
        print("=" * 50)

        # Check all tables
        table_names = db.table_names()
        print(f"Available tables: {table_names}")
        print()

        for table_name in table_names:
            try:
                table = db.open_table(table_name)
                count = table.count_rows()
                print(f"π {table_name}: {count} documents")

                # Show sample data for document tables
                if count > 0 and 'document' in table_name.lower():
                    print(f" Sample documents from {table_name}:")
                    rows = table.head(3).to_pylist()
                    for number, row in enumerate(rows, start=1):
                        filename = row.get('filename', 'No filename')
                        print(f" - Document {number}: {filename}")
                        print(f" Content: {_preview_text(row.get('content'))}")
                    print()
            except Exception as e:
                # One unreadable table must not abort the whole report.
                print(f"❌ Error checking {table_name}: {e}")

        print("\nπ Voice Bot Document Usage Analysis:")
        print("-" * 40)

        # Default to 0 so the summary below works even when the table is
        # absent (previously this raised UnboundLocalError in that case).
        raj_count = 0

        # Check if voice bot is using documents
        if 'rajasthan_documents' in table_names:
            raj_table = db.open_table('rajasthan_documents')
            raj_count = raj_table.count_rows()
            print(f"✅ Voice Bot has access to {raj_count} Rajasthan documents")

            if raj_count > 0:
                print("π Document topics include:")
                for doc in raj_table.head(10).to_pylist():
                    print(f" • {doc.get('filename', 'Unknown')}")
                    # Only the first 200 characters are scanned; a null
                    # content field is treated as empty text.
                    snippet = str(doc.get('content') or '')[:200].lower()
                    for topic in ('pension', 'leave', 'salary'):
                        if topic in snippet:
                            print(f" - Contains {topic} information ✅")
        else:
            print("❌ No rajasthan_documents table found!")

        # Check regular documents table
        if 'documents' in table_names:
            doc_count = db.open_table('documents').count_rows()
            print(f"π General documents table: {doc_count} documents")

        print("\nπ Summary:")
        print(f"- The voice bot is {'✅ USING' if raj_count > 0 else '❌ NOT USING'} the document database")
        print(f"- Total accessible documents: {raj_count}")
        print(f"- This is {'✅ GOOD' if raj_count >= 5 else '⚠️ LIMITED'} for comprehensive responses")

        if raj_count < 100:
            print(f"\n💡 Note: You mentioned 1000+ documents, but only {raj_count} are currently loaded.")
            print(" Consider adding more documents to improve response quality.")
    except Exception as e:
        logger.error("❌ Error checking document count: %s", e)
if __name__ == "__main__":
    # Run the report only when executed as a script, not on import.
    check_document_count()