# PensionBot / bulk_document_loader.py
# Author: ChAbhishek28
# Commit message: Enhanced startup logging to show actual document count (23K+ docs) instead of just 7 sample docs
# Commit: 67a99cd
"""
Enhanced document loader for 1000+ government documents
Add this to your setup_documents.py, or run it as a separate service.
"""
import os
import json
from pathlib import Path
def load_bulk_documents(*, json_dir="government_docs", text_dir="text_documents"):
    """Load documents from on-disk JSON and plain-text directories.

    Args:
        json_dir: Directory scanned (non-recursively) for ``*.json`` files.
            Each file may contain either a list of documents or a single
            document object. Defaults to ``"government_docs"``.
        text_dir: Directory scanned (non-recursively) for ``*.txt`` files.
            Each file becomes one document. Defaults to ``"text_documents"``.

    Returns:
        A list of documents: entries from the JSON files first, followed by
        one dict per text file with the keys ``"content"``, ``"filename"``
        and ``"source"``. Files within each directory are processed in
        sorted path order. A directory that does not exist contributes
        nothing, so the result may be an empty list.

    Raises:
        TypeError: If the top-level value of a JSON file is neither a list
            nor an object.
        json.JSONDecodeError: If a JSON file is malformed.
        OSError: If a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    documents = []

    # Option 1: Load from JSON files.
    json_root = Path(json_dir)
    if json_root.is_dir():
        # Sort so the resulting document order does not depend on the
        # order in which the filesystem happens to list the directory.
        for json_path in sorted(json_root.glob("*.json")):
            payload = json.loads(json_path.read_text(encoding="utf-8"))
            if isinstance(payload, list):
                documents.extend(payload)
            elif isinstance(payload, dict):
                # A file holding one document object: add it as a single
                # document. Calling extend() here would add the dict's
                # keys (plain strings) instead of the document itself.
                documents.append(payload)
            else:
                raise TypeError(
                    f"{json_path}: expected a JSON list or object at the "
                    f"top level, got {type(payload).__name__}"
                )

    # Option 2: Load from text files.
    text_root = Path(text_dir)
    if text_root.is_dir():
        for txt_path in sorted(text_root.glob("*.txt")):
            documents.append({
                "content": txt_path.read_text(encoding="utf-8"),
                "filename": txt_path.name,
                "source": "Government Policy Manual",
            })

    # Option 3 (not implemented): load from a "pdf_documents" directory.
    # This would require PyPDF2: iterate over the "*.pdf" files, extract
    # the text of each PDF and append it to `documents`.

    return documents
# Add this to your setup_sample_documents() function:
# bulk_docs = load_bulk_documents()
# SAMPLE_DOCUMENTS.extend(bulk_docs)