Spaces:

ChAbhishek28
/

PensionBot

Sleeping

PensionBot / bulk_document_loader.py

Enhanced startup logging to show actual document count (23K+ docs) instead of just 7 sample docs

67a99cd 2 months ago

1.55 kB

	"""
	Enhanced document loader for 1000+ government documents.
	Add this to your setup_documents.py, or create it as a separate service.
	"""

	import os
	import json
	from pathlib import Path

	def load_bulk_documents():
	    """Load documents from the local bulk-document directories.

	    Two directories, both relative to the current working directory, are
	    scanned if they exist:

	    * ``government_docs/*.json`` -- each file is parsed as JSON. A top-level
	      array contributes each of its items as a document; a top-level object
	      is treated as one document. Any other top-level value is skipped with
	      a logged warning.
	    * ``text_documents/*.txt`` -- each file becomes one document dict with
	      the keys ``content``, ``filename`` and ``source``.

	    Files are processed in sorted path order so the result is deterministic.

	    Returns:
	        list: The collected documents, JSON-sourced entries first, then the
	        text-file entries. Empty when neither directory exists.

	    Raises:
	        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
	        OSError: If a matched file cannot be opened or read.
	        UnicodeDecodeError: If a matched file is not valid UTF-8.
	    """
	    # Function-scope import keeps this block self-contained; the module's
	    # top-level imports are left untouched.
	    import logging

	    logger = logging.getLogger(__name__)
	    documents = []

	    # Option 1: Load from JSON files
	    docs_dir = Path("government_docs")  # Create this directory
	    if docs_dir.exists():
	        # sorted(): glob() order depends on the filesystem.
	        for json_file in sorted(docs_dir.glob("*.json")):
	            with open(json_file, 'r', encoding='utf-8') as f:
	                batch_docs = json.load(f)
	            if isinstance(batch_docs, list):
	                documents.extend(batch_docs)
	            elif isinstance(batch_docs, dict):
	                # A lone object is a single document; extend() would have
	                # added its keys as separate "documents".
	                documents.append(batch_docs)
	            else:
	                logger.warning(
	                    "Skipping %s: expected a JSON array or object, got %s",
	                    json_file,
	                    type(batch_docs).__name__,
	                )

	    # Option 2: Load from text files
	    text_docs_dir = Path("text_documents")
	    if text_docs_dir.exists():
	        for txt_file in sorted(text_docs_dir.glob("*.txt")):
	            with open(txt_file, 'r', encoding='utf-8') as f:
	                content = f.read()
	            documents.append({
	                "content": content,
	                "filename": txt_file.name,
	                "source": "Government Policy Manual",
	            })

	    # Option 3 (not implemented): load PDFs from a "pdf_documents" directory.
	    # This needs a PDF text extractor such as PyPDF2; extract the text of
	    # each *.pdf file and append it to ``documents`` like the text files.

	    return documents

	# Add this to your setup_sample_documents() function:
	# bulk_docs = load_bulk_documents()
	# SAMPLE_DOCUMENTS.extend(bulk_docs)