Spaces:
Sleeping
Sleeping
Enhanced startup logging to show actual document count (23K+ docs) instead of just 7 sample docs
67a99cd
| """ | |
| Enhanced document loader for 1000+ government documents | |
| Add this to your setup_documents.py, or create it as a separate service. | |
| """ | |
| import os | |
| import json | |
| from pathlib import Path | |
def load_bulk_documents(
    json_dir="government_docs",
    text_dir="text_documents",
    text_source="Government Policy Manual",
) -> list:
    """Load documents from external sources on disk.

    Two sources are read, in this order:

    1. Every ``*.json`` file directly inside *json_dir*. A file may hold
       either a list of documents or a single document object.
    2. Every ``*.txt`` file directly inside *text_dir*. Each file becomes one
       dict with ``"content"``, ``"filename"`` and ``"source"`` keys.

    Files are visited in sorted path order so the resulting document order is
    reproducible regardless of how the filesystem enumerates entries. A
    directory that does not exist is simply skipped.

    Args:
        json_dir: Directory containing JSON document batches.
        text_dir: Directory containing plain-text documents.
        text_source: Value stored under ``"source"`` for every text document.

    Returns:
        A new list with all loaded documents; empty when nothing was found.

    Raises:
        ValueError: If a JSON file is malformed (``json.JSONDecodeError`` is a
            ``ValueError`` subclass) or its top-level value is neither a list
            nor an object.
    """
    documents = []

    # Option 1: Load from JSON files.
    json_root = Path(json_dir)
    if json_root.is_dir():
        for json_file in sorted(json_root.glob("*.json")):
            with open(json_file, "r", encoding="utf-8") as f:
                batch = json.load(f)
            if isinstance(batch, list):
                documents.extend(batch)
            elif isinstance(batch, dict):
                # A lone object is one document; extend() would have added
                # its keys (plain strings) instead of the document itself.
                documents.append(batch)
            else:
                raise ValueError(
                    f"{json_file}: expected a JSON list or object at the top "
                    f"level, got {type(batch).__name__}"
                )

    # Option 2: Load from text files, one document per file.
    text_root = Path(text_dir)
    if text_root.is_dir():
        for txt_file in sorted(text_root.glob("*.txt")):
            documents.append({
                "content": txt_file.read_text(encoding="utf-8"),
                "filename": txt_file.name,
                "source": text_source,
            })

    # Option 3 (not implemented): PDF files from a "pdf_documents" directory
    # could be added here; text extraction would need a PDF library such as
    # PyPDF2.
    return documents
| # Add this to your setup_sample_documents() function: | |
| # bulk_docs = load_bulk_documents() | |
| # SAMPLE_DOCUMENTS.extend(bulk_docs) |