File size: 1,546 Bytes
67a99cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
"""
Enhanced document loader for bulk-importing 1000+ government documents.
Add this function to your setup_documents.py, or create it as a separate service.
"""

import os
import json
from pathlib import Path

def load_bulk_documents() -> list:
    """Load documents from the local bulk-import directories.

    Two directories, resolved relative to the current working directory,
    are scanned; a directory that is missing is simply skipped:

    * ``government_docs/*.json`` -- each file holds either a JSON array
      (its items are added as-is) or a single JSON object (added as one
      document).
    * ``text_documents/*.txt`` -- each file becomes one dict with the keys
      ``content``, ``filename`` and ``source``.

    Files are visited in sorted path order so the result is reproducible
    across runs and platforms. PDF loading is not implemented; it would
    need a third-party parser such as PyPDF2.

    Returns:
        A list with the JSON-sourced documents first, followed by the
        text-sourced documents.

    Raises:
        ValueError: If a JSON file is malformed (``json.JSONDecodeError``
            is a ``ValueError``) or its top-level value is neither an
            array nor an object.
    """
    documents = []

    # Option 1: JSON batches.
    json_dir = Path("government_docs")
    if json_dir.is_dir():
        for json_path in sorted(json_dir.glob("*.json")):
            with open(json_path, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)
            if isinstance(payload, list):
                documents.extend(payload)
            elif isinstance(payload, dict):
                # A lone object is one document; extend() would have
                # added its keys as separate string entries instead.
                documents.append(payload)
            else:
                raise ValueError(
                    f"{json_path}: expected a JSON array or object, "
                    f"got {type(payload).__name__}"
                )

    # Option 2: plain-text files, one document per file.
    text_dir = Path("text_documents")
    if text_dir.is_dir():
        for text_path in sorted(text_dir.glob("*.txt")):
            with open(text_path, 'r', encoding='utf-8') as handle:
                body = handle.read()
            documents.append({
                "content": body,
                "filename": text_path.name,
                "source": "Government Policy Manual",
            })

    return documents

# Add this to your setup_sample_documents() function:
# bulk_docs = load_bulk_documents()
# SAMPLE_DOCUMENTS.extend(bulk_docs)