from contextlib import asynccontextmanager
from typing import Dict, List, Optional
import urllib.parse

from fastapi import BackgroundTasks, FastAPI, HTTPException
from pydantic import BaseModel, HttpUrl

from main import WebScrapingOrchestrator

# Global orchestrator instance (defined before lifespan, which closes it on shutdown)
orchestrator = WebScrapingOrchestrator()


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup code (if any) goes here
    yield
    # Shutdown code goes here
    await orchestrator.close_connections()


app = FastAPI(
    title="Advanced Web Scraper for LLM",
    description="Scrape, analyze, and store web content optimized for LLM consumption",
    version="1.0.0",
    lifespan=lifespan,
)


# Request models
class URLRequest(BaseModel):
    url: HttpUrl


class SearchRequest(BaseModel):
    query: str
    limit: int = 5


class BatchURLRequest(BaseModel):
    urls: List[HttpUrl]


# Response models
class ScrapingResponse(BaseModel):
    success: bool
    url: str
    title: Optional[str] = None
    summary: Optional[Dict] = None
    llm_ready_data: Optional[Dict] = None
    error: Optional[str] = None


class SearchResponse(BaseModel):
    results: List[Dict]
    total_found: int


@app.post("/scrape", response_model=ScrapingResponse)
async def scrape_url(request: URLRequest):
    """Scrape a single URL and store data optimized for LLM consumption."""
    try:
        result = await orchestrator.process_url(str(request.url))
        if "error" in result:
            raise HTTPException(status_code=400, detail=result["error"])
        return ScrapingResponse(**result)
    except HTTPException:
        # Re-raise as-is so the 400 above isn't converted into a 500
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Processing failed: {e}")


@app.post("/scrape-batch")
async def scrape_batch_urls(request: BatchURLRequest, background_tasks: BackgroundTasks):
    """Scrape multiple URLs in the background."""

    async def process_batch():
        results = []
        for url in request.urls:
            try:
                result = await orchestrator.process_url(str(url))
                results.append(result)
            except Exception as e:
                results.append({"error": str(e), "url": str(url)})
        return results

    # The task runs after the response is sent; its return value is discarded,
    # so each processed page is only available via the storage endpoints below.
    background_tasks.add_task(process_batch)
    return {
        "message": f"Started processing {len(request.urls)} URLs in background",
        "urls": [str(url) for url in request.urls],
    }


@app.get("/page/{url:path}")
async def get_page_data(url: str):
    """Get processed page data optimized for LLM consumption."""
    try:
        decoded_url = urllib.parse.unquote(url)
        page_data = orchestrator.get_page_for_llm(decoded_url)
        if not page_data:
            raise HTTPException(status_code=404, detail="Page not found")
        return page_data
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Retrieval failed: {e}")


@app.post("/search", response_model=SearchResponse)
async def search_content(request: SearchRequest):
    """Search stored content for LLM context."""
    try:
        results = orchestrator.search_for_llm(request.query, request.limit)
        return SearchResponse(results=results, total_found=len(results))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Search failed: {e}")


@app.get("/llm-ready/{url:path}")
async def get_llm_ready_content(url: str):
    """Get content specifically formatted for LLM consumption."""
    try:
        decoded_url = urllib.parse.unquote(url)
        page_data = orchestrator.get_page_for_llm(decoded_url)
        if not page_data:
            raise HTTPException(status_code=404, detail="Page not found")

        # Format for LLM
        llm_content = {
            "instruction": "Use this content for generating summaries, notes, or mind maps",
            "content": {
                "title": page_data["title"],
                "main_content": page_data["content"],
                "structure": {
                    "headings": page_data["headings"],
                    "content_type": page_data["study_metadata"]["content_type"],
                    "complexity": page_data["study_metadata"]["complexity_score"],
                    "reading_time": page_data["study_metadata"]["reading_time"],
                },
                "context": {
                    "related_pages": page_data["relationships"]["related_pages"],
                    "key_topics": page_data["study_metadata"]["key_topics"],
                },
            },
            "suggestions": {
                "study_approach": _get_study_approach(page_data["study_metadata"]),
                "focus_areas": page_data["headings"][:3],
                "difficulty_level": _assess_difficulty(page_data["study_metadata"]),
            },
        }
        return llm_content
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"LLM formatting failed: {e}")


@app.get("/health")
async def health_check():
    """Health check endpoint."""
    return {"status": "healthy", "message": "Web scraper API is running"}


@app.get("/stats")
async def get_statistics():
    """Get scraping statistics."""
    try:
        # Get basic stats from MongoDB
        total_pages = orchestrator.mongo_storage.collection.estimated_document_count()
        return {
            "total_pages_scraped": total_pages,
            "database_status": "connected",
            "features": [
                "Dynamic content scraping with Playwright",
                "DOM structure analysis",
                "MongoDB storage for content",
                "Neo4j for relationships",
                "LLM-optimized data extraction",
            ],
        }
    except Exception as e:
        return {"error": f"Stats retrieval failed: {e}"}


def _get_study_approach(metadata: Dict) -> str:
    """Suggest a study approach based on content analysis."""
    content_type = metadata.get("content_type", "general")
    complexity = metadata.get("complexity_score", 0)

    if content_type == "tutorial":
        return "hands-on practice with step-by-step approach"
    elif content_type == "documentation":
        return "reference-based learning with examples"
    elif content_type == "research":
        return "analytical reading with note-taking"
    elif complexity > 5:
        return "detailed study with concept mapping"
    else:
        return "general reading with summary creation"


def _assess_difficulty(metadata: Dict) -> str:
    """Assess content difficulty for LLM processing hints."""
    complexity = metadata.get("complexity_score", 0)
    reading_time = metadata.get("reading_time", 0)

    if complexity < 2 and reading_time < 5:
        return "beginner"
    elif complexity < 5 and reading_time < 15:
        return "intermediate"
    else:
        return "advanced"


# Run the API
if __name__ == "__main__":
    import uvicorn

    uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True)