from fastapi import FastAPI, HTTPException, BackgroundTasks
from pydantic import BaseModel, HttpUrl
from typing import List, Dict, Optional
from contextlib import asynccontextmanager
import urllib.parse

from main import WebScrapingOrchestrator
@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup code (if any) goes here
    yield
    # Shutdown code goes here: release DB connections held by the orchestrator
    await orchestrator.close_connections()
app = FastAPI(
    title="Advanced Web Scraper for LLM",
    description="Scrape, analyze, and store web content optimized for LLM consumption",
    version="1.0.0",
    lifespan=lifespan,
)
# Global orchestrator instance
orchestrator = WebScrapingOrchestrator()
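# NOTE: lifespan() above references this orchestrator, but only in the
# shutdown code after yield; by then the module-level name is bound, so
# the forward reference is safe.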
# Pydantic models
class URLRequest(BaseModel):
    url: HttpUrl

class SearchRequest(BaseModel):
    query: str
    limit: int = 5

class BatchURLRequest(BaseModel):
    urls: List[HttpUrl]
# Response models
class ScrapingResponse(BaseModel):
    success: bool
    url: str
    title: Optional[str] = None
    summary: Optional[Dict] = None
    llm_ready_data: Optional[Dict] = None
    error: Optional[str] = None

class SearchResponse(BaseModel):
    results: List[Dict]
    total_found: int
# NOTE: the route decorators below are reconstructed; the original listing
# omits them, so the paths are assumptions.
@app.post("/scrape", response_model=ScrapingResponse)
async def scrape_url(request: URLRequest):
    """Scrape a single URL and store data optimized for LLM consumption"""
    try:
        result = await orchestrator.process_url(str(request.url))
        if "error" in result:
            raise HTTPException(status_code=400, detail=result["error"])
        return ScrapingResponse(**result)
    except HTTPException:
        # Re-raise as-is so the 400 above is not converted into a 500
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
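# Example request (sketch; assumes the "/scrape" path above):
#   curl -X POST http://localhost:8000/scrape \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://example.com"}'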
@app.post("/scrape/batch")
async def scrape_batch_urls(request: BatchURLRequest, background_tasks: BackgroundTasks):
    """Scrape multiple URLs in the background"""
    async def process_batch():
        results = []
        for url in request.urls:
            try:
                result = await orchestrator.process_url(str(url))
                results.append(result)
            except Exception as e:
                results.append({"error": str(e), "url": str(url)})
        return results

    # Runs after the response is sent; BackgroundTasks awaits async callables.
    # Its return value is discarded, so the results are only observable
    # through the orchestrator's storage.
    background_tasks.add_task(process_batch)
    return {
        "message": f"Started processing {len(request.urls)} URLs in background",
        "urls": [str(url) for url in request.urls]
    }
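# Example request (sketch; assumes the "/scrape/batch" path above):
#   curl -X POST http://localhost:8000/scrape/batch \
#        -H "Content-Type: application/json" \
#        -d '{"urls": ["https://example.com", "https://example.org"]}'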
@app.get("/page/{url:path}")
async def get_page_data(url: str):
    """Get processed page data optimized for LLM consumption"""
    try:
        # Decode the percent-encoded URL path parameter
        decoded_url = urllib.parse.unquote(url)
        page_data = orchestrator.get_page_for_llm(decoded_url)
        if not page_data:
            raise HTTPException(status_code=404, detail="Page not found")
        return page_data
    except HTTPException:
        # Preserve the 404 instead of collapsing it into a 500
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Retrieval failed: {str(e)}")
@app.post("/search", response_model=SearchResponse)
async def search_content(request: SearchRequest):
    """Search stored content for LLM context"""
    try:
        results = orchestrator.search_for_llm(request.query, request.limit)
        return SearchResponse(
            results=results,
            total_found=len(results)
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}")
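# Example request (sketch; assumes the "/search" path above):
#   curl -X POST http://localhost:8000/search \
#        -H "Content-Type: application/json" \
#        -d '{"query": "web scraping", "limit": 3}'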
@app.get("/llm-content/{url:path}")
async def get_llm_ready_content(url: str):
    """Get content specifically formatted for LLM consumption"""
    try:
        decoded_url = urllib.parse.unquote(url)
        page_data = orchestrator.get_page_for_llm(decoded_url)
        if not page_data:
            raise HTTPException(status_code=404, detail="Page not found")
        # Format for LLM
        llm_content = {
            "instruction": "Use this content for generating summaries, notes, or mind maps",
            "content": {
                "title": page_data["title"],
                "main_content": page_data["content"],
                "structure": {
                    "headings": page_data["headings"],
                    "content_type": page_data["study_metadata"]["content_type"],
                    "complexity": page_data["study_metadata"]["complexity_score"],
                    "reading_time": page_data["study_metadata"]["reading_time"]
                },
                "context": {
                    "related_pages": page_data["relationships"]["related_pages"],
                    "key_topics": page_data["study_metadata"]["key_topics"]
                }
            },
            "suggestions": {
                "study_approach": _get_study_approach(page_data["study_metadata"]),
                "focus_areas": page_data["headings"][:3],
                "difficulty_level": _assess_difficulty(page_data["study_metadata"])
            }
        }
        return llm_content
    except HTTPException:
        # Preserve the 404 instead of collapsing it into a 500
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"LLM formatting failed: {str(e)}")
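# Example request (sketch; assumes the "/llm-content/{url:path}" path above,
# with the target URL percent-encoded into a single path parameter):
#   curl http://localhost:8000/llm-content/https%3A%2F%2Fexample.com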
@app.get("/health")
async def health_check():
    """Health check endpoint"""
    return {"status": "healthy", "message": "Web scraper API is running"}
@app.get("/stats")
async def get_statistics():
    """Get scraping statistics"""
    try:
        # Estimated count avoids a full collection scan in MongoDB
        mongo_stats = orchestrator.mongo_storage.collection.estimated_document_count()
        return {
            "total_pages_scraped": mongo_stats,
            "database_status": "connected",
            "features": [
                "Dynamic content scraping with Playwright",
                "DOM structure analysis",
                "MongoDB storage for content",
                "Neo4j for relationships",
                "LLM-optimized data extraction"
            ]
        }
    except Exception as e:
        return {"error": f"Stats retrieval failed: {str(e)}"}
def _get_study_approach(metadata: Dict) -> str:
    """Suggest study approach based on content analysis"""
    content_type = metadata.get("content_type", "general")
    complexity = metadata.get("complexity_score", 0)
    if content_type == "tutorial":
        return "hands-on practice with step-by-step approach"
    elif content_type == "documentation":
        return "reference-based learning with examples"
    elif content_type == "research":
        return "analytical reading with note-taking"
    elif complexity > 5:
        return "detailed study with concept mapping"
    else:
        return "general reading with summary creation"
def _assess_difficulty(metadata: Dict) -> str:
    """Assess content difficulty for LLM processing hints"""
    complexity = metadata.get("complexity_score", 0)
    reading_time = metadata.get("reading_time", 0)
    if complexity < 2 and reading_time < 5:
        return "beginner"
    elif complexity < 5 and reading_time < 15:
        return "intermediate"
    else:
        return "advanced"
# Run the API
if __name__ == "__main__":
    import uvicorn
    uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True)
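A minimal client sketch for exercising the API above. The route paths match the reconstructed decorators, which are assumptions since the original listing omits them; adjust the paths if your registrations differ.

import urllib.parse
import requests

BASE = "http://localhost:8000"
target = "https://example.com"

# Scrape one page (POST /scrape) and print the structured response
resp = requests.post(f"{BASE}/scrape", json={"url": target})
resp.raise_for_status()
print(resp.json())

# Fetch the LLM-ready payload back (GET /llm-content/{url:path});
# the URL is percent-encoded so it travels as a single path parameter,
# matching the unquote() call in the handler
encoded = urllib.parse.quote(target, safe="")
print(requests.get(f"{BASE}/llm-content/{encoded}").json())

# Full-text search over stored content (POST /search)
print(requests.post(f"{BASE}/search", json={"query": "web scraping", "limit": 3}).json())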