import urllib.parse
from contextlib import asynccontextmanager
from typing import Dict, List, Optional

from fastapi import BackgroundTasks, FastAPI, HTTPException
from pydantic import BaseModel, HttpUrl

from main import WebScrapingOrchestrator


# Global orchestrator instance (created before the lifespan handler so it is
# available when shutdown runs)
orchestrator = WebScrapingOrchestrator()


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup code (if any) goes here
    yield
    # Shutdown: release scraper resources (DB connections, browser, etc.)
    await orchestrator.close_connections()


app = FastAPI(
    title="Advanced Web Scraper for LLM",
    description="Scrape, analyze, and store web content optimized for LLM consumption",
    version="1.0.0",
    lifespan=lifespan,
)


# Request models
class URLRequest(BaseModel):
    url: HttpUrl


class SearchRequest(BaseModel):
    query: str
    limit: int = 5


class BatchURLRequest(BaseModel):
    urls: List[HttpUrl]


# Response models
class ScrapingResponse(BaseModel):
    success: bool
    url: str
    title: Optional[str] = None
    summary: Optional[Dict] = None
    llm_ready_data: Optional[Dict] = None
    error: Optional[str] = None


class SearchResponse(BaseModel):
    results: List[Dict]
    total_found: int
@app.post("/scrape", response_model=ScrapingResponse)
async def scrape_url(request: URLRequest):
"""Scrape a single URL and store data optimized for LLM consumption"""
try:
result = await orchestrator.process_url(str(request.url))
if "error" in result:
raise HTTPException(status_code=400, detail=result["error"])
return ScrapingResponse(**result)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
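
# Example request (a sketch, not part of the API; assumes the server runs on
# localhost:8000 -- adjust host/port to your deployment):
#
#   import httpx
#   resp = httpx.post("http://localhost:8000/scrape",
#                     json={"url": "https://example.com"})
#   print(resp.json())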
@app.post("/scrape-batch")
async def scrape_batch_urls(request: BatchURLRequest, background_tasks: BackgroundTasks):
"""Scrape multiple URLs in the background"""
async def process_batch():
results = []
for url in request.urls:
try:
result = await orchestrator.process_url(str(url))
results.append(result)
except Exception as e:
results.append({"error": str(e), "url": str(url)})
return results
# Add to background tasks
background_tasks.add_task(process_batch)
return {
"message": f"Started processing {len(request.urls)} URLs in background",
"urls": [str(url) for url in request.urls]
}
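
# Example batch request (a sketch; the URL list is illustrative). The endpoint
# returns immediately while scraping continues in the background:
#
#   import httpx
#   resp = httpx.post("http://localhost:8000/scrape-batch",
#                     json={"urls": ["https://example.com", "https://example.org"]})
#   print(resp.json()["message"])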
@app.get("/page/{url:path}")
async def get_page_data(url: str):
"""Get processed page data optimized for LLM consumption"""
try:
# Decode URL
import urllib.parse
decoded_url = urllib.parse.unquote(url)
page_data = orchestrator.get_page_for_llm(decoded_url)
if not page_data:
raise HTTPException(status_code=404, detail="Page not found")
return page_data
except Exception as e:
raise HTTPException(status_code=500, detail=f"Retrieval failed: {str(e)}")
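
# Example lookup (a sketch): percent-encode the target URL so it survives as a
# single path segment -- the unquote() above reverses this:
#
#   import urllib.parse, httpx
#   encoded = urllib.parse.quote("https://example.com/article", safe="")
#   resp = httpx.get(f"http://localhost:8000/page/{encoded}")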
@app.post("/search", response_model=SearchResponse)
async def search_content(request: SearchRequest):
"""Search stored content for LLM context"""
try:
results = orchestrator.search_for_llm(request.query, request.limit)
return SearchResponse(
results=results,
total_found=len(results)
)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}")
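
# Example search (a sketch; "limit" defaults to 5 when omitted):
#
#   import httpx
#   resp = httpx.post("http://localhost:8000/search",
#                     json={"query": "web scraping", "limit": 3})
#   print(resp.json()["total_found"])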
@app.get("/llm-ready/{url:path}")
async def get_llm_ready_content(url: str):
"""Get content specifically formatted for LLM consumption"""
try:
import urllib.parse
decoded_url = urllib.parse.unquote(url)
page_data = orchestrator.get_page_for_llm(decoded_url)
if not page_data:
raise HTTPException(status_code=404, detail="Page not found")
# Format for LLM
llm_content = {
"instruction": "Use this content for generating summaries, notes, or mind maps",
"content": {
"title": page_data["title"],
"main_content": page_data["content"],
"structure": {
"headings": page_data["headings"],
"content_type": page_data["study_metadata"]["content_type"],
"complexity": page_data["study_metadata"]["complexity_score"],
"reading_time": page_data["study_metadata"]["reading_time"]
},
"context": {
"related_pages": page_data["relationships"]["related_pages"],
"key_topics": page_data["study_metadata"]["key_topics"]
}
},
"suggestions": {
"study_approach": _get_study_approach(page_data["study_metadata"]),
"focus_areas": page_data["headings"][:3],
"difficulty_level": _assess_difficulty(page_data["study_metadata"])
}
}
return llm_content
except Exception as e:
raise HTTPException(status_code=500, detail=f"LLM formatting failed: {str(e)}")
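
# Example (a sketch; the same percent-encoding rule as /page applies):
#
#   import urllib.parse, httpx
#   encoded = urllib.parse.quote("https://example.com/tutorial", safe="")
#   data = httpx.get(f"http://localhost:8000/llm-ready/{encoded}").json()
#   print(data["suggestions"]["difficulty_level"])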
@app.get("/health")
async def health_check():
"""Health check endpoint"""
return {"status": "healthy", "message": "Web scraper API is running"}
@app.get("/stats")
async def get_statistics():
"""Get scraping statistics"""
try:
# Get basic stats from MongoDB
mongo_stats = orchestrator.mongo_storage.collection.estimated_document_count()
return {
"total_pages_scraped": mongo_stats,
"database_status": "connected",
"features": [
"Dynamic content scraping with Playwright",
"DOM structure analysis",
"MongoDB storage for content",
"Neo4j for relationships",
"LLM-optimized data extraction"
]
}
except Exception as e:
return {"error": f"Stats retrieval failed: {str(e)}"}


def _get_study_approach(metadata: Dict) -> str:
    """Suggest a study approach based on content analysis."""
    content_type = metadata.get("content_type", "general")
    complexity = metadata.get("complexity_score", 0)
    if content_type == "tutorial":
        return "hands-on practice with step-by-step approach"
    elif content_type == "documentation":
        return "reference-based learning with examples"
    elif content_type == "research":
        return "analytical reading with note-taking"
    elif complexity > 5:
        return "detailed study with concept mapping"
    else:
        return "general reading with summary creation"


def _assess_difficulty(metadata: Dict) -> str:
    """Assess content difficulty for LLM processing hints."""
    complexity = metadata.get("complexity_score", 0)
    reading_time = metadata.get("reading_time", 0)
    if complexity < 2 and reading_time < 5:
        return "beginner"
    elif complexity < 5 and reading_time < 15:
        return "intermediate"
    else:
        return "advanced"


# Run the API
if __name__ == "__main__":
    import uvicorn

    # The import string must match this module's filename; this file is app.py,
    # so "app:app" (the previous "api:app" would fail under reload)
    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)
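
# Alternatively, start the server from a shell (assuming this file is saved
# as app.py):
#
#   uvicorn app:app --host 0.0.0.0 --port 8000 --reload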