Spaces:
Sleeping
Sleeping
| from smolagents import Tool | |
| import requests | |
| import os | |
| from dotenv import load_dotenv | |
| # Load environment variables from .env if present | |
| load_dotenv() | |
class BrightDataScraperTool(Tool):
    """Tool that fetches a webpage through Bright Data's request API.

    The page is requested via ``https://api.brightdata.com/request`` with
    ``data_format`` set to ``"markdown"`` and the response body is returned
    as text.

    Configuration is read from environment variables on every call:

    * ``BRIGHT_DATA_API_TOKEN`` -- required bearer token.
    * ``BRIGHT_DATA_UNLOCKER_ZONE`` -- zone name, defaults to
      ``"web_unlocker_1"``.
    """

    name = "brightdata_web_scraper"
    description = """
    Scrape any webpage and return content in Markdown format.
    This tool can bypass bot detection and CAPTCHAs.
    Use this when you need to extract content from websites.
    """
    inputs = {
        "url": {
            "type": "string",
            "description": "The URL of the webpage to scrape",
        }
    }
    output_type = "string"

    def forward(self, url: str) -> str:
        """Scrape a webpage using Bright Data's API.

        Args:
            url: The URL to scrape.

        Returns:
            The response body (requested in Markdown format) on success, or
            a string starting with ``"Error scraping URL: "`` when the HTTP
            request fails (connection error, timeout, or non-2xx status).

        Raises:
            ValueError: If ``BRIGHT_DATA_API_TOKEN`` is not set.
        """
        api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
        unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker_1")
        if not api_token:
            raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")

        api_url = "https://api.brightdata.com/request"
        headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json",
        }
        payload = {
            "url": url,
            "zone": unlocker_zone,
            "format": "raw",
            "data_format": "markdown",
        }

        # requests never times out by default, so a stalled connection would
        # block the caller forever. Use a (connect, read) timeout in seconds;
        # the read value is generous because fetching a page through the
        # API can be slow. Kept local so the method stays self-contained.
        timeout = (10, 120)

        try:
            response = requests.post(
                api_url, json=payload, headers=headers, timeout=timeout
            )
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Timeouts are RequestException subclasses, so they are reported
            # through the same error-string channel as other HTTP failures.
            return f"Error scraping URL: {e}"
        return response.text