Spaces:

BrightData
/

brightdata-ai-agent

Sleeping

brightdata-ai-agent / brightdata_scraper.py

meirk-brd

Merge remote README with local changes

0397cdb 6 days ago

1.7 kB

	from smolagents import Tool
	import requests
	import os
	from dotenv import load_dotenv

	# Load environment variables from .env if present, so that the
	# os.getenv() lookups in this module (BRIGHT_DATA_API_TOKEN and
	# BRIGHT_DATA_UNLOCKER_ZONE) can be supplied from that file.
	load_dotenv()


	class BrightDataScraperTool(Tool):
	    """Tool that fetches a webpage as Markdown through Bright Data's API.

	    Configuration is read from the environment on every call:

	    * ``BRIGHT_DATA_API_TOKEN`` -- required; sent as a Bearer token.
	    * ``BRIGHT_DATA_UNLOCKER_ZONE`` -- optional; defaults to
	      ``"web_unlocker_1"``.
	    """

	    name = "brightdata_web_scraper"
	    description = """
	Scrape any webpage and return content in Markdown format.
	This tool can bypass bot detection and CAPTCHAs.
	Use this when you need to extract content from websites.
	"""
	    inputs = {
	        "url": {
	            "type": "string",
	            "description": "The URL of the webpage to scrape",
	        }
	    }
	    output_type = "string"

	    # Upper bound, in seconds, on how long a single scrape may block.
	    # requests applies no timeout by default, so without this a stalled
	    # connection would hang the caller indefinitely. Subclasses may
	    # override the value.
	    request_timeout = 120

	    def forward(self, url: str) -> str:
	        """Scrape a webpage using Bright Data's API.

	        Args:
	            url: The URL to scrape.

	        Returns:
	            The response body as text (Markdown is requested through the
	            ``data_format`` field of the payload). If the HTTP request
	            fails, returns a non-2xx status, or exceeds
	            ``request_timeout``, a string beginning with
	            ``"Error scraping URL: "`` is returned instead of raising.

	        Raises:
	            ValueError: If ``BRIGHT_DATA_API_TOKEN`` is not set.
	        """
	        api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
	        unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker_1")

	        if not api_token:
	            raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")

	        api_url = "https://api.brightdata.com/request"
	        headers = {
	            "Authorization": f"Bearer {api_token}",
	            "Content-Type": "application/json",
	        }

	        payload = {
	            "url": url,
	            "zone": unlocker_zone,
	            "format": "raw",
	            "data_format": "markdown",
	        }

	        try:
	            response = requests.post(
	                api_url,
	                json=payload,
	                headers=headers,
	                # Timeouts raise requests.exceptions.Timeout, a subclass of
	                # RequestException, so they are reported via the handler below.
	                timeout=self.request_timeout,
	            )
	            response.raise_for_status()
	            return response.text
	        except requests.exceptions.RequestException as e:
	            # Errors are returned as text rather than raised, so the caller
	            # always receives a string result.
	            return f"Error scraping URL: {str(e)}"