# brightdata-ai-agent / brightdata_datasets.py
from smolagents import Tool
import json
import os
import time
import requests
from typing import Any, Dict, Optional
from dotenv import load_dotenv
# Load environment variables from .env if present
load_dotenv()
def _build_description(description_lines):
"""Join multiline descriptions defined as lists."""
return "\n".join(description_lines)
# Dataset catalogue mirrored from the MCP implementation (JS version).
# Each entry defines the dataset_id, the required inputs, optional defaults,
# and optional fixed values that are injected automatically.
DATASETS: Dict[str, Dict[str, Any]] = {
"amazon_product": {
"dataset_id": "gd_l7q7dkf244hwjntr0",
"description": _build_description(
[
"Quickly read structured amazon product data.",
"Requires a valid product URL with /dp/ in it.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"amazon_product_reviews": {
"dataset_id": "gd_le8e811kzy4ggddlq",
"description": _build_description(
[
"Quickly read structured amazon product review data.",
"Requires a valid product URL with /dp/ in it.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"amazon_product_search": {
"dataset_id": "gd_lwdb4vjm1ehb499uxs",
"description": _build_description(
[
"Quickly read structured amazon product search data.",
"Requires a valid search keyword and amazon domain URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["keyword", "url"],
"fixed_values": {"pages_to_search": "1"},
},
"walmart_product": {
"dataset_id": "gd_l95fol7l1ru6rlo116",
"description": _build_description(
[
"Quickly read structured walmart product data.",
"Requires a valid product URL with /ip/ in it.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"walmart_seller": {
"dataset_id": "gd_m7ke48w81ocyu4hhz0",
"description": _build_description(
[
"Quickly read structured walmart seller data.",
"Requires a valid walmart seller URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"ebay_product": {
"dataset_id": "gd_ltr9mjt81n0zzdk1fb",
"description": _build_description(
[
"Quickly read structured ebay product data.",
"Requires a valid ebay product URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"homedepot_products": {
"dataset_id": "gd_lmusivh019i7g97q2n",
"description": _build_description(
[
"Quickly read structured homedepot product data.",
"Requires a valid homedepot product URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"zara_products": {
"dataset_id": "gd_lct4vafw1tgx27d4o0",
"description": _build_description(
[
"Quickly read structured zara product data.",
"Requires a valid zara product URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"etsy_products": {
"dataset_id": "gd_ltppk0jdv1jqz25mz",
"description": _build_description(
[
"Quickly read structured etsy product data.",
"Requires a valid etsy product URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"bestbuy_products": {
"dataset_id": "gd_ltre1jqe1jfr7cccf",
"description": _build_description(
[
"Quickly read structured bestbuy product data.",
"Requires a valid bestbuy product URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"linkedin_person_profile": {
"dataset_id": "gd_l1viktl72bvl7bjuj0",
"description": _build_description(
[
"Quickly read structured linkedin people profile data.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"linkedin_company_profile": {
"dataset_id": "gd_l1vikfnt1wgvvqz95w",
"description": _build_description(
[
"Quickly read structured linkedin company profile data.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"linkedin_job_listings": {
"dataset_id": "gd_lpfll7v5hcqtkxl6l",
"description": _build_description(
[
"Quickly read structured linkedin job listings data.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"linkedin_posts": {
"dataset_id": "gd_lyy3tktm25m4avu764",
"description": _build_description(
[
"Quickly read structured linkedin posts data.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"linkedin_people_search": {
"dataset_id": "gd_m8d03he47z8nwb5xc",
"description": _build_description(
[
"Quickly read structured linkedin people search data.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url", "first_name", "last_name"],
},
"crunchbase_company": {
"dataset_id": "gd_l1vijqt9jfj7olije",
"description": _build_description(
[
"Quickly read structured crunchbase company data.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"zoominfo_company_profile": {
"dataset_id": "gd_m0ci4a4ivx3j5l6nx",
"description": _build_description(
[
"Quickly read structured ZoomInfo company profile data.",
"Requires a valid ZoomInfo company URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"instagram_profiles": {
"dataset_id": "gd_l1vikfch901nx3by4",
"description": _build_description(
[
"Quickly read structured Instagram profile data.",
"Requires a valid Instagram URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"instagram_posts": {
"dataset_id": "gd_lk5ns7kz21pck8jpis",
"description": _build_description(
[
"Quickly read structured Instagram post data.",
"Requires a valid Instagram URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"instagram_reels": {
"dataset_id": "gd_lyclm20il4r5helnj",
"description": _build_description(
[
"Quickly read structured Instagram reel data.",
"Requires a valid Instagram URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"instagram_comments": {
"dataset_id": "gd_ltppn085pokosxh13",
"description": _build_description(
[
"Quickly read structured Instagram comments data.",
"Requires a valid Instagram URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"facebook_posts": {
"dataset_id": "gd_lyclm1571iy3mv57zw",
"description": _build_description(
[
"Quickly read structured Facebook post data.",
"Requires a valid Facebook post URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"facebook_marketplace_listings": {
"dataset_id": "gd_lvt9iwuh6fbcwmx1a",
"description": _build_description(
[
"Quickly read structured Facebook marketplace listing data.",
"Requires a valid Facebook marketplace listing URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"facebook_company_reviews": {
"dataset_id": "gd_m0dtqpiu1mbcyc2g86",
"description": _build_description(
[
"Quickly read structured Facebook company reviews data.",
"Requires a valid Facebook company URL and number of reviews.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url", "num_of_reviews"],
},
"facebook_events": {
"dataset_id": "gd_m14sd0to1jz48ppm51",
"description": _build_description(
[
"Quickly read structured Facebook events data.",
"Requires a valid Facebook event URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"tiktok_profiles": {
"dataset_id": "gd_l1villgoiiidt09ci",
"description": _build_description(
[
"Quickly read structured Tiktok profiles data.",
"Requires a valid Tiktok profile URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"tiktok_posts": {
"dataset_id": "gd_lu702nij2f790tmv9h",
"description": _build_description(
[
"Quickly read structured Tiktok post data.",
"Requires a valid Tiktok post URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"tiktok_shop": {
"dataset_id": "gd_m45m1u911dsa4274pi",
"description": _build_description(
[
"Quickly read structured Tiktok shop data.",
"Requires a valid Tiktok shop product URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"tiktok_comments": {
"dataset_id": "gd_lkf2st302ap89utw5k",
"description": _build_description(
[
"Quickly read structured Tiktok comments data.",
"Requires a valid Tiktok video URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"google_maps_reviews": {
"dataset_id": "gd_luzfs1dn2oa0teb81",
"description": _build_description(
[
"Quickly read structured Google maps reviews data.",
"Requires a valid Google maps URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url", "days_limit"],
"defaults": {"days_limit": "3"},
},
"google_shopping": {
"dataset_id": "gd_ltppk50q18kdw67omz",
"description": _build_description(
[
"Quickly read structured Google shopping data.",
"Requires a valid Google shopping product URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"google_play_store": {
"dataset_id": "gd_lsk382l8xei8vzm4u",
"description": _build_description(
[
"Quickly read structured Google play store data.",
"Requires a valid Google play store app URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"apple_app_store": {
"dataset_id": "gd_lsk9ki3u2iishmwrui",
"description": _build_description(
[
"Quickly read structured apple app store data.",
"Requires a valid apple app store app URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"reuter_news": {
"dataset_id": "gd_lyptx9h74wtlvpnfu",
"description": _build_description(
[
"Quickly read structured reuter news data.",
"Requires a valid reuter news report URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"github_repository_file": {
"dataset_id": "gd_lyrexgxc24b3d4imjt",
"description": _build_description(
[
"Quickly read structured github repository data.",
"Requires a valid github repository file URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"yahoo_finance_business": {
"dataset_id": "gd_lmrpz3vxmz972ghd7",
"description": _build_description(
[
"Quickly read structured yahoo finance business data.",
"Requires a valid yahoo finance business URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"x_posts": {
"dataset_id": "gd_lwxkxvnf1cynvib9co",
"description": _build_description(
[
"Quickly read structured X post data.",
"Requires a valid X post URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"zillow_properties_listing": {
"dataset_id": "gd_lfqkr8wm13ixtbd8f5",
"description": _build_description(
[
"Quickly read structured zillow properties listing data.",
"Requires a valid zillow properties listing URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"booking_hotel_listings": {
"dataset_id": "gd_m5mbdl081229ln6t4a",
"description": _build_description(
[
"Quickly read structured booking hotel listings data.",
"Requires a valid booking hotel listing URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"youtube_profiles": {
"dataset_id": "gd_lk538t2k2p1k3oos71",
"description": _build_description(
[
"Quickly read structured youtube profiles data.",
"Requires a valid youtube profile URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"youtube_comments": {
"dataset_id": "gd_lk9q0ew71spt1mxywf",
"description": _build_description(
[
"Quickly read structured youtube comments data.",
"Requires a valid youtube video URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url", "num_of_comments"],
"defaults": {"num_of_comments": "10"},
},
"reddit_posts": {
"dataset_id": "gd_lvz8ah06191smkebj4",
"description": _build_description(
[
"Quickly read structured reddit posts data.",
"Requires a valid reddit post URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
"youtube_videos": {
"dataset_id": "gd_lk56epmy2i5g7lzu0k",
"description": _build_description(
[
"Quickly read structured YouTube videos data.",
"Requires a valid YouTube video URL.",
"This can be a cache lookup, so it can be more reliable than scraping.",
]
),
"inputs": ["url"],
},
}
class BrightDataDatasetTool(Tool):
name = "brightdata_dataset_fetch"
description = (
"Trigger a Bright Data dataset collection and poll until the snapshot is ready. "
"Choose a dataset key (e.g., amazon_product, linkedin_company_profile, google_maps_reviews). "
"For most datasets, you only need to provide the URL parameter. "
"For example: brightdata_dataset_fetch(dataset='linkedin_person_profile', url='https://linkedin.com/in/...')"
)
inputs = {
"dataset": {
"type": "string",
"description": f"Dataset key. Options: {', '.join(sorted(DATASETS.keys()))}",
},
"url": {
"type": "string",
"description": "URL for the dataset (required for most datasets)",
"nullable": True,
},
"keyword": {
"type": "string",
"description": "Search keyword (for search datasets like amazon_product_search)",
"nullable": True,
},
"first_name": {
"type": "string",
"description": "First name (for datasets like linkedin_people_search)",
"nullable": True,
},
"last_name": {
"type": "string",
"description": "Last name (for datasets like linkedin_people_search)",
"nullable": True,
},
"days_limit": {
"type": "string",
"description": "Days limit (for datasets like google_maps_reviews, default: 3)",
"nullable": True,
},
"num_of_reviews": {
"type": "string",
"description": "Number of reviews (for datasets like facebook_company_reviews)",
"nullable": True,
},
"num_of_comments": {
"type": "string",
"description": "Number of comments (for datasets like youtube_comments, default: 10)",
"nullable": True,
},
}
output_type = "string"
def _prepare_payload(self, dataset_key: str, params: Dict[str, Any]) -> Dict[str, Any]:
"""Validate required fields, apply defaults, and merge fixed values."""
config = DATASETS[dataset_key]
payload = {}
defaults = config.get("defaults", {})
fixed_values = config.get("fixed_values", {})
for field in config["inputs"]:
if field in params:
payload[field] = params[field]
elif field in defaults:
payload[field] = defaults[field]
else:
raise ValueError(f"Missing required field '{field}' for dataset '{dataset_key}'")
# Apply fixed values that should always be sent
payload.update(fixed_values)
return payload
def forward(
self,
dataset: str,
url: Optional[str] = None,
keyword: Optional[str] = None,
first_name: Optional[str] = None,
last_name: Optional[str] = None,
days_limit: Optional[str] = None,
num_of_reviews: Optional[str] = None,
num_of_comments: Optional[str] = None,
) -> str:
"""
Trigger a dataset run and poll until results are ready.
Args:
dataset: The dataset key from DATASETS.
url: URL for the dataset (required for most datasets).
keyword: Search keyword (for search datasets).
first_name: First name (for people search datasets).
last_name: Last name (for people search datasets).
days_limit: Days limit (for time-based datasets).
num_of_reviews: Number of reviews to fetch.
num_of_comments: Number of comments to fetch.
Returns:
JSON string of the snapshot data once ready.
"""
api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
if not api_token:
raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")
if dataset not in DATASETS:
raise ValueError(f"Unknown dataset '{dataset}'. Valid options: {', '.join(sorted(DATASETS.keys()))}")
# Build params dict from provided arguments
params = {}
if url is not None:
params["url"] = url
if keyword is not None:
params["keyword"] = keyword
if first_name is not None:
params["first_name"] = first_name
if last_name is not None:
params["last_name"] = last_name
if days_limit is not None:
params["days_limit"] = days_limit
if num_of_reviews is not None:
params["num_of_reviews"] = num_of_reviews
if num_of_comments is not None:
params["num_of_comments"] = num_of_comments
payload = self._prepare_payload(dataset, params)
dataset_id = DATASETS[dataset]["dataset_id"]
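# Trigger a new collection run. The trigger endpoint is sent a JSON array of input
# objects and responds with a snapshot_id, which is polled for results below.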
trigger_url = "https://api.brightdata.com/datasets/v3/trigger"
trigger_headers = {
"Authorization": f"Bearer {api_token}",
"Content-Type": "application/json",
}
trigger_response = requests.post(
trigger_url,
params={"dataset_id": dataset_id, "include_errors": "true"},
json=[payload],
headers=trigger_headers,
timeout=60,
)
trigger_response.raise_for_status()
snapshot_id = trigger_response.json().get("snapshot_id")
if not snapshot_id:
raise RuntimeError("No snapshot ID returned from Bright Data.")
# Poll for completion (up to 10 minutes, matching MCP logic)
snapshot_url = f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}"
max_attempts = 600
attempts = 0
while attempts < max_attempts:
try:
response = requests.get(
snapshot_url,
params={"format": "json"},
headers={"Authorization": f"Bearer {api_token}"},
timeout=30,
)
# Bright Data returns HTTP 400 for invalid requests; fail fast instead of polling forever
if response.status_code == 400:
response.raise_for_status()
data = response.json()
if isinstance(data, list):
return json.dumps(data, indent=2)
status = data.get("status") if isinstance(data, dict) else None
if status not in {"running", "building"}:
return json.dumps(data, indent=2)
attempts += 1
time.sleep(1)
except requests.exceptions.RequestException as exc:
# Mirror the JS logic: tolerate transient failures, but re-raise on HTTP 400
if getattr(getattr(exc, "response", None), "status_code", None) == 400:
raise
attempts += 1
time.sleep(1)
raise TimeoutError(f"Timeout waiting for snapshot {snapshot_id} after {max_attempts} seconds")