import requests
from bs4 import BeautifulSoup
import re
import json
import time
from typing import List, Dict

# Code by Nicolas larenas, NLarchive

# Define available sort options
SORT_OPTIONS = {
    "relevance": {"label": "Relevance (Default)", "url_param": ""},
    "trending": {"label": "Trending", "url_param": "&sort=trending"},
    "likes": {"label": "Most Likes", "url_param": "&sort=likes"},
    "created": {"label": "Recently Created", "url_param": "&sort=created"},
    "modified": {"label": "Recently Updated", "url_param": "&sort=modified"}
}
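
# Illustrative sketch (not part of the original script): how a listing URL is assembled
# from SORT_OPTIONS. scrape_hf_mcp_servers() below builds the same URLs inline; this
# helper name is ours, shown for illustration only.
def build_spaces_listing_url(page: int = 0, sort_by: str = "relevance") -> str:
    """Return the HuggingFace Spaces listing URL for a page index and sort option."""
    sort_param = SORT_OPTIONS.get(sort_by, SORT_OPTIONS["relevance"])["url_param"]
    if page == 0:
        return f"https://huggingface.co/spaces?filter=mcp-server{sort_param}"
    return f"https://huggingface.co/spaces?filter=mcp-server&p={page}{sort_param}"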

def scrape_hf_mcp_servers(max_pages: int = 3, sort_by: str = "relevance", progress_callback=None) -> List[Dict[str, str]]:
    """
    Scrape HuggingFace Spaces with the mcp-server filter to get real MCP server URLs.

    Args:
        max_pages (int): Maximum number of pages to scrape
        sort_by (str): Sort option - one of: relevance, trending, likes, created, modified
        progress_callback: Optional callback function for progress updates

    Returns:
        List[Dict[str, str]]: List of server info with URLs
    """
    servers = []
    seen_spaces = set()  # Track unique spaces to avoid duplicates
    consecutive_empty_pages = 0  # Track consecutive pages with no results

    # Validate sort option
    if sort_by not in SORT_OPTIONS:
        sort_by = "relevance"

    sort_param = SORT_OPTIONS[sort_by]["url_param"]
    sort_label = SORT_OPTIONS[sort_by]["label"]

    def log_progress(message):
        print(message)
        if progress_callback:
            progress_callback(message)

    log_progress("Starting scrape of HuggingFace MCP servers")
    log_progress(f"Sort method: {sort_label}")
    log_progress(f"Max pages: {max_pages}")

    page = -1  # Keeps the final summary log valid even if max_pages <= 0
    for page in range(max_pages):
        # Build URL with sort parameter.
        # First page (page 0) has no p parameter; subsequent pages use p=1, p=2, etc.
        if page == 0:
            url = f"https://huggingface.co/spaces?filter=mcp-server{sort_param}"
        else:
            url = f"https://huggingface.co/spaces?filter=mcp-server&p={page}{sort_param}"

        log_progress(f"Scraping page {page + 1}/{max_pages}: {url}")

        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            log_progress("  Sending request to HuggingFace...")
            response = requests.get(url, timeout=15, headers=headers)

            if response.status_code != 200:
                log_progress(f"  Failed to fetch page {page + 1}: HTTP {response.status_code}")
                consecutive_empty_pages += 1
                if consecutive_empty_pages >= 3:  # Stop after 3 consecutive failures
                    log_progress("  Stopping after 3 consecutive failed requests")
                    break
                continue

            log_progress(f"  Page loaded successfully ({len(response.content)} bytes)")
            log_progress("  Parsing HTML content...")
            soup = BeautifulSoup(response.content, 'html.parser')

            # Look for the exact pattern: <a href="/spaces/{username}/{space-name}"
            space_links = soup.find_all('a', href=re.compile(r'^/spaces/[^/\s]+/[^/\s]+/?$'))
            log_progress(f"  Found {len(space_links)} potential space links")

            page_servers = []
            for i, link in enumerate(space_links, 1):
                href = link.get('href')
                if href is None:
                    continue
                href = str(href).strip().rstrip('/')

                # Extract username and space name using the exact pattern
                match = re.match(r'^/spaces/([^/\s]+)/([^/\s]+)$', href)
                if match:
                    username = match.group(1).strip()
                    space_name = match.group(2).strip()

                    # Create unique identifier to avoid duplicates
                    space_id = f"{username}/{space_name}"
                    if space_id in seen_spaces:
                        continue
                    seen_spaces.add(space_id)

                    # Build all URL formats for the space
                    hf_spaces_url = f"https://huggingface.co/spaces/{username}/{space_name}"
                    hf_space_domain = f"https://{username}-{space_name}.hf.space"
                    mcp_endpoint = f"{hf_space_domain}/gradio_api/mcp/sse"

                    server_info = {
                        "username": username,
                        "space_name": space_name,
                        "space_id": space_id,
                        "hf_spaces_url": hf_spaces_url,
                        "hf_space_domain": hf_space_domain,
                        "mcp_endpoint": mcp_endpoint,
                        "discovered_on_page": str(page + 1),
                        "sort_method": sort_by
                    }
                    page_servers.append(server_info)
                    log_progress(f"    Found: {space_id}")

            log_progress(f"  Extracted {len(page_servers)} new unique servers from page {page + 1}")
            servers.extend(page_servers)

            # Check if we found any servers on this page
            if len(page_servers) == 0:
                consecutive_empty_pages += 1
                log_progress(f"  No new servers found on page {page + 1} (consecutive empty: {consecutive_empty_pages})")
                # Stop if we get 3 consecutive empty pages
                if consecutive_empty_pages >= 3:
                    log_progress(f"  Stopping after {consecutive_empty_pages} consecutive empty pages")
                    break
            else:
                # Reset counter if we found servers
                consecutive_empty_pages = 0

            # A page with very few space links suggests we are near the end of the results.
            # We still continue until the consecutive-empty limit is hit; the complex
            # pagination detection that caused early termination was removed.
            if len(space_links) < 5:
                log_progress(f"  Page {page + 1} has only {len(space_links)} space links, might be near end")

            # Respectful delay between requests
            if page < max_pages - 1:  # Don't delay after the last page
                log_progress("  Waiting 2 seconds before next request...")
                time.sleep(2)

        except requests.RequestException as e:
            log_progress(f"  Network error on page {page + 1}: {str(e)}")
            consecutive_empty_pages += 1
            if consecutive_empty_pages >= 3:
                log_progress("  Stopping after 3 consecutive network errors")
                break
            continue
        except Exception as e:
            log_progress(f"  Error scraping page {page + 1}: {str(e)}")
            consecutive_empty_pages += 1
            if consecutive_empty_pages >= 3:
                log_progress("  Stopping after 3 consecutive errors")
                break
            continue

    log_progress(f"Scraping complete! Found {len(servers)} unique MCP servers total")
    log_progress(f"Using sort method: {sort_label}")
    log_progress(f"Scraped {min(page + 1, max_pages)} pages (stopped at page {page + 1})")
    return servers
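
# Usage sketch (illustrative, not part of the original script): calling the scraper with a
# progress callback that collects log lines in memory instead of only printing them.
# The helper name is hypothetical.
def example_scrape_with_callback() -> List[Dict[str, str]]:
    progress_log: List[str] = []
    servers = scrape_hf_mcp_servers(max_pages=1, sort_by="trending",
                                    progress_callback=progress_log.append)
    print(f"Collected {len(progress_log)} progress messages and {len(servers)} servers")
    return servers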

def generate_test_urls_for_health_monitor(servers: List[Dict[str, str]]) -> str:
    """
    Generate URL list for the health monitor from scraped servers.

    Args:
        servers: List of server info from scrape_hf_mcp_servers

    Returns:
        str: Newline-separated URLs ready for the health monitor
    """
    urls = [server["hf_spaces_url"] for server in servers]
    return "\n".join(urls)

def save_scraped_servers(servers: List[Dict[str, str]], filename: str = "mcp_servers.json"):
    """Save scraped server information to a JSON file."""
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump({
                "scraped_at": time.strftime("%Y-%m-%d %H:%M:%S"),
                "total_servers": len(servers),
                "servers": servers
            }, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(servers)} servers to {filename}")
    except Exception as e:
        print(f"Error saving to {filename}: {str(e)}")

def scrape_and_format_hf_mcp_servers(max_pages: int = 3, sort_by: str = "relevance") -> tuple:
    """
    Combined function that scrapes HF MCP servers and returns data for the monitor.
    This function can be called directly from the MCP server monitor.

    Args:
        max_pages (int): Maximum number of pages to scrape
        sort_by (str): Sort option - one of: relevance, trending, likes, created, modified

    Returns:
        tuple: (markdown_summary, json_data) compatible with monitor interface
    """
    print("Starting HuggingFace MCP servers discovery...")

    # Validate sort option here too, so the SORT_OPTIONS label lookups below cannot fail
    if sort_by not in SORT_OPTIONS:
        sort_by = "relevance"

    # Scrape servers with progress feedback
    servers = scrape_hf_mcp_servers(max_pages, sort_by)

    if not servers:
        result = {
            "status": "no_servers_found",
            "error": "No MCP servers found on HuggingFace",
            "servers": [],
            "total_servers": 0,
            "sort_method": sort_by
        }
        md = f"# No MCP Servers Found\n\nNo servers were discovered on HuggingFace with the mcp-server filter using {SORT_OPTIONS[sort_by]['label']} sorting."
        return md, json.dumps(result, indent=2)

    # All scraped servers are used directly; there is no validation step.
    # Generate URLs for testing
    test_urls = generate_test_urls_for_health_monitor(servers)

    result = {
        "status": "success",
        "total_servers": len(servers),
        "scraped_pages": max_pages,
        "sort_method": sort_by,
        "sort_label": SORT_OPTIONS[sort_by]["label"],
        "servers": servers,
        "test_urls": test_urls.split('\n') if test_urls else [],
        "scraped_at": time.strftime("%Y-%m-%d %H:%M:%S")
    }

    # Generate markdown summary with complete, clickable URLs
    md = "# HuggingFace MCP Servers Discovery\n\n"
    md += "**Status:** Success\n"
    md += f"**Sort Method:** {result['sort_label']}\n"
    md += f"**Total Servers Found:** {len(servers)}\n"
    md += f"**Pages Scraped:** {max_pages}\n"
    md += f"**Discovery Time:** {result['scraped_at']}\n\n"
    md += "## Discovered Servers\n\n"

    for i, server in enumerate(servers, 1):  # Show ALL servers
        md += f"**{i}.** **HF Spaces URL:** [{server['hf_spaces_url']}]({server['hf_spaces_url']})\n\n"
        md += f"**Live Space URL:** [{server['hf_space_domain']}]({server['hf_space_domain']})\n\n"
        md += f"**MCP Endpoint:** [{server['mcp_endpoint']}]({server['mcp_endpoint']})\n\n"
        md += f"**Discovered on Page:** {server['discovered_on_page']}\n\n"
        md += "---\n\n"

    md += "## URLs for Multi-Server Monitor\n\n"
    md += "Copy these URLs to test all discovered servers:\n\n"
    md += "```\n"
    md += test_urls
    md += "\n```\n\n"
    md += "## Next Steps\n\n"
    md += "1. Copy the URLs above\n"
    md += "2. Go to the **Multi-Server Monitor** tab\n"
    md += "3. Paste the URLs and run health checks\n"
    md += "4. Discover which servers are actively running MCP services\n\n"
    md += "## Statistics\n\n"
    md += f"- **Total Unique Servers:** {len(servers)}\n"
    md += f"- **Sort Method Used:** {result['sort_label']}\n"

    # Guard against ZeroDivisionError when max_pages is 0
    avg_servers_per_page = round(len(servers) / max_pages, 1) if max_pages > 0 else 0
    md += f"- **Pages Successfully Scraped:** {max_pages}\n"
    md += f"- **Average Servers per Page:** {avg_servers_per_page}\n"

    return md, json.dumps(result, indent=2)
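
# Usage sketch (illustrative, not part of the original script): consuming the
# (markdown_summary, json_data) tuple to pull out just the MCP endpoints.
# The helper name is hypothetical.
def example_list_mcp_endpoints() -> List[str]:
    _md_summary, json_data = scrape_and_format_hf_mcp_servers(max_pages=1)
    result = json.loads(json_data)
    return [server["mcp_endpoint"] for server in result.get("servers", [])]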

def test_scraper_locally():
    """Test function to validate the scraper locally."""
    print("Testing HuggingFace MCP Server Scraper")
    print("=" * 50)

    # Test with more pages to verify the pagination fix
    servers = scrape_hf_mcp_servers(max_pages=5, sort_by="likes")

    if servers:
        print(f"\nSuccessfully found {len(servers)} servers!")
        print("\nFirst few servers found:")
        for i, server in enumerate(servers[:3], 1):
            print(f"{i}. {server['space_id']} (Page {server['discovered_on_page']})")
            print(f"   HF URL: {server['hf_spaces_url']}")
            print(f"   MCP Endpoint: {server['mcp_endpoint']}")

        # Show page distribution
        page_distribution = {}
        for server in servers:
            page = server['discovered_on_page']
            page_distribution[page] = page_distribution.get(page, 0) + 1

        print("\nPage distribution:")
        for page in sorted(page_distribution.keys(), key=int):
            print(f"  Page {page}: {page_distribution[page]} servers")

        # There is no validation step; all discovered servers are included
        print(f"\nAll {len(servers)} discovered servers are included (no validation step).")

        # Generate test URLs
        test_urls = generate_test_urls_for_health_monitor(servers)
        print(f"\nGenerated {len(test_urls.split())} URLs for testing")
    else:
        print("No servers found. Check the scraping logic.")

# Code by Nicolas larenas, NLarchive

def main():
    """Main scraping function."""
    # Run scraping with more pages to test the pagination fix
    md_summary, json_data = scrape_and_format_hf_mcp_servers(max_pages=10, sort_by="likes")

    # Save results
    result = json.loads(json_data)
    if result["status"] == "success":
        # result["servers"] contains all scraped servers (no validation step)
        save_scraped_servers(result["servers"])

        # Save URLs to file for easy copying
        with open("mcp_test_urls.txt", 'w', encoding='utf-8') as f:
            f.write("\n".join(result["test_urls"]))
        print("URLs also saved to 'mcp_test_urls.txt'")

    print("\n" + "=" * 50)
    print("SCRAPING COMPLETE!")
    print("=" * 50)

    # Print summary without markdown formatting
    summary_text = md_summary.replace("#", "").replace("*", "").replace("[", "").replace("]", "").replace("(", " (")
    print(summary_text)

if __name__ == "__main__":
    # Test the scraper locally first; comment this out and uncomment main() for the full scrape
    test_scraper_locally()
    # Run the full scraper
    # main()