import requests
from bs4 import BeautifulSoup
import re
import json
import time
from typing import List, Dict

# Code by Nicolas larenas, NLarchive

# Define available sort options
SORT_OPTIONS = {
    "relevance": {"label": "Relevance (Default)", "url_param": ""},
    "trending": {"label": "Trending", "url_param": "&sort=trending"},
    "likes": {"label": "Most Likes", "url_param": "&sort=likes"},
    "created": {"label": "Recently Created", "url_param": "&sort=created"},
    "modified": {"label": "Recently Updated", "url_param": "&sort=modified"}
}
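
# Illustrative sketch (not part of the original script): how a listing URL is assembled
# from SORT_OPTIONS. scrape_hf_mcp_servers() below builds the same URLs inline; this
# helper name is ours, shown for illustration only.
def build_spaces_listing_url(page: int = 0, sort_by: str = "relevance") -> str:
    """Return the HuggingFace Spaces listing URL for a page index and sort option."""
    sort_param = SORT_OPTIONS.get(sort_by, SORT_OPTIONS["relevance"])["url_param"]
    if page == 0:
        return f"https://huggingface.co/spaces?filter=mcp-server{sort_param}"
    return f"https://huggingface.co/spaces?filter=mcp-server&p={page}{sort_param}"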

def scrape_hf_mcp_servers(max_pages: int = 3, sort_by: str = "relevance", progress_callback=None) -> List[Dict[str, str]]:
    """
    Scrape HuggingFace Spaces with the mcp-server filter to get real MCP server URLs.

    Args:
        max_pages (int): Maximum number of pages to scrape
        sort_by (str): Sort option - one of: relevance, trending, likes, created, modified
        progress_callback: Optional callback function for progress updates

    Returns:
        List[Dict[str, str]]: List of server info with URLs
    """
    servers = []
    seen_spaces = set()  # Track unique spaces to avoid duplicates
    consecutive_empty_pages = 0  # Track consecutive pages with no results

    # Validate sort option
    if sort_by not in SORT_OPTIONS:
        sort_by = "relevance"

    sort_param = SORT_OPTIONS[sort_by]["url_param"]
    sort_label = SORT_OPTIONS[sort_by]["label"]

    def log_progress(message):
        print(message)
        if progress_callback:
            progress_callback(message)

    log_progress("Starting scrape of HuggingFace MCP servers")
    log_progress(f"Sort method: {sort_label}")
    log_progress(f"Max pages: {max_pages}")

    page = -1  # Keeps the final summary log valid even if max_pages <= 0
    for page in range(max_pages):
        # Build URL with sort parameter.
        # First page (page 0) has no p parameter; subsequent pages use p=1, p=2, etc.
        if page == 0:
            url = f"https://huggingface.co/spaces?filter=mcp-server{sort_param}"
        else:
            url = f"https://huggingface.co/spaces?filter=mcp-server&p={page}{sort_param}"

        log_progress(f"Scraping page {page + 1}/{max_pages}: {url}")

        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            log_progress("  Sending request to HuggingFace...")
            response = requests.get(url, timeout=15, headers=headers)

            if response.status_code != 200:
                log_progress(f"  Failed to fetch page {page + 1}: HTTP {response.status_code}")
                consecutive_empty_pages += 1
                if consecutive_empty_pages >= 3:  # Stop after 3 consecutive failures
                    log_progress("  Stopping after 3 consecutive failed requests")
                    break
                continue

            log_progress(f"  Page loaded successfully ({len(response.content)} bytes)")
            log_progress("  Parsing HTML content...")
            soup = BeautifulSoup(response.content, 'html.parser')

            # Look for the exact pattern: <a href="/spaces/{username}/{space-name}"
            space_links = soup.find_all('a', href=re.compile(r'^/spaces/[^/\s]+/[^/\s]+/?$'))
            log_progress(f"  Found {len(space_links)} potential space links")

            page_servers = []
            for i, link in enumerate(space_links, 1):
                href = link.get('href')
                if href is None:
                    continue
                href = str(href).strip().rstrip('/')

                # Extract username and space name using the exact pattern
                match = re.match(r'^/spaces/([^/\s]+)/([^/\s]+)$', href)
                if match:
                    username = match.group(1).strip()
                    space_name = match.group(2).strip()

                    # Create unique identifier to avoid duplicates
                    space_id = f"{username}/{space_name}"
                    if space_id in seen_spaces:
                        continue
                    seen_spaces.add(space_id)

                    # Build all URL formats for the space
                    hf_spaces_url = f"https://huggingface.co/spaces/{username}/{space_name}"
                    hf_space_domain = f"https://{username}-{space_name}.hf.space"
                    mcp_endpoint = f"{hf_space_domain}/gradio_api/mcp/sse"

                    server_info = {
                        "username": username,
                        "space_name": space_name,
                        "space_id": space_id,
                        "hf_spaces_url": hf_spaces_url,
                        "hf_space_domain": hf_space_domain,
                        "mcp_endpoint": mcp_endpoint,
                        "discovered_on_page": str(page + 1),
                        "sort_method": sort_by
                    }
                    page_servers.append(server_info)
                    log_progress(f"    Found: {space_id}")

            log_progress(f"  Extracted {len(page_servers)} new unique servers from page {page + 1}")
            servers.extend(page_servers)

            # Check if we found any servers on this page
            if len(page_servers) == 0:
                consecutive_empty_pages += 1
                log_progress(f"  No new servers found on page {page + 1} (consecutive empty: {consecutive_empty_pages})")
                # Stop if we get 3 consecutive empty pages
                if consecutive_empty_pages >= 3:
                    log_progress(f"  Stopping after {consecutive_empty_pages} consecutive empty pages")
                    break
            else:
                # Reset counter if we found servers
                consecutive_empty_pages = 0

            # A page with very few space links suggests we are near the end of the results.
            # We still continue until the consecutive-empty limit is hit; the complex
            # pagination detection that caused early termination was removed.
            if len(space_links) < 5:
                log_progress(f"  Page {page + 1} has only {len(space_links)} space links, might be near end")

            # Respectful delay between requests
            if page < max_pages - 1:  # Don't delay after the last page
                log_progress("  Waiting 2 seconds before next request...")
                time.sleep(2)

        except requests.RequestException as e:
            log_progress(f"  Network error on page {page + 1}: {str(e)}")
            consecutive_empty_pages += 1
            if consecutive_empty_pages >= 3:
                log_progress("  Stopping after 3 consecutive network errors")
                break
            continue
        except Exception as e:
            log_progress(f"  Error scraping page {page + 1}: {str(e)}")
            consecutive_empty_pages += 1
            if consecutive_empty_pages >= 3:
                log_progress("  Stopping after 3 consecutive errors")
                break
            continue

    log_progress(f"Scraping complete! Found {len(servers)} unique MCP servers total")
    log_progress(f"Using sort method: {sort_label}")
    log_progress(f"Scraped {min(page + 1, max_pages)} pages (stopped at page {page + 1})")
    return servers
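
# Usage sketch (illustrative, not part of the original script): calling the scraper with a
# progress callback that collects log lines in memory instead of only printing them.
# The helper name is hypothetical.
def example_scrape_with_callback() -> List[Dict[str, str]]:
    progress_log: List[str] = []
    servers = scrape_hf_mcp_servers(max_pages=1, sort_by="trending",
                                    progress_callback=progress_log.append)
    print(f"Collected {len(progress_log)} progress messages and {len(servers)} servers")
    return servers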

def generate_test_urls_for_health_monitor(servers: List[Dict[str, str]]) -> str:
    """
    Generate URL list for the health monitor from scraped servers.

    Args:
        servers: List of server info from scrape_hf_mcp_servers

    Returns:
        str: Newline-separated URLs ready for the health monitor
    """
    urls = [server["hf_spaces_url"] for server in servers]
    return "\n".join(urls)

def save_scraped_servers(servers: List[Dict[str, str]], filename: str = "mcp_servers.json"):
    """Save scraped server information to a JSON file."""
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump({
                "scraped_at": time.strftime("%Y-%m-%d %H:%M:%S"),
                "total_servers": len(servers),
                "servers": servers
            }, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(servers)} servers to {filename}")
    except Exception as e:
        print(f"Error saving to {filename}: {str(e)}")

def scrape_and_format_hf_mcp_servers(max_pages: int = 3, sort_by: str = "relevance") -> tuple:
    """
    Combined function that scrapes HF MCP servers and returns data for the monitor.
    This function can be called directly from the MCP server monitor.

    Args:
        max_pages (int): Maximum number of pages to scrape
        sort_by (str): Sort option - one of: relevance, trending, likes, created, modified

    Returns:
        tuple: (markdown_summary, json_data) compatible with monitor interface
    """
    print("Starting HuggingFace MCP servers discovery...")

    # Validate sort option here too, so the SORT_OPTIONS label lookups below cannot fail
    if sort_by not in SORT_OPTIONS:
        sort_by = "relevance"

    # Scrape servers with progress feedback
    servers = scrape_hf_mcp_servers(max_pages, sort_by)

    if not servers:
        result = {
            "status": "no_servers_found",
            "error": "No MCP servers found on HuggingFace",
            "servers": [],
            "total_servers": 0,
            "sort_method": sort_by
        }
        md = f"# No MCP Servers Found\n\nNo servers were discovered on HuggingFace with the mcp-server filter using {SORT_OPTIONS[sort_by]['label']} sorting."
        return md, json.dumps(result, indent=2)

    # All scraped servers are used directly; there is no validation step.
    # Generate URLs for testing
    test_urls = generate_test_urls_for_health_monitor(servers)

    result = {
        "status": "success",
        "total_servers": len(servers),
        "scraped_pages": max_pages,
        "sort_method": sort_by,
        "sort_label": SORT_OPTIONS[sort_by]["label"],
        "servers": servers,
        "test_urls": test_urls.split('\n') if test_urls else [],
        "scraped_at": time.strftime("%Y-%m-%d %H:%M:%S")
    }

    # Generate markdown summary with complete, clickable URLs
    md = "# HuggingFace MCP Servers Discovery\n\n"
    md += "**Status:** Success\n"
    md += f"**Sort Method:** {result['sort_label']}\n"
    md += f"**Total Servers Found:** {len(servers)}\n"
    md += f"**Pages Scraped:** {max_pages}\n"
    md += f"**Discovery Time:** {result['scraped_at']}\n\n"
    md += "## Discovered Servers\n\n"

    for i, server in enumerate(servers, 1):  # Show ALL servers
        md += f"**{i}.** **HF Spaces URL:** [{server['hf_spaces_url']}]({server['hf_spaces_url']})\n\n"
        md += f"**Live Space URL:** [{server['hf_space_domain']}]({server['hf_space_domain']})\n\n"
        md += f"**MCP Endpoint:** [{server['mcp_endpoint']}]({server['mcp_endpoint']})\n\n"
        md += f"**Discovered on Page:** {server['discovered_on_page']}\n\n"
        md += "---\n\n"

    md += "## URLs for Multi-Server Monitor\n\n"
    md += "Copy these URLs to test all discovered servers:\n\n"
    md += "```\n"
    md += test_urls
    md += "\n```\n\n"
    md += "## Next Steps\n\n"
    md += "1. Copy the URLs above\n"
    md += "2. Go to the **Multi-Server Monitor** tab\n"
    md += "3. Paste the URLs and run health checks\n"
    md += "4. Discover which servers are actively running MCP services\n\n"
    md += "## Statistics\n\n"
    md += f"- **Total Unique Servers:** {len(servers)}\n"
    md += f"- **Sort Method Used:** {result['sort_label']}\n"

    # Guard against ZeroDivisionError when max_pages is 0
    avg_servers_per_page = round(len(servers) / max_pages, 1) if max_pages > 0 else 0
    md += f"- **Pages Successfully Scraped:** {max_pages}\n"
    md += f"- **Average Servers per Page:** {avg_servers_per_page}\n"

    return md, json.dumps(result, indent=2)
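
# Usage sketch (illustrative, not part of the original script): consuming the
# (markdown_summary, json_data) tuple to pull out just the MCP endpoints.
# The helper name is hypothetical.
def example_list_mcp_endpoints() -> List[str]:
    _md_summary, json_data = scrape_and_format_hf_mcp_servers(max_pages=1)
    result = json.loads(json_data)
    return [server["mcp_endpoint"] for server in result.get("servers", [])]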

def test_scraper_locally():
    """Test function to validate the scraper locally."""
    print("Testing HuggingFace MCP Server Scraper")
    print("=" * 50)

    # Test with more pages to verify the pagination fix
    servers = scrape_hf_mcp_servers(max_pages=5, sort_by="likes")

    if servers:
        print(f"\nSuccessfully found {len(servers)} servers!")
        print("\nFirst few servers found:")
        for i, server in enumerate(servers[:3], 1):
            print(f"{i}. {server['space_id']} (Page {server['discovered_on_page']})")
            print(f"   HF URL: {server['hf_spaces_url']}")
            print(f"   MCP Endpoint: {server['mcp_endpoint']}")

        # Show page distribution
        page_distribution = {}
        for server in servers:
            page = server['discovered_on_page']
            page_distribution[page] = page_distribution.get(page, 0) + 1

        print("\nPage distribution:")
        for page in sorted(page_distribution.keys(), key=int):
            print(f"  Page {page}: {page_distribution[page]} servers")

        # There is no validation step; all discovered servers are included
        print(f"\nAll {len(servers)} discovered servers are included (no validation step).")

        # Generate test URLs
        test_urls = generate_test_urls_for_health_monitor(servers)
        print(f"\nGenerated {len(test_urls.split())} URLs for testing")
    else:
        print("No servers found. Check the scraping logic.")

# Code by Nicolas larenas, NLarchive

def main():
    """Main scraping function."""
    # Run scraping with more pages to test the pagination fix
    md_summary, json_data = scrape_and_format_hf_mcp_servers(max_pages=10, sort_by="likes")

    # Save results
    result = json.loads(json_data)
    if result["status"] == "success":
        # result["servers"] contains all scraped servers (no validation step)
        save_scraped_servers(result["servers"])

        # Save URLs to file for easy copying
        with open("mcp_test_urls.txt", 'w', encoding='utf-8') as f:
            f.write("\n".join(result["test_urls"]))
        print("URLs also saved to 'mcp_test_urls.txt'")

    print("\n" + "=" * 50)
    print("SCRAPING COMPLETE!")
    print("=" * 50)

    # Print summary without markdown formatting
    summary_text = md_summary.replace("#", "").replace("*", "").replace("[", "").replace("]", "").replace("(", " (")
    print(summary_text)

if __name__ == "__main__":
    # Test the scraper locally first; comment this out and uncomment main() for the full scrape
    test_scraper_locally()
    # Run the full scraper
    # main()