zhiminy committed
Commit 4392559 · 1 Parent(s): 625c7c1
Files changed (3)
  1. app.py +51 -19
  2. context_window.json +0 -53
  3. model_metadata.jsonl +51 -0
app.py CHANGED
@@ -36,12 +36,17 @@ TIMEOUT = 90
 SHOW_HINT_STRING = True  # Set to False to hide the hint string altogether
 HINT_STRING = "Once signed in, your votes will be recorded securely."
 
-# Load context length limits
-with open("context_window.json", "r") as file:
-    context_window = json.load(file)
+# Load model metadata
+model_metadata = pd.read_json("model_metadata.jsonl", lines=True)
 
-# Get list of available models
-available_models = list(context_window.keys())
+# Create a dictionary mapping model names to their context lengths
+model_context_window = model_metadata.set_index("model_name")["context_window"].to_dict()
+
+# Create a dictionary mapping model names to their links
+model_links = model_metadata.set_index("model_name")["link"].to_dict()
+
+# Get the list of available models
+available_models = model_metadata["model_name"].tolist()
 
 
 def fetch_github_content(url):
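
For reference, the new metadata-loading pattern is easy to verify in isolation. A minimal sketch (the two sample records below are illustrative, not the full file):

    import io

    import pandas as pd

    # Each line of a .jsonl file is one standalone JSON object;
    # lines=True makes pandas parse one object per DataFrame row.
    sample = io.StringIO(
        '{"model_name": "gpt-4o", "context_window": 128000, "link": "https://openai.com"}\n'
        '{"model_name": "o3", "context_window": 200000, "link": "https://openai.com/index/introducing-o3-and-o4-mini"}\n'
    )
    metadata = pd.read_json(sample, lines=True)

    # set_index(...)[col].to_dict() turns two columns into a plain lookup dict
    context_windows = metadata.set_index("model_name")["context_window"].to_dict()
    print(context_windows["gpt-4o"])  # 128000

One consequence of set_index: if two JSONL lines ever shared a model_name, to_dict would silently keep only the last one, so names in the file must stay unique.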
@@ -204,7 +209,7 @@ def truncate_prompt(model_alias, models, conversation_state):
     full_conversation = conversation_state[f"{model_alias}_chat"]
 
     # Get the context length for the model
-    context_length = context_window[models[model_alias]]
+    context_length = model_context_window[models[model_alias]]
 
     # Single loop to handle both FIFO removal and content truncation
     while len(json.dumps(full_conversation)) > context_length:
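
The while-condition uses the JSON-serialized length of the conversation as a cheap proxy for context usage. The loop body is not part of this hunk, so the sketch below only illustrates the FIFO idea under that assumption (truncate_fifo is a hypothetical helper, not the app's actual function):

    import json

    def truncate_fifo(conversation, context_length):
        # Drop the oldest messages until the serialized conversation fits;
        # stop at one message so the conversation is never emptied entirely.
        conversation = list(conversation)
        while len(json.dumps(conversation)) > context_length and len(conversation) > 1:
            conversation.pop(0)  # FIFO: the oldest message goes first
        return conversation

    messages = [
        {"role": "user", "content": "a" * 200},
        {"role": "assistant", "content": "b" * 200},
        {"role": "user", "content": "c" * 20},
    ]
    print(len(truncate_fifo(messages, 300)))  # 2

Note that character count only approximates token count; the comparison against context_length here inherits whatever units app.py uses.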
@@ -364,14 +369,14 @@ def load_content_from_hf(repo_name="SWE-Arena/model_votes"):
 
 def get_leaderboard_data(vote_entry=None, use_cache=True):
     year = str(datetime.now().year)
-    
+
     # Try to load cached leaderboard first
     if use_cache:
         try:
             cached_path = hf_hub_download(
                 repo_id="SWE-Arena/model_leaderboards",
                 filename=f"{year}.json",
-                repo_type="dataset"
+                repo_type="dataset",
             )
             with open(cached_path, "r") as f:
                 leaderboard_data = pd.read_json(f)
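
This hunk is formatting-only (a Black-style trailing comma), but the call it touches is worth a note: hf_hub_download caches the file locally and returns its path, and repo_type="dataset" matters because the leaderboard lives in a dataset repo rather than a model repo. A minimal sketch, with "2025.json" standing in for f"{year}.json":

    from huggingface_hub import hf_hub_download

    # Downloads into the local HF cache (or reuses it) and returns the path.
    # Omitting repo_type would look for a *model* repo and fail with a 404.
    cached_path = hf_hub_download(
        repo_id="SWE-Arena/model_leaderboards",
        filename="2025.json",
        repo_type="dataset",
    )
    print(cached_path)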
@@ -391,7 +396,7 @@ def get_leaderboard_data(vote_entry=None, use_cache=True):
             return leaderboard_data
         except Exception as e:
             print(f"No cached leaderboard found for {year}, computing from votes...")
-    
+
     # Load feedback data from the Hugging Face repository
     vote_data = load_content_from_hf()
     vote_df = pd.DataFrame(vote_data)
@@ -511,14 +516,25 @@ def get_leaderboard_data(vote_entry=None, use_cache=True):
     )
 
     # Clean up potential inf/NaN values in the results
-    for result in [avr_result, bt_result, newman_result, eigen_result, elo_result, pagerank_result]:
-        result.scores = result.scores.replace([float('inf'), float('-inf')], float('nan'))
+    for result in [
+        avr_result,
+        bt_result,
+        newman_result,
+        eigen_result,
+        elo_result,
+        pagerank_result,
+    ]:
+        result.scores = result.scores.replace(
+            [float("inf"), float("-inf")], float("nan")
+        )
 
     # Calculate CEI results
     cei_result = {}
     for model in elo_result.scores.index:
         if model in model_stats and model_stats[model]["cei_max"] > 0:
-            cei_result[model] = round(model_stats[model]["cei_sum"] / model_stats[model]["cei_max"], 2)
+            cei_result[model] = round(
+                model_stats[model]["cei_sum"] / model_stats[model]["cei_max"], 2
+            )
         else:
             cei_result[model] = None
     cei_result = pd.Series(cei_result)
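
Reformatting aside, the cleanup step matters because ranking algorithms can emit ±inf on degenerate vote graphs, and inf has no literal in strict JSON, so it breaks the cached-leaderboard serialization later. The replacement itself is standard pandas:

    import pandas as pd

    scores = pd.Series({"model-a": 12.5, "model-b": float("inf"), "model-c": float("-inf")})

    # Map both infinities to NaN; NaN serializes as null and renders as an empty cell
    cleaned = scores.replace([float("inf"), float("-inf")], float("nan"))
    print(cleaned.isna().sum())  # 2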
@@ -527,7 +543,9 @@ def get_leaderboard_data(vote_entry=None, use_cache=True):
     mcs_result = {}
     for model in elo_result.scores.index:
         if model in model_stats and model_stats[model]["self_matches"] > 0:
-            mcs_result[model] = round(model_stats[model]["self_draws"] / model_stats[model]["self_matches"], 2)
+            mcs_result[model] = round(
+                model_stats[model]["self_draws"] / model_stats[model]["self_matches"], 2
+            )
         else:
             mcs_result[model] = None
     mcs_result = pd.Series(mcs_result)
@@ -569,25 +587,33 @@ def get_leaderboard_data(vote_entry=None, use_cache=True):
         ["Rank"] + [col for col in leaderboard_data.columns if col != "Rank"]
     ]
 
+    # Make model names clickable with their corresponding links
+    leaderboard_data["Model"] = leaderboard_data["Model"].apply(
+        lambda model_name: f'<a href="{model_links[model_name]}" target="_blank">{model_name}</a>'
+    )
+
     # Save leaderboard data if this is a new vote
     if vote_entry is not None:
         try:
             # Convert DataFrame to JSON and save
-            json_content = leaderboard_data.to_json(orient='records', indent=4).encode('utf-8')
+            json_content = leaderboard_data.to_json(orient="records", indent=4).encode(
+                "utf-8"
+            )
             file_like_object = io.BytesIO(json_content)
-            
+
             upload_file(
                 path_or_fileobj=file_like_object,
                 path_in_repo=f"{year}.json",
                 repo_id="SWE-Arena/model_leaderboards",
                 repo_type="dataset",
-                use_auth_token=HfFolder.get_token()
+                use_auth_token=HfFolder.get_token(),
             )
         except Exception as e:
            print(f"Failed to save leaderboard cache: {e}")
-    
+
     return leaderboard_data
 
+
 # Function to enable or disable submit buttons based on textbox content
 def toggle_submit_button(text):
     if not text or text.strip() == "":
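
Two things in this hunk are worth spelling out. The new clickable-name column only works because the leaderboard table is later rendered with an "html" column type (next hunk). And the upload path never touches disk: upload_file accepts any file-like object. A minimal sketch with a placeholder payload, assuming you are already authenticated (for example via huggingface-cli login):

    import io

    from huggingface_hub import upload_file

    json_bytes = b'[{"Rank": 1, "Model": "example-model"}]'  # placeholder payload

    upload_file(
        path_or_fileobj=io.BytesIO(json_bytes),  # file-like object, no temp file needed
        path_in_repo="2025.json",
        repo_id="SWE-Arena/model_leaderboards",
        repo_type="dataset",
    )

One caveat: newer huggingface_hub releases deprecate use_auth_token= in favor of token=, so the kwarg kept by this hunk may warrant a follow-up change.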
@@ -632,6 +658,7 @@ with gr.Blocks() as app:
             "Newman Modularity Score",
             "PageRank Score",
         ],
+        datatype=["number", "html", "number", "number", "number", "number", "number", "number", "number", "number"],
     )
     # Add a citation block in Markdown
     citation_component = gr.Markdown(
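
The added datatype list is what lets the anchor tags from the previous hunk render as real links instead of escaped text: the second entry, "html", lines up with the Model column. A minimal standalone sketch (three columns instead of the app's ten; the exact component arguments in app.py are not shown in this diff):

    import gradio as gr

    with gr.Blocks() as demo:
        gr.Dataframe(
            headers=["Rank", "Model", "Elo Score"],
            datatype=["number", "html", "number"],  # "html" renders tags, not escaped text
            value=[[1, '<a href="https://openai.com" target="_blank">gpt-4o</a>', 1050.0]],
        )

    demo.launch()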
@@ -1242,7 +1269,9 @@ with gr.Blocks() as app:
         file_name = now.strftime("%Y%m%d_%H%M%S")
 
         # Save feedback back to the Hugging Face dataset
-        save_content_to_hf(vote_entry, "SWE-Arena/model_votes", folder_name, file_name)
+        save_content_to_hf(
+            vote_entry, "SWE-Arena/model_votes", folder_name, file_name
+        )
 
         conversation_state["right_chat"][0]["content"] = conversation_state[
             "right_chat"
@@ -1253,7 +1282,10 @@ with gr.Blocks() as app:
 
         # Save conversations back to the Hugging Face dataset
         save_content_to_hf(
-            conversation_state, "SWE-Arena/model_conversations", folder_name, file_name
+            conversation_state,
+            "SWE-Arena/model_conversations",
+            folder_name,
+            file_name,
         )
 
         # Clear state
 
context_window.json DELETED
@@ -1,53 +0,0 @@
-{
-    "claude-3-haiku-20240307": 200000,
-    "claude-3-opus-20240229": 200000,
-    "claude-3-5-haiku-20241022": 200000,
-    "claude-3-5-sonnet-20241022": 200000,
-    "claude-3-7-sonnet-latest": 200000,
-    "claude-3-7-sonnet-20250219#thinking": 200000,
-    "claude-opus-4-20250514": 200000,
-    "claude-sonnet-4-20250514": 200000,
-    "claude-opus-4-1-20250805": 200000,
-    "doubao-1-5-pro-256k-250115": 256000,
-    "doubao-1-5-thinking-pro-250415": 256000,
-    "doubao-seed-1-6-250615": 256000,
-    "doubao-seed-1-6-thinking-250615": 256000,
-    "deepseek-chat": 64000,
-    "deepseek-r1": 64000,
-    "deepseek-v3.1": 1000000,
-    "deepseek-v3.1#thinking": 1000000,
-    "gemini-2.5-flash-preview-non-thinking": 1048576,
-    "gemini-2.5-flash-preview-thinking": 1048576,
-    "gemini-2.5-pro-preview-06-05": 1048576,
-    "gemma-3-27b-it": 128000,
-    "gpt-3.5-turbo": 16000,
-    "gpt-4-turbo": 128000,
-    "gpt-4o": 128000,
-    "gpt-4o-mini": 128000,
-    "gpt-4.1": 1000000,
-    "gpt-4.1-mini": 1000000,
-    "gpt-5": 400000,
-    "gpt-5-chat-latest": 400000,
-    "gpt-5-mini": 400000,
-    "gpt-5-nano": 400000,
-    "gpt-oss-120b": 128000,
-    "gpt-oss-20b": 128000,
-    "grok-3-fast-beta": 1000000,
-    "grok-3-beta": 1000000,
-    "grok-3-mini-fast-beta": 1000000,
-    "grok-3-mini-beta": 1000000,
-    "llama-3.1-405b": 128000,
-    "llama-3.3-70b": 128000,
-    "llama4-scout-instruct-basic": 10000000,
-    "llama4-maverick-instruct-basic": 10000000,
-    "mistral-large-latest": 131000,
-    "mistral-small-latest": 32000,
-    "o1": 128000,
-    "o1-mini": 128000,
-    "o3": 200000,
-    "o3-mini": 200000,
-    "o4-mini": 200000,
-    "qwen3-30b-a3b": 32768,
-    "qwen3-32b": 32768,
-    "qwen3-235b-a22b": 32768
-}
model_metadata.jsonl ADDED
@@ -0,0 +1,51 @@
+{"model_name": "claude-3-haiku-20240307", "context_window": 200000, "link": "https://www.anthropic.com/news/claude-3-haiku"}
+{"model_name": "claude-3-opus-20240229", "context_window": 200000, "link": "https://www.anthropic.com/news/claude-3-family"}
+{"model_name": "claude-3-5-haiku-20241022", "context_window": 200000, "link": "https://www.anthropic.com/claude/haiku"}
+{"model_name": "claude-3-5-sonnet-20241022", "context_window": 200000, "link": "https://www.anthropic.com/news/claude-3-5-sonnet"}
+{"model_name": "claude-3-7-sonnet-latest", "context_window": 200000, "link": "https://www.anthropic.com/claude/sonnet"}
+{"model_name": "claude-3-7-sonnet-20250219#thinking", "context_window": 200000, "link": "https://www.anthropic.com/claude/sonnet"}
+{"model_name": "claude-opus-4-20250514", "context_window": 200000, "link": "https://www.anthropic.com/news/claude-4"}
+{"model_name": "claude-sonnet-4-20250514", "context_window": 200000, "link": "https://www.anthropic.com/claude/sonnet"}
+{"model_name": "claude-opus-4-1-20250805", "context_window": 200000, "link": "https://www.anthropic.com/news/claude-opus-4-1"}
+{"model_name": "doubao-1-5-pro-256k-250115", "context_window": 256000, "link": "https://seed.bytedance.com"}
+{"model_name": "doubao-1-5-thinking-pro-250415", "context_window": 256000, "link": "https://seed.bytedance.com"}
+{"model_name": "doubao-seed-1-6-250615", "context_window": 256000, "link": "https://seed.bytedance.com"}
+{"model_name": "doubao-seed-1-6-thinking-250615", "context_window": 256000, "link": "https://seed.bytedance.com"}
+{"model_name": "deepseek-chat", "context_window": 64000, "link": "https://www.deepseek.com"}
+{"model_name": "deepseek-r1", "context_window": 64000, "link": "https://www.deepseek.com"}
+{"model_name": "deepseek-v3.1", "context_window": 1000000, "link": "https://www.deepseek.com"}
+{"model_name": "deepseek-v3.1#thinking", "context_window": 1000000, "link": "https://www.deepseek.com"}
+{"model_name": "gemini-2.5-flash-preview-non-thinking", "context_window": 1048576, "link": "https://deepmind.google/models/gemini/flash"}
+{"model_name": "gemini-2.5-flash-preview-thinking", "context_window": 1048576, "link": "https://deepmind.google/models/gemini/flash"}
+{"model_name": "gemini-2.5-pro-preview-06-05", "context_window": 1048576, "link": "https://deepmind.google/models/gemini/pro"}
+{"model_name": "gemma-3-27b-it", "context_window": 128000, "link": "https://ai.google.dev/gemma"}
+{"model_name": "gpt-3.5-turbo", "context_window": 16000, "link": "https://openai.com"}
+{"model_name": "gpt-4-turbo", "context_window": 128000, "link": "https://openai.com/index/gpt-4"}
+{"model_name": "gpt-4o", "context_window": 128000, "link": "https://openai.com"}
+{"model_name": "gpt-4o-mini", "context_window": 128000, "link": "https://openai.com"}
+{"model_name": "gpt-4.1", "context_window": 1000000, "link": "https://openai.com/index/gpt-4-1"}
+{"model_name": "gpt-4.1-mini", "context_window": 1000000, "link": "https://openai.com/index/gpt-4-1"}
+{"model_name": "gpt-5", "context_window": 400000, "link": "https://openai.com/gpt-5"}
+{"model_name": "gpt-5-chat-latest", "context_window": 400000, "link": "https://openai.com/gpt-5"}
+{"model_name": "gpt-5-mini", "context_window": 400000, "link": "https://openai.com/gpt-5"}
+{"model_name": "gpt-5-nano", "context_window": 400000, "link": "https://openai.com/gpt-5"}
+{"model_name": "gpt-oss-120b", "context_window": 128000, "link": "https://openai.com"}
+{"model_name": "gpt-oss-20b", "context_window": 128000, "link": "https://openai.com"}
+{"model_name": "grok-3-fast-beta", "context_window": 1000000, "link": "https://x.ai/news/grok-3"}
+{"model_name": "grok-3-beta", "context_window": 1000000, "link": "https://x.ai/news/grok-3"}
+{"model_name": "grok-3-mini-fast-beta", "context_window": 1000000, "link": "https://x.ai/news/grok-3"}
+{"model_name": "grok-3-mini-beta", "context_window": 1000000, "link": "https://x.ai/news/grok-3"}
+{"model_name": "llama-3.1-405b", "context_window": 128000, "link": "https://ai.meta.com/blog/meta-llama-3-1"}
+{"model_name": "llama-3.3-70b", "context_window": 128000, "link": "https://www.llama.com"}
+{"model_name": "llama4-scout-instruct-basic", "context_window": 10000000, "link": "https://www.llama.com/models/llama-4"}
+{"model_name": "llama4-maverick-instruct-basic", "context_window": 10000000, "link": "https://www.llama.com/models/llama-4"}
+{"model_name": "mistral-large-latest", "context_window": 131000, "link": "https://mistral.ai/news/mistral-large"}
+{"model_name": "mistral-small-latest", "context_window": 32000, "link": "https://mistral.ai/news/mistral-small-3-1"}
+{"model_name": "o1", "context_window": 128000, "link": "https://openai.com/o1"}
+{"model_name": "o1-mini", "context_window": 128000, "link": "https://openai.com/o1"}
+{"model_name": "o3", "context_window": 200000, "link": "https://openai.com/index/introducing-o3-and-o4-mini"}
+{"model_name": "o3-mini", "context_window": 200000, "link": "https://openai.com/index/introducing-o3-and-o4-mini"}
+{"model_name": "o4-mini", "context_window": 200000, "link": "https://openai.com/index/introducing-o3-and-o4-mini"}
+{"model_name": "qwen3-30b-a3b", "context_window": 32768, "link": "https://qwen-3.com"}
+{"model_name": "qwen3-32b", "context_window": 32768, "link": "https://qwen-3.com"}
+{"model_name": "qwen3-235b-a22b", "context_window": 32768, "link": "https://qwen-3.com"}