shyuli committed
Commit 2354057 · 1 Parent(s): a77c2f0

version v0.1

.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+ -----BEGIN CERTIFICATE-----
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+ -----END CERTIFICATE-----
README.md DELETED
@@ -1,48 +0,0 @@
- ---
- title: SearchAgent Leaderboard
- emoji: 🥇
- colorFrom: green
- colorTo: indigo
- sdk: gradio
- app_file: app.py
- pinned: true
- license: apache-2.0
- short_description: Duplicate this leaderboard to initialize your own!
- sdk_version: 5.43.1
- tags:
- - leaderboard
- ---
-
- # Start the configuration
-
- Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
-
- Results files should have the following format and be stored as json files:
- ```json
- {
- "config": {
- "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
- "model_name": "path of the model on the hub: org/model",
- "model_sha": "revision on the hub",
- },
- "results": {
- "task_name": {
- "metric_name": score,
- },
- "task_name2": {
- "metric_name": score,
- }
- }
- }
- ```
-
- Request files are created automatically by this tool.
-
- If you encounter problem on the space, don't hesitate to restart it to remove the create eval-queue, eval-queue-bk, eval-results and eval-results-bk created folder.
-
- # Code logic for more complex edits
-
- You'll find
- - the main table' columns names and properties in `src/display/utils.py`
- - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
- - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py CHANGED
@@ -3,6 +3,10 @@ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from huggingface_hub import snapshot_download
 
 
 
 
6
 
7
  from src.about import (
8
  CITATION_BUTTON_LABEL,
@@ -22,7 +26,7 @@ from src.display.utils import (
22
  ModelType,
23
  fields,
24
  WeightType,
25
- Precision
26
  )
27
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
@@ -32,21 +36,95 @@ from src.submission.submit import add_new_eval
32
  def restart_space():
33
  API.restart_space(repo_id=REPO_ID)
34
35
  ### Space initialisation
 
36
  try:
37
  print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
- )
41
- except Exception:
42
- restart_space()
 
 
 
 
 
43
  try:
44
  print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
- )
48
- except Exception:
49
- restart_space()
 
 
 
50
 
51
 
52
  LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
@@ -57,6 +135,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
57
  pending_eval_queue_df,
58
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
 
 
60
  def init_leaderboard(dataframe):
61
  if dataframe is None or dataframe.empty:
62
  raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -68,21 +147,10 @@ def init_leaderboard(dataframe):
68
  cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
  label="Select Columns to Display:",
70
  ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
  hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
  filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
  ],
87
  bool_checkboxgroup_label="Hide models",
88
  interactive=False,
@@ -92,101 +160,18 @@ def init_leaderboard(dataframe):
92
  demo = gr.Blocks(css=custom_css)
93
  with demo:
94
  gr.HTML(TITLE)
95
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
 
97
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
  leaderboard = init_leaderboard(LEADERBOARD_DF)
 
100
 
101
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
 
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
  with gr.Column():
106
- with gr.Row():
107
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
-
109
- with gr.Column():
110
- with gr.Accordion(
111
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
- ):
114
- with gr.Row():
115
- finished_eval_table = gr.components.Dataframe(
116
- value=finished_eval_queue_df,
117
- headers=EVAL_COLS,
118
- datatype=EVAL_TYPES,
119
- row_count=5,
120
- )
121
- with gr.Accordion(
122
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
- ):
125
- with gr.Row():
126
- running_eval_table = gr.components.Dataframe(
127
- value=running_eval_queue_df,
128
- headers=EVAL_COLS,
129
- datatype=EVAL_TYPES,
130
- row_count=5,
131
- )
132
-
133
- with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
- ):
137
- with gr.Row():
138
- pending_eval_table = gr.components.Dataframe(
139
- value=pending_eval_queue_df,
140
- headers=EVAL_COLS,
141
- datatype=EVAL_TYPES,
142
- row_count=5,
143
- )
144
- with gr.Row():
145
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
-
147
- with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
-
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
- )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
- )
190
 
191
  with gr.Row():
192
  with gr.Accordion("📙 Citation", open=False):
@@ -201,4 +186,4 @@ with demo:
201
  scheduler = BackgroundScheduler()
202
  scheduler.add_job(restart_space, "interval", seconds=1800)
203
  scheduler.start()
204
- demo.queue(default_concurrency_limit=40).launch()
 
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from huggingface_hub import snapshot_download
6
+ import json
7
+ import os
8
+ import datetime
9
+ import urllib.parse
10
 
11
  from src.about import (
12
  CITATION_BUTTON_LABEL,
 
26
  ModelType,
27
  fields,
28
  WeightType,
29
+ Precision,
30
  )
31
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
32
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
 
36
  def restart_space():
37
  API.restart_space(repo_id=REPO_ID)
38
 
39
+
40
+ def save_submission_and_notify(model_name, contact_email, weight_link, json_results, paper_link, description):
41
+ """Save submission to file and provide instructions for email"""
42
+ try:
43
+ # Validate JSON format if provided
44
+ if json_results.strip():
45
+ try:
46
+ json.loads(json_results)
47
+ except json.JSONDecodeError:
48
+ return "❌ Invalid JSON format in results field"
49
+
50
+ # Create submission data
51
+ submission_data = {
52
+ "timestamp": datetime.datetime.now().isoformat(),
53
+ "model_name": model_name,
54
+ "contact_email": contact_email,
55
+ "weight_link": weight_link,
56
+ "paper_link": paper_link,
57
+ "description": description,
58
+ "json_results": json_results,
59
+ }
60
+
61
+ # Save to submissions directory
62
+ os.makedirs("submissions", exist_ok=True)
63
+ filename = (
64
+ f"submissions/{model_name.replace('/', '_')}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
65
+ )
66
+
67
+ with open(filename, "w") as f:
68
+ json.dump(submission_data, f, indent=2)
69
+
70
+ # Create mailto link for user
71
+ subject = f"SearchAgent Leaderboard Submission: {model_name}"
72
+ body = f"""New model submission for SearchAgent Leaderboard:
73
+
74
+ Model Name: {model_name}
75
+ Contact Email: {contact_email}
76
+ Weight Link: {weight_link}
77
+ Paper Link: {paper_link}
78
+ Description: {description}
79
+
80
+ JSON Results:
81
+ {json_results}"""
82
+
83
+ # URL encode the email content
84
+ mailto_link = (
85
+ f"mailto:[email protected]?subject={urllib.parse.quote(subject)}&body={urllib.parse.quote(body[:500])}"
86
+ )
87
+
88
+ return f"""✅ Submission saved successfully!
89
+
90
+ 📧 **Please send your submission to: [email protected]**
91
+
92
+ You can either:
93
+ 1. Click here to open your email client: [Send Email](mailto:[email protected])
94
+ 2. Or copy the submission details above and send manually
95
+
96
+ Your submission has been saved to: {filename}
97
+
98
+ We'll review your model and get back to you at {contact_email}."""
99
+
100
+ except Exception as e:
101
+ return f"❌ Failed to save submission: {str(e)}"
102
+
103
+
104
  ### Space initialisation
105
+ # Use local data for demo purposes
106
  try:
107
  print(EVAL_REQUESTS_PATH)
108
+ # For demo, use local eval-queue directory if it exists
109
+ import os
110
+
111
+ if not os.path.exists(EVAL_REQUESTS_PATH):
112
+ os.makedirs(EVAL_REQUESTS_PATH, exist_ok=True)
113
+ # snapshot_download(
114
+ # repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
115
+ # )
116
+ except Exception as e:
117
+ print(f"Could not setup eval requests path: {e}")
118
  try:
119
  print(EVAL_RESULTS_PATH)
120
+ # For demo, use local eval-results directory if it exists
121
+ if not os.path.exists(EVAL_RESULTS_PATH):
122
+ os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
123
+ # snapshot_download(
124
+ # repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
125
+ # )
126
+ except Exception as e:
127
+ print(f"Could not setup eval results path: {e}")
128
 
129
 
130
  LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
135
  pending_eval_queue_df,
136
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
137
 
138
+
139
  def init_leaderboard(dataframe):
140
  if dataframe is None or dataframe.empty:
141
  raise ValueError("Leaderboard DataFrame is empty or None.")
 
147
  cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
148
  label="Select Columns to Display:",
149
  ),
150
+ search_columns=[AutoEvalColumn.model.name],
151
  hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
152
  filter_columns=[
153
+ ColumnFilter(AutoEvalColumn.model_size.name, type="checkboxgroup", label="Model Size"),
 
154
  ],
155
  bool_checkboxgroup_label="Hide models",
156
  interactive=False,
 
160
  demo = gr.Blocks(css=custom_css)
161
  with demo:
162
  gr.HTML(TITLE)
 
163
 
164
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
165
+ with gr.TabItem("🏅 SearchAgent Benchmark", elem_id="llm-benchmark-tab-table", id=0):
166
  leaderboard = init_leaderboard(LEADERBOARD_DF)
167
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
168
 
169
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
170
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
171
 
172
+ with gr.TabItem("📤 Submit Model", elem_id="llm-benchmark-tab-table", id=3):
173
  with gr.Column():
174
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
175
 
176
  with gr.Row():
177
  with gr.Accordion("📙 Citation", open=False):
 
186
  scheduler = BackgroundScheduler()
187
  scheduler.add_job(restart_space, "interval", seconds=1800)
188
  scheduler.start()
189
+ demo.queue(default_concurrency_limit=40).launch(share=True)
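The new Submit tab is backed by `save_submission_and_notify`, which validates the pasted results JSON, writes the submission to a timestamped file under `submissions/`, and URL-encodes a `mailto:` link with `urllib.parse.quote`. Below is a minimal, self-contained sketch of that pattern; the recipient address and the exact field set are placeholders rather than values taken from this commit. (Note that the committed code builds the fully encoded `mailto_link` but the confirmation message links a bare `mailto:` address; the sketch returns the encoded link directly.)

```python
import datetime
import json
import os
import urllib.parse


def save_submission(model_name: str, results_json: str, contact: str,
                    recipient: str = "[email protected]") -> str:
    """Validate, persist, and build a mailto link for a submission (sketch).

    `recipient` is a placeholder; the real inbox is not spelled out in this diff.
    """
    # Reject malformed JSON early, mirroring the json.loads() check in app.py.
    try:
        json.loads(results_json)
    except json.JSONDecodeError:
        return "Invalid JSON in results field"

    # Persist the submission under a timestamped, filesystem-safe name.
    os.makedirs("submissions", exist_ok=True)
    stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    path = os.path.join("submissions", f"{model_name.replace('/', '_')}_{stamp}.json")
    with open(path, "w") as f:
        json.dump({"model_name": model_name, "contact": contact,
                   "results": results_json}, f, indent=2)

    # URL-encode the subject and a truncated body so special characters survive.
    subject = urllib.parse.quote(f"SearchAgent Leaderboard Submission: {model_name}")
    body = urllib.parse.quote(f"Model: {model_name}\nContact: {contact}"[:500])
    return f"mailto:{recipient}?subject={subject}&body={body}"
```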
requirements.txt DELETED
@@ -1,16 +0,0 @@
- APScheduler
- black
- datasets
- gradio
- gradio[oauth]
- gradio_leaderboard==0.0.13
- gradio_client
- huggingface-hub>=0.18.0
- matplotlib
- numpy
- pandas
- python-dateutil
- tqdm
- transformers
- tokenizers>=0.15.0
- sentencepiece
src/about.py CHANGED
@@ -1,6 +1,7 @@
1
  from dataclasses import dataclass
2
  from enum import Enum
3
 
 
4
  @dataclass
5
  class Task:
6
  benchmark: str
@@ -11,62 +12,192 @@ class Task:
11
  # Select your tasks here
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- task0 = Task("anli_r1", "acc", "ANLI")
16
- task1 = Task("logiqa", "acc_norm", "LogiQA")
17
-
18
- NUM_FEWSHOT = 0 # Change with your few shot
19
  # ---------------------------------------------------
20
 
21
 
22
-
23
  # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
- Intro text
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?
32
  LLM_BENCHMARKS_TEXT = f"""
33
- ## How it works
34
 
35
- ## Reproducibility
36
- To reproduce our results, here is the commands you can run:
37
 
38
- """
 
 
 
39
 
40
- EVALUATION_QUEUE_TEXT = """
41
- ## Some good practices before submitting a model
42
-
43
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
44
- ```python
45
- from transformers import AutoConfig, AutoModel, AutoTokenizer
46
- config = AutoConfig.from_pretrained("your model name", revision=revision)
47
- model = AutoModel.from_pretrained("your model name", revision=revision)
48
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
49
- ```
50
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
51
 
52
- Note: make sure your model is public!
53
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
 
54
 
55
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
56
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
57
 
58
- ### 3) Make sure your model has an open license!
59
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
 
 
 
60
 
61
- ### 4) Fill up your model card
62
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
63
 
64
- ## In case of model failure
65
- If your model is displayed in the `FAILED` category, its execution stopped.
66
- Make sure you have followed the above steps first.
67
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
68
  """
69
 
70
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
71
72
  """
 
1
  from dataclasses import dataclass
2
  from enum import Enum
3
 
4
+
5
  @dataclass
6
  class Task:
7
  benchmark: str
 
12
  # Select your tasks here
13
  # ---------------------------------------------------
14
  class Tasks(Enum):
15
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
16
+ # General QA tasks
17
+ nq = Task("nq", "exact_match", "NQ")
18
+ triviaqa = Task("triviaqa", "exact_match", "TriviaQA")
19
+ popqa = Task("popqa", "exact_match", "PopQA")
20
+ # Multi-hop QA tasks
21
+ hotpotqa = Task("hotpotqa", "exact_match", "HotpotQA")
22
+ twowiki = Task("2wiki", "exact_match", "2wiki")
23
+ musique = Task("musique", "exact_match", "Musique")
24
+ bamboogle = Task("bamboogle", "exact_match", "Bamboogle")
25
+ fictionalhot = Task("fictionalhot", "exact_match", "FictionalHot")
26
+
27
+
28
+ NUM_FEWSHOT = 0 # Change with your few shot
29
  # ---------------------------------------------------
30
 
31
 
 
32
  # Your leaderboard name
33
+ TITLE = """<h1 align="center" id="space-title">🔍 SearchAgent Leaderboard</h1>"""
34
 
35
  # What does your leaderboard evaluate?
36
  INTRODUCTION_TEXT = """
37
+ # 🔍 SearchAgent Leaderboard
38
+
39
+ This leaderboard evaluates the performance of **search-augmented question answering systems** across various tasks, ranging from simple factual QA to complex multi-hop reasoning. Our evaluation addresses the inconsistency in experimental settings across prior works by providing a standardized comparison framework.
40
+
41
+ ## 📊 Evaluation Tasks
42
+
43
+ We evaluate on a comprehensive set of benchmarks that test different aspects of search-augmented QA:
44
+
45
+ ### General QA (Set A)
46
+ - **NQ**: Natural Questions - QA based on real Google search queries from Wikipedia
47
+ - **TriviaQA**: Trivia questions requiring document-based answer extraction
48
+ - **PopQA**: Popular culture QA testing knowledge breadth and parametric vs. non-parametric memory
49
+
50
+ ### Multi-Hop QA (Set B)
51
+ - **HotpotQA**: Complex QA requiring reasoning across multiple documents with explainable reasoning chains
52
+ - **2wiki**: Multi-hop reasoning based on Wikipedia requiring compositional reasoning
53
+ - **Musique**: Multi-step compositional reasoning QA via single-hop question composition
54
+ - **Bamboogle**: Adversarial search QA designed to test compositionality gaps in language models
55
+
56
+ ### Novel Evaluation: FictionalHot
57
+ - **FictionalHot**: A closed-world benchmark grounding questions in synthetic fictional entities to mitigate data contamination and enable reproducible evaluation. Questions are transformed from real-world scenarios to fictional ones while preserving reasoning structure.
58
+
59
+ ## 🎯 Evaluation Metrics
60
+ Following standardized practices, we primarily use **Exact Match (EM)** as the main metric. A prediction is correct if its normalized string exactly matches any normalized reference answer (with lowercasing, punctuation removal, and whitespace normalization).
61
+
62
  """
63
 
64
  # Which evaluations are you running? how can people reproduce what you have?
65
  LLM_BENCHMARKS_TEXT = f"""
66
+ ## 🔬 Evaluation Methodology
67
 
68
+ This leaderboard addresses the challenge of inconsistent experimental settings in search agent evaluation by providing standardized comparisons. Prior works vary significantly in:
 
69
 
70
+ 1. **Corpora**: From static Wikipedia snapshots (2018, 2019) to live Internet access
71
+ 2. **Test Sets**: Broad evaluation (Set A) vs. focused multi-hop evaluation (Set B)
72
+ 3. **Training Regimes**: No training to multi-dataset fine-tuning approaches
73
+ 4. **Metrics**: Exact Match, F1, Substring matching, and LLM-as-a-judge evaluations
74
 
75
+ ## 📋 Dataset Details & Challenges
76
+
77
+ ### Data Contamination Problem
78
+ A critical issue in current benchmarks is **data contamination**, where high scores may reflect memorized pretraining knowledge rather than genuine procedural reasoning capabilities.
79
+
80
+ ### Our Solution: FictionalHot
81
+ We introduce **FictionalHot**, a novel closed-world benchmark that:
82
+ - Grounds all questions in newly generated synthetic fictional entities
83
+ - Uses a three-step construction pipeline: sampling → GPT-based entity replacement → synthetic document generation
84
+ - Forces models to rely on procedural reasoning over provided documents
85
+ - Enables reproducible evaluation with a fixed knowledge source
86
 
87
+ ### Benchmark Coverage
88
+ - **Corpus**: 2018 Wikipedia snapshot for reproducibility
89
+ - **Retrieval**: Top-k=3 with maximum T=4 tool-use turns per question
90
 
91
+ ## 🔄 Experimental Setup
 
92
 
93
+ Following established practices, we:
94
+ - Fine-tune on unified NQ + HotpotQA training data
95
+ - Evaluate on Qwen2.5-3B-Instruct and Qwen2.5-7B-Instruct models
96
+ - Use E5 embeddings for retrieval backend
97
+ - Apply standard Exact Match evaluation with string normalization
98
 
 
 
99
 
100
+ """
101
+
102
+ EVALUATION_QUEUE_TEXT = """
103
+ ## 📣 Model Submission via Community
104
+
105
+ We now accept submissions via the Space's Community (Discussions). This keeps the process simple and transparent.
106
+
107
+ - Go to the Community tab of this leaderboard Space:
108
+ https://huggingface.co/spaces/TencentBAC/SearchAgent_Leaderboard
109
+ - Create a new Discussion with title:
110
+ `Submission: <YourMethod>-<model_name>-<model_size>`
111
+ - Include the following in the post:
112
+ - Model weights link (HF or GitHub)
113
+ - Short method description
114
+ - Evaluation JSON (inline or attached)
115
+
116
+ Example JSON:
117
+ ```json
118
+ {
119
+ "config": {
120
+ "model_dtype": "torch.float16",
121
+ "model_name": "YourMethod-Qwen2.5-7b-Instruct",
122
+ "model_sha": "main"
123
+ },
124
+ "results": {
125
+ "nq": {"exact_match": 0.45},
126
+ "triviaqa": {"exact_match": 0.62},
127
+ "popqa": {"exact_match": 0.38},
128
+ "hotpotqa": {"exact_match": 0.41},
129
+ "2wiki": {"exact_match": 0.33},
130
+ "musique": {"exact_match": 0.15},
131
+ "bamboogle": {"exact_match": 0.28},
132
+ "fictionalhot": {"exact_match": 0.06}
133
+ }
134
+ }
135
+ ```
136
+
137
+ We will review your post and add your model to the leaderboard.
138
  """
139
 
140
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
141
  CITATION_BUTTON_TEXT = r"""
142
+ % Key Search-Augmented QA Methods
143
+ @article{luo2024search,
144
+ title={Search-o1: Agentic Search-Enhanced Large Reasoning Models},
145
+ author={Xiaoxi Li and Guanting Dong and Jiajie Jin and Yuyao Zhang and Yujia Zhou and Yutao Zhu and Peitian Zhang and Zhicheng Dou},
146
+ journal={arXiv preprint arXiv:2501.05366},
147
+ year={2025}
148
+ }
149
+
150
+ @article{songR1SearcherIncentivizingSearch2025,
151
+ title={R1-Searcher: Incentivizing the Search Capability in LLMs via Reinforcement Learning},
152
+ author={Song, Huatong and Jiang, Jinhao and Min, Yingqian and Chen, Jie and Chen, Zhipeng and Zhao, Wayne Xin and Fang, Lei and Wen, Ji-Rong},
153
+ journal={arXiv preprint arXiv:2503.05592},
154
+ year={2025}
155
+ }
156
+
157
+ @article{jin2025search,
158
+ title={Search-r1: Training llms to reason and leverage search engines with reinforcement learning},
159
+ author={Jin, Bowen and Zeng, Hansi and Yue, Zhenrui and Yoon, Jinsung and Arik, Sercan and Wang, Dong and Zamani, Hamed and Han, Jiawei},
160
+ journal={arXiv preprint arXiv:2503.09516},
161
+ year={2025}
162
+ }
163
+
164
+ @article{sunZeroSearchIncentivizeSearch2025,
165
+ title={ZeroSearch: Incentivize the Search Capability of LLMs without Searching},
166
+ author={Sun, Hao and Qiao, Zile and Guo, Jiayan and Fan, Xuanbo and Hou, Yingyan and Jiang, Yong and Xie, Pengjun and Zhang, Yan and Huang, Fei and Zhou, Jingren},
167
+ journal={arXiv preprint arXiv:2505.04588},
168
+ year={2025}
169
+ }
170
+
171
+ @article{zheng2025deepresearcher,
172
+ title={Deepresearcher: Scaling deep research via reinforcement learning in real-world environments},
173
+ author={Zheng, Yuxiang and Fu, Dayuan and Hu, Xiangkun and Cai, Xiaojie and Ye, Lyumanshan and Lu, Pengrui and Liu, Pengfei},
174
+ journal={arXiv preprint arXiv:2504.03160},
175
+ year={2025}
176
+ }
177
+
178
+ % Benchmark Datasets
179
+ @article{kwiatkowskiNaturalQuestionsBenchmark2019,
180
+ title={Natural Questions: A Benchmark for Question Answering Research},
181
+ author={Kwiatkowski, Tom and Palomaki, Jennimaria and Redfield, Olivia and Collins, Michael and Parikh, Ankur and Alberti, Chris and Epstein, Danielle and Polosukhin, Illia and Devlin, Jacob and Lee, Kenton and others},
182
+ journal={Transactions of the Association for Computational Linguistics},
183
+ volume={7},
184
+ pages={453--466},
185
+ year={2019}
186
+ }
187
+
188
+ @article{yangHotpotQADatasetDiverse2018,
189
+ title={HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering},
190
+ author={Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William and Salakhutdinov, Ruslan and Manning, Christopher D.},
191
+ booktitle={Proceedings of EMNLP},
192
+ year={2018}
193
+ }
194
+
195
+ @article{trivediMuSiQueMultihopQuestions2022,
196
+ title={MuSiQue: Multihop Questions via Single-hop Question Composition},
197
+ author={Trivedi, Harsh and Balasubramanian, Niranjan and Khot, Tushar and Sabharwal, Ashish},
198
+ journal={Transactions of the Association for Computational Linguistics},
199
+ volume={10},
200
+ pages={539--554},
201
+ year={2022}
202
+ }
203
  """
src/display/formatting.py CHANGED
@@ -3,8 +3,28 @@ def model_hyperlink(link, model_name):
 
 
  def make_clickable_model(model_name):
-     link = f"https://huggingface.co/{model_name}"
-     return model_hyperlink(link, model_name)
+     # Custom link mappings for each model
+     custom_links = {
+         "ReSeek-Qwen2.5-7b-Instruct": "https://your-custom-link.com/reseek-7b",
+         "ReSeek-Qwen2.5-3b-Instruct": "https://your-custom-link.com/reseek-3b",
+         "ZeroSearch-Qwen2.5-3b-Instruct": "https://huggingface.co/Alibaba-NLP/ZeroSearch_wiki_V2_Qwen2.5_3B_Instruct",
+         "ZeroSearch-Qwen2.5-7b-Instruct": "https://huggingface.co/Alibaba-NLP/ZeroSearch_wiki_V2_Qwen2.5_7B_Instruct",
+         "Search-R1-Qwen2.5-7b-Instruct": "https://huggingface.co/PeterJinGo/SearchR1-nq_hotpotqa_train-qwen2.5-7b-it-em-ppo",
+         "Search-R1-Qwen2.5-3b-Instruct": "https://huggingface.co/PeterJinGo/SearchR1-nq_hotpotqa_train-qwen2.5-3b-em-grpo",
+         "Search-o1-Qwen2.5-7b-Instruct": "https://github.com/RUC-NLPIR/Search-o1",
+         "RAG-Qwen2.5-7b-Instruct": "",
+         "R1-Qwen2.5-7b-Instruct": "",
+         "SFT-Qwen2.5-7b-Instruct": "",
+         "CoT-Qwen2.5-7b-Instruct": "",
+         "Direct-Inference-Qwen2.5-7b-Instruct": "",
+     }
+
+     if model_name in custom_links:
+         link = custom_links[model_name]
+         return model_hyperlink(link, model_name)
+     else:
+         # If no custom link, just return the model name
+         return model_name
 
 
  def styled_error(error):
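The rewritten `make_clickable_model` only renders a hyperlink when the model appears in the hard-coded mapping; entries mapped to an empty string (the baseline rows) still take the hyperlink branch and produce an anchor with an empty link. A small sketch of the lookup, assuming `model_hyperlink` renders a simple markdown link (the template's real helper, unchanged in this hunk, emits an HTML anchor) and skipping the hyperlink when the link is empty:

```python
def model_hyperlink(link: str, model_name: str) -> str:
    # Assumed markdown-style rendering; the actual helper in formatting.py
    # (outside this hunk) returns an HTML <a> element instead.
    return f"[{model_name}]({link})"


# Trimmed-down version of the custom_links table from this commit.
CUSTOM_LINKS = {
    "Search-R1-Qwen2.5-7b-Instruct":
        "https://huggingface.co/PeterJinGo/SearchR1-nq_hotpotqa_train-qwen2.5-7b-it-em-ppo",
    "RAG-Qwen2.5-7b-Instruct": "",  # baseline without a public weights link
}


def make_clickable_model(model_name: str) -> str:
    link = CUSTOM_LINKS.get(model_name, "")
    # Unlike the committed version, treat an empty link as "no link" so
    # baseline rows fall back to plain text instead of an empty anchor.
    return model_hyperlink(link, model_name) if link else model_name


print(make_clickable_model("Search-R1-Qwen2.5-7b-Instruct"))  # markdown link
print(make_clickable_model("RAG-Qwen2.5-7b-Instruct"))        # plain name
```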
src/display/utils.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
5
 
6
  from src.about import Tasks
7
 
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
10
 
@@ -20,29 +21,23 @@ class ColumnContent:
20
  hidden: bool = False
21
  never_hidden: bool = False
22
 
 
23
  ## Leaderboard columns
24
  auto_eval_column_dict = []
25
  # Init
26
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
- #Scores
29
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
  for task in Tasks:
31
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
  # Model information
33
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
35
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
36
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
37
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
38
- auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
39
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
40
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
41
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
42
 
43
  # We use make dataclass to dynamically fill the scores from Tasks
44
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
45
 
 
46
  ## For the queue columns in the submission tab
47
  @dataclass(frozen=True)
48
  class EvalQueueColumn: # Queue column
@@ -53,12 +48,13 @@ class EvalQueueColumn: # Queue column
53
  weight_type = ColumnContent("weight_type", "str", "Original")
54
  status = ColumnContent("status", "str", True)
55
 
 
56
  ## All the model information that we might need
57
  @dataclass
58
  class ModelDetails:
59
  name: str
60
  display_name: str = ""
61
- symbol: str = "" # emoji
62
 
63
 
64
  class ModelType(Enum):
@@ -83,11 +79,13 @@ class ModelType(Enum):
83
  return ModelType.IFT
84
  return ModelType.Unknown
85
 
 
86
  class WeightType(Enum):
87
  Adapter = ModelDetails("Adapter")
88
  Original = ModelDetails("Original")
89
  Delta = ModelDetails("Delta")
90
 
 
91
  class Precision(Enum):
92
  float16 = ModelDetails("float16")
93
  bfloat16 = ModelDetails("bfloat16")
@@ -100,6 +98,7 @@ class Precision(Enum):
100
  return Precision.bfloat16
101
  return Precision.Unknown
102
 
 
103
  # Column selection
104
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
105
 
@@ -107,4 +106,3 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
107
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
108
 
109
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
110
-
 
5
 
6
  from src.about import Tasks
7
 
8
+
9
  def fields(raw_class):
10
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
11
 
 
21
  hidden: bool = False
22
  never_hidden: bool = False
23
 
24
+
25
  ## Leaderboard columns
26
  auto_eval_column_dict = []
27
  # Init
28
+ auto_eval_column_dict.append(["rank", ColumnContent, ColumnContent("Rank", "number", True, never_hidden=True)])
29
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
30
+ # Scores
31
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
32
  for task in Tasks:
33
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
34
  # Model information
35
+ auto_eval_column_dict.append(["model_size", ColumnContent, ColumnContent("Model Size", "str", True)])
36
 
37
  # We use make dataclass to dynamically fill the scores from Tasks
38
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
39
 
40
+
41
  ## For the queue columns in the submission tab
42
  @dataclass(frozen=True)
43
  class EvalQueueColumn: # Queue column
 
48
  weight_type = ColumnContent("weight_type", "str", "Original")
49
  status = ColumnContent("status", "str", True)
50
 
51
+
52
  ## All the model information that we might need
53
  @dataclass
54
  class ModelDetails:
55
  name: str
56
  display_name: str = ""
57
+ symbol: str = "" # emoji
58
 
59
 
60
  class ModelType(Enum):
 
79
  return ModelType.IFT
80
  return ModelType.Unknown
81
 
82
+
83
  class WeightType(Enum):
84
  Adapter = ModelDetails("Adapter")
85
  Original = ModelDetails("Original")
86
  Delta = ModelDetails("Delta")
87
 
88
+
89
  class Precision(Enum):
90
  float16 = ModelDetails("float16")
91
  bfloat16 = ModelDetails("bfloat16")
 
98
  return Precision.bfloat16
99
  return Precision.Unknown
100
 
101
+
102
  # Column selection
103
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
104
 
 
106
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
107
 
108
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
src/envs.py CHANGED
@@ -6,7 +6,7 @@ from huggingface_hub import HfApi
  # ----------------------------------
  TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 
- OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+ OWNER = "searchagent-leaderboard" # Change to your org - don't forget to create a results and request dataset, with the correct format!
  # ----------------------------------
 
  REPO_ID = f"{OWNER}/leaderboard"
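Repointing the Space is a one-line change because the repo identifiers in `envs.py` are derived from `OWNER` with f-strings. Only `REPO_ID` is visible in this hunk, so the queue/results dataset names and cache paths in the sketch below are assumptions carried over from the stock template rather than values confirmed by this diff:

```python
import os

OWNER = "searchagent-leaderboard"   # org that hosts the Space and its datasets

REPO_ID = f"{OWNER}/leaderboard"    # shown in this hunk
# Assumed template-style dataset ids; adjust to whatever datasets actually exist.
QUEUE_REPO = f"{OWNER}/requests"
RESULTS_REPO = f"{OWNER}/results"

# Local cache directories that app.py now creates with os.makedirs in demo mode.
CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
```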
src/leaderboard/read_evals.py CHANGED
@@ -14,22 +14,22 @@ from src.submission.check_validity import is_model_on_hub
14
 
15
  @dataclass
16
  class EvalResult:
17
- """Represents one full evaluation. Built from a combination of the result and request file for a given run.
18
- """
19
- eval_name: str # org_model_precision (uid)
20
- full_model: str # org/model (path on hub)
21
- org: str
22
  model: str
23
- revision: str # commit hash, "" if main
24
  results: dict
25
  precision: Precision = Precision.Unknown
26
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
27
- weight_type: WeightType = WeightType.Original # Original or Adapter
28
- architecture: str = "Unknown"
29
  license: str = "?"
30
  likes: int = 0
31
  num_params: int = 0
32
- date: str = "" # submission date of request file
33
  still_on_hub: bool = False
34
 
35
  @classmethod
@@ -57,9 +57,12 @@ class EvalResult:
57
  result_key = f"{org}_{model}_{precision.value.name}"
58
  full_model = "/".join(org_and_model)
59
 
60
- still_on_hub, _, model_config = is_model_on_hub(
61
- full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
- )
 
 
 
63
  architecture = "?"
64
  if model_config is not None:
65
  architectures = getattr(model_config, "architectures", None)
@@ -85,10 +88,10 @@ class EvalResult:
85
  org=org,
86
  model=model,
87
  results=results,
88
- precision=precision,
89
- revision= config.get("model_sha", ""),
90
  still_on_hub=still_on_hub,
91
- architecture=architecture
92
  )
93
 
94
  def update_with_request_file(self, requests_path):
@@ -109,25 +112,25 @@ class EvalResult:
109
 
110
  def to_dict(self):
111
  """Converts the Eval Result to a dict compatible with our dataframe display"""
112
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
113
  data_dict = {
114
  "eval_name": self.eval_name, # not a column, just a save name,
115
- AutoEvalColumn.precision.name: self.precision.value.name,
116
- AutoEvalColumn.model_type.name: self.model_type.value.name,
117
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
118
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
119
- AutoEvalColumn.architecture.name: self.architecture,
120
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
121
- AutoEvalColumn.revision.name: self.revision,
122
  AutoEvalColumn.average.name: average,
123
- AutoEvalColumn.license.name: self.license,
124
- AutoEvalColumn.likes.name: self.likes,
125
- AutoEvalColumn.params.name: self.num_params,
126
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
127
  }
128
 
129
  for task in Tasks:
130
- data_dict[task.value.col_name] = self.results[task.value.benchmark]
131
 
132
  return data_dict
133
 
@@ -146,10 +149,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
146
  for tmp_request_file in request_files:
147
  with open(tmp_request_file, "r") as f:
148
  req_content = json.load(f)
149
- if (
150
- req_content["status"] in ["FINISHED"]
151
- and req_content["precision"] == precision.split(".")[-1]
152
- ):
153
  request_file = tmp_request_file
154
  return request_file
155
 
@@ -188,7 +188,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
188
  results = []
189
  for v in eval_results.values():
190
  try:
191
- v.to_dict() # we test if the dict version is complete
192
  results.append(v)
193
  except KeyError: # not all eval values present
194
  continue
 
14
 
15
  @dataclass
16
  class EvalResult:
17
+ """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
18
+
19
+ eval_name: str # org_model_precision (uid)
20
+ full_model: str # org/model (path on hub)
21
+ org: str
22
  model: str
23
+ revision: str # commit hash, "" if main
24
  results: dict
25
  precision: Precision = Precision.Unknown
26
+ model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
27
+ weight_type: WeightType = WeightType.Original # Original or Adapter
28
+ architecture: str = "Unknown"
29
  license: str = "?"
30
  likes: int = 0
31
  num_params: int = 0
32
+ date: str = "" # submission date of request file
33
  still_on_hub: bool = False
34
 
35
  @classmethod
 
57
  result_key = f"{org}_{model}_{precision.value.name}"
58
  full_model = "/".join(org_and_model)
59
 
60
+ # For demo purposes, assume models are available
61
+ still_on_hub = True
62
+ model_config = None
63
+ # still_on_hub, _, model_config = is_model_on_hub(
64
+ # full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
65
+ # )
66
  architecture = "?"
67
  if model_config is not None:
68
  architectures = getattr(model_config, "architectures", None)
 
88
  org=org,
89
  model=model,
90
  results=results,
91
+ precision=precision,
92
+ revision=config.get("model_sha", ""),
93
  still_on_hub=still_on_hub,
94
+ architecture=architecture,
95
  )
96
 
97
  def update_with_request_file(self, requests_path):
 
112
 
113
  def to_dict(self):
114
  """Converts the Eval Result to a dict compatible with our dataframe display"""
115
+ valid_results = [v for v in self.results.values() if v is not None]
116
+ average = sum(valid_results) / len(valid_results) if valid_results else 0
117
+ # Extract model size from model name
118
+ model_size = "Unknown"
119
+ if "3b" in self.full_model.lower():
120
+ model_size = "3B"
121
+ elif "7b" in self.full_model.lower():
122
+ model_size = "7B"
123
+
124
  data_dict = {
125
  "eval_name": self.eval_name, # not a column, just a save name,
126
+ AutoEvalColumn.rank.name: 0, # Will be set later based on average ranking
 
 
 
 
127
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
128
+ AutoEvalColumn.model_size.name: model_size,
129
  AutoEvalColumn.average.name: average,
 
 
 
 
130
  }
131
 
132
  for task in Tasks:
133
+ data_dict[task.value.col_name] = self.results.get(task.value.benchmark, None)
134
 
135
  return data_dict
136
 
 
149
  for tmp_request_file in request_files:
150
  with open(tmp_request_file, "r") as f:
151
  req_content = json.load(f)
152
+ if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
 
 
 
153
  request_file = tmp_request_file
154
  return request_file
155
 
 
188
  results = []
189
  for v in eval_results.values():
190
  try:
191
+ v.to_dict() # we test if the dict version is complete
192
  results.append(v)
193
  except KeyError: # not all eval values present
194
  continue
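Two behavioural changes in `EvalResult.to_dict` are easy to miss in the diff: the average now skips missing task scores instead of always dividing by `len(Tasks)`, and the new Model Size column is inferred from the model name with a substring check. A standalone sketch of both (function names are illustrative, not taken from the file):

```python
from typing import Dict, Optional


def infer_model_size(full_model: str) -> str:
    """Crude size bucket from the model name, as in EvalResult.to_dict.

    The heuristic checks "3b" first, so a hypothetical "13b" model would also
    land in the 3B bucket; fine for the current Qwen2.5 3B/7B lineup.
    """
    name = full_model.lower()
    if "3b" in name:
        return "3B"
    if "7b" in name:
        return "7B"
    return "Unknown"


def null_safe_average(results: Dict[str, Optional[float]]) -> float:
    """Mean over the scores that are present; 0 when nothing is reported."""
    valid = [v for v in results.values() if v is not None]
    return sum(valid) / len(valid) if valid else 0.0


print(infer_model_size("ReSeek-Qwen2.5-7b-Instruct"))                    # 7B
print(null_safe_average({"nq": 0.45, "hotpotqa": None, "2wiki": 0.33}))  # 0.39
```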
src/populate.py CHANGED
@@ -15,6 +15,10 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
 
  df = pd.DataFrame.from_records(all_data_json)
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+
+ # Add ranking based on average score
+ df[AutoEvalColumn.rank.name] = range(1, len(df) + 1)
+
  df = df[cols].round(decimals=2)
 
  # filter out if any of the benchmarks have not been produced
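Because `get_leaderboard_df` sorts by the Average column in descending order before assigning ranks, `range(1, len(df) + 1)` numbers the best model 1. A toy pandas example of that sort-then-rank step (model names and scores are made up purely for illustration):

```python
import pandas as pd

AVERAGE_COL = "Average ⬆️"  # display name used by AutoEvalColumn.average
RANK_COL = "Rank"

# Toy rows; real values come from the eval-results JSON files.
df = pd.DataFrame({
    "Model": ["model-a", "model-b", "model-c"],
    AVERAGE_COL: [0.27, 0.41, 0.38],
})

# Sort best-first, then number the rows 1..N, as get_leaderboard_df now does.
df = df.sort_values(by=[AVERAGE_COL], ascending=False)
df[RANK_COL] = range(1, len(df) + 1)

# model-b gets rank 1, model-c rank 2, model-a rank 3.
print(df[[RANK_COL, "Model", AVERAGE_COL]].to_string(index=False))
```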