from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    # General QA tasks
    nq = Task("nq", "exact_match", "NQ")
    triviaqa = Task("triviaqa", "exact_match", "TriviaQA")
    popqa = Task("popqa", "exact_match", "PopQA")

    # Multi-hop QA tasks
    hotpotqa = Task("hotpotqa", "exact_match", "HotpotQA")
    twowiki = Task("2wiki", "exact_match", "2wiki")
    musique = Task("musique", "exact_match", "Musique")
    bamboogle = Task("bamboogle", "exact_match", "Bamboogle")
    fictionalhot = Task("fictionalhot", "exact_match", "FictionalHot")


NUM_FEWSHOT = 0  # Change with your few shot
# ---------------------------------------------------
# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">SearchAgent Leaderboard</h1>"""
# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
# SearchAgent Leaderboard
This leaderboard evaluates the performance of **search-augmented question answering systems** across various tasks, ranging from simple factual QA to complex multi-hop reasoning. Our evaluation addresses the inconsistency in experimental settings across prior works by providing a standardized comparison framework.
## Evaluation Tasks
We evaluate on a comprehensive set of benchmarks that test different aspects of search-augmented QA:
### General QA
- **NQ** (Natural Questions): QA built from real Google search queries, with answers drawn from Wikipedia.
- **TriviaQA**: A large-scale dataset of questions from trivia websites and competitions, featuring complex entity relationships.
- **PopQA**: A large-scale open-domain, entity-centric QA dataset (14k QA pairs), with questions generated by templating Wikidata knowledge tuples.
### Multi-Hop QA
- **HotpotQA**: The first large-scale dataset requiring reasoning across multiple Wikipedia paragraphs.
- **2Wiki**: A multi-hop QA dataset with explicit, annotated reasoning paths.
- **Musique**: A multi-hop QA benchmark of 2–4-hop questions constructed from five single-hop datasets.
- **Bamboogle**: A complex, cross-domain question set curated from queries that Google answers incorrectly, designed to evaluate models' compositional reasoning.
### Novel Evaluation: FictionalHot
- **FictionalHot**: A closed-world benchmark grounding questions in synthetic fictional entities to mitigate data contamination and enable reproducible evaluation. Questions are transformed from real-world scenarios to fictional ones while preserving reasoning structure.
## Evaluation Metrics
Following standardized practices, we primarily use **Exact Match (EM)** as the main metric. A prediction is correct if its normalized string exactly matches any normalized reference answer (with lowercasing, punctuation removal, and whitespace normalization).
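As a rough illustration of this metric (a minimal sketch that assumes only the normalization steps listed above; the official scoring script may differ in details):
```python
import string

def normalize(text: str) -> str:
    # Lowercase, strip punctuation, and collapse whitespace.
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    return " ".join(text.split())

def exact_match(prediction: str, references: list[str]) -> bool:
    # Correct if the normalized prediction matches any normalized reference.
    return normalize(prediction) in {normalize(ref) for ref in references}
```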
"""
# Which evaluations are you running? How can people reproduce what you have?
LLM_BENCHMARKS_TEXT = """
## Evaluation Methodology
This leaderboard addresses the challenge of inconsistent experimental settings in search agent evaluation by providing standardized comparisons. Prior works vary significantly in:
1. **Corpora**: From static Wikipedia snapshots (2018, 2019) to live Internet access
2. **Test Sets**: Broad evaluation vs. focused multi-hop evaluation
3. **Training Regimes**: No training to multi-dataset fine-tuning approaches
4. **Metrics**: Exact Match, F1, substring matching, and LLM-as-a-judge evaluations
## Dataset Details & Challenges
### Data Contamination Problem
A critical issue in current benchmarks is **data contamination**, where high scores may reflect memorized pretraining knowledge rather than genuine procedural reasoning capabilities.
### Our Solution: FictionalHot
We introduce **FictionalHot**, a novel closed-world benchmark that:
- Grounds all questions in newly generated synthetic fictional entities
- Uses a three-step construction pipeline: sampling → GPT-based entity replacement → synthetic document generation (sketched below)
- Forces models to rely on procedural reasoning over provided documents
- Enables reproducible evaluation with a fixed knowledge source
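As a rough, hypothetical sketch of this three-step construction (the callables `llm_rewrite` and `llm_generate_doc` are illustrative placeholders for the GPT-based steps, not the actual implementation):
```python
def build_fictionalhot(real_qa_pairs, llm_rewrite, llm_generate_doc):
    # Illustrative sketch only; the real FictionalHot pipeline may differ in details.
    # real_qa_pairs: (question, answer) pairs sampled from existing multi-hop datasets
    # llm_rewrite: maps a real QA pair to one grounded in fictional entities
    # llm_generate_doc: writes a synthetic supporting document for the fictional pair
    examples = []
    for question, answer in real_qa_pairs:                          # step 1: sampling
        fict_question, fict_answer = llm_rewrite(question, answer)  # step 2: entity replacement
        document = llm_generate_doc(fict_question, fict_answer)     # step 3: synthetic document
        examples.append({"question": fict_question, "answer": fict_answer, "document": document})
    return examples
```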
### Benchmark Coverage
- **Corpus**: 2018 Wikipedia snapshot for reproducibility
- **Retrieval**: Top-k = 3 retrieved documents, with a maximum of T = 4 tool-use turns per question
## Experimental Setup
Following established practices, we:
- Fine-tune on unified NQ + HotpotQA training data
- Evaluate on Qwen2.5-3B-Instruct and Qwen2.5-7B-Instruct models
- Use E5 embeddings as the retrieval backend (see the sketch below)
- Apply standard Exact Match evaluation with string normalization
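A minimal sketch of the retrieval side of this setup, assuming the `intfloat/e5-base-v2` checkpoint served through `sentence-transformers` for illustration (the deployed index and serving code may differ):
```python
from sentence_transformers import SentenceTransformer, util

encoder = SentenceTransformer("intfloat/e5-base-v2")

def retrieve(query, passages, passage_embeddings, top_k=3):
    # E5 expects "query: " / "passage: " prefixes; top_k=3 mirrors the setting above.
    query_embedding = encoder.encode("query: " + query, normalize_embeddings=True)
    hits = util.semantic_search(query_embedding, passage_embeddings, top_k=top_k)[0]
    return [passages[hit["corpus_id"]] for hit in hits]

passages = ["Paris is the capital of France.", "Berlin is the capital of Germany."]
passage_embeddings = encoder.encode(["passage: " + p for p in passages], normalize_embeddings=True)
print(retrieve("What is the capital of France?", passages, passage_embeddings))
```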
"""
EVALUATION_QUEUE_TEXT = """
## Model Submission via Community
We now accept submissions via the Space's Community (Discussions). This keeps the process simple and transparent.
- Go to the Community tab of this leaderboard Space:
https://huggingface.co/spaces/TencentBAC/SearchAgent_Leaderboard
- Create a new Discussion with title:
`Submission: <YourMethod>-<model_name>-<model_size>`
- Include the following in the post:
- Model weights link (HF or GitHub)
- Short method description
- Evaluation JSON (inline or attached)
Example JSON:
```json
{
"config": {
"model_dtype": "torch.float16",
"model_name": "YourMethod-Qwen2.5-7b-Instruct",
"model_sha": "main"
},
"results": {
"nq": {"exact_match": 0.45},
"triviaqa": {"exact_match": 0.62},
"popqa": {"exact_match": 0.38},
"hotpotqa": {"exact_match": 0.41},
"2wiki": {"exact_match": 0.33},
"musique": {"exact_match": 0.15},
"bamboogle": {"exact_match": 0.28},
"fictionalhot": {"exact_match": 0.06}
}
}
```
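If convenient, a snippet along these lines can produce the file (the task keys match the leaderboard; the scores and the `results.json` filename are placeholders):
```python
import json

submission = {
    "config": {
        "model_dtype": "torch.float16",
        "model_name": "YourMethod-Qwen2.5-7b-Instruct",
        "model_sha": "main",
    },
    # Replace the placeholder 0.0 scores with your measured Exact Match values.
    "results": {task: {"exact_match": 0.0} for task in [
        "nq", "triviaqa", "popqa", "hotpotqa", "2wiki",
        "musique", "bamboogle", "fictionalhot",
    ]},
}

with open("results.json", "w") as f:
    json.dump(submission, f, indent=2)
```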
We will review your post and add your model to the leaderboard.
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
% Key Search-Augmented QA Methods
@article{li2025reseek,
title={ReSeek: A Self-Correcting Framework for Search Agents with Instructive Rewards},
author={Li, Shiyu and Tang, Yang and Wang, Yifan and Li, Peiming and Chen, Xi},
journal={arXiv preprint arXiv:2510.00568},
year={2025},
}
@article{li2025searcho1,
title={Search-o1: Agentic Search-Enhanced Large Reasoning Models},
author={Li, Xiaoxi and Dong, Guanting and Jin, Jiajie and Zhang, Yuyao and Zhou, Yujia and Zhu, Yutao and Zhang, Peitian and Dou, Zhicheng},
journal={arXiv preprint arXiv:2501.05366},
year={2025}
}
@article{songR1SearcherIncentivizingSearch2025,
title={R1-Searcher: Incentivizing the Search Capability in LLMs via Reinforcement Learning},
author={Song, Huatong and Jiang, Jinhao and Min, Yingqian and Chen, Jie and Chen, Zhipeng and Zhao, Wayne Xin and Fang, Lei and Wen, Ji-Rong},
journal={arXiv preprint arXiv:2503.05592},
year={2025}
}
@article{jin2025search,
title={Search-R1: Training LLMs to Reason and Leverage Search Engines with Reinforcement Learning},
author={Jin, Bowen and Zeng, Hansi and Yue, Zhenrui and Yoon, Jinsung and Arik, Sercan and Wang, Dong and Zamani, Hamed and Han, Jiawei},
journal={arXiv preprint arXiv:2503.09516},
year={2025}
}
@article{sunZeroSearchIncentivizeSearch2025,
title={ZeroSearch: Incentivize the Search Capability of LLMs without Searching},
author={Sun, Hao and Qiao, Zile and Guo, Jiayan and Fan, Xuanbo and Hou, Yingyan and Jiang, Yong and Xie, Pengjun and Zhang, Yan and Huang, Fei and Zhou, Jingren},
journal={arXiv preprint arXiv:2505.04588},
year={2025}
}
@article{zheng2025deepresearcher,
title={DeepResearcher: Scaling Deep Research via Reinforcement Learning in Real-world Environments},
author={Zheng, Yuxiang and Fu, Dayuan and Hu, Xiangkun and Cai, Xiaojie and Ye, Lyumanshan and Lu, Pengrui and Liu, Pengfei},
journal={arXiv preprint arXiv:2504.03160},
year={2025}
}
% Benchmark Datasets
@article{kwiatkowskiNaturalQuestionsBenchmark2019,
title={Natural Questions: A Benchmark for Question Answering Research},
author={Kwiatkowski, Tom and Palomaki, Jennimaria and Redfield, Olivia and Collins, Michael and Parikh, Ankur and Alberti, Chris and Epstein, Danielle and Polosukhin, Illia and Devlin, Jacob and Lee, Kenton and others},
journal={Transactions of the Association for Computational Linguistics},
volume={7},
pages={453--466},
year={2019}
}
@inproceedings{yangHotpotQADatasetDiverse2018,
title={HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering},
author={Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William and Salakhutdinov, Ruslan and Manning, Christopher D.},
booktitle={Proceedings of EMNLP},
year={2018}
}
@article{trivediMuSiQueMultihopQuestions2022,
title={MuSiQue: Multihop Questions via Single-hop Question Composition},
author={Trivedi, Harsh and Balasubramanian, Niranjan and Khot, Tushar and Sabharwal, Ashish},
journal={Transactions of the Association for Computational Linguistics},
volume={10},
pages={539--554},
year={2022}
}
"""