from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    # General QA tasks
    nq = Task("nq", "exact_match", "NQ")
    triviaqa = Task("triviaqa", "exact_match", "TriviaQA")
    popqa = Task("popqa", "exact_match", "PopQA")
    # Multi-hop QA tasks
    hotpotqa = Task("hotpotqa", "exact_match", "HotpotQA")
    twowiki = Task("2wiki", "exact_match", "2wiki")
    musique = Task("musique", "exact_match", "Musique")
    bamboogle = Task("bamboogle", "exact_match", "Bamboogle")
    fictionalhot = Task("fictionalhot", "exact_match", "FictionalHot")


NUM_FEWSHOT = 0  # Change with your few shot
# ---------------------------------------------------


# Your leaderboard name
TITLE = """

πŸ” SearchAgent Leaderboard

""" # What does your leaderboard evaluate? INTRODUCTION_TEXT = """ # πŸ” SearchAgent Leaderboard This leaderboard evaluates the performance of **search-augmented question answering systems** across various tasks, ranging from simple factual QA to complex multi-hop reasoning. Our evaluation addresses the inconsistency in experimental settings across prior works by providing a standardized comparison framework. ## πŸ“Š Evaluation Tasks We evaluate on a comprehensive set of benchmarks that test different aspects of search-augmented QA: ### General QA - **NQ**: Natural Questions - QA based on real Google search queries from Wikipedia - **TriviaQA**: large-scale dataset with questions from trivia websites and competitions, featuring complex entity relationships - **PopQA**: A large-scale open-domain, entity-centric QA dataset (14k QA pairs), with questions generated by templating Wikidata knowledge tuples. ### Multi-Hop QA - **HotpotQA**: the first large-scale dataset requiring reasoning across multiple Wikipedia paragraphs. - **2Wiki**: A multi-hop QA dataset with explicit, annotated reasoning paths. - **Musique**: A multi-hop QA benchmark of 2–4-hop questions constructed from five single-hop datasets. - **Bamboogle**: A complex, cross-domain question set curated from queries that Google answers incorrectly to evaluate models’ compositional reasoning. ### Novel Evaluation: FictionalHot - **FictionalHot**: A closed-world benchmark grounding questions in synthetic fictional entities to mitigate data contamination and enable reproducible evaluation. Questions are transformed from real-world scenarios to fictional ones while preserving reasoning structure. ## 🎯 Evaluation Metrics Following standardized practices, we primarily use **Exact Match (EM)** as the main metric. A prediction is correct if its normalized string exactly matches any normalized reference answer (with lowercasing, punctuation removal, and whitespace normalization). """ # Which evaluations are you running? how can people reproduce what you have? LLM_BENCHMARKS_TEXT = f""" ## πŸ”¬ Evaluation Methodology This leaderboard addresses the challenge of inconsistent experimental settings in search agent evaluation by providing standardized comparisons. Prior works vary significantly in: 1. **Corpora**: From static Wikipedia snapshots (2018, 2019) to live Internet access 2. **Test Sets**: Broad evaluation vs. focused multi-hop evaluation 3. **Training Regimes**: No training to multi-dataset fine-tuning approaches 4. **Metrics**: Exact Match, F1, Substring matching, and LLM-as-a-judge evaluations ## πŸ“‹ Dataset Details & Challenges ### Data Contamination Problem A critical issue in current benchmarks is **data contamination**, where high scores may reflect memorized pretraining knowledge rather than genuine procedural reasoning capabilities. 
"""

# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
## 🔬 Evaluation Methodology

This leaderboard addresses the challenge of inconsistent experimental settings in search agent evaluation by providing standardized comparisons. Prior works vary significantly in:

1. **Corpora**: From static Wikipedia snapshots (2018, 2019) to live Internet access
2. **Test Sets**: Broad evaluation vs. focused multi-hop evaluation
3. **Training Regimes**: From no training to multi-dataset fine-tuning
4. **Metrics**: Exact Match, F1, substring matching, and LLM-as-a-judge evaluations

## 📋 Dataset Details & Challenges

### Data Contamination Problem
A critical issue in current benchmarks is **data contamination**: high scores may reflect memorized pretraining knowledge rather than genuine procedural reasoning capabilities.

### Our Solution: FictionalHot
We introduce **FictionalHot**, a novel closed-world benchmark that:
- Grounds all questions in newly generated synthetic fictional entities
- Uses a three-step construction pipeline: sampling → GPT-based entity replacement → synthetic document generation
- Forces models to rely on procedural reasoning over provided documents
- Enables reproducible evaluation with a fixed knowledge source

### Benchmark Coverage
- **Corpus**: 2018 Wikipedia snapshot for reproducibility
- **Retrieval**: Top-k=3 with a maximum of T=4 tool-use turns per question

## 🔄 Experimental Setup

Following established practices, we:
- Fine-tune on unified NQ + HotpotQA training data
- Evaluate on Qwen2.5-3B-Instruct and Qwen2.5-7B-Instruct models
- Use E5 embeddings as the retrieval backend
- Apply standard Exact Match evaluation with string normalization
"""

EVALUATION_QUEUE_TEXT = """
## 📣 Model Submission via Community

We now accept submissions via the Space's Community (Discussions). This keeps the process simple and transparent.

- Go to the Community tab of this leaderboard Space: https://huggingface.co/spaces/TencentBAC/SearchAgent_Leaderboard
- Create a new Discussion with the title `Submission: --`
- Include the following in the post:
  - Model weights link (HF or GitHub)
  - Short method description
  - Evaluation JSON (inline or attached)

Example JSON:
```json
{
  "config": {
    "model_dtype": "torch.float16",
    "model_name": "YourMethod-Qwen2.5-7b-Instruct",
    "model_sha": "main"
  },
  "results": {
    "nq": {"exact_match": 0.45},
    "triviaqa": {"exact_match": 0.62},
    "popqa": {"exact_match": 0.38},
    "hotpotqa": {"exact_match": 0.41},
    "2wiki": {"exact_match": 0.33},
    "musique": {"exact_match": 0.15},
    "bamboogle": {"exact_match": 0.28},
    "fictionalhot": {"exact_match": 0.06}
  }
}
```

We will review your post and add your model to the leaderboard.
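Before posting, you can optionally sanity-check your JSON with a short script such as the sketch below (illustrative only; `results.json` is a placeholder for your own file name):

```python
import json

EXPECTED_TASKS = [
    "nq", "triviaqa", "popqa", "hotpotqa",
    "2wiki", "musique", "bamboogle", "fictionalhot",
]

with open("results.json") as f:  # placeholder path; point this at your own file
    submission = json.load(f)

assert "config" in submission and "results" in submission
for task in EXPECTED_TASKS:
    score = submission["results"][task]["exact_match"]
    assert 0.0 <= score <= 1.0, f"{task}: exact_match should be a fraction in [0, 1]"
print("Submission JSON looks well-formed.")
```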
""" CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results" CITATION_BUTTON_TEXT = r""" % Key Search-Augmented QA Methods @article{li2025reseek, title={ReSeek: A Self-Correcting Framework for Search Agents with Instructive Rewards}, author={Li, Shiyu and Tang, Yang and Wang, Yifan and Li, Peiming and Chen, Xi}, journal={arXiv preprint arXiv:2510.00568}, year={2025}, } @article{luo2024search, title={Search-o1: Agentic Search-Enhanced Large Reasoning Models}, author={Xiaoxi Li and Guanting Dong and Jiajie Jin and Yuyao Zhang and Yujia Zhou and Yutao Zhu and Peitian Zhang and Zhicheng Dou}, journal={arXiv preprint arXiv:2501.05366}, year={2025} } @article{songR1SearcherIncentivizingSearch2025, title={R1-Searcher: Incentivizing the Search Capability in LLMs via Reinforcement Learning}, author={Song, Huatong and Jiang, Jinhao and Min, Yingqian and Chen, Jie and Chen, Zhipeng and Zhao, Wayne Xin and Fang, Lei and Wen, Ji-Rong}, journal={arXiv preprint arXiv:2503.05592}, year={2025} } @article{jin2025search, title={Search-r1: Training llms to reason and leverage search engines with reinforcement learning}, author={Jin, Bowen and Zeng, Hansi and Yue, Zhenrui and Yoon, Jinsung and Arik, Sercan and Wang, Dong and Zamani, Hamed and Han, Jiawei}, journal={arXiv preprint arXiv:2503.09516}, year={2025} } @article{sunZeroSearchIncentivizeSearch2025, title={ZeroSearch: Incentivize the Search Capability of LLMs without Searching}, author={Sun, Hao and Qiao, Zile and Guo, Jiayan and Fan, Xuanbo and Hou, Yingyan and Jiang, Yong and Xie, Pengjun and Zhang, Yan and Huang, Fei and Zhou, Jingren}, journal={arXiv preprint arXiv:2505.04588}, year={2025} } @article{zheng2025deepresearcher, title={Deepresearcher: Scaling deep research via reinforcement learning in real-world environments}, author={Zheng, Yuxiang and Fu, Dayuan and Hu, Xiangkun and Cai, Xiaojie and Ye, Lyumanshan and Lu, Pengrui and Liu, Pengfei}, journal={arXiv preprint arXiv:2504.03160}, year={2025} } % Benchmark Datasets @article{kwiatkowskiNaturalQuestionsBenchmark2019, title={Natural Questions: A Benchmark for Question Answering Research}, author={Kwiatkowski, Tom and Palomaki, Jennimaria and Redfield, Olivia and Collins, Michael and Parikh, Ankur and Alberti, Chris and Epstein, Danielle and Polosukhin, Illia and Devlin, Jacob and Lee, Kenton and others}, journal={Transactions of the Association for Computational Linguistics}, volume={7}, pages={453--466}, year={2019} } @article{yangHotpotQADatasetDiverse2018, title={HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering}, author={Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William and Salakhutdinov, Ruslan and Manning, Christopher D.}, booktitle={Proceedings of EMNLP}, year={2018} } @article{trivediMuSiQueMultihopQuestions2022, title={MuSiQue: Multihop Questions via Single-hop Question Composition}, author={Trivedi, Harsh and Balasubramanian, Niranjan and Khot, Tushar and Sabharwal, Ashish}, journal={Transactions of the Association for Computational Linguistics}, volume={10}, pages={539--554}, year={2022} } """