Spaces:
Sleeping
Sleeping
shyuli
committed on
Commit
·
aaebb00
1
Parent(s):
1440c3c
add results
Browse files
- .gitignore +0 -4
- eval-queue/CoT/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json +12 -0
- eval-queue/Direct-Inference/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json +12 -0
- eval-queue/PeterJinGo/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json +13 -0
- eval-queue/R1/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json +12 -0
- eval-queue/RAG/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json +12 -0
- eval-queue/ReSeek/Qwen2.5-3b-Instruct_eval_request_False_float16_Original.json +12 -0
- eval-queue/ReSeek/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json +12 -0
- eval-queue/SFT/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json +12 -0
- eval-queue/Search-R1/Qwen2.5-3b-Instruct_eval_request_False_float16_Original.json +12 -0
- eval-queue/Search-o1/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json +12 -0
- eval-queue/ZeroSearch/Qwen2.5-3b-Instruct_eval_request_False_float16_Original.json +12 -0
- eval-queue/ZeroSearch/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json +12 -0
- eval-queue/pending_eval_request.json +12 -0
- eval-results/CoT_Qwen2.5-7b-Instruct.json +33 -0
- eval-results/Direct_Inference_Qwen2.5-7b-Instruct.json +33 -0
- eval-results/R1_Qwen2.5-7b-Instruct.json +33 -0
- eval-results/RAG_Qwen2.5-7b-Instruct.json +33 -0
- eval-results/ReSeek_Qwen2.5-3b-Instruct.json +33 -0
- eval-results/ReSeek_Qwen2.5-7b-Instruct.json +33 -0
- eval-results/SFT_Qwen2.5-7b-Instruct.json +33 -0
- eval-results/Search-R1_Qwen2.5-3b-Instruct.json +33 -0
- eval-results/Search-R1_Qwen2.5-7b-Instruct.json +33 -0
- eval-results/Search-o1_Qwen2.5-7b-Instruct.json +33 -0
- eval-results/ZeroSearch_Qwen2.5-3b-Instruct.json +33 -0
- eval-results/ZeroSearch_Qwen2.5-7b-Instruct.json +33 -0
- eval-results/demo-leaderboard/gpt2-demo/results_2023-11-21T18-10-08.json +15 -0
- eval-results/demo-leaderboard/gpt2-demo/results_2023-11-22 15:46:20.425378.json +33 -0
.gitignore
CHANGED
|
@@ -6,8 +6,4 @@ __pycache__/
|
|
| 6 |
*ipynb
|
| 7 |
.vscode/
|
| 8 |
|
| 9 |
-
eval-queue/
|
| 10 |
-
eval-results/
|
| 11 |
-
eval-queue-bk/
|
| 12 |
-
eval-results-bk/
|
| 13 |
logs/
|
|
|
|
| 6 |
*ipynb
|
| 7 |
.vscode/
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
logs/
|
eval-queue/CoT/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "CoT/Qwen2.5-7b-Instruct",
|
| 3 |
+
"revision": "main",
|
| 4 |
+
"precision": "float16",
|
| 5 |
+
"weight_type": "Original",
|
| 6 |
+
"status": "FINISHED",
|
| 7 |
+
"submitted_time": "2025-09-29T06:00:00.000Z",
|
| 8 |
+
"model_type": "pretrained",
|
| 9 |
+
"likes": 8,
|
| 10 |
+
"params": 7.0,
|
| 11 |
+
"license": "apache-2.0"
|
| 12 |
+
}
|
eval-queue/Direct-Inference/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "Direct-Inference/Qwen2.5-7b-Instruct",
|
| 3 |
+
"revision": "main",
|
| 4 |
+
"precision": "float16",
|
| 5 |
+
"weight_type": "Original",
|
| 6 |
+
"status": "FINISHED",
|
| 7 |
+
"submitted_time": "2025-09-29T08:00:00.000Z",
|
| 8 |
+
"model_type": "pretrained",
|
| 9 |
+
"likes": 12,
|
| 10 |
+
"params": 7.0,
|
| 11 |
+
"license": "apache-2.0"
|
| 12 |
+
}
|
eval-queue/PeterJinGo/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "PeterJinGo/Qwen2.5-7b-Instruct",
|
| 3 |
+
"revision": "main",
|
| 4 |
+
"precision": "float16",
|
| 5 |
+
"weight_type": "Original",
|
| 6 |
+
"status": "FINISHED",
|
| 7 |
+
"submitted_time": "2025-09-29T09:00:00.000Z",
|
| 8 |
+
"model_type": "fine-tuned",
|
| 9 |
+
"likes": 72,
|
| 10 |
+
"params": 7.0,
|
| 11 |
+
"license": "apache-2.0"
|
| 12 |
+
}
|
| 13 |
+
|
eval-queue/R1/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "R1/Qwen2.5-7b-Instruct",
|
| 3 |
+
"revision": "main",
|
| 4 |
+
"precision": "float16",
|
| 5 |
+
"weight_type": "Original",
|
| 6 |
+
"status": "FINISHED",
|
| 7 |
+
"submitted_time": "2025-09-29T07:00:00.000Z",
|
| 8 |
+
"model_type": "RL-tuned",
|
| 9 |
+
"likes": 67,
|
| 10 |
+
"params": 7.0,
|
| 11 |
+
"license": "apache-2.0"
|
| 12 |
+
}
|
eval-queue/RAG/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "RAG/Qwen2.5-7b-Instruct",
|
| 3 |
+
"revision": "main",
|
| 4 |
+
"precision": "float16",
|
| 5 |
+
"weight_type": "Original",
|
| 6 |
+
"status": "FINISHED",
|
| 7 |
+
"submitted_time": "2025-09-29T08:30:00.000Z",
|
| 8 |
+
"model_type": "fine-tuned",
|
| 9 |
+
"likes": 45,
|
| 10 |
+
"params": 7.0,
|
| 11 |
+
"license": "apache-2.0"
|
| 12 |
+
}
|
eval-queue/ReSeek/Qwen2.5-3b-Instruct_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "ReSeek-Qwen2.5-3b-Instruct",
|
| 3 |
+
"revision": "main",
|
| 4 |
+
"precision": "float16",
|
| 5 |
+
"weight_type": "Original",
|
| 6 |
+
"status": "FINISHED",
|
| 7 |
+
"submitted_time": "2025-09-29T10:15:00.000Z",
|
| 8 |
+
"model_type": "fine-tuned",
|
| 9 |
+
"likes": 156,
|
| 10 |
+
"params": 3.0,
|
| 11 |
+
"license": "apache-2.0"
|
| 12 |
+
}
|
eval-queue/ReSeek/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "ReSeek/Qwen2.5-7b-Instruct",
|
| 3 |
+
"revision": "main",
|
| 4 |
+
"precision": "float16",
|
| 5 |
+
"weight_type": "Original",
|
| 6 |
+
"status": "FINISHED",
|
| 7 |
+
"submitted_time": "2025-09-29T10:00:00.000Z",
|
| 8 |
+
"model_type": "fine-tuned",
|
| 9 |
+
"likes": 156,
|
| 10 |
+
"params": 7.0,
|
| 11 |
+
"license": "apache-2.0"
|
| 12 |
+
}
|
eval-queue/SFT/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "SFT/Qwen2.5-7b-Instruct",
|
| 3 |
+
"revision": "main",
|
| 4 |
+
"precision": "float16",
|
| 5 |
+
"weight_type": "Original",
|
| 6 |
+
"status": "FINISHED",
|
| 7 |
+
"submitted_time": "2025-09-29T06:30:00.000Z",
|
| 8 |
+
"model_type": "fine-tuned",
|
| 9 |
+
"likes": 23,
|
| 10 |
+
"params": 7.0,
|
| 11 |
+
"license": "apache-2.0"
|
| 12 |
+
}
|
eval-queue/Search-R1/Qwen2.5-3b-Instruct_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "Search-R1-Qwen2.5-3b-Instruct",
|
| 3 |
+
"revision": "main",
|
| 4 |
+
"precision": "float16",
|
| 5 |
+
"weight_type": "Original",
|
| 6 |
+
"status": "FINISHED",
|
| 7 |
+
"submitted_time": "2025-09-29T09:15:00.000Z",
|
| 8 |
+
"model_type": "fine-tuned",
|
| 9 |
+
"likes": 72,
|
| 10 |
+
"params": 3.0,
|
| 11 |
+
"license": "apache-2.0"
|
| 12 |
+
}
|
eval-queue/Search-o1/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "Search-o1/Qwen2.5-7b-Instruct",
|
| 3 |
+
"revision": "main",
|
| 4 |
+
"precision": "float16",
|
| 5 |
+
"weight_type": "Original",
|
| 6 |
+
"status": "FINISHED",
|
| 7 |
+
"submitted_time": "2025-09-29T07:30:00.000Z",
|
| 8 |
+
"model_type": "fine-tuned",
|
| 9 |
+
"likes": 98,
|
| 10 |
+
"params": 7.0,
|
| 11 |
+
"license": "apache-2.0"
|
| 12 |
+
}
|
eval-queue/ZeroSearch/Qwen2.5-3b-Instruct_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "ZeroSearch-Qwen2.5-3b-Instruct",
|
| 3 |
+
"revision": "main",
|
| 4 |
+
"precision": "float16",
|
| 5 |
+
"weight_type": "Original",
|
| 6 |
+
"status": "FINISHED",
|
| 7 |
+
"submitted_time": "2025-09-29T09:45:00.000Z",
|
| 8 |
+
"model_type": "fine-tuned",
|
| 9 |
+
"likes": 89,
|
| 10 |
+
"params": 3.0,
|
| 11 |
+
"license": "apache-2.0"
|
| 12 |
+
}
|
eval-queue/ZeroSearch/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "ZeroSearch/Qwen2.5-7b-Instruct",
|
| 3 |
+
"revision": "main",
|
| 4 |
+
"precision": "float16",
|
| 5 |
+
"weight_type": "Original",
|
| 6 |
+
"status": "FINISHED",
|
| 7 |
+
"submitted_time": "2025-09-29T09:30:00.000Z",
|
| 8 |
+
"model_type": "fine-tuned",
|
| 9 |
+
"likes": 89,
|
| 10 |
+
"params": 7.0,
|
| 11 |
+
"license": "apache-2.0"
|
| 12 |
+
}
|
eval-queue/pending_eval_request.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "DeepResearcher/Qwen2.5-7b-Instruct",
|
| 3 |
+
"revision": "main",
|
| 4 |
+
"precision": "float16",
|
| 5 |
+
"weight_type": "Original",
|
| 6 |
+
"status": "PENDING",
|
| 7 |
+
"submitted_time": "2025-09-29T10:00:00.000Z",
|
| 8 |
+
"model_type": "fine-tuned",
|
| 9 |
+
"likes": 42,
|
| 10 |
+
"params": 7.0,
|
| 11 |
+
"license": "apache-2.0"
|
| 12 |
+
}
|
eval-results/CoT_Qwen2.5-7b-Instruct.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "torch.float16",
|
| 4 |
+
"model_name": "CoT-Qwen2.5-7b-Instruct",
|
| 5 |
+
"model_sha": "main"
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"nq": {
|
| 9 |
+
"exact_match": 0.048
|
| 10 |
+
},
|
| 11 |
+
"triviaqa": {
|
| 12 |
+
"exact_match": 0.185
|
| 13 |
+
},
|
| 14 |
+
"popqa": {
|
| 15 |
+
"exact_match": 0.054
|
| 16 |
+
},
|
| 17 |
+
"hotpotqa": {
|
| 18 |
+
"exact_match": 0.092
|
| 19 |
+
},
|
| 20 |
+
"2wiki": {
|
| 21 |
+
"exact_match": 0.111
|
| 22 |
+
},
|
| 23 |
+
"musique": {
|
| 24 |
+
"exact_match": 0.022
|
| 25 |
+
},
|
| 26 |
+
"bamboogle": {
|
| 27 |
+
"exact_match": 0.232
|
| 28 |
+
},
|
| 29 |
+
"fictionalhot": {
|
| 30 |
+
"exact_match": 0.001
|
| 31 |
+
}
|
| 32 |
+
}
|
| 33 |
+
}
|
eval-results/Direct_Inference_Qwen2.5-7b-Instruct.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "torch.float16",
|
| 4 |
+
"model_name": "Direct-Inference-Qwen2.5-7b-Instruct",
|
| 5 |
+
"model_sha": "main"
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"nq": {
|
| 9 |
+
"exact_match": 0.134
|
| 10 |
+
},
|
| 11 |
+
"triviaqa": {
|
| 12 |
+
"exact_match": 0.408
|
| 13 |
+
},
|
| 14 |
+
"popqa": {
|
| 15 |
+
"exact_match": 0.140
|
| 16 |
+
},
|
| 17 |
+
"hotpotqa": {
|
| 18 |
+
"exact_match": 0.183
|
| 19 |
+
},
|
| 20 |
+
"2wiki": {
|
| 21 |
+
"exact_match": 0.250
|
| 22 |
+
},
|
| 23 |
+
"musique": {
|
| 24 |
+
"exact_match": 0.031
|
| 25 |
+
},
|
| 26 |
+
"bamboogle": {
|
| 27 |
+
"exact_match": 0.120
|
| 28 |
+
},
|
| 29 |
+
"fictionalhot": {
|
| 30 |
+
"exact_match": 0.001
|
| 31 |
+
}
|
| 32 |
+
}
|
| 33 |
+
}
|
eval-results/R1_Qwen2.5-7b-Instruct.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "torch.float16",
|
| 4 |
+
"model_name": "R1-Qwen2.5-7b-Instruct",
|
| 5 |
+
"model_sha": "main"
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"nq": {
|
| 9 |
+
"exact_match": 0.270
|
| 10 |
+
},
|
| 11 |
+
"triviaqa": {
|
| 12 |
+
"exact_match": 0.537
|
| 13 |
+
},
|
| 14 |
+
"popqa": {
|
| 15 |
+
"exact_match": 0.199
|
| 16 |
+
},
|
| 17 |
+
"hotpotqa": {
|
| 18 |
+
"exact_match": 0.237
|
| 19 |
+
},
|
| 20 |
+
"2wiki": {
|
| 21 |
+
"exact_match": 0.292
|
| 22 |
+
},
|
| 23 |
+
"musique": {
|
| 24 |
+
"exact_match": 0.072
|
| 25 |
+
},
|
| 26 |
+
"bamboogle": {
|
| 27 |
+
"exact_match": 0.293
|
| 28 |
+
},
|
| 29 |
+
"fictionalhot": {
|
| 30 |
+
"exact_match": 0.003
|
| 31 |
+
}
|
| 32 |
+
}
|
| 33 |
+
}
|
eval-results/RAG_Qwen2.5-7b-Instruct.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "torch.float16",
|
| 4 |
+
"model_name": "RAG-Qwen2.5-7b-Instruct",
|
| 5 |
+
"model_sha": "main"
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"nq": {
|
| 9 |
+
"exact_match": 0.349
|
| 10 |
+
},
|
| 11 |
+
"triviaqa": {
|
| 12 |
+
"exact_match": 0.585
|
| 13 |
+
},
|
| 14 |
+
"popqa": {
|
| 15 |
+
"exact_match": 0.392
|
| 16 |
+
},
|
| 17 |
+
"hotpotqa": {
|
| 18 |
+
"exact_match": 0.299
|
| 19 |
+
},
|
| 20 |
+
"2wiki": {
|
| 21 |
+
"exact_match": 0.235
|
| 22 |
+
},
|
| 23 |
+
"musique": {
|
| 24 |
+
"exact_match": 0.058
|
| 25 |
+
},
|
| 26 |
+
"bamboogle": {
|
| 27 |
+
"exact_match": 0.208
|
| 28 |
+
},
|
| 29 |
+
"fictionalhot": {
|
| 30 |
+
"exact_match": 0.012
|
| 31 |
+
}
|
| 32 |
+
}
|
| 33 |
+
}
|
eval-results/ReSeek_Qwen2.5-3b-Instruct.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "torch.float16",
|
| 4 |
+
"model_name": "ReSeek-Qwen2.5-3b-Instruct",
|
| 5 |
+
"model_sha": "main"
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"nq": {
|
| 9 |
+
"exact_match": 0.415
|
| 10 |
+
},
|
| 11 |
+
"triviaqa": {
|
| 12 |
+
"exact_match": 0.553
|
| 13 |
+
},
|
| 14 |
+
"popqa": {
|
| 15 |
+
"exact_match": 0.434
|
| 16 |
+
},
|
| 17 |
+
"hotpotqa": {
|
| 18 |
+
"exact_match": 0.328
|
| 19 |
+
},
|
| 20 |
+
"2wiki": {
|
| 21 |
+
"exact_match": 0.298
|
| 22 |
+
},
|
| 23 |
+
"musique": {
|
| 24 |
+
"exact_match": 0.103
|
| 25 |
+
},
|
| 26 |
+
"bamboogle": {
|
| 27 |
+
"exact_match": 0.304
|
| 28 |
+
},
|
| 29 |
+
"fictionalhot": {
|
| 30 |
+
"exact_match": 0.059
|
| 31 |
+
}
|
| 32 |
+
}
|
| 33 |
+
}
|
eval-results/ReSeek_Qwen2.5-7b-Instruct.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "torch.float16",
|
| 4 |
+
"model_name": "ReSeek-Qwen2.5-7b-Instruct",
|
| 5 |
+
"model_sha": "main"
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"nq": {
|
| 9 |
+
"exact_match": 0.469
|
| 10 |
+
},
|
| 11 |
+
"triviaqa": {
|
| 12 |
+
"exact_match": 0.640
|
| 13 |
+
},
|
| 14 |
+
"popqa": {
|
| 15 |
+
"exact_match": 0.501
|
| 16 |
+
},
|
| 17 |
+
"hotpotqa": {
|
| 18 |
+
"exact_match": 0.389
|
| 19 |
+
},
|
| 20 |
+
"2wiki": {
|
| 21 |
+
"exact_match": 0.382
|
| 22 |
+
},
|
| 23 |
+
"musique": {
|
| 24 |
+
"exact_match": 0.185
|
| 25 |
+
},
|
| 26 |
+
"bamboogle": {
|
| 27 |
+
"exact_match": 0.392
|
| 28 |
+
},
|
| 29 |
+
"fictionalhot": {
|
| 30 |
+
"exact_match": 0.061
|
| 31 |
+
}
|
| 32 |
+
}
|
| 33 |
+
}
|
eval-results/SFT_Qwen2.5-7b-Instruct.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "torch.float16",
|
| 4 |
+
"model_name": "SFT-Qwen2.5-7b-Instruct",
|
| 5 |
+
"model_sha": "main"
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"nq": {
|
| 9 |
+
"exact_match": 0.318
|
| 10 |
+
},
|
| 11 |
+
"triviaqa": {
|
| 12 |
+
"exact_match": 0.354
|
| 13 |
+
},
|
| 14 |
+
"popqa": {
|
| 15 |
+
"exact_match": 0.121
|
| 16 |
+
},
|
| 17 |
+
"hotpotqa": {
|
| 18 |
+
"exact_match": 0.217
|
| 19 |
+
},
|
| 20 |
+
"2wiki": {
|
| 21 |
+
"exact_match": 0.259
|
| 22 |
+
},
|
| 23 |
+
"musique": {
|
| 24 |
+
"exact_match": 0.066
|
| 25 |
+
},
|
| 26 |
+
"bamboogle": {
|
| 27 |
+
"exact_match": 0.112
|
| 28 |
+
},
|
| 29 |
+
"fictionalhot": {
|
| 30 |
+
"exact_match": 0.003
|
| 31 |
+
}
|
| 32 |
+
}
|
| 33 |
+
}
|
eval-results/Search-R1_Qwen2.5-3b-Instruct.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "torch.float16",
|
| 4 |
+
"model_name": "Search-R1-Qwen2.5-3b-Instruct",
|
| 5 |
+
"model_sha": "main"
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"nq": {
|
| 9 |
+
"exact_match": 0.341
|
| 10 |
+
},
|
| 11 |
+
"triviaqa": {
|
| 12 |
+
"exact_match": 0.545
|
| 13 |
+
},
|
| 14 |
+
"popqa": {
|
| 15 |
+
"exact_match": 0.378
|
| 16 |
+
},
|
| 17 |
+
"hotpotqa": {
|
| 18 |
+
"exact_match": 0.324
|
| 19 |
+
},
|
| 20 |
+
"2wiki": {
|
| 21 |
+
"exact_match": 0.319
|
| 22 |
+
},
|
| 23 |
+
"musique": {
|
| 24 |
+
"exact_match": 0.103
|
| 25 |
+
},
|
| 26 |
+
"bamboogle": {
|
| 27 |
+
"exact_match": 0.264
|
| 28 |
+
},
|
| 29 |
+
"fictionalhot": {
|
| 30 |
+
"exact_match": 0.037
|
| 31 |
+
}
|
| 32 |
+
}
|
| 33 |
+
}
|
eval-results/Search-R1_Qwen2.5-7b-Instruct.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "torch.float16",
|
| 4 |
+
"model_name": "Search-R1-Qwen2.5-7b-Instruct",
|
| 5 |
+
"model_sha": "main"
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"nq": {
|
| 9 |
+
"exact_match": 0.393
|
| 10 |
+
},
|
| 11 |
+
"triviaqa": {
|
| 12 |
+
"exact_match": 0.610
|
| 13 |
+
},
|
| 14 |
+
"popqa": {
|
| 15 |
+
"exact_match": 0.397
|
| 16 |
+
},
|
| 17 |
+
"hotpotqa": {
|
| 18 |
+
"exact_match": 0.370
|
| 19 |
+
},
|
| 20 |
+
"2wiki": {
|
| 21 |
+
"exact_match": 0.414
|
| 22 |
+
},
|
| 23 |
+
"musique": {
|
| 24 |
+
"exact_match": 0.146
|
| 25 |
+
},
|
| 26 |
+
"bamboogle": {
|
| 27 |
+
"exact_match": 0.368
|
| 28 |
+
},
|
| 29 |
+
"fictionalhot": {
|
| 30 |
+
"exact_match": 0.034
|
| 31 |
+
}
|
| 32 |
+
}
|
| 33 |
+
}
|
eval-results/Search-o1_Qwen2.5-7b-Instruct.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "torch.float16",
|
| 4 |
+
"model_name": "Search-o1-Qwen2.5-7b-Instruct",
|
| 5 |
+
"model_sha": "main"
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"nq": {
|
| 9 |
+
"exact_match": 0.151
|
| 10 |
+
},
|
| 11 |
+
"triviaqa": {
|
| 12 |
+
"exact_match": 0.443
|
| 13 |
+
},
|
| 14 |
+
"popqa": {
|
| 15 |
+
"exact_match": 0.131
|
| 16 |
+
},
|
| 17 |
+
"hotpotqa": {
|
| 18 |
+
"exact_match": 0.187
|
| 19 |
+
},
|
| 20 |
+
"2wiki": {
|
| 21 |
+
"exact_match": 0.176
|
| 22 |
+
},
|
| 23 |
+
"musique": {
|
| 24 |
+
"exact_match": 0.058
|
| 25 |
+
},
|
| 26 |
+
"bamboogle": {
|
| 27 |
+
"exact_match": 0.296
|
| 28 |
+
},
|
| 29 |
+
"fictionalhot": {
|
| 30 |
+
"exact_match": 0.020
|
| 31 |
+
}
|
| 32 |
+
}
|
| 33 |
+
}
|
eval-results/ZeroSearch_Qwen2.5-3b-Instruct.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "torch.float16",
|
| 4 |
+
"model_name": "ZeroSearch-Qwen2.5-3b-Instruct",
|
| 5 |
+
"model_sha": "main"
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"nq": {
|
| 9 |
+
"exact_match": 0.414
|
| 10 |
+
},
|
| 11 |
+
"triviaqa": {
|
| 12 |
+
"exact_match": 0.574
|
| 13 |
+
},
|
| 14 |
+
"popqa": {
|
| 15 |
+
"exact_match": 0.448
|
| 16 |
+
},
|
| 17 |
+
"hotpotqa": {
|
| 18 |
+
"exact_match": 0.274
|
| 19 |
+
},
|
| 20 |
+
"2wiki": {
|
| 21 |
+
"exact_match": 0.300
|
| 22 |
+
},
|
| 23 |
+
"musique": {
|
| 24 |
+
"exact_match": 0.098
|
| 25 |
+
},
|
| 26 |
+
"bamboogle": {
|
| 27 |
+
"exact_match": 0.111
|
| 28 |
+
},
|
| 29 |
+
"fictionalhot": {
|
| 30 |
+
"exact_match": 0.030
|
| 31 |
+
}
|
| 32 |
+
}
|
| 33 |
+
}
|
eval-results/ZeroSearch_Qwen2.5-7b-Instruct.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "torch.float16",
|
| 4 |
+
"model_name": "ZeroSearch-Qwen2.5-7b-Instruct",
|
| 5 |
+
"model_sha": "main"
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"nq": {
|
| 9 |
+
"exact_match": 0.436
|
| 10 |
+
},
|
| 11 |
+
"triviaqa": {
|
| 12 |
+
"exact_match": 0.652
|
| 13 |
+
},
|
| 14 |
+
"popqa": {
|
| 15 |
+
"exact_match": 0.488
|
| 16 |
+
},
|
| 17 |
+
"hotpotqa": {
|
| 18 |
+
"exact_match": 0.346
|
| 19 |
+
},
|
| 20 |
+
"2wiki": {
|
| 21 |
+
"exact_match": 0.352
|
| 22 |
+
},
|
| 23 |
+
"musique": {
|
| 24 |
+
"exact_match": 0.184
|
| 25 |
+
},
|
| 26 |
+
"bamboogle": {
|
| 27 |
+
"exact_match": 0.278
|
| 28 |
+
},
|
| 29 |
+
"fictionalhot": {
|
| 30 |
+
"exact_match": 0.031
|
| 31 |
+
}
|
| 32 |
+
}
|
| 33 |
+
}
|
eval-results/demo-leaderboard/gpt2-demo/results_2023-11-21T18-10-08.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "torch.float16",
|
| 4 |
+
"model_name": "demo-leaderboard/gpt2-demo",
|
| 5 |
+
"model_sha": "ac3299b02780836378b9e1e68c6eead546e89f90"
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"anli_r1": {
|
| 9 |
+
"acc": 0
|
| 10 |
+
},
|
| 11 |
+
"logiqa": {
|
| 12 |
+
"acc_norm": 0.90
|
| 13 |
+
}
|
| 14 |
+
}
|
| 15 |
+
}
|
eval-results/demo-leaderboard/gpt2-demo/results_2023-11-22 15:46:20.425378.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"anli_r1": {
|
| 4 |
+
"acc": 0.4,
|
| 5 |
+
"acc_stderr": 0.11239029738980327
|
| 6 |
+
},
|
| 7 |
+
"logiqa": {
|
| 8 |
+
"acc": 0.35,
|
| 9 |
+
"acc_stderr": 0.10942433098048308,
|
| 10 |
+
"acc_norm": 0.3,
|
| 11 |
+
"acc_norm_stderr": 0.10513149660756933
|
| 12 |
+
}
|
| 13 |
+
},
|
| 14 |
+
"versions": {
|
| 15 |
+
"anli_r1": 0,
|
| 16 |
+
"logiqa": 0
|
| 17 |
+
},
|
| 18 |
+
"config": {
|
| 19 |
+
"model": "hf-causal-experimental",
|
| 20 |
+
"model_args": "pretrained=demo-leaderboard/gpt2-demo,revision=main,dtype=bfloat16",
|
| 21 |
+
"num_fewshot": 0,
|
| 22 |
+
"batch_size": 1,
|
| 23 |
+
"batch_sizes": [],
|
| 24 |
+
"device": "cpu",
|
| 25 |
+
"no_cache": true,
|
| 26 |
+
"limit": 20,
|
| 27 |
+
"bootstrap_iters": 100000,
|
| 28 |
+
"description_dict": null,
|
| 29 |
+
"model_dtype": "bfloat16",
|
| 30 |
+
"model_name": "demo-leaderboard/gpt2-demo",
|
| 31 |
+
"model_sha": "main"
|
| 32 |
+
}
|
| 33 |
+
}
|