shyuli committed on
Commit
aaebb00
·
1 Parent(s): 1440c3c

add results

Browse files
Files changed (28) hide show
  1. .gitignore +0 -4
  2. eval-queue/CoT/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json +12 -0
  3. eval-queue/Direct-Inference/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json +12 -0
  4. eval-queue/PeterJinGo/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json +13 -0
  5. eval-queue/R1/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json +12 -0
  6. eval-queue/RAG/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json +12 -0
  7. eval-queue/ReSeek/Qwen2.5-3b-Instruct_eval_request_False_float16_Original.json +12 -0
  8. eval-queue/ReSeek/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json +12 -0
  9. eval-queue/SFT/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json +12 -0
  10. eval-queue/Search-R1/Qwen2.5-3b-Instruct_eval_request_False_float16_Original.json +12 -0
  11. eval-queue/Search-o1/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json +12 -0
  12. eval-queue/ZeroSearch/Qwen2.5-3b-Instruct_eval_request_False_float16_Original.json +12 -0
  13. eval-queue/ZeroSearch/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json +12 -0
  14. eval-queue/pending_eval_request.json +12 -0
  15. eval-results/CoT_Qwen2.5-7b-Instruct.json +33 -0
  16. eval-results/Direct_Inference_Qwen2.5-7b-Instruct.json +33 -0
  17. eval-results/R1_Qwen2.5-7b-Instruct.json +33 -0
  18. eval-results/RAG_Qwen2.5-7b-Instruct.json +33 -0
  19. eval-results/ReSeek_Qwen2.5-3b-Instruct.json +33 -0
  20. eval-results/ReSeek_Qwen2.5-7b-Instruct.json +33 -0
  21. eval-results/SFT_Qwen2.5-7b-Instruct.json +33 -0
  22. eval-results/Search-R1_Qwen2.5-3b-Instruct.json +33 -0
  23. eval-results/Search-R1_Qwen2.5-7b-Instruct.json +33 -0
  24. eval-results/Search-o1_Qwen2.5-7b-Instruct.json +33 -0
  25. eval-results/ZeroSearch_Qwen2.5-3b-Instruct.json +33 -0
  26. eval-results/ZeroSearch_Qwen2.5-7b-Instruct.json +33 -0
  27. eval-results/demo-leaderboard/gpt2-demo/results_2023-11-21T18-10-08.json +15 -0
  28. eval-results/demo-leaderboard/gpt2-demo/results_2023-11-22 15:46:20.425378.json +33 -0
.gitignore CHANGED
@@ -6,8 +6,4 @@ __pycache__/
6
  *ipynb
7
  .vscode/
8
 
9
- eval-queue/
10
- eval-results/
11
- eval-queue-bk/
12
- eval-results-bk/
13
  logs/
 
6
  *ipynb
7
  .vscode/
8
 
 
 
 
 
9
  logs/
eval-queue/CoT/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "CoT/Qwen2.5-7b-Instruct",
3
+ "revision": "main",
4
+ "precision": "float16",
5
+ "weight_type": "Original",
6
+ "status": "FINISHED",
7
+ "submitted_time": "2025-09-29T06:00:00.000Z",
8
+ "model_type": "pretrained",
9
+ "likes": 8,
10
+ "params": 7.0,
11
+ "license": "apache-2.0"
12
+ }
eval-queue/Direct-Inference/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "Direct-Inference/Qwen2.5-7b-Instruct",
3
+ "revision": "main",
4
+ "precision": "float16",
5
+ "weight_type": "Original",
6
+ "status": "FINISHED",
7
+ "submitted_time": "2025-09-29T08:00:00.000Z",
8
+ "model_type": "pretrained",
9
+ "likes": 12,
10
+ "params": 7.0,
11
+ "license": "apache-2.0"
12
+ }
eval-queue/PeterJinGo/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "PeterJinGo/Qwen2.5-7b-Instruct",
3
+ "revision": "main",
4
+ "precision": "float16",
5
+ "weight_type": "Original",
6
+ "status": "FINISHED",
7
+ "submitted_time": "2025-09-29T09:00:00.000Z",
8
+ "model_type": "fine-tuned",
9
+ "likes": 72,
10
+ "params": 7.0,
11
+ "license": "apache-2.0"
12
+ }
13
+
eval-queue/R1/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "R1/Qwen2.5-7b-Instruct",
3
+ "revision": "main",
4
+ "precision": "float16",
5
+ "weight_type": "Original",
6
+ "status": "FINISHED",
7
+ "submitted_time": "2025-09-29T07:00:00.000Z",
8
+ "model_type": "RL-tuned",
9
+ "likes": 67,
10
+ "params": 7.0,
11
+ "license": "apache-2.0"
12
+ }
eval-queue/RAG/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "RAG/Qwen2.5-7b-Instruct",
3
+ "revision": "main",
4
+ "precision": "float16",
5
+ "weight_type": "Original",
6
+ "status": "FINISHED",
7
+ "submitted_time": "2025-09-29T08:30:00.000Z",
8
+ "model_type": "fine-tuned",
9
+ "likes": 45,
10
+ "params": 7.0,
11
+ "license": "apache-2.0"
12
+ }
eval-queue/ReSeek/Qwen2.5-3b-Instruct_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "ReSeek-Qwen2.5-3b-Instruct",
3
+ "revision": "main",
4
+ "precision": "float16",
5
+ "weight_type": "Original",
6
+ "status": "FINISHED",
7
+ "submitted_time": "2025-09-29T10:15:00.000Z",
8
+ "model_type": "fine-tuned",
9
+ "likes": 156,
10
+ "params": 3.0,
11
+ "license": "apache-2.0"
12
+ }
eval-queue/ReSeek/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "ReSeek/Qwen2.5-7b-Instruct",
3
+ "revision": "main",
4
+ "precision": "float16",
5
+ "weight_type": "Original",
6
+ "status": "FINISHED",
7
+ "submitted_time": "2025-09-29T10:00:00.000Z",
8
+ "model_type": "fine-tuned",
9
+ "likes": 156,
10
+ "params": 7.0,
11
+ "license": "apache-2.0"
12
+ }
eval-queue/SFT/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "SFT/Qwen2.5-7b-Instruct",
3
+ "revision": "main",
4
+ "precision": "float16",
5
+ "weight_type": "Original",
6
+ "status": "FINISHED",
7
+ "submitted_time": "2025-09-29T06:30:00.000Z",
8
+ "model_type": "fine-tuned",
9
+ "likes": 23,
10
+ "params": 7.0,
11
+ "license": "apache-2.0"
12
+ }
eval-queue/Search-R1/Qwen2.5-3b-Instruct_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "Search-R1-Qwen2.5-3b-Instruct",
3
+ "revision": "main",
4
+ "precision": "float16",
5
+ "weight_type": "Original",
6
+ "status": "FINISHED",
7
+ "submitted_time": "2025-09-29T09:15:00.000Z",
8
+ "model_type": "fine-tuned",
9
+ "likes": 72,
10
+ "params": 3.0,
11
+ "license": "apache-2.0"
12
+ }
eval-queue/Search-o1/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "Search-o1/Qwen2.5-7b-Instruct",
3
+ "revision": "main",
4
+ "precision": "float16",
5
+ "weight_type": "Original",
6
+ "status": "FINISHED",
7
+ "submitted_time": "2025-09-29T07:30:00.000Z",
8
+ "model_type": "fine-tuned",
9
+ "likes": 98,
10
+ "params": 7.0,
11
+ "license": "apache-2.0"
12
+ }
eval-queue/ZeroSearch/Qwen2.5-3b-Instruct_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "ZeroSearch-Qwen2.5-3b-Instruct",
3
+ "revision": "main",
4
+ "precision": "float16",
5
+ "weight_type": "Original",
6
+ "status": "FINISHED",
7
+ "submitted_time": "2025-09-29T09:45:00.000Z",
8
+ "model_type": "fine-tuned",
9
+ "likes": 89,
10
+ "params": 3.0,
11
+ "license": "apache-2.0"
12
+ }
eval-queue/ZeroSearch/Qwen2.5-7b-Instruct_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "ZeroSearch/Qwen2.5-7b-Instruct",
3
+ "revision": "main",
4
+ "precision": "float16",
5
+ "weight_type": "Original",
6
+ "status": "FINISHED",
7
+ "submitted_time": "2025-09-29T09:30:00.000Z",
8
+ "model_type": "fine-tuned",
9
+ "likes": 89,
10
+ "params": 7.0,
11
+ "license": "apache-2.0"
12
+ }
eval-queue/pending_eval_request.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "DeepResearcher/Qwen2.5-7b-Instruct",
3
+ "revision": "main",
4
+ "precision": "float16",
5
+ "weight_type": "Original",
6
+ "status": "PENDING",
7
+ "submitted_time": "2025-09-29T10:00:00.000Z",
8
+ "model_type": "fine-tuned",
9
+ "likes": 42,
10
+ "params": 7.0,
11
+ "license": "apache-2.0"
12
+ }
eval-results/CoT_Qwen2.5-7b-Instruct.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "CoT-Qwen2.5-7b-Instruct",
5
+ "model_sha": "main"
6
+ },
7
+ "results": {
8
+ "nq": {
9
+ "exact_match": 0.048
10
+ },
11
+ "triviaqa": {
12
+ "exact_match": 0.185
13
+ },
14
+ "popqa": {
15
+ "exact_match": 0.054
16
+ },
17
+ "hotpotqa": {
18
+ "exact_match": 0.092
19
+ },
20
+ "2wiki": {
21
+ "exact_match": 0.111
22
+ },
23
+ "musique": {
24
+ "exact_match": 0.022
25
+ },
26
+ "bamboogle": {
27
+ "exact_match": 0.232
28
+ },
29
+ "fictionalhot": {
30
+ "exact_match": 0.001
31
+ }
32
+ }
33
+ }
eval-results/Direct_Inference_Qwen2.5-7b-Instruct.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "Direct-Inference-Qwen2.5-7b-Instruct",
5
+ "model_sha": "main"
6
+ },
7
+ "results": {
8
+ "nq": {
9
+ "exact_match": 0.134
10
+ },
11
+ "triviaqa": {
12
+ "exact_match": 0.408
13
+ },
14
+ "popqa": {
15
+ "exact_match": 0.140
16
+ },
17
+ "hotpotqa": {
18
+ "exact_match": 0.183
19
+ },
20
+ "2wiki": {
21
+ "exact_match": 0.250
22
+ },
23
+ "musique": {
24
+ "exact_match": 0.031
25
+ },
26
+ "bamboogle": {
27
+ "exact_match": 0.120
28
+ },
29
+ "fictionalhot": {
30
+ "exact_match": 0.001
31
+ }
32
+ }
33
+ }
eval-results/R1_Qwen2.5-7b-Instruct.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "R1-Qwen2.5-7b-Instruct",
5
+ "model_sha": "main"
6
+ },
7
+ "results": {
8
+ "nq": {
9
+ "exact_match": 0.270
10
+ },
11
+ "triviaqa": {
12
+ "exact_match": 0.537
13
+ },
14
+ "popqa": {
15
+ "exact_match": 0.199
16
+ },
17
+ "hotpotqa": {
18
+ "exact_match": 0.237
19
+ },
20
+ "2wiki": {
21
+ "exact_match": 0.292
22
+ },
23
+ "musique": {
24
+ "exact_match": 0.072
25
+ },
26
+ "bamboogle": {
27
+ "exact_match": 0.293
28
+ },
29
+ "fictionalhot": {
30
+ "exact_match": 0.003
31
+ }
32
+ }
33
+ }
eval-results/RAG_Qwen2.5-7b-Instruct.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "RAG-Qwen2.5-7b-Instruct",
5
+ "model_sha": "main"
6
+ },
7
+ "results": {
8
+ "nq": {
9
+ "exact_match": 0.349
10
+ },
11
+ "triviaqa": {
12
+ "exact_match": 0.585
13
+ },
14
+ "popqa": {
15
+ "exact_match": 0.392
16
+ },
17
+ "hotpotqa": {
18
+ "exact_match": 0.299
19
+ },
20
+ "2wiki": {
21
+ "exact_match": 0.235
22
+ },
23
+ "musique": {
24
+ "exact_match": 0.058
25
+ },
26
+ "bamboogle": {
27
+ "exact_match": 0.208
28
+ },
29
+ "fictionalhot": {
30
+ "exact_match": 0.012
31
+ }
32
+ }
33
+ }
eval-results/ReSeek_Qwen2.5-3b-Instruct.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "ReSeek-Qwen2.5-3b-Instruct",
5
+ "model_sha": "main"
6
+ },
7
+ "results": {
8
+ "nq": {
9
+ "exact_match": 0.415
10
+ },
11
+ "triviaqa": {
12
+ "exact_match": 0.553
13
+ },
14
+ "popqa": {
15
+ "exact_match": 0.434
16
+ },
17
+ "hotpotqa": {
18
+ "exact_match": 0.328
19
+ },
20
+ "2wiki": {
21
+ "exact_match": 0.298
22
+ },
23
+ "musique": {
24
+ "exact_match": 0.103
25
+ },
26
+ "bamboogle": {
27
+ "exact_match": 0.304
28
+ },
29
+ "fictionalhot": {
30
+ "exact_match": 0.059
31
+ }
32
+ }
33
+ }
eval-results/ReSeek_Qwen2.5-7b-Instruct.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "ReSeek-Qwen2.5-7b-Instruct",
5
+ "model_sha": "main"
6
+ },
7
+ "results": {
8
+ "nq": {
9
+ "exact_match": 0.469
10
+ },
11
+ "triviaqa": {
12
+ "exact_match": 0.640
13
+ },
14
+ "popqa": {
15
+ "exact_match": 0.501
16
+ },
17
+ "hotpotqa": {
18
+ "exact_match": 0.389
19
+ },
20
+ "2wiki": {
21
+ "exact_match": 0.382
22
+ },
23
+ "musique": {
24
+ "exact_match": 0.185
25
+ },
26
+ "bamboogle": {
27
+ "exact_match": 0.392
28
+ },
29
+ "fictionalhot": {
30
+ "exact_match": 0.061
31
+ }
32
+ }
33
+ }
eval-results/SFT_Qwen2.5-7b-Instruct.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "SFT-Qwen2.5-7b-Instruct",
5
+ "model_sha": "main"
6
+ },
7
+ "results": {
8
+ "nq": {
9
+ "exact_match": 0.318
10
+ },
11
+ "triviaqa": {
12
+ "exact_match": 0.354
13
+ },
14
+ "popqa": {
15
+ "exact_match": 0.121
16
+ },
17
+ "hotpotqa": {
18
+ "exact_match": 0.217
19
+ },
20
+ "2wiki": {
21
+ "exact_match": 0.259
22
+ },
23
+ "musique": {
24
+ "exact_match": 0.066
25
+ },
26
+ "bamboogle": {
27
+ "exact_match": 0.112
28
+ },
29
+ "fictionalhot": {
30
+ "exact_match": 0.003
31
+ }
32
+ }
33
+ }
eval-results/Search-R1_Qwen2.5-3b-Instruct.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "Search-R1-Qwen2.5-3b-Instruct",
5
+ "model_sha": "main"
6
+ },
7
+ "results": {
8
+ "nq": {
9
+ "exact_match": 0.341
10
+ },
11
+ "triviaqa": {
12
+ "exact_match": 0.545
13
+ },
14
+ "popqa": {
15
+ "exact_match": 0.378
16
+ },
17
+ "hotpotqa": {
18
+ "exact_match": 0.324
19
+ },
20
+ "2wiki": {
21
+ "exact_match": 0.319
22
+ },
23
+ "musique": {
24
+ "exact_match": 0.103
25
+ },
26
+ "bamboogle": {
27
+ "exact_match": 0.264
28
+ },
29
+ "fictionalhot": {
30
+ "exact_match": 0.037
31
+ }
32
+ }
33
+ }
eval-results/Search-R1_Qwen2.5-7b-Instruct.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "Search-R1-Qwen2.5-7b-Instruct",
5
+ "model_sha": "main"
6
+ },
7
+ "results": {
8
+ "nq": {
9
+ "exact_match": 0.393
10
+ },
11
+ "triviaqa": {
12
+ "exact_match": 0.610
13
+ },
14
+ "popqa": {
15
+ "exact_match": 0.397
16
+ },
17
+ "hotpotqa": {
18
+ "exact_match": 0.370
19
+ },
20
+ "2wiki": {
21
+ "exact_match": 0.414
22
+ },
23
+ "musique": {
24
+ "exact_match": 0.146
25
+ },
26
+ "bamboogle": {
27
+ "exact_match": 0.368
28
+ },
29
+ "fictionalhot": {
30
+ "exact_match": 0.034
31
+ }
32
+ }
33
+ }
eval-results/Search-o1_Qwen2.5-7b-Instruct.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "Search-o1-Qwen2.5-7b-Instruct",
5
+ "model_sha": "main"
6
+ },
7
+ "results": {
8
+ "nq": {
9
+ "exact_match": 0.151
10
+ },
11
+ "triviaqa": {
12
+ "exact_match": 0.443
13
+ },
14
+ "popqa": {
15
+ "exact_match": 0.131
16
+ },
17
+ "hotpotqa": {
18
+ "exact_match": 0.187
19
+ },
20
+ "2wiki": {
21
+ "exact_match": 0.176
22
+ },
23
+ "musique": {
24
+ "exact_match": 0.058
25
+ },
26
+ "bamboogle": {
27
+ "exact_match": 0.296
28
+ },
29
+ "fictionalhot": {
30
+ "exact_match": 0.020
31
+ }
32
+ }
33
+ }
eval-results/ZeroSearch_Qwen2.5-3b-Instruct.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "ZeroSearch-Qwen2.5-3b-Instruct",
5
+ "model_sha": "main"
6
+ },
7
+ "results": {
8
+ "nq": {
9
+ "exact_match": 0.414
10
+ },
11
+ "triviaqa": {
12
+ "exact_match": 0.574
13
+ },
14
+ "popqa": {
15
+ "exact_match": 0.448
16
+ },
17
+ "hotpotqa": {
18
+ "exact_match": 0.274
19
+ },
20
+ "2wiki": {
21
+ "exact_match": 0.300
22
+ },
23
+ "musique": {
24
+ "exact_match": 0.098
25
+ },
26
+ "bamboogle": {
27
+ "exact_match": 0.111
28
+ },
29
+ "fictionalhot": {
30
+ "exact_match": 0.030
31
+ }
32
+ }
33
+ }
eval-results/ZeroSearch_Qwen2.5-7b-Instruct.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "ZeroSearch-Qwen2.5-7b-Instruct",
5
+ "model_sha": "main"
6
+ },
7
+ "results": {
8
+ "nq": {
9
+ "exact_match": 0.436
10
+ },
11
+ "triviaqa": {
12
+ "exact_match": 0.652
13
+ },
14
+ "popqa": {
15
+ "exact_match": 0.488
16
+ },
17
+ "hotpotqa": {
18
+ "exact_match": 0.346
19
+ },
20
+ "2wiki": {
21
+ "exact_match": 0.352
22
+ },
23
+ "musique": {
24
+ "exact_match": 0.184
25
+ },
26
+ "bamboogle": {
27
+ "exact_match": 0.278
28
+ },
29
+ "fictionalhot": {
30
+ "exact_match": 0.031
31
+ }
32
+ }
33
+ }
eval-results/demo-leaderboard/gpt2-demo/results_2023-11-21T18-10-08.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "demo-leaderboard/gpt2-demo",
5
+ "model_sha": "ac3299b02780836378b9e1e68c6eead546e89f90"
6
+ },
7
+ "results": {
8
+ "anli_r1": {
9
+ "acc": 0
10
+ },
11
+ "logiqa": {
12
+ "acc_norm": 0.90
13
+ }
14
+ }
15
+ }
eval-results/demo-leaderboard/gpt2-demo/results_2023-11-22 15:46:20.425378.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "anli_r1": {
4
+ "acc": 0.4,
5
+ "acc_stderr": 0.11239029738980327
6
+ },
7
+ "logiqa": {
8
+ "acc": 0.35,
9
+ "acc_stderr": 0.10942433098048308,
10
+ "acc_norm": 0.3,
11
+ "acc_norm_stderr": 0.10513149660756933
12
+ }
13
+ },
14
+ "versions": {
15
+ "anli_r1": 0,
16
+ "logiqa": 0
17
+ },
18
+ "config": {
19
+ "model": "hf-causal-experimental",
20
+ "model_args": "pretrained=demo-leaderboard/gpt2-demo,revision=main,dtype=bfloat16",
21
+ "num_fewshot": 0,
22
+ "batch_size": 1,
23
+ "batch_sizes": [],
24
+ "device": "cpu",
25
+ "no_cache": true,
26
+ "limit": 20,
27
+ "bootstrap_iters": 100000,
28
+ "description_dict": null,
29
+ "model_dtype": "bfloat16",
30
+ "model_name": "demo-leaderboard/gpt2-demo",
31
+ "model_sha": "main"
32
+ }
33
+ }