Commit c797bf2 · Parent: 3f46491
Add 'Reasoning' tag for model names

Files changed:
- app.py +3 -3
- results/parse.py +56 -7
- results/results_icarus.json +0 -0
- results/results_verilator.json +0 -0
- utils.py +27 -9
app.py CHANGED

@@ -291,14 +291,14 @@ with gr.Blocks(
             show_row_numbers=True,
             wrap=True,
             datatype=[
-                "
+                "html",
                 "html",
             ],
             interactive=False,
             column_widths=[
                 "7%",
-                "
-                "
+                "28%",
+                "13%",
                 "10%",
                 "13%",
                 "10%",
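For context, a minimal sketch of how these lists fit together in the leaderboard table, assuming Gradio's gr.Dataframe API; the header names and the non-HTML datatypes here are illustrative, not taken from app.py. Both leading columns are rendered as HTML so the model cell can carry the hyperlink-plus-badge markup, and the second column (likely the Model column) is widened to 28%.

import gradio as gr

# Illustrative column set; the real headers live elsewhere in app.py.
headers = ["Type", "Model", "Params", "Release", "Benchmark A", "Benchmark B"]

leaderboard = gr.Dataframe(
    headers=headers,
    show_row_numbers=True,
    wrap=True,
    # Columns rendered as raw HTML; the Model cell holds the hyperlink + badge markup.
    datatype=["html", "html", "number", "str", "number", "number"],
    interactive=False,
    # Widths from the hunk above: the second column grows to 28%
    # so the "new" / "(reasoning)" badges fit on one line.
    column_widths=["7%", "28%", "13%", "10%", "13%", "10%"],
)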
results/parse.py CHANGED

@@ -13,177 +13,224 @@ model_details = {
         685,
         "General",
         "V2",
+        "Reasoning",  # "Dense" or "Reasoning"
     ),
     "DeepSeek R1": (
         "https://huggingface.co/deepseek-ai/DeepSeek-R1",
         685,
         "General",
         "V1",
+        "Reasoning",
     ),
     "Llama 3.1 405B": (
         "https://huggingface.co/RedHatAI/Meta-Llama-3.1-405B-FP8",
         406,
         "General",
         "V1",
+        "Dense",
     ),
     "Qwen3 236B A22B": (
         "https://huggingface.co/Qwen/Qwen3-235B-A22B",
         235,
         "General",
         "V2",
+        "Reasoning",
     ),
     "Llama 3.(1-3) 70B": (
         "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct",
         70.6,
         "General",
         "V1",
+        "Dense",
     ),
     "Qwen2.5 72B": (
         "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
         72.7,
         "General",
         "V1",
+        "Dense",
+    ),
+    "QwQ 32B": (
+        "https://huggingface.co/Qwen/QwQ-32B",
+        32.8,
+        "General",
+        "V2",
+        "Reasoning",
+    ),
+    "Qwen2.5 32B": (
+        "https://huggingface.co/Qwen/Qwen2.5-32B",
+        32.5,
+        "General",
+        "V1",
+        "Dense",
-    "QwQ 32B": ("https://huggingface.co/Qwen/QwQ-32B", 32.8, "General", "V2"),
-    "Qwen2.5 32B": ("https://huggingface.co/Qwen/Qwen2.5-32B", 32.5, "General", "V1"),
     ),
     "StarChat2 15B v0.1": (
         "https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1",
         16,
         "General",
         "V1",
+        "Dense",
     ),
     "DeepSeek R1 Distill Qwen 14B": (
         "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
         14.8,
         "General",
         "V1",
+        "Reasoning",
     ),
     "CodeLlama 70B": (
         "https://huggingface.co/codellama/CodeLlama-70b-hf",
         69,
         "Coding",
         "V1",
+        "Dense",
     ),
     "QwenCoder 2.5 32B": (
         "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct",
         32.5,
         "Coding",
         "V1",
+        "Dense",
     ),
     "DeepSeek Coder 33B": (
         "https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct",
         33.3,
         "Coding",
         "V1",
+        "Dense",
     ),
     "QwenCoder 2.5 14B": (
         "https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct",
         14.7,
         "Coding",
         "V1",
+        "Dense",
     ),
     "DeepCoder 14B": (
         "https://huggingface.co/agentica-org/DeepCoder-14B-Preview",
         14.8,
         "Coding",
         "V2",
+        "Reasoning",
     ),
     "OpenCoder 8B": (
         "https://huggingface.co/infly/OpenCoder-8B-Instruct",
         7.77,
         "Coding",
         "V1",
+        "Dense",
     ),
     "SeedCoder 8B": (
         "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Instruct",
         8.25,
         "Coding",
         "V2",
+        "Dense",
     ),
     "SeedCoder 8B Reasoning": (
         "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Reasoning-bf16",
         8.25,
         "Coding",
         "V2",
+        "Reasoning",
     ),
     "QwenCoder 2.5 7B": (
         "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct",
         7.61,
         "Coding",
         "V1",
+        "Dense",
     ),
     "DeepSeek Coder 6.7B": (
         "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct",
         6.74,
         "Coding",
         "V1",
+        "Dense",
     ),
     "HaVen-CodeQwen": (
         "https://huggingface.co/yangyiyao/HaVen-CodeQwen",
         7.25,
         "RTL-Specific",
         "V1",
+        "Dense",
     ),
     "CodeV R1 Distill Qwen 7B": (
         "https://huggingface.co/zhuyaoyu/CodeV-R1-Distill-Qwen-7B",
         7.62,
         "RTL-Specific",
         "V2",
+        "Reasoning",
     ),
     "CodeV-CL-7B": (
         "https://huggingface.co/yang-z/CodeV-CL-7B",
         6.74,
         "RTL-Specific",
         "V1",
+        "Dense",
     ),
     "CodeV-QW-7B": (
         "https://huggingface.co/yang-z/CodeV-QW-7B",
         7.25,
         "RTL-Specific",
         "V1",
+        "Dense",
     ),
     "CodeV-DS-6.7B": (
         "https://huggingface.co/yang-z/CodeV-DS-6.7B",
         6.74,
         "RTL-Specific",
         "V1",
+        "Dense",
     ),
     "RTLCoder Mistral": (
         "https://huggingface.co/ishorn5/RTLCoder-v1.1",
         7.24,
         "RTL-Specific",
         "V1",
+        "Dense",
     ),
     "RTLCoder DeepSeek": (
         "https://huggingface.co/ishorn5/RTLCoder-Deepseek-v1.1",
         6.74,
         "RTL-Specific",
         "V1",
+        "Dense",
+    ),
+    "OriGen": (
+        "https://huggingface.co/henryen/OriGen",
+        6.74,
+        "RTL-Specific",
+        "V1",
+        "Dense",
     ),
-    "OriGen": ("https://huggingface.co/henryen/OriGen", 6.74, "RTL-Specific", "V1"),
     "Qwen3 Coder 480B A35B": (
         "https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct",
         480,
         "Coding",
         "V2",
+        "Dense",
     ),
     "Magistral Small 2506": (
         "https://huggingface.co/mistralai/Magistral-Small-2506",
         23.6,
         "General",
         "V2",
+        "Reasoning",
     ),
     "gpt-oss-20b": (
         "https://huggingface.co/openai/gpt-oss-20b",
         21.5,
         "General",
         "V2",
+        "Reasoning",
     ),
     "gpt-oss-120b": (
         "https://huggingface.co/openai/gpt-oss-120b",
         120,
         "General",
         "V2",
+        "Reasoning",
     ),
 }
 

@@ -201,14 +248,15 @@ def get_headers(reader, agg=False) -> Union[list, list]:
     return metrics, benchs
 
 
-def get_model_params_and_url(model) -> Union[str, str, float, str]:
+def get_model_params_and_url(model) -> Union[str, str, float, str, str]:
     if model not in model_details:
-        return "-", 0.0, "
+        return "-", 0.0, "-", "-", "-"
     url = model_details[model][0]
     params = model_details[model][1]
     type = model_details[model][2]
     release = model_details[model][3]
-
+    reasoning = model_details[model][4]
+    return url, params, type, release, reasoning
 
 
 def parse_results(csv_path: str) -> list[dict]:

@@ -227,7 +275,7 @@ def parse_results(csv_path: str) -> list[dict]:
        model = row[0]
        if not model:
            continue
-       url, params, type, release = get_model_params_and_url(model)
+       url, params, type, release, reasoning = get_model_params_and_url(model)
        models.append(model)
        row = row[1:]
        ctr = 0

@@ -243,6 +291,7 @@ def parse_results(csv_path: str) -> list[dict]:
            record["Model URL"] = url
            record["Params"] = params
            record["Release"] = release
+           record["Thinking"] = reasoning
            dataset.append(record)
            ctr += 1
    print(models)
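A minimal, self-contained sketch of the new data flow, using one entry copied from model_details above; the tuple unpacking is equivalent to the index-based lookups in get_model_params_and_url, and the CSV-parsing loop is omitted.

from typing import Union

# One entry copied from model_details above; the fifth field is the new tag.
model_details = {
    "gpt-oss-120b": (
        "https://huggingface.co/openai/gpt-oss-120b",
        120,
        "General",
        "V2",
        "Reasoning",  # "Dense" or "Reasoning"
    ),
}


def get_model_params_and_url(model) -> Union[str, str, float, str, str]:
    # Unknown models fall back to placeholder values, now five of them.
    if model not in model_details:
        return "-", 0.0, "-", "-", "-"
    # Equivalent to the index-based version in parse.py.
    url, params, type_, release, reasoning = model_details[model]
    return url, params, type_, release, reasoning


url, params, type_, release, reasoning = get_model_params_and_url("gpt-oss-120b")
record = {"Model URL": url, "Params": params, "Release": release, "Thinking": reasoning}
print(record)
# {'Model URL': 'https://huggingface.co/openai/gpt-oss-120b',
#  'Params': 120, 'Release': 'V2', 'Thinking': 'Reasoning'}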
results/results_icarus.json CHANGED
The diff for this file is too large to render. See raw diff.

results/results_verilator.json CHANGED
The diff for this file is too large to render. See raw diff.
utils.py CHANGED

@@ -15,11 +15,20 @@ type_emoji = {
 # fmt: on
 
 
-def model_hyperlink(link, model_name, release):
+def model_hyperlink(link, model_name, release, thinking=False):
+    ret = f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
     if release == "V1":
-        return
+        return ret
+    elif thinking == False:
+        return (
+            ret
+            + f' <span style="opacity: 0.7; font-variant: all-small-caps; font-weight: 600">new</span>'
+        )
     else:
-        return
+        return (
+            ret
+            + f' <span style="opacity: 0.7; font-variant: all-small-caps; font-weight: 600">new</span> <span style="opacity: 0.9; font-variant: all-small-caps; font-weight: 600; color: #5C6BC0">(reasoning)</span>'
+        )
 
 
 def handle_special_cases(benchmark, metric):

@@ -33,14 +42,18 @@ def handle_special_cases(benchmark, metric):
 def filter_RTLRepo(subset: pd.DataFrame) -> pd.DataFrame:
     subset = subset.drop(subset[subset.Score < 0.0].index)
     details = subset[
-        ["Model", "Model URL", "Model Type", "Params", "Release"]
+        ["Model", "Model URL", "Model Type", "Params", "Release", "Thinking"]
     ].drop_duplicates("Model")
     filtered_df = subset[["Model", "Score"]].rename(
         columns={"Score": "Exact Matching (EM)"}
     )
     filtered_df = pd.merge(filtered_df, details, on="Model", how="left")
     filtered_df["Model"] = filtered_df.apply(
-        lambda row: model_hyperlink(
+        lambda row: model_hyperlink(
+            row["Model URL"],
+            row["Model"],
+            row["Release"],
+        ),
         axis=1,
     )
     filtered_df["Type"] = filtered_df["Model Type"].map(lambda x: type_emoji.get(x, ""))

@@ -53,7 +66,7 @@ def filter_RTLRepo(subset: pd.DataFrame) -> pd.DataFrame:
 
 def filter_bench(subset: pd.DataFrame, df_agg=None, agg_column=None) -> pd.DataFrame:
     details = subset[
-        ["Model", "Model URL", "Model Type", "Params", "Release"]
+        ["Model", "Model URL", "Model Type", "Params", "Release", "Thinking"]
     ].drop_duplicates("Model")
     if "RTLLM" in subset["Benchmark"].unique():
         pivot_df = (

@@ -82,7 +95,9 @@ def filter_bench(subset: pd.DataFrame, df_agg=None, agg_column=None) -> pd.DataF
 
     pivot_df = pd.merge(pivot_df, details, on="Model", how="left")
     pivot_df["Model"] = pivot_df.apply(
-        lambda row: model_hyperlink(
+        lambda row: model_hyperlink(
+            row["Model URL"], row["Model"], row["Release"], row["Thinking"]
+        ),
         axis=1,
     )
     pivot_df["Type"] = pivot_df["Model Type"].map(lambda x: type_emoji.get(x, ""))

@@ -144,7 +159,7 @@ def filter_bench_all(
     subset: pd.DataFrame, df_agg=None, agg_column=None
 ) -> pd.DataFrame:
     details = subset[
-        ["Model", "Model URL", "Model Type", "Params", "Release"]
+        ["Model", "Model URL", "Model Type", "Params", "Release", "Thinking"]
     ].drop_duplicates("Model")
     if "RTLLM" in subset["Benchmark"].unique():
         pivot_df = (

@@ -164,8 +179,11 @@ def filter_bench_all(
     )
 
     pivot_df = pd.merge(pivot_df, details, on="Model", how="left")
+    print(pivot_df.columns)
     pivot_df["Model"] = pivot_df.apply(
-        lambda row: model_hyperlink(
+        lambda row: model_hyperlink(
+            row["Model URL"], row["Model"], row["Release"], row["Thinking"]
+        ),
         axis=1,
     )
     pivot_df["Type"] = pivot_df["Model Type"].map(lambda x: type_emoji.get(x, ""))
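To illustrate what the new badge logic produces, a short usage sketch of model_hyperlink as defined above; the link and model names are illustrative. The three calls mirror how the filter functions invoke it: filter_RTLRepo omits the thinking argument, while filter_bench and filter_bench_all pass the "Thinking" column value.

# Assumes model_hyperlink from utils.py above; the URL here is illustrative.
link = "https://huggingface.co/openai/gpt-oss-120b"

# V1 model: plain dotted-underline anchor, no badge.
print(model_hyperlink(link, "Some V1 Model", "V1"))

# V2 model with the default thinking=False: anchor plus a small-caps "new" badge.
print(model_hyperlink(link, "gpt-oss-120b", "V2"))

# V2 model with a truthy thinking value: "new" badge plus the colored "(reasoning)" badge.
# Note: any value other than False (e.g. the strings "Dense" or "Reasoning" coming from
# the "Thinking" column) takes this branch as written.
print(model_hyperlink(link, "gpt-oss-120b", "V2", thinking="Reasoning"))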