import threading
import time
from typing import List, Optional

import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler

import globals
from utils.io import (
    save_results,
    load_results,
    load_models_providers,
    get_results_table,
    load_models_providers_str,
    get_summary_stats,
)
from utils.jobs import (
    run_single_job,
    run_multiple_jobs,
    launch_jobs,
    update_job_statuses,
    relaunch_failed_jobs,
)


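# Runs in a daemon thread started from __main__; polls job statuses every 4 minutes.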
def status_monitor() -> None:
    """Background thread to monitor job statuses."""
    while True:
        update_job_statuses()
        time.sleep(240)


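# Invoked by the APScheduler cron trigger configured in __main__ (daily at 00:00).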
def daily_checkpoint() -> None:
    """Daily checkpoint - save current state."""
    print("Daily checkpoint - saving current state")
    save_results()


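# Builds the Gradio UI: a "Main" tab with job controls and a live results table,
# plus an "About" tab documenting the evaluation setup.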
def create_app() -> gr.Blocks:
    with gr.Blocks(title="Inference Provider Testing Dashboard") as demo:
        with gr.Tab("Main"):
            gr.Markdown("# Inference Provider Testing Dashboard")
            gr.Markdown("Launch and monitor evaluation jobs for multiple models and providers.")

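            # Single-job launcher: model ID and provider inputs plus a launch button.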
            with gr.Row():
                with gr.Column(scale=2):
                    model_input = gr.Textbox(
                        label="Model",
                        placeholder="e.g., meta-llama/Llama-3.3-70B-Instruct",
                        info="Enter HuggingFace model ID"
                    )
                with gr.Column(scale=1):
                    provider_input = gr.Textbox(
                        label="Provider",
                        placeholder="e.g., together-ai",
                        info="Enter inference provider name"
                    )
                with gr.Column(scale=1):
                    launch_single_btn = gr.Button("Launch Job", variant="primary")

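            # Bulk controls: launch the full job set, retry failed jobs, or refresh the view.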
            with gr.Row():
                launch_btn = gr.Button("Launch All Jobs", variant="secondary", scale=2)
                relaunch_failed_btn = gr.Button("Relaunch Failed", variant="stop", scale=1)
                refresh_btn = gr.Button("🔄 Refresh", variant="secondary", scale=1)

            output = gr.Textbox(label="Status", interactive=False)

            summary_stats = gr.Markdown(value=get_summary_stats())

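            # Results table: columns 0-10 are static; clicking the last column (index 11)
            # relaunches that row's jobs via handle_table_select below.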
            with gr.Row():
                with gr.Column():
                    gr.Markdown("## Job Results")
                    results_table = gr.Dataframe(
                        value=get_results_table(),
                        interactive=True,
                        show_search="search",
                        show_copy_button=True,
                        show_fullscreen_button=True,
                        wrap=True,
                        static_columns=list(range(11)),
                        datatype=["str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "html", "str"],
                        elem_id="results_table"
                    )

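            # Event callbacks: each returns fresh table and stats values so the
            # display updates in a single round trip.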
            def launch_single_and_update(model: str, provider: str):
                """Launch multiple jobs for a model-provider combination and return updated table and stats."""
                if not model or not provider:
                    return "❌ Please provide both model and provider", get_results_table(), get_summary_stats()

                job_ids = run_multiple_jobs(model, provider, globals.TASKS)
                if not job_ids:
                    return "❌ Failed to launch jobs (may already be running)", get_results_table(), get_summary_stats()

                save_results()
                return f"✅ Launched {len(job_ids)} jobs for {model} on {provider}", get_results_table(), get_summary_stats()

            launch_single_btn.click(
                fn=launch_single_and_update,
                inputs=[model_input, provider_input],
                outputs=[output, results_table, summary_stats]
            )

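            # Bulk-launch and relaunch callbacks share the same outputs as the single launcher.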
            def launch_and_update():
                """Launch jobs and return updated table and stats."""
                result = launch_jobs()
                return result, get_results_table(), get_summary_stats()

            def relaunch_and_update():
                """Relaunch failed jobs and return updated table and stats."""
                result = relaunch_failed_jobs()
                return result, get_results_table(), get_summary_stats()

            launch_btn.click(
                fn=launch_and_update,
                outputs=[output, results_table, summary_stats]
            )

            relaunch_failed_btn.click(
                fn=relaunch_and_update,
                outputs=[output, results_table, summary_stats]
            )

            def refresh_display():
                """Refresh the table and stats display."""
                return get_results_table(), get_summary_stats()

            refresh_btn.click(
                fn=refresh_display,
                outputs=[results_table, summary_stats]
            )

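            # Table click handler: only clicks on column index 11 trigger a relaunch.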
            def handle_table_select(evt: gr.SelectData):
                """Handle when a cell in the results table is clicked."""
                print(f"[Relaunch] Cell selected - Row: {evt.index[0]}, Col: {evt.index[1]}, Value: {evt.value}")

                if evt.index[1] == 11:
                    df = get_results_table()
                    row_data = df.data.iloc[evt.index[0]]

                    model = row_data['Model']
                    provider = row_data['Provider']
                    print(f"[Relaunch] Relaunching {globals.NUM_RUNS_PER_JOB} jobs - Model: {model}, Provider: {provider}")

                    run_multiple_jobs(model, provider, globals.TASKS)
                    save_results()

                return get_results_table(), get_summary_stats()

            results_table.select(
                fn=handle_table_select,
                inputs=[],
                outputs=[results_table, summary_stats]
            )

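            # A 30-second gr.Timer keeps the table and stats current without manual refreshes.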
            def auto_refresh():
                """Auto-refresh table and summary stats."""
                return get_results_table(), get_summary_stats()

            timer = gr.Timer(value=30, active=True)
            timer.tick(
                fn=auto_refresh,
                inputs=[],
                outputs=[results_table, summary_stats]
            )

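        # The "About" tab documents the evaluation setup and how to reproduce a run locally.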
        with gr.Tab("About"):
            gr.Markdown("""
In this demo, we run 10 samples for each of 3 evaluations: ifeval (instruction following), gsm_plus (grade-school math problems, less contaminated than gsm8k), and the diamond subset of gpqa (knowledge), using `lighteval`, `inference-providers`, and `jobs`.

The "status" column indicates whether the evaluation failed completely (usually because the provider was down or because we were rate limited).

To run any of these locally, you can use the following:

```python
from huggingface_hub import run_job, inspect_job, whoami

job = run_job(
    image="hf.co/spaces/OpenEvals/EvalsOnTheHub",
    command=[
        "lighteval", "endpoint", "inference-providers",
        "model_name=MODEL,provider=PROVIDER",
        "extended|ifeval|0,lighteval|gpqa:diamond|0",
        "--push-to-hub", "--save-details",
        "--results-org", "YOURORG"
    ],
    namespace="huggingface",
    secrets={"HF_TOKEN": YOURTOKEN},
    token=YOURTOKEN
)
```
""")

    return demo


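# Entry point: restore saved results, start the status monitor thread and the daily
# checkpoint scheduler, then serve the dashboard.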
if __name__ == "__main__":
    load_results()
    print("Starting Inference Provider Testing Dashboard")

    monitor_thread = threading.Thread(target=status_monitor, daemon=True)
    monitor_thread.start()
    print("Job status monitor started")

    scheduler = BackgroundScheduler()
    scheduler.add_job(daily_checkpoint, 'cron', hour=0, minute=0)
    scheduler.start()
    print("Daily checkpoint scheduler started (saves at 00:00)")

    demo = create_app()
    demo.launch(server_name="0.0.0.0", server_port=7860)