import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
import os

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    COLS,
    AutoEvalColumn,
    fields,
)
from src.about import Tasks
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval

# Imports for the simple leaderboard functionality
import glob
import json
import tempfile  # used by the CSV/JSON export handlers to write downloadable files
from functools import lru_cache

def restart_space():
    API.restart_space(repo_id=REPO_ID)


### Space initialisation (prefer local JSONs, fall back to the Hub)
def _has_local_json(path: str) -> bool:
    try:
        return os.path.isdir(path) and any(str(f).endswith(".json") for f in os.listdir(path))
    except Exception:
        return False


if not _has_local_json(EVAL_REQUESTS_PATH):
    try:
        print(EVAL_REQUESTS_PATH)
        snapshot_download(
            repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
        )
    except Exception:
        pass

if not _has_local_json(EVAL_RESULTS_PATH):
    try:
        print(EVAL_RESULTS_PATH)
        snapshot_download(
            repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
        )
    except Exception:
        pass
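
# NOTE: snapshot_download failures above are swallowed on purpose so the Space
# can still start with whatever JSON files are already present locally.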

# Build benchmark and evaluation queue column metadata
BENCHMARK_COLS = [f"{task.value.col_name} ({task.name})" for task in Tasks]
EVAL_COLS = [
    "Model",
    "Model sha",
    "status",
    "precision",
    "weight_type",
    "model_type",
    "likes",
    "params",
    "license",
    "submitted_time",
]
EVAL_TYPES = [
    "markdown",  # Model
    "str",  # Model sha
    "str",  # status
    "str",  # precision
    "str",  # weight_type
    "str",  # model_type
    "number",  # likes
    "number",  # params
    "str",  # license
    "str",  # submitted_time
]

# Hide all models from the leaderboard view
LEADERBOARD_DF = pd.DataFrame(columns=COLS)

(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


@lru_cache(maxsize=1)
def _get_simple_dataset_keys(results_dir: str) -> tuple:
    """Cache dataset keys to avoid repeated file scanning."""
    all_dataset_keys = set()
    if not os.path.isdir(results_dir):
        return tuple()
    for path in glob.glob(os.path.join(results_dir, "*.json")):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            res = data.get("results", {})
            all_dataset_keys.update(res.keys())
        except Exception:
            continue
    return tuple(sorted(all_dataset_keys))
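
# Illustrative shape of a result file consumed below (only the fields read by
# the loader matter; the values here are made up):
# {
#   "config":  {"model_name": "org/model"},
#   "results": {"<dataset key>": {"per": 12.3, "avg_duration": 0.45}, ...}
# }
# Because _get_simple_dataset_keys is cached per process (lru_cache), results
# for a brand-new dataset key only show up after the Space restarts.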


def load_simple_results(results_dir: str) -> pd.DataFrame:
    """Load and process evaluation results from JSON files for the simple leaderboard, with caching."""
    rows = []
    all_dataset_keys = set(_get_simple_dataset_keys(results_dir))
    if not all_dataset_keys:
        return pd.DataFrame(columns=["Model", "Avg PER", "Avg Duration (s)"])

    # Use dataset keys directly as display names
    dataset_display_names = {key: key for key in all_dataset_keys}

    # Single pass: extract data with optimized processing
    for path in glob.glob(os.path.join(results_dir, "*.json")):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            cfg = data.get("config", {})
            res = data.get("results", {})
            model_name = cfg.get("model_name", "unknown")

            # Extract PER for each dataset dynamically
            per_values = {}
            dur_values = []
            for dataset_key in all_dataset_keys:
                dataset_data = res.get(dataset_key, {})
                per_value = dataset_data.get("per") if dataset_data else None
                dur_value = dataset_data.get("avg_duration") if dataset_data else None
                display_name = dataset_display_names[dataset_key]
                per_values[f"PER {display_name}"] = per_value
                if dur_value is not None:
                    dur_values.append(dur_value)

            # Calculate average PER across all datasets
            per_vals = [v for v in per_values.values() if v is not None]
            avg_per = sum(per_vals) / len(per_vals) if per_vals else None

            # Calculate average duration
            avg_dur = sum(dur_values) / len(dur_values) if dur_values else None

            row = {
                "Model": model_name,
                "Avg PER": avg_per,
                "Avg Duration (s)": avg_dur,
                "_file": os.path.basename(path),
            }
            row.update(per_values)
            rows.append(row)
        except Exception:
            continue

    df = pd.DataFrame(rows)
    if df.empty:
        # Create default columns based on discovered datasets
        default_cols = ["Model", "Avg PER", "Avg Duration (s)"]
        for key in sorted(all_dataset_keys):
            display_name = dataset_display_names[key]
            default_cols.insert(-2, f"PER {display_name}")
        return pd.DataFrame(columns=default_cols)

    df = df.sort_values(by=["Avg PER"], ascending=True, na_position="last")
    return df.reset_index(drop=True)
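
# The full leaderboard tab is driven by the AutoEvalColumn schema; with
# LEADERBOARD_DF kept empty above, init_leaderboard renders the column layout
# without any rows.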


def init_leaderboard(dataframe):
    if dataframe is None or dataframe.empty:
        dataframe = pd.DataFrame(columns=[c.name for c in fields(AutoEvalColumn)])
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
            ColumnFilter(
                AutoEvalColumn.params.name,
                type="slider",
                min=0.01,
                max=150,
                label="Select the number of parameters (B)",
            ),
            ColumnFilter(
                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
            ),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )


demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("πŸ… Phoneme Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF)

        with gr.TabItem("πŸ“Š Simple Results", elem_id="simple-results-tab", id=1):
            gr.Markdown("## 🎯 Phoneme Detection Results")
            gr.Markdown("Compare phoneme recognition models across different datasets")

            # Stats section for simple results
            with gr.Row():
                simple_total_models = gr.HTML(
                    '<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">-</div><div style="font-size: 0.9rem; opacity: 0.9;">Total Models</div></div>'
                )
                simple_best_per = gr.HTML(
                    '<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">-</div><div style="font-size: 0.9rem; opacity: 0.9;">Best PER</div></div>'
                )
                simple_avg_duration = gr.HTML(
                    '<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">-</div><div style="font-size: 0.9rem; opacity: 0.9;">Avg Duration</div></div>'
                )

            # Get initial data to determine columns dynamically
            initial_df = load_simple_results(EVAL_RESULTS_PATH)
            if not initial_df.empty:
                headers = list(initial_df.columns)
                # Remove internal columns
                headers = [h for h in headers if not h.startswith('_')]
            else:
                headers = ["Model", "Avg PER", "Avg Duration (s)"]

            with gr.Row():
                with gr.Column(scale=4):
                    simple_table = gr.Dataframe(
                        headers=headers,
                        row_count=10,
                        label="πŸ† Model Performance Leaderboard",
                        interactive=False,
                    )
                with gr.Column(scale=1):
                    refresh_btn = gr.Button("πŸ”„ Refresh Data", variant="primary")

                    # Export options
                    with gr.Accordion("πŸ“₯ Export Data", open=False):
                        export_csv = gr.Button("πŸ“„ Export CSV", variant="secondary")
                        export_json = gr.Button("πŸ“‹ Export JSON", variant="secondary")
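
            # Callbacks for the simple leaderboard. Each handler re-reads the
            # result files from disk; refresh_simple returns its values in the
            # same order as the outputs wired to refresh_btn.click below.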
            def refresh_simple():
                """Refresh the simple leaderboard data with enhanced stats."""
                df = load_simple_results(EVAL_RESULTS_PATH)
                if df.empty:
                    return df, "No data", "No data", "No data"

                # Get the column order from the dataframe
                cols = [c for c in df.columns if not c.startswith('_')]
                # Ensure all columns exist for the dataframe component
                for c in cols:
                    if c not in df.columns:
                        df[c] = None

                # Calculate enhanced stats
                total_models = len(df)
                best_per_val = df['Avg PER'].min() if 'Avg PER' in df.columns and not df['Avg PER'].isna().all() else "N/A"
                avg_duration_val = df['Avg Duration (s)'].mean() if 'Avg Duration (s)' in df.columns and not df['Avg Duration (s)'].isna().all() else "N/A"

                # Format stats
                best_per_str = f"{best_per_val:.2f}" if isinstance(best_per_val, (int, float)) else str(best_per_val)
                avg_duration_str = f"{avg_duration_val:.2f}s" if isinstance(avg_duration_val, (int, float)) else str(avg_duration_val)

                return (
                    df[cols].round(3),
                    f'<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">{total_models}</div><div style="font-size: 0.9rem; opacity: 0.9;">Total Models</div></div>',
                    f'<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">{best_per_str}</div><div style="font-size: 0.9rem; opacity: 0.9;">Best PER</div></div>',
                    f'<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">{avg_duration_str}</div><div style="font-size: 0.9rem; opacity: 0.9;">Avg Duration</div></div>',
                )

            def export_simple_csv():
                """Export simple results as a downloadable CSV file."""
                df = load_simple_results(EVAL_RESULTS_PATH)
                if df.empty:
                    return None
                cols = [c for c in df.columns if not c.startswith('_')]
                # gr.File outputs expect a file path, so write the table to a temp file first
                path = os.path.join(tempfile.gettempdir(), "simple_results.csv")
                df[cols].round(3).to_csv(path, index=False)
                return path

            def export_simple_json():
                """Export simple results as a downloadable JSON file."""
                df = load_simple_results(EVAL_RESULTS_PATH)
                if df.empty:
                    return None
                cols = [c for c in df.columns if not c.startswith('_')]
                path = os.path.join(tempfile.gettempdir(), "simple_results.json")
                df[cols].round(3).to_json(path, orient='records', indent=2)
                return path

            # Connect events
            refresh_btn.click(
                fn=refresh_simple,
                outputs=[simple_table, simple_total_models, simple_best_per, simple_avg_duration],
            )
            export_csv.click(
                fn=export_simple_csv,
                outputs=gr.File(label="Download CSV"),
            )
            export_json.click(
                fn=export_simple_json,
                outputs=gr.File(label="Download JSON"),
            )

            # Auto-load on start
            simple_table.value, simple_total_models.value, simple_best_per.value, simple_avg_duration.value = refresh_simple()

            # Enhanced help section
            with gr.Accordion("ℹ️ About this Leaderboard", open=False):
                gr.Markdown("""
## πŸ“Š Understanding the Results

**Performance Metrics:**
- **PER (Phoneme Error Rate)**: Lower values indicate better performance
- **Avg Duration**: Processing time per sample (lower is faster)
- **Models are ranked by average PER across all datasets**

**Datasets Evaluated:**
- `phoneme_asr`: General phoneme recognition dataset
- `kids_phoneme_md`: Kids' phoneme recognition dataset

**How to Interpret:**
- **PER**: Percentage of phonemes incorrectly recognized (0% = perfect)
- **Duration**: Time efficiency (important for real-time applications)
- **Average PER**: Overall model performance across all datasets

**Tips for Model Selection:**
- Choose models with low PER for accuracy-critical applications
- Consider duration for real-time or resource-constrained environments
- Balance between accuracy (PER) and speed (Duration) based on your needs
""")

        with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("πŸš€ Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

                with gr.Column():
                    with gr.Accordion(
                        f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            finished_eval_table = gr.components.Dataframe(
                                value=finished_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            running_eval_table = gr.components.Dataframe(
                                value=running_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            pending_eval_table = gr.components.Dataframe(
                                value=pending_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )

            with gr.Row():
                gr.Markdown("# βœ‰οΈβœ¨ Submit your model here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=["Pretrained", "Fine-tuned", "Merge", "Other"],
                        label="Model type",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )

                with gr.Column():
                    precision = gr.Dropdown(
                        choices=["float16", "bfloat16", "float32", "int8", "int4"],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=["Original", "Delta", "Adapter"],
                        label="Weights type",
                        multiselect=False,
                        value="Original",
                        interactive=True,
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
            submit_button.click(
                add_new_eval,
                [
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                ],
                submission_result,
            )

    with gr.Row():
        with gr.Accordion("πŸ“™ Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()
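
# Local run (a sketch; assumes the src/ package is importable and the repo ids /
# token read in src/envs.py are configured in the environment):
#   python app.py
# Gradio serves the UI on http://127.0.0.1:7860 by default.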