import glob
import json
import os
import tempfile
from functools import lru_cache

import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from huggingface_hub import snapshot_download

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.about import Tasks
from src.display.css_html_js import custom_css
from src.display.utils import (
    COLS,
    AutoEvalColumn,
    fields,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval

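# This module assembles the phoneme-recognition leaderboard Space: it syncs the
# evaluation queue/results snapshots from the Hub, builds the Gradio UI (leaderboard,
# simple results, about, and submission tabs), and schedules a periodic Space restart.
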
def restart_space():
    API.restart_space(repo_id=REPO_ID)


def _has_local_json(path: str) -> bool:
    try:
        return os.path.isdir(path) and any(str(f).endswith(".json") for f in os.listdir(path))
    except Exception:
        return False

# Download the evaluation queue and results snapshots from the Hub only when no
# local JSON copies are present; failures are ignored so the UI can still start.
if not _has_local_json(EVAL_REQUESTS_PATH):
    try:
        print(EVAL_REQUESTS_PATH)
        snapshot_download(
            repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
        )
    except Exception:
        pass

if not _has_local_json(EVAL_RESULTS_PATH):
    try:
        print(EVAL_RESULTS_PATH)
        snapshot_download(
            repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
        )
    except Exception:
        pass

BENCHMARK_COLS = [f"{task.value.col_name} ({task.name})" for task in Tasks]

EVAL_COLS = [
    "Model",
    "Model sha",
    "status",
    "precision",
    "weight_type",
    "model_type",
    "likes",
    "params",
    "license",
    "submitted_time",
]

# Gradio datatypes for the queue tables, aligned index-by-index with EVAL_COLS.
EVAL_TYPES = [
    "markdown",
    "str",
    "str",
    "str",
    "str",
    "str",
    "number",
    "number",
    "str",
    "str",
]

# The main leaderboard starts empty; COLS defines its schema.
LEADERBOARD_DF = pd.DataFrame(columns=COLS)

(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

@lru_cache(maxsize=1)
def _get_simple_dataset_keys(results_dir: str) -> tuple:
    """Cache dataset keys to avoid repeated file scanning."""
    # Note: lru_cache keeps this result for the lifetime of the process, so newly
    # added dataset keys only show up after a restart (or an explicit cache_clear()).
    all_dataset_keys = set()
    if not os.path.isdir(results_dir):
        return tuple()

    for path in glob.glob(os.path.join(results_dir, "*.json")):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            res = data.get("results", {})
            all_dataset_keys.update(res.keys())
        except Exception:
            continue

    return tuple(sorted(all_dataset_keys))

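# Sketch of the result-file shape these loaders expect, inferred from the parsing
# below (only "config.model_name", "results.<dataset>.per" and
# "results.<dataset>.avg_duration" are actually read; everything else is ignored):
# {
#   "config": {"model_name": "..."},
#   "results": {"<dataset_key>": {"per": <float>, "avg_duration": <float>}}
# }
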
def load_simple_results(results_dir: str) -> pd.DataFrame:
    """Load and process evaluation results from JSON files for the simple leaderboard, with caching."""
    rows = []
    all_dataset_keys = set(_get_simple_dataset_keys(results_dir))

    if not all_dataset_keys:
        return pd.DataFrame(columns=["Model", "Avg PER", "Avg Duration (s)"])

    dataset_display_names = {key: key for key in all_dataset_keys}

    for path in glob.glob(os.path.join(results_dir, "*.json")):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            cfg = data.get("config", {})
            res = data.get("results", {})

            model_name = cfg.get("model_name", "unknown")

            per_values = {}
            dur_values = []

            for dataset_key in all_dataset_keys:
                dataset_data = res.get(dataset_key, {})
                per_value = dataset_data.get("per") if dataset_data else None
                dur_value = dataset_data.get("avg_duration") if dataset_data else None

                display_name = dataset_display_names[dataset_key]
                per_values[f"PER {display_name}"] = per_value

                if dur_value is not None:
                    dur_values.append(dur_value)

            per_vals = [v for v in per_values.values() if v is not None]
            avg_per = sum(per_vals) / len(per_vals) if per_vals else None
            avg_dur = sum(dur_values) / len(dur_values) if dur_values else None

            row = {
                "Model": model_name,
                "Avg PER": avg_per,
                "Avg Duration (s)": avg_dur,
                "_file": os.path.basename(path),  # internal column, hidden from display
            }
            row.update(per_values)
            rows.append(row)

        except Exception:
            continue

    df = pd.DataFrame(rows)
    if df.empty:
        default_cols = ["Model", "Avg PER", "Avg Duration (s)"]
        for key in sorted(all_dataset_keys):
            display_name = dataset_display_names[key]
            default_cols.insert(-2, f"PER {display_name}")
        return pd.DataFrame(columns=default_cols)

    df = df.sort_values(by=["Avg PER"], ascending=True, na_position="last")
    return df.reset_index(drop=True)

def init_leaderboard(dataframe):
    if dataframe is None or dataframe.empty:
        dataframe = pd.DataFrame(columns=[c.name for c in fields(AutoEvalColumn)])
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
            ColumnFilter(
                AutoEvalColumn.params.name,
                type="slider",
                min=0.01,
                max=150,
                label="Select the number of parameters (B)",
            ),
            ColumnFilter(
                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
            ),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )

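# UI layout: a tabbed Gradio Blocks app with the main phoneme leaderboard, a simple
# results view driven by load_simple_results, an about page, and a submission form.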
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("Phoneme Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF)

        with gr.TabItem("Simple Results", elem_id="simple-results-tab", id=1):
            gr.Markdown("## Phoneme Detection Results")
            gr.Markdown("Compare phoneme recognition models across different datasets.")

            # Summary stat cards; their values are filled in by refresh_simple().
            with gr.Row():
                simple_total_models = gr.HTML(
                    '<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">-</div><div style="font-size: 0.9rem; opacity: 0.9;">Total Models</div></div>'
                )
                simple_best_per = gr.HTML(
                    '<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">-</div><div style="font-size: 0.9rem; opacity: 0.9;">Best PER</div></div>'
                )
                simple_avg_duration = gr.HTML(
                    '<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">-</div><div style="font-size: 0.9rem; opacity: 0.9;">Avg Duration</div></div>'
                )

            initial_df = load_simple_results(EVAL_RESULTS_PATH)
            if not initial_df.empty:
                # Drop internal bookkeeping columns (prefixed with "_") from the display.
                headers = [h for h in initial_df.columns if not h.startswith("_")]
            else:
                headers = ["Model", "Avg PER", "Avg Duration (s)"]

            with gr.Row():
                with gr.Column(scale=4):
                    simple_table = gr.Dataframe(
                        headers=headers,
                        row_count=10,
                        label="Model Performance Leaderboard",
                        interactive=False,
                    )

                with gr.Column(scale=1):
                    refresh_btn = gr.Button("Refresh Data", variant="primary")

                    with gr.Accordion("Export Data", open=False):
                        export_csv = gr.Button("Export CSV", variant="secondary")
                        export_json = gr.Button("Export JSON", variant="secondary")

            def refresh_simple():
                """Refresh the simple leaderboard data with enhanced stats."""
                df = load_simple_results(EVAL_RESULTS_PATH)

                if df.empty:
                    return df, "No data", "No data", "No data"

                # Hide internal bookkeeping columns (prefixed with "_") from the display.
                cols = [c for c in df.columns if not c.startswith("_")]

                total_models = len(df)
                best_per_val = df["Avg PER"].min() if "Avg PER" in df.columns and not df["Avg PER"].isna().all() else "N/A"
                avg_duration_val = df["Avg Duration (s)"].mean() if "Avg Duration (s)" in df.columns and not df["Avg Duration (s)"].isna().all() else "N/A"

                best_per_str = f"{best_per_val:.2f}" if isinstance(best_per_val, (int, float)) else str(best_per_val)
                avg_duration_str = f"{avg_duration_val:.2f}s" if isinstance(avg_duration_val, (int, float)) else str(avg_duration_val)

                return (
                    df[cols].round(3),
                    f'<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">{total_models}</div><div style="font-size: 0.9rem; opacity: 0.9;">Total Models</div></div>',
                    f'<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">{best_per_str}</div><div style="font-size: 0.9rem; opacity: 0.9;">Best PER</div></div>',
                    f'<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">{avg_duration_str}</div><div style="font-size: 0.9rem; opacity: 0.9;">Avg Duration</div></div>',
                )

            def export_simple_csv():
                """Export simple results as a downloadable CSV file."""
                df = load_simple_results(EVAL_RESULTS_PATH)
                if df.empty:
                    return None
                cols = [c for c in df.columns if not c.startswith("_")]
                # gr.File outputs expect a file path, so write the table to a temp file.
                out_path = os.path.join(tempfile.gettempdir(), "simple_results.csv")
                df[cols].round(3).to_csv(out_path, index=False)
                return out_path

            def export_simple_json():
                """Export simple results as a downloadable JSON file."""
                df = load_simple_results(EVAL_RESULTS_PATH)
                if df.empty:
                    return None
                cols = [c for c in df.columns if not c.startswith("_")]
                out_path = os.path.join(tempfile.gettempdir(), "simple_results.json")
                df[cols].round(3).to_json(out_path, orient="records", indent=2)
                return out_path

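            # Event wiring: the refresh button repopulates the table and stat cards,
            # while the export buttons hand a temporary file path to inline gr.File
            # components for download.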
            refresh_btn.click(
                fn=refresh_simple,
                outputs=[simple_table, simple_total_models, simple_best_per, simple_avg_duration],
            )

            export_csv.click(
                fn=export_simple_csv,
                outputs=gr.File(label="Download CSV"),
            )

            export_json.click(
                fn=export_simple_json,
                outputs=gr.File(label="Download JSON"),
            )

            # Seed the table and stat cards once at startup; the refresh button
            # re-runs the same function on demand.
            simple_table.value, simple_total_models.value, simple_best_per.value, simple_avg_duration.value = refresh_simple()

            with gr.Accordion("About this Leaderboard", open=False):
                gr.Markdown("""
## Understanding the Results

**Performance Metrics:**
- **PER (Phoneme Error Rate)**: lower values indicate better performance.
- **Avg Duration**: processing time per sample (lower is faster).
- Models are ranked by average PER across all datasets.

**Datasets Evaluated:**
- `phoneme_asr`: general phoneme recognition dataset.
- `kids_phoneme_md`: kids' phoneme recognition dataset.

**How to Interpret:**
- **PER**: percentage of phonemes incorrectly recognized (0% = perfect).
- **Duration**: time efficiency (important for real-time applications).
- **Average PER**: overall model performance across all datasets.

**Tips for Model Selection:**
- Choose models with low PER for accuracy-critical applications.
- Consider duration for real-time or resource-constrained environments.
- Balance accuracy (PER) against speed (Duration) based on your needs.
""")

        with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("Submit here!", elem_id="llm-benchmark-tab-table", id=3):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

                with gr.Column():
                    with gr.Accordion(
                        f"Finished Evaluations ({len(finished_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            finished_eval_table = gr.components.Dataframe(
                                value=finished_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"Running Evaluation Queue ({len(running_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            running_eval_table = gr.components.Dataframe(
                                value=running_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )

                    with gr.Accordion(
                        f"Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            pending_eval_table = gr.components.Dataframe(
                                value=pending_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )

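            # Submission form: collects model metadata and passes it to add_new_eval.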
            with gr.Row():
                gr.Markdown("# Submit your model here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=["Pretrained", "Fine-tuned", "Merge", "Other"],
                        label="Model type",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )

                with gr.Column():
                    precision = gr.Dropdown(
                        choices=["float16", "bfloat16", "float32", "int8", "int4"],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=["Original", "Delta", "Adapter"],
                        label="Weights type",
                        multiselect=False,
                        value="Original",
                        interactive=True,
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
            submit_button.click(
                add_new_eval,
                [
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                ],
                submission_result,
            )

    with gr.Row():
        with gr.Accordion("Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()