import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
import os

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    COLS,
    AutoEvalColumn,
    fields,
)
from src.about import Tasks
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval

# Imports for the simple leaderboard helpers below
import glob
import json
import tempfile
from functools import lru_cache


def restart_space():
    API.restart_space(repo_id=REPO_ID)


### Space initialisation (prefer local JSONs, fall back to Hub)
def _has_local_json(path: str) -> bool:
    try:
        return os.path.isdir(path) and any(str(f).endswith(".json") for f in os.listdir(path))
    except Exception:
        return False


if not _has_local_json(EVAL_REQUESTS_PATH):
    try:
        print(EVAL_REQUESTS_PATH)
        snapshot_download(
            repo_id=QUEUE_REPO,
            local_dir=EVAL_REQUESTS_PATH,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception:
        pass

if not _has_local_json(EVAL_RESULTS_PATH):
    try:
        print(EVAL_RESULTS_PATH)
        snapshot_download(
            repo_id=RESULTS_REPO,
            local_dir=EVAL_RESULTS_PATH,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception:
        pass

# Build benchmark and evaluation queue column metadata
BENCHMARK_COLS = [f"{task.value.col_name} ({task.name})" for task in Tasks]
EVAL_COLS = [
    "Model",
    "Model sha",
    "status",
    "precision",
    "weight_type",
    "model_type",
    "likes",
    "params",
    "license",
    "submitted_time",
]
EVAL_TYPES = [
    "markdown",  # Model
    "str",       # Model sha
    "str",       # status
    "str",       # precision
    "str",       # weight_type
    "str",       # model_type
    "number",    # likes
    "number",    # params
    "str",       # license
    "str",       # submitted_time
]

# Hide all models from the leaderboard view
LEADERBOARD_DF = pd.DataFrame(columns=COLS)

(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


@lru_cache(maxsize=1)
def _get_simple_dataset_keys(results_dir: str) -> tuple:
    """Cache dataset keys to avoid repeated file scanning."""
    all_dataset_keys = set()
    if not os.path.isdir(results_dir):
        return tuple()
    for path in glob.glob(os.path.join(results_dir, "*.json")):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            res = data.get("results", {})
            all_dataset_keys.update(res.keys())
        except Exception:
            continue
    return tuple(sorted(all_dataset_keys))


def load_simple_results(results_dir: str) -> pd.DataFrame:
    """Load and process evaluation results from JSON files for the simple leaderboard, with caching.

    Each results file is expected to contain a "config" block with a "model_name"
    and a "results" block mapping dataset keys to {"per": ..., "avg_duration": ...}.
    """
    rows = []
    all_dataset_keys = set(_get_simple_dataset_keys(results_dir))
    if not all_dataset_keys:
        return pd.DataFrame(columns=["Model", "Avg PER", "Avg Duration (s)"])

    # Use dataset keys directly as display names
    dataset_display_names = {key: key for key in all_dataset_keys}

    # Single pass over the result files
    for path in glob.glob(os.path.join(results_dir, "*.json")):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            cfg = data.get("config", {})
            res = data.get("results", {})
            model_name = cfg.get("model_name", "unknown")

            # Extract PER for each dataset dynamically
            per_values = {}
            dur_values = []
            for dataset_key in all_dataset_keys:
                dataset_data = res.get(dataset_key, {})
                per_value = dataset_data.get("per") if dataset_data else None
                dur_value = dataset_data.get("avg_duration") if dataset_data else None
                display_name = dataset_display_names[dataset_key]
                per_values[f"PER {display_name}"] = per_value
                if dur_value is not None:
                    dur_values.append(dur_value)

            # Calculate average PER across all datasets
            per_vals = [v for v in per_values.values() if v is not None]
            avg_per = sum(per_vals) / len(per_vals) if per_vals else None

            # Calculate average duration
            avg_dur = sum(dur_values) / len(dur_values) if dur_values else None

            row = {
                "Model": model_name,
                "Avg PER": avg_per,
                "Avg Duration (s)": avg_dur,
                "_file": os.path.basename(path),
            }
            row.update(per_values)
            rows.append(row)
        except Exception:
            continue

    df = pd.DataFrame(rows)
    if df.empty:
        # Create default columns based on discovered datasets
        default_cols = ["Model", "Avg PER", "Avg Duration (s)"]
        for key in sorted(all_dataset_keys):
            display_name = dataset_display_names[key]
            default_cols.insert(-2, f"PER {display_name}")
        return pd.DataFrame(columns=default_cols)

    df = df.sort_values(by=["Avg PER"], ascending=True, na_position="last")
    return df.reset_index(drop=True)


def init_leaderboard(dataframe):
    if dataframe is None or dataframe.empty:
        dataframe = pd.DataFrame(columns=[c.name for c in fields(AutoEvalColumn)])
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
            ColumnFilter(
                AutoEvalColumn.params.name,
                type="slider",
                min=0.01,
                max=150,
                label="Select the number of parameters (B)",
            ),
            ColumnFilter(
                AutoEvalColumn.still_on_hub.name,
                type="boolean",
                label="Deleted/incomplete",
                default=True,
            ),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )


demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Phoneme Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF)

        with gr.TabItem("📊 Simple Results", elem_id="simple-results-tab", id=1):
            gr.Markdown("## đŸŽ¯ Phoneme Detection Results")
            gr.Markdown("Compare phoneme recognition models across different datasets.")

            # Stats section for the simple results
            with gr.Row():
                simple_total_models = gr.HTML("<div><strong>-</strong><br>Total Models</div>")
                simple_best_per = gr.HTML("<div><strong>-</strong><br>Best PER</div>")
                simple_avg_duration = gr.HTML("<div><strong>-</strong><br>Avg Duration</div>")

            # Get initial data to determine columns dynamically
            initial_df = load_simple_results(EVAL_RESULTS_PATH)
            if not initial_df.empty:
                # Remove internal columns (prefixed with "_")
                headers = [h for h in initial_df.columns if not h.startswith("_")]
            else:
                headers = ["Model", "Avg PER", "Avg Duration (s)"]

            with gr.Row():
                with gr.Column(scale=4):
                    simple_table = gr.Dataframe(
                        headers=headers,
                        row_count=10,
                        label="🏆 Model Performance Leaderboard",
                        interactive=False,
                    )
                with gr.Column(scale=1):
                    refresh_btn = gr.Button("🔄 Refresh Data", variant="primary")

                    # Export options
                    with gr.Accordion("đŸ“Ĩ Export Data", open=False):
                        export_csv = gr.Button("📄 Export CSV", variant="secondary")
                        export_json = gr.Button("📋 Export JSON", variant="secondary")

            def refresh_simple():
                """Refresh the simple leaderboard table and the summary stat cards."""
                df = load_simple_results(EVAL_RESULTS_PATH)
                if df.empty:
                    return df, "No data", "No data", "No data"

                # Keep the dataframe's column order, dropping internal columns
                cols = [c for c in df.columns if not c.startswith("_")]

                # Calculate summary stats
                total_models = len(df)
                best_per_val = (
                    df["Avg PER"].min()
                    if "Avg PER" in df.columns and not df["Avg PER"].isna().all()
                    else "N/A"
                )
                avg_duration_val = (
                    df["Avg Duration (s)"].mean()
                    if "Avg Duration (s)" in df.columns and not df["Avg Duration (s)"].isna().all()
                    else "N/A"
                )

                # Format stats
                best_per_str = f"{best_per_val:.2f}" if isinstance(best_per_val, (int, float)) else str(best_per_val)
                avg_duration_str = (
                    f"{avg_duration_val:.2f}s" if isinstance(avg_duration_val, (int, float)) else str(avg_duration_val)
                )

                return (
                    df[cols].round(3),
                    f"<div><strong>{total_models}</strong><br>Total Models</div>",
                    f"<div><strong>{best_per_str}</strong><br>Best PER</div>",
                    f"<div><strong>{avg_duration_str}</strong><br>Avg Duration</div>",
                )

            def export_simple_csv():
                """Export the simple results as a downloadable CSV file."""
                df = load_simple_results(EVAL_RESULTS_PATH)
                if df.empty:
                    return None
                cols = [c for c in df.columns if not c.startswith("_")]
                # gr.File outputs expect a file path, so write to a temporary file
                fd, path = tempfile.mkstemp(suffix=".csv")
                os.close(fd)
                df[cols].round(3).to_csv(path, index=False)
                return path

            def export_simple_json():
                """Export the simple results as a downloadable JSON file."""
                df = load_simple_results(EVAL_RESULTS_PATH)
                if df.empty:
                    return None
                cols = [c for c in df.columns if not c.startswith("_")]
                fd, path = tempfile.mkstemp(suffix=".json")
                os.close(fd)
                df[cols].round(3).to_json(path, orient="records", indent=2)
                return path

            # Connect events
            refresh_btn.click(
                fn=refresh_simple,
                outputs=[simple_table, simple_total_models, simple_best_per, simple_avg_duration],
            )
            export_csv.click(fn=export_simple_csv, outputs=gr.File(label="Download CSV"))
            export_json.click(fn=export_simple_json, outputs=gr.File(label="Download JSON"))

            # Auto-load the table and stat cards on app start
            demo.load(
                refresh_simple,
                outputs=[simple_table, simple_total_models, simple_best_per, simple_avg_duration],
            )

            # Help section
            with gr.Accordion("â„šī¸ About this Leaderboard", open=False):
                gr.Markdown("""
                ## 📊 Understanding the Results

                **Performance Metrics:**
                - **PER (Phoneme Error Rate)**: Lower values indicate better performance
                - **Avg Duration**: Processing time per sample (lower is faster)
                - **Models are ranked by average PER across all datasets**

                **Datasets Evaluated:**
                - `phoneme_asr`: General phoneme recognition dataset
                - `kids_phoneme_md`: Kids' phoneme recognition dataset

                **How to Interpret:**
                - **PER**: Percentage of phonemes incorrectly recognized (0% = perfect)
                - **Duration**: Time efficiency (important for real-time applications)
                - **Average PER**: Overall model performance across all datasets

                **Tips for Model Selection:**
                - Choose models with low PER for accuracy-critical applications
                - Consider duration for real-time or resource-constrained environments
                - Balance accuracy (PER) against speed (Duration) based on your needs
                """)

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit here!", elem_id="llm-benchmark-tab-table", id=3):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

                with gr.Column():
                    with gr.Accordion(
                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            finished_eval_table = gr.components.Dataframe(
                                value=finished_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            running_eval_table = gr.components.Dataframe(
                                value=running_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"âŗ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            pending_eval_table = gr.components.Dataframe(
                                value=pending_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )

            with gr.Row():
                gr.Markdown("# âœ‰ī¸âœ¨ Submit your model here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=["Pretrained", "Fine-tuned", "Merge", "Other"],
                        label="Model type",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )
                with gr.Column():
                    precision = gr.Dropdown(
                        choices=["float16", "bfloat16", "float32", "int8", "int4"],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=["Original", "Delta", "Adapter"],
                        label="Weights type",
                        multiselect=False,
                        value="Original",
                        interactive=True,
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
            submit_button.click(
                add_new_eval,
                [
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                ],
                submission_result,
            )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

# Periodically restart the Space (every 1800 s / 30 min)
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

demo.queue(default_concurrency_limit=40).launch()