import glob
import json
import os
import tempfile
from functools import lru_cache

import gradio as gr
import pandas as pd

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
EVAL_RESULTS_DIR = os.path.join(ROOT_DIR, "eval-results")


@lru_cache(maxsize=1)
def _get_dataset_keys(results_dir: str) -> tuple:
    """Collect the union of dataset keys across all result files (cached)."""
    all_dataset_keys = set()
    if not os.path.isdir(results_dir):
        return tuple()
    for path in glob.glob(os.path.join(results_dir, "*.json")):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            all_dataset_keys.update(data.get("results", {}).keys())
        except (OSError, json.JSONDecodeError):
            continue
    return tuple(sorted(all_dataset_keys))


def load_results(results_dir: str) -> pd.DataFrame:
    """Load evaluation results from JSON files into a leaderboard DataFrame.

    Handles any number of datasets dynamically; the dataset keys found on
    disk are used directly as column display names.
    """
    all_dataset_keys = sorted(_get_dataset_keys(results_dir))
    if not all_dataset_keys:
        return pd.DataFrame(columns=["Model", "Avg PER", "Avg Duration (s)"])

    rows = []
    for path in glob.glob(os.path.join(results_dir, "*.json")):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, json.JSONDecodeError):
            continue

        cfg = data.get("config", {})
        res = data.get("results", {})
        model_name = cfg.get("model_name", "unknown")

        # Per-dataset PER columns, plus the raw values for the averages.
        per_values = {}
        dur_values = []
        for dataset_key in all_dataset_keys:
            dataset_data = res.get(dataset_key) or {}
            per_values[f"PER {dataset_key}"] = dataset_data.get("per")
            dur_value = dataset_data.get("avg_duration")
            if dur_value is not None:
                dur_values.append(dur_value)

        # Averages ignore datasets that did not report a value.
        per_vals = [v for v in per_values.values() if v is not None]
        avg_per = sum(per_vals) / len(per_vals) if per_vals else None
        avg_dur = sum(dur_values) / len(dur_values) if dur_values else None

        row = {
            "Model": model_name,
            "Avg PER": avg_per,
            "Avg Duration (s)": avg_dur,
            "_file": os.path.basename(path),  # internal, hidden from the UI
        }
        row.update(per_values)
        rows.append(row)

    df = pd.DataFrame(rows)
    if df.empty:
        # No parsable result files: return the expected column layout anyway.
        cols = ["Model"]
        cols += [f"PER {key}" for key in all_dataset_keys]
        cols += ["Avg PER", "Avg Duration (s)"]
        return pd.DataFrame(columns=cols)

    # Rank by average PER (lower is better); models without one sort last.
    df = df.sort_values(by=["Avg PER"], ascending=True, na_position="last")
    return df.reset_index(drop=True)
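
# For reference, each file in eval-results/ is expected to look roughly like
# the sketch below. This is inferred from the fields read above; the model
# name and the numbers are made up for illustration:
#
# {
#   "config": { "model_name": "example/wav2vec2-phoneme" },
#   "results": {
#     "phoneme_asr":     { "per": 12.3, "avg_duration": 0.42 },
#     "kids_phoneme_md": { "per": 18.7, "avg_duration": 0.45 }
#   }
# }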


def build_interface():
    """Build the Gradio interface for the phoneme leaderboard."""
    custom_css = """
    .gradio-container { max-width: 1200px !important; margin: 0 auto !important; }
    .leaderboard-header { text-align: center; margin-bottom: 2rem; }
    .stats-container { display: flex; gap: 1rem; margin-bottom: 1rem; flex-wrap: wrap; }
    .stat-card {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 1rem;
        border-radius: 10px;
        text-align: center;
        min-width: 150px;
        flex: 1;
    }
    .stat-value { font-size: 1.5rem; font-weight: bold; margin-bottom: 0.5rem; }
    .stat-label { font-size: 0.9rem; opacity: 0.9; }
    .table-container { margin-top: 1rem; }
    .refresh-btn {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        border: none;
        padding: 0.5rem 1rem;
        border-radius: 5px;
        cursor: pointer;
    }
    """

    # The stat cards' <div> markup was stripped from the extracted source;
    # the helper below reconstructs it from the .stat-card / .stat-value /
    # .stat-label classes defined in the CSS above.
    def stat_card(value, label):
        return (
            f'<div class="stat-card">'
            f'<div class="stat-value">{value}</div>'
            f'<div class="stat-label">{label}</div>'
            f"</div>"
        )

    with gr.Blocks(
        title="Phoneme Detection Leaderboard",
        css=custom_css,
        theme=gr.themes.Soft(),
    ) as demo:
        # Header section
        with gr.Column(elem_classes="leaderboard-header"):
            gr.Markdown("# đŸŽ¯ Phoneme Detection Leaderboard")
            gr.Markdown("Compare phoneme recognition models across different datasets")

        # Stats section
        with gr.Row(elem_classes="stats-container"):
            total_models = gr.HTML(stat_card("-", "Total Models"), elem_id="total-models-card")
            best_per = gr.HTML(stat_card("-", "Best PER"), elem_id="best-per-card")
            avg_duration = gr.HTML(stat_card("-", "Avg Duration"), elem_id="avg-duration-card")

        # Main content
        with gr.Row():
            with gr.Column(scale=4):
                # Derive the table columns from the initial data.
                initial_df = load_results(EVAL_RESULTS_DIR)
                if not initial_df.empty:
                    # Hide internal columns such as "_file".
                    headers = [h for h in initial_df.columns if not h.startswith("_")]
                else:
                    headers = ["Model", "Avg PER", "Avg Duration (s)"]

                table = gr.Dataframe(
                    headers=headers,
                    row_count=10,
                    label="🏆 Model Performance Leaderboard",
                    interactive=False,
                    elem_classes="table-container",
                )

            with gr.Column(scale=1):
                refresh_btn = gr.Button(
                    "🔄 Refresh Data",
                    variant="primary",
                    elem_classes="refresh-btn",
                )

                # Quick stats
                with gr.Accordion("📊 Quick Stats", open=True):
                    # Placeholder; not currently updated by refresh().
                    stats_display = gr.HTML("Loading statistics...")

                # Export options
                with gr.Accordion("đŸ“Ĩ Export Data", open=False):
                    export_csv = gr.Button("📄 Export as CSV", variant="secondary")
                    export_json = gr.Button("📋 Export as JSON", variant="secondary")
                    # Shared download target for both export buttons (the
                    # original instantiated gr.File inline in the click
                    # wiring, leaving no stable component to receive files).
                    download_file = gr.File(label="Download", interactive=False)

        def refresh():
            """Reload the results and recompute the summary stat cards."""
            # Clear the cached dataset keys so result files added since the
            # last refresh are discovered.
            _get_dataset_keys.cache_clear()
            df = load_results(EVAL_RESULTS_DIR)
            if df.empty:
                return df, "No data available", "No data available", "No data available"

            # Visible (non-internal) columns, in DataFrame order.
            cols = [c for c in df.columns if not c.startswith("_")]

            # Summary stats for the cards.
            total = len(df)
            best_per_val = df["Avg PER"].min() if not df["Avg PER"].isna().all() else None
            avg_dur_val = df["Avg Duration (s)"].mean() if not df["Avg Duration (s)"].isna().all() else None

            best_per_str = f"{best_per_val:.2f}" if best_per_val is not None else "N/A"
            avg_dur_str = f"{avg_dur_val:.2f}s" if avg_dur_val is not None else "N/A"

            return (
                df[cols].round(3),
                stat_card(total, "Total Models"),
                stat_card(best_per_str, "Best PER"),
                stat_card(avg_dur_str, "Avg Duration"),
            )
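
        # Note: refresh() must return values in the same order as the
        # `outputs` lists below: (table, total_models, best_per, avg_duration).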
" ) def export_csv_data(): """Export data as CSV.""" df = load_results(EVAL_RESULTS_DIR) if df.empty: return None cols = [c for c in df.columns if not c.startswith('_')] return df[cols].round(3) def export_json_data(): """Export data as JSON.""" df = load_results(EVAL_RESULTS_DIR) if df.empty: return None cols = [c for c in df.columns if not c.startswith('_')] return df[cols].round(3).to_json(orient='records', indent=2) # Connect events refresh_btn.click( fn=refresh, outputs=[table, total_models, best_per, avg_duration] ) export_csv.click( fn=export_csv_data, outputs=gr.File(label="Download CSV") ) export_json.click( fn=export_json_data, outputs=gr.File(label="Download JSON") ) # Auto-load on start table.value, total_models.value, best_per.value, avg_duration.value = refresh() # Help section with gr.Accordion("â„šī¸ About this Leaderboard", open=False): gr.Markdown(""" ## 📊 Understanding the Results **Performance Metrics:** - **PER (Phoneme Error Rate)**: Lower values indicate better performance - **Avg Duration**: Processing time per sample (lower is faster) - **Models are ranked by average PER across all datasets** **Datasets Evaluated:** - `phoneme_asr`: General phoneme recognition dataset - `kids_phoneme_md`: Kids' phoneme recognition dataset **How to Interpret:** - **PER**: Percentage of phonemes incorrectly recognized (0% = perfect) - **Duration**: Time efficiency (important for real-time applications) - **Average PER**: Overall model performance across all datasets **Tips for Model Selection:** - Choose models with low PER for accuracy-critical applications - Consider duration for real-time or resource-constrained environments - Balance between accuracy (PER) and speed (Duration) based on your needs """) return demo if __name__ == "__main__": demo = build_interface() demo.queue().launch( server_name="0.0.0.0", server_port=7860, share=False )