import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
import os

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    COLS,
    AutoEvalColumn,
    fields,
)
from src.about import Tasks
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval

# Imports for the simple leaderboard functionality
import glob
import json
from functools import lru_cache


def restart_space():
    API.restart_space(repo_id=REPO_ID)


### Space initialisation (prefer local JSONs, fall back to the Hub)
def _has_local_json(path: str) -> bool:
    try:
        return os.path.isdir(path) and any(str(f).endswith(".json") for f in os.listdir(path))
    except Exception:
        return False


if not _has_local_json(EVAL_REQUESTS_PATH):
    try:
        print(EVAL_REQUESTS_PATH)
        snapshot_download(
            repo_id=QUEUE_REPO,
            local_dir=EVAL_REQUESTS_PATH,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception:
        pass

if not _has_local_json(EVAL_RESULTS_PATH):
    try:
        print(EVAL_RESULTS_PATH)
        snapshot_download(
            repo_id=RESULTS_REPO,
            local_dir=EVAL_RESULTS_PATH,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception:
        pass


# Build benchmark and evaluation queue column metadata
BENCHMARK_COLS = [f"{task.value.col_name} ({task.name})" for task in Tasks]
EVAL_COLS = [
    "Model",
    "Model sha",
    "status",
    "precision",
    "weight_type",
    "model_type",
    "likes",
    "params",
    "license",
    "submitted_time",
]
EVAL_TYPES = [
    "markdown",  # Model
    "str",  # Model sha
    "str",  # status
    "str",  # precision
    "str",  # weight_type
    "str",  # model_type
    "number",  # likes
    "number",  # params
    "str",  # license
    "str",  # submitted_time
]

# Hide all models from the main leaderboard view (kept intentionally empty)
LEADERBOARD_DF = pd.DataFrame(columns=COLS)

(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


@lru_cache(maxsize=1)
def _get_simple_dataset_keys(results_dir: str) -> tuple:
    """Cache the set of dataset keys to avoid repeated file scanning."""
    all_dataset_keys = set()
    if not os.path.isdir(results_dir):
        return tuple()
    for path in glob.glob(os.path.join(results_dir, "*.json")):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            res = data.get("results", {})
            all_dataset_keys.update(res.keys())
        except Exception:
            continue
    return tuple(sorted(all_dataset_keys))


def load_simple_results(results_dir: str) -> pd.DataFrame:
    """Load and process evaluation results from JSON files for the simple leaderboard, with caching."""
    rows = []
    all_dataset_keys = set(_get_simple_dataset_keys(results_dir))

    if not all_dataset_keys:
        return pd.DataFrame(columns=["Model", "Avg PER", "Avg Duration (s)"])

    # Use dataset keys directly as display names
    dataset_display_names = {key: key for key in all_dataset_keys}

    # Single pass over the result files: extract per-dataset PER and duration values
    for path in glob.glob(os.path.join(results_dir, "*.json")):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)

            cfg = data.get("config", {})
            res = data.get("results", {})
            model_name = cfg.get("model_name", "unknown")

            # Extract PER for each dataset dynamically
            per_values = {}
            dur_values = []
            for dataset_key in all_dataset_keys:
                dataset_data = res.get(dataset_key, {})
                per_value = dataset_data.get("per") if dataset_data else None
                dur_value = dataset_data.get("avg_duration") if dataset_data else None

                display_name = dataset_display_names[dataset_key]
                per_values[f"PER {display_name}"] = per_value
                if dur_value is not None:
                    dur_values.append(dur_value)

            # Calculate average PER across all datasets
            per_vals = [v for v in per_values.values() if v is not None]
            avg_per = sum(per_vals) / len(per_vals) if per_vals else None

            # Calculate average duration
            avg_dur = sum(dur_values) / len(dur_values) if dur_values else None

            row = {
                "Model": model_name,
                "Avg PER": avg_per,
                "Avg Duration (s)": avg_dur,
                "_file": os.path.basename(path),
            }
            row.update(per_values)
            rows.append(row)
        except Exception:
            continue

    df = pd.DataFrame(rows)
    if df.empty:
        # Create default columns based on discovered datasets
        default_cols = ["Model", "Avg PER", "Avg Duration (s)"]
        for key in sorted(all_dataset_keys):
            display_name = dataset_display_names[key]
            default_cols.insert(-2, f"PER {display_name}")
        return pd.DataFrame(columns=default_cols)

    df = df.sort_values(by=["Avg PER"], ascending=True, na_position="last")
    return df.reset_index(drop=True)


def init_leaderboard(dataframe):
    if dataframe is None or dataframe.empty:
        dataframe = pd.DataFrame(columns=[c.name for c in fields(AutoEvalColumn)])
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
            ColumnFilter(
                AutoEvalColumn.params.name,
                type="slider",
                min=0.01,
                max=150,
                label="Select the number of parameters (B)",
            ),
            ColumnFilter(
                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
            ),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )


demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("Phoneme Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF)

        with gr.TabItem("Simple Results", elem_id="simple-results-tab", id=1):
            gr.Markdown("## Phoneme Detection Results")
            gr.Markdown("Compare phoneme recognition models across different datasets")

            # Stats section for simple results
            with gr.Row():
                simple_total_models = gr.HTML(
                    '