import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
import os
import tempfile

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    COLS,
    AutoEvalColumn,
    fields,
)
from src.about import Tasks
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval

# Import simple leaderboard functionality
import glob
import json
from functools import lru_cache


def restart_space():
    API.restart_space(repo_id=REPO_ID)

### Space initialisation (prefer local JSONs, fall back to Hub)
def _has_local_json(path: str) -> bool:
    try:
        return os.path.isdir(path) and any(str(f).endswith(".json") for f in os.listdir(path))
    except Exception:
        return False

if not _has_local_json(EVAL_REQUESTS_PATH):
    try:
        print(f"Downloading eval requests into {EVAL_REQUESTS_PATH}")
        snapshot_download(
            repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
        )
    except Exception:
        # Offline or repo unavailable: fall back to whatever is already on disk.
        pass

if not _has_local_json(EVAL_RESULTS_PATH):
    try:
        print(f"Downloading eval results into {EVAL_RESULTS_PATH}")
        snapshot_download(
            repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
        )
    except Exception:
        # Offline or repo unavailable: fall back to whatever is already on disk.
        pass


# Build benchmark and evaluation queue column metadata
BENCHMARK_COLS = [f"{task.value.col_name} ({task.name})" for task in Tasks]

EVAL_COLS = [
    "Model",
    "Model sha",
    "status",
    "precision",
    "weight_type",
    "model_type",
    "likes",
    "params",
    "license",
    "submitted_time",
]

EVAL_TYPES = [
    "markdown",  # Model
    "str",       # Model sha
    "str",       # status
    "str",       # precision
    "str",       # weight_type
    "str",       # model_type
    "number",    # likes
    "number",    # params
    "str",       # license
    "str",       # submitted_time
]

# Hide all models from the leaderboard view
LEADERBOARD_DF = pd.DataFrame(columns=COLS)

(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

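# NOTE: lru_cache keys on results_dir, so this directory scan runs once per
# process; refresh_simple (below) calls _get_simple_dataset_keys.cache_clear()
# so that newly added result files are picked up without restarting the Space.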
@lru_cache(maxsize=1)
def _get_simple_dataset_keys(results_dir: str) -> tuple:
    """Cache dataset keys to avoid repeated file scanning."""
    all_dataset_keys = set()
    if not os.path.isdir(results_dir):
        return tuple()
    
    for path in glob.glob(os.path.join(results_dir, "*.json")):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            res = data.get("results", {})
            all_dataset_keys.update(res.keys())
        except Exception:
            continue
    
    return tuple(sorted(all_dataset_keys))
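
# For reference, each results JSON is expected to look roughly like the sketch
# below. The shape is inferred from the parsing in load_simple_results; the
# dataset keys and values are illustrative, not a fixed schema:
#
#   {
#     "config":  {"model_name": "org/model"},
#     "results": {
#       "phoneme_asr":     {"per": 12.3, "avg_duration": 0.42},
#       "kids_phoneme_md": {"per": 15.1, "avg_duration": 0.55}
#     }
#   }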

def load_simple_results(results_dir: str) -> pd.DataFrame:
    """Load and process evaluation results from JSON files for simple leaderboard with caching."""
    rows = []
    all_dataset_keys = set(_get_simple_dataset_keys(results_dir))
    
    if not all_dataset_keys:
        return pd.DataFrame(columns=["Model", "Avg PER", "Avg Duration (s)"])

    # Use dataset keys directly as display names
    dataset_display_names = {key: key for key in all_dataset_keys}

    # Single pass over the result files, collecting per-dataset PER and durations
    for path in glob.glob(os.path.join(results_dir, "*.json")):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            cfg = data.get("config", {})
            res = data.get("results", {})

            model_name = cfg.get("model_name", "unknown")
            
            # Extract PER for each dataset dynamically
            per_values = {}
            dur_values = []
            
            for dataset_key in all_dataset_keys:
                dataset_data = res.get(dataset_key, {})
                per_value = dataset_data.get("per") if dataset_data else None
                dur_value = dataset_data.get("avg_duration") if dataset_data else None
                
                display_name = dataset_display_names[dataset_key]
                per_values[f"PER {display_name}"] = per_value
                
                if dur_value is not None:
                    dur_values.append(dur_value)
            
            # Calculate average PER across all datasets
            per_vals = [v for v in per_values.values() if v is not None]
            avg_per = sum(per_vals) / len(per_vals) if per_vals else None
            
            # Calculate average duration
            avg_dur = sum(dur_values) / len(dur_values) if dur_values else None

            row = {
                "Model": model_name,
                "Avg PER": avg_per,
                "Avg Duration (s)": avg_dur,
                "_file": os.path.basename(path),
            }
            row.update(per_values)
            rows.append(row)
            
        except Exception:
            continue

    df = pd.DataFrame(rows)
    if df.empty:
        # No file parsed successfully: return an empty frame whose column order
        # matches the populated case (summary columns first, then PER columns).
        default_cols = ["Model", "Avg PER", "Avg Duration (s)"]
        default_cols += [f"PER {dataset_display_names[key]}" for key in sorted(all_dataset_keys)]
        return pd.DataFrame(columns=default_cols)
    
    df = df.sort_values(by=["Avg PER"], ascending=True, na_position="last")
    return df.reset_index(drop=True)
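

# PER, as reported in the tables above, is the phoneme-level analogue of word
# error rate: the edit distance between predicted and reference phoneme
# sequences, divided by the reference length. A minimal sketch of the metric
# (illustration only; the benchmark's own scorer may differ in normalisation
# or tokenisation, and this helper is not called by the app):
def _per_example(reference: list, hypothesis: list) -> float:
    """Phoneme Error Rate (%) of one example: edit distance / len(reference)."""
    m, n = len(reference), len(hypothesis)
    # Row-by-row Levenshtein distance over phoneme tokens.
    prev = list(range(n + 1))
    for i in range(1, m + 1):
        curr = [i] + [0] * n
        for j in range(1, n + 1):
            cost = 0 if reference[i - 1] == hypothesis[j - 1] else 1
            curr[j] = min(prev[j] + 1,         # deletion
                          curr[j - 1] + 1,     # insertion
                          prev[j - 1] + cost)  # substitution
        prev = curr
    return 100.0 * prev[n] / max(m, 1)  # max() guards an empty reference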


def init_leaderboard(dataframe):
    if dataframe is None or dataframe.empty:
        dataframe = pd.DataFrame(columns=[c.name for c in fields(AutoEvalColumn)])
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
            ColumnFilter(
                AutoEvalColumn.params.name,
                type="slider",
                min=0.01,
                max=150,
                label="Select the number of parameters (B)",
            ),
            ColumnFilter(
                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
            ),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )


demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("πŸ… Phoneme Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF)

        with gr.TabItem("πŸ“Š Simple Results", elem_id="simple-results-tab", id=1):
            gr.Markdown("## 🎯 Phoneme Detection Results")
            gr.Markdown("Compare phoneme recognition models across different datasets")
            
            # Stats section for simple results; one helper renders every card
            # so the markup is defined in a single place.
            def _stat_card(value: str, label: str) -> str:
                """Render one gradient stat card as inline-styled HTML."""
                return (
                    '<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); '
                    'color: white; padding: 1rem; border-radius: 10px; text-align: center; '
                    'min-width: 150px;">'
                    f'<div style="font-size: 1.5rem; font-weight: bold;">{value}</div>'
                    f'<div style="font-size: 0.9rem; opacity: 0.9;">{label}</div></div>'
                )

            with gr.Row():
                simple_total_models = gr.HTML(_stat_card("-", "Total Models"))
                simple_best_per = gr.HTML(_stat_card("-", "Best PER"))
                simple_avg_duration = gr.HTML(_stat_card("-", "Avg Duration"))
            
            # Get initial data to determine columns dynamically
            initial_df = load_simple_results(EVAL_RESULTS_PATH)
            if not initial_df.empty:
                headers = list(initial_df.columns)
                # Remove internal columns
                headers = [h for h in headers if not h.startswith('_')]
            else:
                headers = ["Model", "Avg PER", "Avg Duration (s)"]
            
            with gr.Row():
                with gr.Column(scale=4):
                    simple_table = gr.Dataframe(
                        headers=headers, 
                        row_count=10,
                        label="πŸ† Model Performance Leaderboard",
                        interactive=False
                    )
                
                with gr.Column(scale=1):
                    refresh_btn = gr.Button("πŸ”„ Refresh Data", variant="primary")
                    
                    # Export options; the download targets live next to their buttons
                    with gr.Accordion("πŸ“₯ Export Data", open=False):
                        export_csv = gr.Button("πŸ“„ Export CSV", variant="secondary")
                        export_json = gr.Button("πŸ“‹ Export JSON", variant="secondary")
                        csv_file = gr.File(label="Download CSV")
                        json_file = gr.File(label="Download JSON")

            def refresh_simple():
                """Reload the simple leaderboard data and recompute the stat cards."""
                # Clear the dataset-key cache so newly added result files are found.
                _get_simple_dataset_keys.cache_clear()
                df = load_simple_results(EVAL_RESULTS_PATH)

                if df.empty:
                    return (
                        df,
                        _stat_card("0", "Total Models"),
                        _stat_card("N/A", "Best PER"),
                        _stat_card("N/A", "Avg Duration"),
                    )

                # Keep the dataframe's own column order, minus internal columns
                cols = [c for c in df.columns if not c.startswith('_')]

                # Calculate stats for the cards
                total_models = len(df)
                best_per_val = df['Avg PER'].min() if 'Avg PER' in df.columns and not df['Avg PER'].isna().all() else "N/A"
                avg_duration_val = df['Avg Duration (s)'].mean() if 'Avg Duration (s)' in df.columns and not df['Avg Duration (s)'].isna().all() else "N/A"

                # Format stats
                best_per_str = f"{best_per_val:.2f}" if isinstance(best_per_val, (int, float)) else str(best_per_val)
                avg_duration_str = f"{avg_duration_val:.2f}s" if isinstance(avg_duration_val, (int, float)) else str(avg_duration_val)

                return (
                    df[cols].round(3),
                    _stat_card(str(total_models), "Total Models"),
                    _stat_card(best_per_str, "Best PER"),
                    _stat_card(avg_duration_str, "Avg Duration"),
                )

            def export_simple_csv():
                """Write the current results to a temp CSV and return its path for gr.File."""
                df = load_simple_results(EVAL_RESULTS_PATH)
                if df.empty:
                    return None
                cols = [c for c in df.columns if not c.startswith('_')]
                path = os.path.join(tempfile.gettempdir(), "simple_results.csv")
                df[cols].round(3).to_csv(path, index=False)
                return path

            def export_simple_json():
                """Write the current results to a temp JSON file and return its path for gr.File."""
                df = load_simple_results(EVAL_RESULTS_PATH)
                if df.empty:
                    return None
                cols = [c for c in df.columns if not c.startswith('_')]
                path = os.path.join(tempfile.gettempdir(), "simple_results.json")
                df[cols].round(3).to_json(path, orient='records', indent=2)
                return path

            # Connect events
            refresh_btn.click(
                fn=refresh_simple, 
                outputs=[simple_table, simple_total_models, simple_best_per, simple_avg_duration]
            )
            
            export_csv.click(
                fn=export_simple_csv,
                outputs=csv_file,
            )

            export_json.click(
                fn=export_simple_json,
                outputs=json_file,
            )

            # Auto-load on app start / page load
            demo.load(
                fn=refresh_simple,
                outputs=[simple_table, simple_total_models, simple_best_per, simple_avg_duration],
            )
            
            # Enhanced help section
            with gr.Accordion("ℹ️ About this Leaderboard", open=False):
                gr.Markdown("""
                ## πŸ“Š Understanding the Results
                
                **Performance Metrics:**
                - **PER (Phoneme Error Rate)**: Lower values indicate better performance
                - **Avg Duration**: Processing time per sample (lower is faster)
                - **Models are ranked by average PER across all datasets**
                
                **Datasets Evaluated:**
                - `phoneme_asr`: General phoneme recognition dataset
                - `kids_phoneme_md`: Kids' phoneme recognition dataset
                
                **How to Interpret:**
                - **PER**: Percentage of phonemes incorrectly recognized (0% = perfect)
                - **Duration**: Time efficiency (important for real-time applications)
                - **Average PER**: Overall model performance across all datasets
                
                **Tips for Model Selection:**
                - Choose models with low PER for accuracy-critical applications
                - Consider duration for real-time or resource-constrained environments
                - Balance between accuracy (PER) and speed (Duration) based on your needs
                """)

        with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("πŸš€ Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

                with gr.Column():
                    with gr.Accordion(
                        f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            finished_eval_table = gr.components.Dataframe(
                                value=finished_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            running_eval_table = gr.components.Dataframe(
                                value=running_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )

                    with gr.Accordion(
                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            pending_eval_table = gr.components.Dataframe(
                                value=pending_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
            with gr.Row():
                gr.Markdown("# βœ‰οΈβœ¨ Submit your model here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=["Pretrained", "Fine-tuned", "Merge", "Other"],
                        label="Model type",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )

                with gr.Column():
                    precision = gr.Dropdown(
                        choices=["float16", "bfloat16", "float32", "int8", "int4"],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=["Original", "Delta", "Adapter"],
                        label="Weights type",
                        multiselect=False,
                        value="Original",
                        interactive=True,
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
            submit_button.click(
                add_new_eval,
                [
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                ],
                submission_result,
            )

    with gr.Row():
        with gr.Accordion("πŸ“™ Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()